Source code for lm_polygraph.estimators.self_certainty

import numpy as np
from typing import Dict
from .estimator import Estimator


class SelfCertainty(Estimator):
    """
    Sequence-level uncertainty estimator based on self-certainty.

    At every generated position the KL divergence between a uniform
    distribution and the model's token distribution is computed from the
    log-probabilities, using

        KL(uniform || p) = -mean(log p) - log(V),

    where V is the number of entries that remain after infinite
    log-probabilities are dropped. The per-position divergences are averaged
    over the sequence and the result is negated, so a higher score indicates
    higher uncertainty.

    Reference:
        "Scalable Best-of-N Selection for Large Language Models via
        Self-Certainty" (https://arxiv.org/pdf/2502.18581)
    """

    def __init__(self):
        # Depends on the "greedy_log_probs" statistic; "sequence" is the
        # estimation level passed to the base Estimator.
        super().__init__(["greedy_log_probs"], "sequence")

    def __str__(self):
        return "SelfCertainty"

    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Compute one negated self-certainty score per sample.

        Parameters:
            stats (Dict[str, np.ndarray]): Must contain the key
                'greedy_log_probs': an iterable of samples, each an iterable
                of np.ndarrays holding the log-probabilities over the
                vocabulary for one token position.

        Returns:
            np.ndarray: One score per sample, equal to the negative mean of
                the per-position KL(uniform || predicted) values. Higher
                values mean higher uncertainty.
        """
        scores = []
        for sample_log_probs in stats["greedy_log_probs"]:
            position_kls = []
            for position_log_probs in sample_log_probs:
                # Infinite entries (e.g. -inf for zero-probability tokens)
                # are excluded so that the mean below stays finite.
                keep_mask = ~np.isinf(position_log_probs)
                kept = np.array(position_log_probs[keep_mask])
                support_size = len(kept)
                # KL(uniform || predicted) = -mean(log p) - log(V)
                position_kls.append(-np.mean(kept) - np.log(support_size))
            # Average over positions and flip the sign: certainty -> uncertainty.
            scores.append(-np.mean(position_kls))
        return np.array(scores)