# Source code for lm_polygraph.estimators.self_certainty

import numpy as np
from typing import Dict
from .estimator import Estimator


class SelfCertainty(Estimator):
    """
    Sequence-level uncertainty score based on self-certainty.

    For every generated token the estimator computes the KL divergence from a
    uniform distribution over the vocabulary to the model's predicted token
    distribution:

        KL(U || p) = -mean_i(log p_i) - log(V)

    where V is the number of vocabulary entries with a finite log-probability
    (entries equal to +/-inf are dropped before the computation, so V is the
    size of the remaining support). The per-token divergences are averaged
    over the sequence and the mean is negated, so that a higher output value
    indicates higher uncertainty in the model's predictions.

    Reference:
        "Scalable Best-of-N Selection for Large Language Models via Self-Certainty"
        (https://arxiv.org/pdf/2502.18581)
    """

    def __init__(self):
        # Forwards the single required statistic and the "sequence" tag to the
        # base class. NOTE(review): presumably these are the stats
        # dependencies and the estimation level -- confirm against Estimator.
        super().__init__(["greedy_log_probs"], "sequence")

    def __str__(self):
        return "SelfCertainty"

    @staticmethod
    def _token_self_certainty(lp) -> float:
        """
        Compute KL(uniform || predicted) for a single token position.

        Parameters:
            lp: array-like of log-probabilities over the vocabulary for one
                token position. Infinite entries are ignored.

        Returns:
            float: ``-mean(log p) - log(V)`` over the finite entries, or NaN
            when no finite entry remains (the divergence is undefined there).
        """
        # asarray lets plain lists through; boolean-mask indexing below would
        # otherwise fail on anything that is not already an ndarray.
        lp = np.asarray(lp)
        finite_lp = lp[~np.isinf(lp)]  # drop infinite log-probabilities
        if finite_lp.size == 0:
            # Same value the mean/log of an empty slice would produce, but
            # without triggering NumPy RuntimeWarnings.
            return float("nan")
        return float(-np.mean(finite_lp) - np.log(finite_lp.size))

    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Computes the self-certainty based uncertainty score for each sample.

        For each token in a sample, calculates the KL divergence from a uniform
        distribution to the predicted distribution, averages these values
        across the tokens of the sample, and negates the mean.

        Parameters:
            stats (Dict[str, np.ndarray]): A dictionary containing:
                - 'greedy_log_probs': list of list of np.ndarrays, where each inner array
                  contains log-probabilities over the vocabulary for a specific token position.

        Returns:
            np.ndarray: An array of scores (one per sample).
                        Higher values mean higher uncertainty.
                        A sample without tokens, or containing a token whose
                        log-probabilities are all infinite, yields NaN.
        """
        scores = []
        for sample_logprobs in stats["greedy_log_probs"]:  # iterate over samples
            per_token = [
                self._token_self_certainty(lp) for lp in sample_logprobs
            ]
            if per_token:
                # Negate so that larger values correspond to more uncertainty.
                scores.append(-np.mean(per_token))
            else:
                # No tokens: the mean is undefined; report NaN explicitly.
                scores.append(float("nan"))

        return np.array(scores, dtype=float)