Source code for lm_polygraph.estimators.rauq

import numpy as np
from typing import Dict, List, Optional

from .estimator import Estimator


class RAUQ(Estimator):
    """
    RAUQ (Recurrent Attention-based Uncertainty Quantification) from
    https://arxiv.org/abs/2505.20045

    This estimator quantifies uncertainty in LLM outputs by combining the
    attention each token pays to its predecessor with token probabilities
    in a recurrent manner.

    Args:
        alpha: Weight of the current token's probability in the recurrence;
            ``1 - alpha`` weights the attention-scaled previous confidence.
            When None, the default from :meth:`get_alpha` is used.
        n_layers: Number of layers in the model. When None, it is read from
            the model configuration on the first call.
        n_heads: Number of attention heads per layer. When None, it is read
            from the model configuration on the first call.
        use_entropy: Whether to use the entropy-based variant instead of the
            greedy token log-likelihoods.
        instruct: Whether the model is instruction-tuned (only affects the
            default ``alpha``).
    """

    def __init__(
        self,
        alpha: Optional[float] = None,
        n_layers: Optional[int] = None,
        n_heads: Optional[int] = None,
        use_entropy: bool = False,
        instruct: bool = False,
    ):
        dependencies = ["attention_all", "greedy_log_likelihoods"]
        if use_entropy:
            # NOTE(review): __call__ also reads stats["greedy_log_probs"] (and
            # stats["model"] when n_layers/n_heads are not given) although they
            # are not declared here -- confirm the framework always provides them.
            dependencies.append("entropy")
        super().__init__(dependencies, "sequence")

        # get_alpha() reads these two flags, so they must be set first.
        self.use_entropy = use_entropy
        self.instruct = instruct
        self.alpha = alpha if alpha is not None else self.get_alpha()

        self.n_layers = n_layers
        self.n_heads = n_heads
        # When n_layers is unknown, the layer list is resolved lazily in
        # __call__ from the model configuration.
        self.layers = (
            self._middle_layers(n_layers) if n_layers is not None else None
        )

    @staticmethod
    def _middle_layers(n_layers: int) -> List[int]:
        """
        Return the indices of the middle band of layers used for scoring.

        The band runs from ``n_layers // 3`` up to and including
        ``ceil(2 * n_layers / 3)``, clamped so that no index reaches
        ``n_layers``. The clamp only matters for models with fewer than
        three layers, where the unclamped range would index past the last
        layer.
        """
        start = n_layers // 3
        stop = int(np.ceil(n_layers / 3 * 2) + 1)
        return list(range(start, min(stop, n_layers)))

    def __str__(self) -> str:
        """Returns a string representation of the estimator."""
        method_desc = " (entropy)" if self.use_entropy else ""
        return f"RAUQ{method_desc}"

    def get_alpha(self) -> float:
        """
        Returns the default alpha parameter for the current settings.

        The value depends on ``self.instruct`` and ``self.use_entropy``.

        Returns:
            float: Alpha value between 0 and 1
        """
        if self.instruct:
            return 0.9 if self.use_entropy else 0.5
        return 0.8 if self.use_entropy else 0.2

    def _calculate_confidence_scores(
        self,
        log_probabilities: np.ndarray,
        attentions: np.ndarray,
        layer: int,
        head: int,
    ) -> List[float]:
        """
        Calculate confidence scores for a sequence using attention and probabilities.

        The first score is ``exp(log_probabilities[0])``. Every later score is
        ``alpha * p_j + (1 - alpha) * a_{j-1} * c_{j-1}``, where ``p_j`` is the
        exponentiated log probability at position ``j``, ``a_{j-1}`` is
        ``attentions[layer, head, j - 1]`` and ``c_{j-1}`` is the previous score.

        Args:
            log_probabilities: Log probabilities for each token
            attentions: Previous-token attention values, indexed as
                ``[layer, head, position]``
            layer: Current layer index
            head: Selected attention head

        Returns:
            List[float]: Confidence scores for each position
        """
        alpha = self.alpha
        confidence_scores = [np.exp(log_probabilities[0])]
        # The recurrence depends on the previous score, so it stays a loop.
        for j in range(1, len(log_probabilities)):
            current_prob = np.exp(log_probabilities[j])
            carried = attentions[layer, head, j - 1] * confidence_scores[-1]
            confidence_scores.append(alpha * current_prob + (1 - alpha) * carried)
        return confidence_scores

    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Calculate uncertainty scores for each sequence in a batch.

        For every sequence and every selected layer, the head with the highest
        mean previous-token attention is chosen, the recurrent confidence
        scores are computed, and the layer score is
        ``1 - mean(log(confidence))``. The sequence score is the maximum over
        the selected layers.

        Args:
            stats: Dictionary containing model statistics including attention
                weights and log likelihoods

        Returns:
            np.ndarray: Uncertainty scores for each sequence
        """
        if self.n_layers is None or self.n_heads is None:
            # Multimodal configs nest the language-model settings under
            # `text_config`; fall back to the top-level config otherwise.
            model_config = stats["model"].model.config
            text_config = getattr(model_config, "text_config", model_config)
            if self.n_layers is None:
                self.n_layers = text_config.num_hidden_layers
                self.layers = self._middle_layers(self.n_layers)
            if self.n_heads is None:
                self.n_heads = text_config.num_attention_heads

        # Extract, per sequence, the attention each token pays to its
        # immediate predecessor (the first sub-diagonal of each head's matrix).
        attentions = []
        for attention_weight in stats["attention_all"]:
            # Reshape attention weights to separate layers and heads
            reshaped_weights = attention_weight.reshape(
                self.n_layers,
                self.n_heads,
                attention_weight.shape[-2],
                attention_weight.shape[-1],
            )
            attention_prev_token = np.diagonal(
                reshaped_weights, offset=-1, axis1=2, axis2=3
            )
            attentions.append(attention_prev_token)

        greedy_log_likelihoods = stats["greedy_log_likelihoods"]
        if self.use_entropy:
            entropy = stats["entropy"]
            vocab_size = len(stats["greedy_log_probs"][0][0])
            max_entropy = np.log(vocab_size)

        uncertainty_scores = []
        for idx, sequence_log_likelihoods in enumerate(greedy_log_likelihoods):
            # Get log probabilities for current sequence
            if self.use_entropy:
                # Entropy variant: log of the gap to the maximum entropy; the
                # epsilon keeps the logarithm finite when the gap is zero.
                log_probabilities = np.log(
                    max_entropy - np.array(entropy[idx]) + 1e-10
                )
            else:
                log_probabilities = sequence_log_likelihoods

            sequence_attentions = attentions[idx]

            # Calculate uncertainty scores for each layer
            layer_scores = []
            for layer in self.layers:
                layer_attention = sequence_attentions[layer]
                if layer_attention.shape[-1]:
                    # Select most attentive head for current layer
                    head = layer_attention.mean(-1).argmax()
                else:
                    # Single-token sequence: there is no previous-token
                    # attention to rank heads by, and the head is never read
                    # by the recurrence, so pick head 0 without averaging an
                    # empty slice.
                    head = 0

                confidence_scores = self._calculate_confidence_scores(
                    log_probabilities, sequence_attentions, layer, head
                )
                layer_scores.append(1 - np.log(confidence_scores).mean())

            # Take maximum uncertainty across layers
            uncertainty_scores.append(np.max(layer_scores))

        return np.array(uncertainty_scores)