Source code for lm_polygraph.estimators.verbalized_2s

import numpy as np
import re

from typing import Dict

from .estimator import Estimator


class Verbalized2S(Estimator):
    """
    Two-stage verbalized confidence estimator.

    Asks the model to state its confidence in a follow-up chat turn and
    extracts the confidence estimate from the model's reply using a provided
    regex. The returned uncertainty is ``1 - confidence``.
    Only usable for instruct-finetuned models with chat template support.
    Adapted from the original implementation in the paper
    https://arxiv.org/abs/2305.14975
    """

    def __init__(
        self,
        confidence_prompt: str,
        confidence_regex: str = "",
        max_new_tokens: int = 10,
        name_postfix: str = "",
    ):
        """
        Parameters:
            confidence_prompt: text of the follow-up user message that asks
                the model for its confidence.
            confidence_regex: pattern searched for in the model's reply; its
                FIRST capturing group must contain the numeric confidence.
                A pattern without capturing groups (including the default
                empty string) can never yield a value, so every estimate
                will be NaN.
            max_new_tokens: generation budget for the confidence reply.
            name_postfix: suffix appended to the estimator name returned by
                ``__str__``.
        """
        self.max_new_tokens = max_new_tokens
        self.confidence_prompt = confidence_prompt
        self.confidence_regex = confidence_regex
        self.postfix = name_postfix
        # NOTE(review): presumably (required statistics, estimation level);
        # confirm against the Estimator base class.
        super().__init__(["input_texts", "greedy_texts"], "sequence")

    def __str__(self):
        return f"Verbalized2S{self.postfix}"

    @staticmethod
    def _parse_uncertainty(conf_re, answer: str) -> float:
        """Return ``1 - confidence`` parsed from *answer*, or NaN if unparsable."""
        match = conf_re.search(answer)
        # No match at all, or a pattern that defines no capturing group:
        # there is nothing to read the confidence from.
        if match is None or not match.groups():
            return np.nan
        try:
            # group(1) is None when an optional group did not participate
            # (TypeError); non-numeric text raises ValueError.
            return 1 - float(match.group(1))
        except (TypeError, ValueError):
            return np.nan

    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Query the model for its verbalized confidence in each greedy answer.

        Parameters:
            stats: must contain 'input_texts', 'greedy_texts' and a 'model'
                entry exposing ``generate_texts``.
        Returns:
            np.ndarray with one uncertainty value (``1 - confidence``) per
            input; NaN where no confidence could be extracted from the reply.
        """
        model = stats["model"]

        # Replay the original exchange, then append the confidence question
        # as a new user turn.
        chats = [
            [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": guess},
                {"role": "user", "content": self.confidence_prompt},
            ]
            for prompt, guess in zip(stats["input_texts"], stats["greedy_texts"])
        ]

        out = model.generate_texts(
            chats,
            min_new_tokens=2,
            max_new_tokens=self.max_new_tokens,
            num_return_sequences=1,
        )

        conf_re = re.compile(self.confidence_regex)
        ues = [self._parse_uncertainty(conf_re, answer) for answer in out]

        return np.array(ues)