Skip to content

Evaluation

Evaluators for assessing agent output quality.

Rules-Based Evaluator

agentprobe.eval.rules

Rule-based evaluator with configurable rules and weighted scoring.

Provides a declarative evaluation approach using built-in rule handlers like contains_any, not_contains, max_length, regex, and json_valid.

RuleSpec

Bases: BaseModel

Specification for a single evaluation rule.

Attributes:

Name Type Description
rule_type str

The type of rule (e.g. 'contains_any', 'regex').

params dict[str, Any]

Parameters for the rule handler.

weight float

Relative weight of this rule in the overall score.

description str

Human-readable description of what this rule checks.

Source code in src/agentprobe/eval/rules.py
class RuleSpec(BaseModel):
    """Specification for a single evaluation rule.

    Consumed by RuleBasedEvaluator, which resolves ``rule_type`` against
    its handler registry and invokes the handler with ``params``.

    Attributes:
        rule_type: The type of rule (e.g. 'contains_any', 'regex').
        params: Parameters for the rule handler.
        weight: Relative weight of this rule in the overall score.
        description: Human-readable description of what this rule checks.
    """

    # strict=True disables type coercion; extra="forbid" rejects unknown
    # keys so typos in rule configs fail fast at construction time.
    model_config = ConfigDict(strict=True, extra="forbid")

    rule_type: str
    params: dict[str, Any] = Field(default_factory=dict)
    # gt=0 guarantees a rule can never be configured with zero/negative
    # weight, which would distort the weighted average.
    weight: float = Field(default=1.0, gt=0)
    description: str = ""

RuleBasedEvaluator

Bases: BaseEvaluator

Evaluator that applies a set of declarative rules with weighted scoring.

Each rule is checked against the agent output. The final score is the weighted average of passing rules.

Attributes:

Name Type Description
rules

List of rule specifications to evaluate.

Source code in src/agentprobe/eval/rules.py
class RuleBasedEvaluator(BaseEvaluator):
    """Evaluator that applies a set of declarative rules with weighted scoring.

    Each rule is checked against the agent output. The final score is
    the weighted average of passing rules. A rule with an unknown
    ``rule_type`` still contributes its weight to the total but never
    passes, so misconfigured rules lower the score instead of being
    silently skipped.

    Attributes:
        rules: List of rule specifications to evaluate.
    """

    def __init__(
        self,
        name: str = "rule-based",
        rules: list[RuleSpec] | None = None,
    ) -> None:
        """Initialize the rule-based evaluator.

        Args:
            name: Evaluator name.
            rules: List of rule specifications. Defaults to empty.
        """
        super().__init__(name)
        self.rules = rules or []

    async def _evaluate(self, test_case: TestCase, trace: Trace) -> EvalResult:
        """Evaluate the trace output against all configured rules.

        Args:
            test_case: The test case that was executed.
            trace: The execution trace to evaluate.

        Returns:
            An evaluation result with weighted score.
        """
        # No rules means nothing can fail: vacuous pass.
        if not self.rules:
            return EvalResult(
                evaluator_name=self.name,
                verdict=EvalVerdict.PASS,
                score=1.0,
                reason="No rules configured — pass by default",
            )

        output = trace.output_text
        total_weight = 0.0
        weighted_score = 0.0
        results: list[dict[str, Any]] = []

        for rule in self.rules:
            handler = _RULE_HANDLERS.get(rule.rule_type)
            if handler is None:
                # Unknown rule types count as failed so the overall score
                # reflects the misconfiguration.
                logger.warning("Unknown rule type: %s", rule.rule_type)
                results.append(
                    {
                        "rule": rule.rule_type,
                        "passed": False,
                        "error": "unknown rule type",
                    }
                )
                total_weight += rule.weight
                continue

            passed = handler(output, rule.params)
            total_weight += rule.weight
            if passed:
                weighted_score += rule.weight

            results.append(
                {
                    "rule": rule.rule_type,
                    "description": rule.description,
                    "passed": passed,
                    "weight": rule.weight,
                }
            )

        score = weighted_score / total_weight if total_weight > 0 else 0.0
        all_passed = all(r["passed"] for r in results)

        # PASS only when every rule passed; PARTIAL when at least half the
        # weight passed; FAIL otherwise.
        _partial_threshold = 0.5
        if all_passed:
            verdict = EvalVerdict.PASS
        elif score >= _partial_threshold:
            verdict = EvalVerdict.PARTIAL
        else:
            verdict = EvalVerdict.FAIL

        # Bug fix: the previous int() truncation misreported fractional
        # weights (e.g. a single rule with weight=0.5 showed "0/0").
        # %g formatting keeps integers clean ("2/3") and fractions exact.
        return EvalResult(
            evaluator_name=self.name,
            verdict=verdict,
            score=score,
            reason=f"{weighted_score:g}/{total_weight:g} rules passed (weighted)",
            metadata={"rule_results": results},
        )

__init__(name='rule-based', rules=None)

Initialize the rule-based evaluator.

Parameters:

Name Type Description Default
name str

Evaluator name.

'rule-based'
rules list[RuleSpec] | None

List of rule specifications. Defaults to empty.

None
Source code in src/agentprobe/eval/rules.py
def __init__(
    self,
    name: str = "rule-based",
    rules: list[RuleSpec] | None = None,
) -> None:
    """Set up the evaluator with an optional list of rules.

    Args:
        name: Evaluator name.
        rules: Rule specifications to apply; an empty list when omitted.
    """
    super().__init__(name)
    self.rules = rules if rules else []

Embedding Evaluator

agentprobe.eval.embedding

Embedding similarity evaluator using cosine similarity.

Compares agent output embeddings against expected output embeddings to produce a similarity score.

EmbeddingSimilarityEvaluator

Bases: BaseEvaluator

Evaluator that compares embeddings via cosine similarity.

Obtains embeddings for expected and actual outputs from an embedding API, then computes cosine similarity. A threshold determines pass/fail.

Attributes:

Name Type Description
model

Embedding model identifier.

provider

API provider ('openai').

threshold

Minimum similarity score to pass.

Source code in src/agentprobe/eval/embedding.py
class EmbeddingSimilarityEvaluator(BaseEvaluator):
    """Evaluator that compares embeddings via cosine similarity.

    Obtains embeddings for expected and actual outputs from an
    embedding API, then computes cosine similarity. A threshold
    determines pass/fail.

    Embeddings are memoized per input text in an in-memory dict.
    NOTE(review): the cache is unbounded — for very long-running
    processes consider bounding it.

    Attributes:
        model: Embedding model identifier.
        provider: API provider ('openai' is the only supported value).
        threshold: Minimum similarity score to pass.
    """

    def __init__(
        self,
        *,
        model: str = "text-embedding-3-small",
        provider: str = "openai",
        api_key: str | None = None,
        threshold: float = 0.8,
        name: str = "embedding-similarity",
    ) -> None:
        """Initialize the embedding similarity evaluator.

        Args:
            model: Embedding model name.
            provider: API provider. Only 'openai' is supported.
            api_key: API key. Read from environment if None.
            threshold: Minimum similarity to pass.
            name: Evaluator name.
        """
        super().__init__(name)
        self.model = model
        self.provider = provider
        self._api_key = api_key
        self.threshold = threshold
        # text -> embedding memo, shared across _evaluate calls.
        self._cache: dict[str, list[float]] = {}

    async def _evaluate(self, test_case: TestCase, trace: Trace) -> EvalResult:
        """Compare embeddings of expected and actual output.

        Args:
            test_case: Test case with expected output.
            trace: Execution trace with actual output.

        Returns:
            Evaluation result based on cosine similarity.
        """
        # Without a reference text there is nothing to compare against.
        if not test_case.expected_output:
            return EvalResult(
                evaluator_name=self.name,
                verdict=EvalVerdict.PASS,
                score=1.0,
                reason="No expected output — skip embedding comparison",
            )

        expected_emb = await self._get_embedding(test_case.expected_output)
        actual_emb = await self._get_embedding(trace.output_text)

        similarity = cosine_similarity(expected_emb, actual_emb)
        # Cosine similarity lies in [-1, 1]; clamp to [0, 1] for the score.
        score = max(0.0, min(1.0, similarity))

        # PARTIAL band: within 75% of the configured threshold.
        if score >= self.threshold:
            verdict = EvalVerdict.PASS
        elif score >= self.threshold * 0.75:
            verdict = EvalVerdict.PARTIAL
        else:
            verdict = EvalVerdict.FAIL

        return EvalResult(
            evaluator_name=self.name,
            verdict=verdict,
            score=score,
            reason=f"Cosine similarity: {similarity:.4f} (threshold: {self.threshold})",
            metadata={"similarity": similarity, "threshold": self.threshold},
        )

    async def _get_embedding(self, text: str) -> list[float]:
        """Get the embedding for a text string, using cache.

        Args:
            text: The text to embed.

        Returns:
            Embedding vector.
        """
        if text in self._cache:
            return self._cache[text]

        embedding = await self._call_embedding_api(text)
        self._cache[text] = embedding
        return embedding

    async def _call_embedding_api(self, text: str) -> list[float]:  # pragma: no cover
        """Call the embedding API.

        Args:
            text: Text to embed.

        Returns:
            Embedding vector.

        Raises:
            EvaluatorError: If the provider is unsupported, the API key is
                missing, or the API call fails.
        """
        import os

        # Bug fix: 'provider' was previously ignored and the OpenAI
        # endpoint was always called — fail loudly for anything else.
        if self.provider != "openai":
            raise EvaluatorError(
                f"Unsupported embedding provider: {self.provider}"
            )

        api_key = self._api_key or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise EvaluatorError("OPENAI_API_KEY not set for embedding API")

        url = "https://api.openai.com/v1/embeddings"
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        payload = {"model": self.model, "input": text}

        _http_ok = 200
        async with (
            aiohttp.ClientSession() as session,
            session.post(url, json=payload, headers=headers) as resp,
        ):
            if resp.status != _http_ok:
                body = await resp.text()
                raise EvaluatorError(f"Embedding API error: {resp.status} — {body}")
            data = await resp.json()
            embedding: list[float] = data["data"][0]["embedding"]
            return embedding

__init__(*, model='text-embedding-3-small', provider='openai', api_key=None, threshold=0.8, name='embedding-similarity')

Initialize the embedding similarity evaluator.

Parameters:

Name Type Description Default
model str

Embedding model name.

'text-embedding-3-small'
provider str

API provider.

'openai'
api_key str | None

API key. Read from environment if None.

None
threshold float

Minimum similarity to pass.

0.8
name str

Evaluator name.

'embedding-similarity'
Source code in src/agentprobe/eval/embedding.py
def __init__(
    self,
    *,
    model: str = "text-embedding-3-small",
    provider: str = "openai",
    api_key: str | None = None,
    threshold: float = 0.8,
    name: str = "embedding-similarity",
) -> None:
    """Create the evaluator and its empty embedding cache.

    Args:
        model: Embedding model name.
        provider: API provider.
        api_key: API key; falls back to the environment when None.
        threshold: Minimum similarity to pass.
        name: Evaluator name.
    """
    super().__init__(name)
    self._api_key = api_key
    self._cache: dict[str, list[float]] = {}
    self.model = model
    self.provider = provider
    self.threshold = threshold

cosine_similarity(vec_a, vec_b)

Compute cosine similarity between two vectors.

Parameters:

Name Type Description Default
vec_a list[float]

First vector.

required
vec_b list[float]

Second vector.

required

Returns:

Type Description
float

Cosine similarity score in [-1.0, 1.0].

Raises:

Type Description
ValueError

If vectors have different lengths or are empty.

Source code in src/agentprobe/eval/embedding.py
def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
    """Compute cosine similarity between two vectors.

    Args:
        vec_a: First vector.
        vec_b: Second vector.

    Returns:
        Cosine similarity score in [-1.0, 1.0].

    Raises:
        ValueError: If vectors have different lengths or are empty.
    """
    if len(vec_a) != len(vec_b):
        msg = f"Vector length mismatch: {len(vec_a)} vs {len(vec_b)}"
        raise ValueError(msg)

    if len(vec_a) == 0:
        msg = "Cannot compute similarity of empty vectors"
        raise ValueError(msg)

    dot = sum(a * b for a, b in zip(vec_a, vec_b, strict=True))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))

    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0

    return dot / (norm_a * norm_b)

Judge Evaluator

agentprobe.eval.llm_judge

Judge evaluator that uses a language model to assess agent outputs.

Sends the agent's output along with a rubric to a judge model and parses the structured JSON response into an EvalResult.

LLMJudge

Bases: BaseEvaluator

Evaluator that uses a language model as a judge.

Calls an external model API (Anthropic or OpenAI) with the agent's output and a rubric, then parses the JSON verdict response.

Attributes:

Name Type Description
model

The judge model identifier.

provider

API provider ('anthropic' or 'openai').

temperature

Sampling temperature for the judge.

max_tokens

Maximum response tokens.

rubric

Evaluation rubric/criteria text.

Source code in src/agentprobe/eval/llm_judge.py
class LLMJudge(BaseEvaluator):
    """Evaluator that uses a language model as a judge.

    Calls an external model API (Anthropic or OpenAI) with the agent's
    output and a rubric, then parses the JSON verdict response.

    Attributes:
        model: The judge model identifier.
        provider: API provider ('anthropic' or 'openai').
        temperature: Sampling temperature for the judge.
        max_tokens: Maximum response tokens.
        rubric: Evaluation rubric/criteria text.
    """

    def __init__(
        self,
        *,
        model: str = "claude-sonnet-4-5-20250929",
        provider: str = "anthropic",
        api_key: str | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
        rubric: str = "",
        name: str = "llm-judge",
    ) -> None:
        """Initialize the judge evaluator.

        Args:
            model: Judge model identifier.
            provider: API provider name.
            api_key: API key. Read from environment if None.
            temperature: Sampling temperature.
            max_tokens: Max response tokens.
            rubric: Evaluation criteria text.
            name: Evaluator name.
        """
        super().__init__(name)
        self.model = model
        self.provider = provider
        self._api_key = api_key
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.rubric = rubric

    async def _evaluate(self, test_case: TestCase, trace: Trace) -> EvalResult:
        """Send the output to the judge model and parse the verdict.

        Args:
            test_case: The test case.
            trace: The execution trace.

        Returns:
            Parsed evaluation result from the judge.
        """
        prompt = self._build_prompt(test_case, trace)
        response_text = await self._call_api(prompt)
        return self._parse_response(response_text)

    def _build_prompt(self, test_case: TestCase, trace: Trace) -> str:
        """Build the evaluation prompt for the judge.

        Args:
            test_case: The test case with expectations.
            trace: The execution trace with the output.

        Returns:
            Formatted prompt string.
        """
        parts = [f"## Agent Input\n{test_case.input_text}"]

        # Expected output and rubric sections are optional.
        if test_case.expected_output:
            parts.append(f"## Expected Output\n{test_case.expected_output}")

        parts.append(f"## Actual Output\n{trace.output_text}")

        if self.rubric:
            parts.append(f"## Evaluation Criteria\n{self.rubric}")

        return "\n\n".join(parts)

    async def _call_api(self, prompt: str) -> str:
        """Call the judge model API.

        Args:
            prompt: The evaluation prompt.

        Returns:
            Raw response text from the judge.

        Raises:
            JudgeAPIError: If the API call fails or the provider is unknown.
        """
        if self.provider == "anthropic":
            return await self._call_anthropic(prompt)
        elif self.provider == "openai":
            return await self._call_openai(prompt)
        else:
            raise JudgeAPIError(self.model, 0, f"Unknown provider: {self.provider}")

    async def _call_anthropic(self, prompt: str) -> str:  # pragma: no cover
        """Call the Anthropic Messages API.

        Args:
            prompt: The evaluation prompt.

        Returns:
            Response text.

        Raises:
            JudgeAPIError: If the key is missing or the API returns non-200.
        """
        import os

        api_key = self._api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not api_key:
            raise JudgeAPIError(self.model, 0, "ANTHROPIC_API_KEY not set")

        url = "https://api.anthropic.com/v1/messages"
        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        }
        payload = {
            "model": self.model,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "system": _DEFAULT_SYSTEM_PROMPT,
            "messages": [{"role": "user", "content": prompt}],
        }

        _http_ok = 200
        async with (
            aiohttp.ClientSession() as session,
            session.post(url, json=payload, headers=headers) as resp,
        ):
            if resp.status != _http_ok:
                body = await resp.text()
                raise JudgeAPIError(self.model, resp.status, body)
            data = await resp.json()
            return str(data["content"][0]["text"])

    async def _call_openai(self, prompt: str) -> str:  # pragma: no cover
        """Call the OpenAI Chat Completions API.

        Args:
            prompt: The evaluation prompt.

        Returns:
            Response text.

        Raises:
            JudgeAPIError: If the key is missing or the API returns non-200.
        """
        import os

        api_key = self._api_key or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise JudgeAPIError(self.model, 0, "OPENAI_API_KEY not set")

        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "messages": [
                {"role": "system", "content": _DEFAULT_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
        }

        _http_ok = 200
        async with (
            aiohttp.ClientSession() as session,
            session.post(url, json=payload, headers=headers) as resp,
        ):
            if resp.status != _http_ok:
                body = await resp.text()
                raise JudgeAPIError(self.model, resp.status, body)
            data = await resp.json()
            return str(data["choices"][0]["message"]["content"])

    def _parse_response(self, response_text: str) -> EvalResult:
        """Parse the judge's JSON response into an EvalResult.

        Tolerates judges that wrap the JSON in prose by extracting the
        outermost ``{...}`` span. Malformed responses degrade to an
        ERROR verdict rather than raising.

        Args:
            response_text: Raw response text.

        Returns:
            Parsed EvalResult.
        """
        try:
            data = json.loads(response_text)
        except json.JSONDecodeError:
            # Fall back to extracting the first-to-last brace span.
            cleaned = response_text.strip()
            start = cleaned.find("{")
            end = cleaned.rfind("}") + 1
            if start >= 0 and end > start:
                try:
                    data = json.loads(cleaned[start:end])
                except json.JSONDecodeError:
                    return EvalResult(
                        evaluator_name=self.name,
                        verdict=EvalVerdict.ERROR,
                        score=0.0,
                        reason=f"Failed to parse judge response: {response_text[:200]}",
                    )
            else:
                return EvalResult(
                    evaluator_name=self.name,
                    verdict=EvalVerdict.ERROR,
                    score=0.0,
                    reason=f"No JSON found in judge response: {response_text[:200]}",
                )

        # Bug fix: valid JSON that is not an object (e.g. a bare list or
        # string) previously raised AttributeError on .get().
        if not isinstance(data, dict):
            return EvalResult(
                evaluator_name=self.name,
                verdict=EvalVerdict.ERROR,
                score=0.0,
                reason=f"Judge response is not a JSON object: {response_text[:200]}",
            )

        verdict_str = str(data.get("verdict", "error")).lower()
        verdict_map = {
            "pass": EvalVerdict.PASS,
            "fail": EvalVerdict.FAIL,
            "partial": EvalVerdict.PARTIAL,
        }
        verdict = verdict_map.get(verdict_str, EvalVerdict.ERROR)

        # Bug fix: a non-numeric score (e.g. "high" or null) previously
        # raised ValueError/TypeError; treat it as 0.0 instead.
        try:
            score = float(data.get("score", 0.0))
        except (TypeError, ValueError):
            score = 0.0
        score = max(0.0, min(1.0, score))

        reason = str(data.get("reason", ""))

        return EvalResult(
            evaluator_name=self.name,
            verdict=verdict,
            score=score,
            reason=reason,
        )

__init__(*, model='claude-sonnet-4-5-20250929', provider='anthropic', api_key=None, temperature=0.0, max_tokens=1024, rubric='', name='llm-judge')

Initialize the judge evaluator.

Parameters:

Name Type Description Default
model str

Judge model identifier.

'claude-sonnet-4-5-20250929'
provider str

API provider name.

'anthropic'
api_key str | None

API key. Read from environment if None.

None
temperature float

Sampling temperature.

0.0
max_tokens int

Max response tokens.

1024
rubric str

Evaluation criteria text.

''
name str

Evaluator name.

'llm-judge'
Source code in src/agentprobe/eval/llm_judge.py
def __init__(
    self,
    *,
    model: str = "claude-sonnet-4-5-20250929",
    provider: str = "anthropic",
    api_key: str | None = None,
    temperature: float = 0.0,
    max_tokens: int = 1024,
    rubric: str = "",
    name: str = "llm-judge",
) -> None:
    """Configure the judge model, provider, and sampling settings.

    Args:
        model: Judge model identifier.
        provider: API provider name.
        api_key: API key; falls back to the environment when None.
        temperature: Sampling temperature.
        max_tokens: Max response tokens.
        rubric: Evaluation criteria text.
        name: Evaluator name.
    """
    super().__init__(name)
    self._api_key = api_key
    self.model = model
    self.provider = provider
    self.temperature = temperature
    self.max_tokens = max_tokens
    self.rubric = rubric

Statistical Evaluator

agentprobe.eval.statistical

Statistical evaluator for repeated evaluation with aggregated metrics.

Wraps an inner evaluator and runs it multiple times across pre-collected traces, computing mean, standard deviation, percentiles, and confidence intervals from the score distribution.

StatisticalEvaluator

Bases: BaseEvaluator

Evaluator that runs an inner evaluator multiple times and aggregates stats.

Wraps another evaluator and runs it against multiple traces for the same test case, computing distributional statistics on the resulting scores.

Attributes:

Name Type Description
inner BaseEvaluator

The wrapped evaluator instance.

pass_threshold float

Minimum mean score to consider a pass.

Source code in src/agentprobe/eval/statistical.py
class StatisticalEvaluator(BaseEvaluator):
    """Run a wrapped evaluator across many traces and aggregate the scores.

    Single-trace evaluation is delegated to the inner evaluator;
    ``evaluate_multiple()`` computes distributional statistics (mean,
    standard deviation, median, percentiles, 95% confidence interval)
    over the per-trace scores.

    Attributes:
        inner: The wrapped evaluator instance.
        pass_threshold: Minimum mean score to consider a pass.
    """

    def __init__(
        self,
        inner: BaseEvaluator,
        *,
        name: str | None = None,
        pass_threshold: float = 0.7,
    ) -> None:
        """Wrap ``inner`` for repeated evaluation.

        Args:
            inner: The evaluator to wrap and run repeatedly.
            name: Optional name override. Defaults to 'statistical-{inner.name}'.
            pass_threshold: Minimum mean score for a pass verdict.
        """
        super().__init__(name or f"statistical-{inner.name}")
        self._inner = inner
        self._pass_threshold = pass_threshold

    @property
    def inner(self) -> BaseEvaluator:
        """Return the wrapped evaluator."""
        return self._inner

    async def _evaluate(self, test_case: TestCase, trace: Trace) -> EvalResult:
        """Delegate a single-trace evaluation to the wrapped evaluator.

        For statistical analysis, use ``evaluate_multiple()`` instead.

        Args:
            test_case: The test case.
            trace: A single trace to evaluate.

        Returns:
            The inner evaluator's result.
        """
        return await self._inner.evaluate(test_case, trace)

    async def evaluate_multiple(
        self,
        test_case: TestCase,
        traces: Sequence[Trace],
    ) -> StatisticalSummary:
        """Score every trace with the inner evaluator and summarize.

        Args:
            test_case: The test case specification.
            traces: Pre-collected traces to evaluate.

        Returns:
            A statistical summary of the score distribution.
        """
        scores = [
            (await self._inner.evaluate(test_case, trace)).score
            for trace in traces
        ]

        # Degenerate all-zero summary when no traces were supplied.
        if not scores:
            return StatisticalSummary(
                evaluator_name=self.name,
                sample_count=1,
                scores=(0.0,),
                mean=0.0,
                std_dev=0.0,
                median=0.0,
                p5=0.0,
                p95=0.0,
                ci_lower=0.0,
                ci_upper=0.0,
            )

        count = len(scores)
        avg = statistics.mean(scores)
        spread = statistics.stdev(scores) if count > 1 else 0.0

        ordered = sorted(scores)
        mid = statistics.median(ordered)
        low_tail = _percentile(ordered, 5)
        high_tail = _percentile(ordered, 95)

        # 95% CI via normal approximation (fixed z of 1.96), clamped
        # to [0, 1]; a single sample gives a point interval.
        if count > 1:
            std_err = spread / math.sqrt(count)
            t_val = 1.96
            lo = max(0.0, avg - t_val * std_err)
            hi = min(1.0, avg + t_val * std_err)
        else:
            lo = avg
            hi = avg

        return StatisticalSummary(
            evaluator_name=self.name,
            sample_count=count,
            scores=tuple(scores),
            mean=round(avg, 6),
            std_dev=round(spread, 6),
            median=round(mid, 6),
            p5=round(low_tail, 6),
            p95=round(high_tail, 6),
            ci_lower=round(lo, 6),
            ci_upper=round(hi, 6),
        )

    def summary_to_eval_result(self, summary: StatisticalSummary) -> EvalResult:
        """Convert a statistical summary into a standard EvalResult.

        Args:
            summary: The summary to convert.

        Returns:
            An EvalResult with the mean score and appropriate verdict.
        """
        mean_score = summary.mean
        # PASS at/above the configured threshold; PARTIAL down to 0.5.
        if mean_score >= self._pass_threshold:
            verdict = EvalVerdict.PASS
        else:
            verdict = EvalVerdict.PARTIAL if mean_score >= 0.5 else EvalVerdict.FAIL

        stats_meta = {
            "std_dev": summary.std_dev,
            "median": summary.median,
            "p5": summary.p5,
            "p95": summary.p95,
            "ci_lower": summary.ci_lower,
            "ci_upper": summary.ci_upper,
            "sample_count": summary.sample_count,
        }

        return EvalResult(
            evaluator_name=self.name,
            verdict=verdict,
            score=mean_score,
            reason=(
                f"Statistical: mean={summary.mean:.3f}, "
                f"std={summary.std_dev:.3f}, n={summary.sample_count}"
            ),
            metadata=stats_meta,
        )

inner property

Return the wrapped evaluator.

__init__(inner, *, name=None, pass_threshold=0.7)

Initialize the statistical evaluator.

Parameters:

Name Type Description Default
inner BaseEvaluator

The evaluator to wrap and run repeatedly.

required
name str | None

Optional name override. Defaults to 'statistical-{inner.name}'.

None
pass_threshold float

Minimum mean score for a pass verdict.

0.7
Source code in src/agentprobe/eval/statistical.py
def __init__(
    self,
    inner: BaseEvaluator,
    *,
    name: str | None = None,
    pass_threshold: float = 0.7,
) -> None:
    """Wrap ``inner`` for repeated statistical evaluation.

    Args:
        inner: The evaluator to wrap and run repeatedly.
        name: Optional name override. Defaults to 'statistical-{inner.name}'.
        pass_threshold: Minimum mean score for a pass verdict.
    """
    super().__init__(name or f"statistical-{inner.name}")
    self._inner = inner
    self._pass_threshold = pass_threshold

evaluate_multiple(test_case, traces) async

Evaluate multiple traces and compute aggregate statistics.

Runs the inner evaluator on each trace, collects scores, and computes mean, standard deviation, median, percentiles, and a 95% confidence interval.

Parameters:

Name Type Description Default
test_case TestCase

The test case specification.

required
traces Sequence[Trace]

Pre-collected traces to evaluate.

required

Returns:

Type Description
StatisticalSummary

A statistical summary of the score distribution.

Source code in src/agentprobe/eval/statistical.py
async def evaluate_multiple(
    self,
    test_case: TestCase,
    traces: Sequence[Trace],
) -> StatisticalSummary:
    """Score every trace with the inner evaluator and summarize.

    Computes mean, standard deviation, median, 5th/95th percentiles,
    and a 95% confidence interval over the per-trace scores.

    Args:
        test_case: The test case specification.
        traces: Pre-collected traces to evaluate.

    Returns:
        A statistical summary of the score distribution.
    """
    scores = [
        (await self._inner.evaluate(test_case, trace)).score
        for trace in traces
    ]

    # Degenerate all-zero summary when no traces were supplied.
    if not scores:
        return StatisticalSummary(
            evaluator_name=self.name,
            sample_count=1,
            scores=(0.0,),
            mean=0.0,
            std_dev=0.0,
            median=0.0,
            p5=0.0,
            p95=0.0,
            ci_lower=0.0,
            ci_upper=0.0,
        )

    count = len(scores)
    avg = statistics.mean(scores)
    spread = statistics.stdev(scores) if count > 1 else 0.0

    ordered = sorted(scores)
    mid = statistics.median(ordered)
    low_tail = _percentile(ordered, 5)
    high_tail = _percentile(ordered, 95)

    # 95% CI via normal approximation (fixed z of 1.96), clamped to
    # [0, 1]; a single sample gives a point interval.
    if count > 1:
        std_err = spread / math.sqrt(count)
        t_val = 1.96
        lo = max(0.0, avg - t_val * std_err)
        hi = min(1.0, avg + t_val * std_err)
    else:
        lo = avg
        hi = avg

    return StatisticalSummary(
        evaluator_name=self.name,
        sample_count=count,
        scores=tuple(scores),
        mean=round(avg, 6),
        std_dev=round(spread, 6),
        median=round(mid, 6),
        p5=round(low_tail, 6),
        p95=round(high_tail, 6),
        ci_lower=round(lo, 6),
        ci_upper=round(hi, 6),
    )

summary_to_eval_result(summary)

Convert a statistical summary into a standard EvalResult.

Parameters:

Name Type Description Default
summary StatisticalSummary

The summary to convert.

required

Returns:

Type Description
EvalResult

An EvalResult with the mean score and appropriate verdict.

Source code in src/agentprobe/eval/statistical.py
def summary_to_eval_result(self, summary: StatisticalSummary) -> EvalResult:
    """Translate a statistical summary into a standard EvalResult.

    The summary's mean score becomes the result score; the verdict is
    derived by comparing that mean against the pass threshold and a
    fixed partial threshold of 0.5.

    Args:
        summary: The summary to convert.

    Returns:
        An EvalResult with the mean score and appropriate verdict.
    """
    partial_cutoff = 0.5
    mean_score = summary.mean

    if mean_score >= self._pass_threshold:
        outcome = EvalVerdict.PASS
    elif mean_score >= partial_cutoff:
        outcome = EvalVerdict.PARTIAL
    else:
        outcome = EvalVerdict.FAIL

    # Surface the key statistics in the human-readable reason string.
    reason_text = (
        f"Statistical: mean={summary.mean:.3f}, "
        f"std={summary.std_dev:.3f}, n={summary.sample_count}"
    )
    # Expose the full distribution details for downstream reporting.
    extra = {
        "std_dev": summary.std_dev,
        "median": summary.median,
        "p5": summary.p5,
        "p95": summary.p95,
        "ci_lower": summary.ci_lower,
        "ci_upper": summary.ci_upper,
        "sample_count": summary.sample_count,
    }
    return EvalResult(
        evaluator_name=self.name,
        verdict=outcome,
        score=mean_score,
        reason=reason_text,
        metadata=extra,
    )

Trace Comparison Evaluator

agentprobe.eval.trace_compare

Trace comparison evaluator with weighted multi-dimension scoring.

Compares two traces across tool sequences, tool parameters, output similarity, and cost deviation, producing a weighted composite score.

TraceComparisonEvaluator

Bases: BaseEvaluator

Evaluator that compares a trace against a reference trace.

Computes similarity across multiple dimensions with configurable weights: tool sequence, tool parameters, output text, and cost.

Attributes:

Name Type Description
reference_trace

The reference trace to compare against.

weights

Per-dimension weight configuration.

Source code in src/agentprobe/eval/trace_compare.py
class TraceComparisonEvaluator(BaseEvaluator):
    """Evaluator that scores a trace by similarity to a reference trace.

    Four dimensions are compared — tool-call sequence, tool parameter
    keys, output text, and token cost — and blended into one composite
    score using configurable per-dimension weights.

    Attributes:
        reference_trace: The reference trace to compare against.
        weights: Per-dimension weight configuration.
    """

    DEFAULT_WEIGHTS: ClassVar[dict[str, float]] = {
        "tool_sequence": 0.3,
        "tool_parameters": 0.2,
        "output_similarity": 0.35,
        "cost_deviation": 0.15,
    }

    def __init__(
        self,
        reference_trace: Trace,
        *,
        name: str = "trace-compare",
        weights: dict[str, float] | None = None,
        pass_threshold: float = 0.7,
    ) -> None:
        """Initialize the trace comparison evaluator.

        Args:
            reference_trace: The baseline trace to compare against.
            name: Evaluator name.
            weights: Dimension weight overrides.
            pass_threshold: Minimum score for a pass verdict.
        """
        super().__init__(name)
        self._reference = reference_trace
        # An empty/None override falls back to a copy of the defaults.
        self._weights = weights or dict(self.DEFAULT_WEIGHTS)
        self._pass_threshold = pass_threshold

    async def _evaluate(self, test_case: TestCase, trace: Trace) -> EvalResult:
        """Score the trace's similarity to the stored reference.

        Args:
            test_case: The test case (used for context).
            trace: The current trace to compare.

        Returns:
            An evaluation result with the composite similarity score.
        """
        dims: dict[str, float] = {}

        # Levenshtein distance over the ordered tool-name sequences.
        baseline_tools = [call.tool_name for call in self._reference.tool_calls]
        observed_tools = [call.tool_name for call in trace.tool_calls]
        dims["tool_sequence"] = _levenshtein_similarity(baseline_tools, observed_tools)

        # Jaccard overlap of the parameter-key sets across all tool calls.
        dims["tool_parameters"] = _jaccard_similarity(
            _collect_param_keys(self._reference), _collect_param_keys(trace)
        )

        # Word-level Jaccard overlap of the two output texts.
        dims["output_similarity"] = _keyword_overlap(
            self._reference.output_text, trace.output_text
        )

        # Token-cost ratio: 1.0 when identical, shrinking as they diverge.
        ref_tokens = self._reference.total_input_tokens + self._reference.total_output_tokens
        cur_tokens = trace.total_input_tokens + trace.total_output_tokens
        if ref_tokens > 0:
            token_ratio = min(cur_tokens, ref_tokens) / max(cur_tokens, ref_tokens)
        else:
            # Zero-token reference: perfect match only if the trace is also zero.
            token_ratio = 1.0 if cur_tokens == 0 else 0.0
        dims["cost_deviation"] = token_ratio

        # Blend the dimensions; unknown dimensions get weight 0.
        weight_sum = 0.0
        weighted_total = 0.0
        for dim_name, dim_score in dims.items():
            w = self._weights.get(dim_name, 0.0)
            weight_sum += w
            weighted_total += dim_score * w
        raw_score = weighted_total / weight_sum if weight_sum > 0 else 0.0
        final_score = round(min(max(raw_score, 0.0), 1.0), 4)

        partial_cutoff = 0.5
        if final_score >= self._pass_threshold:
            verdict = EvalVerdict.PASS
        elif final_score >= partial_cutoff:
            verdict = EvalVerdict.PARTIAL
        else:
            verdict = EvalVerdict.FAIL

        return EvalResult(
            evaluator_name=self.name,
            verdict=verdict,
            score=final_score,
            reason=f"Trace comparison: {final_score:.3f} ({_format_scores(dims)})",
            metadata={"dimension_scores": dims, "weights": self._weights},
        )

__init__(reference_trace, *, name='trace-compare', weights=None, pass_threshold=0.7)

Initialize the trace comparison evaluator.

Parameters:

Name Type Description Default
reference_trace Trace

The baseline trace to compare against.

required
name str

Evaluator name.

'trace-compare'
weights dict[str, float] | None

Dimension weight overrides.

None
pass_threshold float

Minimum score for a pass verdict.

0.7
Source code in src/agentprobe/eval/trace_compare.py
def __init__(
    self,
    reference_trace: Trace,
    *,
    name: str = "trace-compare",
    weights: dict[str, float] | None = None,
    pass_threshold: float = 0.7,
) -> None:
    """Set up the trace comparison evaluator.

    Args:
        reference_trace: The baseline trace to compare against.
        name: Evaluator name.
        weights: Dimension weight overrides.
        pass_threshold: Minimum score for a pass verdict.
    """
    super().__init__(name)
    self._pass_threshold = pass_threshold
    self._reference = reference_trace
    # An empty/None override falls back to a copy of the defaults.
    self._weights = dict(self.DEFAULT_WEIGHTS) if not weights else weights

Base Evaluator

agentprobe.eval.base

Abstract base evaluator with template-method pattern.

Subclasses implement _evaluate() while the base class handles timing, error wrapping, and consistent result construction.

BaseEvaluator

Bases: ABC

Abstract base class for all evaluators.

Provides a public evaluate() template method that delegates to the subclass-defined _evaluate(), adding timing and error handling.

Attributes:

Name Type Description
_name

The evaluator's name, used in results and logging.

Source code in src/agentprobe/eval/base.py
class BaseEvaluator(ABC):
    """Abstract base class for all evaluators.

    Exposes a public ``evaluate()`` template method that wraps the
    subclass-defined ``_evaluate()`` with timing and error handling.

    Attributes:
        _name: The evaluator's name, used in results and logging.
    """

    def __init__(self, name: str) -> None:
        """Initialize the evaluator.

        Args:
            name: A unique name identifying this evaluator instance.
        """
        self._name = name

    @property
    def name(self) -> str:
        """Return the evaluator name."""
        return self._name

    async def evaluate(self, test_case: TestCase, trace: Trace) -> EvalResult:
        """Evaluate an agent trace for a given test case.

        Times the underlying ``_evaluate()`` call and converts any
        unexpected exception into an ERROR result, so callers always
        receive a well-formed EvalResult. EvaluatorError propagates
        unchanged.

        Args:
            test_case: The test case that was executed.
            trace: The execution trace to evaluate.

        Returns:
            An evaluation result with score and verdict.
        """
        started_at = time.monotonic()
        try:
            outcome = await self._evaluate(test_case, trace)
        except EvaluatorError:
            # Framework-level errors are the caller's problem to handle.
            raise
        except Exception as exc:
            duration_ms = int((time.monotonic() - started_at) * 1000)
            logger.error(
                "Evaluator '%s' failed for test '%s': %s",
                self._name,
                test_case.name,
                exc,
            )
            return EvalResult(
                evaluator_name=self._name,
                verdict=EvalVerdict.ERROR,
                score=0.0,
                reason=f"Evaluation error: {exc}",
                metadata={"duration_ms": duration_ms},
            )
        duration_ms = int((time.monotonic() - started_at) * 1000)
        logger.debug(
            "Evaluator '%s' completed for test '%s' in %dms: %s (%.2f)",
            self._name,
            test_case.name,
            duration_ms,
            outcome.verdict.value,
            outcome.score,
        )
        return outcome

    @abstractmethod
    async def _evaluate(self, test_case: TestCase, trace: Trace) -> EvalResult:
        """Perform the actual evaluation logic.

        Subclasses must implement this method.

        Args:
            test_case: The test case that was executed.
            trace: The execution trace to evaluate.

        Returns:
            An evaluation result with score and verdict.
        """
        ...

name property

Return the evaluator name.

__init__(name)

Initialize the evaluator.

Parameters:

Name Type Description Default
name str

A unique name identifying this evaluator instance.

required
Source code in src/agentprobe/eval/base.py
def __init__(self, name: str) -> None:
    """Initialize the evaluator.

    Args:
        name: A unique name identifying this evaluator instance.
    """
    self._name = name

evaluate(test_case, trace) async

Evaluate an agent trace for a given test case.

This template method times the evaluation, catches errors, and ensures a consistent EvalResult is always returned.

Parameters:

Name Type Description Default
test_case TestCase

The test case that was executed.

required
trace Trace

The execution trace to evaluate.

required

Returns:

Type Description
EvalResult

An evaluation result with score and verdict.

Source code in src/agentprobe/eval/base.py
async def evaluate(self, test_case: TestCase, trace: Trace) -> EvalResult:
    """Evaluate an agent trace for a given test case.

    Times the underlying ``_evaluate()`` call and converts any
    unexpected exception into an ERROR result, so callers always
    receive a well-formed EvalResult. EvaluatorError propagates
    unchanged.

    Args:
        test_case: The test case that was executed.
        trace: The execution trace to evaluate.

    Returns:
        An evaluation result with score and verdict.
    """
    started_at = time.monotonic()
    try:
        outcome = await self._evaluate(test_case, trace)
    except EvaluatorError:
        # Framework-level errors are the caller's problem to handle.
        raise
    except Exception as exc:
        duration_ms = int((time.monotonic() - started_at) * 1000)
        logger.error(
            "Evaluator '%s' failed for test '%s': %s",
            self._name,
            test_case.name,
            exc,
        )
        return EvalResult(
            evaluator_name=self._name,
            verdict=EvalVerdict.ERROR,
            score=0.0,
            reason=f"Evaluation error: {exc}",
            metadata={"duration_ms": duration_ms},
        )
    duration_ms = int((time.monotonic() - started_at) * 1000)
    logger.debug(
        "Evaluator '%s' completed for test '%s' in %dms: %s (%.2f)",
        self._name,
        test_case.name,
        duration_ms,
        outcome.verdict.value,
        outcome.score,
    )
    return outcome