Skip to content

Metrics

Metric collection, aggregation, and trend analysis.

Collector

agentprobe.metrics.collector

Stateless metric collector that extracts measurements from traces and results.

Converts traces, test results, and agent runs into MetricValue instances for storage and analysis.

MetricCollector

Extracts metric values from traces, results, and runs.

Stateless: receives objects and returns lists of MetricValue. Does not store or persist anything.

Source code in src/agentprobe/metrics/collector.py
class MetricCollector:
    """Extracts metric values from traces, results, and runs.

    Stateless: receives objects and returns lists of MetricValue.
    Does not store or persist anything.
    """

    def collect_from_trace(self, trace: Trace) -> list[MetricValue]:
        """Extract metric values from a single trace.

        Collects latency, tool call count, and response length metrics.

        Args:
            trace: The execution trace to extract metrics from.

        Returns:
            A list of metric values extracted from the trace.
        """
        timestamp = datetime.now(UTC)
        tag_tuple = tuple(trace.tags)

        # (metric name, numeric value) pairs measured from this trace.
        measurements = (
            ("latency_ms", float(trace.total_latency_ms)),
            ("tool_call_count", float(len(trace.tool_calls))),
            ("response_length", float(len(trace.output_text))),
        )

        # Build a fresh metadata dict per value so instances never share state.
        return [
            MetricValue(
                metric_name=name,
                value=measured,
                tags=tag_tuple,
                metadata={"trace_id": trace.trace_id, "agent_name": trace.agent_name},
                timestamp=timestamp,
            )
            for name, measured in measurements
        ]

    def collect_from_result(self, result: TestResult) -> list[MetricValue]:
        """Extract metric values from a test result.

        Collects latency, eval score, and any trace-level metrics.

        Args:
            result: The test result to extract metrics from.

        Returns:
            A list of metric values extracted from the result.
        """
        timestamp = datetime.now(UTC)

        collected = [
            MetricValue(
                metric_name="latency_ms",
                value=float(result.duration_ms),
                metadata={"test_name": result.test_name, "result_id": result.result_id},
                timestamp=timestamp,
            ),
            MetricValue(
                metric_name="eval_score",
                value=result.score,
                metadata={"test_name": result.test_name, "result_id": result.result_id},
                timestamp=timestamp,
            ),
        ]

        # A result may carry its execution trace; fold those metrics in too.
        if result.trace is not None:
            collected.extend(self.collect_from_trace(result.trace))

        return collected

    def collect_from_run(self, run: AgentRun) -> list[MetricValue]:
        """Extract metric values from a complete agent run.

        Collects pass rate plus per-result metrics for all results.

        Args:
            run: The agent run to extract metrics from.

        Returns:
            A list of metric values extracted from the run.
        """
        timestamp = datetime.now(UTC)

        # Empty runs report a 0.0 pass rate rather than dividing by zero.
        if run.total_tests > 0:
            passed = sum(r.status == TestStatus.PASSED for r in run.test_results)
            rate = passed / run.total_tests
        else:
            rate = 0.0

        collected = [
            MetricValue(
                metric_name="pass_rate",
                value=rate,
                metadata={"run_id": run.run_id, "agent_name": run.agent_name},
                timestamp=timestamp,
            )
        ]

        for result in run.test_results:
            collected.extend(self.collect_from_result(result))

        return collected

collect_from_trace(trace)

Extract metric values from a single trace.

Collects latency, tool call count, and response length metrics.

Parameters:

Name Type Description Default
trace Trace

The execution trace to extract metrics from.

required

Returns:

Type Description
list[MetricValue]

A list of metric values extracted from the trace.

Source code in src/agentprobe/metrics/collector.py
def collect_from_trace(self, trace: Trace) -> list[MetricValue]:
    """Extract metric values from a single trace.

    Collects latency, tool call count, and response length metrics.

    Args:
        trace: The execution trace to extract metrics from.

    Returns:
        A list of metric values extracted from the trace.
    """
    timestamp = datetime.now(UTC)
    tag_tuple = tuple(trace.tags)

    # (metric name, numeric value) pairs measured from this trace.
    measurements = (
        ("latency_ms", float(trace.total_latency_ms)),
        ("tool_call_count", float(len(trace.tool_calls))),
        ("response_length", float(len(trace.output_text))),
    )

    # Build a fresh metadata dict per value so instances never share state.
    return [
        MetricValue(
            metric_name=name,
            value=measured,
            tags=tag_tuple,
            metadata={"trace_id": trace.trace_id, "agent_name": trace.agent_name},
            timestamp=timestamp,
        )
        for name, measured in measurements
    ]

collect_from_result(result)

Extract metric values from a test result.

Collects latency, eval score, and any trace-level metrics.

Parameters:

Name Type Description Default
result TestResult

The test result to extract metrics from.

required

Returns:

Type Description
list[MetricValue]

A list of metric values extracted from the result.

Source code in src/agentprobe/metrics/collector.py
def collect_from_result(self, result: TestResult) -> list[MetricValue]:
    """Extract metric values from a test result.

    Collects latency, eval score, and any trace-level metrics.

    Args:
        result: The test result to extract metrics from.

    Returns:
        A list of metric values extracted from the result.
    """
    timestamp = datetime.now(UTC)

    collected = [
        MetricValue(
            metric_name="latency_ms",
            value=float(result.duration_ms),
            metadata={"test_name": result.test_name, "result_id": result.result_id},
            timestamp=timestamp,
        ),
        MetricValue(
            metric_name="eval_score",
            value=result.score,
            metadata={"test_name": result.test_name, "result_id": result.result_id},
            timestamp=timestamp,
        ),
    ]

    # A result may carry its execution trace; fold those metrics in too.
    if result.trace is not None:
        collected.extend(self.collect_from_trace(result.trace))

    return collected

collect_from_run(run)

Extract metric values from a complete agent run.

Collects pass rate plus per-result metrics for all results.

Parameters:

Name Type Description Default
run AgentRun

The agent run to extract metrics from.

required

Returns:

Type Description
list[MetricValue]

A list of metric values extracted from the run.

Source code in src/agentprobe/metrics/collector.py
def collect_from_run(self, run: AgentRun) -> list[MetricValue]:
    """Extract metric values from a complete agent run.

    Collects pass rate plus per-result metrics for all results.

    Args:
        run: The agent run to extract metrics from.

    Returns:
        A list of metric values extracted from the run.
    """
    timestamp = datetime.now(UTC)

    # Empty runs report a 0.0 pass rate rather than dividing by zero.
    if run.total_tests > 0:
        passed = sum(r.status == TestStatus.PASSED for r in run.test_results)
        rate = passed / run.total_tests
    else:
        rate = 0.0

    collected = [
        MetricValue(
            metric_name="pass_rate",
            value=rate,
            metadata={"run_id": run.run_id, "agent_name": run.agent_name},
            timestamp=timestamp,
        )
    ]

    for result in run.test_results:
        collected.extend(self.collect_from_result(result))

    return collected

Aggregator

agentprobe.metrics.aggregator

Metric aggregation: computes statistical summaries from metric values.

Uses stdlib statistics module for calculations — no numpy dependency.

MetricAggregator

Computes statistical aggregations over collections of metric values.

Supports mean, median, min, max, p95, p99, and standard deviation. All computations use the stdlib statistics module.

Source code in src/agentprobe/metrics/aggregator.py
class MetricAggregator:
    """Computes statistical aggregations over collections of metric values.

    Supports mean, median, min, max, p95, p99, and standard deviation.
    All computations use the stdlib ``statistics`` module.
    """

    def aggregate(self, values: list[MetricValue]) -> MetricAggregation:
        """Aggregate a list of metric values into summary statistics.

        All values must share the same metric_name.

        Args:
            values: List of metric values to aggregate.

        Returns:
            A MetricAggregation with computed statistics.

        Raises:
            MetricsError: If values is empty or metric names are inconsistent.
        """
        if not values:
            raise MetricsError("Cannot aggregate empty metric list")

        distinct = {v.metric_name for v in values}
        if len(distinct) > 1:
            raise MetricsError(f"Cannot aggregate mixed metrics: {', '.join(sorted(distinct))}")

        return self._compute_stats(values[0].metric_name, [v.value for v in values])

    def aggregate_by_name(self, values: list[MetricValue]) -> dict[str, MetricAggregation]:
        """Group metric values by name and aggregate each group.

        Args:
            values: List of metric values (may contain multiple metric names).

        Returns:
            A dictionary mapping metric names to their aggregations.

        Raises:
            MetricsError: If values is empty.
        """
        if not values:
            raise MetricsError("Cannot aggregate empty metric list")

        buckets: dict[str, list[float]] = {}
        for item in values:
            buckets.setdefault(item.metric_name, []).append(item.value)

        return {name: self._compute_stats(name, nums) for name, nums in buckets.items()}

    def _compute_stats(self, metric_name: str, raw: list[float]) -> MetricAggregation:
        """Compute statistics for a list of numeric values.

        Args:
            metric_name: The metric name for the aggregation.
            raw: Raw numeric values to aggregate.

        Returns:
            A MetricAggregation with computed statistics.
        """
        ordered = sorted(raw)
        count = len(ordered)
        # Sample stdev needs at least _MIN_STDEV_SAMPLES points; report 0.0 below that.
        spread = statistics.stdev(raw) if count >= _MIN_STDEV_SAMPLES else 0.0

        return MetricAggregation(
            metric_name=metric_name,
            count=count,
            mean=statistics.mean(raw),
            median=statistics.median(raw),
            min_value=ordered[0],
            max_value=ordered[-1],
            p95=self._percentile(ordered, 0.95),
            p99=self._percentile(ordered, 0.99),
            std_dev=spread,
        )

    @staticmethod
    def _percentile(sorted_data: list[float], pct: float) -> float:
        """Compute a percentile using linear interpolation.

        Args:
            sorted_data: Pre-sorted list of values.
            pct: Percentile as a fraction (e.g. 0.95 for 95th).

        Returns:
            The interpolated percentile value.
        """
        count = len(sorted_data)
        if count == 1:
            return sorted_data[0]

        position = pct * (count - 1)
        lo = math.floor(position)
        hi = math.ceil(position)

        if lo == hi:
            # Landed exactly on a sample; no interpolation needed.
            return sorted_data[lo]

        weight = position - lo
        return sorted_data[lo] * (1.0 - weight) + sorted_data[hi] * weight

aggregate(values)

Aggregate a list of metric values into summary statistics.

All values must share the same metric_name.

Parameters:

Name Type Description Default
values list[MetricValue]

List of metric values to aggregate.

required

Returns:

Type Description
MetricAggregation

A MetricAggregation with computed statistics.

Raises:

Type Description
MetricsError

If values is empty or metric names are inconsistent.

Source code in src/agentprobe/metrics/aggregator.py
def aggregate(self, values: list[MetricValue]) -> MetricAggregation:
    """Aggregate a list of metric values into summary statistics.

    All values must share the same metric_name.

    Args:
        values: List of metric values to aggregate.

    Returns:
        A MetricAggregation with computed statistics.

    Raises:
        MetricsError: If values is empty or metric names are inconsistent.
    """
    if not values:
        raise MetricsError("Cannot aggregate empty metric list")

    distinct = {v.metric_name for v in values}
    if len(distinct) > 1:
        raise MetricsError(f"Cannot aggregate mixed metrics: {', '.join(sorted(distinct))}")

    return self._compute_stats(values[0].metric_name, [v.value for v in values])

aggregate_by_name(values)

Group metric values by name and aggregate each group.

Parameters:

Name Type Description Default
values list[MetricValue]

List of metric values (may contain multiple metric names).

required

Returns:

Type Description
dict[str, MetricAggregation]

A dictionary mapping metric names to their aggregations.

Raises:

Type Description
MetricsError

If values is empty.

Source code in src/agentprobe/metrics/aggregator.py
def aggregate_by_name(self, values: list[MetricValue]) -> dict[str, MetricAggregation]:
    """Group metric values by name and aggregate each group.

    Args:
        values: List of metric values (may contain multiple metric names).

    Returns:
        A dictionary mapping metric names to their aggregations.

    Raises:
        MetricsError: If values is empty.
    """
    if not values:
        raise MetricsError("Cannot aggregate empty metric list")

    buckets: dict[str, list[float]] = {}
    for item in values:
        buckets.setdefault(item.metric_name, []).append(item.value)

    return {name: self._compute_stats(name, nums) for name, nums in buckets.items()}

Trend Analysis

agentprobe.metrics.trend

Metric trend analysis: detects improving, degrading, or stable trends.

Compares recent metric values against a historical window to determine whether performance is changing over time.

MetricTrend

Analyzes metric trends by comparing recent vs historical values.

Uses a split-window approach: divides a time-ordered series of values into a historical window and a recent window, then compares means.

Attributes:

Name Type Description
threshold

Minimum relative change to flag as improving/degrading.

Source code in src/agentprobe/metrics/trend.py
class MetricTrend:
    """Analyzes metric trends by comparing recent vs historical values.

    Uses a split-window approach: divides a time-ordered series of values
    into a historical window and a recent window, then compares means.

    Attributes:
        threshold: Minimum relative change to flag as improving/degrading.
    """

    def __init__(self, threshold: float = 0.1) -> None:
        """Initialize the trend analyzer.

        Args:
            threshold: Minimum relative change (fraction) to consider
                a trend as improving or degrading. Defaults to 0.1 (10%).
        """
        self._threshold = threshold

    def analyze(
        self,
        values: list[MetricValue],
        lower_is_better: bool = True,
    ) -> TrendDirection:
        """Analyze the trend direction for a series of metric values.

        Splits the values in half (by order) and compares means.

        Args:
            values: Time-ordered list of metric values (oldest first).
            lower_is_better: Whether lower values indicate improvement.

        Returns:
            The detected trend direction. When fewer than
            ``_MIN_TREND_SAMPLES`` values are provided, returns
            ``TrendDirection.INSUFFICIENT_DATA`` instead of raising.
        """
        # Fixed docstring: this method never raises for short input; it
        # returns INSUFFICIENT_DATA, matching the guard below.
        if len(values) < _MIN_TREND_SAMPLES:
            return TrendDirection.INSUFFICIENT_DATA

        raw = [v.value for v in values]
        return self._analyze_raw(raw, lower_is_better)

    def analyze_series(
        self,
        raw_values: list[float],
        lower_is_better: bool = True,
    ) -> TrendDirection:
        """Analyze the trend from a raw numeric series.

        Args:
            raw_values: Time-ordered list of numeric values (oldest first).
            lower_is_better: Whether lower values indicate improvement.

        Returns:
            The detected trend direction. When fewer than
            ``_MIN_TREND_SAMPLES`` values are provided, returns
            ``TrendDirection.INSUFFICIENT_DATA`` instead of raising.
        """
        if len(raw_values) < _MIN_TREND_SAMPLES:
            return TrendDirection.INSUFFICIENT_DATA

        return self._analyze_raw(raw_values, lower_is_better)

    def _analyze_raw(self, raw: list[float], lower_is_better: bool) -> TrendDirection:
        """Core trend analysis on raw numeric values.

        Args:
            raw: Ordered list of values.
            lower_is_better: Direction semantics.

        Returns:
            The trend direction.
        """
        midpoint = len(raw) // 2
        historical = raw[:midpoint]
        recent = raw[midpoint:]

        hist_mean = statistics.mean(historical)
        recent_mean = statistics.mean(recent)

        # A zero historical mean makes relative change undefined; any nonzero
        # recent mean is treated as a change in the recent window's direction.
        if hist_mean == 0.0:
            if recent_mean == 0.0:
                return TrendDirection.STABLE
            return TrendDirection.DEGRADING if lower_is_better else TrendDirection.IMPROVING

        relative_change = (recent_mean - hist_mean) / abs(hist_mean)

        if abs(relative_change) < self._threshold:
            return TrendDirection.STABLE

        value_decreased = relative_change < 0

        if lower_is_better:
            return TrendDirection.IMPROVING if value_decreased else TrendDirection.DEGRADING
        else:
            return TrendDirection.DEGRADING if value_decreased else TrendDirection.IMPROVING

__init__(threshold=0.1)

Initialize the trend analyzer.

Parameters:

Name Type Description Default
threshold float

Minimum relative change (fraction) to consider a trend as improving or degrading. Defaults to 0.1 (10%).

0.1
Source code in src/agentprobe/metrics/trend.py
def __init__(self, threshold: float = 0.1) -> None:
    """Set up the trend analyzer with its sensitivity threshold.

    Args:
        threshold: Minimum relative change (fraction) required before a
            trend counts as improving or degrading; defaults to 0.1 (10%).
    """
    self._threshold = threshold

analyze(values, lower_is_better=True)

Analyze the trend direction for a series of metric values.

Splits the values in half (by order) and compares means.

Parameters:

Name Type Description Default
values list[MetricValue]

Time-ordered list of metric values (oldest first).

required
lower_is_better bool

Whether lower values indicate improvement.

True

Returns:

Type Description
TrendDirection

The detected trend direction.

Note:

This method does not raise for short input: when fewer than 2 values are provided it returns TrendDirection.INSUFFICIENT_DATA.

Source code in src/agentprobe/metrics/trend.py
def analyze(
    self,
    values: list[MetricValue],
    lower_is_better: bool = True,
) -> TrendDirection:
    """Analyze the trend direction for a series of metric values.

    Splits the values in half (by order) and compares means.

    Args:
        values: Time-ordered list of metric values (oldest first).
        lower_is_better: Whether lower values indicate improvement.

    Returns:
        The detected trend direction. When fewer than
        ``_MIN_TREND_SAMPLES`` values are provided, returns
        ``TrendDirection.INSUFFICIENT_DATA`` instead of raising.
    """
    # Fixed docstring: the original claimed a MetricsError is raised for
    # short input, but the guard below returns INSUFFICIENT_DATA instead.
    if len(values) < _MIN_TREND_SAMPLES:
        return TrendDirection.INSUFFICIENT_DATA

    raw = [v.value for v in values]
    return self._analyze_raw(raw, lower_is_better)

analyze_series(raw_values, lower_is_better=True)

Analyze the trend from a raw numeric series.

Parameters:

Name Type Description Default
raw_values list[float]

Time-ordered list of numeric values (oldest first).

required
lower_is_better bool

Whether lower values indicate improvement.

True

Returns:

Type Description
TrendDirection

The detected trend direction.

Source code in src/agentprobe/metrics/trend.py
def analyze_series(
    self,
    raw_values: list[float],
    lower_is_better: bool = True,
) -> TrendDirection:
    """Analyze the trend from a raw numeric series.

    Args:
        raw_values: Time-ordered list of numeric values (oldest first).
        lower_is_better: Whether lower values indicate improvement.

    Returns:
        The detected trend direction.
    """
    # Guard: too few samples to split into comparable windows.
    if len(raw_values) >= _MIN_TREND_SAMPLES:
        return self._analyze_raw(raw_values, lower_is_better)
    return TrendDirection.INSUFFICIENT_DATA

Built-in Definitions

agentprobe.metrics.definitions

Built-in metric definitions for common agent performance measurements.

Provides a registry of standard metrics that can be collected automatically during test execution, covering latency, cost, token usage, and scores.

get_builtin_definitions()

Return all built-in metric definitions.

Returns:

Type Description
dict[str, MetricDefinition]

A dictionary mapping metric names to their definitions.

Source code in src/agentprobe/metrics/definitions.py
def get_builtin_definitions() -> dict[str, MetricDefinition]:
    """Return all built-in metric definitions.

    Returns:
        A dictionary mapping metric names to their definitions.
    """
    # Shallow copy so callers cannot mutate the shared registry.
    return {**BUILTIN_METRICS}

get_definition(name)

Look up a built-in metric definition by name.

Parameters:

Name Type Description Default
name str

The metric name to look up.

required

Returns:

Type Description
MetricDefinition | None

The metric definition if found, otherwise None.

Source code in src/agentprobe/metrics/definitions.py
def get_definition(name: str) -> MetricDefinition | None:
    """Look up a built-in metric definition by name.

    Args:
        name: The metric name to look up.

    Returns:
        The metric definition if found, otherwise None.
    """
    # EAFP lookup: a missing name maps to None, same as dict.get.
    try:
        return BUILTIN_METRICS[name]
    except KeyError:
        return None