Skip to content

Metrics

Metric collection, aggregation, and trend analysis.

Collector

agentprobe.metrics.collector

Stateless metric collector that extracts measurements from traces and results.

Converts traces, test results, and agent runs into MetricValue instances for storage and analysis.

MetricCollector

Extracts metric values from traces, results, and runs.

Stateless: receives objects and returns lists of MetricValue. Does not store or persist anything.

Source code in src/agentprobe/metrics/collector.py
class MetricCollector:
    """Extracts metric values from traces, results, and runs.

    Stateless: receives objects and returns lists of MetricValue.
    Does not store or persist anything.
    """

    def collect_from_trace(self, trace: Trace) -> list[MetricValue]:
        """Extract metric values from a single trace.

        Collects latency, tool call count, and response length metrics.

        Args:
            trace: The execution trace to extract metrics from.

        Returns:
            A list of metric values extracted from the trace.
        """
        timestamp = datetime.now(UTC)
        tag_tuple = tuple(trace.tags)

        # (metric name, numeric value) pairs measured from this trace.
        measurements = (
            ("latency_ms", float(trace.total_latency_ms)),
            ("tool_call_count", float(len(trace.tool_calls))),
            ("response_length", float(len(trace.output_text))),
        )

        # Build a fresh metadata dict per value so instances never share state.
        return [
            MetricValue(
                metric_name=name,
                value=measured,
                tags=tag_tuple,
                metadata={"trace_id": trace.trace_id, "agent_name": trace.agent_name},
                timestamp=timestamp,
            )
            for name, measured in measurements
        ]

    def collect_from_result(self, result: TestResult) -> list[MetricValue]:
        """Extract metric values from a test result.

        Collects latency, eval score, and any trace-level metrics.

        Args:
            result: The test result to extract metrics from.

        Returns:
            A list of metric values extracted from the result.
        """
        timestamp = datetime.now(UTC)

        collected = [
            MetricValue(
                metric_name="latency_ms",
                value=float(result.duration_ms),
                metadata={"test_name": result.test_name, "result_id": result.result_id},
                timestamp=timestamp,
            ),
            MetricValue(
                metric_name="eval_score",
                value=result.score,
                metadata={"test_name": result.test_name, "result_id": result.result_id},
                timestamp=timestamp,
            ),
        ]

        # A result may carry its execution trace; fold those metrics in too.
        if result.trace is not None:
            collected.extend(self.collect_from_trace(result.trace))

        return collected

    def collect_from_run(self, run: AgentRun) -> list[MetricValue]:
        """Extract metric values from a complete agent run.

        Collects pass rate plus per-result metrics for all results.

        Args:
            run: The agent run to extract metrics from.

        Returns:
            A list of metric values extracted from the run.
        """
        timestamp = datetime.now(UTC)

        # Empty runs report a 0.0 pass rate rather than dividing by zero.
        if run.total_tests > 0:
            passed = sum(r.status == TestStatus.PASSED for r in run.test_results)
            rate = passed / run.total_tests
        else:
            rate = 0.0

        collected = [
            MetricValue(
                metric_name="pass_rate",
                value=rate,
                metadata={"run_id": run.run_id, "agent_name": run.agent_name},
                timestamp=timestamp,
            )
        ]

        for result in run.test_results:
            collected.extend(self.collect_from_result(result))

        return collected

collect_from_trace(trace)

Extract metric values from a single trace.

Collects latency, tool call count, and response length metrics.

Parameters:

Name Type Description Default
trace Trace

The execution trace to extract metrics from.

required

Returns:

Type Description
list[MetricValue]

A list of metric values extracted from the trace.

Source code in src/agentprobe/metrics/collector.py
def collect_from_trace(self, trace: Trace) -> list[MetricValue]:
    """Extract metric values from a single trace.

    Collects latency, tool call count, and response length metrics.

    Args:
        trace: The execution trace to extract metrics from.

    Returns:
        A list of metric values extracted from the trace.
    """
    timestamp = datetime.now(UTC)
    tag_tuple = tuple(trace.tags)

    # (metric name, numeric value) pairs measured from this trace.
    measurements = (
        ("latency_ms", float(trace.total_latency_ms)),
        ("tool_call_count", float(len(trace.tool_calls))),
        ("response_length", float(len(trace.output_text))),
    )

    # Build a fresh metadata dict per value so instances never share state.
    return [
        MetricValue(
            metric_name=name,
            value=measured,
            tags=tag_tuple,
            metadata={"trace_id": trace.trace_id, "agent_name": trace.agent_name},
            timestamp=timestamp,
        )
        for name, measured in measurements
    ]

collect_from_result(result)

Extract metric values from a test result.

Collects latency, eval score, and any trace-level metrics.

Parameters:

Name Type Description Default
result TestResult

The test result to extract metrics from.

required

Returns:

Type Description
list[MetricValue]

A list of metric values extracted from the result.

Source code in src/agentprobe/metrics/collector.py
def collect_from_result(self, result: TestResult) -> list[MetricValue]:
    """Extract metric values from a test result.

    Collects latency, eval score, and any trace-level metrics.

    Args:
        result: The test result to extract metrics from.

    Returns:
        A list of metric values extracted from the result.
    """
    timestamp = datetime.now(UTC)

    collected = [
        MetricValue(
            metric_name="latency_ms",
            value=float(result.duration_ms),
            metadata={"test_name": result.test_name, "result_id": result.result_id},
            timestamp=timestamp,
        ),
        MetricValue(
            metric_name="eval_score",
            value=result.score,
            metadata={"test_name": result.test_name, "result_id": result.result_id},
            timestamp=timestamp,
        ),
    ]

    # A result may carry its execution trace; fold those metrics in too.
    if result.trace is not None:
        collected.extend(self.collect_from_trace(result.trace))

    return collected

collect_from_run(run)

Extract metric values from a complete agent run.

Collects pass rate plus per-result metrics for all results.

Parameters:

Name Type Description Default
run AgentRun

The agent run to extract metrics from.

required

Returns:

Type Description
list[MetricValue]

A list of metric values extracted from the run.

Source code in src/agentprobe/metrics/collector.py
def collect_from_run(self, run: AgentRun) -> list[MetricValue]:
    """Extract metric values from a complete agent run.

    Collects pass rate plus per-result metrics for all results.

    Args:
        run: The agent run to extract metrics from.

    Returns:
        A list of metric values extracted from the run.
    """
    timestamp = datetime.now(UTC)

    # Empty runs report a 0.0 pass rate rather than dividing by zero.
    if run.total_tests > 0:
        passed = sum(r.status == TestStatus.PASSED for r in run.test_results)
        rate = passed / run.total_tests
    else:
        rate = 0.0

    collected = [
        MetricValue(
            metric_name="pass_rate",
            value=rate,
            metadata={"run_id": run.run_id, "agent_name": run.agent_name},
            timestamp=timestamp,
        )
    ]

    for result in run.test_results:
        collected.extend(self.collect_from_result(result))

    return collected

Aggregator

agentprobe.metrics.aggregator

Metric aggregation: computes statistical summaries from metric values.

Uses stdlib statistics module for calculations — no numpy dependency.

MetricAggregator

Computes statistical aggregations over collections of metric values.

Supports mean, median, min, max, p95, p99, and standard deviation. All computations use the stdlib statistics module.

Source code in src/agentprobe/metrics/aggregator.py
class MetricAggregator:
    """Computes statistical aggregations over collections of metric values.

    Supports mean, median, min, max, p95, p99, and standard deviation.
    All computations use the stdlib ``statistics`` module.
    """

    def aggregate(self, values: list[MetricValue]) -> MetricAggregation:
        """Aggregate a list of metric values into summary statistics.

        All values must share the same metric_name.

        Args:
            values: List of metric values to aggregate.

        Returns:
            A MetricAggregation with computed statistics.

        Raises:
            MetricsError: If values is empty or metric names are inconsistent.
        """
        if not values:
            raise MetricsError("Cannot aggregate empty metric list")

        distinct = {v.metric_name for v in values}
        if len(distinct) > 1:
            raise MetricsError(f"Cannot aggregate mixed metrics: {', '.join(sorted(distinct))}")

        return self._compute_stats(values[0].metric_name, [v.value for v in values])

    def aggregate_by_name(self, values: list[MetricValue]) -> dict[str, MetricAggregation]:
        """Group metric values by name and aggregate each group.

        Args:
            values: List of metric values (may contain multiple metric names).

        Returns:
            A dictionary mapping metric names to their aggregations.

        Raises:
            MetricsError: If values is empty.
        """
        if not values:
            raise MetricsError("Cannot aggregate empty metric list")

        buckets: dict[str, list[float]] = {}
        for item in values:
            buckets.setdefault(item.metric_name, []).append(item.value)

        return {name: self._compute_stats(name, nums) for name, nums in buckets.items()}

    def _compute_stats(self, metric_name: str, raw: list[float]) -> MetricAggregation:
        """Compute statistics for a list of numeric values.

        Args:
            metric_name: The metric name for the aggregation.
            raw: Raw numeric values to aggregate.

        Returns:
            A MetricAggregation with computed statistics.
        """
        ordered = sorted(raw)
        count = len(ordered)
        # Sample stdev needs at least _MIN_STDEV_SAMPLES points; report 0.0 below that.
        spread = statistics.stdev(raw) if count >= _MIN_STDEV_SAMPLES else 0.0

        return MetricAggregation(
            metric_name=metric_name,
            count=count,
            mean=statistics.mean(raw),
            median=statistics.median(raw),
            min_value=ordered[0],
            max_value=ordered[-1],
            p95=self._percentile(ordered, 0.95),
            p99=self._percentile(ordered, 0.99),
            std_dev=spread,
        )

    @staticmethod
    def _percentile(sorted_data: list[float], pct: float) -> float:
        """Compute a percentile using linear interpolation.

        Args:
            sorted_data: Pre-sorted list of values.
            pct: Percentile as a fraction (e.g. 0.95 for 95th).

        Returns:
            The interpolated percentile value.
        """
        count = len(sorted_data)
        if count == 1:
            return sorted_data[0]

        position = pct * (count - 1)
        lo = math.floor(position)
        hi = math.ceil(position)

        if lo == hi:
            # Landed exactly on a sample; no interpolation needed.
            return sorted_data[lo]

        weight = position - lo
        return sorted_data[lo] * (1.0 - weight) + sorted_data[hi] * weight

aggregate(values)

Aggregate a list of metric values into summary statistics.

All values must share the same metric_name.

Parameters:

Name Type Description Default
values list[MetricValue]

List of metric values to aggregate.

required

Returns:

Type Description
MetricAggregation

A MetricAggregation with computed statistics.

Raises:

Type Description
MetricsError

If values is empty or metric names are inconsistent.

Source code in src/agentprobe/metrics/aggregator.py
def aggregate(self, values: list[MetricValue]) -> MetricAggregation:
    """Aggregate a list of metric values into summary statistics.

    All values must share the same metric_name.

    Args:
        values: List of metric values to aggregate.

    Returns:
        A MetricAggregation with computed statistics.

    Raises:
        MetricsError: If values is empty or metric names are inconsistent.
    """
    if not values:
        raise MetricsError("Cannot aggregate empty metric list")

    distinct = {v.metric_name for v in values}
    if len(distinct) > 1:
        raise MetricsError(f"Cannot aggregate mixed metrics: {', '.join(sorted(distinct))}")

    return self._compute_stats(values[0].metric_name, [v.value for v in values])

aggregate_by_name(values)

Group metric values by name and aggregate each group.

Parameters:

Name Type Description Default
values list[MetricValue]

List of metric values (may contain multiple metric names).

required

Returns:

Type Description
dict[str, MetricAggregation]

A dictionary mapping metric names to their aggregations.

Raises:

Type Description
MetricsError

If values is empty.

Source code in src/agentprobe/metrics/aggregator.py
def aggregate_by_name(self, values: list[MetricValue]) -> dict[str, MetricAggregation]:
    """Group metric values by name and aggregate each group.

    Args:
        values: List of metric values (may contain multiple metric names).

    Returns:
        A dictionary mapping metric names to their aggregations.

    Raises:
        MetricsError: If values is empty.
    """
    if not values:
        raise MetricsError("Cannot aggregate empty metric list")

    buckets: dict[str, list[float]] = {}
    for item in values:
        buckets.setdefault(item.metric_name, []).append(item.value)

    return {name: self._compute_stats(name, nums) for name, nums in buckets.items()}

Trend Analysis

agentprobe.metrics.trend

Metric trend analysis: detects improving, degrading, or stable trends.

Compares recent metric values against a historical window to determine whether performance is changing over time.

MetricTrend

Analyzes metric trends by comparing recent vs historical values.

Uses a split-window approach: divides a time-ordered series of values into a historical window and a recent window, then compares means.

Attributes:

Name Type Description
threshold

Minimum relative change to flag as improving/degrading.

Source code in src/agentprobe/metrics/trend.py
class MetricTrend:
    """Analyzes metric trends by comparing recent vs historical values.

    Uses a split-window approach: divides a time-ordered series of values
    into a historical window and a recent window, then compares means.

    Attributes:
        threshold: Minimum relative change to flag as improving/degrading.
    """

    def __init__(self, threshold: float = 0.1) -> None:
        """Initialize the trend analyzer.

        Args:
            threshold: Minimum relative change (fraction) to consider
                a trend as improving or degrading. Defaults to 0.1 (10%).
        """
        self._threshold = threshold

    def analyze(
        self,
        values: list[MetricValue],
        lower_is_better: bool = True,
    ) -> TrendDirection:
        """Analyze the trend direction for a series of metric values.

        Splits the values in half (by order) and compares means.

        Args:
            values: Time-ordered list of metric values (oldest first).
            lower_is_better: Whether lower values indicate improvement.

        Returns:
            The detected trend direction. When fewer than
            ``_MIN_TREND_SAMPLES`` values are provided, returns
            ``TrendDirection.INSUFFICIENT_DATA`` instead of raising.
        """
        # Fixed docstring: this method never raises for short input; it
        # returns INSUFFICIENT_DATA, matching the guard below.
        if len(values) < _MIN_TREND_SAMPLES:
            return TrendDirection.INSUFFICIENT_DATA

        raw = [v.value for v in values]
        return self._analyze_raw(raw, lower_is_better)

    def analyze_series(
        self,
        raw_values: list[float],
        lower_is_better: bool = True,
    ) -> TrendDirection:
        """Analyze the trend from a raw numeric series.

        Args:
            raw_values: Time-ordered list of numeric values (oldest first).
            lower_is_better: Whether lower values indicate improvement.

        Returns:
            The detected trend direction. When fewer than
            ``_MIN_TREND_SAMPLES`` values are provided, returns
            ``TrendDirection.INSUFFICIENT_DATA`` instead of raising.
        """
        if len(raw_values) < _MIN_TREND_SAMPLES:
            return TrendDirection.INSUFFICIENT_DATA

        return self._analyze_raw(raw_values, lower_is_better)

    def _analyze_raw(self, raw: list[float], lower_is_better: bool) -> TrendDirection:
        """Core trend analysis on raw numeric values.

        Args:
            raw: Ordered list of values.
            lower_is_better: Direction semantics.

        Returns:
            The trend direction.
        """
        midpoint = len(raw) // 2
        historical = raw[:midpoint]
        recent = raw[midpoint:]

        hist_mean = statistics.mean(historical)
        recent_mean = statistics.mean(recent)

        # A zero historical mean makes relative change undefined; any nonzero
        # recent mean is treated as a change in the recent window's direction.
        if hist_mean == 0.0:
            if recent_mean == 0.0:
                return TrendDirection.STABLE
            return TrendDirection.DEGRADING if lower_is_better else TrendDirection.IMPROVING

        relative_change = (recent_mean - hist_mean) / abs(hist_mean)

        if abs(relative_change) < self._threshold:
            return TrendDirection.STABLE

        value_decreased = relative_change < 0

        if lower_is_better:
            return TrendDirection.IMPROVING if value_decreased else TrendDirection.DEGRADING
        else:
            return TrendDirection.DEGRADING if value_decreased else TrendDirection.IMPROVING

__init__(threshold=0.1)

Initialize the trend analyzer.

Parameters:

Name Type Description Default
threshold float

Minimum relative change (fraction) to consider a trend as improving or degrading. Defaults to 0.1 (10%).

0.1
Source code in src/agentprobe/metrics/trend.py
def __init__(self, threshold: float = 0.1) -> None:
    """Set up the trend analyzer with its sensitivity threshold.

    Args:
        threshold: Minimum relative change (fraction) required before a
            trend counts as improving or degrading; defaults to 0.1 (10%).
    """
    self._threshold = threshold

analyze(values, lower_is_better=True)

Analyze the trend direction for a series of metric values.

Splits the values in half (by order) and compares means.

Parameters:

Name Type Description Default
values list[MetricValue]

Time-ordered list of metric values (oldest first).

required
lower_is_better bool

Whether lower values indicate improvement.

True

Returns:

Type Description
TrendDirection

The detected trend direction.

Note:

This method does not raise for short input: when fewer than 2 values are provided it returns TrendDirection.INSUFFICIENT_DATA.

Source code in src/agentprobe/metrics/trend.py
def analyze(
    self,
    values: list[MetricValue],
    lower_is_better: bool = True,
) -> TrendDirection:
    """Analyze the trend direction for a series of metric values.

    Splits the values in half (by order) and compares means.

    Args:
        values: Time-ordered list of metric values (oldest first).
        lower_is_better: Whether lower values indicate improvement.

    Returns:
        The detected trend direction. When fewer than
        ``_MIN_TREND_SAMPLES`` values are provided, returns
        ``TrendDirection.INSUFFICIENT_DATA`` instead of raising.
    """
    # Fixed docstring: the original claimed a MetricsError is raised for
    # short input, but the guard below returns INSUFFICIENT_DATA instead.
    if len(values) < _MIN_TREND_SAMPLES:
        return TrendDirection.INSUFFICIENT_DATA

    raw = [v.value for v in values]
    return self._analyze_raw(raw, lower_is_better)

analyze_series(raw_values, lower_is_better=True)

Analyze the trend from a raw numeric series.

Parameters:

Name Type Description Default
raw_values list[float]

Time-ordered list of numeric values (oldest first).

required
lower_is_better bool

Whether lower values indicate improvement.

True

Returns:

Type Description
TrendDirection

The detected trend direction.

Source code in src/agentprobe/metrics/trend.py
def analyze_series(
    self,
    raw_values: list[float],
    lower_is_better: bool = True,
) -> TrendDirection:
    """Analyze the trend from a raw numeric series.

    Args:
        raw_values: Time-ordered list of numeric values (oldest first).
        lower_is_better: Whether lower values indicate improvement.

    Returns:
        The detected trend direction.
    """
    # Guard: too few samples to split into comparable windows.
    if len(raw_values) >= _MIN_TREND_SAMPLES:
        return self._analyze_raw(raw_values, lower_is_better)
    return TrendDirection.INSUFFICIENT_DATA

Built-in Definitions

agentprobe.metrics.definitions

Built-in metric definitions for common agent performance measurements.

Provides a registry of standard metrics that can be collected automatically during test execution, covering latency, cost, token usage, and scores.

get_builtin_definitions()

Return all built-in metric definitions.

Returns:

Type Description
dict[str, MetricDefinition]

A dictionary mapping metric names to their definitions.

Source code in src/agentprobe/metrics/definitions.py
def get_builtin_definitions() -> dict[str, MetricDefinition]:
    """Return all built-in metric definitions.

    Returns:
        A dictionary mapping metric names to their definitions.
    """
    # Shallow copy so callers cannot mutate the shared registry.
    return {**BUILTIN_METRICS}

get_definition(name)

Look up a built-in metric definition by name.

Parameters:

Name Type Description Default
name str

The metric name to look up.

required

Returns:

Type Description
MetricDefinition | None

The metric definition if found, otherwise None.

Source code in src/agentprobe/metrics/definitions.py
def get_definition(name: str) -> MetricDefinition | None:
    """Look up a built-in metric definition by name.

    Args:
        name: The metric name to look up.

    Returns:
        The metric definition if found, otherwise None.
    """
    # EAFP lookup: a missing name maps to None, same as dict.get.
    try:
        return BUILTIN_METRICS[name]
    except KeyError:
        return None