Skip to content

Core

Models, runner, assertions, protocols, configuration, and exception hierarchy.

Models

agentprobe.core.models

Core data models and enumerations for AgentProbe.

This module defines all Pydantic models used throughout the framework, including traces, test cases, results, and cost summaries. Output types are frozen (immutable); input/configuration types are mutable.

TestStatus

Bases: StrEnum

Status of a single test case execution.

Source code in src/agentprobe/core/models.py
class TestStatus(StrEnum):
    """Status of a single test case execution."""

    PENDING = "pending"    # queued, not yet started
    RUNNING = "running"    # currently executing
    PASSED = "passed"      # terminal: the test succeeded
    FAILED = "failed"      # terminal: the test's checks did not pass
    ERROR = "error"        # terminal: execution raised an error
    SKIPPED = "skipped"    # terminal: the test was not executed
    TIMEOUT = "timeout"    # terminal: execution exceeded the allowed time

RunStatus

Bases: StrEnum

Status of an overall agent run or test suite execution.

Source code in src/agentprobe/core/models.py
class RunStatus(StrEnum):
    """Status of an overall agent run or test suite execution."""

    PENDING = "pending"        # queued, not yet started
    RUNNING = "running"        # currently executing
    COMPLETED = "completed"    # terminal: run finished normally
    FAILED = "failed"          # terminal: run finished with failures
    CANCELLED = "cancelled"    # terminal: run was aborted before finishing

TurnType

Bases: StrEnum

Type of event within a trace turn.

Source code in src/agentprobe/core/models.py
class TurnType(StrEnum):
    """Type of event within a trace turn."""

    LLM_CALL = "llm_call"              # a language-model invocation (see Turn.llm_call)
    TOOL_CALL = "tool_call"            # a tool invocation (see Turn.tool_call)
    USER_MESSAGE = "user_message"      # a message sent by the user
    AGENT_MESSAGE = "agent_message"    # a message produced by the agent

EvalVerdict

Bases: StrEnum

Verdict produced by an evaluator.

Source code in src/agentprobe/core/models.py
class EvalVerdict(StrEnum):
    """Verdict produced by an evaluator."""

    PASS = "pass"          # evaluation criteria fully met
    FAIL = "fail"          # evaluation criteria not met
    PARTIAL = "partial"    # criteria partially met
    ERROR = "error"        # the evaluator itself could not complete

LLMCall

Bases: BaseModel

A single call to a language model within a trace.

Attributes:

Name Type Description
call_id str

Unique identifier for this call.

model str

Model identifier string (e.g. 'claude-sonnet-4-5-20250929').

input_tokens int

Number of input/prompt tokens consumed.

output_tokens int

Number of output/completion tokens produced.

input_text str

The prompt or input sent to the model.

output_text str

The response text from the model.

latency_ms int

Round-trip latency in milliseconds.

metadata dict[str, Any]

Additional provider-specific metadata.

timestamp datetime

When the call was made.

Source code in src/agentprobe/core/models.py
class LLMCall(BaseModel):
    """A single call to a language model within a trace.

    Attributes:
        call_id: Unique identifier for this call.
        model: Model identifier string (e.g. 'claude-sonnet-4-5-20250929').
        input_tokens: Number of input/prompt tokens consumed.
        output_tokens: Number of output/completion tokens produced.
        input_text: The prompt or input sent to the model.
        output_text: The response text from the model.
        latency_ms: Round-trip latency in milliseconds.
        metadata: Additional provider-specific metadata.
        timestamp: When the call was made.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    call_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID4 per instance
    model: str  # required — the only field with no default
    input_tokens: int = Field(default=0, ge=0)
    output_tokens: int = Field(default=0, ge=0)
    input_text: str = ""
    output_text: str = ""
    latency_ms: int = Field(default=0, ge=0)
    metadata: dict[str, Any] = Field(default_factory=dict)
    timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC))  # timezone-aware UTC

ToolCall

Bases: BaseModel

A single tool invocation within a trace.

Attributes:

Name Type Description
call_id str

Unique identifier for this call.

tool_name str

Name of the tool invoked.

tool_input dict[str, Any]

Arguments passed to the tool.

tool_output Any

Output returned by the tool.

success bool

Whether the tool call succeeded.

error str | None

Error message if the call failed.

latency_ms int

Round-trip latency in milliseconds.

timestamp datetime

When the call was made.

Source code in src/agentprobe/core/models.py
class ToolCall(BaseModel):
    """A single tool invocation within a trace.

    Attributes:
        call_id: Unique identifier for this call.
        tool_name: Name of the tool invoked.
        tool_input: Arguments passed to the tool.
        tool_output: Output returned by the tool.
        success: Whether the tool call succeeded.
        error: Error message if the call failed.
        latency_ms: Round-trip latency in milliseconds.
        timestamp: When the call was made.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    call_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID4 per instance
    tool_name: str  # required
    tool_input: dict[str, Any] = Field(default_factory=dict)
    tool_output: Any = None  # untyped: tools may return anything
    success: bool = True  # defaults to success; callers set False together with `error`
    error: str | None = None
    latency_ms: int = Field(default=0, ge=0)
    timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC))  # timezone-aware UTC

Turn

Bases: BaseModel

A single turn (event) within a trace timeline.

Attributes:

Name Type Description
turn_id str

Unique identifier for this turn.

turn_type TurnType

The type of event this turn represents.

content str

Text content of the turn.

llm_call LLMCall | None

Associated LLM call, if this is an LLM turn.

tool_call ToolCall | None

Associated tool call, if this is a tool turn.

timestamp datetime

When the turn occurred.

Source code in src/agentprobe/core/models.py
class Turn(BaseModel):
    """A single turn (event) within a trace timeline.

    Attributes:
        turn_id: Unique identifier for this turn.
        turn_type: The type of event this turn represents.
        content: Text content of the turn.
        llm_call: Associated LLM call, if this is an LLM turn.
        tool_call: Associated tool call, if this is a tool turn.
        timestamp: When the turn occurred.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    turn_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID4 per instance
    turn_type: TurnType  # required; which of the optional payloads below applies
    content: str = ""
    # NOTE: llm_call/tool_call are expected to match turn_type (LLM_CALL /
    # TOOL_CALL respectively), but that pairing is not enforced here.
    llm_call: LLMCall | None = None
    tool_call: ToolCall | None = None
    timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC))  # timezone-aware UTC

Trace

Bases: BaseModel

Complete execution trace of an agent run.

A trace captures the full timeline of LLM calls, tool invocations, and message exchanges during a single agent execution. Once assembled by the TraceRecorder, traces are immutable.

Attributes:

Name Type Description
trace_id str

Unique identifier for this trace.

agent_name str

Name of the agent that produced this trace.

model str | None

Primary model used during the run.

input_text str

The input/prompt given to the agent.

output_text str

The final output produced by the agent.

turns tuple[Turn, ...]

Ordered list of turns in the execution timeline.

llm_calls tuple[LLMCall, ...]

All LLM calls made during the run.

tool_calls tuple[ToolCall, ...]

All tool calls made during the run.

total_input_tokens int

Aggregate input tokens across all LLM calls.

total_output_tokens int

Aggregate output tokens across all LLM calls.

total_latency_ms int

Total execution time in milliseconds.

tags tuple[str, ...]

Tags for filtering and grouping.

metadata dict[str, Any]

Additional run metadata.

created_at datetime

When the trace was created.

Source code in src/agentprobe/core/models.py
class Trace(BaseModel):
    """Complete execution trace of an agent run.

    A trace captures the full timeline of LLM calls, tool invocations,
    and message exchanges during a single agent execution. Once assembled
    by the TraceRecorder, traces are immutable.

    Attributes:
        trace_id: Unique identifier for this trace.
        agent_name: Name of the agent that produced this trace.
        model: Primary model used during the run.
        input_text: The input/prompt given to the agent.
        output_text: The final output produced by the agent.
        turns: Ordered list of turns in the execution timeline.
        llm_calls: All LLM calls made during the run.
        tool_calls: All tool calls made during the run.
        total_input_tokens: Aggregate input tokens across all LLM calls.
        total_output_tokens: Aggregate output tokens across all LLM calls.
        total_latency_ms: Total execution time in milliseconds.
        tags: Tags for filtering and grouping.
        metadata: Additional run metadata.
        created_at: When the trace was created.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    trace_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID4 per instance
    agent_name: str  # required
    model: str | None = None
    input_text: str = ""
    output_text: str = ""
    # Tuples (not lists) so the frozen model's collections are immutable too.
    turns: tuple[Turn, ...] = ()
    llm_calls: tuple[LLMCall, ...] = ()
    tool_calls: tuple[ToolCall, ...] = ()
    # Aggregates are stored, not derived — presumably computed by the
    # TraceRecorder at assembly time; not validated against llm_calls here.
    total_input_tokens: int = Field(default=0, ge=0)
    total_output_tokens: int = Field(default=0, ge=0)
    total_latency_ms: int = Field(default=0, ge=0)
    tags: tuple[str, ...] = ()
    metadata: dict[str, Any] = Field(default_factory=dict)
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))  # timezone-aware UTC

EvalResult

Bases: BaseModel

Result produced by an evaluator.

Attributes:

Name Type Description
eval_id str

Unique identifier for this evaluation.

evaluator_name str

Name of the evaluator that produced this result.

verdict EvalVerdict

Pass/fail/partial/error verdict.

score float

Numeric score between 0.0 and 1.0.

reason str

Human-readable explanation of the verdict.

metadata dict[str, Any]

Additional evaluator-specific data.

created_at datetime

When the evaluation was performed.

Source code in src/agentprobe/core/models.py
class EvalResult(BaseModel):
    """Result produced by an evaluator.

    Attributes:
        eval_id: Unique identifier for this evaluation.
        evaluator_name: Name of the evaluator that produced this result.
        verdict: Pass/fail/partial/error verdict.
        score: Numeric score between 0.0 and 1.0.
        reason: Human-readable explanation of the verdict.
        metadata: Additional evaluator-specific data.
        created_at: When the evaluation was performed.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    eval_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID4 per instance
    evaluator_name: str  # required
    verdict: EvalVerdict  # required
    score: float = Field(..., ge=0.0, le=1.0)  # required — no default, unlike sibling models
    reason: str = ""
    metadata: dict[str, Any] = Field(default_factory=dict)
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))  # timezone-aware UTC

AssertionResult

Bases: BaseModel

Result of a single test assertion.

Attributes:

Name Type Description
assertion_type str

Type of assertion (e.g. 'contain', 'match').

passed bool

Whether the assertion passed.

expected Any

The expected value.

actual Any

The actual value.

message str

Descriptive message about the result.

Source code in src/agentprobe/core/models.py
class AssertionResult(BaseModel):
    """Result of a single test assertion.

    Attributes:
        assertion_type: Type of assertion (e.g. 'contain', 'match').
        passed: Whether the assertion passed.
        expected: The expected value.
        actual: The actual value.
        message: Descriptive message about the result.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    assertion_type: str  # required; free-form string, not an enum
    passed: bool  # required
    expected: Any = None  # untyped: assertions may compare any values
    actual: Any = None
    message: str = ""

TestCase

Bases: BaseModel

A single test scenario to be executed against an agent.

TestCase is mutable because the runner populates fields during execution (e.g. status transitions, attaching results).

Attributes:

Name Type Description
test_id str

Unique identifier for this test case.

name str

Human-readable name (usually from the @scenario decorator).

description str

Detailed description of what this test validates.

input_text str

The input prompt to send to the agent.

expected_output str | None

Optional expected output for comparison.

tags list[str]

Tags for filtering and grouping.

timeout_seconds float

Maximum allowed execution time.

evaluators list[str]

Names of evaluators to run on this test.

metadata dict[str, Any]

Additional test configuration.

Source code in src/agentprobe/core/models.py
class TestCase(BaseModel):
    """A single test scenario to be executed against an agent.

    Unlike the result/trace models, TestCase is deliberately *not* frozen:
    the runner mutates it while executing (status transitions, attaching
    results, and similar bookkeeping).

    Attributes:
        test_id: Unique identifier for this test case.
        name: Human-readable name (usually from the @scenario decorator).
        description: Detailed description of what this test validates.
        input_text: The input prompt to send to the agent.
        expected_output: Optional expected output for comparison.
        tags: Tags for filtering and grouping.
        timeout_seconds: Maximum allowed execution time.
        evaluators: Names of evaluators to run on this test.
        metadata: Additional test configuration.
    """

    model_config = ConfigDict(strict=True, extra="forbid")

    test_id: str = Field(default_factory=lambda: str(uuid4()))
    name: str = Field(..., min_length=1, max_length=200)
    description: str = ""
    input_text: str = ""
    expected_output: str | None = None
    tags: list[str] = Field(default_factory=list)
    timeout_seconds: float = Field(default=30.0, gt=0)
    evaluators: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Ensure test name contains only valid characters."""
        # Drop the allowed separator characters (underscore, hyphen, space,
        # dot); what remains must be purely alphanumeric.  An all-separator
        # name is rejected too, since ''.isalnum() is False.
        remainder = "".join(ch for ch in v if ch not in "_- .")
        if not remainder.isalnum():
            msg = "Test name must be alphanumeric with underscores, hyphens, spaces, or dots"
            raise ValueError(msg)
        return v

validate_name(v) classmethod

Ensure test name contains only valid characters.

Source code in src/agentprobe/core/models.py
@field_validator("name")
@classmethod
def validate_name(cls, v: str) -> str:
    """Ensure test name contains only valid characters."""
    # Strip the allowed separators (underscore, hyphen, space), then remove
    # dots; whatever remains must be purely alphanumeric.  A name made up
    # entirely of separators is rejected, since ''.isalnum() is False.
    cleaned = v.replace("_", "").replace("-", "").replace(" ", "")
    if not cleaned.replace(".", "").isalnum():
        msg = "Test name must be alphanumeric with underscores, hyphens, spaces, or dots"
        raise ValueError(msg)
    return v

TestResult

Bases: BaseModel

Complete result of executing a single test case.

Attributes:

Name Type Description
result_id str

Unique identifier for this result.

test_name str

Name of the test that was executed.

status TestStatus

Final status of the test execution.

score float

Aggregate score from evaluators (0.0 to 1.0).

duration_ms int

Execution time in milliseconds.

trace Trace | None

The execution trace, if recording was enabled.

eval_results tuple[EvalResult, ...]

Results from all evaluators run on this test.

assertion_results tuple[AssertionResult, ...]

Results from all assertions.

error_message str | None

Error description if the test errored.

created_at datetime

When the result was recorded.

Source code in src/agentprobe/core/models.py
class TestResult(BaseModel):
    """Complete result of executing a single test case.

    Attributes:
        result_id: Unique identifier for this result.
        test_name: Name of the test that was executed.
        status: Final status of the test execution.
        score: Aggregate score from evaluators (0.0 to 1.0).
        duration_ms: Execution time in milliseconds.
        trace: The execution trace, if recording was enabled.
        eval_results: Results from all evaluators run on this test.
        assertion_results: Results from all assertions.
        error_message: Error description if the test errored.
        created_at: When the result was recorded.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    result_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID4 per instance
    test_name: str = Field(..., min_length=1, max_length=200)  # same bounds as TestCase.name
    status: TestStatus  # required
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    duration_ms: int = Field(default=0, ge=0)
    trace: Trace | None = None  # None when trace recording was disabled
    eval_results: tuple[EvalResult, ...] = ()
    assertion_results: tuple[AssertionResult, ...] = ()
    error_message: str | None = None
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))  # timezone-aware UTC

CostBreakdown

Bases: BaseModel

Cost breakdown for a single model.

Attributes:

Name Type Description
model str

The model identifier.

input_tokens int

Total input tokens for this model.

output_tokens int

Total output tokens for this model.

input_cost_usd float

Cost for input tokens in USD.

output_cost_usd float

Cost for output tokens in USD.

total_cost_usd float

Total cost in USD.

call_count int

Number of calls to this model.

Source code in src/agentprobe/core/models.py
class CostBreakdown(BaseModel):
    """Cost breakdown for a single model.

    Attributes:
        model: The model identifier.
        input_tokens: Total input tokens for this model.
        output_tokens: Total output tokens for this model.
        input_cost_usd: Cost for input tokens in USD.
        output_cost_usd: Cost for output tokens in USD.
        total_cost_usd: Total cost in USD.
        call_count: Number of calls to this model.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    model: str  # required
    input_tokens: int = Field(default=0, ge=0)
    output_tokens: int = Field(default=0, ge=0)
    input_cost_usd: float = Field(default=0.0, ge=0.0)
    output_cost_usd: float = Field(default=0.0, ge=0.0)
    # Stored, not derived — not validated as input_cost_usd + output_cost_usd.
    total_cost_usd: float = Field(default=0.0, ge=0.0)
    call_count: int = Field(default=0, ge=0)

CostSummary

Bases: BaseModel

Aggregate cost summary for a trace or test suite.

Attributes:

Name Type Description
total_llm_cost_usd float

Total cost of all LLM calls in USD.

total_tool_cost_usd float

Total cost of tool usage in USD.

total_cost_usd float

Grand total cost in USD.

breakdown_by_model dict[str, CostBreakdown]

Per-model cost breakdown.

total_input_tokens int

Aggregate input tokens.

total_output_tokens int

Aggregate output tokens.

Source code in src/agentprobe/core/models.py
class CostSummary(BaseModel):
    """Aggregate cost summary for a trace or test suite.

    Attributes:
        total_llm_cost_usd: Total cost of all LLM calls in USD.
        total_tool_cost_usd: Total cost of tool usage in USD.
        total_cost_usd: Grand total cost in USD.
        breakdown_by_model: Per-model cost breakdown.
        total_input_tokens: Aggregate input tokens.
        total_output_tokens: Aggregate output tokens.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    total_llm_cost_usd: float = Field(default=0.0, ge=0.0)
    total_tool_cost_usd: float = Field(default=0.0, ge=0.0)
    # Stored, not derived — not validated as llm + tool cost.
    total_cost_usd: float = Field(default=0.0, ge=0.0)
    # Keyed by model identifier — presumably matching CostBreakdown.model.
    breakdown_by_model: dict[str, CostBreakdown] = Field(default_factory=dict)
    total_input_tokens: int = Field(default=0, ge=0)
    total_output_tokens: int = Field(default=0, ge=0)

MetricType

Bases: StrEnum

Type of metric being measured.

Source code in src/agentprobe/core/models.py
class MetricType(StrEnum):
    """Type of metric being measured."""

    LATENCY = "latency"    # time-based measurement
    COST = "cost"          # monetary measurement
    TOKENS = "tokens"      # token-count measurement
    SCORE = "score"        # evaluation score
    COUNT = "count"        # plain event count
    RATE = "rate"          # ratio/frequency measurement

TrendDirection

Bases: StrEnum

Direction of a metric trend over time.

Source code in src/agentprobe/core/models.py
class TrendDirection(StrEnum):
    """Direction of a metric trend over time."""

    IMPROVING = "improving"
    DEGRADING = "degrading"
    STABLE = "stable"
    # Sentinel for when too few data points exist to classify a trend.
    INSUFFICIENT_DATA = "insufficient_data"

PluginType

Bases: StrEnum

Type of plugin extension.

Source code in src/agentprobe/core/models.py
class PluginType(StrEnum):
    """Type of plugin extension."""

    EVALUATOR = "evaluator"    # scores/judges agent output
    ADAPTER = "adapter"        # connects an agent framework to the runner
    REPORTER = "reporter"      # renders/exports results
    STORAGE = "storage"        # persists results/baselines

ChaosType

Bases: StrEnum

Type of chaos fault to inject during testing.

Source code in src/agentprobe/core/models.py
class ChaosType(StrEnum):
    """Type of chaos fault to inject during testing."""

    TIMEOUT = "timeout"          # simulate a call that never completes in time
    ERROR = "error"              # simulate a failing call
    MALFORMED = "malformed"      # simulate a corrupted/invalid response
    RATE_LIMIT = "rate_limit"    # simulate provider rate limiting
    SLOW = "slow"                # simulate degraded latency
    EMPTY = "empty"              # simulate an empty response

ConversationTurn

Bases: BaseModel

Specification for a single turn in a multi-turn conversation test.

Attributes:

Name Type Description
turn_id str

Unique identifier for this turn.

input_text str

The input to send for this turn.

expected_output str | None

Optional expected output for this turn.

evaluators tuple[str, ...]

Evaluator names to run on this turn's result.

metadata dict[str, Any]

Additional turn-level configuration.

Source code in src/agentprobe/core/models.py
class ConversationTurn(BaseModel):
    """Specification for a single turn in a multi-turn conversation test.

    Attributes:
        turn_id: Unique identifier for this turn.
        input_text: The input to send for this turn.
        expected_output: Optional expected output for this turn.
        evaluators: Evaluator names to run on this turn's result.
        metadata: Additional turn-level configuration.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    turn_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID4 per instance
    input_text: str  # required — unlike most models here, no empty default
    expected_output: str | None = None
    evaluators: tuple[str, ...] = ()  # tuple (not list) because the model is frozen
    metadata: dict[str, Any] = Field(default_factory=dict)

TurnResult

Bases: BaseModel

Result from executing a single conversation turn.

Attributes:

Name Type Description
turn_index int

Zero-based index of this turn.

input_text str

The input sent for this turn.

trace Trace | None

Execution trace from this turn.

eval_results tuple[EvalResult, ...]

Results from evaluators run on this turn.

duration_ms int

Execution time for this turn in milliseconds.

Source code in src/agentprobe/core/models.py
class TurnResult(BaseModel):
    """Result from executing a single conversation turn.

    Attributes:
        turn_index: Zero-based index of this turn.
        input_text: The input sent for this turn.
        trace: Execution trace from this turn.
        eval_results: Results from evaluators run on this turn.
        duration_ms: Execution time for this turn in milliseconds.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    turn_index: int = Field(ge=0)  # required; zero-based position in the conversation
    input_text: str = ""
    trace: Trace | None = None  # None when trace recording was disabled
    eval_results: tuple[EvalResult, ...] = ()
    duration_ms: int = Field(default=0, ge=0)

ConversationResult

Bases: BaseModel

Aggregate result from a multi-turn conversation test.

Attributes:

Name Type Description
conversation_id str

Unique identifier for this conversation.

agent_name str

Name of the agent tested.

turn_results tuple[TurnResult, ...]

Per-turn results in order.

total_turns int

Number of turns executed.

passed_turns int

Number of turns where all evaluators passed.

aggregate_score float

Mean score across all turns.

total_duration_ms int

Total execution time in milliseconds.

Source code in src/agentprobe/core/models.py
class ConversationResult(BaseModel):
    """Aggregate result from a multi-turn conversation test.

    Attributes:
        conversation_id: Unique identifier for this conversation.
        agent_name: Name of the agent tested.
        turn_results: Per-turn results in order.
        total_turns: Number of turns executed.
        passed_turns: Number of turns where all evaluators passed.
        aggregate_score: Mean score across all turns.
        total_duration_ms: Total execution time in milliseconds.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    conversation_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID4 per instance
    agent_name: str = ""
    turn_results: tuple[TurnResult, ...] = ()
    # Counts/aggregates are stored, not derived from turn_results; there is
    # no validation that total_turns == len(turn_results) or
    # passed_turns <= total_turns.
    total_turns: int = Field(default=0, ge=0)
    passed_turns: int = Field(default=0, ge=0)
    aggregate_score: float = Field(default=0.0, ge=0.0, le=1.0)
    total_duration_ms: int = Field(default=0, ge=0)

StatisticalSummary

Bases: BaseModel

Summary statistics from repeated evaluations.

Attributes:

Name Type Description
evaluator_name str

Name of the evaluator that produced these stats.

sample_count int

Number of evaluation runs.

scores tuple[float, ...]

Raw scores from each run (for reproducibility).

mean float

Arithmetic mean of scores.

std_dev float

Standard deviation of scores.

median float

Median score.

p5 float

5th percentile score.

p95 float

95th percentile score.

ci_lower float

Lower bound of 95% confidence interval.

ci_upper float

Upper bound of 95% confidence interval.

Source code in src/agentprobe/core/models.py
class StatisticalSummary(BaseModel):
    """Summary statistics from repeated evaluations.

    Attributes:
        evaluator_name: Name of the evaluator that produced these stats.
        sample_count: Number of evaluation runs.
        scores: Raw scores from each run (for reproducibility).
        mean: Arithmetic mean of scores.
        std_dev: Standard deviation of scores.
        median: Median score.
        p5: 5th percentile score.
        p95: 95th percentile score.
        ci_lower: Lower bound of 95% confidence interval.
        ci_upper: Upper bound of 95% confidence interval.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    evaluator_name: str  # required
    sample_count: int = Field(ge=1)  # required; at least one run must have happened
    scores: tuple[float, ...] = ()  # raw per-run scores; not validated against sample_count
    # All score statistics are bounded to [0, 1] like individual EvalResult scores;
    # std_dev has no upper bound.
    mean: float = Field(default=0.0, ge=0.0, le=1.0)
    std_dev: float = Field(default=0.0, ge=0.0)
    median: float = Field(default=0.0, ge=0.0, le=1.0)
    p5: float = Field(default=0.0, ge=0.0, le=1.0)
    p95: float = Field(default=0.0, ge=0.0, le=1.0)
    ci_lower: float = Field(default=0.0, ge=0.0, le=1.0)
    ci_upper: float = Field(default=0.0, ge=0.0, le=1.0)

TestComparison

Bases: BaseModel

Comparison of a single test between baseline and current results.

Attributes:

Name Type Description
test_name str

Name of the compared test.

baseline_score float

Score from the baseline run.

current_score float

Score from the current run.

delta float

Score change (current - baseline).

is_regression bool

Whether the change constitutes a regression.

is_improvement bool

Whether the change constitutes an improvement.

Source code in src/agentprobe/core/models.py
class TestComparison(BaseModel):
    """Comparison of a single test between baseline and current results.

    Attributes:
        test_name: Name of the compared test.
        baseline_score: Score from the baseline run.
        current_score: Score from the current run.
        delta: Score change (current - baseline).
        is_regression: Whether the change constitutes a regression.
        is_improvement: Whether the change constitutes an improvement.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    test_name: str  # required
    baseline_score: float = Field(ge=0.0, le=1.0)  # required
    current_score: float = Field(ge=0.0, le=1.0)  # required
    # delta may be negative; stored, not derived from the two scores above.
    delta: float = 0.0
    is_regression: bool = False
    is_improvement: bool = False

RegressionReport

Bases: BaseModel

Report from comparing current results against a baseline.

Attributes:

Name Type Description
baseline_name str

Name of the baseline used for comparison.

comparisons tuple[TestComparison, ...]

Per-test comparisons.

total_tests int

Number of tests compared.

regressions int

Number of tests that regressed.

improvements int

Number of tests that improved.

unchanged int

Number of tests with no significant change.

threshold float

Score delta threshold used for regression detection.

Source code in src/agentprobe/core/models.py
class RegressionReport(BaseModel):
    """Report from comparing current results against a baseline.

    Attributes:
        baseline_name: Name of the baseline used for comparison.
        comparisons: Per-test comparisons.
        total_tests: Number of tests compared.
        regressions: Number of tests that regressed.
        improvements: Number of tests that improved.
        unchanged: Number of tests with no significant change.
        threshold: Score delta threshold used for regression detection.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    baseline_name: str  # required
    comparisons: tuple[TestComparison, ...] = ()
    # Counts are stored, not derived — not validated against comparisons.
    total_tests: int = Field(default=0, ge=0)
    regressions: int = Field(default=0, ge=0)
    improvements: int = Field(default=0, ge=0)
    unchanged: int = Field(default=0, ge=0)
    threshold: float = Field(default=0.05, ge=0.0, le=1.0)  # default: 5-point score delta

BudgetCheckResult

Bases: BaseModel

Result of checking a cost against a budget.

Attributes:

Name Type Description
within_budget bool

Whether the cost is within the budget.

actual_cost_usd float

The actual cost incurred.

budget_limit_usd float

The budget limit.

remaining_usd float

Budget remaining (may be negative if exceeded).

utilization_pct float

Percentage of budget used.

Source code in src/agentprobe/core/models.py
class BudgetCheckResult(BaseModel):
    """Result of checking a cost against a budget.

    Attributes:
        within_budget: Whether the cost is within the budget.
        actual_cost_usd: The actual cost incurred.
        budget_limit_usd: The budget limit.
        remaining_usd: Budget remaining (may be negative if exceeded).
        utilization_pct: Percentage of budget used.
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    within_budget: bool  # required
    actual_cost_usd: float = Field(ge=0.0)  # required
    budget_limit_usd: float = Field(ge=0.0)  # required
    # Deliberately unbounded: goes negative when the budget was exceeded.
    remaining_usd: float = 0.0
    utilization_pct: float = Field(default=0.0, ge=0.0)  # may exceed 100 when over budget

DiffItem

Bases: BaseModel

A single difference between two snapshots.

Attributes:

Name Type Description
dimension str

The dimension being compared (e.g. 'tool_calls', 'cost').

expected Any

The expected (baseline) value.

actual Any

The actual (current) value.

similarity float

Similarity score for this dimension (0.0 to 1.0).

Source code in src/agentprobe/core/models.py
class DiffItem(BaseModel):
    """A single difference between two snapshots.

    Attributes:
        dimension: The dimension being compared (e.g. 'tool_calls', 'cost').
        expected: The expected (baseline) value.
        actual: The actual (current) value.
        similarity: Similarity score for this dimension (0.0 to 1.0).
    """

    # strict: no implicit type coercion; frozen: immutable after creation;
    # extra="forbid": unknown field names are rejected at construction.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    dimension: str  # required; free-form dimension name, not an enum
    expected: Any = None  # untyped: dimensions may hold any value shape
    actual: Any = None
    similarity: float = Field(default=0.0, ge=0.0, le=1.0)

SnapshotDiff

Bases: BaseModel

Comparison result between a snapshot and current output.

Attributes:

Name Type Description
snapshot_name str

Name of the snapshot being compared.

overall_similarity float

Weighted average similarity across dimensions.

diffs tuple[DiffItem, ...]

Per-dimension comparison details.

is_match bool

Whether the overall similarity meets the threshold.

threshold float

Similarity threshold used.

Source code in src/agentprobe/core/models.py
class SnapshotDiff(BaseModel):
    """Comparison result between a snapshot and current output.

    Attributes:
        snapshot_name: Name of the snapshot being compared.
        overall_similarity: Weighted average similarity across dimensions.
        diffs: Per-dimension comparison details.
        is_match: Whether the overall similarity meets the threshold.
        threshold: Similarity threshold used.
    """

    # Output type: strict validation, immutable after creation, unknown fields rejected.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    snapshot_name: str
    overall_similarity: float = Field(default=0.0, ge=0.0, le=1.0)
    diffs: tuple[DiffItem, ...] = ()  # tuple keeps the frozen model deeply immutable
    is_match: bool = False
    threshold: float = Field(default=0.8, ge=0.0, le=1.0)  # default: 80% similarity bar

TraceStep

Bases: BaseModel

A single step in a time-travel trace, with cumulative metrics.

Attributes:

Name Type Description
step_index int

Zero-based index of this step.

turn Turn

The trace turn at this step.

cumulative_input_tokens int

Total input tokens up to this step.

cumulative_output_tokens int

Total output tokens up to this step.

cumulative_cost_usd float

Estimated cumulative cost up to this step.

cumulative_latency_ms int

Total latency up to this step.

Source code in src/agentprobe/core/models.py
class TraceStep(BaseModel):
    """A single step in a time-travel trace, with cumulative metrics.

    Attributes:
        step_index: Zero-based index of this step.
        turn: The trace turn at this step.
        cumulative_input_tokens: Total input tokens up to this step.
        cumulative_output_tokens: Total output tokens up to this step.
        cumulative_cost_usd: Estimated cumulative cost up to this step.
        cumulative_latency_ms: Total latency up to this step.
    """

    # Output type: strict validation, immutable after creation, unknown fields rejected.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    step_index: int = Field(ge=0)  # required; zero-based
    turn: Turn
    # Running totals accumulated over the trace so far.
    cumulative_input_tokens: int = Field(default=0, ge=0)
    cumulative_output_tokens: int = Field(default=0, ge=0)
    cumulative_cost_usd: float = Field(default=0.0, ge=0.0)
    cumulative_latency_ms: int = Field(default=0, ge=0)

ReplayDiff

Bases: BaseModel

Diff between an original trace and a replay trace.

Attributes:

Name Type Description
original_trace_id str

ID of the original trace.

replay_trace_id str

ID of the replay trace.

tool_call_diffs tuple[DiffItem, ...]

Differences in tool calls.

output_matches bool

Whether the outputs match.

original_output str

Output from the original trace.

replay_output str

Output from the replay trace.

Source code in src/agentprobe/core/models.py
class ReplayDiff(BaseModel):
    """Diff between an original trace and a replay trace.

    Attributes:
        original_trace_id: ID of the original trace.
        replay_trace_id: ID of the replay trace.
        tool_call_diffs: Differences in tool calls.
        output_matches: Whether the outputs match.
        original_output: Output from the original trace.
        replay_output: Output from the replay trace.
    """

    # Output type: strict validation, immutable after creation, unknown fields rejected.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    # All fields default to empty/False, so a bare ReplayDiff() is constructible.
    original_trace_id: str = ""
    replay_trace_id: str = ""
    tool_call_diffs: tuple[DiffItem, ...] = ()
    output_matches: bool = False
    original_output: str = ""
    replay_output: str = ""

ChaosOverride

Bases: BaseModel

Configuration for a single chaos fault injection.

Attributes:

Name Type Description
chaos_type ChaosType

Type of fault to inject.

probability float

Probability of applying this fault (0.0 to 1.0).

target_tool str | None

If set, only apply to this specific tool.

delay_ms int

Delay in ms for SLOW type.

error_message str

Custom error message for ERROR type.

Source code in src/agentprobe/core/models.py
class ChaosOverride(BaseModel):
    """Configuration for a single chaos fault injection.

    Attributes:
        chaos_type: Type of fault to inject.
        probability: Probability of applying this fault (0.0 to 1.0).
        target_tool: If set, only apply to this specific tool.
        delay_ms: Delay in ms for SLOW type.
        error_message: Custom error message for ERROR type.
    """

    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    chaos_type: ChaosType  # required: which fault to inject
    probability: float = Field(default=1.0, ge=0.0, le=1.0)  # 1.0 = always inject
    target_tool: str | None = None  # None = no per-tool restriction
    delay_ms: int = Field(default=5000, ge=0)  # only used by the SLOW fault type
    error_message: str = "Chaos fault injected"  # only used by the ERROR fault type

AgentRun

Bases: BaseModel

A complete agent test run encompassing multiple test results.

Attributes:

Name Type Description
run_id str

Unique identifier for this run.

agent_name str

Name of the agent tested.

status RunStatus

Overall run status.

test_results tuple[TestResult, ...]

All test results from this run.

total_tests int

Total number of tests.

passed int

Number of tests that passed.

failed int

Number of tests that failed.

errors int

Number of tests that errored.

skipped int

Number of tests skipped.

cost_summary CostSummary | None

Aggregate cost for the run.

duration_ms int

Total run duration in milliseconds.

tags tuple[str, ...]

Tags for filtering.

metadata dict[str, Any]

Additional run metadata.

created_at datetime

When the run started.

Source code in src/agentprobe/core/models.py
class AgentRun(BaseModel):
    """A complete agent test run encompassing multiple test results.

    Attributes:
        run_id: Unique identifier for this run.
        agent_name: Name of the agent tested.
        status: Overall run status.
        test_results: All test results from this run.
        total_tests: Total number of tests.
        passed: Number of tests that passed.
        failed: Number of tests that failed.
        errors: Number of tests that errored.
        skipped: Number of tests skipped.
        cost_summary: Aggregate cost for the run.
        duration_ms: Total run duration in milliseconds.
        tags: Tags for filtering.
        metadata: Additional run metadata.
        created_at: When the run started.
    """

    # Output type: strict validation, immutable after creation, unknown fields rejected.
    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    run_id: str = Field(default_factory=lambda: str(uuid4()))  # fresh UUID per instance
    agent_name: str
    status: RunStatus
    test_results: tuple[TestResult, ...] = ()
    # Per-status counters. NOTE(review): TestRunner.run does not tally TIMEOUT
    # results into any bucket, so the counters may sum to less than total_tests.
    total_tests: int = Field(default=0, ge=0)
    passed: int = Field(default=0, ge=0)
    failed: int = Field(default=0, ge=0)
    errors: int = Field(default=0, ge=0)
    skipped: int = Field(default=0, ge=0)
    cost_summary: CostSummary | None = None  # aggregate cost, if collected
    duration_ms: int = Field(default=0, ge=0)
    tags: tuple[str, ...] = ()
    metadata: dict[str, Any] = Field(default_factory=dict)
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))  # timezone-aware UTC

MetricDefinition

Bases: BaseModel

Definition of a named metric that can be collected and tracked.

Attributes:

Name Type Description
name str

Unique metric identifier (e.g. 'latency_ms', 'token_cost_usd').

metric_type MetricType

Category of the metric.

description str

Human-readable description.

unit str

Unit of measurement (e.g. 'ms', 'usd', 'count').

lower_is_better bool

Whether lower values indicate better performance.

Source code in src/agentprobe/core/models.py
class MetricDefinition(BaseModel):
    """Definition of a named metric that can be collected and tracked.

    Attributes:
        name: Unique metric identifier (e.g. 'latency_ms', 'token_cost_usd').
        metric_type: Category of the metric.
        description: Human-readable description.
        unit: Unit of measurement (e.g. 'ms', 'usd', 'count').
        lower_is_better: Whether lower values indicate better performance.
    """

    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    name: str = Field(..., min_length=1, max_length=200)  # required, non-empty, bounded length
    metric_type: MetricType
    description: str = ""
    unit: str = ""
    lower_is_better: bool = True  # default suits latency/cost-style metrics

MetricValue

Bases: BaseModel

A single metric measurement at a point in time.

Attributes:

Name Type Description
metric_name str

Name of the metric this value belongs to.

value float

The numeric measurement.

tags tuple[str, ...]

Tags for filtering and grouping.

metadata dict[str, Any]

Additional context about this measurement.

timestamp datetime

When the measurement was taken.

Source code in src/agentprobe/core/models.py
class MetricValue(BaseModel):
    """A single metric measurement at a point in time.

    Attributes:
        metric_name: Name of the metric this value belongs to.
        value: The numeric measurement.
        tags: Tags for filtering and grouping.
        metadata: Additional context about this measurement.
        timestamp: When the measurement was taken.
    """

    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    metric_name: str = Field(..., min_length=1)  # required, non-empty
    value: float
    tags: tuple[str, ...] = ()
    metadata: dict[str, Any] = Field(default_factory=dict)
    timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC))  # timezone-aware UTC

MetricAggregation

Bases: BaseModel

Aggregated statistics for a collection of metric values.

Attributes:

Name Type Description
metric_name str

Name of the metric.

count int

Number of values aggregated.

mean float

Arithmetic mean.

median float

Median value.

min_value float

Minimum value.

max_value float

Maximum value.

p95 float

95th percentile.

p99 float

99th percentile.

std_dev float

Standard deviation.

Source code in src/agentprobe/core/models.py
class MetricAggregation(BaseModel):
    """Aggregated statistics for a collection of metric values.

    Attributes:
        metric_name: Name of the metric.
        count: Number of values aggregated.
        mean: Arithmetic mean.
        median: Median value.
        min_value: Minimum value.
        max_value: Maximum value.
        p95: 95th percentile.
        p99: 99th percentile.
        std_dev: Standard deviation.
    """

    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    metric_name: str = Field(..., min_length=1)  # required, non-empty
    count: int = Field(ge=1)  # required; an aggregation needs at least one value
    mean: float = 0.0
    median: float = 0.0
    min_value: float = 0.0
    max_value: float = 0.0
    p95: float = 0.0
    p99: float = 0.0
    std_dev: float = Field(default=0.0, ge=0.0)  # standard deviation is non-negative

TraceDiffReport

Bases: BaseModel

Report from comparing two independent traces.

Compares output text, tool call sequences, model usage, token counts, and latency between any two traces.

Attributes:

Name Type Description
trace_a_id str

ID of the first trace.

trace_b_id str

ID of the second trace.

tool_call_diffs tuple[DiffItem, ...]

Per-tool-call comparison items.

output_matches bool

Whether the output texts match exactly.

token_delta int

Difference in total tokens (B - A).

latency_delta_ms int

Difference in total latency (B - A).

overall_similarity float

Weighted similarity score (0.0 to 1.0).

Source code in src/agentprobe/core/models.py
class TraceDiffReport(BaseModel):
    """Report from comparing two independent traces.

    Compares output text, tool call sequences, model usage,
    token counts, and latency between any two traces.

    Attributes:
        trace_a_id: ID of the first trace.
        trace_b_id: ID of the second trace.
        tool_call_diffs: Per-tool-call comparison items.
        output_matches: Whether the output texts match exactly.
        token_delta: Difference in total tokens (B - A).
        latency_delta_ms: Difference in total latency (B - A).
        overall_similarity: Weighted similarity score (0.0 to 1.0).
    """

    model_config = ConfigDict(strict=True, frozen=True, extra="forbid")

    trace_a_id: str = ""
    trace_b_id: str = ""
    tool_call_diffs: tuple[DiffItem, ...] = ()
    output_matches: bool = False
    token_delta: int = 0  # signed: trace B total minus trace A total
    latency_delta_ms: int = 0  # signed: trace B total minus trace A total
    overall_similarity: float = Field(default=0.0, ge=0.0, le=1.0)

Runner

agentprobe.core.runner

Test runner: orchestrates test execution with optional parallelism.

Discovers tests, invokes them against an adapter, runs evaluators, and assembles results into an AgentRun.

TestRunner

Orchestrates test case execution against an agent adapter.

Supports sequential and parallel execution modes, per-test timeouts, and evaluator orchestration.

Attributes:

Name Type Description
config

The runner configuration.

evaluators

Evaluators to run on each test result.

Source code in src/agentprobe/core/runner.py
class TestRunner:
    """Orchestrates test case execution against an agent adapter.

    Supports sequential and parallel execution modes, per-test timeouts,
    and evaluator orchestration.

    Attributes:
        config: The runner configuration.
        evaluators: Evaluators to run on each test result.
    """

    def __init__(
        self,
        config: AgentProbeConfig | None = None,
        evaluators: list[EvaluatorProtocol] | None = None,
    ) -> None:
        """Initialize the test runner.

        Args:
            config: AgentProbe configuration. Uses defaults if None.
            evaluators: Evaluators to apply to test results.
        """
        self._config = config or AgentProbeConfig()
        self._evaluators = evaluators or []

    async def run(
        self,
        test_cases: Sequence[TestCase],
        adapter: AdapterProtocol,
    ) -> AgentRun:
        """Execute test cases against an adapter and collect results.

        Args:
            test_cases: The test cases to execute.
            adapter: The agent adapter to test.

        Returns:
            An AgentRun with all results.
        """
        start = time.monotonic()
        results: list[TestResult] = []

        if self._config.runner.parallel:
            results = await self._run_parallel(list(test_cases), adapter)
        else:
            results = await self._run_sequential(list(test_cases), adapter)

        elapsed_ms = int((time.monotonic() - start) * 1000)
        passed = sum(1 for r in results if r.status == TestStatus.PASSED)
        failed = sum(1 for r in results if r.status == TestStatus.FAILED)
        errors = sum(1 for r in results if r.status == TestStatus.ERROR)
        skipped = sum(1 for r in results if r.status == TestStatus.SKIPPED)

        status = RunStatus.COMPLETED if errors == 0 else RunStatus.FAILED

        return AgentRun(
            agent_name=adapter.name,
            status=status,
            test_results=tuple(results),
            total_tests=len(results),
            passed=passed,
            failed=failed,
            errors=errors,
            skipped=skipped,
            duration_ms=elapsed_ms,
        )

    async def _run_sequential(
        self,
        test_cases: list[TestCase],
        adapter: AdapterProtocol,
    ) -> list[TestResult]:
        """Execute tests one at a time."""
        results: list[TestResult] = []
        for tc in test_cases:
            result = await self._execute_single(tc, adapter)
            results.append(result)
        return results

    async def _run_parallel(
        self,
        test_cases: list[TestCase],
        adapter: AdapterProtocol,
    ) -> list[TestResult]:
        """Execute tests concurrently with a semaphore limit."""
        semaphore = asyncio.Semaphore(self._config.runner.max_workers)
        results: list[TestResult] = [None] * len(test_cases)  # type: ignore[list-item]

        async def _run_with_semaphore(idx: int, tc: TestCase) -> None:
            async with semaphore:
                results[idx] = await self._execute_single(tc, adapter)

        async with asyncio.TaskGroup() as tg:
            for i, tc in enumerate(test_cases):
                tg.create_task(_run_with_semaphore(i, tc))

        return results

    async def _execute_single(
        self,
        test_case: TestCase,
        adapter: AdapterProtocol,
    ) -> TestResult:
        """Execute a single test case with timeout and error handling.

        Args:
            test_case: The test to execute.
            adapter: The agent adapter.

        Returns:
            A TestResult reflecting the outcome.
        """
        start = time.monotonic()
        timeout = test_case.timeout_seconds or self._config.runner.default_timeout

        try:
            trace = await asyncio.wait_for(
                adapter.invoke(test_case.input_text),
                timeout=timeout,
            )
        except TimeoutError:
            elapsed_ms = int((time.monotonic() - start) * 1000)
            logger.warning("Test '%s' timed out after %.1fs", test_case.name, timeout)
            return TestResult(
                test_name=test_case.name,
                status=TestStatus.TIMEOUT,
                duration_ms=elapsed_ms,
                error_message=f"Timed out after {timeout}s",
            )
        except Exception as exc:
            elapsed_ms = int((time.monotonic() - start) * 1000)
            logger.error("Test '%s' errored: %s", test_case.name, exc)
            return TestResult(
                test_name=test_case.name,
                status=TestStatus.ERROR,
                duration_ms=elapsed_ms,
                error_message=str(exc),
            )

        eval_results = await self._run_evaluators(test_case, trace)
        elapsed_ms = int((time.monotonic() - start) * 1000)

        if eval_results:
            avg_score = sum(r.score for r in eval_results) / len(eval_results)
            all_passed = all(r.verdict.value in ("pass", "partial") for r in eval_results)
        else:
            avg_score = 1.0
            all_passed = True

        status = TestStatus.PASSED if all_passed else TestStatus.FAILED

        return TestResult(
            test_name=test_case.name,
            status=status,
            score=avg_score,
            duration_ms=elapsed_ms,
            trace=trace,
            eval_results=tuple(eval_results),
        )

    async def _run_evaluators(
        self,
        test_case: TestCase,
        trace: Trace,
    ) -> list[EvalResult]:
        """Run all evaluators against a test result.

        Args:
            test_case: The test case.
            trace: The execution trace.

        Returns:
            List of evaluation results.
        """
        results: list[EvalResult] = []
        for evaluator in self._evaluators:
            try:
                result = await evaluator.evaluate(test_case, trace)
                results.append(result)
            except Exception:
                logger.exception(
                    "Evaluator '%s' failed for test '%s'",
                    evaluator.name,
                    test_case.name,
                )
        return results

__init__(config=None, evaluators=None)

Initialize the test runner.

Parameters:

Name Type Description Default
config AgentProbeConfig | None

AgentProbe configuration. Uses defaults if None.

None
evaluators list[EvaluatorProtocol] | None

Evaluators to apply to test results.

None
Source code in src/agentprobe/core/runner.py
def __init__(
    self,
    config: AgentProbeConfig | None = None,
    evaluators: list[EvaluatorProtocol] | None = None,
) -> None:
    """Initialize the test runner.

    Args:
        config: AgentProbe configuration. Uses defaults if None.
        evaluators: Evaluators to apply to test results.
    """
    # Substitute defaults for anything the caller left unset; falsy inputs
    # (None or empty) fall back exactly as before.
    self._config = config or AgentProbeConfig()
    self._evaluators = evaluators or []

run(test_cases, adapter) async

Execute test cases against an adapter and collect results.

Parameters:

Name Type Description Default
test_cases Sequence[TestCase]

The test cases to execute.

required
adapter AdapterProtocol

The agent adapter to test.

required

Returns:

Type Description
AgentRun

An AgentRun with all results.

Source code in src/agentprobe/core/runner.py
async def run(
    self,
    test_cases: Sequence[TestCase],
    adapter: AdapterProtocol,
) -> AgentRun:
    """Execute test cases against an adapter and collect results.

    Args:
        test_cases: The test cases to execute.
        adapter: The agent adapter to test.

    Returns:
        An AgentRun with all results.
    """
    started = time.monotonic()

    cases = list(test_cases)
    if self._config.runner.parallel:
        results = await self._run_parallel(cases, adapter)
    else:
        results = await self._run_sequential(cases, adapter)

    # Tally outcomes by status in a single pass.
    tally: dict[TestStatus, int] = {s: 0 for s in TestStatus}
    for outcome in results:
        tally[outcome.status] += 1

    # Only ERROR outcomes fail the run; test failures leave it COMPLETED.
    # NOTE(review): TIMEOUT results land in no bucket and do not affect run
    # status — confirm this is intended.
    run_status = RunStatus.FAILED if tally[TestStatus.ERROR] else RunStatus.COMPLETED

    return AgentRun(
        agent_name=adapter.name,
        status=run_status,
        test_results=tuple(results),
        total_tests=len(results),
        passed=tally[TestStatus.PASSED],
        failed=tally[TestStatus.FAILED],
        errors=tally[TestStatus.ERROR],
        skipped=tally[TestStatus.SKIPPED],
        duration_ms=int((time.monotonic() - started) * 1000),
    )

Assertions

agentprobe.core.assertions

Fluent assertion API for validating agent outputs and tool calls.

Provides expect() and expect_tool_calls() entry points that return chainable expectation objects.

OutputExpectation

Fluent expectation chain for validating string output.

Each assertion method returns self for chaining; a failing assertion raises AssertionFailedError immediately. Results accumulate in results and can be checked with all_passed().

Example
expect(output).to_contain("hello").to_not_contain("error")
Source code in src/agentprobe/core/assertions.py
class OutputExpectation:
    """Fluent expectation chain for validating string output.

    Each assertion method returns ``self`` for chaining; a failing
    assertion raises ``AssertionFailedError`` immediately. Every check
    is also recorded in ``results``, which ``all_passed()`` inspects.

    Example:
        ```python
        expect(output).to_contain("hello").to_not_contain("error")
        ```
    """

    def __init__(self, output: str) -> None:
        self._output = output
        self.results: list[AssertionResult] = []

    def _record(
        self,
        assertion_type: str,
        passed: bool,
        expected: object,
        actual: object,
        message: str = "",
    ) -> OutputExpectation:
        # Record every check, then fail fast on the first unmet assertion.
        outcome = AssertionResult(
            assertion_type=assertion_type,
            passed=passed,
            expected=expected,
            actual=actual,
            message=message,
        )
        self.results.append(outcome)
        if passed:
            return self
        raise AssertionFailedError(
            assertion_type=assertion_type,
            expected=expected,
            actual=actual,
            message=message or None,
        )

    def to_contain(self, substring: str) -> OutputExpectation:
        """Assert that the given substring appears in the output.

        Args:
            substring: The substring to search for.

        Returns:
            Self for chaining.
        """
        ok = substring in self._output
        note = "" if ok else f"Expected output to contain '{substring}'"
        return self._record("contain", ok, substring, self._output[:200], note)

    def to_not_contain(self, substring: str) -> OutputExpectation:
        """Assert that the given substring is absent from the output.

        Args:
            substring: The substring that should not appear.

        Returns:
            Self for chaining.
        """
        ok = substring not in self._output
        note = "" if ok else f"Expected output to not contain '{substring}'"
        return self._record("not_contain", ok, substring, self._output[:200], note)

    def to_match(self, pattern: str) -> OutputExpectation:
        """Assert that a regex pattern is found somewhere in the output.

        Args:
            pattern: Regular expression pattern.

        Returns:
            Self for chaining.
        """
        ok = re.search(pattern, self._output) is not None
        note = "" if ok else f"Expected output to match pattern '{pattern}'"
        return self._record("match", ok, pattern, self._output[:200], note)

    def to_have_length_less_than(self, max_length: int) -> OutputExpectation:
        """Assert that the output is shorter than the given length.

        Args:
            max_length: Maximum allowed length.

        Returns:
            Self for chaining.
        """
        length = len(self._output)
        ok = length < max_length
        note = "" if ok else f"Expected length < {max_length}, got {length}"
        return self._record("length_less_than", ok, max_length, length, note)

    def to_be_valid_json(self) -> OutputExpectation:
        """Assert that the output parses as JSON.

        Returns:
            Self for chaining.
        """
        try:
            json.loads(self._output)
        except (json.JSONDecodeError, TypeError):
            ok = False
        else:
            ok = True
        note = "" if ok else "Expected output to be valid JSON"
        return self._record("valid_json", ok, "valid JSON", self._output[:200], note)

    def to_contain_any_of(self, substrings: Sequence[str]) -> OutputExpectation:
        """Assert that at least one of the substrings appears in the output.

        Args:
            substrings: Substrings to check for.

        Returns:
            Self for chaining.
        """
        candidates = list(substrings)
        ok = any(candidate in self._output for candidate in candidates)
        note = "" if ok else f"Expected output to contain one of {candidates}"
        return self._record("contain_any_of", ok, candidates, self._output[:200], note)

    def all_passed(self) -> bool:
        """Return True if every recorded assertion passed."""
        return not any(not entry.passed for entry in self.results)

to_contain(substring)

Assert that the output contains the given substring.

Parameters:

Name Type Description Default
substring str

The substring to search for.

required

Returns:

Type Description
OutputExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_contain(self, substring: str) -> OutputExpectation:
    """Assert that the given substring appears in the output.

    Args:
        substring: The substring to search for.

    Returns:
        Self for chaining.
    """
    ok = substring in self._output
    note = "" if ok else f"Expected output to contain '{substring}'"
    return self._record("contain", ok, substring, self._output[:200], note)

to_not_contain(substring)

Assert that the output does NOT contain the given substring.

Parameters:

Name Type Description Default
substring str

The substring that should not appear.

required

Returns:

Type Description
OutputExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_not_contain(self, substring: str) -> OutputExpectation:
    """Assert that the given substring is absent from the output.

    Args:
        substring: The substring that should not appear.

    Returns:
        Self for chaining.
    """
    ok = substring not in self._output
    note = "" if ok else f"Expected output to not contain '{substring}'"
    return self._record("not_contain", ok, substring, self._output[:200], note)

to_match(pattern)

Assert that the output matches a regex pattern.

Parameters:

Name Type Description Default
pattern str

Regular expression pattern.

required

Returns:

Type Description
OutputExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_match(self, pattern: str) -> OutputExpectation:
    """Assert that a regex pattern is found somewhere in the output.

    Args:
        pattern: Regular expression pattern.

    Returns:
        Self for chaining.
    """
    ok = re.search(pattern, self._output) is not None
    note = "" if ok else f"Expected output to match pattern '{pattern}'"
    return self._record("match", ok, pattern, self._output[:200], note)

to_have_length_less_than(max_length)

Assert that the output length is less than the given value.

Parameters:

Name Type Description Default
max_length int

Maximum allowed length.

required

Returns:

Type Description
OutputExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_have_length_less_than(self, max_length: int) -> OutputExpectation:
    """Assert that the output length is less than the given value.

    Args:
        max_length: Maximum allowed length.

    Returns:
        Self for chaining.
    """
    actual_len = len(self._output)
    passed = actual_len < max_length
    return self._record(
        "length_less_than",
        passed,
        max_length,
        actual_len,
        f"Expected length < {max_length}, got {actual_len}" if not passed else "",
    )

to_be_valid_json()

Assert that the output is valid JSON.

Returns:

Type Description
OutputExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_be_valid_json(self) -> OutputExpectation:
    """Assert that the output is valid JSON.

    Returns:
        Self for chaining.
    """
    try:
        json.loads(self._output)
        valid = True
    except (json.JSONDecodeError, TypeError):
        valid = False
    return self._record(
        "valid_json",
        valid,
        "valid JSON",
        self._output[:200],
        "Expected output to be valid JSON" if not valid else "",
    )

to_contain_any_of(substrings)

Assert that the output contains at least one of the substrings.

Parameters:

Name Type Description Default
substrings Sequence[str]

Substrings to check for.

required

Returns:

Type Description
OutputExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_contain_any_of(self, substrings: Sequence[str]) -> OutputExpectation:
    """Assert that the output contains at least one of the substrings.

    Args:
        substrings: Substrings to check for.

    Returns:
        Self for chaining.
    """
    found = any(s in self._output for s in substrings)
    return self._record(
        "contain_any_of",
        found,
        list(substrings),
        self._output[:200],
        f"Expected output to contain one of {list(substrings)}" if not found else "",
    )

all_passed()

Return True if all recorded assertions passed.

Source code in src/agentprobe/core/assertions.py
def all_passed(self) -> bool:
    """Return True if every recorded assertion passed (vacuously True when empty)."""
    return not any(not entry.passed for entry in self.results)

ToolCallExpectation

Fluent expectation chain for validating tool call sequences.

Example
expect_tool_calls(trace.tool_calls).to_contain("search").to_have_count(2)
Source code in src/agentprobe/core/assertions.py
class ToolCallExpectation:
    """Fluent expectation chain for validating tool call sequences.

    Example:
        ```python
        expect_tool_calls(trace.tool_calls).to_contain("search").to_have_count(2)
        ```
    """

    def __init__(self, tool_calls: Sequence[ToolCall]) -> None:
        self._tool_calls = list(tool_calls)
        self._names = [tc.tool_name for tc in self._tool_calls]
        self.results: list[AssertionResult] = []

    def _record(
        self,
        assertion_type: str,
        passed: bool,
        expected: object,
        actual: object,
        message: str = "",
    ) -> ToolCallExpectation:
        self.results.append(
            AssertionResult(
                assertion_type=assertion_type,
                passed=passed,
                expected=expected,
                actual=actual,
                message=message,
            )
        )
        if not passed:
            raise AssertionFailedError(
                assertion_type=assertion_type,
                expected=expected,
                actual=actual,
                message=message or None,
            )
        return self

    def to_contain(self, tool_name: str) -> ToolCallExpectation:
        """Assert that a tool with the given name was called.

        Args:
            tool_name: The expected tool name.

        Returns:
            Self for chaining.
        """
        found = tool_name in self._names
        return self._record(
            "tool_contain",
            found,
            tool_name,
            self._names,
            f"Expected tool '{tool_name}' in calls {self._names}" if not found else "",
        )

    def to_have_sequence(self, expected_sequence: Sequence[str]) -> ToolCallExpectation:
        """Assert that tools were called in the given order.

        The expected sequence must appear as a contiguous subsequence
        in the actual tool call names.

        Args:
            expected_sequence: Ordered tool names to match.

        Returns:
            Self for chaining.
        """
        expected = list(expected_sequence)
        seq_len = len(expected)
        found = (
            any(
                self._names[i : i + seq_len] == expected
                for i in range(len(self._names) - seq_len + 1)
            )
            if seq_len <= len(self._names)
            else False
        )
        return self._record(
            "tool_sequence",
            found,
            expected,
            self._names,
            f"Expected sequence {expected} in calls {self._names}" if not found else "",
        )

    def to_have_count(self, count: int) -> ToolCallExpectation:
        """Assert the total number of tool calls.

        Args:
            count: Expected number of tool calls.

        Returns:
            Self for chaining.
        """
        actual = len(self._tool_calls)
        passed = actual == count
        return self._record(
            "tool_count",
            passed,
            count,
            actual,
            f"Expected {count} tool calls, got {actual}" if not passed else "",
        )

    def all_passed(self) -> bool:
        """Return True if all recorded assertions passed."""
        return all(r.passed for r in self.results)

to_contain(tool_name)

Assert that a tool with the given name was called.

Parameters:

Name Type Description Default
tool_name str

The expected tool name.

required

Returns:

Type Description
ToolCallExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_contain(self, tool_name: str) -> ToolCallExpectation:
    """Assert that a tool with the given name was called.

    Args:
        tool_name: The expected tool name.

    Returns:
        Self for chaining.
    """
    found = tool_name in self._names
    return self._record(
        "tool_contain",
        found,
        tool_name,
        self._names,
        f"Expected tool '{tool_name}' in calls {self._names}" if not found else "",
    )

to_have_sequence(expected_sequence)

Assert that tools were called in the given order.

The expected sequence must appear as a contiguous subsequence in the actual tool call names.

Parameters:

Name Type Description Default
expected_sequence Sequence[str]

Ordered tool names to match.

required

Returns:

Type Description
ToolCallExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_have_sequence(self, expected_sequence: Sequence[str]) -> ToolCallExpectation:
    """Assert that tools were called in the given order.

    The expected sequence must appear as a contiguous subsequence
    in the actual tool call names.

    Args:
        expected_sequence: Ordered tool names to match.

    Returns:
        Self for chaining.
    """
    expected = list(expected_sequence)
    seq_len = len(expected)
    found = (
        any(
            self._names[i : i + seq_len] == expected
            for i in range(len(self._names) - seq_len + 1)
        )
        if seq_len <= len(self._names)
        else False
    )
    return self._record(
        "tool_sequence",
        found,
        expected,
        self._names,
        f"Expected sequence {expected} in calls {self._names}" if not found else "",
    )

to_have_count(count)

Assert the total number of tool calls.

Parameters:

Name Type Description Default
count int

Expected number of tool calls.

required

Returns:

Type Description
ToolCallExpectation

Self for chaining.

Source code in src/agentprobe/core/assertions.py
def to_have_count(self, count: int) -> ToolCallExpectation:
    """Assert the total number of tool calls.

    Args:
        count: Expected number of tool calls.

    Returns:
        Self for chaining.
    """
    actual = len(self._tool_calls)
    passed = actual == count
    return self._record(
        "tool_count",
        passed,
        count,
        actual,
        f"Expected {count} tool calls, got {actual}" if not passed else "",
    )

all_passed()

Return True if all recorded assertions passed.

Source code in src/agentprobe/core/assertions.py
def all_passed(self) -> bool:
    """Return True if all recorded assertions passed."""
    return all(r.passed for r in self.results)

expect(output)

Create a fluent output expectation.

Parameters:

Name Type Description Default
output str

The agent output string to validate.

required

Returns:

Type Description
OutputExpectation

An OutputExpectation for chaining assertions.

Source code in src/agentprobe/core/assertions.py
def expect(output: str) -> OutputExpectation:
    """Create a fluent output expectation.

    Args:
        output: The agent output string to validate.

    Returns:
        An OutputExpectation for chaining assertions.
    """
    return OutputExpectation(output)

expect_tool_calls(tool_calls)

Create a fluent tool call expectation.

Parameters:

Name Type Description Default
tool_calls Sequence[ToolCall]

The sequence of tool calls to validate.

required

Returns:

Type Description
ToolCallExpectation

A ToolCallExpectation for chaining assertions.

Source code in src/agentprobe/core/assertions.py
def expect_tool_calls(tool_calls: Sequence[ToolCall]) -> ToolCallExpectation:
    """Create a fluent tool call expectation.

    Args:
        tool_calls: The sequence of tool calls to validate.

    Returns:
        A ToolCallExpectation for chaining assertions.
    """
    return ToolCallExpectation(tool_calls)

Scenario Decorator

agentprobe.core.scenario

Scenario decorator and registry for defining agent test cases.

The @scenario decorator marks functions as test scenarios and registers them in a global registry for discovery by the test runner.

scenario(name=None, *, input_text='', expected_output=None, tags=None, timeout=30.0, evaluators=None)

Decorator that registers a function as a test scenario.

The decorated function can optionally accept a TestCase argument and mutate it (e.g. setting dynamic input). If it returns a string, that string overrides input_text.

Parameters:

Name Type Description Default
name str | None

Test name. Defaults to the function name.

None
input_text str

The input prompt to send to the agent.

''
expected_output str | None

Optional expected output for comparison.

None
tags list[str] | None

Tags for filtering and grouping.

None
timeout float

Maximum execution time in seconds.

30.0
evaluators list[str] | None

Names of evaluators to run.

None

Returns:

Type Description
Callable[[Callable[..., Any]], Callable[..., Any]]

A decorator that registers the function.

Example
@scenario(name="greeting_test", input_text="Hello!")
def test_greeting():
    pass
Source code in src/agentprobe/core/scenario.py
def scenario(
    name: str | None = None,
    *,
    input_text: str = "",
    expected_output: str | None = None,
    tags: list[str] | None = None,
    timeout: float = 30.0,
    evaluators: list[str] | None = None,
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Decorator that registers a function as a test scenario.

    The decorated function can optionally accept a ``TestCase`` argument
    and mutate it (e.g. setting dynamic input). If it returns a string,
    that string overrides ``input_text``.

    Args:
        name: Test name. Defaults to the function name.
        input_text: The input prompt to send to the agent.
        expected_output: Optional expected output for comparison.
        tags: Tags for filtering and grouping.
        timeout: Maximum execution time in seconds.
        evaluators: Names of evaluators to run.

    Returns:
        A decorator that registers the function.

    Example:
        ```python
        @scenario(name="greeting_test", input_text="Hello!")
        def test_greeting():
            pass
        ```
    """

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        resolved_name = name or func.__name__
        test_case = TestCase(
            name=resolved_name,
            input_text=input_text,
            expected_output=expected_output,
            tags=tags or [],
            timeout_seconds=timeout,
            evaluators=evaluators or [],
            metadata={"source_function": func.__qualname__},
        )

        module = func.__module__
        _scenario_registry.setdefault(module, []).append(test_case)

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            return func(*args, **kwargs)

        wrapper._agentprobe_scenario = test_case  # type: ignore[attr-defined]
        return wrapper

    return decorator

get_scenarios(module_name=None)

Retrieve registered scenarios.

Parameters:

Name Type Description Default
module_name str | None

If provided, return scenarios from this module only. If None, return all registered scenarios.

None

Returns:

Type Description
list[TestCase]

A list of TestCase objects.

Source code in src/agentprobe/core/scenario.py
def get_scenarios(module_name: str | None = None) -> list[TestCase]:
    """Retrieve registered scenarios.

    Args:
        module_name: If provided, return scenarios from this module only.
            If None, return all registered scenarios.

    Returns:
        A list of TestCase objects.
    """
    if module_name is not None:
        return list(_scenario_registry.get(module_name, []))
    return [tc for cases in _scenario_registry.values() for tc in cases]

clear_registry()

Clear all registered scenarios. Primarily for testing.

Source code in src/agentprobe/core/scenario.py
def clear_registry() -> None:
    """Clear all registered scenarios. Primarily for testing."""
    _scenario_registry.clear()

Configuration

agentprobe.core.config

Configuration loading and validation for AgentProbe.

Loads configuration from agentprobe.yaml with support for ${ENV_VAR} interpolation and sensible defaults.

RunnerConfig

Bases: BaseModel

Configuration for the test runner.

Attributes:

Name Type Description
parallel bool

Whether to run tests in parallel.

max_workers int

Maximum number of concurrent tests.

default_timeout float

Default test timeout in seconds.

Source code in src/agentprobe/core/config.py
class RunnerConfig(BaseModel):
    """Configuration for the test runner.

    Attributes:
        parallel: Whether to run tests in parallel.
        max_workers: Maximum number of concurrent tests.
        default_timeout: Default test timeout in seconds.
    """

    model_config = ConfigDict(extra="forbid")

    parallel: bool = False
    max_workers: int = Field(default=4, ge=1)
    default_timeout: float = Field(default=30.0, gt=0)

EvalConfig

Bases: BaseModel

Configuration for evaluators.

Attributes:

Name Type Description
default_evaluators list[str]

Evaluator names to apply to all tests.

Source code in src/agentprobe/core/config.py
class EvalConfig(BaseModel):
    """Configuration for evaluators.

    Attributes:
        default_evaluators: Evaluator names to apply to all tests.
    """

    model_config = ConfigDict(extra="forbid")

    default_evaluators: list[str] = Field(default_factory=list)

JudgeConfig

Bases: BaseModel

Configuration for the judge evaluator.

Attributes:

Name Type Description
model str

Model to use for judging.

provider str

API provider name.

temperature float

Sampling temperature.

max_tokens int

Maximum response tokens.

Source code in src/agentprobe/core/config.py
class JudgeConfig(BaseModel):
    """Configuration for the judge evaluator.

    Attributes:
        model: Model to use for judging.
        provider: API provider name.
        temperature: Sampling temperature.
        max_tokens: Maximum response tokens.
    """

    model_config = ConfigDict(extra="forbid")

    model: str = "claude-sonnet-4-5-20250929"
    provider: str = "anthropic"
    temperature: float = Field(default=0.0, ge=0.0, le=2.0)
    max_tokens: int = Field(default=1024, ge=1)

TraceConfig

Bases: BaseModel

Configuration for trace recording and storage.

Attributes:

Name Type Description
enabled bool

Whether to record traces.

storage_backend str

Storage backend type.

database_path str

Path to SQLite database file.

Source code in src/agentprobe/core/config.py
class TraceConfig(BaseModel):
    """Configuration for trace recording and storage.

    Attributes:
        enabled: Whether to record traces.
        storage_backend: Storage backend type.
        database_path: Path to SQLite database file.
    """

    model_config = ConfigDict(extra="forbid")

    enabled: bool = True
    storage_backend: str = "sqlite"
    database_path: str = ".agentprobe/traces.db"

CostConfig

Bases: BaseModel

Configuration for cost tracking.

Attributes:

Name Type Description
enabled bool

Whether to track costs.

budget_limit_usd float | None

Maximum allowed cost per run.

pricing_dir str | None

Directory containing pricing YAML files.

Source code in src/agentprobe/core/config.py
class CostConfig(BaseModel):
    """Configuration for cost tracking.

    Attributes:
        enabled: Whether to track costs.
        budget_limit_usd: Maximum allowed cost per run.
        pricing_dir: Directory containing pricing YAML files.
    """

    model_config = ConfigDict(extra="forbid")

    enabled: bool = True
    budget_limit_usd: float | None = None
    pricing_dir: str | None = None

SafetyConfig

Bases: BaseModel

Configuration for safety testing.

Attributes:

Name Type Description
enabled bool

Whether to run safety tests.

suites list[str]

List of safety suite names to run.

Source code in src/agentprobe/core/config.py
class SafetyConfig(BaseModel):
    """Configuration for safety testing.

    Attributes:
        enabled: Whether to run safety tests.
        suites: List of safety suite names to run.
    """

    model_config = ConfigDict(extra="forbid")

    enabled: bool = False
    suites: list[str] = Field(default_factory=list)

ChaosConfig

Bases: BaseModel

Configuration for chaos fault injection testing.

Attributes:

Name Type Description
enabled bool

Whether chaos testing is enabled.

seed int

Random seed for deterministic fault injection.

default_probability float

Default probability of applying a fault.

Source code in src/agentprobe/core/config.py
class ChaosConfig(BaseModel):
    """Configuration for chaos fault injection testing.

    Attributes:
        enabled: Whether chaos testing is enabled.
        seed: Random seed for deterministic fault injection.
        default_probability: Default probability of applying a fault.
    """

    model_config = ConfigDict(extra="forbid")

    enabled: bool = False
    seed: int = 42
    default_probability: float = Field(default=0.5, ge=0.0, le=1.0)

SnapshotConfig

Bases: BaseModel

Configuration for snapshot/golden file testing.

Attributes:

Name Type Description
enabled bool

Whether snapshot testing is enabled.

snapshot_dir str

Directory for storing snapshot files.

update_on_first_run bool

Whether to create snapshots on first run.

threshold float

Similarity threshold for snapshot matching.

Source code in src/agentprobe/core/config.py
class SnapshotConfig(BaseModel):
    """Configuration for snapshot/golden file testing.

    Attributes:
        enabled: Whether snapshot testing is enabled.
        snapshot_dir: Directory for storing snapshot files.
        update_on_first_run: Whether to create snapshots on first run.
        threshold: Similarity threshold for snapshot matching.
    """

    model_config = ConfigDict(extra="forbid")

    enabled: bool = False
    snapshot_dir: str = ".agentprobe/snapshots"
    update_on_first_run: bool = True
    threshold: float = Field(default=0.8, ge=0.0, le=1.0)

BudgetConfig

Bases: BaseModel

Configuration for per-test and per-suite cost budgets.

Attributes:

Name Type Description
test_budget_usd float | None

Maximum cost per individual test.

suite_budget_usd float | None

Maximum cost per test suite run.

Source code in src/agentprobe/core/config.py
class BudgetConfig(BaseModel):
    """Configuration for per-test and per-suite cost budgets.

    Attributes:
        test_budget_usd: Maximum cost per individual test.
        suite_budget_usd: Maximum cost per test suite run.
    """

    model_config = ConfigDict(extra="forbid")

    test_budget_usd: float | None = None
    suite_budget_usd: float | None = None

RegressionConfig

Bases: BaseModel

Configuration for regression detection.

Attributes:

Name Type Description
enabled bool

Whether regression detection is enabled.

baseline_dir str

Directory for storing baseline files.

threshold float

Score delta threshold for flagging regressions.

Source code in src/agentprobe/core/config.py
class RegressionConfig(BaseModel):
    """Configuration for regression detection.

    Attributes:
        enabled: Whether regression detection is enabled.
        baseline_dir: Directory for storing baseline files.
        threshold: Score delta threshold for flagging regressions.
    """

    model_config = ConfigDict(extra="forbid")

    enabled: bool = False
    baseline_dir: str = ".agentprobe/baselines"
    threshold: float = Field(default=0.05, ge=0.0, le=1.0)

MetricsConfig

Bases: BaseModel

Configuration for metric collection and trending.

Attributes:

Name Type Description
enabled bool

Whether metric collection is enabled.

builtin_metrics bool

Whether to collect built-in metrics automatically.

trend_window int

Number of recent runs to use for trend analysis.

Source code in src/agentprobe/core/config.py
class MetricsConfig(BaseModel):
    """Configuration for metric collection and trending.

    Attributes:
        enabled: Whether metric collection is enabled.
        builtin_metrics: Whether to collect built-in metrics automatically.
        trend_window: Number of recent runs to use for trend analysis.
    """

    model_config = ConfigDict(extra="forbid")

    enabled: bool = True
    builtin_metrics: bool = True
    trend_window: int = Field(default=10, ge=2)

PluginConfig

Bases: BaseModel

Configuration for the plugin system.

Attributes:

Name Type Description
enabled bool

Whether the plugin system is enabled.

directories list[str]

Additional directories to scan for plugins.

entry_point_group str

Entry point group name for plugin discovery.

Source code in src/agentprobe/core/config.py
class PluginConfig(BaseModel):
    """Configuration for the plugin system.

    Attributes:
        enabled: Whether the plugin system is enabled.
        directories: Additional directories to scan for plugins.
        entry_point_group: Entry point group name for plugin discovery.
    """

    model_config = ConfigDict(extra="forbid")

    enabled: bool = True
    directories: list[str] = Field(default_factory=list)
    entry_point_group: str = "agentprobe.plugins"

ReportingConfig

Bases: BaseModel

Configuration for result reporting.

Attributes:

Name Type Description
formats list[str]

Output format names.

output_dir str

Directory for report files.

Source code in src/agentprobe/core/config.py
class ReportingConfig(BaseModel):
    """Configuration for result reporting.

    Attributes:
        formats: Output format names.
        output_dir: Directory for report files.
    """

    model_config = ConfigDict(extra="forbid")

    formats: list[str] = Field(default_factory=lambda: ["terminal"])
    output_dir: str = "agentprobe-report"

AgentProbeConfig

Bases: BaseModel

Top-level AgentProbe configuration.

Attributes:

Name Type Description
project_name str

Name of the project being tested.

test_dir str

Directory containing test files.

runner RunnerConfig

Test runner configuration.

eval EvalConfig

Evaluator configuration.

judge JudgeConfig

Judge evaluator configuration.

trace TraceConfig

Trace recording configuration.

cost CostConfig

Cost tracking configuration.

safety SafetyConfig

Safety testing configuration.

reporting ReportingConfig

Reporting configuration.

chaos ChaosConfig

Chaos fault injection configuration.

snapshot SnapshotConfig

Snapshot/golden file testing configuration.

budget BudgetConfig

Per-test and per-suite cost budget configuration.

regression RegressionConfig

Regression detection configuration.

metrics MetricsConfig

Metric collection and trending configuration.

plugins PluginConfig

Plugin system configuration.

Source code in src/agentprobe/core/config.py
class AgentProbeConfig(BaseModel):
    """Top-level AgentProbe configuration.

    Attributes:
        project_name: Name of the project being tested.
        test_dir: Directory containing test files.
        runner: Test runner configuration.
        eval: Evaluator configuration.
        judge: Judge evaluator configuration.
        trace: Trace recording configuration.
        cost: Cost tracking configuration.
        safety: Safety testing configuration.
        reporting: Reporting configuration.
    """

    model_config = ConfigDict(extra="forbid")

    project_name: str = "agentprobe"
    test_dir: str = "tests"
    runner: RunnerConfig = Field(default_factory=RunnerConfig)
    eval: EvalConfig = Field(default_factory=EvalConfig)
    judge: JudgeConfig = Field(default_factory=JudgeConfig)
    trace: TraceConfig = Field(default_factory=TraceConfig)
    cost: CostConfig = Field(default_factory=CostConfig)
    safety: SafetyConfig = Field(default_factory=SafetyConfig)
    reporting: ReportingConfig = Field(default_factory=ReportingConfig)
    chaos: ChaosConfig = Field(default_factory=ChaosConfig)
    snapshot: SnapshotConfig = Field(default_factory=SnapshotConfig)
    budget: BudgetConfig = Field(default_factory=BudgetConfig)
    regression: RegressionConfig = Field(default_factory=RegressionConfig)
    metrics: MetricsConfig = Field(default_factory=MetricsConfig)
    plugins: PluginConfig = Field(default_factory=PluginConfig)

load_config(path=None)

Load configuration from a YAML file.

Searches for agentprobe.yaml or agentprobe.yml in the current directory if no path is provided. Returns default config if no file is found.

Parameters:

Name Type Description Default
path str | Path | None

Explicit path to a config file.

None

Returns:

Type Description
AgentProbeConfig

A validated AgentProbeConfig instance.

Raises:

Type Description
ConfigError

If the file exists but is invalid.

Source code in src/agentprobe/core/config.py
def load_config(
    path: str | Path | None = None,
) -> AgentProbeConfig:
    """Load configuration from a YAML file.

    Searches for ``agentprobe.yaml`` or ``agentprobe.yml`` in the
    current directory if no path is provided. Returns default config
    if no file is found.

    Args:
        path: Explicit path to a config file.

    Returns:
        A validated AgentProbeConfig instance.

    Raises:
        ConfigError: If the file exists but is invalid.
    """
    if path is not None:
        config_path = Path(path)
        if not config_path.exists():
            raise ConfigError(f"Config file not found: {config_path}")
    else:
        for candidate in ["agentprobe.yaml", "agentprobe.yml"]:
            config_path = Path(candidate)
            if config_path.exists():
                break
        else:
            logger.debug("No config file found, using defaults")
            return AgentProbeConfig()

    logger.info("Loading config from %s", config_path)
    try:
        raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    except yaml.YAMLError as exc:
        raise ConfigError(f"Invalid YAML in {config_path}: {exc}") from exc

    if raw is None:
        return AgentProbeConfig()

    if not isinstance(raw, dict):
        raise ConfigError(f"Config file must be a YAML mapping, got {type(raw).__name__}")

    interpolated = _interpolate_recursive(raw)

    try:
        return AgentProbeConfig.model_validate(interpolated)
    except Exception as exc:
        raise ConfigError(f"Invalid configuration: {exc}") from exc

Protocols

agentprobe.core.protocols

Protocol definitions for AgentProbe's pluggable architecture.

All protocols are runtime-checkable, allowing isinstance() verification of structural subtyping. Implementors do not need to inherit from these protocols — they only need to provide the required methods. Note that a runtime isinstance() check only verifies that the required methods and properties exist; it does not validate their signatures or return types.

AdapterProtocol

Bases: Protocol

Interface for agent framework adapters.

Adapters wrap specific agent frameworks (LangChain, CrewAI, etc.) and translate their execution into AgentProbe's Trace format.

Source code in src/agentprobe/core/protocols.py
@runtime_checkable
class AdapterProtocol(Protocol):
    """Interface for agent framework adapters.

    Adapters wrap specific agent frameworks (LangChain, CrewAI, etc.)
    and translate their execution into AgentProbe's Trace format.
    """

    @property
    def name(self) -> str:
        """Return the adapter name."""
        ...

    async def invoke(
        self,
        input_text: str,
        **kwargs: Any,
    ) -> Trace:
        """Invoke the agent with the given input and return a trace.

        Args:
            input_text: The input prompt to send to the agent.
            **kwargs: Additional adapter-specific arguments.

        Returns:
            A complete execution trace.
        """
        ...

name property

Return the adapter name.

invoke(input_text, **kwargs) async

Invoke the agent with the given input and return a trace.

Parameters:

Name Type Description Default
input_text str

The input prompt to send to the agent.

required
**kwargs Any

Additional adapter-specific arguments.

{}

Returns:

Type Description
Trace

A complete execution trace.

Source code in src/agentprobe/core/protocols.py
async def invoke(
    self,
    input_text: str,
    **kwargs: Any,
) -> Trace:
    """Invoke the agent with the given input and return a trace.

    Args:
        input_text: The input prompt to send to the agent.
        **kwargs: Additional adapter-specific arguments.

    Returns:
        A complete execution trace.
    """
    ...

EvaluatorProtocol

Bases: Protocol

Interface for test result evaluators.

Evaluators assess agent outputs against expectations, producing scored results with pass/fail verdicts.

Source code in src/agentprobe/core/protocols.py
@runtime_checkable
class EvaluatorProtocol(Protocol):
    """Interface for test result evaluators.

    Evaluators assess agent outputs against expectations, producing
    scored results with pass/fail verdicts.
    """

    @property
    def name(self) -> str:
        """Return the evaluator name."""
        ...

    async def evaluate(
        self,
        test_case: TestCase,
        trace: Trace,
    ) -> EvalResult:
        """Evaluate an agent's output for a given test case.

        Args:
            test_case: The test case that was executed.
            trace: The execution trace to evaluate.

        Returns:
            An evaluation result with score and verdict.
        """
        ...

name property

Return the evaluator name.

evaluate(test_case, trace) async

Evaluate an agent's output for a given test case.

Parameters:

Name Type Description Default
test_case TestCase

The test case that was executed.

required
trace Trace

The execution trace to evaluate.

required

Returns:

Type Description
EvalResult

An evaluation result with score and verdict.

Source code in src/agentprobe/core/protocols.py
async def evaluate(
    self,
    test_case: TestCase,
    trace: Trace,
) -> EvalResult:
    """Evaluate an agent's output for a given test case.

    Args:
        test_case: The test case that was executed.
        trace: The execution trace to evaluate.

    Returns:
        An evaluation result with score and verdict.
    """
    ...

StorageProtocol

Bases: Protocol

Interface for persistence backends.

Storage implementations handle saving and loading traces, test results, and agent runs.

Source code in src/agentprobe/core/protocols.py
@runtime_checkable
class StorageProtocol(Protocol):
    """Interface for persistence backends.

    Storage implementations handle saving and loading traces,
    test results, and agent runs.
    """

    async def setup(self) -> None:
        """Initialize the storage backend (create tables, etc.)."""
        ...

    async def save_trace(self, trace: Trace) -> None:
        """Persist a trace.

        Args:
            trace: The trace to save.
        """
        ...

    async def load_trace(self, trace_id: str) -> Trace | None:
        """Load a trace by ID.

        Args:
            trace_id: The unique identifier of the trace.

        Returns:
            The trace if found, otherwise None.
        """
        ...

    async def list_traces(
        self,
        agent_name: str | None = None,
        limit: int = 100,
    ) -> Sequence[Trace]:
        """List traces with optional filtering.

        Args:
            agent_name: Filter by agent name if provided.
            limit: Maximum number of traces to return.

        Returns:
            A sequence of matching traces.
        """
        ...

    async def save_result(self, result: TestResult) -> None:
        """Persist a test result.

        Args:
            result: The test result to save.
        """
        ...

    async def load_results(
        self,
        test_name: str | None = None,
        limit: int = 100,
    ) -> Sequence[TestResult]:
        """Load test results with optional filtering.

        Args:
            test_name: Filter by test name if provided.
            limit: Maximum number of results to return.

        Returns:
            A sequence of matching test results.
        """
        ...

setup() async

Initialize the storage backend (create tables, etc.).

Source code in src/agentprobe/core/protocols.py
async def setup(self) -> None:
    """Initialize the storage backend (create tables, etc.)."""
    ...

save_trace(trace) async

Persist a trace.

Parameters:

Name Type Description Default
trace Trace

The trace to save.

required
Source code in src/agentprobe/core/protocols.py
async def save_trace(self, trace: Trace) -> None:
    """Persist a trace.

    Args:
        trace: The trace to save.
    """
    ...

load_trace(trace_id) async

Load a trace by ID.

Parameters:

Name Type Description Default
trace_id str

The unique identifier of the trace.

required

Returns:

Type Description
Trace | None

The trace if found, otherwise None.

Source code in src/agentprobe/core/protocols.py
async def load_trace(self, trace_id: str) -> Trace | None:
    """Fetch a previously saved trace.

    Args:
        trace_id: Unique identifier of the trace to look up.

    Returns:
        The matching trace, or None when no trace has that ID.
    """
    ...

list_traces(agent_name=None, limit=100) async

List traces with optional filtering.

Parameters:

Name Type Description Default
agent_name str | None

Filter by agent name if provided.

None
limit int

Maximum number of traces to return.

100

Returns:

Type Description
Sequence[Trace]

A sequence of matching traces.

Source code in src/agentprobe/core/protocols.py
async def list_traces(
    self,
    agent_name: str | None = None,
    limit: int = 100,
) -> Sequence[Trace]:
    """Enumerate stored traces.

    Args:
        agent_name: If given, only traces for this agent are returned.
        limit: Upper bound on the number of traces returned.

    Returns:
        The matching traces.
    """
    ...

save_result(result) async

Persist a test result.

Parameters:

Name Type Description Default
result TestResult

The test result to save.

required
Source code in src/agentprobe/core/protocols.py
async def save_result(self, result: TestResult) -> None:
    """Write a test result to the backend.

    Args:
        result: The result to persist.
    """
    ...

load_results(test_name=None, limit=100) async

Load test results with optional filtering.

Parameters:

Name Type Description Default
test_name str | None

Filter by test name if provided.

None
limit int

Maximum number of results to return.

100

Returns:

Type Description
Sequence[TestResult]

A sequence of matching test results.

Source code in src/agentprobe/core/protocols.py
async def load_results(
    self,
    test_name: str | None = None,
    limit: int = 100,
) -> Sequence[TestResult]:
    """Enumerate stored test results.

    Args:
        test_name: If given, only results for this test are returned.
        limit: Upper bound on the number of results returned.

    Returns:
        The matching test results.
    """
    ...

MetricStoreProtocol

Bases: Protocol

Interface for metric persistence backends.

Metric storage is optional and separate from the main StorageProtocol, allowing implementations to opt in to metric tracking independently.

Source code in src/agentprobe/core/protocols.py
@runtime_checkable
class MetricStoreProtocol(Protocol):
    """Interface for metric persistence backends.

    Metric storage is kept separate from the main StorageProtocol and is
    optional, so implementations may opt in to metric tracking on their own.
    """

    async def save_metrics(self, metrics: Sequence[MetricValue]) -> None:
        """Write a batch of metric values to the backend.

        Args:
            metrics: The metric values to persist.
        """
        ...

    async def load_metrics(
        self,
        metric_name: str | None = None,
        limit: int = 1000,
    ) -> Sequence[MetricValue]:
        """Enumerate stored metric values.

        Args:
            metric_name: If given, only values for this metric are returned.
            limit: Upper bound on the number of values returned.

        Returns:
            The matching metric values.
        """
        ...

save_metrics(metrics) async

Persist a batch of metric values.

Parameters:

Name Type Description Default
metrics Sequence[MetricValue]

The metric values to save.

required
Source code in src/agentprobe/core/protocols.py
async def save_metrics(self, metrics: Sequence[MetricValue]) -> None:
    """Write a batch of metric values to the backend.

    Args:
        metrics: The metric values to persist.
    """
    ...

load_metrics(metric_name=None, limit=1000) async

Load metric values with optional filtering.

Parameters:

Name Type Description Default
metric_name str | None

Filter by metric name if provided.

None
limit int

Maximum number of values to return.

1000

Returns:

Type Description
Sequence[MetricValue]

A sequence of matching metric values.

Source code in src/agentprobe/core/protocols.py
async def load_metrics(
    self,
    metric_name: str | None = None,
    limit: int = 1000,
) -> Sequence[MetricValue]:
    """Enumerate stored metric values.

    Args:
        metric_name: If given, only values for this metric are returned.
        limit: Upper bound on the number of values returned.

    Returns:
        The matching metric values.
    """
    ...

RunnerProtocol

Bases: Protocol

Interface for test execution engines.

Source code in src/agentprobe/core/protocols.py
@runtime_checkable
class RunnerProtocol(Protocol):
    """Interface for test execution engines."""

    async def run(
        self,
        test_cases: Sequence[TestCase],
        adapter: AdapterProtocol,
    ) -> AgentRun:
        """Run every test case through the given adapter.

        Args:
            test_cases: The test cases to execute.
            adapter: The agent adapter under test.

        Returns:
            An AgentRun aggregating all results.
        """
        ...

run(test_cases, adapter) async

Execute a batch of test cases against an agent adapter.

Parameters:

Name Type Description Default
test_cases Sequence[TestCase]

The test cases to execute.

required
adapter AdapterProtocol

The agent adapter to test against.

required

Returns:

Type Description
AgentRun

An AgentRun containing all results.

Source code in src/agentprobe/core/protocols.py
async def run(
    self,
    test_cases: Sequence[TestCase],
    adapter: AdapterProtocol,
) -> AgentRun:
    """Run every test case through the given adapter.

    Args:
        test_cases: The test cases to execute.
        adapter: The agent adapter under test.

    Returns:
        An AgentRun aggregating all results.
    """
    ...

ReporterProtocol

Bases: Protocol

Interface for test result reporters.

Source code in src/agentprobe/core/protocols.py
@runtime_checkable
class ReporterProtocol(Protocol):
    """Interface for test result reporters."""

    async def report(self, run: AgentRun) -> None:
        """Produce and emit a report for a completed run.

        Args:
            run: The finished agent run to summarize.
        """
        ...

report(run) async

Generate and output a report for an agent run.

Parameters:

Name Type Description Default
run AgentRun

The completed agent run to report on.

required
Source code in src/agentprobe/core/protocols.py
async def report(self, run: AgentRun) -> None:
    """Produce and emit a report for a completed run.

    Args:
        run: The finished agent run to summarize.
    """
    ...

Exceptions

agentprobe.core.exceptions

Exception hierarchy for the AgentProbe framework.

All exceptions inherit from AgentProbeError, allowing callers to catch the base type for generic error handling or specific subclasses for targeted recovery.

AgentProbeError

Bases: Exception

Base exception for all AgentProbe errors.

Source code in src/agentprobe/core/exceptions.py
class AgentProbeError(Exception):
    """Root of the AgentProbe exception hierarchy."""

ConfigError

Bases: AgentProbeError

Raised when configuration is invalid or missing.

Source code in src/agentprobe/core/exceptions.py
class ConfigError(AgentProbeError):
    """Signals invalid or missing configuration."""

RunnerError

Bases: AgentProbeError

Raised when the test runner encounters an execution failure.

Source code in src/agentprobe/core/exceptions.py
class RunnerError(AgentProbeError):
    """Signals an execution failure inside the test runner."""

TestTimeoutError

Bases: RunnerError

Raised when a test exceeds its configured timeout.

Source code in src/agentprobe/core/exceptions.py
class TestTimeoutError(RunnerError):
    """Signals that a test ran longer than its configured timeout."""

    def __init__(self, test_name: str, timeout_seconds: float) -> None:
        # Keep the raw values so callers can inspect them programmatically.
        self.test_name = test_name
        self.timeout_seconds = timeout_seconds
        super().__init__(f"Test '{test_name}' exceeded {timeout_seconds}s timeout")

AdapterError

Bases: AgentProbeError

Raised when an agent adapter fails during invocation.

Source code in src/agentprobe/core/exceptions.py
class AdapterError(AgentProbeError):
    """Signals a failure inside an agent adapter invocation."""

    def __init__(self, adapter_name: str, message: str) -> None:
        # Expose the adapter name for programmatic handling.
        self.adapter_name = adapter_name
        super().__init__(f"Adapter '{adapter_name}' error: {message}")

EvaluatorError

Bases: AgentProbeError

Base exception for evaluation errors.

Source code in src/agentprobe/core/exceptions.py
class EvaluatorError(AgentProbeError):
    """Common base for evaluation failures."""

JudgeAPIError

Bases: EvaluatorError

Raised when the judge model API call fails.

Source code in src/agentprobe/core/exceptions.py
class JudgeAPIError(EvaluatorError):
    """Signals a failed API call to the judge model."""

    def __init__(self, model: str, status_code: int, message: str) -> None:
        # Model and HTTP status are kept for retry/alerting logic.
        self.model = model
        self.status_code = status_code
        super().__init__(f"Judge API error ({model}): {status_code} — {message}")

StorageError

Bases: AgentProbeError

Raised when a storage backend operation fails.

Source code in src/agentprobe/core/exceptions.py
class StorageError(AgentProbeError):
    """Signals a failed storage backend operation."""

TraceError

Bases: AgentProbeError

Raised when trace recording or processing fails.

Source code in src/agentprobe/core/exceptions.py
class TraceError(AgentProbeError):
    """Signals a failure while recording or processing a trace."""

CostError

Bases: AgentProbeError

Raised when cost calculation encounters an error.

Source code in src/agentprobe/core/exceptions.py
class CostError(AgentProbeError):
    """Signals an error during cost calculation."""

BudgetExceededError

Bases: CostError

Raised when a cost budget limit is exceeded.

Source code in src/agentprobe/core/exceptions.py
class BudgetExceededError(CostError):
    """Signals that spending went past a configured budget limit."""

    def __init__(self, actual: float, limit: float, currency: str = "USD") -> None:
        # Keep the figures so callers can report or threshold on them.
        self.actual = actual
        self.limit = limit
        self.currency = currency
        super().__init__(f"Budget exceeded: ${actual:.4f} > ${limit:.4f} {currency} limit")

SafetyError

Bases: AgentProbeError

Raised when a safety check fails or encounters an error.

Source code in src/agentprobe/core/exceptions.py
class SafetyError(AgentProbeError):
    """Signals a failed safety check or an error while running one."""

SecurityError

Bases: AgentProbeError

Raised when a security violation is detected.

Source code in src/agentprobe/core/exceptions.py
class SecurityError(AgentProbeError):
    """Signals a detected security violation."""

MetricsError

Bases: AgentProbeError

Raised when metric collection, aggregation, or trending fails.

Source code in src/agentprobe/core/exceptions.py
class MetricsError(AgentProbeError):
    """Signals a failure in metric collection, aggregation, or trending."""

PluginError

Bases: AgentProbeError

Raised when a plugin fails to load or execute.

Source code in src/agentprobe/core/exceptions.py
class PluginError(AgentProbeError):
    """Signals that a plugin failed to load or execute."""

ChaosError

Bases: AgentProbeError

Raised when a chaos fault injection causes a failure.

Source code in src/agentprobe/core/exceptions.py
class ChaosError(AgentProbeError):
    """Signals a failure caused by chaos fault injection."""

SnapshotError

Bases: AgentProbeError

Raised when a snapshot operation fails.

Source code in src/agentprobe/core/exceptions.py
class SnapshotError(AgentProbeError):
    """Signals a failed snapshot operation."""

ReplayError

Bases: AgentProbeError

Raised when trace replay encounters an error.

Source code in src/agentprobe/core/exceptions.py
class ReplayError(AgentProbeError):
    """Signals an error encountered during trace replay."""

RegressionError

Bases: AgentProbeError

Raised when regression detection encounters an error.

Source code in src/agentprobe/core/exceptions.py
class RegressionError(AgentProbeError):
    """Signals an error encountered during regression detection."""

ConversationError

Bases: AgentProbeError

Raised when a multi-turn conversation test fails.

Source code in src/agentprobe/core/exceptions.py
class ConversationError(AgentProbeError):
    """Signals a failed multi-turn conversation test."""

DashboardError

Bases: AgentProbeError

Raised when dashboard operations fail.

Source code in src/agentprobe/core/exceptions.py
class DashboardError(AgentProbeError):
    """Signals a failed dashboard operation."""

AssertionFailedError

Bases: AgentProbeError

Raised when a test assertion fails.

Attributes:

Name Type Description
assertion_type

The type of assertion that failed (e.g. 'contain', 'match').

expected

The expected value or pattern.

actual

The actual value received.

Source code in src/agentprobe/core/exceptions.py
class AssertionFailedError(AgentProbeError):
    """Signals that a test assertion did not hold.

    Attributes:
        assertion_type: Kind of assertion that failed (e.g. 'contain', 'match').
        expected: The value or pattern that was expected.
        actual: The value that was actually observed.
    """

    def __init__(
        self,
        assertion_type: str,
        expected: object,
        actual: object,
        message: str | None = None,
    ) -> None:
        self.assertion_type = assertion_type
        self.expected = expected
        self.actual = actual
        # A falsy message (None or "") falls back to a generated summary.
        fallback = (
            f"Assertion '{assertion_type}' failed: expected {expected!r}, got {actual!r}"
        )
        super().__init__(message or fallback)

Discovery

agentprobe.core.discovery

Test discovery: finds and loads test modules with @scenario decorators.

Scans directories for Python files matching test patterns, imports them, and extracts registered test cases.

discover_test_files(test_dir, pattern='test_*.py')

Find test files matching a pattern in the given directory.

Parameters:

Name Type Description Default
test_dir str | Path

Root directory to search.

required
pattern str

Glob pattern for test files.

'test_*.py'

Returns:

Type Description
list[Path]

Sorted list of matching file paths.

Source code in src/agentprobe/core/discovery.py
def discover_test_files(
    test_dir: str | Path,
    pattern: str = "test_*.py",
) -> list[Path]:
    """Locate test files under a directory.

    Args:
        test_dir: Root directory to search.
        pattern: Glob pattern that test file names must match.

    Returns:
        Matching file paths in sorted order (empty if the directory is missing).
    """
    root = Path(test_dir)
    if not root.is_dir():
        logger.warning("Test directory does not exist: %s", root)
        return []

    matches = sorted(root.rglob(pattern))
    logger.info("Discovered %d test files in %s", len(matches), root)
    return matches

load_test_module(file_path)

Import a test module from a file path.

Uses importlib to load the module with a unique name derived from the file path. The module is registered in sys.modules.

Parameters:

Name Type Description Default
file_path Path

Path to the Python test file.

required

Returns:

Type Description
str

The module name used for registration.

Raises:

Type Description
ImportError

If the module cannot be loaded.

Source code in src/agentprobe/core/discovery.py
def load_test_module(file_path: Path) -> str:
    """Import a test module from a file path.

    Uses importlib to load the module under a unique name derived
    from the file path. The module is registered in ``sys.modules``.

    Args:
        file_path: Path to the Python test file.

    Returns:
        The module name used for registration.

    Raises:
        ImportError: If the module cannot be loaded.
    """
    import hashlib

    # Derive the suffix from the path *text*, not id(file_path): object ids
    # differ between equal Path instances (leaking duplicate entries into
    # sys.modules) and can be reused after garbage collection (collisions).
    digest = hashlib.sha1(str(file_path).encode("utf-8")).hexdigest()[:12]
    module_name = f"agentprobe_tests.{file_path.stem}_{digest}"
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        msg = f"Cannot create module spec for {file_path}"
        raise ImportError(msg)

    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module

    try:
        spec.loader.exec_module(module)
    except Exception as exc:
        # Drop the half-initialized module so a retry starts clean.
        del sys.modules[module_name]
        raise ImportError(f"Failed to load {file_path}: {exc}") from exc

    logger.debug("Loaded test module: %s from %s", module_name, file_path)
    return module_name

extract_test_cases(test_dir, pattern='test_*.py')

Discover and extract all test cases from a directory.

Finds test files, imports them (triggering @scenario registration), then collects all registered test cases.

Parameters:

Name Type Description Default
test_dir str | Path

Root directory to search.

required
pattern str

Glob pattern for test files.

'test_*.py'

Returns:

Type Description
list[TestCase]

List of all discovered TestCase objects.

Source code in src/agentprobe/core/discovery.py
def extract_test_cases(
    test_dir: str | Path,
    pattern: str = "test_*.py",
) -> list[TestCase]:
    """Discover and collect every test case under a directory.

    Imports each discovered file (which triggers @scenario registration)
    and then gathers the registered test cases from all loaded modules.

    Args:
        test_dir: Root directory to search.
        pattern: Glob pattern for test files.

    Returns:
        Every TestCase discovered across all loadable files.
    """
    loaded: list[str] = []
    for path in discover_test_files(test_dir, pattern):
        try:
            loaded.append(load_test_module(path))
        except ImportError:
            logger.exception("Skipping unloadable file: %s", path)

    all_cases: list[TestCase] = [
        case for module_name in loaded for case in get_scenarios(module_name)
    ]

    logger.info("Extracted %d test cases from %d modules", len(all_cases), len(loaded))
    return all_cases

Conversation Runner

agentprobe.core.conversation

Multi-turn conversation runner for sequential dialogue testing.

Executes a series of conversation turns against an agent adapter, collecting per-turn traces and evaluation results, then aggregates into a ConversationResult.

ConversationRunner

Runs multi-turn conversation tests against an agent.

Executes each turn sequentially, optionally passing the previous output as context to the next turn's input. Collects per-turn evaluation results and aggregates into a final ConversationResult.

Attributes:

Name Type Description
evaluators

Mapping of evaluator names to instances.

Source code in src/agentprobe/core/conversation.py
class ConversationRunner:
    """Runs multi-turn conversation tests against an agent.

    Executes each turn sequentially, optionally passing the previous
    output as context to the next turn's input. Collects per-turn
    evaluation results and aggregates into a final ConversationResult.

    Attributes:
        evaluators: Mapping of evaluator names to instances.
    """

    def __init__(
        self,
        evaluators: dict[str, EvaluatorProtocol] | None = None,
    ) -> None:
        """Initialize the conversation runner.

        Args:
            evaluators: Named evaluator instances for per-turn evaluation.
        """
        self._evaluators = evaluators or {}

    async def run(
        self,
        adapter: AdapterProtocol,
        turns: Sequence[ConversationTurn],
        *,
        pass_context: bool = True,
    ) -> ConversationResult:
        """Execute a multi-turn conversation.

        Args:
            adapter: The agent adapter to invoke for each turn.
            turns: The conversation turns to execute in order.
            pass_context: If True, prepend previous output to next turn's input.

        Returns:
            A ConversationResult with per-turn details and aggregate metrics.

        Raises:
            ConversationError: If a critical error occurs during execution.
        """
        turn_results: list[TurnResult] = []
        previous_output = ""
        total_start = time.monotonic()

        for i, turn in enumerate(turns):
            input_text = turn.input_text
            if pass_context and previous_output:
                input_text = f"{previous_output}\n\n{turn.input_text}"

            turn_start = time.monotonic()
            try:
                trace = await adapter.invoke(input_text)
                previous_output = trace.output_text
            except Exception:
                # logger.exception keeps the full traceback; logger.error
                # with just the message string would discard it.
                logger.exception("Turn %d failed", i)
                turn_results.append(
                    TurnResult(
                        turn_index=i,
                        input_text=turn.input_text,
                        trace=None,
                        eval_results=(),
                        duration_ms=int((time.monotonic() - turn_start) * 1000),
                    )
                )
                continue

            # Run per-turn evaluators
            eval_results: list[EvalResult] = []
            if turn.evaluators:
                test_case = TestCase(
                    name=f"turn_{i}",
                    input_text=turn.input_text,
                    expected_output=turn.expected_output,
                )
                for eval_name in turn.evaluators:
                    evaluator = self._evaluators.get(eval_name)
                    if evaluator is None:
                        logger.warning("Turn %d: evaluator '%s' not found", i, eval_name)
                        continue
                    result = await evaluator.evaluate(test_case, trace)
                    eval_results.append(result)

            duration_ms = int((time.monotonic() - turn_start) * 1000)
            turn_results.append(
                TurnResult(
                    turn_index=i,
                    input_text=turn.input_text,
                    trace=trace,
                    eval_results=tuple(eval_results),
                    duration_ms=duration_ms,
                )
            )

        total_duration = int((time.monotonic() - total_start) * 1000)
        return self._build_result(adapter.name, turn_results, total_duration)

    @staticmethod
    def _build_result(
        agent_name: str,
        turn_results: list[TurnResult],
        total_duration_ms: int,
    ) -> ConversationResult:
        """Aggregate per-turn results into a ConversationResult."""
        passed = 0
        scores: list[float] = []

        for tr in turn_results:
            if tr.eval_results:
                turn_passed = all(er.verdict == EvalVerdict.PASS for er in tr.eval_results)
                if turn_passed:
                    passed += 1
                turn_score = sum(er.score for er in tr.eval_results) / len(tr.eval_results)
                scores.append(turn_score)
            elif tr.trace is not None:
                # No evaluators but trace exists = pass
                passed += 1
                scores.append(1.0)
            else:
                scores.append(0.0)

        aggregate_score = sum(scores) / len(scores) if scores else 0.0

        return ConversationResult(
            agent_name=agent_name,
            turn_results=tuple(turn_results),
            total_turns=len(turn_results),
            passed_turns=passed,
            aggregate_score=round(min(aggregate_score, 1.0), 6),
            total_duration_ms=total_duration_ms,
        )

__init__(evaluators=None)

Initialize the conversation runner.

Parameters:

Name Type Description Default
evaluators dict[str, EvaluatorProtocol] | None

Named evaluator instances for per-turn evaluation.

None
Source code in src/agentprobe/core/conversation.py
def __init__(
    self,
    evaluators: dict[str, EvaluatorProtocol] | None = None,
) -> None:
    """Create a conversation runner.

    Args:
        evaluators: Mapping of evaluator names to instances used for
            per-turn evaluation; defaults to no evaluators.
    """
    self._evaluators = evaluators if evaluators else {}

run(adapter, turns, *, pass_context=True) async

Execute a multi-turn conversation.

Parameters:

Name Type Description Default
adapter AdapterProtocol

The agent adapter to invoke for each turn.

required
turns Sequence[ConversationTurn]

The conversation turns to execute in order.

required
pass_context bool

If True, prepend previous output to next turn's input.

True

Returns:

Type Description
ConversationResult

A ConversationResult with per-turn details and aggregate metrics.

Raises:

Type Description
ConversationError

If a critical error occurs during execution.

Source code in src/agentprobe/core/conversation.py
async def run(
    self,
    adapter: AdapterProtocol,
    turns: Sequence[ConversationTurn],
    *,
    pass_context: bool = True,
) -> ConversationResult:
    """Execute a multi-turn conversation.

    Args:
        adapter: The agent adapter to invoke for each turn.
        turns: The conversation turns to execute in order.
        pass_context: If True, prepend previous output to next turn's input.

    Returns:
        A ConversationResult with per-turn details and aggregate metrics.

    Raises:
        ConversationError: If a critical error occurs during execution.
    """
    turn_results: list[TurnResult] = []
    previous_output = ""
    total_start = time.monotonic()

    for i, turn in enumerate(turns):
        input_text = turn.input_text
        if pass_context and previous_output:
            input_text = f"{previous_output}\n\n{turn.input_text}"

        turn_start = time.monotonic()
        try:
            trace = await adapter.invoke(input_text)
            previous_output = trace.output_text
        except Exception:
            # logger.exception keeps the full traceback; logger.error
            # with just the message string would discard it.
            logger.exception("Turn %d failed", i)
            turn_results.append(
                TurnResult(
                    turn_index=i,
                    input_text=turn.input_text,
                    trace=None,
                    eval_results=(),
                    duration_ms=int((time.monotonic() - turn_start) * 1000),
                )
            )
            continue

        # Run per-turn evaluators
        eval_results: list[EvalResult] = []
        if turn.evaluators:
            test_case = TestCase(
                name=f"turn_{i}",
                input_text=turn.input_text,
                expected_output=turn.expected_output,
            )
            for eval_name in turn.evaluators:
                evaluator = self._evaluators.get(eval_name)
                if evaluator is None:
                    logger.warning("Turn %d: evaluator '%s' not found", i, eval_name)
                    continue
                result = await evaluator.evaluate(test_case, trace)
                eval_results.append(result)

        duration_ms = int((time.monotonic() - turn_start) * 1000)
        turn_results.append(
            TurnResult(
                turn_index=i,
                input_text=turn.input_text,
                trace=trace,
                eval_results=tuple(eval_results),
                duration_ms=duration_ms,
            )
        )

    total_duration = int((time.monotonic() - total_start) * 1000)
    return self._build_result(adapter.name, turn_results, total_duration)

Chaos Proxy

agentprobe.core.chaos

Chaos fault injection proxy for testing agent resilience.

Wraps an adapter and modifies tool call results in the resulting trace to simulate failures, timeouts, malformed data, rate limits, slow responses, and empty responses.

ChaosProxy

Wraps an adapter and injects chaos faults into tool call results.

After the real adapter produces a trace, ChaosProxy scans tool calls and probabilistically replaces their outputs with fault-injected variants. The modified trace is returned as a frozen copy.

Attributes:

Name Type Description
overrides

Configured fault injection rules.

Source code in src/agentprobe/core/chaos.py
class ChaosProxy:
    """Adapter wrapper that injects chaos faults into tool call results.

    Once the real adapter has produced a trace, the proxy walks its tool
    calls and, with configured probability, swaps each matching call's
    output for a fault-injected variant. The result is returned as a
    frozen copy of the trace.

    Attributes:
        overrides: Configured fault injection rules.
    """

    def __init__(
        self,
        adapter: AdapterProtocol,
        overrides: list[ChaosOverride],
        *,
        seed: int = 42,
    ) -> None:
        """Create a chaos proxy around a real adapter.

        Args:
            adapter: The real adapter to wrap.
            overrides: Fault injection rules to apply.
            seed: Seed making fault injection deterministic.
        """
        self._adapter = adapter
        self._overrides = overrides
        self._rng = random.Random(seed)

    @property
    def name(self) -> str:
        """Wrapped adapter's name, prefixed to mark chaos mode."""
        return f"chaos-{self._adapter.name}"

    async def invoke(self, input_text: str, **kwargs: Any) -> Trace:
        """Call the wrapped adapter, then apply fault injection.

        Args:
            input_text: Input text to send to the adapter.
            **kwargs: Additional adapter arguments.

        Returns:
            The adapter's trace, possibly with injected faults.
        """
        real_trace = await self._adapter.invoke(input_text, **kwargs)
        return self._apply_chaos(real_trace)

    def _apply_chaos(self, trace: Trace) -> Trace:
        """Return a copy of the trace with fault-injected tool calls."""
        if not trace.tool_calls or not self._overrides:
            return trace

        replaced = False
        new_calls: list[ToolCall] = []

        for call in trace.tool_calls:
            rule = self._match_override(call)
            # One RNG draw per matched call keeps injection deterministic
            # for a given seed and call ordering.
            if rule is not None and self._rng.random() < rule.probability:
                new_calls.append(self._inject_fault(call, rule))
                replaced = True
            else:
                new_calls.append(call)

        if not replaced:
            return trace

        return trace.model_copy(
            update={"tool_calls": tuple(new_calls)},
        )

    def _match_override(self, tool_call: ToolCall) -> ChaosOverride | None:
        """Return the first override targeting this tool call, if any."""
        candidates = (
            rule
            for rule in self._overrides
            if rule.target_tool is None or rule.target_tool == tool_call.tool_name
        )
        return next(candidates, None)

    def _inject_fault(self, tool_call: ToolCall, override: ChaosOverride) -> ToolCall:
        """Return a copy of the tool call with the override's fault applied."""
        logger.debug(
            "Injecting %s fault into tool '%s'",
            override.chaos_type.value,
            tool_call.tool_name,
        )
        kind = override.chaos_type
        updates: dict[str, Any]
        if kind == ChaosType.TIMEOUT:
            updates = {
                "success": False,
                "error": "Chaos: operation timed out",
                "tool_output": None,
            }
        elif kind == ChaosType.ERROR:
            updates = {
                "success": False,
                "error": f"Chaos: {override.error_message}",
                "tool_output": None,
            }
        elif kind == ChaosType.MALFORMED:
            updates = {
                "success": True,
                "tool_output": "{malformed: data, <<invalid>>}",
            }
        elif kind == ChaosType.RATE_LIMIT:
            updates = {
                "success": False,
                "error": "Chaos: rate limit exceeded (429)",
                "tool_output": None,
            }
        elif kind == ChaosType.SLOW:
            updates = {
                "success": True,
                "tool_output": tool_call.tool_output,
                "latency_ms": tool_call.latency_ms + override.delay_ms,
            }
        elif kind == ChaosType.EMPTY:
            updates = {
                "success": True,
                "tool_output": "",
            }
        else:
            updates = {"success": False, "error": f"Chaos: unknown type {override.chaos_type}"}
        return tool_call.model_copy(update=updates)

name property

Return the adapter name with chaos prefix.

__init__(adapter, overrides, *, seed=42)

Initialize the chaos proxy.

Parameters:

Name Type Description Default
adapter AdapterProtocol

The real adapter to wrap.

required
overrides list[ChaosOverride]

Fault injection rules to apply.

required
seed int

Random seed for deterministic fault injection.

42
Source code in src/agentprobe/core/chaos.py
def __init__(
    self,
    adapter: AdapterProtocol,
    overrides: list[ChaosOverride],
    *,
    seed: int = 42,
) -> None:
    """Set up the chaos proxy around a real adapter.

    Args:
        adapter: The real adapter to wrap.
        overrides: Fault injection rules to apply.
        seed: Random seed for deterministic fault injection.
    """
    # A seeded RNG keeps fault injection reproducible across runs.
    self._rng = random.Random(seed)
    self._overrides = overrides
    self._adapter = adapter

invoke(input_text, **kwargs) async

Invoke the wrapped adapter and inject faults.

Parameters:

Name Type Description Default
input_text str

Input text to send to the adapter.

required
**kwargs Any

Additional adapter arguments.

{}

Returns:

Type Description
Trace

A modified trace with chaos faults injected.

Source code in src/agentprobe/core/chaos.py
async def invoke(self, input_text: str, **kwargs: Any) -> Trace:
    """Run the wrapped adapter, then rewrite its trace with injected faults.

    Args:
        input_text: Input text to send to the adapter.
        **kwargs: Additional adapter arguments.

    Returns:
        A modified trace with chaos faults injected.
    """
    return self._apply_chaos(await self._adapter.invoke(input_text, **kwargs))

Snapshot Manager

agentprobe.core.snapshot

Snapshot (golden file) management for output comparison testing.

Saves, loads, compares, and updates agent output snapshots stored as JSON files. Supports multi-dimension comparison including tool calls, response structure, key facts, cost, and latency.

SnapshotManager

Manages snapshot files for golden-file testing.

Saves traces as JSON snapshots and compares current traces against saved snapshots across multiple dimensions.

Attributes:

Name Type Description
snapshot_dir

Directory where snapshot files are stored.

threshold

Similarity threshold for matching.

Source code in src/agentprobe/core/snapshot.py
class SnapshotManager:
    """Manages snapshot files for golden-file testing.

    Saves traces as JSON snapshots and compares current traces against
    saved snapshots across multiple dimensions.

    Attributes:
        snapshot_dir: Directory where snapshot files are stored.
        threshold: Similarity threshold for matching.
    """

    def __init__(
        self,
        snapshot_dir: str | Path = ".agentprobe/snapshots",
        *,
        threshold: float = 0.8,
    ) -> None:
        """Initialize the snapshot manager.

        Args:
            snapshot_dir: Directory for snapshot storage.
            threshold: Similarity threshold for a match.
        """
        self._dir = Path(snapshot_dir)
        self._threshold = threshold

    def _snapshot_path(self, name: str) -> Path:
        """Get the file path for a named snapshot."""
        return self._dir / f"{name}.json"

    @staticmethod
    def _magnitude_ratio(expected: float, actual: float) -> float:
        """Similarity of two non-negative magnitudes as a min/max ratio.

        Two zeros are a perfect match (1.0); a zero baseline against a
        nonzero actual is no match (0.0).
        """
        if expected > 0:
            return min(actual, expected) / max(actual, expected)
        return 1.0 if actual == 0 else 0.0

    def save(self, name: str, trace: Trace) -> Path:
        """Save a trace as a named snapshot.

        Args:
            name: Snapshot name.
            trace: Trace to save.

        Returns:
            Path to the saved snapshot file.
        """
        # Create the snapshot directory lazily on first save.
        self._dir.mkdir(parents=True, exist_ok=True)
        path = self._snapshot_path(name)
        path.write_text(trace.model_dump_json(indent=2), encoding="utf-8")
        logger.info("Snapshot saved: %s", path)
        return path

    def load(self, name: str) -> Trace:
        """Load a named snapshot.

        Args:
            name: Snapshot name.

        Returns:
            The saved Trace.

        Raises:
            SnapshotError: If the snapshot does not exist.
        """
        path = self._snapshot_path(name)
        if not path.exists():
            raise SnapshotError(f"Snapshot not found: {name}")
        return Trace.model_validate_json(path.read_text(encoding="utf-8"))

    def exists(self, name: str) -> bool:
        """Check if a named snapshot exists."""
        return self._snapshot_path(name).exists()

    def list_snapshots(self) -> list[str]:
        """List all snapshot names."""
        if not self._dir.is_dir():
            return []
        return sorted(p.stem for p in self._dir.glob("*.json"))

    def delete(self, name: str) -> bool:
        """Delete a named snapshot.

        Args:
            name: Snapshot name.

        Returns:
            True if deleted, False if not found.
        """
        path = self._snapshot_path(name)
        if path.exists():
            path.unlink()
            logger.info("Snapshot deleted: %s", name)
            return True
        return False

    def compare(self, name: str, current: Trace) -> SnapshotDiff:
        """Compare a current trace against a saved snapshot.

        Compares across dimensions: tool_calls, output, token_usage,
        and latency.

        Args:
            name: Snapshot name.
            current: Current trace to compare.

        Returns:
            A SnapshotDiff with per-dimension similarity scores.

        Raises:
            SnapshotError: If the snapshot does not exist.
        """
        baseline = self.load(name)
        diffs: list[DiffItem] = []

        # Tool call sequence similarity
        baseline_tools = [tc.tool_name for tc in baseline.tool_calls]
        current_tools = [tc.tool_name for tc in current.tool_calls]
        tool_sim = _sequence_similarity(baseline_tools, current_tools)
        diffs.append(
            DiffItem(
                dimension="tool_calls",
                expected=baseline_tools,
                actual=current_tools,
                similarity=round(tool_sim, 4),
            )
        )

        # Output text similarity; only a 200-char preview is stored in the diff.
        output_sim = _keyword_overlap(baseline.output_text, current.output_text)
        diffs.append(
            DiffItem(
                dimension="output",
                expected=baseline.output_text[:200],
                actual=current.output_text[:200],
                similarity=round(output_sim, 4),
            )
        )

        # Token usage similarity
        baseline_tokens = baseline.total_input_tokens + baseline.total_output_tokens
        current_tokens = current.total_input_tokens + current.total_output_tokens
        diffs.append(
            DiffItem(
                dimension="token_usage",
                expected=baseline_tokens,
                actual=current_tokens,
                similarity=round(self._magnitude_ratio(baseline_tokens, current_tokens), 4),
            )
        )

        # Latency similarity
        diffs.append(
            DiffItem(
                dimension="latency",
                expected=baseline.total_latency_ms,
                actual=current.total_latency_ms,
                similarity=round(
                    self._magnitude_ratio(
                        baseline.total_latency_ms, current.total_latency_ms
                    ),
                    4,
                ),
            )
        )

        # Overall weighted average; weights sum to 1.0 across the four dimensions.
        weights = {"tool_calls": 0.35, "output": 0.35, "token_usage": 0.15, "latency": 0.15}
        overall = sum(d.similarity * weights.get(d.dimension, 0.0) for d in diffs)

        is_match = overall >= self._threshold

        return SnapshotDiff(
            snapshot_name=name,
            overall_similarity=round(overall, 4),
            diffs=tuple(diffs),
            is_match=is_match,
            threshold=self._threshold,
        )

    def update_all(self, snapshots: dict[str, Trace]) -> int:
        """Update multiple snapshots at once.

        Args:
            snapshots: Mapping of snapshot names to traces.

        Returns:
            Number of snapshots updated.
        """
        for name, trace in snapshots.items():
            self.save(name, trace)
        return len(snapshots)

__init__(snapshot_dir='.agentprobe/snapshots', *, threshold=0.8)

Initialize the snapshot manager.

Parameters:

Name Type Description Default
snapshot_dir str | Path

Directory for snapshot storage.

'.agentprobe/snapshots'
threshold float

Similarity threshold for a match.

0.8
Source code in src/agentprobe/core/snapshot.py
def __init__(
    self,
    snapshot_dir: str | Path = ".agentprobe/snapshots",
    *,
    threshold: float = 0.8,
) -> None:
    """Create a manager rooted at *snapshot_dir*.

    Args:
        snapshot_dir: Directory for snapshot storage.
        threshold: Similarity threshold for a match.
    """
    # Normalize to Path so str and Path inputs behave identically.
    self._threshold = threshold
    self._dir = Path(snapshot_dir)

save(name, trace)

Save a trace as a named snapshot.

Parameters:

Name Type Description Default
name str

Snapshot name.

required
trace Trace

Trace to save.

required

Returns:

Type Description
Path

Path to the saved snapshot file.

Source code in src/agentprobe/core/snapshot.py
def save(self, name: str, trace: Trace) -> Path:
    """Write *trace* to disk as the snapshot called *name*.

    Args:
        name: Snapshot name.
        trace: Trace to save.

    Returns:
        Path to the saved snapshot file.
    """
    # Create the snapshot directory lazily on first save.
    self._dir.mkdir(parents=True, exist_ok=True)
    target = self._snapshot_path(name)
    serialized = trace.model_dump_json(indent=2)
    target.write_text(serialized, encoding="utf-8")
    logger.info("Snapshot saved: %s", target)
    return target

load(name)

Load a named snapshot.

Parameters:

Name Type Description Default
name str

Snapshot name.

required

Returns:

Type Description
Trace

The saved Trace.

Raises:

Type Description
SnapshotError

If the snapshot does not exist.

Source code in src/agentprobe/core/snapshot.py
def load(self, name: str) -> Trace:
    """Read a saved snapshot back into a Trace.

    Args:
        name: Snapshot name.

    Returns:
        The saved Trace.

    Raises:
        SnapshotError: If the snapshot does not exist.
    """
    path = self._snapshot_path(name)
    if not path.exists():
        raise SnapshotError(f"Snapshot not found: {name}")
    raw = path.read_text(encoding="utf-8")
    return Trace.model_validate_json(raw)

exists(name)

Check if a named snapshot exists.

Source code in src/agentprobe/core/snapshot.py
def exists(self, name: str) -> bool:
    """Report whether a snapshot named *name* has been saved."""
    snapshot_file = self._snapshot_path(name)
    return snapshot_file.exists()

list_snapshots()

List all snapshot names.

Source code in src/agentprobe/core/snapshot.py
def list_snapshots(self) -> list[str]:
    """Return the names of all stored snapshots, sorted alphabetically."""
    if not self._dir.is_dir():
        # No directory yet means nothing has been saved.
        return []
    names = [snapshot.stem for snapshot in self._dir.glob("*.json")]
    names.sort()
    return names

delete(name)

Delete a named snapshot.

Parameters:

Name Type Description Default
name str

Snapshot name.

required

Returns:

Type Description
bool

True if deleted, False if not found.

Source code in src/agentprobe/core/snapshot.py
def delete(self, name: str) -> bool:
    """Remove a stored snapshot from disk.

    Args:
        name: Snapshot name.

    Returns:
        True if deleted, False if not found.
    """
    target = self._snapshot_path(name)
    if not target.exists():
        return False
    target.unlink()
    logger.info("Snapshot deleted: %s", name)
    return True

compare(name, current)

Compare a current trace against a saved snapshot.

Compares across dimensions: tool_calls, output, token_usage, and latency.

Parameters:

Name Type Description Default
name str

Snapshot name.

required
current Trace

Current trace to compare.

required

Returns:

Type Description
SnapshotDiff

A SnapshotDiff with per-dimension similarity scores.

Raises:

Type Description
SnapshotError

If the snapshot does not exist.

Source code in src/agentprobe/core/snapshot.py
def compare(self, name: str, current: Trace) -> SnapshotDiff:
    """Compare a current trace against a saved snapshot.

    Compares across dimensions: tool_calls, output, token_usage,
    and latency.

    Args:
        name: Snapshot name.
        current: Current trace to compare.

    Returns:
        A SnapshotDiff with per-dimension similarity scores.

    Raises:
        SnapshotError: If the snapshot does not exist.
    """

    def ratio(expected: float, actual: float) -> float:
        # min/max ratio of two non-negative magnitudes; two zeros count
        # as a perfect match, a zero baseline otherwise as no match.
        if expected > 0:
            return min(actual, expected) / max(actual, expected)
        return 1.0 if actual == 0 else 0.0

    baseline = self.load(name)
    diffs: list[DiffItem] = []

    # Tool call sequence similarity
    baseline_tools = [tc.tool_name for tc in baseline.tool_calls]
    current_tools = [tc.tool_name for tc in current.tool_calls]
    tool_sim = _sequence_similarity(baseline_tools, current_tools)
    diffs.append(
        DiffItem(
            dimension="tool_calls",
            expected=baseline_tools,
            actual=current_tools,
            similarity=round(tool_sim, 4),
        )
    )

    # Output text similarity; only a 200-char preview is stored in the diff.
    output_sim = _keyword_overlap(baseline.output_text, current.output_text)
    diffs.append(
        DiffItem(
            dimension="output",
            expected=baseline.output_text[:200],
            actual=current.output_text[:200],
            similarity=round(output_sim, 4),
        )
    )

    # Token usage similarity
    baseline_tokens = baseline.total_input_tokens + baseline.total_output_tokens
    current_tokens = current.total_input_tokens + current.total_output_tokens
    diffs.append(
        DiffItem(
            dimension="token_usage",
            expected=baseline_tokens,
            actual=current_tokens,
            similarity=round(ratio(baseline_tokens, current_tokens), 4),
        )
    )

    # Latency similarity
    diffs.append(
        DiffItem(
            dimension="latency",
            expected=baseline.total_latency_ms,
            actual=current.total_latency_ms,
            similarity=round(ratio(baseline.total_latency_ms, current.total_latency_ms), 4),
        )
    )

    # Overall weighted average; weights sum to 1.0 across the four dimensions.
    weights = {"tool_calls": 0.35, "output": 0.35, "token_usage": 0.15, "latency": 0.15}
    overall = sum(d.similarity * weights.get(d.dimension, 0.0) for d in diffs)

    is_match = overall >= self._threshold

    return SnapshotDiff(
        snapshot_name=name,
        overall_similarity=round(overall, 4),
        diffs=tuple(diffs),
        is_match=is_match,
        threshold=self._threshold,
    )

update_all(snapshots)

Update multiple snapshots at once.

Parameters:

Name Type Description Default
snapshots dict[str, Trace]

Mapping of snapshot names to traces.

required

Returns:

Type Description
int

Number of snapshots updated.

Source code in src/agentprobe/core/snapshot.py
def update_all(self, snapshots: dict[str, Trace]) -> int:
    """Update multiple snapshots at once.

    Args:
        snapshots: Mapping of snapshot names to traces.

    Returns:
        Number of snapshots updated.
    """
    for snapshot_name, snapshot_trace in snapshots.items():
        self.save(snapshot_name, snapshot_trace)
    # Every entry is saved unconditionally, so the count is just the mapping size.
    return len(snapshots)