Skip to content

Regression

Regression detection and baseline management.

Detector

agentprobe.regression.detector

Regression detection by comparing baseline and current test results.

Flags regressions (score decreases) and improvements (score increases) based on configurable delta thresholds.

RegressionDetector

Compares current test results against a baseline to detect regressions.

Attributes:

Name Type Description
threshold

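Score delta that must be strictly exceeded, in either direction, for a test to be flagged as a regression or an improvement; a delta exactly equal to the threshold is reported as unchanged.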

Source code in src/agentprobe/regression/detector.py
class RegressionDetector:
    """Compares current test results against a baseline to detect regressions.

    A test is flagged only when its score delta strictly exceeds the
    threshold in either direction; smaller deltas count as unchanged.

    Attributes:
        threshold: Minimum score delta to flag as regression/improvement.
    """

    def __init__(self, threshold: float = 0.05) -> None:
        """Initialize the regression detector.

        Args:
            threshold: Score delta threshold for flagging changes. Must be
                non-negative.

        Raises:
            ValueError: If ``threshold`` is negative or NaN.
        """
        # Written as ``not threshold >= 0`` so that NaN (which compares false
        # against everything) is rejected too. A negative threshold would let
        # one delta pass both the regression and the improvement test.
        if not threshold >= 0:
            raise ValueError(f"threshold must be non-negative, got {threshold!r}")
        self._threshold = threshold

    @property
    def threshold(self) -> float:
        """Score delta that must be exceeded for a change to be flagged."""
        return self._threshold

    def compare(
        self,
        baseline_name: str,
        baseline_results: Sequence[TestResult],
        current_results: Sequence[TestResult],
    ) -> RegressionReport:
        """Compare current results against a baseline.

        Tests are matched by name. Tests present in only one set are
        excluded from comparison. If a name occurs more than once within
        a set, the last occurrence is the one compared.

        Args:
            baseline_name: Name of the baseline for reporting.
            baseline_results: Test results from the baseline run.
            current_results: Test results from the current run.

        Returns:
            A RegressionReport with per-test comparisons, ordered by test
            name, plus regression/improvement/unchanged counts.
        """
        baseline_map = {r.test_name: r for r in baseline_results}
        current_map = {r.test_name: r for r in current_results}

        # Sorting keeps the report order stable regardless of input order.
        common_names = sorted(set(baseline_map) & set(current_map))
        comparisons: list[TestComparison] = []
        regressions = 0
        improvements = 0
        unchanged = 0

        for name in common_names:
            bl = baseline_map[name]
            cr = current_map[name]
            # Rounding strips float noise so that near-threshold deltas are
            # classified consistently.
            delta = round(cr.score - bl.score, 6)

            # Strict comparisons: a delta equal to the threshold is unchanged.
            is_regression = delta < -self._threshold
            is_improvement = delta > self._threshold

            if is_regression:
                regressions += 1
                logger.warning(
                    "Regression detected: %s (%.3f -> %.3f, delta=%.3f)",
                    name,
                    bl.score,
                    cr.score,
                    delta,
                )
            elif is_improvement:
                improvements += 1
                logger.info(
                    "Improvement detected: %s (%.3f -> %.3f, delta=%.3f)",
                    name,
                    bl.score,
                    cr.score,
                    delta,
                )
            else:
                unchanged += 1

            comparisons.append(
                TestComparison(
                    test_name=name,
                    baseline_score=bl.score,
                    current_score=cr.score,
                    delta=delta,
                    is_regression=is_regression,
                    is_improvement=is_improvement,
                )
            )

        return RegressionReport(
            baseline_name=baseline_name,
            comparisons=tuple(comparisons),
            total_tests=len(comparisons),
            regressions=regressions,
            improvements=improvements,
            unchanged=unchanged,
            threshold=self._threshold,
        )

__init__(threshold=0.05)

Initialize the regression detector.

Parameters:

Name Type Description Default
threshold float

Score delta threshold for flagging changes.

0.05
Source code in src/agentprobe/regression/detector.py
def __init__(self, threshold: float = 0.05) -> None:
    """Initialize the regression detector.

    Args:
        threshold: Score delta threshold for flagging changes. Must be
            non-negative.

    Raises:
        ValueError: If ``threshold`` is negative or NaN.
    """
    # Written as ``not threshold >= 0`` so that NaN (which compares false
    # against everything) is rejected too. A negative threshold would let
    # one delta pass both the regression and the improvement test.
    if not threshold >= 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")
    self._threshold = threshold

compare(baseline_name, baseline_results, current_results)

Compare current results against a baseline.

Tests are matched by name. Tests present in only one set are excluded from comparison.

Parameters:

Name Type Description Default
baseline_name str

Name of the baseline for reporting.

required
baseline_results Sequence[TestResult]

Test results from the baseline run.

required
current_results Sequence[TestResult]

Test results from the current run.

required

Returns:

Type Description
RegressionReport

A RegressionReport with per-test comparisons.

Source code in src/agentprobe/regression/detector.py
def compare(
    self,
    baseline_name: str,
    baseline_results: Sequence[TestResult],
    current_results: Sequence[TestResult],
) -> RegressionReport:
    """Build a regression report for the tests present in both result sets.

    Results are paired by ``test_name``; a test found in only one of the
    two sets is left out of the report. For each pair the score delta
    (current minus baseline, rounded to six decimals) is checked against
    the detector's threshold using strict comparisons.

    Args:
        baseline_name: Name of the baseline for reporting.
        baseline_results: Test results from the baseline run.
        current_results: Test results from the current run.

    Returns:
        A RegressionReport holding one comparison per shared test, in
        name order, together with the summary counts.
    """
    previous_by_name = {result.test_name: result for result in baseline_results}
    latest_by_name = {result.test_name: result for result in current_results}
    shared_names = sorted(previous_by_name.keys() & latest_by_name.keys())

    rows: list[TestComparison] = []
    worse_count = better_count = steady_count = 0

    for test_name in shared_names:
        old = previous_by_name[test_name]
        new = latest_by_name[test_name]
        change = round(new.score - old.score, 6)

        got_worse = change < -self._threshold
        got_better = change > self._threshold

        if not got_worse and not got_better:
            steady_count += 1
        elif got_worse:
            # Regression takes precedence when both flags are set.
            worse_count += 1
            logger.warning(
                "Regression detected: %s (%.3f -> %.3f, delta=%.3f)",
                test_name,
                old.score,
                new.score,
                change,
            )
        else:
            better_count += 1
            logger.info(
                "Improvement detected: %s (%.3f -> %.3f, delta=%.3f)",
                test_name,
                old.score,
                new.score,
                change,
            )

        row = TestComparison(
            test_name=test_name,
            baseline_score=old.score,
            current_score=new.score,
            delta=change,
            is_regression=got_worse,
            is_improvement=got_better,
        )
        rows.append(row)

    return RegressionReport(
        baseline_name=baseline_name,
        comparisons=tuple(rows),
        total_tests=len(rows),
        regressions=worse_count,
        improvements=better_count,
        unchanged=steady_count,
        threshold=self._threshold,
    )

Baseline Manager

agentprobe.regression.baseline

Baseline management for regression testing.

Provides CRUD operations for named baselines stored as JSON files containing serialized TestResult lists.

BaselineManager

Manages baseline files for regression testing.

Stores sets of TestResult objects as JSON files, enabling comparison between historical and current test runs.

Attributes:

Name Type Description
baseline_dir

Directory where baseline files are stored.

Source code in src/agentprobe/regression/baseline.py
class BaselineManager:
    """Manages baseline files for regression testing.

    Stores sets of TestResult objects as JSON files, enabling
    comparison between historical and current test runs. Each baseline
    lives in ``<baseline_dir>/<name>.json``.

    Attributes:
        baseline_dir: Directory where baseline files are stored.
    """

    def __init__(self, baseline_dir: str | Path = ".agentprobe/baselines") -> None:
        """Initialize the baseline manager.

        The directory is not created here; ``save`` creates it on demand.

        Args:
            baseline_dir: Directory for baseline storage.
        """
        self._dir = Path(baseline_dir)

    @property
    def baseline_dir(self) -> Path:
        """Directory where baseline files are stored."""
        return self._dir

    def _baseline_path(self, name: str) -> Path:
        """Get the file path for a named baseline."""
        return self._dir / f"{name}.json"

    def save(self, name: str, results: Sequence[TestResult]) -> Path:
        """Save test results as a named baseline.

        Creates the baseline directory if needed and overwrites any
        existing baseline with the same name.

        Args:
            name: Baseline name.
            results: Test results to save.

        Returns:
            Path to the saved baseline file.
        """
        self._dir.mkdir(parents=True, exist_ok=True)
        path = self._baseline_path(name)

        # Round-trip through each result's own JSON dump so the file holds
        # plain JSON values, then emit one pretty-printed array.
        data = [json.loads(r.model_dump_json()) for r in results]
        path.write_text(
            json.dumps(data, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        logger.info("Baseline saved: %s (%d results)", name, len(data))
        return path

    def load(self, name: str) -> list[TestResult]:
        """Load a named baseline.

        Args:
            name: Baseline name.

        Returns:
            List of saved TestResult objects.

        Raises:
            RegressionError: If the baseline does not exist.
        """
        path = self._baseline_path(name)
        # Read first and translate the failure, instead of checking
        # ``exists()`` beforehand: the file could vanish between the check
        # and the read.
        try:
            text = path.read_text(encoding="utf-8")
        except (FileNotFoundError, NotADirectoryError) as err:
            raise RegressionError(f"Baseline not found: {name}") from err

        raw = json.loads(text)
        return [TestResult.model_validate_json(json.dumps(item)) for item in raw]

    def exists(self, name: str) -> bool:
        """Check if a named baseline exists."""
        return self._baseline_path(name).exists()

    def list_baselines(self) -> list[str]:
        """List all baseline names, sorted; empty if the directory is missing."""
        if not self._dir.is_dir():
            return []
        return sorted(p.stem for p in self._dir.glob("*.json"))

    def delete(self, name: str) -> bool:
        """Delete a named baseline.

        Args:
            name: Baseline name.

        Returns:
            True if deleted, False if not found.
        """
        path = self._baseline_path(name)
        # Attempt the unlink directly so a concurrent removal is reported
        # as "not found" rather than raising.
        try:
            path.unlink()
        except (FileNotFoundError, NotADirectoryError):
            return False
        logger.info("Baseline deleted: %s", name)
        return True

__init__(baseline_dir='.agentprobe/baselines')

Initialize the baseline manager.

Parameters:

Name Type Description Default
baseline_dir str | Path

Directory for baseline storage.

'.agentprobe/baselines'
Source code in src/agentprobe/regression/baseline.py
def __init__(self, baseline_dir: str | Path = ".agentprobe/baselines") -> None:
    """Set up a manager rooted at the given baseline directory.

    Args:
        baseline_dir: Directory for baseline storage, as a string or Path.
    """
    storage_root = Path(baseline_dir)
    self._dir = storage_root

save(name, results)

Save test results as a named baseline.

Parameters:

Name Type Description Default
name str

Baseline name.

required
results Sequence[TestResult]

Test results to save.

required

Returns:

Type Description
Path

Path to the saved baseline file.

Source code in src/agentprobe/regression/baseline.py
def save(self, name: str, results: Sequence[TestResult]) -> Path:
    """Write the given test results to disk under a baseline name.

    The baseline directory is created when missing, and each result is
    serialized through its own JSON dump before being written as one
    indented JSON array.

    Args:
        name: Baseline name.
        results: Test results to save.

    Returns:
        Path to the saved baseline file.
    """
    self._dir.mkdir(parents=True, exist_ok=True)
    target = self._baseline_path(name)

    payload = []
    for result in results:
        payload.append(json.loads(result.model_dump_json()))

    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")

    logger.info("Baseline saved: %s (%d results)", name, len(payload))
    return target

load(name)

Load a named baseline.

Parameters:

Name Type Description Default
name str

Baseline name.

required

Returns:

Type Description
list[TestResult]

List of saved TestResult objects.

Raises:

Type Description
RegressionError

If the baseline does not exist.

Source code in src/agentprobe/regression/baseline.py
def load(self, name: str) -> list[TestResult]:
    """Load a named baseline.

    Args:
        name: Baseline name.

    Returns:
        List of saved TestResult objects.

    Raises:
        RegressionError: If the baseline does not exist.
    """
    path = self._baseline_path(name)
    # Read first and translate the failure, instead of checking ``exists()``
    # beforehand: the file could vanish between the check and the read.
    try:
        text = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError) as err:
        raise RegressionError(f"Baseline not found: {name}") from err

    raw = json.loads(text)
    return [TestResult.model_validate_json(json.dumps(item)) for item in raw]

exists(name)

Check if a named baseline exists.

Source code in src/agentprobe/regression/baseline.py
def exists(self, name: str) -> bool:
    """Report whether a baseline file is present for the given name."""
    candidate = self._baseline_path(name)
    return candidate.exists()

list_baselines()

List all baseline names.

Source code in src/agentprobe/regression/baseline.py
def list_baselines(self) -> list[str]:
    """Return the sorted names of all stored baselines.

    A name is the stem of each ``*.json`` file in the baseline directory;
    the result is empty when that directory does not exist.
    """
    names: list[str] = []
    if self._dir.is_dir():
        names = [entry.stem for entry in self._dir.glob("*.json")]
        names.sort()
    return names

delete(name)

Delete a named baseline.

Parameters:

Name Type Description Default
name str

Baseline name.

required

Returns:

Type Description
bool

True if deleted, False if not found.

Source code in src/agentprobe/regression/baseline.py
def delete(self, name: str) -> bool:
    """Delete a named baseline.

    Args:
        name: Baseline name.

    Returns:
        True if deleted, False if not found.
    """
    path = self._baseline_path(name)
    # Attempt the unlink directly so a concurrent removal is reported as
    # "not found" rather than raising.
    try:
        path.unlink()
    except (FileNotFoundError, NotADirectoryError):
        return False
    logger.info("Baseline deleted: %s", name)
    return True