# Pytest

This guide shows how to integrate [Scorable](https://scorable.ai) LLM-as-a-Judge evaluators into your Python test suites using `pytest`.

## Installation

```bash
pip install scorable
```

## Setup

### Scorable Fixtures

Create a `conftest.py` file to define reusable pytest fixtures for Scorable:

```python
import pytest
import os
from scorable import Scorable

@pytest.fixture(scope="session")
def scorable_client():
    """
    Build a single Scorable client shared across the whole test session.

    Skips dependent tests (rather than erroring) when the
    SCORABLE_API_KEY environment variable is not configured.
    """
    key = os.getenv("SCORABLE_API_KEY")
    if key:
        return Scorable(api_key=key)
    # pytest.skip raises, so nothing below it runs when the key is absent.
    pytest.skip("SCORABLE_API_KEY not set")

@pytest.fixture
def assert_scorable_quality(scorable_client):
    """
    Provide a callable that runs a Scorable judge by name and fails the
    current test when the average evaluator score is below a threshold.

    Returns:
        A function ``_check(judge_name, request, response, threshold=0.7)``
        that returns the average score on success and calls ``pytest.fail``
        (with per-evaluator justifications) otherwise.
    """
    def _check(judge_name: str, request: str, response: str, threshold: float = 0.7) -> float:
        # Execute the judge by its name
        result = scorable_client.judges.run_by_name(
            name=judge_name,
            request=request,
            response=response,
            # "<git-hash>" is a placeholder — substitute your commit SHA
            # so results can be traced back to a build.
            tags=["test", "<git-hash>"]
        )

        # Alternatively, call an evaluator directly
        # result = scorable_client.evaluators.run_by_name(
        #     name="Accuracy",
        #     request=request,
        #     response=response,
        #     tags=["test", "<git-hash>"]
        # )

        # Calculate average score across all active evaluators
        scores = [r.score for r in result.evaluator_results]
        if not scores:
            # Guard against ZeroDivisionError: a judge with no active
            # evaluators should fail loudly with a clear message.
            pytest.fail(
                f"Scorable Judge '{judge_name}' returned no evaluator results."
            )
        avg_score = sum(scores) / len(scores)

        # Log justification if assertion fails
        if avg_score < threshold:
            details = "\n".join(
                [f"- {r.evaluator_name}: {r.score} (Reason: {r.justification})"
                 for r in result.evaluator_results]
            )
            pytest.fail(
                f"Scorable Judge '{judge_name}' evaluation failed.\n"
                f"Score: {avg_score:.2f} (Threshold: {threshold})\n"
                f"Details:\n{details}"
            )

        return avg_score

    return _check
```

## Writing Tests

Create your test file (e.g., `test_ai_assistant.py`):

```python
import pytest
from my_app import my_ai_workflow

# Representative end-user requests the assistant must handle.
TEST_CASES = [
    "Archive my last 3 newsletters and let me know when done.",
    "Create a label called 'Receipts' and apply it to my latest Amazon email.",
    "Summarize the thread from 'Travel Booking' about my flight.",
]

@pytest.mark.parametrize("user_request", TEST_CASES)
def test_assistant_scenarios(assert_scorable_quality, user_request):
    """
    Run each assistant scenario through the workflow and grade the
    resulting answer with the configured Scorable judge.
    """
    # Replace with your own AI workflow
    answer = my_ai_workflow(user_request)

    # Evaluate with Scorable using the Judge Name
    assert_scorable_quality(
        judge_name="Gmail Assistant Response Auditor",
        request=user_request,
        response=answer,
        threshold=0.8,
    )
```
