Evaluate a multi-turn chatbot conversation
Setup
pip install openai scorable

Building an Evaluated Chatbot
from openai import OpenAI
from scorable import Scorable
from scorable.multiturn import Turn
class EvaluatedChat:
    """Cooking-assistant chatbot that scores every exchange for helpfulness.

    Wraps an OpenAI model via the Responses API and, after each assistant
    reply, runs the full conversation through Scorable's Helpfulness
    evaluator.
    """

    def __init__(self, model="gpt-5.2", scorable_api_key=None, openai_api_key=None):
        """Create the chat wrapper.

        Args:
            model: OpenAI model name passed to the Responses API.
            scorable_api_key: API key for the Scorable client; None defers to
                the client's own default lookup (presumably environment-based
                — confirm against the Scorable SDK).
            openai_api_key: API key for the OpenAI client; same fallback.
        """
        self.system_prompt = (
            "You are a helpful cooking assistant that answers questions about recipes and cooking."
        )
        self.model = model
        self.openai_client = OpenAI(api_key=openai_api_key)
        self.scorable_client = Scorable(api_key=scorable_api_key)
        # Alternating {"role": ..., "content": ...} dicts, oldest first.
        self.conversation_history = []

    def add_message(self, user_message):
        """Send one user message, record the assistant reply, and evaluate.

        Args:
            user_message: Text of the user's turn.

        Returns:
            dict with "response" (assistant text) and "evaluation" (the
            score/justification dict from evaluate_conversation).
        """
        self.conversation_history.append({"role": "user", "content": user_message})

        # Responses API: system prompt goes in `instructions`, the running
        # message history in `input`.
        response = self.openai_client.responses.create(
            model=self.model,
            instructions=self.system_prompt,
            input=self.conversation_history,
        )

        assistant_message = response.output_text
        self.conversation_history.append({"role": "assistant", "content": assistant_message})

        # Score the conversation as it stands after this exchange.
        evaluation = self.evaluate_conversation()
        return {"response": assistant_message, "evaluation": evaluation}

    def evaluate_conversation(self):
        """Run Scorable's Helpfulness evaluator over the whole history.

        Returns:
            dict with the evaluator's numeric "score" and text
            "justification".
        """
        # Convert the raw role/content dicts into Scorable Turn objects.
        turns = [Turn(role=m["role"], content=m["content"]) for m in self.conversation_history]
        result = self.scorable_client.evaluators.Helpfulness(turns=turns)
        return {"score": result.score, "justification": result.justification}
Using Judges for Multiple Evaluators
Last updated