import time
from dataclasses import dataclass
from typing import List, Dict
import statistics


@dataclass
class ModelMetrics:
    model_name: str
    response_time: float
    token_count: int
    cost: float
    quality_score: float
    timestamp: float

class ModelPerformanceMonitor:
    def __init__(self):
        self.metrics: List[ModelMetrics] = []
        self.quality_evaluator = QualityEvaluator()

    def track_request(self, model_name: str, response_time: float,
                      response_text: str, prompt: str, cost: float):
        """Track a model request"""
        # Calculate metrics
        token_count = len(response_text.split())  # Simplified
        quality_score = self.quality_evaluator.evaluate(prompt, response_text)

        metrics = ModelMetrics(
            model_name=model_name,
            response_time=response_time,
            token_count=token_count,
            cost=cost,
            quality_score=quality_score,
            timestamp=time.time()
        )
        self.metrics.append(metrics)

    def get_model_performance(self, model_name: str, hours: int = 24) -> Dict:
        """Get performance summary for a model"""
        cutoff_time = time.time() - (hours * 3600)

        # Filter metrics for this model and time period
        model_metrics = [
            m for m in self.metrics
            if m.model_name == model_name and m.timestamp > cutoff_time
        ]

        if not model_metrics:
            return {"error": "No data available"}

        return {
            "model": model_name,
            "requests": len(model_metrics),
            "avg_response_time": statistics.mean(m.response_time for m in model_metrics),
            "avg_quality": statistics.mean(m.quality_score for m in model_metrics),
            "total_cost": sum(m.cost for m in model_metrics),
            "avg_tokens": statistics.mean(m.token_count for m in model_metrics),
            "cost_per_request": sum(m.cost for m in model_metrics) / len(model_metrics)
        }

    def compare_models(self, hours: int = 24) -> List[Dict]:
        """Compare performance across all models"""
        model_names = set(m.model_name for m in self.metrics)

        comparisons = []
        for model in model_names:
            performance = self.get_model_performance(model, hours)
            if "error" not in performance:
                comparisons.append(performance)

        # Sort by quality score
        return sorted(comparisons, key=lambda x: x["avg_quality"], reverse=True)

class QualityEvaluator:
    def evaluate(self, prompt: str, response: str) -> float:
        """Evaluate response quality (simplified heuristic)"""
        score = 0.0

        # Length appropriateness (not too short, not too long)
        response_length = len(response.split())
        if 10 <= response_length <= 200:
            score += 0.3

        # Relevance (simple keyword overlap with the prompt)
        prompt_words = set(prompt.lower().split())
        response_words = set(response.lower().split())
        if prompt_words:  # Guard against division by zero on empty prompts
            overlap = len(prompt_words & response_words) / len(prompt_words)
            score += overlap * 0.4

        # Politeness indicators
        polite_phrases = ["please", "thank you", "happy to help", "sorry"]
        if any(phrase in response.lower() for phrase in polite_phrases):
            score += 0.2

        # Completeness (ends with proper punctuation)
        if response.strip().endswith((".", "!", "?")):
            score += 0.1

        return min(score, 1.0)  # Cap at 1.0

# Usage
monitor = ModelPerformanceMonitor()

# Track a request (in practice, the actual model call happens between the timestamps)
start_time = time.time()
response = "Thank you for your question! Our return policy allows..."
end_time = time.time()

monitor.track_request(
    model_name="claude-3-sonnet",
    response_time=end_time - start_time,
    response_text=response,
    prompt="What is your return policy?",
    cost=0.003
)

# Get performance reports
performance = monitor.get_model_performance("claude-3-sonnet")
comparison = monitor.compare_models()
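
# Illustrative only: one way to surface the comparison report built above.
# The field names (model, avg_quality, avg_response_time, total_cost, requests)
# come from the dicts returned by get_model_performance / compare_models.
for entry in comparison:
    print(
        f"{entry['model']}: quality={entry['avg_quality']:.2f}, "
        f"avg latency={entry['avg_response_time']:.3f}s, "
        f"total cost=${entry['total_cost']:.4f} over {entry['requests']} request(s)"
    )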