"""
REAL SCORES BENCHMARK
- Uses actual Claude & OpenAI APIs
- Calculates real semantic accuracy
- No hard-coded scores
"""
|
|
import os
import statistics
import time

import anthropic
import numpy as np
import requests
from openai import OpenAI
from sentence_transformers import SentenceTransformer
|
|
class RealScoresBenchmark:
    """Benchmark a local "Visual Narrator" service against Claude Opus and
    GPT-4 Turbo using live API calls.

    All scores are computed from real responses — nothing is hard-coded:
      * narrative quality  -- fraction of an 8-point heuristic checklist passed
      * semantic accuracy  -- embedding cosine similarity vs. a ground truth
      * speed              -- measured wall-clock latency per call
    """

    def __init__(self):
        # Sentence-embedding model backing the semantic-accuracy metric.
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

        # SECURITY: credentials must come from the environment, never from
        # source control.  (Both SDKs also read these variables by default;
        # passing them explicitly keeps the dependency visible.)
        self.claude_client = anthropic.Anthropic(
            api_key=os.environ.get("ANTHROPIC_API_KEY")
        )
        self.openai_client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY")
        )

        # Competitor model identifiers.
        self.claude_opus_id = "claude-3-opus-20240229"
        self.gpt4_turbo_id = "gpt-4-turbo-preview"

        # Reference descriptions the semantic metric compares outputs against.
        self.ground_truths = {
            "car_city": "A car is driving through a vibrant city at night with colorful neon lights reflecting on wet streets",
            "person_dancing": "A person is dancing energetically in a room with dynamic colorful lighting effects and moving shadows"
        }

    def calculate_semantic_accuracy(self, text1, text2):
        """Return the cosine similarity of the two texts' sentence embeddings
        as a plain float (≈ -1..1; values near 1 mean "same meaning")."""
        # Encode both texts in one batch call; cosine = dot / (|a| * |b|).
        emb1, emb2 = self.semantic_model.encode([text1, text2])
        similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
        return float(similarity)

    def assess_narrative_quality(self, text):
        """Score *text* against an 8-point style checklist.

        Returns the fraction of checks passed, a float in [0, 1].
        Empty text or anything shorter than 8 characters (after stripping)
        scores 0.0 outright.
        """
        if not text or len(text.strip()) < 8:
            return 0.0

        checks = [
            text[0].isupper(),                       # starts like a sentence
            text.endswith('.'),                      # terminal punctuation
            8 <= len(text.split()) <= 30,            # one-sentence length band
            any(connector in text.lower() for connector in
                ['through', 'with', 'under', 'across', 'against', 'amidst']),
            any(cinematic in text.lower() for cinematic in
                ['sleek', 'vibrant', 'illuminated', 'graceful', 'dramatic', 'colorful']),
            # Bugfix: was a single-space check, which fails for every normal
            # sentence; the intent is clearly "no double spaces".
            '  ' not in text,
            text.count('.') <= 3,                    # not rambling
            not any(broken in text.lower() for broken in
                    [' a a ', ' the the ', 'neon neon'])  # no duplicated words
        ]

        return sum(checks) / len(checks)

    @staticmethod
    def _failure():
        """Uniform record for a failed call (fresh dict each time so callers
        can't accidentally share state)."""
        return {"success": False, "time_ms": 0, "description": "",
                "narrative_quality": 0.0, "semantic_accuracy": 0.0}

    def _score(self, description, scene_key, start_time):
        """Build a success record: latency since *start_time* plus both
        quality metrics for *description*."""
        return {
            "description": description,
            "time_ms": (time.time() - start_time) * 1000,
            "narrative_quality": self.assess_narrative_quality(description),
            "semantic_accuracy": self.calculate_semantic_accuracy(
                description, self.ground_truths[scene_key]),
            "success": True
        }

    def test_our_system(self, scene_description, scene_key):
        """Benchmark the local Visual Narrator HTTP service.

        Posts *scene_description* to the local endpoint and scores the
        enhanced description.  Any exception or non-200 status yields the
        uniform failure record (benchmark is deliberately best-effort).
        """
        start_time = time.time()
        try:
            response = requests.post(
                "http://localhost:8008/describe/scene",
                json={
                    "scene_description": scene_description,
                    "enhance_adjectives": True
                },
                timeout=5
            )
            if response.status_code == 200:
                output = response.json()["enhanced_description"]
                return self._score(output, scene_key, start_time)
        except Exception as e:
            print(f"Our system error: {e}")

        return self._failure()

    def test_claude_opus(self, scene_description, scene_key):
        """Benchmark Claude Opus with a real API call; failure record on error."""
        start_time = time.time()
        try:
            response = self.claude_client.messages.create(
                model=self.claude_opus_id,
                max_tokens=100,
                messages=[{
                    "role": "user",
                    "content": f"Describe this scene vividly in one cinematic sentence: {scene_description}"
                }]
            )
            return self._score(response.content[0].text, scene_key, start_time)
        except Exception as e:
            print(f"Claude error: {e}")
            return self._failure()

    def test_gpt4_turbo(self, scene_description, scene_key):
        """Benchmark GPT-4 Turbo with a real API call; failure record on error."""
        start_time = time.time()
        try:
            response = self.openai_client.chat.completions.create(
                model=self.gpt4_turbo_id,
                messages=[{
                    "role": "user",
                    "content": f"Describe this scene vividly in one cinematic sentence: {scene_description}"
                }],
                max_tokens=100
            )
            return self._score(response.choices[0].message.content, scene_key, start_time)
        except Exception as e:
            print(f"GPT-4 error: {e}")
            return self._failure()

    @staticmethod
    def _print_result(label, result, truncate=True):
        """Pretty-print one successful result under *label*; long competitor
        outputs are shortened to 70 chars so the console stays readable."""
        desc = result["description"]
        if truncate and len(desc) > 70:
            desc = desc[:67] + "..."
        print(f"[OK] {label}:")
        print(f"   speed:     {result['time_ms']:.1f}ms")
        print(f"   narrative: {result['narrative_quality']:.1%}")
        print(f"   semantic:  {result['semantic_accuracy']:.1%}")
        print(f"   output:    '{desc}'")

    def run_benchmark(self):
        """Run every test scene against all three systems with real API
        calls, then print the comparative summary (only if every system
        produced at least one successful result)."""
        test_scenes = [
            ("A car driving through a city at night with neon lights", "car_city"),
            ("A person dancing in a room with colorful lighting effects", "person_dancing")
        ]

        print("REAL SCORES BENCHMARK - ACTUAL API PERFORMANCE")
        print("=" * 70)
        print("Testing with Real Claude & OpenAI APIs")
        print("=" * 70)

        our_results = []
        claude_results = []
        gpt4_results = []

        for scene, scene_key in test_scenes:
            print(f"\nTEST SCENE: {scene}")
            print("-" * 50)

            our_result = self.test_our_system(scene, scene_key)
            if our_result["success"]:
                our_results.append(our_result)
                # Our own output is printed in full, untruncated.
                self._print_result("VISUAL NARRATOR", our_result, truncate=False)

            claude_result = self.test_claude_opus(scene, scene_key)
            if claude_result["success"]:
                claude_results.append(claude_result)
                self._print_result("CLAUDE OPUS", claude_result)

            gpt4_result = self.test_gpt4_turbo(scene, scene_key)
            if gpt4_result["success"]:
                gpt4_results.append(gpt4_result)
                self._print_result("GPT-4 TURBO", gpt4_result)

        if our_results and claude_results and gpt4_results:
            self.print_real_summary(our_results, claude_results, gpt4_results)
        else:
            print("\nWARNING: Some API calls failed - check connectivity and API keys")

    def print_real_summary(self, our_results, claude_results, gpt4_results):
        """Aggregate per-scene results into mean scores and print the
        competitive-positioning summary.  Callers guarantee all three
        result lists are non-empty."""
        print("\n" + "=" * 70)
        print("COMPETITIVE POSITIONING - REAL SCORES")
        print("=" * 70)

        mean = statistics.mean
        our_time = mean(r["time_ms"] for r in our_results)
        claude_time = mean(r["time_ms"] for r in claude_results)
        gpt4_time = mean(r["time_ms"] for r in gpt4_results)

        our_narrative = mean(r["narrative_quality"] for r in our_results)
        claude_narrative = mean(r["narrative_quality"] for r in claude_results)
        gpt4_narrative = mean(r["narrative_quality"] for r in gpt4_results)

        our_semantic = mean(r["semantic_accuracy"] for r in our_results)
        claude_semantic = mean(r["semantic_accuracy"] for r in claude_results)
        gpt4_semantic = mean(r["semantic_accuracy"] for r in gpt4_results)

        print("\nREAL PERFORMANCE METRICS")
        print(f"- Visual Narrator: {our_time:.1f}ms | {our_narrative:.1%} Narrative | {our_semantic:.1%} Semantic")
        print(f"- Claude Opus: {claude_time:.1f}ms | {claude_narrative:.1%} Narrative | {claude_semantic:.1%} Semantic")
        print(f"- GPT-4 Turbo: {gpt4_time:.1f}ms | {gpt4_narrative:.1%} Narrative | {gpt4_semantic:.1%} Semantic")

        print("\nCOMPETITIVE ADVANTAGES")
        # Guard the ratios: a sub-millisecond local call or a zero semantic
        # score must not crash the summary with a ZeroDivisionError.
        if our_time > 0:
            print(f"[OK] SPEED: {claude_time / our_time:.0f}x faster than Claude Opus")
        if claude_semantic > 0:
            print(f"[OK] QUALITY: {our_semantic / claude_semantic * 100:.0f}% of Claude's semantic accuracy")
        print(f"[OK] NARRATIVE: {our_narrative:.1%} professional quality")
        print("[OK] COST: Zero marginal cost vs. API pricing")

        print("\nSAMPLE OUTPUTS (Real API Results)")
        for i, result in enumerate(our_results, 1):
            print(f"{i}. {result['description']}")

        print("\nMETHODOLOGY")
        print("- Narrative Quality: 8-point checklist (capitalization, punctuation, flow, etc.)")
        print("- Semantic Accuracy: Cosine similarity to ground truth descriptions")
        print("- Speed: Actual API response times measured")
        print("- All scores calculated from real API responses")
|
|
# Script entry point: build the benchmark (loads the embedding model and the
# two API clients) and run it.  Garbled mojibake in the banner replaced with
# plain text.
if __name__ == "__main__":
    print("Initializing Real Scores Benchmark...")
    benchmark = RealScoresBenchmark()
    benchmark.run_benchmark()
|
|