|
|
import json
import os
import random
import re
import time
from datetime import datetime

import numpy as np
import requests

import anthropic
import openai
|
|
|
|
|
def log(m):
    """Print *m* prefixed with a wall-clock timestamp, flushing immediately."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{stamp}] {m}", flush=True)
|
|
|
|
|
class HighestModelsComprehensiveBenchmark:
    """Comprehensive benchmark against highest-tier models across all dimensions.

    Compares the local Visual Narrator VLM (HTTP service) with Claude 3.5
    Sonnet and GPT-4 Turbo on six dimensions: adjective density, spatial
    accuracy, multi-object reasoning, inference speed, integration quality,
    and cost efficiency.
    """

    # Descriptive adjectives counted by evaluate_adjective_density.
    # frozenset: O(1) membership per word (the original list scan was O(n)).
    _ADJECTIVES = frozenset({
        'beautiful', 'stunning', 'gorgeous', 'picturesque', 'breathtaking',
        'magnificent', 'splendid', 'glorious', 'majestic', 'grand', 'imposing',
        'vibrant', 'colorful', 'vivid', 'bright', 'brilliant', 'radiant',
        'gleaming', 'shimmering', 'sparkling', 'luminous', 'dramatic',
        'elegant', 'sophisticated', 'refined', 'graceful', 'luxurious',
        'ancient', 'historic', 'traditional', 'modern', 'contemporary',
        'serene', 'tranquil', 'peaceful', 'lush', 'verdant', 'pristine',
    })

    # Spatial-relation terms detected by evaluate_spatial_accuracy.
    _SPATIAL_TERMS = (
        "left", "right", "above", "below", "behind", "in front of",
        "near", "beside", "next to", "between", "under", "over",
        "on", "in", "at", "through", "across", "around",
    )

    def __init__(self):
        """Create API clients and record the local model's endpoint.

        SECURITY FIX: API keys are read from the environment
        (ANTHROPIC_API_KEY / OPENAI_API_KEY) instead of being hard-coded in
        source control. Both SDK clients also fall back to these environment
        variables on their own when the key argument is None.
        """
        self.claude_client = anthropic.Anthropic(
            api_key=os.environ.get("ANTHROPIC_API_KEY")
        )
        self.openai_client = openai.OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY")
        )
        # Local Visual Narrator VLM HTTP endpoint.
        self.our_api_url = "http://localhost:8002"

    def create_complex_test_scenes(self):
        """Complex scenes designed to test all dimensions thoroughly.

        Returns a list of dicts with keys: scene, expected_objects,
        expected_relations, description.
        """
        return [
            {
                "scene": "A photographer capturing images of a graceful dancer performing under dramatic spotlights on an elegant stage with velvet curtains",
                "expected_objects": ["photographer", "dancer", "spotlights", "stage", "curtains"],
                "expected_relations": 4,
                "description": "Complex multi-object spatial scene"
            },
            {
                "scene": "A majestic eagle soaring above ancient snow-capped mountains while a serene river winds through lush green valleys below",
                "expected_objects": ["eagle", "mountains", "river", "valleys"],
                "expected_relations": 3,
                "description": "Natural scene with spatial hierarchy"
            },
            {
                "scene": "A bustling futuristic metropolis with gleaming skyscrapers, flying vehicles, holographic advertisements, and crowded pedestrian walkways",
                "expected_objects": ["metropolis", "skyscrapers", "vehicles", "advertisements", "walkways"],
                "expected_relations": 2,
                "description": "Urban complexity with multiple elements"
            }
        ]

    def evaluate_adjective_density(self, text):
        """Return the fraction of words in *text* that are known adjectives.

        FIX: tokenize with a regex instead of str.split() so punctuation
        attached to a word ("beautiful,") no longer prevents a match.
        """
        if not text:
            return 0
        words = re.findall(r"[a-z']+", text.lower())
        if not words:
            return 0
        adj_count = sum(1 for word in words if word in self._ADJECTIVES)
        return adj_count / len(words)

    def evaluate_spatial_accuracy(self, text, expected_relations):
        """Score detected spatial terms against the expected count, capped at 1.0.

        FIX: match each term on word boundaries instead of raw substring
        membership — previously "in"/"at"/"on" matched inside words such as
        "mountain" and inflated the score.
        """
        if not text:
            return 0

        text_lower = text.lower()
        detected_relations = sum(
            1 for term in self._SPATIAL_TERMS
            if re.search(rf"\b{re.escape(term)}\b", text_lower)
        )
        # Cap at 1.0 so over-detection is not rewarded.
        return min(detected_relations / max(expected_relations, 1), 1.0)

    def evaluate_multi_object_reasoning(self, text, expected_objects):
        """Return the fraction of expected objects mentioned in *text*.

        Substring matching is intentionally lenient here so plural/singular
        variants ("photographer" in "photographers") still count.
        """
        if not text:
            return 0

        mentioned_objects = sum(1 for obj in expected_objects if obj in text.lower())
        return mentioned_objects / len(expected_objects) if len(expected_objects) > 0 else 0

    def evaluate_inference_speed(self, processing_time):
        """Map a latency (seconds) to a 0.1–1.0 score via fixed bands."""
        # (upper latency bound in seconds, score) — checked in order.
        bands = ((0.01, 1.0), (0.1, 0.9), (0.5, 0.7), (1.0, 0.5), (2.0, 0.3))
        for limit, score in bands:
            if processing_time < limit:
                return score
        return 0.1

    def evaluate_integration_quality(self, adj_density, spatial_accuracy):
        """Geometric mean of adjective density and spatial accuracy.

        Zero if either component is zero: good integration requires both.
        """
        return (adj_density * spatial_accuracy) ** 0.5 if adj_density > 0 and spatial_accuracy > 0 else 0

    def evaluate_cost_efficiency(self, processing_time, model_type, api_cost_estimate=0):
        """Combine a cost baseline (local vs API) with the speed score.

        api_cost_estimate is the approximate per-call cost in dollars; each
        $0.10 of cost removes the full cost factor (floored at 0.1).
        """
        if model_type == "local":
            base_score = 0.95
        else:
            cost_factor = max(0.1, 1.0 - (api_cost_estimate * 10))
            base_score = 0.3 * cost_factor

        speed_factor = self.evaluate_inference_speed(processing_time)
        return base_score * speed_factor

    def _scene_prompt(self, scene_data):
        """Build the description prompt shared by both hosted API models."""
        return (
            "Describe this scene in detail, including spatial relationships "
            f"between objects: {scene_data['scene']}"
        )

    def _score_output(self, output_text, scene_data, processing_time,
                      model_type, api_cost_estimate=0):
        """Score one model response on all six dimensions.

        Returns the per-run result dict shared by every benchmark_* method
        (this logic was previously triplicated across those methods).
        """
        adj_density = self.evaluate_adjective_density(output_text)
        spatial_acc = self.evaluate_spatial_accuracy(
            output_text, scene_data["expected_relations"]
        )
        return {
            "adjective_density": adj_density,
            "spatial_accuracy": spatial_acc,
            "multi_object_reasoning": self.evaluate_multi_object_reasoning(
                output_text, scene_data["expected_objects"]
            ),
            "inference_speed": self.evaluate_inference_speed(processing_time),
            "integration_quality": self.evaluate_integration_quality(
                adj_density, spatial_acc
            ),
            "cost_efficiency": self.evaluate_cost_efficiency(
                processing_time, model_type, api_cost_estimate
            ),
            "processing_time": processing_time,
            "output": output_text,
        }

    def benchmark_our_system(self, scene_data):
        """Benchmark our Visual Narrator VLM; returns a score dict or None."""
        try:
            start_time = time.time()
            response = requests.post(
                f"{self.our_api_url}/describe/scene",
                json={
                    "scene_description": scene_data["scene"],
                    "enhance_adjectives": True,
                    "include_spatial": True,
                    "adjective_density": 1.0
                },
                timeout=10
            )
            processing_time = time.time() - start_time

            if response.status_code != 200:
                # FIX: the original fell through silently on non-200.
                log(f"❌ Our system error: HTTP {response.status_code}")
                return None

            output_text = response.json()["enhanced_description"]
            return self._score_output(
                output_text, scene_data, processing_time, "local"
            )
        except Exception as e:
            log(f"❌ Our system error: {e}")
            return None

    def benchmark_claude_sonnet(self, scene_data):
        """Benchmark Claude 3.5 Sonnet; returns a score dict or None."""
        try:
            start_time = time.time()
            response = self.claude_client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=200,
                messages=[{
                    "role": "user",
                    "content": self._scene_prompt(scene_data)
                }]
            )
            processing_time = time.time() - start_time
            output_text = response.content[0].text

            # ~$0.05/call estimate for cost-efficiency scoring.
            return self._score_output(
                output_text, scene_data, processing_time, "api",
                api_cost_estimate=0.05
            )
        except Exception as e:
            log(f"❌ Claude 3.5 Sonnet error: {e}")
            return None

    def benchmark_gpt4_turbo(self, scene_data):
        """Benchmark GPT-4 Turbo; returns a score dict or None."""
        try:
            start_time = time.time()
            response = self.openai_client.chat.completions.create(
                model="gpt-4-turbo",
                max_tokens=200,
                messages=[{
                    "role": "user",
                    "content": self._scene_prompt(scene_data)
                }]
            )
            processing_time = time.time() - start_time
            output_text = response.choices[0].message.content

            # ~$0.08/call estimate for cost-efficiency scoring.
            return self._score_output(
                output_text, scene_data, processing_time, "api",
                api_cost_estimate=0.08
            )
        except Exception as e:
            log(f"❌ GPT-4 Turbo error: {e}")
            return None

    def run_comprehensive_highest_benchmark(self):
        """Run every model over every test scene and report aggregated scores.

        Returns {model_name: {dimension: mean score, avg_processing_time,
        sample_count}} for each model that produced at least one result.
        """
        log("🎯 STARTING COMPREHENSIVE BENCHMARK - HIGHEST MODELS...")

        test_scenes = self.create_complex_test_scenes()
        models = {
            "Visual Narrator VLM": self.benchmark_our_system,
            "Claude 3.5 Sonnet": self.benchmark_claude_sonnet,
            "GPT-4 Turbo": self.benchmark_gpt4_turbo
        }

        all_results = {model: [] for model in models}

        for scene_data in test_scenes:
            log(f"📝 Testing: {scene_data['description']}")
            log(f"   Scene: {scene_data['scene'][:80]}...")

            for model_name, benchmark_func in models.items():
                result = benchmark_func(scene_data)
                if result:
                    all_results[model_name].append(result)
                    log(f"   ✅ {model_name}: ADJ{result['adjective_density']:.3f} SPA{result['spatial_accuracy']:.3f} TIME{result['processing_time']:.3f}s")
                else:
                    log(f"   ❌ {model_name}: Failed")

        # Average each dimension over the scenes that succeeded per model.
        dimension_keys = (
            "adjective_density", "spatial_accuracy", "multi_object_reasoning",
            "inference_speed", "integration_quality", "cost_efficiency",
        )
        model_dimension_scores = {}
        for model, results in all_results.items():
            if not results:
                continue
            scores = {
                key: np.mean([r[key] for r in results])
                for key in dimension_keys
            }
            scores["avg_processing_time"] = np.mean(
                [r["processing_time"] for r in results]
            )
            scores["sample_count"] = len(results)
            model_dimension_scores[model] = scores

        self.display_comprehensive_highest_results(model_dimension_scores)
        return model_dimension_scores

    @staticmethod
    def _rank_models(model_scores, dimension):
        """Return (model, score) pairs for one dimension, best first.

        Shared by the per-dimension display and the dimension-wins tally
        (previously duplicated inline).
        """
        scored = [
            (model, scores[dimension])
            for model, scores in model_scores.items()
            if dimension in scores
        ]
        return sorted(scored, key=lambda pair: pair[1], reverse=True)

    def display_comprehensive_highest_results(self, model_scores):
        """Display comprehensive results against highest-tier models."""
        print("\n" + "=" * 80)
        print("🎯 PART B: COMPREHENSIVE MULTI-DIMENSIONAL - HIGHEST MODELS")
        print("=" * 80)

        dimensions = [
            "adjective_density", "spatial_accuracy", "multi_object_reasoning",
            "inference_speed", "integration_quality", "cost_efficiency"
        ]
        dimension_names = {
            "adjective_density": "Adjective Density",
            "spatial_accuracy": "Spatial Accuracy",
            "multi_object_reasoning": "Multi-Object Reasoning",
            "inference_speed": "Inference Speed",
            "integration_quality": "Integration Quality",
            "cost_efficiency": "Cost Efficiency"
        }

        print("📊 DIMENSION-BY-DIMENSION COMPARISON (HIGHEST MODELS):")
        print("-" * 80)

        for dimension in dimensions:
            print(f"\n🎯 {dimension_names[dimension].upper()}:")
            ranking = self._rank_models(model_scores, dimension)

            for i, (model, score) in enumerate(ranking, 1):
                marker = "🥇" if i == 1 else "🥈" if i == 2 else "🥉" if i == 3 else "  "
                advantage = ""
                if model == "Visual Narrator VLM" and i > 1:
                    leader_score = ranking[0][1]
                    # FIX: guard against score == 0 (original raised
                    # ZeroDivisionError when we scored zero on a dimension).
                    if score > 0:
                        advantage = f" (-{((leader_score - score) / score * 100):.1f}%)"
                elif model == "Visual Narrator VLM" and i == 1:
                    second_score = ranking[1][1] if len(ranking) > 1 else 0
                    if second_score > 0:
                        advantage = f" (+{((score - second_score) / second_score * 100):.1f}%)"

                print(f"   {marker} {model:<25} {score:.3f}{advantage}")

        print(f"\n🏆 OVERALL COMPETITIVE POSITIONING:")

        # Tally which model leads each dimension.
        wins = {model: 0 for model in model_scores}
        for dimension in dimensions:
            ranking = self._rank_models(model_scores, dimension)
            if ranking:
                wins[ranking[0][0]] += 1

        print("   Dimension Wins:")
        for model, win_count in sorted(wins.items(), key=lambda x: x[1], reverse=True):
            print(f"   • {model:<25} {win_count}/6 dimensions")

        our_wins = wins.get("Visual Narrator VLM", 0)
        if our_wins >= 4:
            print(f"\n🎉 DOMINANT POSITION: We lead in {our_wins}/6 dimensions against highest-tier models!")
        elif our_wins >= 3:
            print(f"\n✅ STRONG POSITION: We lead in {our_wins}/6 dimensions against premium models!")
        else:
            print(f"\n⚠️ COMPETITIVE: We lead in {our_wins}/6 dimensions")

        print(f"\n⚡ PERFORMANCE METRICS:")
        for model, scores in model_scores.items():
            time_ms = scores.get("avg_processing_time", 0) * 1000
            print(f"   • {model:<25} {time_ms:.1f}ms average")

        print(f"\n💡 STRATEGIC ASSESSMENT:")
        if our_wins >= 4:
            print("   • Our specialized approach beats even the most expensive API models")
            print("   • Clear market differentiation with superior performance/cost ratio")
            print("   • Ready for production deployment and commercial applications")
        else:
            print("   • Competitive with highest-tier models on key dimensions")
            print("   • Significant cost and speed advantages remain")
            print("   • Strong value proposition for specific use cases")

        print("=" * 80)
|
|
|
|
|
def main():
    """Build the benchmark harness, run it, and announce completion."""
    runner = HighestModelsComprehensiveBenchmark()
    runner.run_comprehensive_highest_benchmark()

    print("\n🎉 COMPREHENSIVE HIGHEST MODELS BENCHMARK COMPLETED!")
    print("📈 Definitive competitive positioning established!")
|
|
|
|
|
# Entry-point guard: run the benchmark only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|