visual-narrator-llm / benchmarking /benchmark_comprehensive_highest.py

feat: Visual Narrator 3B - Clean repository with professional benchmarks

d6e97b5 3 months ago

18.3 kB

	import requests
	import json
	import time
	import numpy as np
	from datetime import datetime
	import random
	import anthropic
	import openai

	def log(m): print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {m}", flush=True)

	class HighestModelsComprehensiveBenchmark:
	"""Comprehensive benchmark against highest-tier models across all dimensions"""

	def __init__(self):
	# Setup highest-tier APIs
	self.claude_client = anthropic.Anthropic(
	api_key="sk-ant-api03-wmB1K4Z7Z051QVQOJYib4bkASWCdjFtZPXSNtW3aybn19AEqtT8h6k9qgv20jN5MW9GeVvrhhc0oHXIFambx294TDE6Q-iswMWwAA"
	)
	self.openai_client = openai.OpenAI(
	api_key="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"
	)

	self.our_api_url = "http://localhost:8002"

	def create_complex_test_scenes(self):
	"""Complex scenes designed to test all dimensions thoroughly"""
	return [
	{
	"scene": "A photographer capturing images of a graceful dancer performing under dramatic spotlights on an elegant stage with velvet curtains",
	"expected_objects": ["photographer", "dancer", "spotlights", "stage", "curtains"],
	"expected_relations": 4,
	"description": "Complex multi-object spatial scene"
	},
	{
	"scene": "A majestic eagle soaring above ancient snow-capped mountains while a serene river winds through lush green valleys below",
	"expected_objects": ["eagle", "mountains", "river", "valleys"],
	"expected_relations": 3,
	"description": "Natural scene with spatial hierarchy"
	},
	{
	"scene": "A bustling futuristic metropolis with gleaming skyscrapers, flying vehicles, holographic advertisements, and crowded pedestrian walkways",
	"expected_objects": ["metropolis", "skyscrapers", "vehicles", "advertisements", "walkways"],
	"expected_relations": 2,
	"description": "Urban complexity with multiple elements"
	}
	]

	def evaluate_adjective_density(self, text):
	"""Evaluate adjective density dimension"""
	adjectives = [
	'beautiful', 'stunning', 'gorgeous', 'picturesque', 'breathtaking',
	'magnificent', 'splendid', 'glorious', 'majestic', 'grand', 'imposing',
	'vibrant', 'colorful', 'vivid', 'bright', 'brilliant', 'radiant',
	'gleaming', 'shimmering', 'sparkling', 'luminous', 'dramatic',
	'elegant', 'sophisticated', 'refined', 'graceful', 'luxurious',
	'ancient', 'historic', 'traditional', 'modern', 'contemporary',
	'serene', 'tranquil', 'peaceful', 'lush', 'verdant', 'pristine'
	]

	if not text:
	return 0
	words = text.lower().split()
	adj_count = sum(1 for word in words if word in adjectives)
	return adj_count / len(words) if len(words) > 0 else 0

	def evaluate_spatial_accuracy(self, text, expected_relations):
	"""Evaluate spatial accuracy dimension"""
	spatial_terms = ["left", "right", "above", "below", "behind", "in front of",
	"near", "beside", "next to", "between", "under", "over",
	"on", "in", "at", "through", "across", "around"]

	if not text:
	return 0

	text_lower = text.lower()
	detected_relations = sum(1 for term in spatial_terms if term in text_lower)

	# Accuracy based on detected vs expected
	accuracy = min(detected_relations / max(expected_relations, 1), 1.0)
	return accuracy

	def evaluate_multi_object_reasoning(self, text, expected_objects):
	"""Evaluate multi-object reasoning dimension"""
	if not text:
	return 0

	# Count unique objects mentioned in description
	mentioned_objects = sum(1 for obj in expected_objects if obj in text.lower())
	return mentioned_objects / len(expected_objects) if len(expected_objects) > 0 else 0

	def evaluate_inference_speed(self, processing_time):
	"""Evaluate inference speed dimension"""
	# Normalized speed score (faster = better)
	if processing_time < 0.01: # 10ms
	return 1.0
	elif processing_time < 0.1: # 100ms
	return 0.9
	elif processing_time < 0.5: # 500ms
	return 0.7
	elif processing_time < 1.0: # 1000ms
	return 0.5
	elif processing_time < 2.0: # 2000ms
	return 0.3
	else:
	return 0.1

	def evaluate_integration_quality(self, adj_density, spatial_accuracy):
	"""Evaluate integration quality dimension"""
	# Geometric mean ensures balance between both objectives
	return (adj_density * spatial_accuracy) ** 0.5 if adj_density > 0 and spatial_accuracy > 0 else 0

	def evaluate_cost_efficiency(self, processing_time, model_type, api_cost_estimate=0):
	"""Evaluate cost efficiency dimension"""
	if model_type == "local":
	base_score = 0.95 # Very high for local models
	else: # API model
	# Adjust for API costs (higher cost = lower efficiency)
	cost_factor = max(0.1, 1.0 - (api_cost_estimate * 10))
	base_score = 0.3 * cost_factor # Lower base for APIs

	# Adjust for speed
	speed_factor = self.evaluate_inference_speed(processing_time)
	return base_score * speed_factor

	def benchmark_our_system(self, scene_data):
	"""Benchmark our Visual Narrator VLM across all dimensions"""
	try:
	start_time = time.time()
	response = requests.post(
	f"{self.our_api_url}/describe/scene",
	json={
	"scene_description": scene_data["scene"],
	"enhance_adjectives": True,
	"include_spatial": True,
	"adjective_density": 1.0
	},
	timeout=10
	)
	processing_time = time.time() - start_time

	if response.status_code == 200:
	result = response.json()
	output_text = result["enhanced_description"]

	# Evaluate all dimensions
	adj_density = self.evaluate_adjective_density(output_text)
	spatial_acc = self.evaluate_spatial_accuracy(output_text, scene_data["expected_relations"])
	multi_object = self.evaluate_multi_object_reasoning(output_text, scene_data["expected_objects"])
	inference_speed = self.evaluate_inference_speed(processing_time)
	integration_qual = self.evaluate_integration_quality(adj_density, spatial_acc)
	cost_efficiency = self.evaluate_cost_efficiency(processing_time, "local")

	return {
	"adjective_density": adj_density,
	"spatial_accuracy": spatial_acc,
	"multi_object_reasoning": multi_object,
	"inference_speed": inference_speed,
	"integration_quality": integration_qual,
	"cost_efficiency": cost_efficiency,
	"processing_time": processing_time,
	"output": output_text
	}
	except Exception as e:
	log(f"❌ Our system error: {e}")

	return None

	def benchmark_claude_sonnet(self, scene_data):
	"""Benchmark Claude 3.5 Sonnet across all dimensions"""
	try:
	start_time = time.time()

	response = self.claude_client.messages.create(
	model="claude-3-5-sonnet-20241022",
	max_tokens=200,
	messages=[{
	"role": "user",
	"content": f"Describe this scene in detail, including spatial relationships between objects: {scene_data['scene']}"
	}]
	)

	processing_time = time.time() - start_time
	output_text = response.content[0].text

	# Evaluate all dimensions
	adj_density = self.evaluate_adjective_density(output_text)
	spatial_acc = self.evaluate_spatial_accuracy(output_text, scene_data["expected_relations"])
	multi_object = self.evaluate_multi_object_reasoning(output_text, scene_data["expected_objects"])
	inference_speed = self.evaluate_inference_speed(processing_time)
	integration_qual = self.evaluate_integration_quality(adj_density, spatial_acc)
	cost_efficiency = self.evaluate_cost_efficiency(processing_time, "api", api_cost_estimate=0.05) # ~$0.05 per call

	return {
	"adjective_density": adj_density,
	"spatial_accuracy": spatial_acc,
	"multi_object_reasoning": multi_object,
	"inference_speed": inference_speed,
	"integration_quality": integration_qual,
	"cost_efficiency": cost_efficiency,
	"processing_time": processing_time,
	"output": output_text
	}

	except Exception as e:
	log(f"❌ Claude 3.5 Sonnet error: {e}")
	return None

	def benchmark_gpt4_turbo(self, scene_data):
	"""Benchmark GPT-4 Turbo across all dimensions"""
	try:
	start_time = time.time()

	response = self.openai_client.chat.completions.create(
	model="gpt-4-turbo",
	max_tokens=200,
	messages=[{
	"role": "user",
	"content": f"Describe this scene in detail, including spatial relationships between objects: {scene_data['scene']}"
	}]
	)

	processing_time = time.time() - start_time
	output_text = response.choices[0].message.content

	# Evaluate all dimensions
	adj_density = self.evaluate_adjective_density(output_text)
	spatial_acc = self.evaluate_spatial_accuracy(output_text, scene_data["expected_relations"])
	multi_object = self.evaluate_multi_object_reasoning(output_text, scene_data["expected_objects"])
	inference_speed = self.evaluate_inference_speed(processing_time)
	integration_qual = self.evaluate_integration_quality(adj_density, spatial_acc)
	cost_efficiency = self.evaluate_cost_efficiency(processing_time, "api", api_cost_estimate=0.08) # ~$0.08 per call

	return {
	"adjective_density": adj_density,
	"spatial_accuracy": spatial_acc,
	"multi_object_reasoning": multi_object,
	"inference_speed": inference_speed,
	"integration_quality": integration_qual,
	"cost_efficiency": cost_efficiency,
	"processing_time": processing_time,
	"output": output_text
	}

	except Exception as e:
	log(f"❌ GPT-4 Turbo error: {e}")
	return None

	def run_comprehensive_highest_benchmark(self):
	"""Run comprehensive benchmark against highest-tier models"""
	log("🎯 STARTING COMPREHENSIVE BENCHMARK - HIGHEST MODELS...")

	test_scenes = self.create_complex_test_scenes()
	models = {
	"Visual Narrator VLM": self.benchmark_our_system,
	"Claude 3.5 Sonnet": self.benchmark_claude_sonnet,
	"GPT-4 Turbo": self.benchmark_gpt4_turbo
	}

	all_results = {model: [] for model in models.keys()}

	for scene_data in test_scenes:
	log(f"📝 Testing: {scene_data['description']}")
	log(f" Scene: {scene_data['scene'][:80]}...")

	for model_name, benchmark_func in models.items():
	result = benchmark_func(scene_data)
	if result:
	all_results[model_name].append(result)
	log(f" ✅ {model_name}: ADJ{result['adjective_density']:.3f} SPA{result['spatial_accuracy']:.3f} TIME{result['processing_time']:.3f}s")
	else:
	log(f" ❌ {model_name}: Failed")

	# Calculate average scores per model per dimension
	model_dimension_scores = {}
	for model, results in all_results.items():
	if results:
	model_dimension_scores[model] = {
	"adjective_density": np.mean([r["adjective_density"] for r in results]),
	"spatial_accuracy": np.mean([r["spatial_accuracy"] for r in results]),
	"multi_object_reasoning": np.mean([r["multi_object_reasoning"] for r in results]),
	"inference_speed": np.mean([r["inference_speed"] for r in results]),
	"integration_quality": np.mean([r["integration_quality"] for r in results]),
	"cost_efficiency": np.mean([r["cost_efficiency"] for r in results]),
	"avg_processing_time": np.mean([r["processing_time"] for r in results]),
	"sample_count": len(results)
	}

	# Display comprehensive results
	self.display_comprehensive_highest_results(model_dimension_scores)

	return model_dimension_scores

	def display_comprehensive_highest_results(self, model_scores):
	"""Display comprehensive results against highest-tier models"""
	print("\n" + "="*80)
	print("🎯 PART B: COMPREHENSIVE MULTI-DIMENSIONAL - HIGHEST MODELS")
	print("="*80)

	dimensions = [
	"adjective_density", "spatial_accuracy", "multi_object_reasoning",
	"inference_speed", "integration_quality", "cost_efficiency"
	]
	dimension_names = {
	"adjective_density": "Adjective Density",
	"spatial_accuracy": "Spatial Accuracy",
	"multi_object_reasoning": "Multi-Object Reasoning",
	"inference_speed": "Inference Speed",
	"integration_quality": "Integration Quality",
	"cost_efficiency": "Cost Efficiency"
	}

	print("📊 DIMENSION-BY-DIMENSION COMPARISON (HIGHEST MODELS):")
	print("-" * 80)

	our_scores = model_scores.get("Visual Narrator VLM", {})

	for dimension in dimensions:
	print(f"\n🎯 {dimension_names[dimension].upper()}:")

	# Rank models for this dimension
	ranking = sorted(
	[(model, scores[dimension])
	for model, scores in model_scores.items()
	if dimension in scores],
	key=lambda x: x[1],
	reverse=True
	)

	for i, (model, score) in enumerate(ranking, 1):
	marker = "🥇" if i == 1 else "🥈" if i == 2 else "🥉" if i == 3 else " "
	advantage = ""
	if model == "Visual Narrator VLM" and i > 1:
	leader_score = ranking[0][1]
	advantage = f" (-{((leader_score - score) / score * 100):.1f}%)"
	elif model == "Visual Narrator VLM" and i == 1:
	second_score = ranking[1][1] if len(ranking) > 1 else 0
	if second_score > 0:
	advantage = f" (+{((score - second_score) / second_score * 100):.1f}%)"

	print(f" {marker} {model:<25} {score:.3f}{advantage}")

	print(f"\n🏆 OVERALL COMPETITIVE POSITIONING:")

	# Count wins per model
	wins = {model: 0 for model in model_scores.keys()}
	for dimension in dimensions:
	ranking = sorted(
	[(model, scores[dimension])
	for model, scores in model_scores.items()
	if dimension in scores],
	key=lambda x: x[1],
	reverse=True
	)
	if ranking:
	wins[ranking[0][0]] += 1

	print(" Dimension Wins:")
	for model, win_count in sorted(wins.items(), key=lambda x: x[1], reverse=True):
	print(f" • {model:<25} {win_count}/6 dimensions")

	our_wins = wins.get("Visual Narrator VLM", 0)
	if our_wins >= 4:
	print(f"\n🎉 DOMINANT POSITION: We lead in {our_wins}/6 dimensions against highest-tier models!")
	elif our_wins >= 3:
	print(f"\n✅ STRONG POSITION: We lead in {our_wins}/6 dimensions against premium models!")
	else:
	print(f"\n⚠️ COMPETITIVE: We lead in {our_wins}/6 dimensions")

	print(f"\n⚡ PERFORMANCE METRICS:")
	for model, scores in model_scores.items():
	time_ms = scores.get("avg_processing_time", 0) * 1000
	print(f" • {model:<25} {time_ms:.1f}ms average")

	print(f"\n💡 STRATEGIC ASSESSMENT:")
	if our_wins >= 4:
	print(" • Our specialized approach beats even the most expensive API models")
	print(" • Clear market differentiation with superior performance/cost ratio")
	print(" • Ready for production deployment and commercial applications")
	else:
	print(" • Competitive with highest-tier models on key dimensions")
	print(" • Significant cost and speed advantages remain")
	print(" • Strong value proposition for specific use cases")

	print("="*80)

	def main():
	benchmark = HighestModelsComprehensiveBenchmark()
	model_scores = benchmark.run_comprehensive_highest_benchmark()

	print("\n🎉 COMPREHENSIVE HIGHEST MODELS BENCHMARK COMPLETED!")
	print("📈 Definitive competitive positioning established!")

	if __name__ == "__main__":
	main()