# Scene_Foundry_Demo / run_evaluation.py
"""
Final Evaluation Pipeline - Production Ready
- Uses real app.py generation pipeline
- 20 test cases covering diverse layouts
- Balanced evaluator optimized for Infinigen's capabilities
"""
import sys
from pathlib import Path
import shutil
import time
from evaluation.test_cases import get_test_cases
from evaluation.mesh_extractor import MeshGraphExtractor
from evaluation.balanced_evaluator import BalancedEvaluator
def validate_and_fix_config(config_path: Path) -> bool:
"""Validate and fix configuration to prevent Infinigen bugs"""
import yaml
try:
with open(config_path, 'r') as f:
original_content = f.read()
f.seek(0)
config = yaml.safe_load(f)
# Debug: Show original AI output
print(f" πŸ“„ AI generated config:")
for line in original_content.split('\n')[:15]: # First 15 lines
if line.strip():
print(f" {line}")
# Ensure all area values are >= 5 (prevent division by zero)
min_area = 5
area_keys = ['kitchen_area', 'bedroom_area', 'living_room_area',
'dining_room_area', 'bathroom_area']
fixed = False
for key in area_keys:
if key in config:
                if not isinstance(config[key], (int, float)) or config[key] < min_area:
                    print(f" ⚠️ Fixing {key}: {config[key]} → {min_area}")
                    config[key] = min_area
                    fixed = True
else:
# Add missing area with default value
default_areas = {
'kitchen_area': 15,
'bedroom_area': 20,
'living_room_area': 30,
'dining_room_area': 15,
'bathroom_area': 8
}
print(f" ⚠️ Adding missing {key}: {default_areas[key]}")
config[key] = default_areas[key]
fixed = True
# Ensure counts are >= 1
if 'bedroom_count' in config and config['bedroom_count'] < 1:
config['bedroom_count'] = 1
fixed = True
if fixed:
with open(config_path, 'w') as f:
yaml.dump(config, f, default_flow_style=False)
print(f" βœ“ Configuration validated and fixed")
return True
except Exception as e:
print(f" βœ— Config validation failed: {e}")
return False
def generate_scene_with_app(prompt: str, output_dir: Path, max_retries: int = 3) -> bool:
"""
Generate scene using backend components directly
Args:
prompt: User prompt
output_dir: Where to save the scene
max_retries: Maximum number of retries if generation fails
Returns:
True if generation succeeded
"""
from core.config_generator import ConfigGenerator
from backend.infinigen_backend import InfinigenBackend
from config import settings
for attempt in range(max_retries):
try:
if attempt > 0:
print(f" Retry {attempt}/{max_retries-1}...")
else:
print(f" Generating scene...")
# Step 1: Generate configuration (or use default on failure)
try:
config_gen = ConfigGenerator()
yaml_content = config_gen.generate(prompt)
# Save configuration
with open(settings.CONFIG_OUTPUT, 'w') as f:
f.write(yaml_content)
# Validate and fix configuration
if not validate_and_fix_config(settings.CONFIG_OUTPUT):
raise ValueError("Invalid configuration generated")
except Exception as e:
print(f" ⚠️ Config generation failed: {e}, using default config")
# Use default config as fallback
default_config = Path("config/default_params.yaml")
if default_config.exists():
shutil.copy(default_config, settings.CONFIG_OUTPUT)
validate_and_fix_config(settings.CONFIG_OUTPUT)
else:
raise ValueError("No default config found")
# Step 2: Generate layout with different seed on retry
backend = InfinigenBackend()
seed = settings.DEFAULT_SEED + attempt # Use different seed on retry
blend_file = backend.generate(
config_path=settings.CONFIG_OUTPUT,
output_dir=settings.OUTPUT_DIR,
seed=seed
)
# Step 3: Copy to evaluation directory
source_dir = blend_file.parent
if output_dir.exists():
shutil.rmtree(output_dir)
shutil.copytree(source_dir, output_dir)
print(f" βœ“ Generated and saved to {output_dir}")
return True
except Exception as e:
print(f" βœ— Attempt {attempt+1} failed: {e}")
if attempt == max_retries - 1:
# Last attempt failed
import traceback
traceback.print_exc()
return False
# Continue to next retry
continue
return False
def evaluate_scene(test_case, scene_path: Path) -> dict:
"""
Evaluate a generated scene
Args:
test_case: Test case with ground truth
scene_path: Path to generated scene
Returns:
Evaluation result dictionary
"""
try:
# Extract graph
extractor = MeshGraphExtractor(scene_path)
G_gen = extractor.extract_graph()
# Evaluate with Balanced Evaluator
evaluator = BalancedEvaluator()
result = evaluator.evaluate(G_gen, test_case.ground_truth)
# Add metadata
result['test_id'] = test_case.id
result['prompt'] = test_case.prompt
result['description'] = test_case.description
result['generated_rooms'] = len(G_gen['nodes'])
result['generated_edges'] = len(G_gen['edges'])
return result
except Exception as e:
print(f" βœ— Evaluation failed: {e}")
return {
'test_id': test_case.id,
'overall_score': 0.0,
'error': str(e)
}
def run_full_evaluation(generate: bool = True, test_ids: list = None):
"""
Run complete evaluation pipeline
Args:
generate: If True, generate new scenes. If False, use existing.
test_ids: List of specific test IDs to run. If None, run all.
"""
print("="*80)
print("EVALUATION PIPELINE - Production Ready")
print("="*80)
# Get test cases
all_tests = get_test_cases()
if test_ids:
test_cases = [tc for tc in all_tests if tc.id in test_ids]
else:
test_cases = all_tests
print(f"\nTest Cases: {len(test_cases)}")
print(f"Mode: {'GENERATE + EVALUATE' if generate else 'EVALUATE ONLY'}\n")
results = []
output_base = Path("outputs/evaluation")
output_base.mkdir(parents=True, exist_ok=True)
for i, test_case in enumerate(test_cases, 1):
print(f"\n{'='*80}")
print(f"[{i}/{len(test_cases)}] {test_case.id}")
print(f"{'='*80}")
print(f"Prompt: {test_case.prompt}")
print(f"Description: {test_case.description}")
scene_dir = output_base / test_case.id
# Generate if requested
if generate:
start_time = time.time()
success = generate_scene_with_app(test_case.prompt, scene_dir)
gen_time = time.time() - start_time
if not success:
print(f" ⚠️ Skipping evaluation (generation failed)")
continue
print(f" Generation time: {gen_time:.1f}s")
else:
# Check if scene exists
if not scene_dir.exists() or not (scene_dir / "solve_state.json").exists():
print(f" ⚠️ Scene not found, skipping...")
continue
print(f" Using existing scene")
# Evaluate
print(f" Evaluating...")
result = evaluate_scene(test_case, scene_dir)
results.append(result)
# Display result
score = result['overall_score']
emoji = "πŸ†" if score >= 0.9 else "βœ“" if score >= 0.8 else "⚠️" if score >= 0.7 else "βœ—"
print(f"\n {emoji} Overall Score: {score:.1%}")
print(f" Interpretation: {result.get('interpretation', 'N/A')}")
print(f" Generated: {result.get('generated_rooms', 0)} rooms, "
f"{result.get('generated_edges', 0)} edges")
# Display component scores
print(f" Component Scores:")
if 'room_counts' in result:
print(f" - Room Counts: {result['room_counts']['score']:.1%} (weight 30%)")
if 'room_presence' in result:
print(f" - Room Presence: {result['room_presence']['score']:.1%} (weight 20%)")
if 'adjacency' in result:
print(f" - Adjacency: {result['adjacency']['score']:.1%} (weight 30%)")
if 'constraints' in result:
print(f" - Constraints: {result['constraints']['score']:.1%} (weight 20%)")
# Final summary
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
if not results:
print("\n⚠️ No results to summarize")
return
scores = [r['overall_score'] for r in results]
avg_score = sum(scores) / len(scores)
print(f"\nTotal Tests: {len(results)}")
print(f"Average Score: {avg_score:.1%}")
print(f"Score Range: {min(scores):.1%} - {max(scores):.1%}")
# Score distribution
excellent = sum(1 for s in scores if s >= 0.9)
good = sum(1 for s in scores if 0.8 <= s < 0.9)
acceptable = sum(1 for s in scores if 0.7 <= s < 0.8)
poor = sum(1 for s in scores if s < 0.7)
print(f"\nScore Distribution:")
print(f" πŸ† Excellent (β‰₯90%): {excellent}")
print(f" βœ“ Good (80-89%): {good}")
print(f" ⚠️ Acceptable (70-79%): {acceptable}")
print(f" βœ— Poor (<70%): {poor}")
# Individual results
print(f"\nIndividual Results:")
print("-"*80)
for result in sorted(results, key=lambda x: x['overall_score'], reverse=True):
score = result['overall_score']
emoji = "πŸ†" if score >= 0.9 else "βœ“" if score >= 0.8 else "⚠️" if score >= 0.7 else "βœ—"
print(f"{emoji} {result['test_id']:<30} {score:>6.1%}")
# Save results to files
save_results_to_files(results, test_cases)
def save_results_to_files(results, test_cases):
"""Save evaluation results to JSON and CSV files (append mode)"""
import json
import csv
from pathlib import Path
if not results:
return
# Prepare new data
new_data = []
for result in results:
# Find corresponding test case for description and prompt
test_case = next((tc for tc in test_cases if tc.id == result['test_id']), None)
data = {
'test_id': result['test_id'],
'description': test_case.description if test_case else '',
'prompt': test_case.prompt if test_case else '',
'score': result['overall_score'] * 100,
'room_counts': result.get('room_counts', {}).get('score', 0) * 100,
'room_presence': result.get('room_presence', {}).get('score', 0) * 100,
'adjacency': result.get('adjacency', {}).get('score', 0) * 100,
'constraints': result.get('constraints', {}).get('score', 0) * 100,
}
new_data.append(data)
# Load existing JSON data
json_file = Path('evaluation_results.json')
existing_data = []
if json_file.exists():
try:
with open(json_file, 'r') as f:
existing_data = json.load(f)
        except (json.JSONDecodeError, OSError):
            # Corrupt or unreadable results file: start fresh
            existing_data = []
# Merge: update existing entries or add new ones
existing_ids = {item['test_id']: i for i, item in enumerate(existing_data)}
for data in new_data:
if data['test_id'] in existing_ids:
# Update existing entry
existing_data[existing_ids[data['test_id']]] = data
else:
# Add new entry
existing_data.append(data)
# Save merged JSON
with open('evaluation_results.json', 'w') as f:
json.dump(existing_data, f, indent=2)
print(f"\nβœ“ Saved results to evaluation_results.json ({len(new_data)} new/updated)")
# Save merged CSV
with open('evaluation_results.csv', 'w', newline='') as f:
fieldnames = ['test_id', 'score', 'room_counts', 'room_presence', 'adjacency', 'constraints']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for data in existing_data:
writer.writerow({k: data[k] for k in fieldnames})
print(f"βœ“ Saved results to evaluation_results.csv (total: {len(existing_data)} entries)")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Final Evaluation Pipeline")
parser.add_argument("--generate", action="store_true",
help="Generate new scenes (default: use existing)")
parser.add_argument("--test-ids", nargs="+",
help="Specific test IDs to run (default: all)")
parser.add_argument("--quick", action="store_true",
help="Quick test with only 3 cases")
args = parser.parse_args()
# Quick mode: test 3 representative cases
if args.quick:
test_ids = ["basic_studio_01", "two_bedroom_apt_01", "family_home_01"]
print("πŸš€ QUICK MODE: Testing 3 representative cases\n")
else:
test_ids = args.test_ids
run_full_evaluation(
generate=args.generate,
test_ids=test_ids
)