""" Final Evaluation Pipeline - Production Ready - Uses real app.py generation pipeline - 20 test cases covering diverse layouts - Balanced evaluator optimized for Infinigen's capabilities """ import sys from pathlib import Path import shutil import time from evaluation.test_cases import get_test_cases from evaluation.mesh_extractor import MeshGraphExtractor from evaluation.balanced_evaluator import BalancedEvaluator def validate_and_fix_config(config_path: Path) -> bool: """Validate and fix configuration to prevent Infinigen bugs""" import yaml try: with open(config_path, 'r') as f: original_content = f.read() f.seek(0) config = yaml.safe_load(f) # Debug: Show original AI output print(f" šŸ“„ AI generated config:") for line in original_content.split('\n')[:15]: # First 15 lines if line.strip(): print(f" {line}") # Ensure all area values are >= 5 (prevent division by zero) min_area = 5 area_keys = ['kitchen_area', 'bedroom_area', 'living_room_area', 'dining_room_area', 'bathroom_area'] fixed = False for key in area_keys: if key in config: if config[key] is None or config[key] < min_area: print(f" āš ļø Fixing {key}: {config[key]} → {min_area}") config[key] = min_area fixed = True else: # Add missing area with default value default_areas = { 'kitchen_area': 15, 'bedroom_area': 20, 'living_room_area': 30, 'dining_room_area': 15, 'bathroom_area': 8 } print(f" āš ļø Adding missing {key}: {default_areas[key]}") config[key] = default_areas[key] fixed = True # Ensure counts are >= 1 if 'bedroom_count' in config and config['bedroom_count'] < 1: config['bedroom_count'] = 1 fixed = True if fixed: with open(config_path, 'w') as f: yaml.dump(config, f, default_flow_style=False) print(f" āœ“ Configuration validated and fixed") return True except Exception as e: print(f" āœ— Config validation failed: {e}") return False def generate_scene_with_app(prompt: str, output_dir: Path, max_retries: int = 3) -> bool: """ Generate scene using backend components directly Args: prompt: User prompt output_dir: Where to save the scene max_retries: Maximum number of retries if generation fails Returns: True if generation succeeded """ from core.config_generator import ConfigGenerator from backend.infinigen_backend import InfinigenBackend from config import settings for attempt in range(max_retries): try: if attempt > 0: print(f" Retry {attempt}/{max_retries-1}...") else: print(f" Generating scene...") # Step 1: Generate configuration (or use default on failure) try: config_gen = ConfigGenerator() yaml_content = config_gen.generate(prompt) # Save configuration with open(settings.CONFIG_OUTPUT, 'w') as f: f.write(yaml_content) # Validate and fix configuration if not validate_and_fix_config(settings.CONFIG_OUTPUT): raise ValueError("Invalid configuration generated") except Exception as e: print(f" āš ļø Config generation failed: {e}, using default config") # Use default config as fallback default_config = Path("config/default_params.yaml") if default_config.exists(): shutil.copy(default_config, settings.CONFIG_OUTPUT) validate_and_fix_config(settings.CONFIG_OUTPUT) else: raise ValueError("No default config found") # Step 2: Generate layout with different seed on retry backend = InfinigenBackend() seed = settings.DEFAULT_SEED + attempt # Use different seed on retry blend_file = backend.generate( config_path=settings.CONFIG_OUTPUT, output_dir=settings.OUTPUT_DIR, seed=seed ) # Step 3: Copy to evaluation directory source_dir = blend_file.parent if output_dir.exists(): shutil.rmtree(output_dir) shutil.copytree(source_dir, 
def evaluate_scene(test_case, scene_path: Path) -> dict:
    """
    Evaluate a generated scene.

    Args:
        test_case: Test case with ground truth
        scene_path: Path to the generated scene

    Returns:
        Evaluation result dictionary
    """
    try:
        # Extract the room graph from the generated scene
        extractor = MeshGraphExtractor(scene_path)
        G_gen = extractor.extract_graph()

        # Evaluate with the Balanced Evaluator
        evaluator = BalancedEvaluator()
        result = evaluator.evaluate(G_gen, test_case.ground_truth)

        # Add metadata
        result['test_id'] = test_case.id
        result['prompt'] = test_case.prompt
        result['description'] = test_case.description
        result['generated_rooms'] = len(G_gen['nodes'])
        result['generated_edges'] = len(G_gen['edges'])

        return result

    except Exception as e:
        print(f"   āœ— Evaluation failed: {e}")
        return {
            'test_id': test_case.id,
            'overall_score': 0.0,
            'error': str(e),
        }


def run_full_evaluation(generate: bool = True, test_ids: list = None):
    """
    Run the complete evaluation pipeline.

    Args:
        generate: If True, generate new scenes. If False, use existing ones.
        test_ids: List of specific test IDs to run. If None, run all.
    """
    print("=" * 80)
    print("EVALUATION PIPELINE - Production Ready")
    print("=" * 80)

    # Get test cases
    all_tests = get_test_cases()
    if test_ids:
        test_cases = [tc for tc in all_tests if tc.id in test_ids]
    else:
        test_cases = all_tests

    print(f"\nTest Cases: {len(test_cases)}")
    print(f"Mode: {'GENERATE + EVALUATE' if generate else 'EVALUATE ONLY'}\n")

    results = []
    output_base = Path("outputs/evaluation")
    output_base.mkdir(parents=True, exist_ok=True)

    for i, test_case in enumerate(test_cases, 1):
        print(f"\n{'=' * 80}")
        print(f"[{i}/{len(test_cases)}] {test_case.id}")
        print(f"{'=' * 80}")
        print(f"Prompt: {test_case.prompt}")
        print(f"Description: {test_case.description}")

        scene_dir = output_base / test_case.id

        # Generate if requested
        if generate:
            start_time = time.time()
            success = generate_scene_with_app(test_case.prompt, scene_dir)
            gen_time = time.time() - start_time
            if not success:
                print("   āš ļø Skipping evaluation (generation failed)")
                continue
            print(f"   Generation time: {gen_time:.1f}s")
        else:
            # Check that the scene exists
            if not scene_dir.exists() or not (scene_dir / "solve_state.json").exists():
                print("   āš ļø Scene not found, skipping...")
                continue
            print("   Using existing scene")

        # Evaluate
        print("   Evaluating...")
        result = evaluate_scene(test_case, scene_dir)
        results.append(result)

        # Display result
        score = result['overall_score']
        emoji = "šŸ†" if score >= 0.9 else "āœ“" if score >= 0.8 else "āš ļø" if score >= 0.7 else "āœ—"
        print(f"\n   {emoji} Overall Score: {score:.1%}")
        print(f"   Interpretation: {result.get('interpretation', 'N/A')}")
        print(f"   Generated: {result.get('generated_rooms', 0)} rooms, "
              f"{result.get('generated_edges', 0)} edges")

        # Display component scores
        print("   Component Scores:")
        if 'room_counts' in result:
            print(f"     - Room Counts: {result['room_counts']['score']:.1%} (weight 30%)")
        if 'room_presence' in result:
            print(f"     - Room Presence: {result['room_presence']['score']:.1%} (weight 20%)")
        if 'adjacency' in result:
            print(f"     - Adjacency: {result['adjacency']['score']:.1%} (weight 30%)")
        if 'constraints' in result:
            print(f"     - Constraints: {result['constraints']['score']:.1%} (weight 20%)")

    # Final summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    if not results:
        print("\nāš ļø No results to summarize")
        return

    scores = [r['overall_score'] for r in results]
    avg_score = sum(scores) / len(scores)

    print(f"\nTotal Tests: {len(results)}")
    print(f"Average Score: {avg_score:.1%}")
    print(f"Score Range: {min(scores):.1%} - {max(scores):.1%}")

    # Score distribution
    excellent = sum(1 for s in scores if s >= 0.9)
    good = sum(1 for s in scores if 0.8 <= s < 0.9)
    acceptable = sum(1 for s in scores if 0.7 <= s < 0.8)
    poor = sum(1 for s in scores if s < 0.7)

    print("\nScore Distribution:")
    print(f"  šŸ† Excellent (≄90%): {excellent}")
    print(f"  āœ“ Good (80-89%): {good}")
    print(f"  āš ļø Acceptable (70-79%): {acceptable}")
    print(f"  āœ— Poor (<70%): {poor}")

    # Individual results
    print("\nIndividual Results:")
    print("-" * 80)
    for result in sorted(results, key=lambda x: x['overall_score'], reverse=True):
        score = result['overall_score']
        emoji = "šŸ†" if score >= 0.9 else "āœ“" if score >= 0.8 else "āš ļø" if score >= 0.7 else "āœ—"
        print(f"{emoji} {result['test_id']:<30} {score:>6.1%}")

    # Save results to files
    save_results_to_files(results, test_cases)
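# For reference: each record that save_results_to_files() below writes to
# evaluation_results.json has the following shape. Scores are percentages in
# [0, 100]; the values shown here are illustrative only:
#
#   {
#       "test_id": "basic_studio_01",
#       "description": "...",
#       "prompt": "...",
#       "score": 85.0,
#       "room_counts": 90.0,
#       "room_presence": 100.0,
#       "adjacency": 75.0,
#       "constraints": 80.0
#   }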
def save_results_to_files(results, test_cases):
    """Save evaluation results to JSON and CSV files (merged with previous runs)."""
    import json
    import csv

    if not results:
        return

    # Prepare new data rows
    new_data = []
    for result in results:
        # Find the corresponding test case for its description and prompt
        test_case = next((tc for tc in test_cases if tc.id == result['test_id']), None)
        data = {
            'test_id': result['test_id'],
            'description': test_case.description if test_case else '',
            'prompt': test_case.prompt if test_case else '',
            'score': result['overall_score'] * 100,
            'room_counts': result.get('room_counts', {}).get('score', 0) * 100,
            'room_presence': result.get('room_presence', {}).get('score', 0) * 100,
            'adjacency': result.get('adjacency', {}).get('score', 0) * 100,
            'constraints': result.get('constraints', {}).get('score', 0) * 100,
        }
        new_data.append(data)

    # Load existing JSON data, if any
    json_file = Path('evaluation_results.json')
    existing_data = []
    if json_file.exists():
        try:
            with open(json_file, 'r') as f:
                existing_data = json.load(f)
        except (json.JSONDecodeError, OSError):
            existing_data = []

    # Merge: update existing entries or add new ones
    existing_ids = {item['test_id']: i for i, item in enumerate(existing_data)}
    for data in new_data:
        if data['test_id'] in existing_ids:
            # Update existing entry
            existing_data[existing_ids[data['test_id']]] = data
        else:
            # Add new entry
            existing_data.append(data)

    # Save merged JSON
    with open(json_file, 'w') as f:
        json.dump(existing_data, f, indent=2)
    print(f"\nāœ“ Saved results to evaluation_results.json ({len(new_data)} new/updated)")

    # Save merged CSV
    with open('evaluation_results.csv', 'w', newline='') as f:
        fieldnames = ['test_id', 'score', 'room_counts', 'room_presence', 'adjacency', 'constraints']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for data in existing_data:
            writer.writerow({k: data[k] for k in fieldnames})
    print(f"āœ“ Saved results to evaluation_results.csv (total: {len(existing_data)} entries)")
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Final Evaluation Pipeline")
    parser.add_argument("--generate", action="store_true",
                        help="Generate new scenes (default: use existing)")
    parser.add_argument("--test-ids", nargs="+",
                        help="Specific test IDs to run (default: all)")
    parser.add_argument("--quick", action="store_true",
                        help="Quick test with only 3 cases")
    args = parser.parse_args()

    # Quick mode: test 3 representative cases
    if args.quick:
        test_ids = ["basic_studio_01", "two_bedroom_apt_01", "family_home_01"]
        print("šŸš€ QUICK MODE: Testing 3 representative cases\n")
    else:
        test_ids = args.test_ids

    run_full_evaluation(
        generate=args.generate,
        test_ids=test_ids,
    )
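# Example invocations (assuming this script is saved as run_evaluation.py at the
# project root; adjust the filename and working directory to the actual layout):
#
#   python run_evaluation.py --generate                 # generate and evaluate all test cases
#   python run_evaluation.py --quick                    # evaluate 3 representative existing scenes
#   python run_evaluation.py --test-ids basic_studio_01 family_home_01
#
# run_full_evaluation() can also be called programmatically, e.g.
# run_full_evaluation(generate=False, test_ids=["basic_studio_01"]).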