| """ | |
| Final Evaluation Pipeline - Production Ready | |
| - Uses real app.py generation pipeline | |
| - 20 test cases covering diverse layouts | |
| - Balanced evaluator optimized for Infinigen's capabilities | |
| """ | |
| import sys | |
| from pathlib import Path | |
| import shutil | |
| import time | |
| from evaluation.test_cases import get_test_cases | |
| from evaluation.mesh_extractor import MeshGraphExtractor | |
| from evaluation.balanced_evaluator import BalancedEvaluator | |
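
# Usage sketch (the script filename below is an assumption; the flags are the
# ones defined in the __main__ block at the bottom of this file):
#   python final_evaluation.py --generate                  # generate + evaluate all cases
#   python final_evaluation.py --quick                     # 3 representative cases
#   python final_evaluation.py --test-ids basic_studio_01  # run specific test IDs only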

def validate_and_fix_config(config_path: Path) -> bool:
    """Validate and fix a generated configuration to prevent Infinigen bugs."""
    import yaml

    try:
        with open(config_path, 'r') as f:
            original_content = f.read()
        config = yaml.safe_load(original_content)

        # Debug: show the first lines of the AI-generated config
        print("  📋 AI generated config:")
        for line in original_content.split('\n')[:15]:  # First 15 lines
            if line.strip():
                print(f"    {line}")

        # Ensure all area values are >= 5 (prevents division by zero downstream);
        # missing areas are filled with sensible defaults.
        min_area = 5
        default_areas = {
            'kitchen_area': 15,
            'bedroom_area': 20,
            'living_room_area': 30,
            'dining_room_area': 15,
            'bathroom_area': 8,
        }
        fixed = False
        for key, default in default_areas.items():
            if key in config:
                if config[key] is None or config[key] < min_area:
                    print(f"  ⚠️ Fixing {key}: {config[key]} → {min_area}")
                    config[key] = min_area
                    fixed = True
            else:
                print(f"  ⚠️ Adding missing {key}: {default}")
                config[key] = default
                fixed = True

        # Ensure counts are >= 1 (also guard against a null value)
        if 'bedroom_count' in config and (config['bedroom_count'] is None
                                          or config['bedroom_count'] < 1):
            config['bedroom_count'] = 1
            fixed = True

        if fixed:
            with open(config_path, 'w') as f:
                yaml.dump(config, f, default_flow_style=False)
            print("  ✅ Configuration validated and fixed")
        return True

    except Exception as e:
        print(f"  ❌ Config validation failed: {e}")
        return False
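
# Illustrative example (values assumed, not from a real run): a generated
# config such as {'kitchen_area': 0, 'bedroom_count': 0} would be rewritten
# to {'kitchen_area': 5, 'bedroom_count': 1, ...} with the remaining missing
# *_area keys filled from the defaults above.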

def generate_scene_with_app(prompt: str, output_dir: Path, max_retries: int = 3) -> bool:
    """
    Generate a scene using backend components directly.

    Args:
        prompt: User prompt
        output_dir: Where to save the scene
        max_retries: Maximum number of retries if generation fails

    Returns:
        True if generation succeeded
    """
    from core.config_generator import ConfigGenerator
    from backend.infinigen_backend import InfinigenBackend
    from config import settings

    for attempt in range(max_retries):
        try:
            if attempt > 0:
                print(f"  Retry {attempt}/{max_retries - 1}...")
            else:
                print("  Generating scene...")

            # Step 1: Generate configuration (fall back to default on failure)
            try:
                config_gen = ConfigGenerator()
                yaml_content = config_gen.generate(prompt)

                # Save configuration
                with open(settings.CONFIG_OUTPUT, 'w') as f:
                    f.write(yaml_content)

                # Validate and fix configuration
                if not validate_and_fix_config(settings.CONFIG_OUTPUT):
                    raise ValueError("Invalid configuration generated")
            except Exception as e:
                print(f"  ⚠️ Config generation failed: {e}, using default config")
                # Use default config as fallback
                default_config = Path("config/default_params.yaml")
                if default_config.exists():
                    shutil.copy(default_config, settings.CONFIG_OUTPUT)
                    validate_and_fix_config(settings.CONFIG_OUTPUT)
                else:
                    raise ValueError("No default config found")

            # Step 2: Generate layout, varying the seed on each retry
            backend = InfinigenBackend()
            seed = settings.DEFAULT_SEED + attempt
            blend_file = backend.generate(
                config_path=settings.CONFIG_OUTPUT,
                output_dir=settings.OUTPUT_DIR,
                seed=seed,
            )

            # Step 3: Copy to the evaluation directory
            source_dir = blend_file.parent
            if output_dir.exists():
                shutil.rmtree(output_dir)
            shutil.copytree(source_dir, output_dir)

            print(f"  ✅ Generated and saved to {output_dir}")
            return True

        except Exception as e:
            print(f"  ❌ Attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                # Last attempt failed: dump the traceback for debugging
                import traceback
                traceback.print_exc()
                return False
            # Otherwise fall through to the next retry

    return False
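
# Note: each retry re-runs the whole pipeline with a shifted seed
# (settings.DEFAULT_SEED + attempt), so with an illustrative DEFAULT_SEED of
# 42 the attempts would use seeds 42, 43, 44. This gives the solver a fresh
# starting point after a failed layout instead of repeating the same failure.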

def evaluate_scene(test_case, scene_path: Path) -> dict:
    """
    Evaluate a generated scene.

    Args:
        test_case: Test case with ground truth
        scene_path: Path to generated scene

    Returns:
        Evaluation result dictionary
    """
    try:
        # Extract the room-connectivity graph from the generated meshes
        extractor = MeshGraphExtractor(scene_path)
        G_gen = extractor.extract_graph()

        # Score it against the ground truth with the balanced evaluator
        evaluator = BalancedEvaluator()
        result = evaluator.evaluate(G_gen, test_case.ground_truth)

        # Attach metadata for reporting
        result['test_id'] = test_case.id
        result['prompt'] = test_case.prompt
        result['description'] = test_case.description
        result['generated_rooms'] = len(G_gen['nodes'])
        result['generated_edges'] = len(G_gen['edges'])

        return result

    except Exception as e:
        print(f"  ❌ Evaluation failed: {e}")
        return {
            'test_id': test_case.id,
            'overall_score': 0.0,
            'error': str(e),
        }
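
# MeshGraphExtractor.extract_graph() is assumed to return a plain dict of the
# form {'nodes': [...], 'edges': [...]}; only those two keys are relied on
# above, for the room and edge counts attached to the result.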

def run_full_evaluation(generate: bool = True, test_ids: list = None):
    """
    Run the complete evaluation pipeline.

    Args:
        generate: If True, generate new scenes. If False, use existing ones.
        test_ids: List of specific test IDs to run. If None, run all.
    """
    print("=" * 80)
    print("EVALUATION PIPELINE - Production Ready")
    print("=" * 80)

    # Get test cases
    all_tests = get_test_cases()
    if test_ids:
        test_cases = [tc for tc in all_tests if tc.id in test_ids]
    else:
        test_cases = all_tests

    print(f"\nTest Cases: {len(test_cases)}")
    print(f"Mode: {'GENERATE + EVALUATE' if generate else 'EVALUATE ONLY'}\n")

    results = []
    output_base = Path("outputs/evaluation")
    output_base.mkdir(parents=True, exist_ok=True)

    for i, test_case in enumerate(test_cases, 1):
        print(f"\n{'=' * 80}")
        print(f"[{i}/{len(test_cases)}] {test_case.id}")
        print(f"{'=' * 80}")
        print(f"Prompt: {test_case.prompt}")
        print(f"Description: {test_case.description}")

        scene_dir = output_base / test_case.id

        # Generate if requested
        if generate:
            start_time = time.time()
            success = generate_scene_with_app(test_case.prompt, scene_dir)
            gen_time = time.time() - start_time
            if not success:
                print("  ⚠️ Skipping evaluation (generation failed)")
                continue
            print(f"  Generation time: {gen_time:.1f}s")
        else:
            # Evaluate-only mode: the scene must already exist on disk
            if not scene_dir.exists() or not (scene_dir / "solve_state.json").exists():
                print("  ⚠️ Scene not found, skipping...")
                continue
            print("  Using existing scene")

        # Evaluate
        print("  Evaluating...")
        result = evaluate_scene(test_case, scene_dir)
        results.append(result)

        # Display result
        score = result['overall_score']
        emoji = "🎉" if score >= 0.9 else "✅" if score >= 0.8 else "⚠️" if score >= 0.7 else "❌"
        print(f"\n  {emoji} Overall Score: {score:.1%}")
        print(f"  Interpretation: {result.get('interpretation', 'N/A')}")
        print(f"  Generated: {result.get('generated_rooms', 0)} rooms, "
              f"{result.get('generated_edges', 0)} edges")

        # Display component scores
        print("  Component Scores:")
        if 'room_counts' in result:
            print(f"    - Room Counts: {result['room_counts']['score']:.1%} (weight 30%)")
        if 'room_presence' in result:
            print(f"    - Room Presence: {result['room_presence']['score']:.1%} (weight 20%)")
        if 'adjacency' in result:
            print(f"    - Adjacency: {result['adjacency']['score']:.1%} (weight 30%)")
        if 'constraints' in result:
            print(f"    - Constraints: {result['constraints']['score']:.1%} (weight 20%)")

    # Final summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    if not results:
        print("\n⚠️ No results to summarize")
        return

    scores = [r['overall_score'] for r in results]
    avg_score = sum(scores) / len(scores)

    print(f"\nTotal Tests: {len(results)}")
    print(f"Average Score: {avg_score:.1%}")
    print(f"Score Range: {min(scores):.1%} - {max(scores):.1%}")

    # Score distribution
    excellent = sum(1 for s in scores if s >= 0.9)
    good = sum(1 for s in scores if 0.8 <= s < 0.9)
    acceptable = sum(1 for s in scores if 0.7 <= s < 0.8)
    poor = sum(1 for s in scores if s < 0.7)

    print("\nScore Distribution:")
    print(f"  🎉 Excellent (≥90%): {excellent}")
    print(f"  ✅ Good (80-89%): {good}")
    print(f"  ⚠️ Acceptable (70-79%): {acceptable}")
    print(f"  ❌ Poor (<70%): {poor}")

    # Individual results
    print("\nIndividual Results:")
    print("-" * 80)
    for result in sorted(results, key=lambda x: x['overall_score'], reverse=True):
        score = result['overall_score']
        emoji = "🎉" if score >= 0.9 else "✅" if score >= 0.8 else "⚠️" if score >= 0.7 else "❌"
        print(f"{emoji} {result['test_id']:<30} {score:>6.1%}")

    # Save results to files
    save_results_to_files(results, test_cases)

def save_results_to_files(results, test_cases):
    """Save evaluation results to JSON and CSV files, merging with existing entries."""
    import json
    import csv

    if not results:
        return

    # Prepare new rows
    new_data = []
    for result in results:
        # Find the corresponding test case for its description and prompt
        test_case = next((tc for tc in test_cases if tc.id == result['test_id']), None)
        data = {
            'test_id': result['test_id'],
            'description': test_case.description if test_case else '',
            'prompt': test_case.prompt if test_case else '',
            'score': result['overall_score'] * 100,
            'room_counts': result.get('room_counts', {}).get('score', 0) * 100,
            'room_presence': result.get('room_presence', {}).get('score', 0) * 100,
            'adjacency': result.get('adjacency', {}).get('score', 0) * 100,
            'constraints': result.get('constraints', {}).get('score', 0) * 100,
        }
        new_data.append(data)

    # Load existing JSON data, tolerating a missing or corrupt file
    json_file = Path('evaluation_results.json')
    existing_data = []
    if json_file.exists():
        try:
            with open(json_file, 'r') as f:
                existing_data = json.load(f)
        except (json.JSONDecodeError, OSError):
            existing_data = []

    # Merge: update existing entries or append new ones
    existing_ids = {item['test_id']: i for i, item in enumerate(existing_data)}
    for data in new_data:
        if data['test_id'] in existing_ids:
            existing_data[existing_ids[data['test_id']]] = data
        else:
            existing_data.append(data)

    # Save merged JSON
    with open(json_file, 'w') as f:
        json.dump(existing_data, f, indent=2)
    print(f"\n✅ Saved results to evaluation_results.json ({len(new_data)} new/updated)")

    # Save merged CSV
    with open('evaluation_results.csv', 'w', newline='') as f:
        fieldnames = ['test_id', 'score', 'room_counts', 'room_presence', 'adjacency', 'constraints']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for data in existing_data:
            writer.writerow({k: data[k] for k in fieldnames})
    print(f"✅ Saved results to evaluation_results.csv (total: {len(existing_data)} entries)")
| if __name__ == "__main__": | |
| import argparse | |
| parser = argparse.ArgumentParser(description="Final Evaluation Pipeline") | |
| parser.add_argument("--generate", action="store_true", | |
| help="Generate new scenes (default: use existing)") | |
| parser.add_argument("--test-ids", nargs="+", | |
| help="Specific test IDs to run (default: all)") | |
| parser.add_argument("--quick", action="store_true", | |
| help="Quick test with only 3 cases") | |
| args = parser.parse_args() | |
| # Quick mode: test 3 representative cases | |
| if args.quick: | |
| test_ids = ["basic_studio_01", "two_bedroom_apt_01", "family_home_01"] | |
| print("π QUICK MODE: Testing 3 representative cases\n") | |
| else: | |
| test_ids = args.test_ids | |
| run_full_evaluation( | |
| generate=args.generate, | |
| test_ids=test_ids | |
| ) | |