"""
Final Evaluation Pipeline - Production Ready
- Uses real app.py generation pipeline
- 20 test cases covering diverse layouts
- Balanced evaluator optimized for Infinigen's capabilities
"""

import sys
from pathlib import Path
import shutil
import time
from evaluation.test_cases import get_test_cases
from evaluation.mesh_extractor import MeshGraphExtractor
from evaluation.balanced_evaluator import BalancedEvaluator


def validate_and_fix_config(config_path: Path) -> bool:
    """Validate and fix configuration to prevent Infinigen bugs"""
    import yaml
    
    try:
        with open(config_path, 'r') as f:
            original_content = f.read()
            f.seek(0)
            config = yaml.safe_load(f)
        
        # Debug: Show original AI output
        print(f"   πŸ“„ AI generated config:")
        for line in original_content.split('\n')[:15]:  # First 15 lines
            if line.strip():
                print(f"      {line}")
        
        # Ensure all area values are >= 5 (prevent division by zero)
        min_area = 5
        area_keys = ['kitchen_area', 'bedroom_area', 'living_room_area', 
                     'dining_room_area', 'bathroom_area']
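        # Illustrative example of a config that would pass validation unchanged
        # (values mirror the defaults below; the square-metre unit is an assumption):
        #   kitchen_area: 15, bedroom_area: 20, living_room_area: 30,
        #   dining_room_area: 15, bathroom_area: 8, bedroom_count: 2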
        
        fixed = False
        for key in area_keys:
            if key in config:
                if config[key] is None or config[key] < min_area:
                    print(f"   ⚠️  Fixing {key}: {config[key]} β†’ {min_area}")
                    config[key] = min_area
                    fixed = True
            else:
                # Add missing area with default value
                default_areas = {
                    'kitchen_area': 15,
                    'bedroom_area': 20,
                    'living_room_area': 30,
                    'dining_room_area': 15,
                    'bathroom_area': 8
                }
                print(f"   ⚠️  Adding missing {key}: {default_areas[key]}")
                config[key] = default_areas[key]
                fixed = True
        
        # Ensure counts are >= 1 (treat None the same as an invalid value)
        if 'bedroom_count' in config and (config['bedroom_count'] is None or config['bedroom_count'] < 1):
            config['bedroom_count'] = 1
            fixed = True
        
        if fixed:
            with open(config_path, 'w') as f:
                yaml.dump(config, f, default_flow_style=False)
            print(f"   βœ“ Configuration validated and fixed")
        
        return True
        
    except Exception as e:
        print(f"   βœ— Config validation failed: {e}")
        return False


def generate_scene_with_app(prompt: str, output_dir: Path, max_retries: int = 3) -> bool:
    """
    Generate scene using backend components directly
    
    Args:
        prompt: User prompt
        output_dir: Where to save the scene
        max_retries: Maximum number of retries if generation fails
        
    Returns:
        True if generation succeeded
    """
    from core.config_generator import ConfigGenerator
    from backend.infinigen_backend import InfinigenBackend
    from config import settings
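    # Note: this function only relies on settings.CONFIG_OUTPUT, settings.OUTPUT_DIR
    # and settings.DEFAULT_SEED from config.settings.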
    
    for attempt in range(max_retries):
        try:
            if attempt > 0:
                print(f"   Retry {attempt}/{max_retries-1}...")
            else:
                print(f"   Generating scene...")
            
            # Step 1: Generate configuration (or use default on failure)
            try:
                config_gen = ConfigGenerator()
                yaml_content = config_gen.generate(prompt)
                
                # Save configuration
                with open(settings.CONFIG_OUTPUT, 'w') as f:
                    f.write(yaml_content)
                
                # Validate and fix configuration
                if not validate_and_fix_config(settings.CONFIG_OUTPUT):
                    raise ValueError("Invalid configuration generated")
            except Exception as e:
                print(f"   ⚠️  Config generation failed: {e}, using default config")
                # Use default config as fallback
                default_config = Path("config/default_params.yaml")
                if default_config.exists():
                    shutil.copy(default_config, settings.CONFIG_OUTPUT)
                    validate_and_fix_config(settings.CONFIG_OUTPUT)
                else:
                    raise ValueError("No default config found")
            
            # Step 2: Generate layout with different seed on retry
            backend = InfinigenBackend()
            seed = settings.DEFAULT_SEED + attempt  # Use different seed on retry
            blend_file = backend.generate(
                config_path=settings.CONFIG_OUTPUT,
                output_dir=settings.OUTPUT_DIR,
                seed=seed
            )
            
            # Step 3: Copy to evaluation directory
            source_dir = blend_file.parent
            
            if output_dir.exists():
                shutil.rmtree(output_dir)
            shutil.copytree(source_dir, output_dir)
            
            print(f"   βœ“ Generated and saved to {output_dir}")
            return True
            
        except Exception as e:
            print(f"   βœ— Attempt {attempt+1} failed: {e}")
            if attempt == max_retries - 1:
                # Last attempt failed
                import traceback
                traceback.print_exc()
                return False
            # Continue to next retry
            continue
    
    return False


def evaluate_scene(test_case, scene_path: Path) -> dict:
    """
    Evaluate a generated scene
    
    Args:
        test_case: Test case with ground truth
        scene_path: Path to generated scene
        
    Returns:
        Evaluation result dictionary
    """
    try:
        # Extract graph
        extractor = MeshGraphExtractor(scene_path)
        G_gen = extractor.extract_graph()
        
        # Evaluate with Balanced Evaluator
        evaluator = BalancedEvaluator()
        result = evaluator.evaluate(G_gen, test_case.ground_truth)
        
        # Add metadata
        result['test_id'] = test_case.id
        result['prompt'] = test_case.prompt
        result['description'] = test_case.description
        result['generated_rooms'] = len(G_gen['nodes'])
        result['generated_edges'] = len(G_gen['edges'])
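        # Reporting downstream also expects the evaluator to populate 'overall_score',
        # 'interpretation' and the component dicts 'room_counts', 'room_presence',
        # 'adjacency' and 'constraints'.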
        
        return result
        
    except Exception as e:
        print(f"   βœ— Evaluation failed: {e}")
        return {
            'test_id': test_case.id,
            'overall_score': 0.0,
            'error': str(e)
        }


def run_full_evaluation(generate: bool = True, test_ids: list = None):
    """
    Run complete evaluation pipeline
    
    Args:
        generate: If True, generate new scenes. If False, use existing.
        test_ids: List of specific test IDs to run. If None, run all.
    """
    print("="*80)
    print("EVALUATION PIPELINE - Production Ready")
    print("="*80)
    
    # Get test cases
    all_tests = get_test_cases()
    
    if test_ids:
        test_cases = [tc for tc in all_tests if tc.id in test_ids]
    else:
        test_cases = all_tests
    
    print(f"\nTest Cases: {len(test_cases)}")
    print(f"Mode: {'GENERATE + EVALUATE' if generate else 'EVALUATE ONLY'}\n")
    
    results = []
    output_base = Path("outputs/evaluation")
    output_base.mkdir(parents=True, exist_ok=True)
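    # Each test case gets its own scene directory, outputs/evaluation/<test_id>/;
    # in evaluate-only mode it must already contain solve_state.json (checked below).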
    
    for i, test_case in enumerate(test_cases, 1):
        print(f"\n{'='*80}")
        print(f"[{i}/{len(test_cases)}] {test_case.id}")
        print(f"{'='*80}")
        print(f"Prompt: {test_case.prompt}")
        print(f"Description: {test_case.description}")
        
        scene_dir = output_base / test_case.id
        
        # Generate if requested
        if generate:
            start_time = time.time()
            success = generate_scene_with_app(test_case.prompt, scene_dir)
            gen_time = time.time() - start_time
            
            if not success:
                print(f"   ⚠️  Skipping evaluation (generation failed)")
                continue
            
            print(f"   Generation time: {gen_time:.1f}s")
        else:
            # Check if scene exists
            if not scene_dir.exists() or not (scene_dir / "solve_state.json").exists():
                print(f"   ⚠️  Scene not found, skipping...")
                continue
            print(f"   Using existing scene")
        
        # Evaluate
        print(f"   Evaluating...")
        result = evaluate_scene(test_case, scene_dir)
        results.append(result)
        
        # Display result
        score = result['overall_score']
        emoji = "πŸ†" if score >= 0.9 else "βœ“" if score >= 0.8 else "⚠️" if score >= 0.7 else "βœ—"
        
        print(f"\n   {emoji} Overall Score: {score:.1%}")
        print(f"   Interpretation: {result.get('interpretation', 'N/A')}")
        print(f"   Generated: {result.get('generated_rooms', 0)} rooms, "
              f"{result.get('generated_edges', 0)} edges")
        
        # Display component scores
        print(f"   Component Scores:")
        if 'room_counts' in result:
            print(f"     - Room Counts:    {result['room_counts']['score']:.1%} (weight 30%)")
        if 'room_presence' in result:
            print(f"     - Room Presence:  {result['room_presence']['score']:.1%} (weight 20%)")
        if 'adjacency' in result:
            print(f"     - Adjacency:      {result['adjacency']['score']:.1%} (weight 30%)")
        if 'constraints' in result:
            print(f"     - Constraints:    {result['constraints']['score']:.1%} (weight 20%)")
    
    # Final summary
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    
    if not results:
        print("\n⚠️  No results to summarize")
        return
    
    scores = [r['overall_score'] for r in results]
    avg_score = sum(scores) / len(scores)
    
    print(f"\nTotal Tests: {len(results)}")
    print(f"Average Score: {avg_score:.1%}")
    print(f"Score Range: {min(scores):.1%} - {max(scores):.1%}")
    
    # Score distribution
    excellent = sum(1 for s in scores if s >= 0.9)
    good = sum(1 for s in scores if 0.8 <= s < 0.9)
    acceptable = sum(1 for s in scores if 0.7 <= s < 0.8)
    poor = sum(1 for s in scores if s < 0.7)
    
    print(f"\nScore Distribution:")
    print(f"  πŸ† Excellent (β‰₯90%): {excellent}")
    print(f"  βœ“  Good (80-89%):    {good}")
    print(f"  ⚠️  Acceptable (70-79%): {acceptable}")
    print(f"  βœ—  Poor (<70%):      {poor}")
    
    # Individual results
    print(f"\nIndividual Results:")
    print("-"*80)
    
    for result in sorted(results, key=lambda x: x['overall_score'], reverse=True):
        score = result['overall_score']
        emoji = "πŸ†" if score >= 0.9 else "βœ“" if score >= 0.8 else "⚠️" if score >= 0.7 else "βœ—"
        print(f"{emoji} {result['test_id']:<30} {score:>6.1%}")
    
    # Save results to files
    save_results_to_files(results, test_cases)


def save_results_to_files(results, test_cases):
    """Save evaluation results to JSON and CSV files (append mode)"""
    import json
    import csv
    from pathlib import Path
    
    if not results:
        return
    
    # Prepare new data
    new_data = []
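    # Each record is flat, with scores in percent, e.g. (values illustrative):
    #   {'test_id': 'basic_studio_01', 'score': 85.0, 'room_counts': 100.0,
    #    'room_presence': 100.0, 'adjacency': 75.0, 'constraints': 80.0, ...}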
    for result in results:
        # Find corresponding test case for description and prompt
        test_case = next((tc for tc in test_cases if tc.id == result['test_id']), None)
        
        data = {
            'test_id': result['test_id'],
            'description': test_case.description if test_case else '',
            'prompt': test_case.prompt if test_case else '',
            'score': result['overall_score'] * 100,
            'room_counts': result.get('room_counts', {}).get('score', 0) * 100,
            'room_presence': result.get('room_presence', {}).get('score', 0) * 100,
            'adjacency': result.get('adjacency', {}).get('score', 0) * 100,
            'constraints': result.get('constraints', {}).get('score', 0) * 100,
        }
        new_data.append(data)
    
    # Load existing JSON data
    json_file = Path('evaluation_results.json')
    existing_data = []
    if json_file.exists():
        try:
            with open(json_file, 'r') as f:
                existing_data = json.load(f)
        except (json.JSONDecodeError, OSError):
            # Corrupt or unreadable results file: start over instead of crashing
            existing_data = []
    
    # Merge: update existing entries or add new ones
    existing_ids = {item['test_id']: i for i, item in enumerate(existing_data)}
    for data in new_data:
        if data['test_id'] in existing_ids:
            # Update existing entry
            existing_data[existing_ids[data['test_id']]] = data
        else:
            # Add new entry
            existing_data.append(data)
    
    # Save merged JSON
    with open(json_file, 'w') as f:
        json.dump(existing_data, f, indent=2)
    print(f"\nβœ“ Saved results to evaluation_results.json ({len(new_data)} new/updated)")
    
    # Save merged CSV
    with open('evaluation_results.csv', 'w', newline='') as f:
        fieldnames = ['test_id', 'score', 'room_counts', 'room_presence', 'adjacency', 'constraints']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        
        for data in existing_data:
            writer.writerow({k: data[k] for k in fieldnames})
    print(f"βœ“ Saved results to evaluation_results.csv (total: {len(existing_data)} entries)")


if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description="Final Evaluation Pipeline")
    parser.add_argument("--generate", action="store_true", 
                       help="Generate new scenes (default: use existing)")
    parser.add_argument("--test-ids", nargs="+", 
                       help="Specific test IDs to run (default: all)")
    parser.add_argument("--quick", action="store_true",
                       help="Quick test with only 3 cases")
    
    args = parser.parse_args()
    
    # Quick mode: test 3 representative cases
    if args.quick:
        test_ids = ["basic_studio_01", "two_bedroom_apt_01", "family_home_01"]
        print("πŸš€ QUICK MODE: Testing 3 representative cases\n")
    else:
        test_ids = args.test_ids
    
    run_full_evaluation(
        generate=args.generate,
        test_ids=test_ids
    )