Spaces:

NLarchive
/

Qagents-workflows

Sleeping

File size: 11,528 Bytes

1bb4678

# Path: QAgents-workflos/tests/comprehensive_test.py
# Relations: Uses orchestrators/, tests/test_problems.py, config.py
# Description: Comprehensive test across all difficulties with detailed diagnostics
#              Run with: python tests/comprehensive_test.py

"""
Comprehensive Circuit Generation Test

Tests all 9 problems (easy, medium, hard) with all 3 modes (naked, guided, blackboard).
Provides detailed diagnostics on where each mode succeeds/fails.
"""

import sys
import time
import os
from datetime import datetime
from pathlib import Path

# Setup paths
sys.path.insert(0, str(Path(__file__).parent.parent))

from tests.test_problems import ALL_PROBLEMS, ProblemDifficulty
from orchestrators import create_orchestrator
from config import reset_cost_tracking, get_cost_summary, set_api_key


def extract_qasm(result):
    """Return the QASM text carried by an orchestrator result, or None.

    ``result.final_output`` may hold a single value or a list; for a list
    only the first element is considered. A falsy result, a falsy output, or
    a falsy first element all yield None. Anything else is converted with
    ``str`` before being returned.
    """
    if not result:
        return None

    output = result.final_output
    if not output:
        return None

    # A list output contributes only its first entry (non-empty at this point).
    candidate = output[0] if isinstance(output, list) else output
    if not candidate:
        return None
    return str(candidate)


def validate_qasm(qasm):
    """Check basic QASM structure and count the gates it applies.

    Args:
        qasm: QASM program text, or None/empty when nothing was generated.

    Returns:
        dict with the keys:
            valid: True when the text contains "OPENQASM" (case-sensitive).
            has_qreg: True when the text contains "qreg".
            gate_count: number of statements whose leading token is one of
                the recognised gate names (matched case-insensitively).
            depth: rough estimate only - the number of non-blank lines that
                do not start with "//" and mention an h/x/cx/cz/swap gate.
        All four are False/0 when ``qasm`` is falsy.
    """
    if not qasm:
        return {"valid": False, "has_qreg": False, "gate_count": 0, "depth": 0}

    valid = "OPENQASM" in qasm
    has_qreg = "qreg" in qasm

    gate_names = {'h', 'x', 'z', 'cx', 'cz', 'swap', 't', 's',
                  'ry', 'rz', 'rx', 'u1', 'u2', 'u3'}

    # Count one gate per statement, keyed on the statement's leading token.
    # Substring counting would tally "cx q[0],q[1];" twice (as 'x ' and as
    # 'cx ') and would also pick up gate letters mentioned in comments.
    lowered = qasm.lower()
    code = '\n'.join(line.split('//', 1)[0] for line in lowered.split('\n'))
    # Braces (gate definitions) and newlines are treated as statement
    # boundaries too, so gates inside a definition body are still seen.
    for separator in '{}\n':
        code = code.replace(separator, ';')

    gate_count = 0
    for statement in code.split(';'):
        # "rz(0.5) q[0]" -> "rz"; "h q[0]" -> "h"; blank statement -> no token.
        head = statement.split('(', 1)[0].split()
        if head and head[0] in gate_names:
            gate_count += 1

    # Estimate depth (simplified): one layer per line that mentions a gate.
    depth_markers = ('h ', 'x ', 'cx ', 'cz ', 'swap')
    depth = sum(
        1
        for line in qasm.split('\n')
        if line.strip()
        and not line.strip().startswith('//')
        and any(marker in line.lower() for marker in depth_markers)
    )

    return {"valid": valid, "has_qreg": has_qreg, "gate_count": gate_count, "depth": depth}


def run_comprehensive_test():
    """Run every problem in ALL_PROBLEMS through each orchestrator mode.

    The API key is read from GOOGLE_API_KEY, falling back to GENAI_API_KEY;
    when neither is set an error is printed and the function returns None.
    Otherwise each problem/mode pair is executed, timed and validated, a
    per-run report is printed, the summary is printed, and all records are
    written to a timestamped JSON file in the ``research`` directory that
    sits beside this file's parent directory.

    Returns:
        The list of per-run result dicts, or None when no API key is set.
    """
    import json
    import traceback

    key = os.getenv('GOOGLE_API_KEY') or os.getenv('GENAI_API_KEY')
    if not key:
        print("ERROR: No API key found. Set GOOGLE_API_KEY environment variable.")
        return
    set_api_key(key)

    rule = "=" * 100
    modes = ('naked', 'guided', 'blackboard')

    print(rule)
    print("COMPREHENSIVE CIRCUIT GENERATION TEST - ALL DIFFICULTIES")
    print(rule)
    print(f"Date: {datetime.now().isoformat()}")
    print(f"Problems: {len(ALL_PROBLEMS)} total (3 easy, 3 medium, 3 hard)")
    print("Modes: naked, guided, blackboard")
    print(rule)

    # One record per (problem, mode) run, in execution order.
    records = []

    for problem in ALL_PROBLEMS:
        expected = problem.expected
        print(f"\n\n{rule}")
        print(f"PROBLEM: {problem.id} - {problem.name}")
        print(f"Difficulty: {problem.difficulty.value.upper()}")
        print(f"Category: {problem.category.value}")
        print(f"Expected qubits: {expected.min_qubits}-{expected.max_qubits}")
        print(f"Required gates: {expected.required_gates}")
        print(f"Expected states: {expected.expected_states}")
        print(rule)

        for mode in modes:
            print(f"\n--- {mode.upper()} MODE ---")
            # Cost counters are reset so the summary read below covers this run only.
            reset_cost_tracking()
            started = time.perf_counter()

            try:
                outcome = create_orchestrator(mode).run(problem.goal)

                elapsed_ms = (time.perf_counter() - started) * 1000
                usage = get_cost_summary()

                # Pull the circuit text out of the result and inspect it.
                circuit = extract_qasm(outcome)
                checks = validate_qasm(circuit)

                if outcome:
                    succeeded = outcome.success
                    reported = outcome.errors
                else:
                    succeeded = False
                    reported = []

                # A run passes only when the orchestrator reports success
                # and the produced text looks like QASM.
                passed = succeeded and checks['valid']

                # Per-run report
                marker = '✅' if passed else '❌'
                print(f"{marker} Success: {succeeded}")
                print(f"   Time: {elapsed_ms:.0f}ms")
                print(f"   LLM Calls: {usage.get('total_requests', 0)}")
                print(f"   Tokens: {usage.get('total_tokens', 0)}")
                print(f"   QASM Valid: {checks['valid']}")
                print(f"   Has qreg: {checks['has_qreg']}")
                print(f"   Gate Count: {checks['gate_count']}")
                print(f"   Est. Depth: {checks['depth']}")

                if reported:
                    print(f"   ⚠️  Errors: {reported[:2]}")

                if not circuit:
                    print("   QASM: None generated")
                else:
                    # Preview at most the first eight lines of the circuit.
                    qasm_lines = circuit.split('\n')
                    print("   QASM:")
                    for text in qasm_lines[:8]:
                        print(f"      {text}")
                    if len(qasm_lines) > 8:
                        print("      ...")

                records.append({
                    'problem_id': problem.id,
                    'problem_name': problem.name,
                    'difficulty': problem.difficulty.value,
                    'category': problem.category.value,
                    'mode': mode,
                    'success': passed,
                    'qasm_valid': checks['valid'],
                    'time_ms': elapsed_ms,
                    'llm_calls': usage.get('total_requests', 0),
                    'tokens': usage.get('total_tokens', 0),
                    'gate_count': checks['gate_count'],
                    'depth': checks['depth'],
                    'qasm': circuit[:500] if circuit else None,
                    'error': str(reported[0])[:100] if reported else None,
                })

            except Exception as exc:
                # Any failure in a single run is recorded and the sweep continues.
                elapsed_ms = (time.perf_counter() - started) * 1000
                failure_text = f"{type(exc).__name__}: {str(exc)[:200]}"
                print(f"❌ EXCEPTION: {failure_text}")

                traceback.print_exc()

                records.append({
                    'problem_id': problem.id,
                    'problem_name': problem.name,
                    'difficulty': problem.difficulty.value,
                    'category': problem.category.value,
                    'mode': mode,
                    'success': False,
                    'qasm_valid': False,
                    'time_ms': elapsed_ms,
                    'llm_calls': 0,
                    'tokens': 0,
                    'gate_count': 0,
                    'depth': 0,
                    'qasm': None,
                    'error': failure_text[:100],
                })

    # Final summary
    print_summary(records)

    # Persist the records as JSON
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_path = Path(__file__).parent.parent / f"research/comprehensive_test_{stamp}.json"
    report_path.parent.mkdir(exist_ok=True)

    with open(report_path, 'w') as handle:
        json.dump(records, handle, indent=2)
    print(f"\n\nResults saved to: {report_path}")

    return records


def _select_results(all_results, mode, difficulty=None):
    """Return the records for ``mode``, optionally limited to one difficulty."""
    return [
        r for r in all_results
        if r['mode'] == mode and (difficulty is None or r['difficulty'] == difficulty)
    ]


def _aggregate_results(rows):
    """Fold a non-empty list of records into the statistics the report prints."""
    total = len(rows)
    return {
        'successes': sum(1 for r in rows if r['success']),
        'total': total,
        'avg_time': sum(r['time_ms'] for r in rows) / total,
        'total_llm': sum(r['llm_calls'] for r in rows),
        'avg_gates': sum(r['gate_count'] for r in rows) / total,
        'failures': [r for r in rows if not r['success']],
    }


def print_summary(all_results):
    """Print the success/time/LLM-usage report for a list of run records.

    Three sections are printed: per-difficulty results for each mode, the
    winning mode per difficulty (Score = Success*100 - Time/1000 - LLM*0.5),
    and overall statistics per mode. Mode/difficulty combinations with no
    records are skipped; a difficulty with no records at all reports
    "NONE" as its winner.

    Args:
        all_results: list of dicts with at least the keys 'problem_id',
            'difficulty', 'mode', 'success', 'time_ms', 'llm_calls',
            'gate_count' and 'error'.
    """
    modes = ['naked', 'guided', 'blackboard']
    difficulties = ['easy', 'medium', 'hard']

    print("\n\n" + "=" * 100)
    print("FINAL SUMMARY BY DIFFICULTY AND MODE")
    print("=" * 100)

    for diff in difficulties:
        print(f"\n{diff.upper()} PROBLEMS:")
        print("-" * 80)

        for mode in modes:
            mode_results = _select_results(all_results, mode, diff)
            if not mode_results:
                continue
            stats = _aggregate_results(mode_results)
            successes = stats['successes']
            total = stats['total']
            avg_time = stats['avg_time']
            total_llm = stats['total_llm']
            avg_gates = stats['avg_gates']

            status = '✅' if successes == total else '⚠️ ' if successes > 0 else '❌'
            print(f"{status} {mode:12} | Success: {successes}/{total} | Time: {avg_time:>6.0f}ms | LLM: {total_llm:>2} | Avg Gates: {avg_gates:.1f}")

            # Show failures
            for failure in stats['failures']:
                error_msg = failure['error'][:60] if failure['error'] else 'No QASM generated'
                print(f"      ❌ {failure['problem_id']}: {error_msg}")

    # Calculate winners
    print("\n\n" + "=" * 100)
    print("🏆 WINNER BY DIFFICULTY (Score = Success*100 - Time/1000 - LLM*0.5)")
    print("=" * 100)

    for diff in difficulties:
        print(f"\n{diff.upper()}:")
        best_mode = None
        # Start below every possible score: the penalties are unbounded, so a
        # fixed floor such as -999 would hide the winner whenever all modes
        # score lower than it.
        best_score = float('-inf')

        for mode in modes:
            mode_results = _select_results(all_results, mode, diff)
            if not mode_results:
                continue
            stats = _aggregate_results(mode_results)
            avg_time = stats['avg_time']
            total_llm = stats['total_llm']

            success_rate = stats['successes'] / stats['total']
            time_penalty = avg_time / 1000
            llm_penalty = total_llm * 0.5
            score = success_rate * 100 - time_penalty - llm_penalty

            print(f"  {mode:12}: Score={score:>6.1f} (Success={success_rate*100:.0f}%, Time={avg_time:.0f}ms, LLM={total_llm})")

            # Strict comparison: on a tie the earlier mode keeps the win.
            if score > best_score:
                best_score = score
                best_mode = mode

        print(f"  🏆 WINNER: {best_mode.upper() if best_mode else 'NONE'}")

    # Overall recommendation
    print("\n\n" + "=" * 100)
    print("OVERALL RECOMMENDATIONS")
    print("=" * 100)

    # Calculate overall stats per mode
    for mode in modes:
        mode_results = _select_results(all_results, mode)
        if not mode_results:
            continue
        stats = _aggregate_results(mode_results)
        successes = stats['successes']
        total = stats['total']
        failures = stats['failures']

        print(f"\n{mode.upper()}:")
        print(f"  Overall Success: {successes}/{total} ({100*successes/total:.0f}%)")
        print(f"  Average Time: {stats['avg_time']:.0f}ms")
        print(f"  Total LLM Calls: {stats['total_llm']}")
        print(f"  Average Gates: {stats['avg_gates']:.1f}")

        # List failures
        if failures:
            print(f"  Failures ({len(failures)}):")
            for failure in failures:
                print(f"    - {failure['problem_id']} ({failure['difficulty']}): {failure['error'][:50] if failure['error'] else 'Unknown'}")


# Script entry point: run the whole suite when this file is executed directly
# (python tests/comprehensive_test.py). The returned result list is not used here.
if __name__ == "__main__":
    run_comprehensive_test()