# Path: QAgents-workflos/tests/comprehensive_test_v2.py
# Relations: Uses orchestrators, test_problems, client/mcp_client
# Description: Full diagnostic test comparing all 5 modes including QUASAR and HYBRID
"""
Comprehensive Test V2: Compare all orchestration modes
Modes tested:
1. NAKED - Direct LLM (baseline)
2. GUIDED - Multi-agent pipeline
3. BLACKBOARD - Event-driven agents
4. QUASAR - Tool-augmented LLM with hierarchical validation
5. HYBRID - NAKED first, QUASAR fallback
Problems:
- 3 EASY
- 3 MEDIUM
- 3 HARD
- 4 VERY_HARD (new - to find NAKED limits)
"""
import sys
import os
import json
import time
from datetime import datetime
from pathlib import Path
# Setup paths
sys.path.insert(0, str(Path(__file__).parent.parent.absolute()))
# Set API key BEFORE any imports
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
    # Placeholder string, not a real key; export GOOGLE_API_KEY instead of relying on this.
    api_key = "$env:GOOGLE_API_KEY"
os.environ['GOOGLE_API_KEY'] = api_key
from tests.test_problems import (
    ALL_PROBLEMS, EASY_PROBLEMS, MEDIUM_PROBLEMS,
    HARD_PROBLEMS, VERY_HARD_PROBLEMS,
    ProblemDifficulty
)
from orchestrators import create_orchestrator
from orchestrators.quasar_orchestrator import QuasarOrchestrator, HybridOrchestrator
from config import reset_cost_tracking, get_cost_summary, set_api_key
from client.mcp_client import get_client
# Set API key in config
set_api_key(api_key)
def extract_qasm_metrics(qasm: str) -> dict:
    """Extract metrics from QASM code."""
    if not qasm:
        return {"gate_count": 0, "depth": 0, "qubits": 0}
    import re
    # Count qubits
    qreg_match = re.search(r'qreg\s+\w+\[(\d+)\]', qasm)
    qubits = int(qreg_match.group(1)) if qreg_match else 0
    # Count gates (excluding declarations and measurements)
    gate_pattern = r'\b(h|x|y|z|s|t|sdg|tdg|cx|cz|cy|swap|ccx|rz|rx|ry|u1|u2|u3|p|cp)\b'
    gates = re.findall(gate_pattern, qasm, re.IGNORECASE)
    # Estimate depth (simplified: counts gate lines, ignoring gate parallelism)
    lines = [
        l.strip() for l in qasm.split('\n')
        if l.strip() and not l.strip().startswith(('OPENQASM', 'include', 'qreg', 'creg', '//'))
    ]
    depth = len([
        l for l in lines
        if any(g in l.lower() for g in ['h ', 'x ', 'y ', 'z ', 'cx', 'cz', 'swap', 'rx', 'ry', 'rz', 'ccx'])
    ])
    return {"gate_count": len(gates), "depth": depth, "qubits": qubits}
def run_test(problem, mode: str) -> dict:
    """Run a single test and return results."""
    result = {
        "problem_id": problem.id,
        "problem_name": problem.name,
        "difficulty": problem.difficulty.value,
        "category": problem.category.value,
        "mode": mode,
        "success": False,
        "qasm_valid": False,
        "time_ms": 0,
        "llm_calls": 0,
        "tokens": 0,
        "gate_count": 0,
        "depth": 0,
        "qasm": None,
        "error": None,
        "tiers_passed": [],
        "iterations": 0
    }
    start = time.perf_counter()
    reset_cost_tracking()
    try:
        if mode in ["quasar", "hybrid"]:
            # Use new orchestrators with expected values
            if mode == "quasar":
                orchestrator = QuasarOrchestrator(max_iterations=3)
            else:
                orchestrator = HybridOrchestrator()
            quasar_result = orchestrator.run(
                goal=problem.prompt,
                expected_qubits=problem.expected.min_qubits,
                expected_states=problem.expected.expected_states or None,
                max_depth=problem.expected.max_depth
            )
            result["success"] = quasar_result.success
            result["qasm"] = quasar_result.final_qasm
            result["llm_calls"] = quasar_result.llm_calls
            result["tokens"] = quasar_result.tokens_used
            result["tiers_passed"] = quasar_result.tiers_passed
            result["iterations"] = quasar_result.iterations
            if quasar_result.final_qasm:
                result["qasm_valid"] = True
                metrics = extract_qasm_metrics(quasar_result.final_qasm)
                result["gate_count"] = metrics["gate_count"]
                result["depth"] = metrics["depth"]
            if quasar_result.errors:
                result["error"] = "; ".join(quasar_result.errors)
        else:
            # Use standard orchestrators
            orchestrator = create_orchestrator(mode)
            orch_result = orchestrator.run(problem.prompt)
            result["success"] = orch_result.success
            result["qasm"] = orch_result.final_output
            # Get LLM stats
            cost = get_cost_summary()
            result["llm_calls"] = cost.get("llm_requests", 0)
            result["tokens"] = cost.get("total_tokens", 0)
            if orch_result.final_output:
                result["qasm_valid"] = True
                metrics = extract_qasm_metrics(orch_result.final_output)
                result["gate_count"] = metrics["gate_count"]
                result["depth"] = metrics["depth"]
            if orch_result.errors:
                result["error"] = "; ".join(orch_result.errors)
    except Exception as e:
        result["error"] = str(e)
    result["time_ms"] = (time.perf_counter() - start) * 1000
    return result
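# run_test assumes each problem object from tests.test_problems exposes at
# least: id, name, prompt, difficulty.value, category.value, and an
# `expected` record with min_qubits, expected_states, and max_depth.
# Hypothetical stand-in for ad-hoc runs outside the suite (names assumed,
# not the real definitions in tests/test_problems.py):
#
#   from types import SimpleNamespace
#   _toy = SimpleNamespace(
#       id="toy_bell", name="Bell state", prompt="Create a Bell state",
#       difficulty=SimpleNamespace(value="easy"),
#       category=SimpleNamespace(value="entanglement"),
#       expected=SimpleNamespace(min_qubits=2, expected_states=["00", "11"], max_depth=None),
#   )
#   print(run_test(_toy, "naked"))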
def main():
    print("=" * 100)
    print("COMPREHENSIVE TEST V2 - ALL MODES INCLUDING QUASAR & HYBRID")
    print("=" * 100)
    print(f"Date: {datetime.now().isoformat()}")
    print(f"Problems: {len(ALL_PROBLEMS)} total")
    print(f" - Easy: {len(EASY_PROBLEMS)}")
    print(f" - Medium: {len(MEDIUM_PROBLEMS)}")
    print(f" - Hard: {len(HARD_PROBLEMS)}")
    print(f" - Very Hard: {len(VERY_HARD_PROBLEMS)}")
    print("Modes: naked, guided, blackboard, quasar, hybrid")
    print("=" * 100)
    # Check MCP server
    try:
        client = get_client()
        if client.health_check():
            print("✅ MCP Server connected")
        else:
            print("⚠️ MCP Server not responding - some validations may use fallback")
    except Exception:
        print("⚠️ MCP Server not available")
    all_results = []
    modes = ["naked", "quasar", "hybrid", "guided", "blackboard"]  # Order: fastest to slowest
    # Group problems by difficulty
    problem_groups = [
        ("EASY", EASY_PROBLEMS),
        ("MEDIUM", MEDIUM_PROBLEMS),
        ("HARD", HARD_PROBLEMS),
        ("VERY_HARD", VERY_HARD_PROBLEMS)
    ]
    for diff_name, problems in problem_groups:
        print(f"\n{'='*100}")
        print(f"DIFFICULTY: {diff_name}")
        print("=" * 100)
        for problem in problems:
            print(f"\n--- Problem: {problem.id} - {problem.name} ---")
            for mode in modes:
                print(f" Testing {mode}...", end=" ", flush=True)
                result = run_test(problem, mode)
                all_results.append(result)
                status = "✅" if result["success"] else "❌"
                time_str = f"{result['time_ms']:.0f}ms"
                llm_str = f"LLM:{result['llm_calls']}"
                gates_str = f"Gates:{result['gate_count']}"
                extra = ""
                if mode in ["quasar", "hybrid"]:
                    tiers = result.get("tiers_passed", [])
                    extra = f" Tiers:{tiers}"
                print(f"{status} {time_str} {llm_str} {gates_str}{extra}")
                if result["error"] and not result["success"]:
                    print(f" Error: {result['error'][:80]}...")
                # Rate limiting
                time.sleep(5)
    # Summary
    print("\n\n" + "=" * 100)
    print("FINAL SUMMARY BY MODE")
    print("=" * 100)
    for mode in modes:
        mode_results = [r for r in all_results if r["mode"] == mode]
        successes = sum(1 for r in mode_results if r["success"])
        total = len(mode_results)
        total_time = sum(r["time_ms"] for r in mode_results)
        total_llm = sum(r["llm_calls"] for r in mode_results)
        avg_gates = sum(r["gate_count"] for r in mode_results if r["success"]) / max(successes, 1)
        print(f"\n{mode.upper()}:")
        print(f" Success: {successes}/{total} ({100*successes/total:.1f}%)")
        print(f" Total Time: {total_time:.0f}ms ({total_time/total:.0f}ms avg)")
        print(f" LLM Calls: {total_llm} ({total_llm/total:.1f} avg)")
        print(f" Avg Gates (success): {avg_gates:.1f}")
        # Per difficulty
        for diff in ["easy", "medium", "hard", "very_hard"]:
            diff_results = [r for r in mode_results if r["difficulty"] == diff]
            if diff_results:
                diff_success = sum(1 for r in diff_results if r["success"])
                print(f" {diff}: {diff_success}/{len(diff_results)}")
    # Efficiency comparison
    print("\n" + "=" * 100)
    print("EFFICIENCY COMPARISON (Success per LLM call)")
    print("=" * 100)
    for mode in modes:
        mode_results = [r for r in all_results if r["mode"] == mode]
        successes = sum(1 for r in mode_results if r["success"])
        total_llm = sum(r["llm_calls"] for r in mode_results)
        efficiency = successes / max(total_llm, 1)
        print(f" {mode}: {efficiency:.3f} successes per LLM call")
    # Winner determination
    print("\n" + "=" * 100)
    print("WINNER BY DIFFICULTY")
    print("=" * 100)
    for diff in ["easy", "medium", "hard", "very_hard"]:
        print(f"\n{diff.upper()}:")
        best_mode = None
        best_success = -1
        best_efficiency = -1
        for mode in modes:
            mode_results = [r for r in all_results if r["mode"] == mode and r["difficulty"] == diff]
            if mode_results:
                successes = sum(1 for r in mode_results if r["success"])
                total_llm = sum(r["llm_calls"] for r in mode_results)
                efficiency = successes / max(total_llm, 1)
                if successes > best_success or (successes == best_success and efficiency > best_efficiency):
                    best_success = successes
                    best_efficiency = efficiency
                    best_mode = mode
        if best_mode:
            print(f" 🏆 Winner: {best_mode.upper()} ({best_success} successes)")
    # Save results
    output_path = Path(__file__).parent.parent / "research" / f"comprehensive_test_v2_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(all_results, f, indent=2)
    print(f"\n\nResults saved to: {output_path}")
    print("=" * 100)
if __name__ == "__main__":
    main()