# Qagents-workflows / tests / comprehensive_test.py
# Author: Deminiko
# Initial commit: QAgents-workflos multi-agent quantum circuit optimization system
# Commit: 1bb4678
# Path: QAgents-workflos/tests/comprehensive_test.py
# Relations: Uses orchestrators/, tests/test_problems.py, config.py
# Description: Comprehensive test across all difficulties with detailed diagnostics
# Run with: python tests/comprehensive_test.py
"""
Comprehensive Circuit Generation Test
Tests all 9 problems (easy, medium, hard) with all 3 modes (naked, guided, blackboard).
Provides detailed diagnostics on where each mode succeeds/fails.
"""
import json
import os
import sys
import time
import traceback
from datetime import datetime
from pathlib import Path
# Setup paths
sys.path.insert(0, str(Path(__file__).parent.parent))
from tests.test_problems import ALL_PROBLEMS, ProblemDifficulty
from orchestrators import create_orchestrator
from config import reset_cost_tracking, get_cost_summary, set_api_key
def extract_qasm(result):
    """Return the generated QASM as a string, or None when absent.

    ``result.final_output`` may be either a bare QASM string or a list of
    candidate strings; for a list, the first entry wins.  Falsy results,
    falsy outputs and empty lists all yield None.
    """
    if not result:
        return None
    payload = result.final_output
    if not payload:
        return None
    if isinstance(payload, list):
        payload = payload[0] if payload else None
    return str(payload) if payload else None
# Gate mnemonics recognized when counting gate statements in a QASM body.
_QASM_GATES = frozenset({
    'h', 'x', 'y', 'z', 's', 'sdg', 't', 'tdg', 'id', 'p',
    'rx', 'ry', 'rz', 'u', 'u1', 'u2', 'u3',
    'cx', 'cy', 'cz', 'ch', 'cp', 'crz', 'cu1', 'cu3',
    'swap', 'ccx', 'cswap',
})


def validate_qasm(qasm):
    """Validate QASM structure and count gates.

    Args:
        qasm: QASM source text (or None/empty).

    Returns:
        dict with keys:
            valid (bool): text contains the "OPENQASM" version header.
            has_qreg (bool): text declares at least one quantum register.
            gate_count (int): number of gate-call statements.
            depth (int): rough serial-depth estimate — the number of
                source lines containing at least one gate statement.

    Note: the previous substring-based counter double-counted two-qubit
    gates ('cx ' also matched 'x ', 'cz ' also matched 'z ').  Gates are
    now counted by the leading token of each ';'-separated statement.
    """
    if not qasm:
        return {"valid": False, "has_qreg": False, "gate_count": 0, "depth": 0}
    valid = "OPENQASM" in qasm
    has_qreg = "qreg" in qasm
    gate_count = 0
    gate_lines = 0
    for line in qasm.lower().split('\n'):
        body = line.strip()
        if not body or body.startswith('//'):
            continue  # skip blanks and comments
        hits = 0
        for stmt in body.split(';'):
            stmt = stmt.strip()
            if not stmt:
                continue
            # Leading token, with any parameter list stripped: 'ry(0.5) q[0]' -> 'ry'.
            parts = stmt.split('(', 1)[0].split()
            head = parts[0] if parts else ''
            if head in _QASM_GATES:
                hits += 1
        gate_count += hits
        if hits:
            gate_lines += 1
    # Simplified estimate: assume one layer per gate-bearing line.
    depth = gate_lines
    return {"valid": valid, "has_qreg": has_qreg, "gate_count": gate_count, "depth": depth}
def run_comprehensive_test():
    """Run comprehensive test across all problems and modes.

    Every problem in ALL_PROBLEMS is run once through each of the
    'naked', 'guided' and 'blackboard' orchestrators.  Each run is
    timed, its LLM cost counters are read, and the generated QASM is
    extracted and validated.  A summary table is printed at the end and
    the raw per-run records are written to research/ as JSON.

    Returns:
        list[dict] | None: one record per (problem, mode) run, or None
        when no API key is configured.
    """
    # Set API key — either environment variable name is accepted.
    api_key = os.getenv('GOOGLE_API_KEY') or os.getenv('GENAI_API_KEY')
    if api_key:
        set_api_key(api_key)
    else:
        print("ERROR: No API key found. Set GOOGLE_API_KEY environment variable.")
        return
    print("=" * 100)
    print("COMPREHENSIVE CIRCUIT GENERATION TEST - ALL DIFFICULTIES")
    print("=" * 100)
    print(f"Date: {datetime.now().isoformat()}")
    print(f"Problems: {len(ALL_PROBLEMS)} total (3 easy, 3 medium, 3 hard)")
    print("Modes: naked, guided, blackboard")  # plain string (was an f-string with no placeholders)
    print("=" * 100)
    # Store all results for the final summary and the JSON dump.
    all_results = []
    # Test each problem with each mode.
    for problem in ALL_PROBLEMS:
        print(f"\n\n{'=' * 100}")
        print(f"PROBLEM: {problem.id} - {problem.name}")
        print(f"Difficulty: {problem.difficulty.value.upper()}")
        print(f"Category: {problem.category.value}")
        print(f"Expected qubits: {problem.expected.min_qubits}-{problem.expected.max_qubits}")
        print(f"Required gates: {problem.expected.required_gates}")
        print(f"Expected states: {problem.expected.expected_states}")
        print("=" * 100)
        for mode in ['naked', 'guided', 'blackboard']:
            print(f"\n--- {mode.upper()} MODE ---")
            reset_cost_tracking()  # per-run token/request accounting
            start = time.perf_counter()
            try:
                orchestrator = create_orchestrator(mode)
                result = orchestrator.run(problem.goal)
                elapsed = (time.perf_counter() - start) * 1000
                cost = get_cost_summary()
                # Extract and validate QASM.
                qasm = extract_qasm(result)
                validation = validate_qasm(qasm)
                success = result.success if result else False
                errors = result.errors if result else []
                # Print detailed results.
                status = '✅' if success and validation['valid'] else '❌'
                print(f"{status} Success: {success}")
                print(f"   Time: {elapsed:.0f}ms")
                print(f"   LLM Calls: {cost.get('total_requests', 0)}")
                print(f"   Tokens: {cost.get('total_tokens', 0)}")
                print(f"   QASM Valid: {validation['valid']}")
                print(f"   Has qreg: {validation['has_qreg']}")
                print(f"   Gate Count: {validation['gate_count']}")
                print(f"   Est. Depth: {validation['depth']}")
                if errors:
                    print(f"   ⚠️ Errors: {errors[:2]}")
                if qasm:
                    # Show first few lines of the generated QASM.
                    lines = qasm.split('\n')[:8]
                    print("  QASM:")
                    for line in lines:
                        print(f"    {line}")
                    if len(qasm.split('\n')) > 8:
                        print("    ...")
                else:
                    print("  QASM: None generated")
                all_results.append({
                    'problem_id': problem.id,
                    'problem_name': problem.name,
                    'difficulty': problem.difficulty.value,
                    'category': problem.category.value,
                    'mode': mode,
                    'success': success and validation['valid'],
                    'qasm_valid': validation['valid'],
                    'time_ms': elapsed,
                    'llm_calls': cost.get('total_requests', 0),
                    'tokens': cost.get('total_tokens', 0),
                    'gate_count': validation['gate_count'],
                    'depth': validation['depth'],
                    'qasm': qasm[:500] if qasm else None,  # truncate for the JSON dump
                    'error': str(errors[0])[:100] if errors else None
                })
            except Exception as e:
                # Record the failure and keep going — one bad run must not
                # abort the whole matrix.
                elapsed = (time.perf_counter() - start) * 1000
                error_msg = f"{type(e).__name__}: {str(e)[:200]}"
                print(f"❌ EXCEPTION: {error_msg}")
                traceback.print_exc()
                all_results.append({
                    'problem_id': problem.id,
                    'problem_name': problem.name,
                    'difficulty': problem.difficulty.value,
                    'category': problem.category.value,
                    'mode': mode,
                    'success': False,
                    'qasm_valid': False,
                    'time_ms': elapsed,
                    'llm_calls': 0,
                    'tokens': 0,
                    'gate_count': 0,
                    'depth': 0,
                    'qasm': None,
                    'error': error_msg[:100]
                })
    # Print final summary.
    print_summary(all_results)
    # Save results to JSON under research/ with a timestamped name.
    output_path = Path(__file__).parent.parent / f"research/comprehensive_test_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    # parents=True so a missing research/ directory is created as needed.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(all_results, f, indent=2)
    print(f"\n\nResults saved to: {output_path}")
    return all_results
def print_summary(all_results):
    """Print the per-difficulty / per-mode breakdown, winners, and overall stats.

    Expects the record dicts produced by run_comprehensive_test(); prints
    to stdout and returns nothing.
    """
    MODES = ('naked', 'guided', 'blackboard')
    DIFFICULTIES = ('easy', 'medium', 'hard')

    def select(difficulty=None, mode=None):
        # Filter records by difficulty and/or mode (None matches anything).
        return [r for r in all_results
                if (difficulty is None or r['difficulty'] == difficulty)
                and (mode is None or r['mode'] == mode)]

    def aggregate(records):
        # Shared stats: (successes, total, avg time, total LLM calls, avg gates).
        wins = sum(1 for r in records if r['success'])
        count = len(records)
        mean_time = sum(r['time_ms'] for r in records) / count
        calls = sum(r['llm_calls'] for r in records)
        mean_gates = sum(r['gate_count'] for r in records) / count
        return wins, count, mean_time, calls, mean_gates

    # --- Section 1: success table per difficulty and mode ---
    print("\n\n" + "=" * 100)
    print("FINAL SUMMARY BY DIFFICULTY AND MODE")
    print("=" * 100)
    for diff in DIFFICULTIES:
        print(f"\n{diff.upper()} PROBLEMS:")
        print("-" * 80)
        for mode in MODES:
            records = select(diff, mode)
            if not records:
                continue
            successes, total, avg_time, total_llm, avg_gates = aggregate(records)
            if successes == total:
                status = '✅'
            elif successes > 0:
                status = '⚠️ '
            else:
                status = '❌'
            print(f"{status} {mode:12} | Success: {successes}/{total} | Time: {avg_time:>6.0f}ms | LLM: {total_llm:>2} | Avg Gates: {avg_gates:.1f}")
            # Show failures for this (difficulty, mode) cell.
            for failed in (r for r in records if not r['success']):
                error_msg = failed['error'][:60] if failed['error'] else 'No QASM generated'
                print(f"   ❌ {failed['problem_id']}: {error_msg}")

    # --- Section 2: winner per difficulty by composite score ---
    print("\n\n" + "=" * 100)
    print("🏆 WINNER BY DIFFICULTY (Score = Success*100 - Time/1000 - LLM*0.5)")
    print("=" * 100)
    for diff in DIFFICULTIES:
        print(f"\n{diff.upper()}:")
        best_mode, best_score = None, -999
        for mode in MODES:
            records = select(diff, mode)
            if not records:
                continue
            successes, total, avg_time, total_llm, _ = aggregate(records)
            success_rate = successes / total
            score = success_rate * 100 - avg_time / 1000 - total_llm * 0.5
            print(f"  {mode:12}: Score={score:>6.1f} (Success={success_rate*100:.0f}%, Time={avg_time:.0f}ms, LLM={total_llm})")
            if score > best_score:
                best_score, best_mode = score, mode
        print(f"  🏆 WINNER: {best_mode.upper() if best_mode else 'NONE'}")

    # --- Section 3: overall stats per mode across all difficulties ---
    print("\n\n" + "=" * 100)
    print("OVERALL RECOMMENDATIONS")
    print("=" * 100)
    for mode in MODES:
        records = select(mode=mode)
        if not records:
            continue
        successes, total, avg_time, total_llm, avg_gates = aggregate(records)
        print(f"\n{mode.upper()}:")
        print(f"  Overall Success: {successes}/{total} ({100*successes/total:.0f}%)")
        print(f"  Average Time: {avg_time:.0f}ms")
        print(f"  Total LLM Calls: {total_llm}")
        print(f"  Average Gates: {avg_gates:.1f}")
        failures = [r for r in records if not r['success']]
        if failures:
            print(f"  Failures ({len(failures)}):")
            for failed in failures:
                print(f"    - {failed['problem_id']} ({failed['difficulty']}): {failed['error'][:50] if failed['error'] else 'Unknown'}")
# Entry point: run directly with `python tests/comprehensive_test.py`.
if __name__ == "__main__":
    run_comprehensive_test()