"""
FireEcho Quantum Gold - Benchmarks

Performance benchmarks comparing FireEcho Quantum Gold against
cuQuantum (when available) and validating correctness.

Benchmarks:
1. Single-qubit gate throughput
2. Two-qubit gate (CNOT) throughput
3. QFT circuit scaling
4. Random circuit performance
5. GHZ state preparation
6. Measurement sampling speed
"""
| |
|
| | import torch |
| | import time |
| | import math |
| | from typing import Dict, List, Optional, Tuple |
| | from dataclasses import dataclass |
| |
|
| | |
| | from .circuit import QuantumCircuit |
| | from .simulator import QuantumSimulator, StateVector |
| | from .algorithms import bell_state, ghz_state, qft, random_circuit |
| | from .measurement import sample, expectation_value |
| |
|
| |
|
@dataclass
class BenchmarkResult:
    """Container for the outcome of one benchmark or validation run.

    Attributes:
        name: Human-readable label shown in reports.
        num_qubits: Number of qubits the run used.
        time_ms: Elapsed time in milliseconds as reported by the benchmark.
        gates_per_second: Throughput figure; the sampling benchmark stores
            shots per second in this field.
        memory_mb: State-vector size estimate in megabytes supplied by the
            caller.
        correct: Whether the run's correctness check passed.
        details: Optional extra metrics; ``None`` when the caller supplies
            nothing.
    """

    name: str
    num_qubits: int
    time_ms: float
    gates_per_second: float
    memory_mb: float
    correct: bool
    # Optional because the default is None, not an empty dict.
    details: Optional[Dict] = None

    def __repr__(self) -> str:
        """Return a one-line summary: status mark, name, size, time, rate."""
        status = "✅" if self.correct else "❌"
        return (
            f"{status} {self.name} ({self.num_qubits}q): "
            f"{self.time_ms:.2f}ms, {self.gates_per_second/1e6:.2f}M gates/s"
        )
| |
|
| |
|
def _time_circuit(sim: QuantumSimulator, circuit: QuantumCircuit,
                  warmup: int = 3, iters: int = 10) -> float:
    """Time circuit execution with warmup.

    Args:
        sim: Simulator whose ``run(circuit)`` method is timed.
        circuit: Circuit passed to ``sim.run`` on every iteration.
        warmup: Number of untimed runs executed first.
        iters: Number of timed runs; must be at least 1.

    Returns:
        Average wall-clock time per run, in milliseconds.

    Raises:
        ValueError: If ``iters`` is smaller than 1.
    """
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    # Only synchronise when a CUDA device exists; torch.cuda.synchronize()
    # raises on hosts without CUDA, which would make the helper unusable there.
    use_cuda = torch.cuda.is_available()

    for _ in range(warmup):
        sim.run(circuit)

    # Drain outstanding GPU work so warmup does not leak into the timing.
    if use_cuda:
        torch.cuda.synchronize()

    start = time.perf_counter()
    for _ in range(iters):
        sim.run(circuit)
    # Wait for the GPU to finish before stopping the clock.
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    return (elapsed / iters) * 1000
| |
|
| |
|
def benchmark_single_qubit_gates(num_qubits: int = 20, num_gates: int = 100) -> BenchmarkResult:
    """
    Benchmark single-qubit gate throughput.

    Builds ``num_gates`` layers, each applying a Hadamard to every qubit,
    times the circuit, and then checks the final state.

    Args:
        num_qubits: Width of the circuit.
        num_gates: Number of Hadamard layers (one H per qubit per layer).

    Returns:
        BenchmarkResult with timing, throughput and the correctness flag.
    """
    sim = QuantumSimulator()

    qc = QuantumCircuit(num_qubits, "single_qubit_benchmark")
    for _ in range(num_gates):
        for q in range(num_qubits):
            qc.h(q)

    total_gates = num_gates * num_qubits
    time_ms = _time_circuit(sim, qc)

    state = sim.run(qc)

    # H is its own inverse: an even number of layers restores |0...0>
    # (amplitude 1), while an odd number leaves the uniform superposition
    # whose first amplitude is 2^(-n/2). Checking "> 0.99" unconditionally
    # reported every odd num_gates as a failure.
    expected_amp0 = 1.0 if num_gates % 2 == 0 else 2.0 ** (-num_qubits / 2)
    amp0 = state.amplitudes[0].abs().item()
    correct = math.isclose(amp0, expected_amp0, rel_tol=0.01)

    return BenchmarkResult(
        name="Single-Qubit Gates (H)",
        num_qubits=num_qubits,
        time_ms=time_ms,
        gates_per_second=total_gates / (time_ms / 1000),
        # 8 bytes per amplitude - presumably complex64; confirm against the
        # simulator's dtype.
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=correct,
        details={"total_gates": total_gates}
    )
| |
|
| |
|
def benchmark_two_qubit_gates(num_qubits: int = 20, num_layers: int = 10) -> BenchmarkResult:
    """
    Benchmark two-qubit gate (CNOT) throughput.

    Applies a Hadamard to every qubit, then ``num_layers`` layers of CNOTs
    between neighbouring qubits (q -> q + 1).

    Args:
        num_qubits: Width of the circuit.
        num_layers: Number of CNOT layers.

    Returns:
        BenchmarkResult with timing, throughput and the correctness flag.
    """
    sim = QuantumSimulator()

    qc = QuantumCircuit(num_qubits, "two_qubit_benchmark")

    # Put every qubit into superposition so the CNOTs act on a dense state.
    for q in range(num_qubits):
        qc.h(q)

    # Linear chain of CNOTs, repeated num_layers times.
    for _ in range(num_layers):
        for q in range(num_qubits - 1):
            qc.cx(q, q + 1)

    total_gates = num_qubits + num_layers * (num_qubits - 1)
    time_ms = _time_circuit(sim, qc)

    state = sim.run(qc)
    # Two-sided normalisation check: the previous "> 0.99" test also accepted
    # states whose total probability exceeded 1.
    correct = abs(state.probabilities().sum().item() - 1.0) < 0.01

    return BenchmarkResult(
        name="Two-Qubit Gates (CNOT)",
        num_qubits=num_qubits,
        time_ms=time_ms,
        gates_per_second=total_gates / (time_ms / 1000),
        # 8 bytes per amplitude - presumably complex64; confirm against the
        # simulator's dtype.
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=correct,
        details={"total_gates": total_gates, "num_layers": num_layers}
    )
| |
|
| |
|
def benchmark_qft(num_qubits: int = 16) -> BenchmarkResult:
    """
    Time the Quantum Fourier Transform circuit produced by ``qft``.

    The run counts as correct when every outcome probability lies within
    1e-5 of the uniform value ``1 / 2**num_qubits``.

    Args:
        num_qubits: Width of the QFT circuit.

    Returns:
        BenchmarkResult; ``details`` carries the gate count and the largest
        deviation from the uniform probability.
    """
    simulator = QuantumSimulator()
    circuit = qft(num_qubits)
    gate_count = circuit.size

    avg_ms = _time_circuit(simulator, circuit)

    # One extra run, outside the timing, to inspect the output distribution.
    distribution = simulator.run(circuit).probabilities()
    uniform = 1.0 / (2 ** num_qubits)
    worst = (distribution - uniform).abs().max().item()

    return BenchmarkResult(
        name="Quantum Fourier Transform",
        num_qubits=num_qubits,
        time_ms=avg_ms,
        gates_per_second=gate_count / (avg_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=worst < 1e-5,
        details={"total_gates": gate_count, "max_deviation": worst},
    )
| |
|
| |
|
def benchmark_ghz(num_qubits: int = 20) -> BenchmarkResult:
    """
    Time GHZ state preparation.

    The circuit is one Hadamard on qubit 0 followed by a CNOT from qubit 0
    to every other qubit, i.e. ``num_qubits`` gates in total. The run counts
    as correct when the first and last outcome probabilities are each within
    0.01 of 0.5.

    Args:
        num_qubits: Width of the circuit.

    Returns:
        BenchmarkResult; ``details`` holds the two measured probabilities.
    """
    simulator = QuantumSimulator()

    circuit = QuantumCircuit(num_qubits, "ghz")
    circuit.h(0)
    for target in range(1, num_qubits):
        circuit.cx(0, target)

    gate_count = num_qubits
    avg_ms = _time_circuit(simulator, circuit)

    # Untimed run used only for the correctness check.
    outcome_probs = simulator.run(circuit).probabilities()
    p_zeros = outcome_probs[0].item()
    p_ones = outcome_probs[-1].item()
    both_near_half = all(abs(p - 0.5) < 0.01 for p in (p_zeros, p_ones))

    return BenchmarkResult(
        name="GHZ State Preparation",
        num_qubits=num_qubits,
        time_ms=avg_ms,
        gates_per_second=gate_count / (avg_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=both_near_half,
        details={"p_zeros": p_zeros, "p_ones": p_ones},
    )
| |
|
| |
|
def benchmark_random_circuit(num_qubits: int = 16, depth: int = 20) -> BenchmarkResult:
    """
    Time a circuit generated by ``random_circuit`` with a fixed seed (42).

    The run counts as correct when the output probabilities sum to 1 within
    1e-5.

    Args:
        num_qubits: Width of the generated circuit.
        depth: Depth argument forwarded to ``random_circuit``.

    Returns:
        BenchmarkResult; ``details`` records the depth and gate count.
    """
    simulator = QuantumSimulator()

    circuit = random_circuit(num_qubits, depth, seed=42)
    gate_count = circuit.size

    avg_ms = _time_circuit(simulator, circuit)

    # Untimed run: verify that the final state is still normalised.
    total_probability = simulator.run(circuit).probabilities().sum().item()
    normalised = abs(total_probability - 1.0) < 1e-5

    return BenchmarkResult(
        name="Random Circuit",
        num_qubits=num_qubits,
        time_ms=avg_ms,
        gates_per_second=gate_count / (avg_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=normalised,
        details={"depth": depth, "total_gates": gate_count},
    )
| |
|
| |
|
def benchmark_sampling(num_qubits: int = 20, shots: int = 10000) -> BenchmarkResult:
    """
    Time a single ``sample`` call on a GHZ state.

    The run counts as correct when every key of the returned counts is
    either the all-'0' or the all-'1' string of length ``num_qubits``.

    Args:
        num_qubits: Size of the GHZ state to sample from.
        shots: Number of shots requested from ``sample``.

    Returns:
        BenchmarkResult whose ``gates_per_second`` field holds shots per
        second for this benchmark.
    """
    prepared = ghz_state(num_qubits)

    # Synchronise on both sides so only the sampling call is measured.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    counts = sample(prepared, shots=shots)
    torch.cuda.synchronize()
    elapsed_ms = (time.perf_counter() - t0) * 1000

    # presumably counts maps bitstrings to occurrence counts - confirm
    # against measurement.sample.
    allowed = {'0' * num_qubits, '1' * num_qubits}
    only_ghz_outcomes = set(counts.keys()) <= allowed

    return BenchmarkResult(
        name="Measurement Sampling",
        num_qubits=num_qubits,
        time_ms=elapsed_ms,
        gates_per_second=shots / (elapsed_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=only_ghz_outcomes,
        details={"shots": shots, "unique_outcomes": len(counts)},
    )
| |
|
| |
|
def validate_gates() -> List[BenchmarkResult]:
    """
    Validate a handful of gates against their expected amplitudes.

    Covers Hadamard, Pauli-X, Pauli-Z, CNOT, the Bell state helper and Rz.
    Each check produces a BenchmarkResult whose timing fields are zero; only
    ``name``, ``num_qubits`` and ``correct`` carry information.

    Returns:
        One BenchmarkResult per validated gate, in the order listed above.
    """
    results = []
    sim = QuantumSimulator()
    inv_sqrt2 = 1 / math.sqrt(2)

    # Hadamard: |0> -> (|0> + |1>)/sqrt(2), so amplitude 0 is 1/sqrt(2).
    qc = QuantumCircuit(1)
    qc.h(0)
    state = sim.run(qc)
    h_correct = abs(state.amplitudes[0].item() - inv_sqrt2) < 1e-5
    results.append(BenchmarkResult("Hadamard", 1, 0, 0, 0, h_correct))

    # Pauli-X: |0> -> |1>.
    qc = QuantumCircuit(1)
    qc.x(0)
    state = sim.run(qc)
    x_correct = abs(state.amplitudes[1].item() - 1.0) < 1e-5
    results.append(BenchmarkResult("Pauli-X", 1, 0, 0, 0, x_correct))

    # Pauli-Z after H flips the sign of the |1> amplitude.
    qc = QuantumCircuit(1)
    qc.h(0)
    qc.z(0)
    state = sim.run(qc)
    z_correct = abs(state.amplitudes[1].item() + inv_sqrt2) < 1e-5
    results.append(BenchmarkResult("Pauli-Z", 1, 0, 0, 0, z_correct))

    # CNOT with the control set by X: amplitude index 3 must be 1.
    qc = QuantumCircuit(2)
    qc.x(0)
    qc.cx(0, 1)
    state = sim.run(qc)
    cnot_correct = abs(state.amplitudes[3].item() - 1.0) < 1e-5
    results.append(BenchmarkResult("CNOT", 2, 0, 0, 0, cnot_correct))

    # Bell state helper: indices 0 and 3 each have magnitude 1/sqrt(2).
    state = bell_state(0)
    bell_correct = (
        abs(abs(state.amplitudes[0].item()) - inv_sqrt2) < 1e-5 and
        abs(abs(state.amplitudes[3].item()) - inv_sqrt2) < 1e-5
    )
    results.append(BenchmarkResult("Bell State", 2, 0, 0, 0, bell_correct))

    # Rz(pi) after H: the two amplitudes must differ by a relative phase of
    # e^{i*pi} = -1, i.e. amp1 == -amp0. This holds both for
    # diag(e^{-i*t/2}, e^{i*t/2}) and for diag(1, e^{i*t}), so it does not
    # depend on the global-phase convention. The previous check only tested
    # normalisation, which any unitary (even the identity) would pass.
    qc = QuantumCircuit(1)
    qc.h(0)
    qc.rz(math.pi, 0)
    state = sim.run(qc)
    amp0 = state.amplitudes[0].item()
    amp1 = state.amplitudes[1].item()
    rz_correct = (
        abs(abs(amp0) - inv_sqrt2) < 1e-5 and
        abs(amp0 + amp1) < 1e-5
    )
    results.append(BenchmarkResult("Rz Gate", 1, 0, 0, 0, rz_correct))

    return results
| |
|
| |
|
def run_full_benchmark(max_qubits: int = 20) -> Dict[str, List[BenchmarkResult]]:
    """
    Run the whole benchmark suite and print a report.

    Prints the properties of CUDA device 0, runs gate validation, then each
    benchmark family at a fixed list of sizes (sizes above ``max_qubits``
    are skipped), and finally prints a pass/fail summary with the best
    throughput among the passing gate and algorithm benchmarks.

    Args:
        max_qubits: Maximum number of qubits to test

    Returns:
        Dictionary of benchmark category -> results
    """
    rule = "=" * 70

    print(rule)
    print("FireEcho Quantum Gold - Benchmark Suite")
    print(rule)

    device = torch.cuda.get_device_properties(0)
    print(f"GPU: {device.name}")
    print(f"SM Version: {device.major}.{device.minor}")
    print(f"VRAM: {device.total_memory / 1e9:.1f} GB")
    print(rule)
    print()

    results = {
        "validation": [],
        "single_qubit": [],
        "two_qubit": [],
        "algorithms": [],
        "sampling": [],
    }

    print("Running gate validation...")
    results["validation"] = validate_gates()
    for outcome in results["validation"]:
        print(f" {outcome}")
    print()

    # Each section: heading, result category, then (sizes, runner) pairs that
    # are executed in order under that heading.
    sections = (
        ("Single-qubit gate benchmarks:", "single_qubit", (
            ((10, 15, 20), benchmark_single_qubit_gates),
        )),
        ("Two-qubit gate benchmarks:", "two_qubit", (
            ((10, 15, 20), benchmark_two_qubit_gates),
        )),
        ("Algorithm benchmarks:", "algorithms", (
            ((8, 12, 16), benchmark_qft),
            ((10, 15, 20), benchmark_ghz),
            ((10, 14, 18), lambda n: benchmark_random_circuit(n, depth=20)),
        )),
        ("Sampling benchmarks:", "sampling", (
            ((15, 20), benchmark_sampling),
        )),
    )

    for heading, category, runs in sections:
        print(heading)
        for sizes, runner in runs:
            for n in sizes:
                if n > max_qubits:
                    continue
                outcome = runner(n)
                results[category].append(outcome)
                print(f" {outcome}")
        print()

    print(rule)
    print("Summary")
    print(rule)

    everything = [r for group in results.values() for r in group]
    all_correct = all(r.correct for r in everything)
    total_tests = len(everything)
    passed = sum(1 for r in everything if r.correct)

    print(f"Tests: {passed}/{total_tests} passed")
    verdict = '✅ ALL PASSED' if all_correct else '❌ SOME FAILED'
    print(f"Status: {verdict}")

    # Only passing gate/algorithm runs compete for the best-throughput line.
    contenders = [
        r
        for category in ("single_qubit", "two_qubit", "algorithms")
        for r in results[category]
        if r.correct
    ]
    if contenders:
        best = max(contenders, key=lambda r: r.gates_per_second)
        print(f"Best throughput: {best.gates_per_second/1e6:.2f}M gates/s ({best.name})")

    print(rule)

    return results
| |
|
| |
|
def compare_cuquantum(num_qubits: int = 16) -> Optional[Dict]:
    """
    Compare FireEcho Quantum Gold against cuQuantum/CUDA-Q (if available).

    Based on KTH paper "Harnessing CUDA-Q's MPS for Tensor Network Simulations".

    Times a GHZ, a QFT and a seeded random circuit on the FireEcho simulator
    (3 warmup runs, 10 timed runs each). When ``cuquantum`` can be imported a
    cuStateVec measurement is taken as well; otherwise, when ``cudaq`` can be
    imported, a CUDA-Q GHZ kernel is timed for the GHZ circuit only. A
    summary table is printed at the end.

    Args:
        num_qubits: Number of qubits used for every test circuit.

    Returns:
        Dict with keys ``num_qubits``, ``fireecho_ms``, ``cuquantum_ms`` and
        ``speedup`` (reference time divided by FireEcho time, so values above
        1 mean FireEcho was faster). The dict is returned even when neither
        reference library is installed; the reference entries are then empty.
    """
    cuquantum_available = False
    cudaqsim_available = False

    # Availability probes: a missing optional dependency is not an error.
    try:
        import cuquantum  # noqa: F401
        cuquantum_available = True
    except ImportError:
        pass

    try:
        import cudaq
        cudaqsim_available = True
    except ImportError:
        pass

    print("=" * 60)
    print("FireEcho Quantum Gold vs cuQuantum Comparison")
    print(f"Testing with {num_qubits} qubits")
    print("=" * 60)
    print()

    if not cuquantum_available and not cudaqsim_available:
        print("Neither cuQuantum nor CUDA-Q installed.")
        print("Install with: pip install cuquantum-python cudaq")
        print()
        print("Running FireEcho-only benchmark for reference...")
        print()

    results = {
        "num_qubits": num_qubits,
        "fireecho_ms": {},
        "cuquantum_ms": {},
        "speedup": {},
    }

    test_circuits = [
        ("GHZ State", "ghz"),
        ("QFT", "qft"),
        ("Random Circuit", "random"),
    ]

    sim = QuantumSimulator()

    for name, circuit_type in test_circuits:
        print(f"Testing {name}...")

        # Build the FireEcho circuit for this test case.
        if circuit_type == "ghz":
            qc = QuantumCircuit(num_qubits, "ghz")
            qc.h(0)
            for i in range(1, num_qubits):
                qc.cx(0, i)
        elif circuit_type == "qft":
            qc = qft(num_qubits)
        else:
            qc = random_circuit(num_qubits, depth=20, seed=42)

        # Warmup, then synchronise so it does not leak into the timing.
        for _ in range(3):
            _ = sim.run(qc)
        torch.cuda.synchronize()

        start = time.perf_counter()
        for _ in range(10):
            _ = sim.run(qc)
        torch.cuda.synchronize()
        fe_time = (time.perf_counter() - start) / 10 * 1000

        results["fireecho_ms"][name] = fe_time
        print(f" FireEcho: {fe_time:.3f} ms")

        if cuquantum_available:
            try:
                from cuquantum import custatevec as cusv

                n_qubits = num_qubits
                sv_size = 2 ** n_qubits
                d_sv = torch.zeros(sv_size, dtype=torch.complex64, device='cuda')
                d_sv[0] = 1.0

                # Loop-invariant Hadamard matrix, built once.
                h_matrix = torch.tensor(
                    [[1, 1], [1, -1]], dtype=torch.complex64, device='cuda'
                ) / math.sqrt(2)

                handle = cusv.create()
                try:
                    for _ in range(3):
                        d_sv_copy = d_sv.clone()
                        # NOTE(review): argument list kept as in the original;
                        # verify it against the installed cuStateVec binding's
                        # apply_matrix signature (enum locations, workspace
                        # size argument).
                        cusv.apply_matrix(
                            handle, d_sv_copy.data_ptr(), cusv.cudaDataType.CUDA_C_32F,
                            n_qubits, h_matrix.data_ptr(), cusv.cudaDataType.CUDA_C_32F,
                            cusv.MatrixLayout.ROW, 0, [0], 1, [], [], 0, cusv.ComputeType.COMPUTE_32F,
                            0
                        )

                    torch.cuda.synchronize()
                    start = time.perf_counter()
                    # NOTE(review): the timed section only clones the state
                    # vector; no gates of the test circuit are applied, so
                    # this figure is a placeholder and the reported speedup
                    # is not a like-for-like comparison.
                    for _ in range(10):
                        d_sv_copy = d_sv.clone()

                    torch.cuda.synchronize()
                    cq_time = (time.perf_counter() - start) / 10 * 1000
                finally:
                    # Release the handle even when a call above raises.
                    cusv.destroy(handle)

                results["cuquantum_ms"][name] = cq_time
                results["speedup"][name] = cq_time / fe_time
                print(f" cuQuantum: {cq_time:.3f} ms")
                print(f" Speedup: {results['speedup'][name]:.2f}x")

            except Exception as e:
                # Best effort: a broken reference backend must not abort the
                # FireEcho measurements.
                print(f" cuQuantum: Error - {e}")
                results["cuquantum_ms"][name] = None

        if cudaqsim_available and not cuquantum_available:
            if circuit_type != "ghz":
                # Only a GHZ kernel exists; timing it under the QFT or
                # Random Circuit label would record a misattributed figure.
                print(" CUDA-Q: skipped (only a GHZ kernel is implemented)")
            else:
                try:
                    import cudaq

                    cudaq.set_target('nvidia')

                    @cudaq.kernel
                    def ghz_kernel(n: int):
                        q = cudaq.qvector(n)
                        h(q[0])
                        for i in range(1, n):
                            cx(q[0], q[i])

                    for _ in range(3):
                        cudaq.sample(ghz_kernel, num_qubits)

                    torch.cuda.synchronize()
                    start = time.perf_counter()
                    for _ in range(10):
                        cudaq.sample(ghz_kernel, num_qubits)
                    torch.cuda.synchronize()
                    cq_time = (time.perf_counter() - start) / 10 * 1000

                    results["cuquantum_ms"][name] = cq_time
                    results["speedup"][name] = cq_time / fe_time
                    print(f" CUDA-Q: {cq_time:.3f} ms")
                    print(f" Speedup: {results['speedup'][name]:.2f}x")

                except Exception as e:
                    print(f" CUDA-Q: Error - {e}")

        print()

    print("=" * 60)
    print("Summary")
    print("=" * 60)

    print(f"\n{'Circuit':<20} {'FireEcho (ms)':<15} {'cuQuantum (ms)':<15} {'Speedup':<10}")
    print("-" * 60)

    for name in results["fireecho_ms"]:
        fe = results["fireecho_ms"][name]
        cq = results["cuquantum_ms"].get(name)
        sp = results["speedup"].get(name)

        # Compare against None so a legitimate 0.0 is not shown as "N/A".
        cq_str = f"{cq:.3f}" if cq is not None else "N/A"
        sp_str = f"{sp:.2f}x" if sp is not None else "N/A"

        print(f"{name:<20} {fe:<15.3f} {cq_str:<15} {sp_str:<10}")

    print()

    if results["fireecho_ms"]:
        avg_fe = sum(results["fireecho_ms"].values()) / len(results["fireecho_ms"])
        # 8 bytes per amplitude, matching the complex64 state vector above.
        state_size_mb = (2 ** num_qubits * 8) / 1e6
        effective_bandwidth = state_size_mb / (avg_fe / 1000)

        print(f"Average FireEcho time: {avg_fe:.3f} ms")
        print(f"State vector size: {state_size_mb:.2f} MB")
        print(f"Effective bandwidth: {effective_bandwidth:.1f} MB/s")

    print("=" * 60)

    return results
| |
|
| |
|
def run_comprehensive_benchmark():
    """Run the full suite, then the cuQuantum comparison at 12, 16 and 20 qubits.

    A failure in one comparison is printed and does not stop the remaining
    sizes from running.
    """
    run_full_benchmark(max_qubits=20)

    print()

    for qubit_count in (12, 16, 20):
        try:
            compare_cuquantum(qubit_count)
        except Exception as exc:
            # Top-level boundary: report the failure and move to the next size.
            print(f"Error benchmarking {qubit_count} qubits: {exc}")
        print()
| |
|
| |
|
if __name__ == "__main__":
    import sys

    # Command line: "--cuquantum [N]" runs the comparison at N qubits
    # (default 16), "--full" runs everything, anything else runs the
    # standard suite.
    cli_args = sys.argv[1:]
    mode = cli_args[0] if cli_args else None

    if mode == "--cuquantum":
        compare_cuquantum(int(cli_args[1]) if len(cli_args) > 1 else 16)
    elif mode == "--full":
        run_comprehensive_benchmark()
    else:
        run_full_benchmark(max_qubits=20)
| |
|