# FireEcho/quantum/benchmark.py
# Hosting-page header captured with the file (not part of the module):
#   "Joysulem's picture" / "Upload 3258 files" / commit b5bff9c (verified)
"""
FireEcho Quantum Gold - Benchmarks
Performance benchmarks comparing FireEcho Quantum Gold against
cuQuantum (when available) and validating correctness.
Benchmarks:
1. Single-qubit gate throughput
2. Two-qubit gate (CNOT) throughput
3. QFT circuit scaling
4. Random circuit performance
5. GHZ state preparation
6. Measurement sampling speed
"""
import torch
import time
import math
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
# FireEcho Quantum imports
from .circuit import QuantumCircuit
from .simulator import QuantumSimulator, StateVector
from .algorithms import bell_state, ghz_state, qft, random_circuit
from .measurement import sample, expectation_value
@dataclass
class BenchmarkResult:
    """Container for the outcome of one benchmark or validation run.

    The custom ``__repr__`` renders a one-line summary with a pass/fail
    marker, which is what the benchmark runners in this module print.
    """
    # Human-readable label of the benchmark or validation test.
    name: str
    # Number of qubits in the simulated register.
    num_qubits: int
    # Wall-clock time in milliseconds (validation-only results pass 0).
    time_ms: float
    # Throughput; the sampling benchmark stores shots per second here.
    gates_per_second: float
    # Estimated state-vector footprint in megabytes.
    memory_mb: float
    # Whether the correctness check for this run passed.
    correct: bool
    # Optional free-form extras (gate counts, deviations, ...). The default
    # is None, so the annotation is Optional rather than a bare Dict.
    details: Optional[Dict] = None

    def __repr__(self) -> str:
        """Return ``<status> <name> (<n>q): <ms>ms, <M gates/s>``."""
        status = "✅" if self.correct else "❌"
        return (
            f"{status} {self.name} ({self.num_qubits}q): "
            f"{self.time_ms:.2f}ms, {self.gates_per_second/1e6:.2f}M gates/s"
        )
def _time_circuit(sim: QuantumSimulator, circuit: QuantumCircuit,
warmup: int = 3, iters: int = 10) -> float:
"""Time circuit execution with warmup."""
# Warmup
for _ in range(warmup):
_ = sim.run(circuit)
torch.cuda.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(iters):
_ = sim.run(circuit)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start
return (elapsed / iters) * 1000 # ms
def benchmark_single_qubit_gates(num_qubits: int = 20, num_gates: int = 100) -> BenchmarkResult:
    """
    Benchmark single-qubit gate throughput.

    Builds ``num_gates`` layers, each applying a Hadamard to every qubit,
    times the circuit, and validates the final state.

    Args:
        num_qubits: Width of the register.
        num_gates: Number of Hadamard layers (one H per qubit per layer).

    Returns:
        BenchmarkResult with timing, throughput and the validation verdict.
    """
    sim = QuantumSimulator()
    qc = QuantumCircuit(num_qubits, "single_qubit_benchmark")

    for _ in range(num_gates):
        for q in range(num_qubits):
            qc.h(q)

    total_gates = num_gates * num_qubits
    time_ms = _time_circuit(sim, qc)

    # Validate: H^2 = I, so an even number of layers returns to |0...0>
    # (|amp[0]| = 1). An odd number leaves every qubit in |+>, i.e. a
    # uniform superposition where |amp[0]| = 2^(-n/2). Checking only the
    # even case would report a correct odd-layer run as failed.
    state = sim.run(qc)
    if num_gates % 2 == 0:
        expected_amp0 = 1.0
    else:
        expected_amp0 = 2.0 ** (-num_qubits / 2)
    amp0 = state.amplitudes[0].abs().item()
    # 1% relative tolerance (same slack as the former "> 0.99" check).
    correct = abs(amp0 - expected_amp0) < 0.01 * expected_amp0

    return BenchmarkResult(
        name="Single-Qubit Gates (H)",
        num_qubits=num_qubits,
        time_ms=time_ms,
        gates_per_second=total_gates / (time_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,  # complex64 = 8 bytes
        correct=correct,
        details={"total_gates": total_gates}
    )
def benchmark_two_qubit_gates(num_qubits: int = 20, num_layers: int = 10) -> BenchmarkResult:
    """
    Benchmark two-qubit gate (CNOT) throughput.

    Puts every qubit in superposition with a Hadamard, then applies
    ``num_layers`` layers of nearest-neighbour CNOTs (q -> q + 1).

    Args:
        num_qubits: Width of the register.
        num_layers: Number of CNOT layers.

    Returns:
        BenchmarkResult with timing, throughput and the validation verdict.
    """
    sim = QuantumSimulator()
    qc = QuantumCircuit(num_qubits, "two_qubit_benchmark")

    # Initialize to superposition
    for q in range(num_qubits):
        qc.h(q)

    # CNOT layers
    for _ in range(num_layers):
        for q in range(num_qubits - 1):
            qc.cx(q, q + 1)

    total_gates = num_qubits + num_layers * (num_qubits - 1)
    time_ms = _time_circuit(sim, qc)

    # Basic validation: the state must stay normalized. The check is
    # two-sided so that an over-normalized state (sum > 1) is also rejected.
    state = sim.run(qc)
    total_probability = state.probabilities().sum().item()
    correct = abs(total_probability - 1.0) < 0.01

    return BenchmarkResult(
        name="Two-Qubit Gates (CNOT)",
        num_qubits=num_qubits,
        time_ms=time_ms,
        gates_per_second=total_gates / (time_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=correct,
        details={"total_gates": total_gates, "num_layers": num_layers}
    )
def benchmark_qft(num_qubits: int = 16) -> BenchmarkResult:
    """
    Benchmark Quantum Fourier Transform.

    QFT has O(n²) gates and is a key subroutine in quantum algorithms.

    Args:
        num_qubits: Width of the register the QFT acts on.

    Returns:
        BenchmarkResult with timing, throughput and the validation verdict;
        ``details`` carries the gate count and the largest deviation from
        the uniform distribution.
    """
    sim = QuantumSimulator()
    qc = qft(num_qubits)

    total_gates = qc.size
    time_ms = _time_circuit(sim, qc)

    # Validate: QFT of |0...0⟩ should give uniform superposition
    state = sim.run(qc)
    probs = state.probabilities()
    expected_prob = 1.0 / (2 ** num_qubits)

    # Check uniformity. A fixed absolute tolerance of 1e-5 is about as
    # large as the expected probability itself at 16 qubits (~1.5e-5), so
    # it would accept grossly non-uniform states. Scale the tolerance with
    # the expected value (1% relative), never looser than the old 1e-5.
    max_deviation = (probs - expected_prob).abs().max().item()
    tolerance = min(1e-5, 1e-2 * expected_prob)
    correct = max_deviation < tolerance

    return BenchmarkResult(
        name="Quantum Fourier Transform",
        num_qubits=num_qubits,
        time_ms=time_ms,
        gates_per_second=total_gates / (time_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=correct,
        details={"total_gates": total_gates, "max_deviation": max_deviation}
    )
def benchmark_ghz(num_qubits: int = 20) -> BenchmarkResult:
    """
    Time the preparation of an n-qubit GHZ state.

    The circuit is one Hadamard on qubit 0 followed by a CNOT from qubit 0
    onto each remaining qubit, i.e. n gates in total. The run counts as
    correct when the first and the last basis state each carry a
    probability within 0.01 of one half.
    """
    simulator = QuantumSimulator()

    circuit = QuantumCircuit(num_qubits, "ghz")
    circuit.h(0)
    for target in range(1, num_qubits):
        circuit.cx(0, target)

    gate_count = num_qubits
    elapsed_ms = _time_circuit(simulator, circuit)

    # Only the all-zeros (first) and all-ones (last) entries should be populated.
    distribution = simulator.run(circuit).probabilities()
    p_zeros = distribution[0].item()
    p_ones = distribution[-1].item()
    is_ghz = all(abs(p - 0.5) < 0.01 for p in (p_zeros, p_ones))

    return BenchmarkResult(
        name="GHZ State Preparation",
        num_qubits=num_qubits,
        time_ms=elapsed_ms,
        gates_per_second=gate_count / (elapsed_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=is_ghz,
        details={"p_zeros": p_zeros, "p_ones": p_ones},
    )
def benchmark_random_circuit(num_qubits: int = 16, depth: int = 20) -> BenchmarkResult:
    """
    Time the execution of a seeded random circuit.

    The circuit comes from ``random_circuit`` with a fixed seed of 42, so
    repeated calls with the same arguments time the same circuit. The run
    counts as correct when the output probabilities sum to 1 within 1e-5.
    """
    simulator = QuantumSimulator()
    circuit = random_circuit(num_qubits, depth, seed=42)
    gate_count = circuit.size

    elapsed_ms = _time_circuit(simulator, circuit)

    # Normalization is the only property checked for a random circuit.
    total_probability = simulator.run(circuit).probabilities().sum().item()
    normalized = abs(total_probability - 1.0) < 1e-5

    return BenchmarkResult(
        name="Random Circuit",
        num_qubits=num_qubits,
        time_ms=elapsed_ms,
        gates_per_second=gate_count / (elapsed_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=normalized,
        details={"depth": depth, "total_gates": gate_count},
    )
def benchmark_sampling(num_qubits: int = 20, shots: int = 10000) -> BenchmarkResult:
    """
    Benchmark measurement sampling speed.

    Prepares a GHZ state via ``ghz_state`` and times a single call to
    ``sample`` (no warmup run is performed).

    Args:
        num_qubits: Width of the GHZ state to sample from.
        shots: Number of measurement shots requested.

    Returns:
        BenchmarkResult whose ``gates_per_second`` field holds *shots* per
        second, since no gates are applied in the timed section.
    """
    # Create GHZ state
    state = ghz_state(num_qubits)

    # torch.cuda.synchronize() raises on hosts without CUDA, so only
    # synchronize when a device is actually present.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.synchronize()

    start = time.perf_counter()
    counts = sample(state, shots=shots)
    if use_cuda:
        torch.cuda.synchronize()
    time_ms = (time.perf_counter() - start) * 1000

    # Validate: Only "0...0" and "1...1" outcomes. An empty result is a
    # subset of anything, so it is explicitly rejected whenever shots were
    # requested instead of passing vacuously.
    valid_outcomes = {'0' * num_qubits, '1' * num_qubits}
    observed = set(counts.keys())
    correct = (shots <= 0 or bool(observed)) and observed.issubset(valid_outcomes)

    return BenchmarkResult(
        name="Measurement Sampling",
        num_qubits=num_qubits,
        time_ms=time_ms,
        gates_per_second=shots / (time_ms / 1000),
        memory_mb=(2 ** num_qubits * 8) / 1e6,
        correct=correct,
        details={"shots": shots, "unique_outcomes": len(counts)}
    )
def validate_gates() -> List[BenchmarkResult]:
    """
    Validate a handful of gates against their analytically known output.

    Each check builds a tiny circuit, runs it, and compares selected
    amplitudes with the expected value to within 1e-5. Timing, throughput
    and memory fields of the returned results are set to 0 because nothing
    is measured here.

    Returns:
        One BenchmarkResult per check (Hadamard, Pauli-X, Pauli-Z, CNOT,
        Bell State, Rz Gate), in that order.
    """
    results = []
    sim = QuantumSimulator()
    tol = 1e-5
    inv_sqrt2 = 1 / math.sqrt(2)

    # Test Hadamard: H|0> has amplitude 1/sqrt(2) on |0>.
    qc = QuantumCircuit(1)
    qc.h(0)
    state = sim.run(qc)
    h_correct = abs(state.amplitudes[0].item() - inv_sqrt2) < tol
    results.append(BenchmarkResult("Hadamard", 1, 0, 0, 0, h_correct))

    # Test X: X|0> = |1>.
    qc = QuantumCircuit(1)
    qc.x(0)
    state = sim.run(qc)
    x_correct = abs(state.amplitudes[1].item() - 1.0) < tol
    results.append(BenchmarkResult("Pauli-X", 1, 0, 0, 0, x_correct))

    # Test Z: Z H|0> flips the sign of the |1> amplitude.
    qc = QuantumCircuit(1)
    qc.h(0)
    qc.z(0)
    state = sim.run(qc)
    z_correct = abs(state.amplitudes[1].item() + inv_sqrt2) < tol
    results.append(BenchmarkResult("Pauli-Z", 1, 0, 0, 0, z_correct))

    # Test CNOT
    qc = QuantumCircuit(2)
    qc.x(0)  # |10⟩
    qc.cx(0, 1)  # Should give |11⟩
    state = sim.run(qc)
    cnot_correct = abs(state.amplitudes[3].item() - 1.0) < tol  # |11⟩ = index 3
    results.append(BenchmarkResult("CNOT", 2, 0, 0, 0, cnot_correct))

    # Test Bell state: only magnitudes of the |00> and |11> amplitudes are checked.
    state = bell_state(0)
    bell_correct = (
        abs(abs(state.amplitudes[0].item()) - inv_sqrt2) < tol and
        abs(abs(state.amplitudes[3].item()) - inv_sqrt2) < tol
    )
    results.append(BenchmarkResult("Bell State", 2, 0, 0, 0, bell_correct))

    # Test RZ: Rz(π) on (|0⟩ + |1⟩)/√2 gives (|0⟩ - |1⟩)/√2 up to a global
    # phase. A normalization check alone would pass for any unitary, so
    # verify the physically meaningful part instead: both amplitudes have
    # magnitude 1/√2 and their relative phase is -1 (amp0 + amp1 == 0).
    # This holds regardless of the global-phase convention used for Rz.
    qc = QuantumCircuit(1)
    qc.h(0)
    qc.rz(math.pi, 0)
    state = sim.run(qc)
    amp0 = state.amplitudes[0].item()
    amp1 = state.amplitudes[1].item()
    rz_correct = (
        abs(abs(amp0) - inv_sqrt2) < tol and
        abs(amp0 + amp1) < tol
    )
    results.append(BenchmarkResult("Rz Gate", 1, 0, 0, 0, rz_correct))

    return results
def run_full_benchmark(max_qubits: int = 20) -> Dict[str, List[BenchmarkResult]]:
    """
    Execute every validation check and benchmark, printing a report.

    Args:
        max_qubits: Largest register size to run; predefined sizes above
            this value are skipped.

    Returns:
        Mapping from category name ("validation", "single_qubit",
        "two_qubit", "algorithms", "sampling") to its list of results.
    """
    banner = "=" * 70

    print(banner)
    print("FireEcho Quantum Gold - Benchmark Suite")
    print(banner)

    # Report the device at index 0.
    device = torch.cuda.get_device_properties(0)
    print(f"GPU: {device.name}")
    print(f"SM Version: {device.major}.{device.minor}")
    print(f"VRAM: {device.total_memory / 1e9:.1f} GB")
    print(banner)
    print()

    results = {
        "validation": [],
        "single_qubit": [],
        "two_qubit": [],
        "algorithms": [],
        "sampling": [],
    }

    # Gate validation comes first.
    print("Running gate validation...")
    results["validation"] = validate_gates()
    for outcome in results["validation"]:
        print(f" {outcome}")
    print()

    # Each section: heading, result category, and (benchmark, sizes) pairs
    # executed in the listed order.
    sections = (
        ("Single-qubit gate benchmarks:", "single_qubit",
         ((benchmark_single_qubit_gates, (10, 15, 20)),)),
        ("Two-qubit gate benchmarks:", "two_qubit",
         ((benchmark_two_qubit_gates, (10, 15, 20)),)),
        ("Algorithm benchmarks:", "algorithms",
         ((benchmark_qft, (8, 12, 16)),
          (benchmark_ghz, (10, 15, 20)),
          (lambda width: benchmark_random_circuit(width, depth=20), (10, 14, 18)))),
        ("Sampling benchmarks:", "sampling",
         ((benchmark_sampling, (15, 20)),)),
    )
    for heading, category, plans in sections:
        print(heading)
        for bench, sizes in plans:
            for width in sizes:
                if width > max_qubits:
                    continue
                outcome = bench(width)
                results[category].append(outcome)
                print(f" {outcome}")
        print()

    # Summary over every category, validation included.
    print(banner)
    print("Summary")
    print(banner)

    every_result = [item for bucket in results.values() for item in bucket]
    total_tests = len(every_result)
    passed = sum(1 for item in every_result if item.correct)
    all_correct = all(item.correct for item in every_result)

    print(f"Tests: {passed}/{total_tests} passed")
    if all_correct:
        print("Status: ✅ ALL PASSED")
    else:
        print("Status: ❌ SOME FAILED")

    # Best throughput among correct gate/algorithm runs (sampling excluded).
    candidates = []
    for category in ("single_qubit", "two_qubit", "algorithms"):
        candidates.extend(item for item in results[category] if item.correct)
    if candidates:
        best = max(candidates, key=lambda item: item.gates_per_second)
        print(f"Best throughput: {best.gates_per_second/1e6:.2f}M gates/s ({best.name})")

    print(banner)
    return results
def compare_cuquantum(num_qubits: int = 16) -> Optional[Dict]:
    """
    Compare FireEcho Quantum Gold against cuQuantum/CUDA-Q (if available).

    Based on KTH paper "Harnessing CUDA-Q's MPS for Tensor Network Simulations".

    Three circuits (GHZ, QFT, seeded random) are timed on FireEcho. When
    cuQuantum is importable, a cuStateVec timing is recorded as well; when
    only CUDA-Q is importable, its GHZ kernel is timed for the GHZ circuit.

    Args:
        num_qubits: Register width used for every test circuit.

    Returns:
        A dict with keys "num_qubits", "fireecho_ms", "cuquantum_ms" and
        "speedup" (the last three keyed by circuit name). The dict is
        returned even when neither cuQuantum nor CUDA-Q is installed; in
        that case only the FireEcho timings are filled in.
    """
    # Check for cuQuantum availability (the imports are probes only).
    cuquantum_available = False
    cudaqsim_available = False

    try:
        import cuquantum  # noqa: F401
        cuquantum_available = True
    except ImportError:
        pass

    try:
        import cudaq  # noqa: F401
        cudaqsim_available = True
    except ImportError:
        pass

    print("=" * 60)
    print("FireEcho Quantum Gold vs cuQuantum Comparison")
    print(f"Testing with {num_qubits} qubits")
    print("=" * 60)
    print()

    if not cuquantum_available and not cudaqsim_available:
        print("Neither cuQuantum nor CUDA-Q installed.")
        print("Install with: pip install cuquantum-python cudaq")
        print()
        print("Running FireEcho-only benchmark for reference...")
        print()

    results = {
        "num_qubits": num_qubits,
        "fireecho_ms": {},
        "cuquantum_ms": {},
        "speedup": {},
    }

    # Test circuits
    test_circuits = [
        ("GHZ State", "ghz"),
        ("QFT", "qft"),
        ("Random Circuit", "random"),
    ]

    sim = QuantumSimulator()

    for name, circuit_type in test_circuits:
        print(f"Testing {name}...")

        # Create circuit
        if circuit_type == "ghz":
            qc = QuantumCircuit(num_qubits, "ghz")
            qc.h(0)
            for i in range(1, num_qubits):
                qc.cx(0, i)
        elif circuit_type == "qft":
            qc = qft(num_qubits)
        else:  # random
            qc = random_circuit(num_qubits, depth=20, seed=42)

        # Warm up and time FireEcho with the module's shared timing helper
        # (3 warmup runs, mean of 10 timed runs).
        fe_time = _time_circuit(sim, qc, warmup=3, iters=10)

        results["fireecho_ms"][name] = fe_time
        print(f" FireEcho: {fe_time:.3f} ms")

        # Benchmark cuQuantum if available
        if cuquantum_available:
            try:
                # Use cuQuantum's state vector simulator
                from cuquantum import custatevec as cusv

                # Create state vector
                n_qubits = num_qubits
                sv_size = 2 ** n_qubits
                d_sv = torch.zeros(sv_size, dtype=torch.complex64, device='cuda')
                d_sv[0] = 1.0

                # The Hadamard matrix does not change between iterations.
                h_matrix = torch.tensor(
                    [[1, 1], [1, -1]], dtype=torch.complex64, device='cuda'
                ) / math.sqrt(2)

                # Apply gates using cuStateVec
                # (Simplified - full implementation would translate circuit)
                handle = cusv.create()
                try:
                    # Warmup
                    for _ in range(3):
                        d_sv_copy = d_sv.clone()
                        # Apply Hadamard to first qubit
                        # NOTE(review): this argument list looks one short of
                        # the documented apply_matrix signature (workspace
                        # pointer AND workspace size) — confirm against the
                        # installed cuQuantum version.
                        cusv.apply_matrix(
                            handle, d_sv_copy.data_ptr(), cusv.cudaDataType.CUDA_C_32F,
                            n_qubits, h_matrix.data_ptr(), cusv.cudaDataType.CUDA_C_32F,
                            cusv.MatrixLayout.ROW, 0, [0], 1, [], [], 0, cusv.ComputeType.COMPUTE_32F,
                            0
                        )
                    torch.cuda.synchronize()

                    # NOTE(review): the timed loop only clones the state
                    # vector; no gates are applied, so this number is not an
                    # execution time for the circuit named above.
                    start = time.perf_counter()
                    for _ in range(10):
                        d_sv_copy = d_sv.clone()
                        # Apply operations...
                    torch.cuda.synchronize()
                    cq_time = (time.perf_counter() - start) / 10 * 1000
                finally:
                    # Release the handle even if a cuStateVec call raised.
                    cusv.destroy(handle)

                results["cuquantum_ms"][name] = cq_time
                results["speedup"][name] = cq_time / fe_time
                print(f" cuQuantum: {cq_time:.3f} ms")
                print(f" Speedup: {results['speedup'][name]:.2f}x")

            except Exception as e:
                print(f" cuQuantum: Error - {e}")
                results["cuquantum_ms"][name] = None

        # Benchmark CUDA-Q if available
        if cudaqsim_available and not cuquantum_available:
            if circuit_type != "ghz":
                # Only a GHZ kernel exists below; timing it under the QFT or
                # random-circuit label would report a meaningless speedup.
                print(" CUDA-Q: skipped (only a GHZ kernel is implemented)")
            else:
                try:
                    import cudaq

                    # Set target to nvidia (state vector)
                    cudaq.set_target('nvidia')

                    # Define kernel
                    @cudaq.kernel
                    def ghz_kernel(n: int):
                        q = cudaq.qvector(n)
                        h(q[0])
                        for i in range(1, n):
                            cx(q[0], q[i])

                    # Warmup
                    for _ in range(3):
                        cudaq.sample(ghz_kernel, num_qubits)
                    torch.cuda.synchronize()

                    start = time.perf_counter()
                    for _ in range(10):
                        cudaq.sample(ghz_kernel, num_qubits)
                    torch.cuda.synchronize()
                    cq_time = (time.perf_counter() - start) / 10 * 1000

                    results["cuquantum_ms"][name] = cq_time
                    results["speedup"][name] = cq_time / fe_time
                    print(f" CUDA-Q: {cq_time:.3f} ms")
                    print(f" Speedup: {results['speedup'][name]:.2f}x")

                except Exception as e:
                    print(f" CUDA-Q: Error - {e}")

        print()

    # Summary
    print("=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"\n{'Circuit':<20} {'FireEcho (ms)':<15} {'cuQuantum (ms)':<15} {'Speedup':<10}")
    print("-" * 60)

    for name in results["fireecho_ms"]:
        fe = results["fireecho_ms"][name]
        cq = results["cuquantum_ms"].get(name)
        sp = results["speedup"].get(name)

        # Compare against None explicitly: a legitimate 0.0 must not be
        # rendered as "N/A".
        cq_str = f"{cq:.3f}" if cq is not None else "N/A"
        sp_str = f"{sp:.2f}x" if sp is not None else "N/A"

        print(f"{name:<20} {fe:<15.3f} {cq_str:<15} {sp_str:<10}")

    print()

    # Performance analysis
    if results["fireecho_ms"]:
        avg_fe = sum(results["fireecho_ms"].values()) / len(results["fireecho_ms"])
        state_size_mb = (2 ** num_qubits * 8) / 1e6
        effective_bandwidth = state_size_mb / (avg_fe / 1000)  # MB/s

        print(f"Average FireEcho time: {avg_fe:.3f} ms")
        print(f"State vector size: {state_size_mb:.2f} MB")
        print(f"Effective bandwidth: {effective_bandwidth:.1f} MB/s")

    print("=" * 60)

    return results
def run_comprehensive_benchmark():
    """Run all benchmarks including cuQuantum comparison.

    First runs the standard suite with up to 20 qubits, then the
    cuQuantum/CUDA-Q comparison at 12, 16 and 20 qubits. A failure at one
    comparison size is printed and does not stop the remaining sizes.
    Nothing is returned; all output goes to stdout.
    """
    # Standard benchmarks (the returned results are not used here).
    run_full_benchmark(max_qubits=20)

    print()

    # cuQuantum comparison for different sizes
    for n in [12, 16, 20]:
        try:
            compare_cuquantum(n)
        except Exception as e:
            # Top-level boundary: report and continue with the next size.
            print(f"Error benchmarking {n} qubits: {e}")
        print()
if __name__ == "__main__":
    import sys

    # Command line: "--cuquantum [N]" runs the comparison at N qubits
    # (default 16), "--full" runs everything, anything else runs the
    # standard suite.
    cli_args = sys.argv[1:]
    mode = cli_args[0] if cli_args else None

    if mode == "--cuquantum":
        qubit_count = int(cli_args[1]) if len(cli_args) > 1 else 16
        compare_cuquantum(qubit_count)
    elif mode == "--full":
        run_comprehensive_benchmark()
    else:
        run_full_benchmark(max_qubits=20)