"""
XERV CRAYON V5.1.0 - Production Omni-Backend Tokenizer
=======================================================
Copy this ENTIRE script into a Google Colab cell and run it.

IMPORTANT: Enable GPU runtime first:
Runtime -> Change runtime type -> GPU (T4/V100/A100)

WHAT'S NEW in v4.3.0:
- Fixed ROCm/HIP compilation: Now properly uses hipcc instead of g++
- Full support for AMD GPUs (MI250/MI300, Radeon RX 7000+)
- Production-grade error handling across all backends
- Python 3.10-3.13 fully supported
"""

import glob
import importlib
import os
import shutil
import subprocess
import sys
import time

# NOTE(review): this banner reports V4.3.0 while the module docstring title
# says V5.1.0 -- confirm which release this script actually targets.
print("=" * 70)
print("XERV CRAYON V4.3.0 INSTALLATION AND BENCHMARKS")
print("=" * 70)

# 1. Environment Check
# Informational only: reports the PyTorch/CUDA state and which GPU compilers
# are reachable on PATH. Nothing in this step aborts the script.
print("[1/7] Checking environment...")
try:
    import torch
except ImportError:
    print("      PyTorch not found (will be installed)")
else:
    print(f"      PyTorch: {torch.__version__}")
    if torch.cuda.is_available():
        print(f"      CUDA: {torch.version.cuda} ({torch.cuda.get_device_name(0)})")
        print("      * Smart Build: Will compile ONLY for this GPU architecture")
    else:
        print("      CUDA: Not available (CPU only)")

# Check for NVCC (NVIDIA) or hipcc (AMD).
# shutil.which does the PATH lookup in-process, so this no longer depends on
# an external `which` executable being installed.
for compiler_label, compiler_exe in (("NVCC", "nvcc"), ("HIPCC (ROCm)", "hipcc")):
    compiler_path = shutil.which(compiler_exe)
    print(f"      {compiler_label}: {compiler_path or 'Not found'}")


# 2. Build Dependencies
# Installed up front into the interpreter running this script; step 5 invokes
# pip with --no-build-isolation, so the build sees whatever is installed here.
print("\n[2/7] Installing build dependencies...")
build_requirements = ["ninja", "packaging", "wheel", "setuptools>=68.0"]
quiet_pip_install = [sys.executable, "-m", "pip", "install", "-q"]
# check_call raises CalledProcessError if pip exits non-zero.
subprocess.check_call(quiet_pip_install + build_requirements)
print("      Done (ninja, packaging, wheel)")


# 3. Clean Old State
# Best-effort cleanup: every failure in this step is deliberately ignored.
print("\n[3/7] Cleaning previous installations...")

# Uninstall through the interpreter running this script so the removal hits
# the same environment that the install steps (sys.executable -m pip) target;
# a bare `pip` on PATH may belong to a different Python.
subprocess.run(
    [sys.executable, "-m", "pip", "uninstall", "-y", "xerv-crayon", "crayon"],
    stderr=subprocess.DEVNULL,  # hide "not installed" noise
    check=False,
)

# Remove earlier checkouts and local build artefacts (relative patterns are
# resolved against the current working directory).
for stale_pattern in ("/tmp/crayon*", "build", "dist", "src/*.egg-info"):
    for stale_path in glob.glob(stale_pattern):
        if os.path.isdir(stale_path) and not os.path.islink(stale_path):
            shutil.rmtree(stale_path, ignore_errors=True)
        else:
            try:
                os.remove(stale_path)
            except OSError:
                pass  # already gone or not removable; cleanup is best-effort


# 4. Clone Source
print("\n[4/7] Cloning source code...")
# A per-run timestamp gives every run its own checkout directory.
timestamp = int(time.time())
clone_dir = f"/tmp/crayon_{timestamp}"
clone_cmd = [
    "git", "clone", "--depth", "1",
    "https://github.com/Electroiscoding/CRAYON.git",
    clone_dir,
]
try:
    clone_rc = subprocess.run(clone_cmd, check=False).returncode
except FileNotFoundError:
    # git itself is not on PATH; report it the same way as a failed clone.
    clone_rc = 127
if clone_rc != 0:
    print("      FATAL: Git clone failed!")
    sys.exit(1)

# Verify source: show the first line mentioning __version__ in the package's
# __init__.py, and say so explicitly when it cannot be found instead of
# printing an empty line.
version_file = os.path.join(clone_dir, "src", "crayon", "__init__.py")
version_line = ""
try:
    with open(version_file, encoding="utf-8", errors="replace") as version_fh:
        version_line = next(
            (ln.strip() for ln in version_fh if "__version__" in ln), ""
        )
except OSError:
    pass  # reported below as a warning
if version_line:
    print(f"      {version_line}")
else:
    print(f"      WARNING: no __version__ found in {version_file}")


# 5. Build & Install (Streaming Output)
print("\n[5/7] Compiling and Installing (Streaming Logs)...")
print("-" * 70)

build_env = os.environ.copy()
build_env["MAX_JOBS"] = "1"      # Force serial build to prevent OOM
# Only supply a default: a CUDA_HOME already exported by the user (e.g. a
# versioned toolkit directory) must not be overwritten.
build_env.setdefault("CUDA_HOME", "/usr/local/cuda")
# ROCm is auto-detected via /opt/rocm

# Stream output line-by-line. stderr is merged into stdout so the log keeps
# its ordering; errors="replace" prevents a stray undecodable byte in the
# build output from aborting the script with UnicodeDecodeError.
cmd = [sys.executable, "-m", "pip", "install", "-v", "--no-build-isolation", clone_dir]
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    env=build_env,
    text=True,
    errors="replace",
) as process:
    # Iterating the pipe blocks until the next line or EOF, so there is no
    # spinning between the pipe closing and the process exiting.
    for line in process.stdout:
        print(line.rstrip())

# Leaving the `with` block closes the pipe and waits for pip to exit, so the
# return code is final here.
rc = process.returncode
print("-" * 70)

if rc != 0:
    print("\n" + "!" * 70)
    print("FATAL ERROR: Installation failed!")
    print(f"Exit Code: {rc}")
    print("!" * 70)
    sys.exit(1)


# 6. Verification
print("\n[6/7] Verifying installation...")
# The package was installed while this interpreter was already running, so
# the import system's finder caches may not know about it yet; flush them
# before importing.
importlib.invalidate_caches()
# Reset module cache so a previously imported crayon is not reused.
for key in list(sys.modules):
    if "crayon" in key:
        del sys.modules[key]

try:
    import crayon
    print(f"      Success! Installed version: {crayon.get_version()}")
    # `backends` is read again by the benchmark section below.
    backends = crayon.check_backends()
    print(f"      Backends: {backends}")
except ImportError as e:
    print(f"      FATAL: Could not import crayon: {e}")
    sys.exit(1)


# 7. Benchmarks
print("\n" + "=" * 70)
print("BENCHMARKS & TESTING")
print("=" * 70)

from crayon import CrayonVocab

vocab = CrayonVocab(device="auto")
vocab.load_profile("lite")
print(f"\nActive Device: {vocab.device.upper()}")

info = vocab.get_info()
print(f"Backend: {info['backend']}")

# Point out any GPU backend that was reported as available but not selected.
if vocab.device == "cpu":
    for backend_key, backend_label in (("cuda", "CUDA"), ("rocm", "ROCm")):
        if backends.get(backend_key):
            print(
                f"NOTE: Running on CPU but {backend_label} is available. "
                f"Use device='{backend_key}' to force."
            )

# Throughput test
text = "The quick brown fox jumps over the lazy dog."
batch_sizes = [1000, 10000, 50000]
print("\nBatch Throughput:")
for bs in batch_sizes:
    batch = [text] * bs
    # Warmup
    vocab.tokenize(batch[:10])

    # perf_counter is monotonic and high-resolution; time.time() can be
    # coarse or jump with wall-clock adjustments.
    start = time.perf_counter()
    res = vocab.tokenize(batch)
    # Clamp to a tiny positive value so the rates below can never divide by 0.
    dur = max(time.perf_counter() - start, 1e-9)

    toks = sum(len(x) for x in res)
    print(f"  {bs:>8,} docs: {bs/dur:>12,.0f} docs/sec | {toks/dur:>14,.0f} tokens/sec")

# Closing banner followed by a copy-paste usage snippet for the user.
closing_rule = "=" * 70
print("\n" + closing_rule)
print("INSTALLATION COMPLETE!")
print(closing_rule)

quick_start_text = """
Quick Start:
    from crayon import CrayonVocab
    
    vocab = CrayonVocab(device='auto')
    vocab.load_profile('lite')
    
    tokens = vocab.tokenize("Hello, world!")
    print(tokens)

Available Profiles: 'lite', 'code', 'science', 'multilingual', 'arts_commerce'
Available Devices: 'auto', 'cpu', 'cuda', 'rocm'
"""
print(quick_start_text)