"""Google Colab deployment utilities. Phase 8: Helpers for running VoxDoc in Colab environments. - GPU runtime auto-detection - Ngrok tunneling for external access - Colab-specific optimizations (memory, paths) Set ``COLAB_MODE=true`` in config or .env to enable. """ import logging import os import sys from typing import Dict, Any, Optional from app.config import settings logger = logging.getLogger(__name__) def is_colab_environment() -> bool: """Detect if running inside Google Colab.""" try: import google.colab # noqa: F401 return True except ImportError: return False def detect_gpu_runtime() -> Dict[str, Any]: """Detect GPU type and available VRAM in Colab.""" import torch info: Dict[str, Any] = { "cuda_available": torch.cuda.is_available(), "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0, "gpu_name": None, "vram_total_gb": 0.0, "vram_free_gb": 0.0, "recommended_quantization": None, } if torch.cuda.is_available(): props = torch.cuda.get_device_properties(0) total_gb = props.total_mem / (1024 ** 3) free_gb = torch.cuda.mem_get_info(0)[0] / (1024 ** 3) info["gpu_name"] = props.name info["vram_total_gb"] = round(total_gb, 2) info["vram_free_gb"] = round(free_gb, 2) # Recommend quantization based on VRAM if total_gb < 8: info["recommended_quantization"] = 4 elif total_gb < 16: info["recommended_quantization"] = 8 else: info["recommended_quantization"] = None # Full precision OK return info def setup_colab_environment() -> Dict[str, str]: """Apply Colab-specific environment optimizations.""" changes = {} if not settings.colab_mode: return changes # Use /content for model cache (Colab persistent storage) if os.path.isdir("/content"): model_dir = "/content/models" os.makedirs(model_dir, exist_ok=True) os.environ.setdefault("HF_HOME", "/content/hf_cache") changes["model_cache"] = model_dir changes["hf_home"] = "/content/hf_cache" # Reduce memory pressure os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128") changes["cuda_alloc_conf"] = "max_split_size_mb:128" # Enable TF32 for A100/T4 (faster matmul) import torch if torch.cuda.is_available(): torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True changes["tf32_enabled"] = "true" logger.info("Colab environment configured: %s", changes) return changes async def start_ngrok_tunnel(port: int = 8000) -> Optional[str]: """Start ngrok tunnel for external access to Colab. Requires ``COLAB_NGROK_TOKEN`` to be set. Returns the public URL or None if unavailable. """ token = settings.colab_ngrok_token if not token: logger.warning("No ngrok token configured; external access unavailable") return None try: from pyngrok import ngrok, conf conf.get_default().auth_token = token tunnel = ngrok.connect(port, "http") public_url = tunnel.public_url logger.info("Ngrok tunnel active: %s -> localhost:%d", public_url, port) return public_url except ImportError: logger.warning("pyngrok not installed; run: pip install pyngrok") return None except Exception as e: logger.error("Failed to start ngrok tunnel: %s", e) return None def get_colab_launch_info() -> Dict[str, Any]: """Get summary info for Colab launch display.""" gpu_info = detect_gpu_runtime() return { "is_colab": is_colab_environment(), "colab_mode_enabled": settings.colab_mode, "gpu": gpu_info, "recommended_settings": { "enable_gpu": gpu_info["cuda_available"], "model_quantization_enabled": gpu_info["recommended_quantization"] is not None, "model_quantization_bits": gpu_info["recommended_quantization"] or 4, "streaming_interval_seconds": 0.5 if gpu_info["cuda_available"] else 4.0, }, }