import os
import sys
import json
import time
import socket
import platform
import subprocess
import traceback
import datetime
from typing import Any, Dict

from fastapi import APIRouter

router = APIRouter()


# =========================================================
# SAFE EXECUTION WRAPPER (NEVER FAIL)
# =========================================================
def safe_run(name, func):
    """Run *func*, timing it, and return a status dict instead of raising.

    Returns ``{"status": "ok", "duration_sec": ..., "data": <result>}`` on
    success, or ``{"status": "error", "duration_sec": ..., "error": ...,
    "traceback": ...}`` if *func* raises. Never propagates the exception.
    (*name* is currently unused but kept for interface compatibility.)
    """
    start = time.time()
    try:
        # BUG FIX: the original computed duration_sec inside the dict literal
        # *before* evaluating func() (dict values evaluate left-to-right),
        # so the success-path duration was always ~0. Call func() first.
        data = func()
        return {
            "status": "ok",
            "duration_sec": round(time.time() - start, 3),
            "data": data,
        }
    except Exception as e:
        return {
            "status": "error",
            "duration_sec": round(time.time() - start, 3),
            "error": str(e),
            "traceback": traceback.format_exc(limit=2),
        }


# =========================================================
# COMMAND RUNNER
# =========================================================
def run_cmd(cmd):
    """Run *cmd* (a list of argv strings, shell=False) with a 25 s timeout.

    Returns a dict with returncode/stdout/stderr on completion, or an
    ``error`` key if the command is missing, times out, etc.
    """
    try:
        r = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=25,
        )
        return {
            "cmd": " ".join(cmd),
            "returncode": r.returncode,
            "stdout": r.stdout.strip(),
            "stderr": r.stderr.strip(),
        }
    except Exception as e:
        return {"cmd": " ".join(cmd), "error": str(e)}


# =========================================================
# SYSTEM INFO
# =========================================================
def system_info():
    """Basic host/interpreter facts for the debug report."""
    return {
        # FIX: datetime.utcnow() is deprecated (3.12+) and naive;
        # use an explicit timezone-aware UTC timestamp instead.
        "time_utc": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "hostname": socket.gethostname(),
        "platform": platform.platform(),
        "python": sys.version,
        "executable": sys.executable,
        "cwd": os.getcwd(),
        "pid": os.getpid(),
        "cpu_count": os.cpu_count(),
    }


# =========================================================
# ENVIRONMENT VARIABLES (MASK SECRETS)
# =========================================================
def env_info():
    """Copy of os.environ with likely-secret values masked.

    Any variable whose name contains token/secret/password/key
    (case-insensitive) is replaced with ``***hidden***``.
    """
    sensitive = ("token", "secret", "password", "key")
    return {
        k: "***hidden***" if any(s in k.lower() for s in sensitive) else v
        for k, v in os.environ.items()
    }


# =========================================================
# GPU / TORCH INFO
# =========================================================
def gpu_info():
    """Torch/CUDA availability and (if present) device-0 memory stats."""
    try:
        import torch

        data = {
            "cuda_available": torch.cuda.is_available(),
            "device_count": torch.cuda.device_count(),
            "torch_version": torch.__version__,
        }
        if torch.cuda.is_available():
            data.update({
                "device_name": torch.cuda.get_device_name(0),
                "memory_allocated": torch.cuda.memory_allocated(0),
                "memory_reserved": torch.cuda.memory_reserved(0),
                "memory_total": torch.cuda.get_device_properties(0).total_memory,
            })
        return data
    except Exception as e:
        # torch not installed (or broken) — report rather than fail.
        return {"torch_error": str(e)}


# =========================================================
# HUGGING FACE CACHE
# =========================================================
def hf_cache_info():
    """Location/existence of the HF cache plus a sample of its entries."""
    cache_dir = os.getenv("HF_HOME", "/tmp/huggingface")
    return {
        "cache_dir": cache_dir,
        "exists": os.path.exists(cache_dir),
        # Cap at 30 entries to keep the report small.
        "files_sample": os.listdir(cache_dir)[:30] if os.path.exists(cache_dir) else [],
    }


# =========================================================
# INSTALLED PACKAGES
# =========================================================
def packages_info():
    """Sorted ``name==version`` list of installed distributions.

    FIX: uses stdlib importlib.metadata instead of the deprecated
    pkg_resources API (removed from modern setuptools).
    """
    try:
        from importlib import metadata

        return sorted(
            f"{dist.metadata['Name']}=={dist.version}"
            for dist in metadata.distributions()
        )
    except Exception as e:
        return {"error": str(e)}


# =========================================================
# DISK + MEMORY
# =========================================================
def disk_info():
    """`df -h` and `free -h` output (Linux tools; errors are captured)."""
    return {
        "disk_usage": run_cmd(["df", "-h"]),
        "memory": run_cmd(["free", "-h"]),
    }


# =========================================================
# NETWORK INFO
# =========================================================
def network_info():
    """Hostname and its resolved IP (may raise; caller wraps in safe_run)."""
    return {
        "hostname": socket.gethostname(),
        "ip": socket.gethostbyname(socket.gethostname()),
    }


# =========================================================
# HF CLI CHECKS
# =========================================================
def hf_cli_info():
    """Results of huggingface-cli identity and cache-scan commands."""
    return {
        "whoami": run_cmd(["huggingface-cli", "whoami"]),
        "scan_cache": run_cmd(["huggingface-cli", "scan-cache"]),
    }
# =========================================================
# PYTHON RUNTIME STATE
# =========================================================
def runtime_info():
    """Live interpreter state: thread names and loaded-module count."""
    import threading

    return {
        "active_threads": [t.name for t in threading.enumerate()],
        "loaded_modules_count": len(sys.modules),
    }


# =========================================================
# SPACE DETECTION (HF SPECIFIC)
# =========================================================
def hf_space_info():
    """Hugging Face Space metadata read from SPACE_* env vars (None if unset)."""
    return {
        "is_space": "SPACE_ID" in os.environ,
        "space_id": os.getenv("SPACE_ID"),
        "hardware": os.getenv("SPACE_HARDWARE"),
        "sdk": os.getenv("SPACE_SDK"),
    }


# =========================================================
# MAIN UNIVERSAL DEBUG ENDPOINT
# =========================================================
@router.get("/debug/full", include_in_schema=False)
def full_debug() -> Dict[str, Any]:
    """Aggregate every diagnostic probe into one JSON report.

    Each probe runs under safe_run, so a failing probe yields an
    error entry instead of breaking the endpoint.
    """
    report = {
        "system": safe_run("system", system_info),
        "environment": safe_run("env", env_info),
        "gpu": safe_run("gpu", gpu_info),
        "disk": safe_run("disk", disk_info),
        "network": safe_run("network", network_info),
        "hf_space": safe_run("hf_space", hf_space_info),
        "hf_cache": safe_run("hf_cache", hf_cache_info),
        "hf_cli": safe_run("hf_cli", hf_cli_info),
        "runtime": safe_run("runtime", runtime_info),
        "packages": safe_run("packages", packages_info),
    }
    return {
        "status": "ok",
        # FIX: datetime.utcnow() is deprecated (3.12+) and naive;
        # emit a timezone-aware UTC timestamp instead.
        "generated_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "report": report,
    }