"""Unified server for HF Spaces: environment + inference + dashboard on port 7860.""" import json import os import sys import time import threading sys.path.insert(0, "/app") from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import FileResponse, JSONResponse import uvicorn from server.app import app as env_app env_app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"], ) # Model state (loaded in background) MODEL_STATE = {"model": None, "tokenizer": None, "ready": False, "error": None} UNTRAINED_SYSTEM = ( "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\n" "You receive an incident ticket with hardware/model/backend context, log excerpts, and specialist opinions.\n" "Some specialists may be wrong. Output a JSON array of actions:\n" ' {"type":"inspect","target":"logs|config|snippet|metrics"}\n' ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n' ' {"type":"apply_fix","fix":""}\n' ' {"type":"submit","root_cause":"","fix":"","justification":""}' ) TRAINED_SYSTEM = ( "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\n" "You are methodical: first inspect logs and config, then query specialists to cross-verify (some lie), then apply a fix and submit.\n\n" "Available actions (output as a JSON array):\n" ' {"type":"inspect","target":"logs"} or "config" or "snippet" or "metrics"\n' ' {"type":"ask_specialist","specialist":"runtime"} or "dispatch" or "kernel" or "loader"\n' ' {"type":"apply_fix","fix":""} -- available fixes: add_whitelist_entry, fix_comm_config, fix_quantization, fix_runtime_path, fix_weight_mapping, relax_arch_check, switch_backend, tune_memory_config, update_driver_config, update_model_config\n' ' {"type":"submit","root_cause":"","fix":"","justification":""}\n\n' "Available root causes: arch_guard, backend_selector, backend_whitelist, distributed_comm, driver_compat, memory_oom, model_config, quantization_error, runtime_loader, weight_layout\n\n" "IMPORTANT: Pick ONE target per inspect, ONE specialist per query. Investigate before submitting. Give a detailed justification.\n\n" "Example output:\n" '[{"type":"inspect","target":"logs"},{"type":"inspect","target":"config"},{"type":"ask_specialist","specialist":"kernel"},' '{"type":"apply_fix","fix":"relax_arch_check"},' '{"type":"submit","root_cause":"arch_guard","fix":"relax_arch_check","justification":"Logs show architecture check failure for SM90. Config confirms guard enabled. 
Kernel specialist confirmed not a kernel issue."}]' ) def load_model_background(): """Load Qwen 1.5B in a background thread so the server starts fast.""" try: print("[Model] Loading Qwen2.5-1.5B-Instruct (CPU)...") t0 = time.time() from transformers import AutoModelForCausalLM, AutoTokenizer import torch model_name = "Qwen/Qwen2.5-1.5B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, device_map="cpu", ) MODEL_STATE["model"] = model MODEL_STATE["tokenizer"] = tokenizer MODEL_STATE["ready"] = True print(f"[Model] Loaded in {time.time()-t0:.1f}s") except Exception as ex: MODEL_STATE["error"] = str(ex) print(f"[Model] Failed to load: {ex}") threading.Thread(target=load_model_background, daemon=True).start() @env_app.post("/generate") async def generate_endpoint(request: Request): body = await request.json() prompt_text = body.get("prompt", "") max_tokens = body.get("max_tokens", 512) mode = body.get("mode", "untrained") if not MODEL_STATE["ready"]: if MODEL_STATE["error"]: return JSONResponse({"error": MODEL_STATE["error"]}, status_code=500) return JSONResponse({"error": "Model still loading, please wait..."}, status_code=503) model = MODEL_STATE["model"] tokenizer = MODEL_STATE["tokenizer"] system = TRAINED_SYSTEM if mode == "trained" else UNTRAINED_SYSTEM messages = [ {"role": "system", "content": system}, {"role": "user", "content": prompt_text}, ] import torch text_input = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = tokenizer(text_input, return_tensors="pt") t0 = time.time() with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=max_tokens, do_sample=True, temperature=0.7, top_p=0.9, pad_token_id=tokenizer.eos_token_id, ) new_tokens = outputs[0][inputs["input_ids"].shape[1]:] text = tokenizer.decode(new_tokens, skip_special_tokens=True) gen_time = time.time() - t0 print(f"[Model] Generated {len(text)} chars in {gen_time:.1f}s (mode={mode})") return JSONResponse({"text": text, "gen_time": gen_time}) @env_app.get("/model_status") async def model_status(): return JSONResponse({ "ready": MODEL_STATE["ready"], "error": MODEL_STATE["error"], }) @env_app.get("/", include_in_schema=False) async def root(): return FileResponse("/app/static/index.html") if __name__ == "__main__": uvicorn.run(env_app, host="0.0.0.0", port=7860)
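

# Example client usage (a minimal sketch, not part of this module): it assumes the
# server is running locally on port 7860 and that the `requests` package is installed;
# the ticket text is a hypothetical placeholder. /model_status reports loading state,
# and /generate accepts "prompt", "mode" ("untrained" or "trained"), and "max_tokens".
#
#   import requests
#
#   # Poll until the background model load has finished
#   print(requests.get("http://localhost:7860/model_status").json())
#
#   # Request a diagnosis with the trained system prompt
#   resp = requests.post(
#       "http://localhost:7860/generate",
#       json={"prompt": "<incident ticket text>", "mode": "trained", "max_tokens": 256},
#   )
#   result = resp.json()
#   print(result["text"], result["gen_time"])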