tventurella committed on
Commit
559415e
·
verified ·
1 Parent(s): 654c0be

Update scripts/chat_web.py

Browse files
Files changed (1) hide show
  1. scripts/chat_web.py +10 -79
scripts/chat_web.py CHANGED
@@ -136,6 +136,7 @@ logging.basicConfig(
136
  logger = logging.getLogger(__name__)
137
 
138
  device_type = autodetect_device_type() if args.device_type == "" else args.device_type
 
139
 
140
  @dataclass
141
  class Worker:
@@ -267,57 +268,18 @@ def validate_chat_request(request: ChatRequest):
267
  detail=f"max_tokens must be between {MIN_MAX_TOKENS} and {MAX_MAX_TOKENS}"
268
  )
269
 
270
- async def _load_model_background(app: FastAPI):
271
- """Download and load model in background so the server can respond to health checks immediately."""
272
- loop = asyncio.get_event_loop()
273
-
274
- def _download_and_load():
275
- import os
276
- model_dir = os.environ.get("NANOCHAT_BASE_DIR", "/app/nanochat_cache")
277
- checkpoint_dir = os.path.join(model_dir, "chatsft_checkpoints", "d18")
278
- model_file = os.path.join(checkpoint_dir, "model_000070.pt")
279
- model_repo = "tventurella/mr_chatterbox_model"
280
-
281
- # Download if not present
282
- if not os.path.exists(model_file):
283
- print("Downloading model checkpoint...", flush=True)
284
- from huggingface_hub import hf_hub_download
285
- os.makedirs(checkpoint_dir, exist_ok=True)
286
- hf_hub_download(model_repo, "model_000070.pt", local_dir=checkpoint_dir)
287
- hf_hub_download(model_repo, "meta_000070.json", local_dir=checkpoint_dir)
288
- print("Model downloaded.", flush=True)
289
- else:
290
- print("Model checkpoint already present.", flush=True)
291
-
292
- # Initialize compute
293
- print("Initializing compute...", flush=True)
294
- ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
295
-
296
- # Load model
297
- print("Loading model into memory...", flush=True)
298
- pool = WorkerPool(num_gpus=args.num_gpus)
299
- import asyncio as _asyncio
300
- _asyncio.run(pool.initialize(args.source, model_tag=args.model_tag, step=args.step))
301
- return pool
302
-
303
- pool = await loop.run_in_executor(None, _download_and_load)
304
- app.state.worker_pool = pool
305
- app.state.model_ready = True
306
- print(f"Model loaded! Server ready at http://localhost:{args.port}", flush=True)
307
-
308
  @asynccontextmanager
309
  async def lifespan(app: FastAPI):
310
- """Start server immediately, load model in background."""
311
- app.state.model_ready = False
312
- app.state.worker_pool = None
313
- # Start model loading in background
314
- load_task = asyncio.create_task(_load_model_background(app))
315
  # Start periodic log push (every hour)
316
  log_task = asyncio.create_task(periodic_log_push(3600))
317
  yield
318
  # Push any remaining logs on shutdown
319
  log_task.cancel()
320
- load_task.cancel()
321
  push_logs()
322
 
323
  app = FastAPI(lifespan=lifespan)
@@ -330,35 +292,9 @@ app.add_middleware(
330
  allow_headers=["*"],
331
  )
332
 
333
- LOADING_HTML = """<!DOCTYPE html>
334
- <html><head>
335
- <meta charset="UTF-8">
336
- <meta http-equiv="refresh" content="10">
337
- <title>Mr. Chatterbox — Loading</title>
338
- <style>
339
- @import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@700&family=Lora:ital@0;1&display=swap');
340
- body { font-family: 'Lora', Georgia, serif; background: #f5f0e8; color: #2c1810;
341
- display: flex; justify-content: center; align-items: center; min-height: 100vh; margin: 0; }
342
- .box { text-align: center; max-width: 500px; padding: 2rem; }
343
- h1 { font-family: 'Playfair Display', Georgia, serif; color: #722f37; font-size: 2rem; }
344
- p { line-height: 1.7; color: #5c4033; }
345
- .spinner { display: inline-block; width: 40px; height: 40px; border: 3px solid #c4b59a;
346
- border-top-color: #722f37; border-radius: 50%; animation: spin 1s linear infinite; margin: 1rem 0; }
347
- @keyframes spin { to { transform: rotate(360deg); } }
348
- </style></head><body>
349
- <div class="box">
350
- <h1>Mr. Chatterbox</h1>
351
- <div class="spinner"></div>
352
- <p>The gentleman is preparing himself for conversation.<br>
353
- <em>This may take a few minutes on first visit.</em></p>
354
- <p style="font-size:0.85rem; color:#8b7355;">This page will refresh automatically.</p>
355
- </div></body></html>"""
356
-
357
  @app.get("/")
358
  async def root():
359
- """Serve the chat UI, or a loading page if model isn't ready yet."""
360
- if not getattr(app.state, 'model_ready', False):
361
- return HTMLResponse(content=LOADING_HTML)
362
  ui_html_path = os.path.join("nanochat", "ui.html")
363
  with open(ui_html_path, "r", encoding="utf-8") as f:
364
  html_content = f.read()
@@ -432,10 +368,6 @@ async def generate_stream(
432
  async def chat_completions(request: ChatRequest):
433
  """Chat completion endpoint (streaming only) - uses worker pool for multi-GPU."""
434
 
435
- # Block requests while model is still loading
436
- if not getattr(app.state, 'model_ready', False):
437
- raise HTTPException(status_code=503, detail="Model is still loading. Please wait.")
438
-
439
  # Basic validation to prevent abuse
440
  validate_chat_request(request)
441
 
@@ -520,12 +452,11 @@ async def chat_completions(request: ChatRequest):
520
 
521
  @app.get("/health")
522
  async def health():
523
- """Health check endpoint — always returns 200 so HF doesn't time out."""
524
- model_ready = getattr(app.state, 'model_ready', False)
525
  worker_pool = getattr(app.state, 'worker_pool', None)
526
  return {
527
- "status": "ok" if model_ready else "loading",
528
- "ready": model_ready and worker_pool is not None and len(worker_pool.workers) > 0,
529
  "num_gpus": worker_pool.num_gpus if worker_pool else 0,
530
  "available_workers": worker_pool.available_workers.qsize() if worker_pool else 0
531
  }
 
136
  logger = logging.getLogger(__name__)
137
 
138
  device_type = autodetect_device_type() if args.device_type == "" else args.device_type
139
+ ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
140
 
141
  @dataclass
142
  class Worker:
 
268
  detail=f"max_tokens must be between {MIN_MAX_TOKENS} and {MAX_MAX_TOKENS}"
269
  )
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  @asynccontextmanager
272
  async def lifespan(app: FastAPI):
273
+ """Load models on all GPUs on startup."""
274
+ print("Loading nanochat models across GPUs...")
275
+ app.state.worker_pool = WorkerPool(num_gpus=args.num_gpus)
276
+ await app.state.worker_pool.initialize(args.source, model_tag=args.model_tag, step=args.step)
277
+ print(f"Server ready at http://localhost:{args.port}")
278
  # Start periodic log push (every hour)
279
  log_task = asyncio.create_task(periodic_log_push(3600))
280
  yield
281
  # Push any remaining logs on shutdown
282
  log_task.cancel()
 
283
  push_logs()
284
 
285
  app = FastAPI(lifespan=lifespan)
 
292
  allow_headers=["*"],
293
  )
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  @app.get("/")
296
  async def root():
297
+ """Serve the chat UI."""
 
 
298
  ui_html_path = os.path.join("nanochat", "ui.html")
299
  with open(ui_html_path, "r", encoding="utf-8") as f:
300
  html_content = f.read()
 
368
  async def chat_completions(request: ChatRequest):
369
  """Chat completion endpoint (streaming only) - uses worker pool for multi-GPU."""
370
 
 
 
 
 
371
  # Basic validation to prevent abuse
372
  validate_chat_request(request)
373
 
 
452
 
453
  @app.get("/health")
454
  async def health():
455
+ """Health check endpoint."""
 
456
  worker_pool = getattr(app.state, 'worker_pool', None)
457
  return {
458
+ "status": "ok",
459
+ "ready": worker_pool is not None and len(worker_pool.workers) > 0,
460
  "num_gpus": worker_pool.num_gpus if worker_pool else 0,
461
  "available_workers": worker_pool.available_workers.qsize() if worker_pool else 0
462
  }