"""Single-port server for NuMarkdown on an L40S GPU.

Launches a vLLM OpenAI-compatible server as a background subprocess,
proxies ``/v1/*`` requests to it through FastAPI, and mounts a Gradio
test UI at ``/`` — everything served on one exposed port.
"""

import base64
import os
import subprocess
import sys
from io import BytesIO

import gradio as gr
import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse  # FIX: JSONResponse was used below but never imported
from openai import APIConnectionError, OpenAI
from starlette.background import BackgroundTask

# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"
GPU_UTILIZATION = 0.90
MAX_MODEL_LEN = 32768
VLLM_PORT = 8000
EXPOSED_PORT = 7860


# --- STEP 1: LAUNCH vLLM (Background) ---
def start_vllm():
    """Start the vLLM server as a non-blocking background subprocess.

    Non-blocking so the UI comes up immediately while model weights
    download. An environment-variable sentinel (``VLLM_PID``) prevents a
    second launch if this module is re-imported in the same process.
    """
    if "VLLM_PID" in os.environ:
        return
    print(f"Starting vLLM server on port {VLLM_PORT}...")
    # JSON formatted limit string to fix parsing error
    limit_mm_config = '{"image": 1}'
    command = [
        "vllm", "serve", MODEL_ID,
        "--host", "0.0.0.0",
        "--port", str(VLLM_PORT),
        "--trust-remote-code",
        "--gpu-memory-utilization", str(GPU_UTILIZATION),
        "--max-model-len", str(MAX_MODEL_LEN),
        "--dtype", "bfloat16",
        "--limit-mm-per-prompt", limit_mm_config,
    ]
    # Inherit stdout/stderr so the weight-download progress shows in logs.
    proc = subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
    os.environ["VLLM_PID"] = str(proc.pid)
    # We do NOT block here: vLLM loads in the background while the UI starts.
    print("vLLM started in background. Please wait for model download...")


start_vllm()

# --- STEP 2: FASTAPI PROXY ---
app = FastAPI()


@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_vllm(path: str, request: Request):
    """Transparently forward OpenAI-style /v1/* calls to the local vLLM.

    Streams the upstream body back to the caller; returns 503 JSON while
    the model is still loading and vLLM is not yet accepting connections.
    """
    target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"
    # FIX: the original used `async with httpx.AsyncClient()`, which closed
    # the client before StreamingResponse finished consuming aiter_raw().
    # Create the client explicitly and close it from a background task that
    # runs after the response body has been fully streamed.
    client = httpx.AsyncClient(timeout=300.0)
    try:
        proxy_req = client.build_request(
            request.method,
            target_url,
            headers=request.headers.raw,
            content=await request.body(),
        )
        upstream = await client.send(proxy_req, stream=True)
    except httpx.ConnectError:
        await client.aclose()
        return JSONResponse(
            status_code=503,
            content={"error": "Model is still loading. Please wait."},
        )

    async def _cleanup():
        # Close upstream response and client only once streaming is done.
        await upstream.aclose()
        await client.aclose()

    return StreamingResponse(
        upstream.aiter_raw(),
        status_code=upstream.status_code,
        headers=upstream.headers,
        background=BackgroundTask(_cleanup),
    )


# --- STEP 3: GRADIO UI ---
def run_ui_test(image, prompt):
    """Run one image+prompt inference through the local vLLM server.

    Args:
        image: PIL image from the Gradio widget, or None if not uploaded.
        prompt: user prompt text; falls back to "Convert to markdown.".

    Returns:
        The model's markdown output, or a user-facing error/status string.
    """
    if image is None:
        return "⚠️ Please upload an image first."
    client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
    # Encode the image as base64 JPEG for the OpenAI vision message format.
    try:
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        return f"Error processing image: {e}"
    if not prompt:
        prompt = "Convert to markdown."
    try:
        completion = client.chat.completions.create(
            model=MODEL_ID,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                ],
            }],
            max_tokens=4096,
        )
        return completion.choices[0].message.content
    except APIConnectionError:
        return ("⏳ Model is still downloading/loading... Check the 'Logs' tab."
                " This takes 2-3 minutes on a fresh GPU.")
    except Exception as e:
        return f"Error: {str(e)}"


with gr.Blocks() as demo:
    gr.Markdown("# NuMarkdown L40S vLLM Server")
    gr.Markdown("Status: If you just started this Space, wait 3 minutes for weights to download.")
    with gr.Row():
        with gr.Column():
            img_input = gr.Image(type="pil", label="Document")
            # FIXED: Added the missing prompt input
            txt_input = gr.Textbox(value="Convert to markdown.", label="Prompt")
            btn = gr.Button("Test Inference")
        with gr.Column():
            out = gr.Textbox(label="Output")
    # FIXED: Passed both inputs [img_input, txt_input]
    btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])

# Mount the Gradio UI onto the FastAPI app so one port serves both.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)