import os
import base64
import subprocess
import sys
import time
from io import BytesIO

import gradio as gr
import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from openai import APIConnectionError, OpenAI
from starlette.background import BackgroundTask
# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"  # HF model ID served by vLLM and sent in chat requests
GPU_UTILIZATION = 0.90  # fraction of GPU memory vLLM may allocate
MAX_MODEL_LEN = 32768  # maximum context length in tokens for the vLLM engine
VLLM_PORT = 8000  # internal port where vLLM's OpenAI-compatible API listens
EXPOSED_PORT = 7860  # public port for the combined FastAPI/Gradio app (HF Spaces default)
# --- STEP 1: LAUNCH vLLM (Background) ---
def start_vllm():
    """Spawn the vLLM OpenAI-compatible server as a background child process.

    A sentinel environment variable holding the child's PID guards against
    launching a second server from the same process.
    """
    if "VLLM_PID" in os.environ:
        return

    print(f"Starting vLLM server on port {VLLM_PORT}...")
    # --limit-mm-per-prompt expects a JSON object string; a plain "image=1"
    # form fails to parse.
    mm_limits = '{"image": 1}'
    argv = [
        "vllm", "serve", MODEL_ID,
        "--host", "0.0.0.0",
        "--port", str(VLLM_PORT),
        "--trust-remote-code",
        "--gpu-memory-utilization", str(GPU_UTILIZATION),
        "--max-model-len", str(MAX_MODEL_LEN),
        "--dtype", "bfloat16",
        "--limit-mm-per-prompt", mm_limits,
    ]
    # Inherit this process's stdout/stderr so model-download progress is
    # visible in the Space logs.
    server = subprocess.Popen(argv, stdout=sys.stdout, stderr=sys.stderr)
    os.environ["VLLM_PID"] = str(server.pid)
    # Deliberately non-blocking: the UI starts immediately while weights load.
    print("vLLM started in background. Please wait for model download...")

start_vllm()
# --- STEP 2: FASTAPI PROXY ---
app = FastAPI()

@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_vllm(path: str, request: Request):
    """Forward any /v1/* request to the local vLLM server, streaming the reply.

    Returns the upstream response as a StreamingResponse, or a 503 JSON error
    while vLLM is still loading and refusing connections.
    """
    target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"
    # FIX: the client must outlive this function body — the previous
    # `async with` closed it before StreamingResponse consumed the body.
    # It is closed in the response's background task instead.
    client = httpx.AsyncClient(timeout=300.0)
    try:
        proxy_req = client.build_request(
            request.method,
            target_url,
            headers=request.headers.raw,
            content=await request.body(),
        )
        upstream = await client.send(proxy_req, stream=True)
    except httpx.ConnectError:
        # vLLM is not accepting connections yet (model still downloading/loading).
        await client.aclose()
        return JSONResponse(status_code=503, content={"error": "Model is still loading. Please wait."})
    except Exception:
        await client.aclose()
        raise

    async def _cleanup():
        # Release the upstream response and the client once streaming finishes.
        await upstream.aclose()
        await client.aclose()

    return StreamingResponse(
        upstream.aiter_raw(),
        status_code=upstream.status_code,
        headers=upstream.headers,
        background=BackgroundTask(_cleanup),
    )
# --- STEP 3: GRADIO UI ---
def run_ui_test(image, prompt):
    """Send one image + prompt to the local vLLM server; return the reply text.

    Args:
        image: PIL image from the Gradio component, or None if nothing uploaded.
        prompt: user prompt; falls back to "Convert to markdown." when empty.

    Returns:
        The model's markdown output, or a human-readable status/error string
        (this function never raises — errors are reported to the UI as text).
    """
    if image is None:
        return "⚠️ Please upload an image first."

    client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")

    # Encode the image as base64 JPEG for the OpenAI-style multimodal payload.
    try:
        buffered = BytesIO()
        # FIX: JPEG cannot store an alpha channel/palette — convert RGBA/P/LA
        # uploads (typical for PNG screenshots) to RGB instead of raising.
        image.convert("RGB").save(buffered, format="JPEG")
        b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        return f"Error processing image: {e}"

    if not prompt:
        prompt = "Convert to markdown."

    try:
        completion = client.chat.completions.create(
            model=MODEL_ID,
            messages=[{"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
            ]}],
            max_tokens=4096
        )
        return completion.choices[0].message.content
    except APIConnectionError:
        return "⏳ Model is still downloading/loading... Check the 'Logs' tab. This takes 2-3 minutes on a fresh GPU."
    except Exception as e:
        return f"Error: {str(e)}"
# Build the Gradio test UI and mount it on the FastAPI app.
with gr.Blocks() as demo:
    gr.Markdown("# NuMarkdown L40S vLLM Server")
    gr.Markdown("Status: If you just started this Space, wait 3 minutes for weights to download.")
    with gr.Row():
        with gr.Column():
            # Inputs: the document image plus a free-form prompt.
            img_input = gr.Image(type="pil", label="Document")
            txt_input = gr.Textbox(value="Convert to markdown.", label="Prompt")
            btn = gr.Button("Test Inference")
        with gr.Column():
            out = gr.Textbox(label="Output")
    # Wire both inputs through run_ui_test; the result lands in the output box.
    btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])
# Serve the Gradio UI at "/" on the same app that proxies /v1/* to vLLM.
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    # One uvicorn server hosts both the UI ("/") and the /v1 proxy endpoints.
    uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)