File size: 4,314 Bytes
0b8a564
 
6a5b9e1
592242d
6a5b9e1
 
592242d
6a5b9e1
d3cc88b
592242d
6a5b9e1
 
d3cc88b
6a5b9e1
 
592242d
6a5b9e1
592242d
 
d3cc88b
592242d
6a5b9e1
 
 
d3cc88b
6a5b9e1
592242d
 
 
 
6a5b9e1
 
 
 
 
 
 
 
592242d
6a5b9e1
592242d
 
 
6a5b9e1
 
592242d
 
 
d3cc88b
6a5b9e1
d3cc88b
592242d
6a5b9e1
d3cc88b
6a5b9e1
 
 
 
592242d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a5b9e1
592242d
6a5b9e1
592242d
 
 
6a5b9e1
dba0d3f
592242d
 
 
 
 
 
 
dba0d3f
6a5b9e1
dba0d3f
592242d
 
 
 
 
 
 
 
 
 
 
 
 
 
d3cc88b
 
592242d
 
 
dba0d3f
592242d
 
 
 
 
 
 
 
 
 
d3cc88b
6a5b9e1
d3cc88b
 
6a5b9e1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import subprocess
import time
import sys
import httpx
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
import uvicorn
import gradio as gr
from openai import OpenAI, APIConnectionError
import base64
from io import BytesIO

# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"
GPU_UTILIZATION = 0.90
MAX_MODEL_LEN = 32768
VLLM_PORT = 8000
EXPOSED_PORT = 7860

# --- STEP 1: LAUNCH vLLM (Background) ---
def start_vllm():
    if "VLLM_PID" in os.environ:
        return

    print(f"Starting vLLM server on port {VLLM_PORT}...")
    
    # JSON formatted limit string to fix parsing error
    limit_mm_config = '{"image": 1}'
    
    command = [
        "vllm", "serve", MODEL_ID,
        "--host", "0.0.0.0",
        "--port", str(VLLM_PORT),
        "--trust-remote-code",
        "--gpu-memory-utilization", str(GPU_UTILIZATION),
        "--max-model-len", str(MAX_MODEL_LEN),
        "--dtype", "bfloat16",
        "--limit-mm-per-prompt", limit_mm_config
    ]
    
    # Redirect stdout/stderr to see download progress
    proc = subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
    os.environ["VLLM_PID"] = str(proc.pid)
    
    # We do NOT block here anymore. We let vLLM load in the background
    # while the UI starts. This allows you to see the UI immediately.
    print("vLLM started in background. Please wait for model download...")

start_vllm()

# --- STEP 2: FASTAPI PROXY ---
app = FastAPI()

@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_vllm(path: str, request: Request):
    target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"
    async with httpx.AsyncClient() as client:
        try:
            proxy_req = client.build_request(
                request.method,
                target_url,
                headers=request.headers.raw,
                content=await request.body(),
                timeout=300.0
            )
            r = await client.send(proxy_req, stream=True)
            return StreamingResponse(
                r.aiter_raw(),
                status_code=r.status_code,
                headers=r.headers,
                background=None
            )
        except httpx.ConnectError:
            return JSONResponse(status_code=503, content={"error": "Model is still loading. Please wait."})

# --- STEP 3: GRADIO UI ---
def run_ui_test(image, prompt):
    if image is None:
        return "⚠️ Please upload an image first."
        
    client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
    
    # Encode Image
    try:
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        return f"Error processing image: {e}"
    
    if not prompt: prompt = "Convert to markdown."
    
    try:
        completion = client.chat.completions.create(
            model=MODEL_ID,
            messages=[{"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
            ]}],
            max_tokens=4096
        )
        return completion.choices[0].message.content
    except APIConnectionError:
        return "⏳ Model is still downloading/loading... Check the 'Logs' tab. This takes 2-3 minutes on a fresh GPU."
    except Exception as e:
        return f"Error: {str(e)}"

with gr.Blocks() as demo:
    gr.Markdown("# NuMarkdown L40S vLLM Server")
    gr.Markdown("Status: If you just started this Space, wait 3 minutes for weights to download.")
    
    with gr.Row():
        with gr.Column():
            img_input = gr.Image(type="pil", label="Document")
            # FIXED: Added the missing prompt input
            txt_input = gr.Textbox(value="Convert to markdown.", label="Prompt") 
            btn = gr.Button("Test Inference")
        with gr.Column():
            out = gr.Textbox(label="Output")
    
    # FIXED: Passed both inputs [img_input, txt_input]
    btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])

app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)