encryptd committed
Commit 592242d · 1 Parent(s): 6a5b9e1
fix port issue
app.py
CHANGED
@@ -1,29 +1,33 @@
 import os
 import subprocess
 import time
+import sys
 import httpx
 from fastapi import FastAPI, Request
-from fastapi.responses import StreamingResponse
+from fastapi.responses import StreamingResponse, JSONResponse
 import uvicorn
 import gradio as gr
-from openai import OpenAI
+from openai import OpenAI, APIConnectionError
 import base64
 from io import BytesIO
 
 # --- CONFIGURATION ---
 MODEL_ID = "numind/NuMarkdown-8B-Thinking"
-GPU_UTILIZATION = 0.
+GPU_UTILIZATION = 0.90
 MAX_MODEL_LEN = 32768
-VLLM_PORT = 8000
-EXPOSED_PORT = 7860
+VLLM_PORT = 8000
+EXPOSED_PORT = 7860
 
-# --- STEP 1: LAUNCH vLLM
+# --- STEP 1: LAUNCH vLLM (Background) ---
 def start_vllm():
     if "VLLM_PID" in os.environ:
-        print("vLLM already running.")
         return
 
     print(f"Starting vLLM server on port {VLLM_PORT}...")
+
+    # JSON formatted limit string to fix parsing error
+    limit_mm_config = '{"image": 1}'
+
     command = [
         "vllm", "serve", MODEL_ID,
         "--host", "0.0.0.0",
@@ -32,87 +36,93 @@ def start_vllm():
         "--gpu-memory-utilization", str(GPU_UTILIZATION),
         "--max-model-len", str(MAX_MODEL_LEN),
         "--dtype", "bfloat16",
-        "--limit-mm-per-prompt",
+        "--limit-mm-per-prompt", limit_mm_config
     ]
-
+
+    # Redirect stdout/stderr to see download progress
+    proc = subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
     os.environ["VLLM_PID"] = str(proc.pid)
 
-    #
-
-
-        try:
-            # Quick health check
-            httpx.get(f"http://localhost:{VLLM_PORT}/health")
-            print("vLLM is READY!")
-            return
-        except:
-            time.sleep(10)
-            print(f"Loading... {i*10}s")
+    # We do NOT block here anymore. We let vLLM load in the background
+    # while the UI starts. This allows you to see the UI immediately.
+    print("vLLM started in background. Please wait for model download...")
 
-# Start vLLM immediately
 start_vllm()
 
-# --- STEP 2:
+# --- STEP 2: FASTAPI PROXY ---
 app = FastAPI()
 
-# This is the magic function that forwards Docling's requests to vLLM
 @app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
 async def proxy_to_vllm(path: str, request: Request):
     target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"
-
     async with httpx.AsyncClient() as client:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
+        try:
+            proxy_req = client.build_request(
+                request.method,
+                target_url,
+                headers=request.headers.raw,
+                content=await request.body(),
+                timeout=300.0
+            )
+            r = await client.send(proxy_req, stream=True)
+            return StreamingResponse(
+                r.aiter_raw(),
+                status_code=r.status_code,
+                headers=r.headers,
+                background=None
+            )
+        except httpx.ConnectError:
+            return JSONResponse(status_code=503, content={"error": "Model is still loading. Please wait."})
 
-# --- STEP 3: GRADIO UI
+# --- STEP 3: GRADIO UI ---
 def run_ui_test(image, prompt):
-
+    if image is None:
+        return "⚠️ Please upload an image first."
+
     client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
 
-    # Encode
-
-
-
+    # Encode Image
+    try:
+        buffered = BytesIO()
+        image.save(buffered, format="JPEG")
+        b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    except Exception as e:
+        return f"Error processing image: {e}"
 
     if not prompt: prompt = "Convert to markdown."
 
-
-
-
-        {"
-
-
-
-
-
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_ID,
+            messages=[{"role": "user", "content": [
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
+            ]}],
+            max_tokens=4096
+        )
+        return completion.choices[0].message.content
+    except APIConnectionError:
+        return "⏳ Model is still downloading/loading... Check the 'Logs' tab. This takes 2-3 minutes on a fresh GPU."
+    except Exception as e:
+        return f"Error: {str(e)}"
 
 with gr.Blocks() as demo:
-    gr.Markdown("# NuMarkdown vLLM
+    gr.Markdown("# NuMarkdown L40S vLLM Server")
+    gr.Markdown("Status: If you just started this Space, wait 3 minutes for weights to download.")
+
     with gr.Row():
-
-
-
-
+        with gr.Column():
+            img_input = gr.Image(type="pil", label="Document")
+            # FIXED: Added the missing prompt input
+            txt_input = gr.Textbox(value="Convert to markdown.", label="Prompt")
+            btn = gr.Button("Test Inference")
+        with gr.Column():
+            out = gr.Textbox(label="Output")
+
+    # FIXED: Passed both inputs [img_input, txt_input]
+    btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])
 
-# Mount Gradio to the root URL
 app = gr.mount_gradio_app(app, demo, path="/")
 
-# --- STEP 4: RUN EVERYTHING ON PORT 7860 ---
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)
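With this commit every request goes through the single exposed port (7860), and the FastAPI route forwards anything under /v1/* to the internal vLLM server on port 8000. That means any OpenAI-compatible client (including Docling) can talk to the Space directly. A minimal sketch; the Space URL below is a hypothetical placeholder, while the /v1 path and the "EMPTY" api_key convention come from the app itself:

# Minimal client sketch, assuming a hypothetical Space URL.
from openai import OpenAI

client = OpenAI(
    base_url="https://encryptd-<space-name>.hf.space/v1",  # hypothetical URL; substitute your deployment
    api_key="EMPTY",  # the proxy does not check credentials
)

# Served by vLLM through the port-7860 proxy.
models = client.models.list()
print([m.id for m in models.data])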
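Because start_vllm() no longer blocks on a health check, the proxy simply returns 503 ("Model is still loading. Please wait.") until vLLM finishes booting. A caller can poll for readiness itself; here is a sketch under the same hypothetical-URL assumption, reusing the 10-second cadence the removed health-check loop used:

# Readiness-poll sketch: wait until the proxied vLLM backend answers.
import time
import httpx

BASE = "https://encryptd-<space-name>.hf.space"  # hypothetical URL

for attempt in range(30):
    try:
        r = httpx.get(f"{BASE}/v1/models", timeout=10.0)
        if r.status_code == 200:
            print("vLLM is ready")
            break
    except httpx.HTTPError:
        pass  # Space may still be waking up
    print(f"Waiting... {attempt * 10}s")
    time.sleep(10)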