encryptd committed on
Commit
6a5b9e1
·
1 Parent(s): 0b8a564

Enabling vLLM

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +98 -120
  3. requirements.txt +5 -24
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ app.py.backup
2
+ requirements.txt.backup
app.py CHANGED
@@ -1,140 +1,118 @@
1
  import os
2
- import sys
3
  import subprocess
4
-
5
- # --- START DEPENDENCY FIX ---
6
- # This block ensures the correct libraries are loaded even if the build failed.
7
- def install_dependencies():
8
- print("Checking dependencies...")
9
- try:
10
- # Try to import the specific function that is causing the crash
11
- from huggingface_hub import is_offline_mode
12
- except ImportError:
13
- print("Dependency Mismatch detected. Forcing re-install...")
14
-
15
- # 1. Force uninstall the broken libraries
16
- subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "huggingface_hub", "transformers"], check=False)
17
-
18
- # 2. Force install the working versions
19
- subprocess.run([
20
- sys.executable, "-m", "pip", "install",
21
- "huggingface-hub>=0.24.0",
22
- "git+https://github.com/huggingface/transformers.git",
23
- "accelerate>=0.26.0"
24
- ], check=True)
25
-
26
- print("Dependencies fixed. Restarting app...")
27
- # 3. Restart the entire script to load the new files
28
- os.execv(sys.executable, [sys.executable] + sys.argv)
29
-
30
- install_dependencies()
31
- # --- END DEPENDENCY FIX ---
32
-
33
  import gradio as gr
34
- import spaces
35
- import torch
36
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
37
- from qwen_vl_utils import process_vision_info
38
 
39
- # 1. Load Model with 4-bit quantization (CRITICAL for ZeroGPU)
40
- # This reduces memory usage from ~16GB to ~6GB
41
- model_id = "numind/NuMarkdown-8B-Thinking"
 
 
 
42
 
43
- print("Loading model... this may take a minute.")
44
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
45
- model_id,
46
- torch_dtype=torch.float16,
47
- device_map="auto",
48
- #load_in_4bit=False # <--- PREVENTS CRASHES
49
- #attn_implementation="flash_attention_2", # Optional: Remove if it causes errors on your specific GPU slice
50
- )
51
 
52
- # Load processor
53
- min_pixels = 256 * 28 * 28
54
- max_pixels = 1280 * 28 * 28
55
- processor = AutoProcessor.from_pretrained(
56
- model_id,
57
- min_pixels=min_pixels,
58
- max_pixels=max_pixels
59
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- @spaces.GPU
62
- def run_ocr(image, prompt):
63
- if image is None:
64
- return "Please upload an image."
65
-
66
- if not prompt:
67
- prompt = "Convert this document to markdown."
68
 
69
- # Qwen2.5-VL requires specific message formatting
70
- messages = [
71
- {
72
- "role": "user",
73
- "content": [
74
- {"type": "image", "image": image},
75
- {"type": "text", "text": prompt},
76
- ],
77
- }
78
- ]
79
 
80
- # Preprocess inputs
81
- text_input = processor.apply_chat_template(
82
- messages, tokenize=False, add_generation_prompt=True
83
- )
84
 
85
- image_inputs, video_inputs = process_vision_info(messages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- inputs = processor(
88
- text=[text_input],
89
- images=image_inputs,
90
- videos=video_inputs,
91
- padding=True,
92
- return_tensors="pt",
93
- )
94
 
95
- # Move inputs to GPU
96
- inputs = inputs.to("cuda")
97
-
98
- # Generate
99
- generated_ids = model.generate(
100
- **inputs,
101
- max_new_tokens=2048, # Adjust based on document length
102
- do_sample=False # Deterministic is usually better for OCR
103
- )
104
-
105
- # Decode output
106
- generated_ids_trimmed = [
107
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
108
- ]
109
 
110
- output_text = processor.batch_decode(
111
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
 
 
 
 
 
112
  )
113
-
114
- return output_text[0]
115
 
116
- # Gradio Interface
117
  with gr.Blocks() as demo:
118
- gr.Markdown("# NuMarkdown-8B-Thinking (ZeroGPU)")
119
-
120
  with gr.Row():
121
- with gr.Column():
122
- img_input = gr.Image(type="pil", label="Upload Document")
123
- prompt_input = gr.Textbox(value="Convert this document to markdown.", label="Instruction")
124
- submit_btn = gr.Button("Run OCR")
125
- with gr.Column():
126
- output_md = gr.Markdown(label="Rendered Output")
127
- output_raw = gr.Textbox(label="Raw Markdown Code")
128
 
129
- submit_btn.click(
130
- fn=run_ocr,
131
- inputs=[img_input, prompt_input],
132
- outputs=[output_raw] # We output to the raw box
133
- ).success(
134
- fn=lambda x: x, # Copy raw text to markdown render
135
- inputs=[output_raw],
136
- outputs=[output_md]
137
- )
138
 
 
139
  if __name__ == "__main__":
140
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import os
 
2
  import subprocess
3
+ import time
4
+ import httpx
5
+ from fastapi import FastAPI, Request
6
+ from fastapi.responses import StreamingResponse, JSONResponse
7
+ import uvicorn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import gradio as gr
9
+ from openai import OpenAI
10
+ import base64
11
+ from io import BytesIO
 
12
 
13
# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"
GPU_UTILIZATION = 0.9
MAX_MODEL_LEN = 32768
VLLM_PORT = 8000  # Internal port for vLLM
EXPOSED_PORT = 7860  # External port (Hugging Face default)

# --- STEP 1: LAUNCH vLLM IN BACKGROUND ---
def start_vllm():
    """Launch the vLLM OpenAI-compatible server as a background subprocess.

    Polls ``/health`` until the server answers HTTP 200 or roughly five
    minutes (30 attempts x 10 s) have elapsed.  The child PID is stashed in
    the environment so a re-execution of this module in the same process
    tree does not spawn a second server.
    """
    if "VLLM_PID" in os.environ:
        print("vLLM already running.")
        return

    print(f"Starting vLLM server on port {VLLM_PORT}...")
    command = [
        "vllm", "serve", MODEL_ID,
        "--host", "0.0.0.0",
        "--port", str(VLLM_PORT),
        "--trust-remote-code",
        "--gpu-memory-utilization", str(GPU_UTILIZATION),
        "--max-model-len", str(MAX_MODEL_LEN),
        "--dtype", "bfloat16",
        "--limit-mm-per-prompt", "image=1",
    ]
    proc = subprocess.Popen(command)
    os.environ["VLLM_PID"] = str(proc.pid)

    # Wait for vLLM to be ready.
    print("Waiting for vLLM to load...")
    for i in range(30):
        try:
            # Only a 200 response means the engine is actually serving;
            # a short timeout keeps a hung connection from stalling startup.
            resp = httpx.get(f"http://localhost:{VLLM_PORT}/health", timeout=5.0)
            if resp.status_code == 200:
                print("vLLM is READY!")
                return
        except httpx.HTTPError:
            # Server not accepting connections yet; keep waiting.
            pass
        time.sleep(10)
        print(f"Loading... {i * 10}s")

    # Don't crash the Space if the model is merely slow to load; later
    # requests will surface the failure if vLLM never comes up.
    print("WARNING: vLLM did not report healthy within the wait window.")

# Start vLLM immediately
start_vllm()
 
 
 
 
 
54
 
55
# --- STEP 2: SETUP FASTAPI PROXY ---
app = FastAPI()

# Forwards any /v1/* request (e.g. from Docling or an OpenAI SDK client)
# to the internal vLLM server and streams the response straight back.
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_vllm(path: str, request: Request):
    target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"

    # BUG FIX: the client must outlive this function.  The response body is
    # consumed lazily by StreamingResponse *after* we return, so wrapping the
    # client in `async with` (as before) closed it and killed the stream
    # mid-flight.  Instead, close response and client when the stream ends.
    client = httpx.AsyncClient(timeout=120.0)  # long timeout for OCR
    proxy_req = client.build_request(
        request.method,
        target_url,
        headers=request.headers.raw,
        content=await request.body(),
    )

    try:
        upstream = await client.send(proxy_req, stream=True)
    except Exception:
        # Connection to vLLM failed before streaming began; release the
        # client and let FastAPI turn the error into a 500.
        await client.aclose()
        raise

    async def body_stream():
        # Relay raw chunks, then release the response and the client once
        # the downstream consumer finishes (or disconnects).
        try:
            async for chunk in upstream.aiter_raw():
                yield chunk
        finally:
            await upstream.aclose()
            await client.aclose()

    return StreamingResponse(
        body_stream(),
        status_code=upstream.status_code,
        headers=upstream.headers,
    )
82
+
83
# --- STEP 3: GRADIO UI (Optional, but good for testing) ---
def run_ui_test(image, prompt=""):
    """Send a PIL *image* plus *prompt* to the internal vLLM server.

    BUG FIX: `prompt` now defaults to "" — the Gradio click handler invokes
    this with only the image, which previously raised a TypeError on every
    click.  An empty prompt falls back to a generic markdown instruction.
    Returns the model's text completion.
    """
    # This talks to the INTERNAL vLLM.
    client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")

    # JPEG cannot store an alpha channel, so normalize to RGB first —
    # uploaded images are often RGBA PNGs and `.save(..., "JPEG")` would
    # raise on those.
    buffered = BytesIO()
    image.convert("RGB").save(buffered, format="JPEG")
    b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

    if not prompt:
        prompt = "Convert to markdown."

    completion = client.chat.completions.create(
        model=MODEL_ID,
        messages=[{"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
        ]}],
        max_tokens=2048,
    )
    return completion.choices[0].message.content
 
104
 
 
105
# Minimal test UI; the real API surface is the /v1 proxy above.
with gr.Blocks() as demo:
    gr.Markdown("# NuMarkdown vLLM API Server")
    with gr.Row():
        img = gr.Image(type="pil")
        instruction = gr.Textbox(value="Convert to markdown.", label="Instruction")
        btn = gr.Button("Test Internal Inference")
        out = gr.Textbox()
    # BUG FIX: run_ui_test takes (image, prompt); the handler previously
    # supplied only the image, raising a TypeError on every click.  Pass the
    # instruction textbox as the second input.
    btn.click(run_ui_test, inputs=[img, instruction], outputs=[out])

# Mount Gradio to the root URL
app = gr.mount_gradio_app(app, demo, path="/")

# --- STEP 4: RUN EVERYTHING ON PORT 7860 ---
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)
requirements.txt CHANGED
@@ -1,25 +1,6 @@
1
- # --- Critical Version Pins (Fixes Dependency Conflicts) ---
2
- fsspec<=2025.10.0
3
- huggingface-hub>=0.24.0
4
-
5
- # --- Model Support (Qwen2.5-VL / NuMarkdown-8B) ---
6
- # We need the bleeding edge Transformers for this model
7
- git+https://github.com/huggingface/transformers.git
8
- accelerate>=0.26.0
9
- bitsandbytes
10
- qwen-vl-utils
11
-
12
- # --- App & ZeroGPU ---
13
  gradio>=4.0.0
14
- spaces
15
-
16
- # --- Missing Dependencies Fix ---
17
- # Explicitly added because Gradio's CLI sometimes misses it
18
- typer
19
- rich
20
- click
21
-
22
- # --- Standard Utilities ---
23
- torch
24
- torchvision
25
- pillow
 
1
+ vllm>=0.7.2
 
 
 
 
 
 
 
 
 
 
 
2
  gradio>=4.0.0
3
+ openai
4
+ fastapi
5
+ uvicorn
6
+ httpx