Spaces:
Sleeping
Sleeping
encryptd committed on
Commit ·
6a5b9e1
1
Parent(s): 0b8a564
Enabling vLLM
Browse files- .gitignore +2 -0
- app.py +98 -120
- requirements.txt +5 -24
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
app.py.backup
|
| 2 |
+
requirements.txt.backup
|
app.py
CHANGED
|
@@ -1,140 +1,118 @@
|
|
| 1 |
import os
|
| 2 |
-
import sys
|
| 3 |
import subprocess
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
try:
|
| 10 |
-
# Try to import the specific function that is causing the crash
|
| 11 |
-
from huggingface_hub import is_offline_mode
|
| 12 |
-
except ImportError:
|
| 13 |
-
print("Dependency Mismatch detected. Forcing re-install...")
|
| 14 |
-
|
| 15 |
-
# 1. Force uninstall the broken libraries
|
| 16 |
-
subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "huggingface_hub", "transformers"], check=False)
|
| 17 |
-
|
| 18 |
-
# 2. Force install the working versions
|
| 19 |
-
subprocess.run([
|
| 20 |
-
sys.executable, "-m", "pip", "install",
|
| 21 |
-
"huggingface-hub>=0.24.0",
|
| 22 |
-
"git+https://github.com/huggingface/transformers.git",
|
| 23 |
-
"accelerate>=0.26.0"
|
| 24 |
-
], check=True)
|
| 25 |
-
|
| 26 |
-
print("Dependencies fixed. Restarting app...")
|
| 27 |
-
# 3. Restart the entire script to load the new files
|
| 28 |
-
os.execv(sys.executable, [sys.executable] + sys.argv)
|
| 29 |
-
|
| 30 |
-
install_dependencies()
|
| 31 |
-
# --- END DEPENDENCY FIX ---
|
| 32 |
-
|
| 33 |
import gradio as gr
|
| 34 |
-
import
|
| 35 |
-
import
|
| 36 |
-
from
|
| 37 |
-
from qwen_vl_utils import process_vision_info
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
#load_in_4bit=False # <--- PREVENTS CRASHES
|
| 49 |
-
#attn_implementation="flash_attention_2", # Optional: Remove if it causes errors on your specific GPU slice
|
| 50 |
-
)
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
if image is None:
|
| 64 |
-
return "Please upload an image."
|
| 65 |
-
|
| 66 |
-
if not prompt:
|
| 67 |
-
prompt = "Convert this document to markdown."
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
{
|
| 72 |
-
"role": "user",
|
| 73 |
-
"content": [
|
| 74 |
-
{"type": "image", "image": image},
|
| 75 |
-
{"type": "text", "text": prompt},
|
| 76 |
-
],
|
| 77 |
-
}
|
| 78 |
-
]
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
padding=True,
|
| 92 |
-
return_tensors="pt",
|
| 93 |
-
)
|
| 94 |
|
| 95 |
-
|
| 96 |
-
inputs = inputs.to("cuda")
|
| 97 |
-
|
| 98 |
-
# Generate
|
| 99 |
-
generated_ids = model.generate(
|
| 100 |
-
**inputs,
|
| 101 |
-
max_new_tokens=2048, # Adjust based on document length
|
| 102 |
-
do_sample=False # Deterministic is usually better for OCR
|
| 103 |
-
)
|
| 104 |
-
|
| 105 |
-
# Decode output
|
| 106 |
-
generated_ids_trimmed = [
|
| 107 |
-
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
| 108 |
-
]
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
)
|
| 113 |
-
|
| 114 |
-
return output_text[0]
|
| 115 |
|
| 116 |
-
# Gradio Interface
|
| 117 |
with gr.Blocks() as demo:
|
| 118 |
-
gr.Markdown("# NuMarkdown
|
| 119 |
-
|
| 120 |
with gr.Row():
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
with gr.Column():
|
| 126 |
-
output_md = gr.Markdown(label="Rendered Output")
|
| 127 |
-
output_raw = gr.Textbox(label="Raw Markdown Code")
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
inputs=[img_input, prompt_input],
|
| 132 |
-
outputs=[output_raw] # We output to the raw box
|
| 133 |
-
).success(
|
| 134 |
-
fn=lambda x: x, # Copy raw text to markdown render
|
| 135 |
-
inputs=[output_raw],
|
| 136 |
-
outputs=[output_md]
|
| 137 |
-
)
|
| 138 |
|
|
|
|
| 139 |
if __name__ == "__main__":
|
| 140 |
-
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import subprocess
|
| 3 |
+
import time
|
| 4 |
+
import httpx
|
| 5 |
+
from fastapi import FastAPI, Request
|
| 6 |
+
from fastapi.responses import StreamingResponse, JSONResponse
|
| 7 |
+
import uvicorn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
import gradio as gr
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
import base64
|
| 11 |
+
from io import BytesIO
|
|
|
|
| 12 |
|
| 13 |
+
# --- CONFIGURATION ---
|
| 14 |
+
MODEL_ID = "numind/NuMarkdown-8B-Thinking"
|
| 15 |
+
GPU_UTILIZATION = 0.9
|
| 16 |
+
MAX_MODEL_LEN = 32768
|
| 17 |
+
VLLM_PORT = 8000 # Internal port for vLLM
|
| 18 |
+
EXPOSED_PORT = 7860 # External port (Hugging Face default)
|
| 19 |
|
| 20 |
+
# --- STEP 1: LAUNCH vLLM IN BACKGROUND ---
|
| 21 |
+
def start_vllm(max_attempts=30, poll_interval=10):
    """Launch the vLLM server as a background process and wait until it is healthy.

    The child's PID is recorded in the ``VLLM_PID`` environment variable; if
    that variable is already set the function assumes a server was started
    earlier and returns immediately.

    Startup is best-effort: when the server dies or never becomes healthy a
    message is printed and the function returns, so the rest of the app can
    still come up.

    Args:
        max_attempts: Maximum number of health probes before giving up.
        poll_interval: Seconds to sleep between two probes.

    Returns:
        None.
    """
    if "VLLM_PID" in os.environ:
        print("vLLM already running.")
        return

    print(f"Starting vLLM server on port {VLLM_PORT}...")
    command = [
        "vllm", "serve", MODEL_ID,
        "--host", "0.0.0.0",
        "--port", str(VLLM_PORT),
        "--trust-remote-code",
        "--gpu-memory-utilization", str(GPU_UTILIZATION),
        "--max-model-len", str(MAX_MODEL_LEN),
        "--dtype", "bfloat16",
        "--limit-mm-per-prompt", "image=1",
    ]
    proc = subprocess.Popen(command)
    os.environ["VLLM_PID"] = str(proc.pid)

    # Wait for vLLM to be ready
    print("Waiting for vLLM to load...")
    health_url = f"http://localhost:{VLLM_PORT}/health"
    for attempt in range(1, max_attempts + 1):
        # If the child already exited there is nothing left to wait for.
        exit_code = proc.poll()
        if exit_code is not None:
            print(f"vLLM exited early with code {exit_code}; check its output above.")
            # Forget the dead PID so a later call can try to start it again.
            os.environ.pop("VLLM_PID", None)
            return

        try:
            response = httpx.get(health_url, timeout=5.0)
        except httpx.HTTPError:
            # Connection refused / timed out: the server is still loading.
            response = None

        # Only a 200 means healthy; an error status must not count as ready.
        if response is not None and response.status_code == 200:
            print("vLLM is READY!")
            return

        time.sleep(poll_interval)
        print(f"Loading... {attempt * poll_interval}s")

    print(
        f"vLLM did not become healthy within {max_attempts * poll_interval}s; "
        "continuing without waiting any longer."
    )
|
| 51 |
|
| 52 |
+
# Start vLLM immediately
|
| 53 |
+
start_vllm()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
# --- STEP 2: SETUP FASTAPI PROXY ---
|
| 56 |
+
app = FastAPI()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
# This is the magic function that forwards Docling's requests to vLLM
|
| 59 |
+
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_vllm(path: str, request: Request):
    """Forward a ``/v1/...`` request to the internal vLLM server and stream the reply.

    Args:
        path: Everything after ``/v1/`` in the incoming URL.
        request: The incoming request; its method, headers, query string and
            body are forwarded unchanged.

    Returns:
        A ``StreamingResponse`` relaying vLLM's status code, headers and raw
        body, or a 502 ``JSONResponse`` when vLLM cannot be reached.
    """
    target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"
    # Keep the caller's query string; it is not part of ``path``.
    if request.url.query:
        target_url = f"{target_url}?{request.url.query}"

    # Deliberately not ``async with``: the response body is consumed only
    # after this function has returned, so the client has to outlive it.
    # It is closed by relay_body() once streaming ends.
    client = httpx.AsyncClient()
    try:
        proxy_req = client.build_request(
            request.method,
            target_url,
            headers=request.headers.raw,
            content=await request.body(),
            timeout=120.0,  # Long timeout for OCR
        )
        upstream = await client.send(proxy_req, stream=True)
    except httpx.RequestError as exc:
        # vLLM is down, still loading, or timed out: report a gateway error.
        await client.aclose()
        return JSONResponse(
            {"error": f"vLLM backend unavailable: {exc}"},
            status_code=502,
        )
    except Exception:
        await client.aclose()
        raise

    async def relay_body():
        """Yield the upstream body chunk by chunk, then release the connection."""
        try:
            async for chunk in upstream.aiter_raw():
                yield chunk
        finally:
            await upstream.aclose()
            await client.aclose()

    return StreamingResponse(
        relay_body(),
        status_code=upstream.status_code,
        headers=upstream.headers,
    )
|
| 82 |
+
|
| 83 |
+
# --- STEP 3: GRADIO UI (Optional, but good for testing) ---
|
| 84 |
+
def run_ui_test(image, prompt=None):
    """Send one image to the internal vLLM server and return the model's text.

    Args:
        image: PIL image from the Gradio image component, or ``None`` when
            nothing was uploaded.
        prompt: Optional instruction; empty or missing values fall back to
            "Convert to markdown.".

    Returns:
        The content of the first completion choice, or a short hint when no
        image was supplied.
    """
    if image is None:
        return "Please upload an image."
    if not prompt:
        prompt = "Convert to markdown."

    # JPEG cannot store alpha or palette data, so normalise such modes first.
    if image.mode not in ("RGB", "L", "CMYK"):
        image = image.convert("RGB")

    # Encode the image as a base64 JPEG for the data URL below.
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

    # This talks to the INTERNAL vLLM
    client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
    completion = client.chat.completions.create(
        model=MODEL_ID,
        messages=[{"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
        ]}],
        max_tokens=2048,
    )
    return completion.choices[0].message.content
|
|
|
|
| 104 |
|
|
|
|
| 105 |
# Minimal test UI: upload an image, optionally type a prompt, run inference.
with gr.Blocks() as demo:
    gr.Markdown("# NuMarkdown vLLM API Server")
    with gr.Row():
        img = gr.Image(type="pil")
        prompt_box = gr.Textbox(label="Prompt (optional)")
        btn = gr.Button("Test Internal Inference")
    out = gr.Textbox()
    # run_ui_test takes (image, prompt), so both components must be wired in;
    # passing only the image leaves the second argument unfilled.
    btn.click(run_ui_test, inputs=[img, prompt_box], outputs=[out])
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
# Mount Gradio to the root URL
|
| 114 |
+
app = gr.mount_gradio_app(app, demo, path="/")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
+
# --- STEP 4: RUN EVERYTHING ON PORT 7860 ---
|
| 117 |
if __name__ == "__main__":
|
| 118 |
+
uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)
|
requirements.txt
CHANGED
|
@@ -1,25 +1,6 @@
|
|
| 1 |
-
|
| 2 |
-
fsspec<=2025.10.0
|
| 3 |
-
huggingface-hub>=0.24.0
|
| 4 |
-
|
| 5 |
-
# --- Model Support (Qwen2.5-VL / NuMarkdown-8B) ---
|
| 6 |
-
# We need the bleeding edge Transformers for this model
|
| 7 |
-
git+https://github.com/huggingface/transformers.git
|
| 8 |
-
accelerate>=0.26.0
|
| 9 |
-
bitsandbytes
|
| 10 |
-
qwen-vl-utils
|
| 11 |
-
|
| 12 |
-
# --- App & ZeroGPU ---
|
| 13 |
gradio>=4.0.0
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
typer
|
| 19 |
-
rich
|
| 20 |
-
click
|
| 21 |
-
|
| 22 |
-
# --- Standard Utilities ---
|
| 23 |
-
torch
|
| 24 |
-
torchvision
|
| 25 |
-
pillow
|
|
|
|
| 1 |
+
vllm>=0.7.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
gradio>=4.0.0
|
| 3 |
+
openai
|
| 4 |
+
fastapi
|
| 5 |
+
uvicorn
|
| 6 |
+
httpx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|