encryptd committed on
Commit
6a5b9e1
·
1 Parent(s): 0b8a564

Enabling vLLM

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +98 -120
  3. requirements.txt +5 -24
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ app.py.backup
2
+ requirements.txt.backup
app.py CHANGED
@@ -1,140 +1,118 @@
1
  import os
2
- import sys
3
  import subprocess
4
-
5
- # --- START DEPENDENCY FIX ---
6
- # This block ensures the correct libraries are loaded even if the build failed.
7
- def install_dependencies():
8
- print("Checking dependencies...")
9
- try:
10
- # Try to import the specific function that is causing the crash
11
- from huggingface_hub import is_offline_mode
12
- except ImportError:
13
- print("Dependency Mismatch detected. Forcing re-install...")
14
-
15
- # 1. Force uninstall the broken libraries
16
- subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "huggingface_hub", "transformers"], check=False)
17
-
18
- # 2. Force install the working versions
19
- subprocess.run([
20
- sys.executable, "-m", "pip", "install",
21
- "huggingface-hub>=0.24.0",
22
- "git+https://github.com/huggingface/transformers.git",
23
- "accelerate>=0.26.0"
24
- ], check=True)
25
-
26
- print("Dependencies fixed. Restarting app...")
27
- # 3. Restart the entire script to load the new files
28
- os.execv(sys.executable, [sys.executable] + sys.argv)
29
-
30
- install_dependencies()
31
- # --- END DEPENDENCY FIX ---
32
-
33
  import gradio as gr
34
- import spaces
35
- import torch
36
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
37
- from qwen_vl_utils import process_vision_info
38
 
39
- # 1. Load Model with 4-bit quantization (CRITICAL for ZeroGPU)
40
- # This reduces memory usage from ~16GB to ~6GB
41
- model_id = "numind/NuMarkdown-8B-Thinking"
 
 
 
42
 
43
- print("Loading model... this may take a minute.")
44
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
45
- model_id,
46
- torch_dtype=torch.float16,
47
- device_map="auto",
48
- #load_in_4bit=False # <--- PREVENTS CRASHES
49
- #attn_implementation="flash_attention_2", # Optional: Remove if it causes errors on your specific GPU slice
50
- )
51
 
52
- # Load processor
53
- min_pixels = 256 * 28 * 28
54
- max_pixels = 1280 * 28 * 28
55
- processor = AutoProcessor.from_pretrained(
56
- model_id,
57
- min_pixels=min_pixels,
58
- max_pixels=max_pixels
59
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- @spaces.GPU
62
- def run_ocr(image, prompt):
63
- if image is None:
64
- return "Please upload an image."
65
-
66
- if not prompt:
67
- prompt = "Convert this document to markdown."
68
 
69
- # Qwen2.5-VL requires specific message formatting
70
- messages = [
71
- {
72
- "role": "user",
73
- "content": [
74
- {"type": "image", "image": image},
75
- {"type": "text", "text": prompt},
76
- ],
77
- }
78
- ]
79
 
80
- # Preprocess inputs
81
- text_input = processor.apply_chat_template(
82
- messages, tokenize=False, add_generation_prompt=True
83
- )
84
 
85
- image_inputs, video_inputs = process_vision_info(messages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- inputs = processor(
88
- text=[text_input],
89
- images=image_inputs,
90
- videos=video_inputs,
91
- padding=True,
92
- return_tensors="pt",
93
- )
94
 
95
- # Move inputs to GPU
96
- inputs = inputs.to("cuda")
97
-
98
- # Generate
99
- generated_ids = model.generate(
100
- **inputs,
101
- max_new_tokens=2048, # Adjust based on document length
102
- do_sample=False # Deterministic is usually better for OCR
103
- )
104
-
105
- # Decode output
106
- generated_ids_trimmed = [
107
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
108
- ]
109
 
110
- output_text = processor.batch_decode(
111
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
 
 
 
 
 
112
  )
113
-
114
- return output_text[0]
115
 
116
- # Gradio Interface
117
  with gr.Blocks() as demo:
118
- gr.Markdown("# NuMarkdown-8B-Thinking (ZeroGPU)")
119
-
120
  with gr.Row():
121
- with gr.Column():
122
- img_input = gr.Image(type="pil", label="Upload Document")
123
- prompt_input = gr.Textbox(value="Convert this document to markdown.", label="Instruction")
124
- submit_btn = gr.Button("Run OCR")
125
- with gr.Column():
126
- output_md = gr.Markdown(label="Rendered Output")
127
- output_raw = gr.Textbox(label="Raw Markdown Code")
128
 
129
- submit_btn.click(
130
- fn=run_ocr,
131
- inputs=[img_input, prompt_input],
132
- outputs=[output_raw] # We output to the raw box
133
- ).success(
134
- fn=lambda x: x, # Copy raw text to markdown render
135
- inputs=[output_raw],
136
- outputs=[output_md]
137
- )
138
 
 
139
  if __name__ == "__main__":
140
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import os
 
2
  import subprocess
3
+ import time
4
+ import httpx
5
+ from fastapi import FastAPI, Request
6
+ from fastapi.responses import StreamingResponse, JSONResponse
7
+ import uvicorn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import gradio as gr
9
+ from openai import OpenAI
10
+ import base64
11
+ from io import BytesIO
 
12
 
13
# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"
GPU_UTILIZATION = 0.9
MAX_MODEL_LEN = 32768
VLLM_PORT = 8000  # Internal port for vLLM
EXPOSED_PORT = 7860  # External port (Hugging Face default)

# --- STEP 1: LAUNCH vLLM IN BACKGROUND ---
def start_vllm():
    """Launch the vLLM OpenAI-compatible server as a background subprocess.

    Polls ``/health`` until the server answers HTTP 200 or roughly five
    minutes (30 attempts x 10 s) have elapsed.  The child PID is stashed in
    the environment so a re-execution of this module in the same process
    tree does not spawn a second server.
    """
    if "VLLM_PID" in os.environ:
        print("vLLM already running.")
        return

    print(f"Starting vLLM server on port {VLLM_PORT}...")
    command = [
        "vllm", "serve", MODEL_ID,
        "--host", "0.0.0.0",
        "--port", str(VLLM_PORT),
        "--trust-remote-code",
        "--gpu-memory-utilization", str(GPU_UTILIZATION),
        "--max-model-len", str(MAX_MODEL_LEN),
        "--dtype", "bfloat16",
        "--limit-mm-per-prompt", "image=1",
    ]
    proc = subprocess.Popen(command)
    os.environ["VLLM_PID"] = str(proc.pid)

    # Wait for vLLM to be ready.
    print("Waiting for vLLM to load...")
    for i in range(30):
        try:
            # Only a 200 response means the engine is actually serving;
            # a short timeout keeps a hung connection from stalling startup.
            resp = httpx.get(f"http://localhost:{VLLM_PORT}/health", timeout=5.0)
            if resp.status_code == 200:
                print("vLLM is READY!")
                return
        except httpx.HTTPError:
            # Server not accepting connections yet; keep waiting.
            pass
        time.sleep(10)
        print(f"Loading... {i * 10}s")

    # Don't crash the Space if the model is merely slow to load; later
    # requests will surface the failure if vLLM never comes up.
    print("WARNING: vLLM did not report healthy within the wait window.")

# Start vLLM immediately
start_vllm()
 
 
 
 
 
54
 
55
# --- STEP 2: SETUP FASTAPI PROXY ---
app = FastAPI()

# Forwards any /v1/* request (e.g. from Docling or an OpenAI SDK client)
# to the internal vLLM server and streams the response straight back.
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_vllm(path: str, request: Request):
    target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"

    # BUG FIX: the client must outlive this function.  The response body is
    # consumed lazily by StreamingResponse *after* we return, so wrapping the
    # client in `async with` (as before) closed it and killed the stream
    # mid-flight.  Instead, close response and client when the stream ends.
    client = httpx.AsyncClient(timeout=120.0)  # long timeout for OCR
    proxy_req = client.build_request(
        request.method,
        target_url,
        headers=request.headers.raw,
        content=await request.body(),
    )

    try:
        upstream = await client.send(proxy_req, stream=True)
    except Exception:
        # Connection to vLLM failed before streaming began; release the
        # client and let FastAPI turn the error into a 500.
        await client.aclose()
        raise

    async def body_stream():
        # Relay raw chunks, then release the response and the client once
        # the downstream consumer finishes (or disconnects).
        try:
            async for chunk in upstream.aiter_raw():
                yield chunk
        finally:
            await upstream.aclose()
            await client.aclose()

    return StreamingResponse(
        body_stream(),
        status_code=upstream.status_code,
        headers=upstream.headers,
    )
82
+
83
# --- STEP 3: GRADIO UI (Optional, but good for testing) ---
def run_ui_test(image, prompt=""):
    """Send a PIL *image* plus *prompt* to the internal vLLM server.

    BUG FIX: `prompt` now defaults to "" — the Gradio click handler invokes
    this with only the image, which previously raised a TypeError on every
    click.  An empty prompt falls back to a generic markdown instruction.
    Returns the model's text completion.
    """
    # This talks to the INTERNAL vLLM.
    client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")

    # JPEG cannot store an alpha channel, so normalize to RGB first —
    # uploaded images are often RGBA PNGs and `.save(..., "JPEG")` would
    # raise on those.
    buffered = BytesIO()
    image.convert("RGB").save(buffered, format="JPEG")
    b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

    if not prompt:
        prompt = "Convert to markdown."

    completion = client.chat.completions.create(
        model=MODEL_ID,
        messages=[{"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
        ]}],
        max_tokens=2048,
    )
    return completion.choices[0].message.content
 
104
 
 
105
# Minimal test UI; the real API surface is the /v1 proxy above.
with gr.Blocks() as demo:
    gr.Markdown("# NuMarkdown vLLM API Server")
    with gr.Row():
        img = gr.Image(type="pil")
        instruction = gr.Textbox(value="Convert to markdown.", label="Instruction")
        btn = gr.Button("Test Internal Inference")
        out = gr.Textbox()
    # BUG FIX: run_ui_test takes (image, prompt); the handler previously
    # supplied only the image, raising a TypeError on every click.  Pass the
    # instruction textbox as the second input.
    btn.click(run_ui_test, inputs=[img, instruction], outputs=[out])

# Mount Gradio to the root URL
app = gr.mount_gradio_app(app, demo, path="/")

# --- STEP 4: RUN EVERYTHING ON PORT 7860 ---
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)
requirements.txt CHANGED
@@ -1,25 +1,6 @@
1
- # --- Critical Version Pins (Fixes Dependency Conflicts) ---
2
- fsspec<=2025.10.0
3
- huggingface-hub>=0.24.0
4
-
5
- # --- Model Support (Qwen2.5-VL / NuMarkdown-8B) ---
6
- # We need the bleeding edge Transformers for this model
7
- git+https://github.com/huggingface/transformers.git
8
- accelerate>=0.26.0
9
- bitsandbytes
10
- qwen-vl-utils
11
-
12
- # --- App & ZeroGPU ---
13
  gradio>=4.0.0
14
- spaces
15
-
16
- # --- Missing Dependencies Fix ---
17
- # Explicitly added because Gradio's CLI sometimes misses it
18
- typer
19
- rich
20
- click
21
-
22
- # --- Standard Utilities ---
23
- torch
24
- torchvision
25
- pillow
 
1
+ vllm>=0.7.2
 
 
 
 
 
 
 
 
 
 
 
2
  gradio>=4.0.0
3
+ openai
4
+ fastapi
5
+ uvicorn
6
+ httpx