encryptd committed on
Commit
592242d
·
1 Parent(s): 6a5b9e1

fix port issue

Browse files
Files changed (1) hide show
  1. app.py +74 -64
app.py CHANGED
@@ -1,29 +1,33 @@
1
  import os
2
  import subprocess
3
  import time
 
4
  import httpx
5
  from fastapi import FastAPI, Request
6
- from fastapi.responses import StreamingResponse, JSONResponse
7
  import uvicorn
8
  import gradio as gr
9
- from openai import OpenAI
10
  import base64
11
  from io import BytesIO
12
 
13
  # --- CONFIGURATION ---
14
  MODEL_ID = "numind/NuMarkdown-8B-Thinking"
15
- GPU_UTILIZATION = 0.9
16
  MAX_MODEL_LEN = 32768
17
- VLLM_PORT = 8000 # Internal port for vLLM
18
- EXPOSED_PORT = 7860 # External port (Hugging Face default)
19
 
20
- # --- STEP 1: LAUNCH vLLM IN BACKGROUND ---
21
  def start_vllm():
22
  if "VLLM_PID" in os.environ:
23
- print("vLLM already running.")
24
  return
25
 
26
  print(f"Starting vLLM server on port {VLLM_PORT}...")
 
 
 
 
27
  command = [
28
  "vllm", "serve", MODEL_ID,
29
  "--host", "0.0.0.0",
@@ -32,87 +36,93 @@ def start_vllm():
32
  "--gpu-memory-utilization", str(GPU_UTILIZATION),
33
  "--max-model-len", str(MAX_MODEL_LEN),
34
  "--dtype", "bfloat16",
35
- "--limit-mm-per-prompt", "image=1",
36
  ]
37
- proc = subprocess.Popen(command)
 
 
38
  os.environ["VLLM_PID"] = str(proc.pid)
39
 
40
- # Wait for vLLM to be ready
41
- print("Waiting for vLLM to load...")
42
- for i in range(30):
43
- try:
44
- # Quick health check
45
- httpx.get(f"http://localhost:{VLLM_PORT}/health")
46
- print("vLLM is READY!")
47
- return
48
- except:
49
- time.sleep(10)
50
- print(f"Loading... {i*10}s")
51
 
52
- # Start vLLM immediately
53
  start_vllm()
54
 
55
- # --- STEP 2: SETUP FASTAPI PROXY ---
56
  app = FastAPI()
57
 
58
- # This is the magic function that forwards Docling's requests to vLLM
59
  @app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
60
  async def proxy_to_vllm(path: str, request: Request):
61
  target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"
62
-
63
  async with httpx.AsyncClient() as client:
64
- # Forward the request to vLLM
65
- proxy_req = client.build_request(
66
- request.method,
67
- target_url,
68
- headers=request.headers.raw,
69
- content=await request.body(),
70
- timeout=120.0 # Long timeout for OCR
71
- )
72
-
73
- # Send to vLLM and stream the response back to the internet
74
- r = await client.send(proxy_req, stream=True)
75
-
76
- return StreamingResponse(
77
- r.aiter_raw(),
78
- status_code=r.status_code,
79
- headers=r.headers,
80
- background=None
81
- )
82
 
83
- # --- STEP 3: GRADIO UI (Optional, but good for testing) ---
84
  def run_ui_test(image, prompt):
85
- # This talks to the INTERNAL vLLM
 
 
86
  client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
87
 
88
- # Encode image simple logic
89
- buffered = BytesIO()
90
- image.save(buffered, format="JPEG")
91
- b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
 
 
 
92
 
93
  if not prompt: prompt = "Convert to markdown."
94
 
95
- completion = client.chat.completions.create(
96
- model=MODEL_ID,
97
- messages=[{"role": "user", "content": [
98
- {"type": "text", "text": prompt},
99
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
100
- ]}],
101
- max_tokens=2048
102
- )
103
- return completion.choices[0].message.content
 
 
 
 
 
104
 
105
  with gr.Blocks() as demo:
106
- gr.Markdown("# NuMarkdown vLLM API Server")
 
 
107
  with gr.Row():
108
- img = gr.Image(type="pil")
109
- btn = gr.Button("Test Internal Inference")
110
- out = gr.Textbox()
111
- btn.click(run_ui_test, inputs=[img], outputs=[out])
 
 
 
 
 
 
112
 
113
- # Mount Gradio to the root URL
114
  app = gr.mount_gradio_app(app, demo, path="/")
115
 
116
- # --- STEP 4: RUN EVERYTHING ON PORT 7860 ---
117
  if __name__ == "__main__":
118
  uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)
 
1
import base64
import os
import subprocess
import sys
import time
from io import BytesIO

import gradio as gr
import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from openai import OpenAI, APIConnectionError
13
 
14
# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"  # HF model id served by vLLM
GPU_UTILIZATION = 0.90  # fraction of GPU memory vLLM may claim
MAX_MODEL_LEN = 32768  # max context length passed to vLLM
VLLM_PORT = 8000  # internal port the vLLM server listens on
EXPOSED_PORT = 7860  # external port (Hugging Face Spaces default)
20
 
21
+ # --- STEP 1: LAUNCH vLLM (Background) ---
22
  def start_vllm():
23
  if "VLLM_PID" in os.environ:
 
24
  return
25
 
26
  print(f"Starting vLLM server on port {VLLM_PORT}...")
27
+
28
+ # JSON formatted limit string to fix parsing error
29
+ limit_mm_config = '{"image": 1}'
30
+
31
  command = [
32
  "vllm", "serve", MODEL_ID,
33
  "--host", "0.0.0.0",
 
36
  "--gpu-memory-utilization", str(GPU_UTILIZATION),
37
  "--max-model-len", str(MAX_MODEL_LEN),
38
  "--dtype", "bfloat16",
39
+ "--limit-mm-per-prompt", limit_mm_config
40
  ]
41
+
42
+ # Redirect stdout/stderr to see download progress
43
+ proc = subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
44
  os.environ["VLLM_PID"] = str(proc.pid)
45
 
46
+ # We do NOT block here anymore. We let vLLM load in the background
47
+ # while the UI starts. This allows you to see the UI immediately.
48
+ print("vLLM started in background. Please wait for model download...")
 
 
 
 
 
 
 
 
49
 
 
50
# Kick off the vLLM subprocess at import time; it loads in the background
# while the web app below starts serving immediately.
start_vllm()

# --- STEP 2: FASTAPI PROXY ---
app = FastAPI()
54
 
 
55
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_vllm(path: str, request: Request):
    """Reverse-proxy any /v1/* request to the internal vLLM server.

    Forwards the method, headers and body verbatim and streams the upstream
    response back to the caller.  While vLLM is still loading (connection
    refused) a 503 JSON error is returned instead of an unhandled exception.

    FIXES over the previous revision:
    - `JSONResponse` was used but its import had been removed, so the
      "model still loading" path raised NameError instead of returning 503.
    - The `async with httpx.AsyncClient()` block closed the client as soon
      as this handler returned, but StreamingResponse consumes the body
      *after* return — the stream was killed mid-flight.  The client now
      stays open until the relay generator finishes.
    """
    target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"

    # Long timeout: OCR / markdown conversion jobs can take minutes.
    client = httpx.AsyncClient(timeout=300.0)
    try:
        proxy_req = client.build_request(
            request.method,
            target_url,
            headers=request.headers.raw,
            content=await request.body(),
        )
        upstream = await client.send(proxy_req, stream=True)
    except httpx.ConnectError:
        await client.aclose()
        return JSONResponse(status_code=503, content={"error": "Model is still loading. Please wait."})

    async def relay():
        # Stream upstream chunks, then release the response and the client.
        try:
            async for chunk in upstream.aiter_raw():
                yield chunk
        finally:
            await upstream.aclose()
            await client.aclose()

    return StreamingResponse(
        relay(),
        status_code=upstream.status_code,
        headers=upstream.headers,
    )
 
76
 
77
# --- STEP 3: GRADIO UI ---
def run_ui_test(image, prompt):
    """Send *image* (a PIL image) plus *prompt* to the internal vLLM server.

    Returns the model's markdown output, or a human-readable error string on
    any failure (no image, encoding error, server still loading, ...).
    """
    if image is None:
        return "⚠️ Please upload an image first."

    # Talks to the INTERNAL vLLM endpoint via its OpenAI-compatible API.
    client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")

    # Encode the image as base64 JPEG for the data: URL below.
    try:
        buffered = BytesIO()
        # FIX: JPEG cannot store an alpha channel, so RGBA/paletted uploads
        # (typical PNG screenshots) used to fail here — convert to RGB first.
        image.convert("RGB").save(buffered, format="JPEG")
        b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        return f"Error processing image: {e}"

    if not prompt:
        prompt = "Convert to markdown."

    try:
        completion = client.chat.completions.create(
            model=MODEL_ID,
            messages=[{"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
            ]}],
            max_tokens=4096
        )
        return completion.choices[0].message.content
    except APIConnectionError:
        return "⏳ Model is still downloading/loading... Check the 'Logs' tab. This takes 2-3 minutes on a fresh GPU."
    except Exception as e:
        return f"Error: {str(e)}"
108
 
109
# Build the Gradio test UI: image + prompt on the left, model output on the
# right.  Wired to run_ui_test(), which calls the internal vLLM endpoint.
with gr.Blocks() as demo:
    gr.Markdown("# NuMarkdown L40S vLLM Server")
    gr.Markdown("Status: If you just started this Space, wait 3 minutes for weights to download.")

    with gr.Row():
        with gr.Column():
            img_input = gr.Image(type="pil", label="Document")
            # FIXED: Added the missing prompt input
            txt_input = gr.Textbox(value="Convert to markdown.", label="Prompt")
            btn = gr.Button("Test Inference")
        with gr.Column():
            out = gr.Textbox(label="Output")

    # FIXED: Passed both inputs [img_input, txt_input]
    btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])
124
 
 
125
# Mount the Gradio UI at the root URL of the FastAPI app; the /v1/* proxy
# route registered above keeps working alongside it.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    # Serve both the proxy (/v1/*) and the UI (/) on the single exposed port.
    uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)