# NOTE(review): the three lines below the shebang position were stray
# commit-log text ("encryptd" / "fix port issue" / commit 592242d) pasted
# above the imports; they would raise a NameError/SyntaxError at import
# time, so they are preserved here as a comment instead.
import base64
import os
import subprocess
import sys
import time
from io import BytesIO

import gradio as gr
import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.background import BackgroundTask
from fastapi.responses import JSONResponse, StreamingResponse
from openai import APIConnectionError, OpenAI
# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"
GPU_UTILIZATION = 0.90
MAX_MODEL_LEN = 32768
VLLM_PORT = 8000
EXPOSED_PORT = 7860
# --- STEP 1: LAUNCH vLLM (Background) ---
def start_vllm():
if "VLLM_PID" in os.environ:
return
print(f"Starting vLLM server on port {VLLM_PORT}...")
# JSON formatted limit string to fix parsing error
limit_mm_config = '{"image": 1}'
command = [
"vllm", "serve", MODEL_ID,
"--host", "0.0.0.0",
"--port", str(VLLM_PORT),
"--trust-remote-code",
"--gpu-memory-utilization", str(GPU_UTILIZATION),
"--max-model-len", str(MAX_MODEL_LEN),
"--dtype", "bfloat16",
"--limit-mm-per-prompt", limit_mm_config
]
# Redirect stdout/stderr to see download progress
proc = subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
os.environ["VLLM_PID"] = str(proc.pid)
# We do NOT block here anymore. We let vLLM load in the background
# while the UI starts. This allows you to see the UI immediately.
print("vLLM started in background. Please wait for model download...")
start_vllm()
# --- STEP 2: FASTAPI PROXY ---
app = FastAPI()
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_vllm(path: str, request: Request):
target_url = f"http://localhost:{VLLM_PORT}/v1/{path}"
async with httpx.AsyncClient() as client:
try:
proxy_req = client.build_request(
request.method,
target_url,
headers=request.headers.raw,
content=await request.body(),
timeout=300.0
)
r = await client.send(proxy_req, stream=True)
return StreamingResponse(
r.aiter_raw(),
status_code=r.status_code,
headers=r.headers,
background=None
)
except httpx.ConnectError:
return JSONResponse(status_code=503, content={"error": "Model is still loading. Please wait."})
# --- STEP 3: GRADIO UI ---
def run_ui_test(image, prompt):
if image is None:
return "⚠️ Please upload an image first."
client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
# Encode Image
try:
buffered = BytesIO()
image.save(buffered, format="JPEG")
b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
except Exception as e:
return f"Error processing image: {e}"
if not prompt: prompt = "Convert to markdown."
try:
completion = client.chat.completions.create(
model=MODEL_ID,
messages=[{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
]}],
max_tokens=4096
)
return completion.choices[0].message.content
except APIConnectionError:
return "⏳ Model is still downloading/loading... Check the 'Logs' tab. This takes 2-3 minutes on a fresh GPU."
except Exception as e:
return f"Error: {str(e)}"
with gr.Blocks() as demo:
gr.Markdown("# NuMarkdown L40S vLLM Server")
gr.Markdown("Status: If you just started this Space, wait 3 minutes for weights to download.")
with gr.Row():
with gr.Column():
img_input = gr.Image(type="pil", label="Document")
# FIXED: Added the missing prompt input
txt_input = gr.Textbox(value="Convert to markdown.", label="Prompt")
btn = gr.Button("Test Inference")
with gr.Column():
out = gr.Textbox(label="Output")
# FIXED: Passed both inputs [img_input, txt_input]
btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=EXPOSED_PORT)