"""
HF Spaces (Docker SDK) app
- Launches vLLM (OpenAI-compatible) on localhost:API_PORT
- FastAPI proxies /v1/* → vLLM (so clients can use OpenAI SDK / LangChain)
- Gradio UI at "/"
- Defaults for A10G 24GB (Qwen 2.5 14B AWQ, 8k context)
"""

import os
import subprocess
import threading
import time

import requests
from fastapi import FastAPI, Request, Response
import gradio as gr

os.environ["VLLM_USE_OUTLINES"] = "0"   # turn off outlines so pyairports isn't imported

MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
API_PORT = int(os.environ.get("API_PORT", "8000"))  # vLLM internal port
SYSTEM_PROMPT = os.environ.get(
    "SYSTEM_PROMPT",
    "You are ExCom AI, a professional assistant that answers precisely and clearly."
)
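# These defaults can be overridden from the Space's environment variables
# (hypothetical example values):
#   MODEL_ID=Qwen/Qwen2.5-7B-Instruct-AWQ
#   SYSTEM_PROMPT="You are ExCom AI, ..."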

VLLM_ARGS = [
    "python3", "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL_ID,
    "--host", "0.0.0.0",
    "--port", str(API_PORT),
    "--served-model-name", "excom-ai",
    "--max-model-len", "8192",               # fits A10G 24GB
    "--gpu-memory-utilization", "0.90",
    "--trust-remote-code",
]
if "AWQ" in MODEL_ID.upper():
    VLLM_ARGS += ["--quantization", "awq_marlin"]  # faster AWQ kernel if available

def launch_vllm():
    print(f"[vLLM] Launch: {MODEL_ID}")
    # Debug: check if pyairports is available
    try:
        import pyairports
        print(f"[DEBUG] pyairports found: {pyairports.__file__}")
    except ImportError as e:
        print(f"[DEBUG] pyairports NOT found: {e}")

    env = os.environ.copy()
    env["VLLM_USE_OUTLINES"] = "0"  # disable outlines to avoid pyairports import
    subprocess.Popen(VLLM_ARGS, env=env)

def wait_vllm_ready(timeout=900, interval=3):
    url = f"http://127.0.0.1:{API_PORT}/v1/models"
    start = time.time()
    while time.time() - start < timeout:
        try:
            r = requests.get(url, timeout=3)
            if r.ok:
                print("[vLLM] Ready.")
                return True
        except Exception:
            pass
        time.sleep(interval)
    print("[vLLM] Not ready in time.")
    return False

# Start vLLM in the background; the second thread only logs when it becomes
# ready, so the FastAPI/Gradio app below can start serving immediately.
threading.Thread(target=launch_vllm, daemon=True).start()
threading.Thread(target=wait_vllm_ready, daemon=True).start()

app = FastAPI()

@app.get("/health")
def health():
    try:
        r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=2)
        return {"upstream_ok": r.ok}
    except Exception as e:
        return {"upstream_ok": False, "error": str(e)}

@app.get("/v1/models")
def proxy_models():
    r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=30)
    return Response(
        content=r.content,
        media_type=r.headers.get("content-type", "application/json"),
        status_code=r.status_code,
    )

@app.post("/v1/chat/completions")
async def proxy_chat(req: Request):
    body = await req.body()
    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
                      data=body,
                      headers={"Content-Type": "application/json"},
                      timeout=600)
    return Response(
        content=r.content,
        media_type=r.headers.get("content-type", "application/json"),
        status_code=r.status_code,
    )
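# Illustrative request against the proxy (hypothetical Space URL); note that the
# handler above buffers the full upstream response, so streamed ("stream": true)
# requests come back in one piece rather than incrementally:
#   curl -X POST https://<your-space>.hf.space/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "excom-ai", "messages": [{"role": "user", "content": "Hello"}]}'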

# -------- Gradio (messages mode) --------
_ready = {"ok": False}
def ensure_ready():
    if _ready["ok"]:
        return True
    if wait_vllm_ready(timeout=60):
        _ready["ok"] = True
        return True
    return False

def chat_fn(user_message: str, history: list[dict]):
    if not ensure_ready():
        return "⏳ Model is loading… please retry shortly."
    # Keep only role/content so extra Gradio message fields (e.g. metadata) aren't forwarded upstream.
    past = [{"role": m["role"], "content": m["content"]} for m in history]
    messages = [{"role": "system", "content": SYSTEM_PROMPT}] + past + [{"role": "user", "content": user_message}]
    payload = {"model": "excom-ai", "messages": messages, "temperature": 0.4}
    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions", json=payload, timeout=600)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]

ui = gr.ChatInterface(fn=chat_fn, title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)", type="messages")
ui.queue()
app = gr.mount_gradio_app(app, ui, path="/")