Spaces:
Build error
Build error
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -2,18 +2,30 @@ import os
|
|
| 2 |
import subprocess
|
| 3 |
import time
|
| 4 |
import requests
|
| 5 |
-
from fastapi import FastAPI, HTTPException, Request
|
|
|
|
| 6 |
from pydantic import BaseModel
|
| 7 |
from typing import List, Optional, Any
|
| 8 |
import torch
|
| 9 |
|
| 10 |
-
app = FastAPI(title="
|
| 11 |
|
| 12 |
# Configuration
|
| 13 |
MODEL_ID = "Qwen/Qwen3-VL-8B-Thinking"
|
| 14 |
SGLANG_PORT = 30000
|
| 15 |
SGLANG_HOST = "127.0.0.1"
|
| 16 |
SGLANG_URL = f"http://{SGLANG_HOST}:{SGLANG_PORT}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Global process for SGLang server
|
| 19 |
sglang_process = None
|
|
@@ -23,24 +35,20 @@ def start_sglang():
|
|
| 23 |
if sglang_process is None:
|
| 24 |
print(f"Starting SGLang server for {MODEL_ID}...")
|
| 25 |
|
| 26 |
-
# Command to start SGLang server
|
| 27 |
-
# Using --chat-template qwen2-vl — NOTE(review): this comment previously claimed the model was Fara-7B, but MODEL_ID is Qwen/Qwen3-VL-8B-Thinking; confirm the qwen2-vl template is appropriate for it
|
| 28 |
cmd = [
|
| 29 |
"python3", "-m", "sglang.launch_server",
|
| 30 |
"--model-path", MODEL_ID,
|
| 31 |
"--host", SGLANG_HOST,
|
| 32 |
"--port", str(SGLANG_PORT),
|
| 33 |
-
"--chat-template", "qwen2-vl",
|
| 34 |
"--trust-remote-code"
|
| 35 |
]
|
| 36 |
|
| 37 |
-
# Check GPU availability for tensor parallel
|
| 38 |
if torch.cuda.device_count() > 1:
|
| 39 |
cmd.extend(["--tp", str(torch.cuda.device_count())])
|
| 40 |
|
| 41 |
sglang_process = subprocess.Popen(cmd)
|
| 42 |
|
| 43 |
-
# Wait for server to be ready
|
| 44 |
max_retries = 60
|
| 45 |
for i in range(max_retries):
|
| 46 |
try:
|
|
@@ -57,7 +65,6 @@ def start_sglang():
|
|
| 57 |
|
| 58 |
@app.on_event("startup")
async def startup_event():
    """On FastAPI startup, launch the SGLang server without blocking app boot.

    start_sglang blocks while it polls the server for readiness, so it runs in
    a background thread; daemon=True lets the process exit without joining it.
    """
    # Start SGLang in the background
    import threading
    threading.Thread(target=start_sglang, daemon=True).start()
|
| 63 |
|
|
@@ -78,7 +85,7 @@ class MessageRequest(BaseModel):
|
|
| 78 |
|
| 79 |
@app.get("/")
|
| 80 |
async def root():
|
| 81 |
-
return {"message": "
|
| 82 |
|
| 83 |
@app.get("/health")
|
| 84 |
async def health():
|
|
@@ -91,9 +98,8 @@ async def health():
|
|
| 91 |
return {"status": "starting", "backend": "sglang"}
|
| 92 |
|
| 93 |
@app.post("/v1/responses")
|
| 94 |
-
async def generate_response(request: ResponseRequest):
|
| 95 |
try:
|
| 96 |
-
# Map /v1/responses to SGLang's completions or chat completions
|
| 97 |
payload = {
|
| 98 |
"model": MODEL_ID,
|
| 99 |
"prompt": request.prompt,
|
|
@@ -108,9 +114,8 @@ async def generate_response(request: ResponseRequest):
|
|
| 108 |
raise HTTPException(status_code=500, detail=str(e))
|
| 109 |
|
| 110 |
@app.post("/v1/messages")
|
| 111 |
-
async def generate_message(request: MessageRequest):
|
| 112 |
try:
|
| 113 |
-
# Map /v1/messages to SGLang's chat completions
|
| 114 |
payload = {
|
| 115 |
"model": MODEL_ID,
|
| 116 |
"messages": [m.dict() for m in request.messages],
|
|
@@ -124,12 +129,12 @@ async def generate_message(request: MessageRequest):
|
|
| 124 |
except Exception as e:
|
| 125 |
raise HTTPException(status_code=500, detail=str(e))
|
| 126 |
|
| 127 |
-
# Proxy other OpenAI compatible requests to SGLang
|
| 128 |
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
|
| 129 |
-
async def proxy_openai(path: str, request: Request):
|
| 130 |
url = f"{SGLANG_URL}/v1/{path}"
|
| 131 |
method = request.method
|
| 132 |
-
headers = {k: v for k, v in request.headers.items() if k.lower()
|
| 133 |
body = await request.body()
|
| 134 |
|
| 135 |
try:
|
|
|
|
| 2 |
import subprocess
|
| 3 |
import time
|
| 4 |
import requests
|
| 5 |
+
from fastapi import FastAPI, HTTPException, Request, Depends
|
| 6 |
+
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 7 |
from pydantic import BaseModel
|
| 8 |
from typing import List, Optional, Any
|
| 9 |
import torch
|
| 10 |
|
| 11 |
+
app = FastAPI(title="Qwen3-VL SGLang API with Auth")
|
| 12 |
|
| 13 |
# Configuration
|
| 14 |
MODEL_ID = "Qwen/Qwen3-VL-8B-Thinking"
|
| 15 |
SGLANG_PORT = 30000
|
| 16 |
SGLANG_HOST = "127.0.0.1"
|
| 17 |
SGLANG_URL = f"http://{SGLANG_HOST}:{SGLANG_PORT}"
|
| 18 |
+
import hmac

# Bearer token required by all /v1/ endpoints.
# SECURITY: read from the API_KEY environment variable when set; the hardcoded
# literal is kept only as a backward-compatible fallback and should not ship
# in production (rotate it and configure the env var instead).
API_KEY = os.environ.get("API_KEY", "sk-sheikh545466")

# HTTPBearer parses "Authorization: Bearer <token>" and rejects requests
# that lack the header before verify_token runs.
security = HTTPBearer()

def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """FastAPI dependency that validates the request's bearer token.

    Returns the token string on success; raises HTTP 403 on mismatch.
    Uses hmac.compare_digest for a constant-time comparison so the check
    does not leak key bytes through response timing.
    """
    if not hmac.compare_digest(credentials.credentials, API_KEY):
        raise HTTPException(
            status_code=403,
            detail="Invalid or missing API Key"
        )
    return credentials.credentials
|
| 29 |
|
| 30 |
# Global process for SGLang server
|
| 31 |
sglang_process = None
|
|
|
|
| 35 |
if sglang_process is None:
|
| 36 |
print(f"Starting SGLang server for {MODEL_ID}...")
|
| 37 |
|
|
|
|
|
|
|
| 38 |
cmd = [
|
| 39 |
"python3", "-m", "sglang.launch_server",
|
| 40 |
"--model-path", MODEL_ID,
|
| 41 |
"--host", SGLANG_HOST,
|
| 42 |
"--port", str(SGLANG_PORT),
|
| 43 |
+
"--chat-template", "qwen2-vl",
|
| 44 |
"--trust-remote-code"
|
| 45 |
]
|
| 46 |
|
|
|
|
| 47 |
if torch.cuda.device_count() > 1:
|
| 48 |
cmd.extend(["--tp", str(torch.cuda.device_count())])
|
| 49 |
|
| 50 |
sglang_process = subprocess.Popen(cmd)
|
| 51 |
|
|
|
|
| 52 |
max_retries = 60
|
| 53 |
for i in range(max_retries):
|
| 54 |
try:
|
|
|
|
| 65 |
|
| 66 |
@app.on_event("startup")
async def startup_event():
    """Kick off the SGLang server in a daemon thread so app boot is not blocked."""
    # NOTE(review): @app.on_event is deprecated in recent FastAPI releases —
    # consider migrating to a lifespan handler when this module is next touched.
    from threading import Thread

    Thread(target=start_sglang, daemon=True).start()
|
| 70 |
|
|
|
|
| 85 |
|
| 86 |
@app.get("/")
async def root():
    """Public landing endpoint; reports service status. No auth required."""
    status_message = (
        "Qwen3-VL SGLang API is running. Auth required for /v1/ endpoints."
    )
    return {"message": status_message}
|
| 89 |
|
| 90 |
@app.get("/health")
|
| 91 |
async def health():
|
|
|
|
| 98 |
return {"status": "starting", "backend": "sglang"}
|
| 99 |
|
| 100 |
@app.post("/v1/responses")
|
| 101 |
+
async def generate_response(request: ResponseRequest, token: str = Depends(verify_token)):
|
| 102 |
try:
|
|
|
|
| 103 |
payload = {
|
| 104 |
"model": MODEL_ID,
|
| 105 |
"prompt": request.prompt,
|
|
|
|
| 114 |
raise HTTPException(status_code=500, detail=str(e))
|
| 115 |
|
| 116 |
@app.post("/v1/messages")
|
| 117 |
+
async def generate_message(request: MessageRequest, token: str = Depends(verify_token)):
|
| 118 |
try:
|
|
|
|
| 119 |
payload = {
|
| 120 |
"model": MODEL_ID,
|
| 121 |
"messages": [m.dict() for m in request.messages],
|
|
|
|
| 129 |
except Exception as e:
|
| 130 |
raise HTTPException(status_code=500, detail=str(e))
|
| 131 |
|
| 132 |
+
# Proxy other OpenAI compatible requests to SGLang
|
| 133 |
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
|
| 134 |
+
async def proxy_openai(path: str, request: Request, token: str = Depends(verify_token)):
|
| 135 |
url = f"{SGLANG_URL}/v1/{path}"
|
| 136 |
method = request.method
|
| 137 |
+
headers = {k: v for k, v in request.headers.items() if k.lower() not in ["host", "authorization"]}
|
| 138 |
body = await request.body()
|
| 139 |
|
| 140 |
try:
|