# QWEN-3_5-CHAT / app.py -- Hugging Face Space (author: mayafree)
# Commit: "Update app.py" 0454a0a (verified)
# Boot sequence: log the interpreter and key library versions, and import
# dependencies in two guarded stages so that a missing package produces a
# single readable "[BOOT] FATAL" log line in the Space logs instead of a
# bare traceback.
import sys
print(f"[BOOT] Python {sys.version}", flush=True)

import base64
import os
import re
from typing import Generator, Optional

try:
    import gradio as gr
    print(f"[BOOT] gradio {gr.__version__}", flush=True)
except ImportError as e:
    print(f"[BOOT] FATAL: {e}", flush=True)
    sys.exit(1)

try:
    from huggingface_hub import InferenceClient
    import httpx
    import uvicorn
    from fastapi import FastAPI, Request
    from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
    print("[BOOT] All imports OK", flush=True)
except ImportError as e:
    print(f"[BOOT] FATAL: {e} β€” add to requirements.txt", flush=True)
    sys.exit(1)
# ══════════════════════════════════════════════════════════════════════════════
# 1. MODEL CAPABILITY MATRIX
# ══════════════════════════════════════════════════════════════════════════════
# Capability matrix for the selectable Qwen3.5 models.
# generate_reply() uses it to clamp max_tokens/temperature and to gate the
# thinking/vision features per model; "color", "badge" and "desc" are pure
# presentation strings consumed by the front-end.
MODEL_CAPS: dict[str, dict] = {
    "Qwen/Qwen3.5-122B-A10B": {
        "arch": "MoE", "active": "10B / 122B total",
        "ctx": "262K β†’ 1M", "thinking": True, "vision": True,
        "max_tokens": 8192, "temp_max": 2.0, "top_p": True,
        "color": "#7c3aed",
        "badge": "πŸ† Best Overall Β· BFCL 72.2 Β· GPQA 86.6 Β· SWE 72.0",
        "desc": "Top reasoning & agents Β· Complex math Β· Long context",
    },
    "Qwen/Qwen3.5-27B": {
        "arch": "Dense", "active": "27B (all active)",
        "ctx": "262K β†’ 1M", "thinking": True, "vision": True,
        "max_tokens": 8192, "temp_max": 2.0, "top_p": True,
        "color": "#0d9488",
        "badge": "🎯 Dense #1 · IFEval 95.0 · SWE 72.4 · PolyMATH 71.2",
        "desc": "Instruction king Β· Creative writing Β· 201 languages",
    },
    "Qwen/Qwen3.5-35B-A3B": {
        "arch": "MoE", "active": "3B / 35B total",
        "ctx": "262K β†’ 1M", "thinking": True, "vision": True,
        "max_tokens": 4096, "temp_max": 1.5, "top_p": True,
        "color": "#d97706",
        "badge": "⚑ Flash Speed · TAU2 81.2 · MMLU-Pro 85.3",
        "desc": "Fastest Β· 3B active params Β· ~6Γ— faster than 27B",
    }
}
MODEL_IDS = list(MODEL_CAPS.keys())   # model ids in declaration order (dropdown choices)
DEFAULT_MODEL = MODEL_IDS[0]          # first entry (the 122B MoE) is the default selection
# ══════════════════════════════════════════════════════════════════════════════
# 2. SYSTEM PROMPT PRESETS
# ══════════════════════════════════════════════════════════════════════════════
# System-prompt presets; keys are the preset ids the front-end sends, values
# are the raw system messages passed through to the model unchanged.
PRESETS = {
    "general": "You are a helpful, harmless, and honest AI assistant.",
    "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
    "math": "You are a world-class mathematician. Break problems step-by-step. Show full working. Use LaTeX where helpful.",
    "creative": "You are a brilliant creative writer. Be imaginative, vivid, and engaging. Adapt tone and style to the request.",
    "translate": "You are a professional translator fluent in 201 languages. Provide accurate, natural-sounding translations with cultural context.",
    "research": "You are a rigorous research analyst. Provide structured, well-reasoned analysis. Identify assumptions and acknowledge uncertainty.",
}
# ══════════════════════════════════════════════════════════════════════════════
# 3. THINKING MODE HELPERS
# ══════════════════════════════════════════════════════════════════════════════
def build_user_message(text: str, thinking: bool) -> str:
    """Prefix *text* with the Qwen thinking-mode control tag.

    Returns "/think\\n" + text when *thinking* is true, "/no_think\\n" + text
    otherwise.
    """
    if thinking:
        tag = "/think\n"
    else:
        tag = "/no_think\n"
    return tag + text
def parse_think_blocks(text: str) -> tuple[str, str]:
    """Split *text* around its first <think>...</think> block.

    Returns a (reasoning, answer) pair; reasoning is "" when no block is
    present, in which case the answer is the input unchanged.
    """
    match = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
    if match is None:
        return "", text
    reasoning = match.group(1).strip()
    answer = text[match.end():].strip()
    return reasoning, answer


def format_response(raw: str) -> str:
    """Render a raw model reply as markdown.

    Any <think> reasoning chain is folded into a collapsible <details>
    element as a quoted block; replies without a chain pass through as-is.
    """
    chain, answer = parse_think_blocks(raw)
    if not chain:
        return raw
    quoted = "\n".join("> " + line for line in chain.split("\n"))
    header = (
        "<details>\n"
        "<summary>🧠 Reasoning Chain β€” click to expand</summary>\n\n"
    )
    footer = "\n\n</details>\n\n"
    return header + quoted + footer + answer
# ══════════════════════════════════════════════════════════════════════════════
# 4. STREAMING BACKEND
# ══════════════════════════════════════════════════════════════════════════════
def generate_reply(
    message: str,
    history: list,
    model_id: str,
    thinking_mode: str,
    image_input,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> Generator[str, None, None]:
    """Stream a chat completion for *message*, yielding progressively
    formatted markdown (reasoning chains folded via format_response).

    history may be messages-style dicts or legacy (user, assistant) tuples;
    image_input may be a base64 data-URL string (JS front-end), a PIL image,
    or a numpy array (legacy Gradio). Generation knobs are clamped to the
    per-model limits in MODEL_CAPS. On failure a markdown error message is
    yielded instead of raising.
    """
    token = os.getenv("HF_TOKEN")
    client = InferenceClient(token=token, timeout=120)
    cap = MODEL_CAPS[model_id]
    use_think = "Thinking" in thinking_mode and cap["thinking"]
    # Clamp request parameters to what the chosen model supports.
    max_new_tokens = min(int(max_new_tokens), cap["max_tokens"])
    temperature = min(float(temperature), cap["temp_max"])

    messages: list[dict] = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})

    def _as_text(value):
        # Normalize Gradio message content (plain str or a list of typed
        # parts) to a single text string; None stays None.
        if value is None:
            return None
        if isinstance(value, list):
            return " ".join(p.get("text", "") for p in value
                            if isinstance(p, dict) and p.get("type") == "text")
        return str(value)

    for turn in history:
        if isinstance(turn, dict):
            # Messages-style history: {"role": ..., "content": ...}
            role = turn.get("role", "")
            text = _as_text(turn.get("content") or "") or ""
            if role == "user":
                messages.append({"role": "user", "content": text})
            elif role == "assistant":
                # Strip old reasoning chains so they don't re-enter the context.
                _, clean = parse_think_blocks(text)
                messages.append({"role": "assistant", "content": clean})
        else:
            # Legacy (user, assistant) tuple history.
            try:
                u, a = (turn[0] or None), (turn[1] if len(turn) > 1 else None)
            except (IndexError, TypeError):
                continue
            if u := _as_text(u):
                messages.append({"role": "user", "content": u})
            if a := _as_text(a):
                _, clean = parse_think_blocks(a)
                messages.append({"role": "assistant", "content": clean})

    user_text = build_user_message(message, use_think)
    if image_input and cap["vision"]:
        import io
        from PIL import Image as PILImage
        if isinstance(image_input, str) and image_input.startswith("data:"):
            # Already a data URL from the JS front-end: pass it through
            # unchanged so the original MIME type (png/webp/...) is kept.
            # (Previously the payload was re-labelled image/jpeg regardless
            # of the actual encoding.)
            image_url = image_input
        else:
            # PIL image (Gradio UI) or numpy array (legacy Gradio).
            buf = io.BytesIO()
            if not isinstance(image_input, PILImage.Image):
                image_input = PILImage.fromarray(image_input)
            if image_input.mode != "RGB":
                # JPEG cannot encode alpha/palette modes (RGBA, P, LA, ...).
                image_input = image_input.convert("RGB")
            image_input.save(buf, format="JPEG")
            b64 = base64.b64encode(buf.getvalue()).decode()
            image_url = f"data:image/jpeg;base64,{b64}"
        content = [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": user_text},
        ]
    else:
        content = user_text
    messages.append({"role": "user", "content": content})

    try:
        stream = client.chat_completion(
            model=model_id, messages=messages,
            max_tokens=max_new_tokens, temperature=temperature,
            top_p=float(top_p), stream=True,
        )
        raw = ""
        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            if not delta or not delta.content:
                continue
            raw += delta.content
            # Re-format the accumulated text on every token so the UI can
            # show the reasoning chain collapsing live.
            yield format_response(raw)
    except Exception as exc:
        yield (f"**Error:** `{model_id}`\n\n```\n{exc}\n```\n\n"
               "_Check HF\\_TOKEN or try another model._")
# ══════════════════════════════════════════════════════════════════════════════
# 5. GRADIO BLOCKS (hidden – only serves /gradio/gradio_api/call/chat API)
# ══════════════════════════════════════════════════════════════════════════════
# Hidden Gradio app: every widget is invisible because the real front-end is
# a static index.html; this Blocks instance exists only to expose the
# streaming chat endpoint at /gradio/gradio_api/call/chat.
with gr.Blocks(title="Qwen3.5 MultiChat API") as gradio_demo:
    # These hidden inputs mirror generate_reply's extra parameters (after
    # message/history), in the same order as additional_inputs below.
    model_dd = gr.Dropdown(choices=MODEL_IDS, value=DEFAULT_MODEL, visible=False)
    thinking_toggle = gr.Radio(
        choices=["⚑ Fast Mode (direct answer)",
                 "🧠 Thinking Mode (chain-of-thought reasoning)"],
        value="⚑ Fast Mode (direct answer)",
        visible=False,
    )
    image_input = gr.Textbox(value="", visible=False)  # receives base64 data URL from JS
    system_prompt = gr.Textbox(value=PRESETS["general"], visible=False)
    max_new_tokens = gr.Slider(minimum=64, maximum=8192, value=1024, visible=False)
    temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, visible=False)
    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, visible=False)
    gr.ChatInterface(
        fn=generate_reply,
        api_name="chat",
        additional_inputs=[
            model_dd, thinking_toggle, image_input,
            system_prompt, max_new_tokens, temperature, top_p,
        ],
    )
# ══════════════════════════════════════════════════════════════════════════════
# 6. FASTAPI – index.html + HF OAuth + Gradio API
# ══════════════════════════════════════════════════════════════════════════════
# FastAPI wrapper: serves index.html at "/", handles HF OAuth, and (below)
# mounts the Gradio app under /gradio.
import pathlib, secrets

fapp = FastAPI()
SESSIONS: dict[str, dict] = {}  # session_id -> user info (in-memory; lost on restart)
HTML = pathlib.Path(__file__).parent / "index.html"

# -- HF OAuth config (auto-injected by HF Spaces when OAuth is enabled) --
CLIENT_ID = os.getenv("OAUTH_CLIENT_ID", "")
CLIENT_SECRET = os.getenv("OAUTH_CLIENT_SECRET", "")
SPACE_HOST = os.getenv("SPACE_HOST", "localhost:7860")
REDIRECT_URI = f"https://{SPACE_HOST}/login/callback"

# Startup OAuth status log (booleans only -- never print the secret itself).
print(f"[OAuth] CLIENT_ID set: {bool(CLIENT_ID)}")
print(f"[OAuth] CLIENT_SECRET set: {bool(CLIENT_SECRET)}")
print(f"[OAuth] SPACE_HOST: {SPACE_HOST}")
print(f"[OAuth] REDIRECT_URI: {REDIRECT_URI}")

# Hugging Face OAuth endpoints.
HF_AUTH_URL = "https://huggingface.co/oauth/authorize"
HF_TOKEN_URL = "https://huggingface.co/oauth/token"
HF_USER_URL = "https://huggingface.co/oauth/userinfo"
SCOPES = os.getenv("OAUTH_SCOPES", "openid profile")

from urllib.parse import urlencode
def _sid(req: Request) -> Optional[str]:
    """Return the session id from the request's "mc_session" cookie, if set."""
    cookie_value = req.cookies.get("mc_session")
    return cookie_value
def _user(req: Request) -> Optional[dict]:
    """Resolve the request's session cookie to a stored user record, or None."""
    sid = _sid(req)
    if not sid:
        return None
    return SESSIONS.get(sid)
# ── Routes ────────────────────────────────────────────────────────────────────
@fapp.get("/")
async def root(request: Request):
    """Serve the single-page front-end (index.html next to this file)."""
    if HTML.exists():
        body = HTML.read_text(encoding="utf-8")
    else:
        body = "<h2>index.html missing</h2>"
    return HTMLResponse(body)
@fapp.get("/oauth/user")
async def oauth_user(request: Request):
    """Return the logged-in user's info, or a 401 JSON body when anonymous."""
    user = _user(request)
    if user is None:
        return JSONResponse({"logged_in": False}, status_code=401)
    return JSONResponse(user)
@fapp.get("/oauth/login")
async def oauth_login(request: Request):
    """Start the HF OAuth authorization-code flow by redirecting to HF."""
    print(f"[OAuth] /oauth/login called. CLIENT_ID={bool(CLIENT_ID)}")
    if not CLIENT_ID:
        print("[OAuth] ERROR: OAUTH_CLIENT_ID not set β€” add hf_oauth: true to README.md")
        return RedirectResponse("/?oauth_error=not_configured")
    # NOTE(review): `state` is sent but never persisted or compared in
    # /login/callback β€” confirm whether CSRF verification is wanted.
    query = urlencode({
        "response_type": "code",
        "client_id": CLIENT_ID,
        "redirect_uri": REDIRECT_URI,
        "scope": SCOPES,
        "state": secrets.token_urlsafe(16),
    })
    url = f"{HF_AUTH_URL}?{query}"
    print(f"[OAuth] Redirecting β†’ {url[:120]}")
    return RedirectResponse(url, status_code=302)
@fapp.get("/login/callback")
async def oauth_callback(code: str = "", error: str = "", state: str = ""):
    """Complete the OAuth flow: exchange the code for a token, fetch the
    user's profile, create a server-side session and set its cookie.

    NOTE(review): `state` is accepted but never compared against the value
    issued in /oauth/login -- confirm whether CSRF verification is wanted.
    Any failure redirects back to "/" with ?auth_error=1.
    """
    if error or not code:
        print(f"[OAuth] Callback error: {error}")
        return RedirectResponse("/?auth_error=1")
    # Basic auth as recommended by HF docs
    basic = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()
    async with httpx.AsyncClient() as client:
        # Exchange code for token β€” use Authorization: Basic header
        tok = await client.post(HF_TOKEN_URL, data={
            "grant_type": "authorization_code",
            "code": code,
            "redirect_uri": REDIRECT_URI,
        }, headers={
            "Accept": "application/json",
            "Authorization": f"Basic {basic}",
        })
        if tok.status_code != 200:
            print(f"[OAuth] Token exchange FAILED: {tok.status_code} {tok.text[:300]}")
            return RedirectResponse("/?auth_error=1")
        access_token = tok.json().get("access_token", "")
        if not access_token:
            print(f"[OAuth] No access_token: {tok.text[:300]}")
            return RedirectResponse("/?auth_error=1")
        # Get user info
        uinfo = await client.get(HF_USER_URL, headers={"Authorization": f"Bearer {access_token}"})
        if uinfo.status_code != 200:
            print(f"[OAuth] Userinfo FAILED: {uinfo.status_code}")
            return RedirectResponse("/?auth_error=1")
        user = uinfo.json()
        print(f"[OAuth] Login OK: {user.get('preferred_username', '?')}")
        # Store the profile server-side; the browser only holds an opaque id.
        sid = secrets.token_urlsafe(32)
        SESSIONS[sid] = {
            "logged_in": True,
            "username": user.get("preferred_username", user.get("name", "User")),
            "name": user.get("name", ""),
            "avatar": user.get("picture", ""),
            "profile": f"https://huggingface.co/{user.get('preferred_username', '')}",
        }
        resp = RedirectResponse("/")
        # One-week httponly session cookie; secure is safe because Spaces serve HTTPS.
        resp.set_cookie("mc_session", sid, httponly=True, samesite="lax", secure=True, max_age=60*60*24*7)
        return resp
@fapp.get("/oauth/logout")
async def oauth_logout(request: Request):
    """Drop the server-side session (if any) and clear the browser cookie."""
    sid = _sid(request)
    if sid and sid in SESSIONS:
        SESSIONS.pop(sid)
    resp = RedirectResponse("/")
    resp.delete_cookie("mc_session")
    return resp
@fapp.get("/health")
async def health():
    """Liveness probe for the Space runtime."""
    payload = {"status": "ok"}
    return payload
# Mount Gradio at /gradio β†’ API at /gradio/gradio_api/call/chat
# `app` is the ASGI entry point uvicorn serves.
app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")

# -- Launch --
if __name__ == "__main__":
    # Port 7860 is the default port HF Spaces expects the app to listen on.
    print("[BOOT] All components initialized. Starting uvicorn on :7860", flush=True)
    uvicorn.run(app, host="0.0.0.0", port=7860)