from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse, StreamingResponse
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
import threading

app = FastAPI()

MODEL_NAME = "Qwen/Qwen3.5-0.8B"

# Load the system prompt once at startup.
with open("prompts/system.txt", "r") as f:
    SYSTEM_PROMPT = f.read().strip()

# Load tokenizer and model: fp16 on GPU, fp32 on CPU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

app.mount("/static", StaticFiles(directory="static"), name="static")


class ChatRequest(BaseModel):
    message: str
    max_new_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.3


@app.get("/")
def root():
    return FileResponse("static/index.html")


@app.get("/health")
def health():
    return {"status": "healthy"}


@app.get("/info")
def info():
    return {"status": "running", "model": MODEL_NAME}


@app.post("/chat")
def chat(request: ChatRequest):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": request.message},
    ]
    # Render the chat template as text (tokenize=False) so it can be
    # batch-encoded below; enable_thinking=False disables the model's
    # reasoning mode in the Qwen chat template.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # TextIteratorStreamer yields decoded text chunks as generate()
    # produces tokens, skipping the echoed prompt and special tokens.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=request.max_new_tokens,
        temperature=request.temperature,
        top_p=request.top_p,
        repetition_penalty=request.repetition_penalty,
        do_sample=True,
    )

    # generate() blocks until completion, so run it on a background
    # thread and stream tokens to the client as they arrive.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    def token_stream():
        for token in streamer:
            yield token
        thread.join()

    return StreamingResponse(token_stream(), media_type="text/plain")
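

# --- Usage sketch (an addition, not part of the original file) ---
# FastAPI apps are typically served with uvicorn; the host/port below are
# illustrative defaults, not values taken from the original code. Once the
# server is up, the streaming endpoint can be exercised with curl, where -N
# disables output buffering so tokens print as they arrive:
#
#   curl -N -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello!", "max_new_tokens": 64}'

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)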