""" FastAPI inference server for sentiment analysis. Supports dynamic switching between three fine-tuned LoRA models. Run: python server.py Listens on http://127.0.0.1:8765 """ import re import os import gc import asyncio from concurrent.futures import ThreadPoolExecutor os.environ.setdefault("HF_HOME", "/tmp/hf_cache") os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf_cache/hub") import torch from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel # ── Model registry ──────────────────────────────────────────────────────────── MODELS = { "qwen3-0.6B": { "label": "Qwen3-0.6B (Fine-tuned)", "base": "Qwen/Qwen3-0.6B", "lora": "alanwang2001/qwen3-0.6B-sentiment-lora", }, "qwen3-1.7B": { "label": "Qwen3-1.7B (Fine-tuned)", "base": "Qwen/Qwen3-1.7B", "lora": "alanwang2001/qwen3-1.7B-sentiment-lora", }, } DEFAULT_MODEL = "qwen3-0.6B" # ── System prompt ───────────────────────────────────────────────────────────── SYSTEM_PROMPT = ( "You are a sentiment analysis assistant. " "Classify the sentiment of the given movie review into one of three categories:\n" "- positive: the reviewer expresses a favorable opinion of the movie.\n" "- negative: the reviewer expresses an unfavorable opinion of the movie.\n" "- neutral: the reviewer expresses a mixed or balanced opinion with no clear positive or negative leaning.\n" "First explain your reasoning, then put your final answer in \\boxed{}, " "for example \\boxed{positive}." ) VALID_LABELS = {"positive", "negative", "neutral"} # ── Model state ─────────────────────────────────────────────────────────────── device = "cuda" if torch.cuda.is_available() else "cpu" tokenizer = None model = None current_model_name = None model_status = "loading" # loading | ready | switching | error _executor = ThreadPoolExecutor(max_workers=1) def _do_load(name: str): """Blocking model load — runs in thread executor.""" global tokenizer, model, current_model_name, model_status cfg = MODELS[name] print(f"\n[load] {name} base={cfg['base']} lora={cfg['lora']}") # Unload existing model if model is not None: model = None tokenizer = None gc.collect() if device == "cuda": torch.cuda.empty_cache() tokenizer = AutoTokenizer.from_pretrained( cfg["base"], trust_remote_code=True, ) base = AutoModelForCausalLM.from_pretrained( cfg["base"], dtype=torch.float16 if device == "cuda" else torch.float32, device_map={"": 0} if device == "cuda" else None, trust_remote_code=True, ) model = PeftModel.from_pretrained(base, cfg["lora"]) model.eval() if device == "cpu": model.to(device) current_model_name = name model_status = "ready" print(f"[load] done — {name} ready on {device}") # ── FastAPI ─────────────────────────────────────────────────────────────────── app = FastAPI(title="Sentiment API") app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_methods=["GET", "POST", "OPTIONS"], allow_headers=["*"], ) from starlette.middleware.base import BaseHTTPMiddleware from starlette.requests import Request class PrivateNetworkMiddleware(BaseHTTPMiddleware): async def dispatch(self, request: Request, call_next): response = await call_next(request) response.headers["Access-Control-Allow-Private-Network"] = "true" return response app.add_middleware(PrivateNetworkMiddleware) # ── Startup ─────────────────────────────────────────────────────────────────── @app.on_event("startup") async def startup(): loop = asyncio.get_event_loop() await loop.run_in_executor(_executor, _do_load, DEFAULT_MODEL) # ── Endpoints ───────────────────────────────────────────────────────────────── @app.get("/health") def health(): return {"status": model_status, "device": device, "model": current_model_name} @app.get("/models") def list_models(): return [ { "id": mid, "label": cfg["label"], "current": mid == current_model_name, } for mid, cfg in MODELS.items() ] class SwitchRequest(BaseModel): model: str @app.post("/model") async def switch_model(req: SwitchRequest): global model_status if req.model not in MODELS: raise HTTPException(status_code=404, detail=f"Unknown model: {req.model}") if req.model == current_model_name and model_status == "ready": return {"model": current_model_name, "status": "ready"} model_status = "switching" loop = asyncio.get_event_loop() try: await loop.run_in_executor(_executor, _do_load, req.model) except Exception as e: model_status = "error" raise HTTPException(status_code=500, detail=str(e)) return {"model": current_model_name, "status": "ready"} # ── Inference ───────────────────────────────────────────────────────────────── def build_prompt(text: str) -> str: messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": f"Classify the sentiment of this movie review:\n\n{text[:512]}"}, ] return tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, enable_thinking=False, ) def parse_label(text: str) -> str: m = re.search(r"\\boxed\{(\w+)\}", text) if m and m.group(1).lower() in VALID_LABELS: return m.group(1).lower() for label in VALID_LABELS: if label in text.lower(): return label return "neutral" class ReviewRequest(BaseModel): text: str @app.post("/analyze") def analyze(req: ReviewRequest): if model_status != "ready": raise HTTPException(status_code=503, detail=f"Model is {model_status}") prompt = build_prompt(req.text) inputs = tokenizer(prompt, return_tensors="pt").to(device) with torch.no_grad(): output_ids = model.generate( **inputs, max_new_tokens=150, do_sample=False, pad_token_id=tokenizer.eos_token_id, ) new_tokens = output_ids[0][inputs.input_ids.shape[1]:] generated = tokenizer.decode(new_tokens, skip_special_tokens=True).strip() sentiment = parse_label(generated) reasoning = re.sub(r"\\boxed\{\w+\}", "", generated).strip() return {"sentiment": sentiment, "reasoning": reasoning} if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=7860)