import asyncio
import os
import threading
import time
import uuid
import warnings
from datetime import datetime, timedelta

import jwt
import torch
from dotenv import load_dotenv
from fastapi import Depends, FastAPI, HTTPException
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from pydantic import BaseModel, Field
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
| |
# Load variables from a local .env file (if any) before reading the settings.
load_dotenv()

# Publicly known placeholder used only when JWT_SECRET_KEY is not configured.
# It is kept so existing setups keep starting, but any token signed with it
# can be forged by anyone who has read this file.
_INSECURE_FALLBACK_SECRET = "default-fallback-secret"

SECRET_KEY = os.getenv("JWT_SECRET_KEY", _INSECURE_FALLBACK_SECRET)
if SECRET_KEY == _INSECURE_FALLBACK_SECRET:
    warnings.warn(
        "JWT_SECRET_KEY is not set (or equals the built-in placeholder); "
        "tokens are being verified with a publicly known secret. "
        "Set JWT_SECRET_KEY in the environment or in .env.",
        RuntimeWarning,
    )

# Signature algorithm accepted when decoding bearer tokens.
ALGORITHM = "HS256"
# Hugging Face model identifier used for both the tokenizer and the weights.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"


# Dependency that extracts the token from an "Authorization: Bearer" header.
security = HTTPBearer()
|
|
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> dict:
    """Validate the bearer JWT of the current request and return its claims.

    Args:
        credentials: Bearer credentials extracted by the ``security``
            dependency; ``credentials.credentials`` holds the raw token.

    Returns:
        The decoded token payload.

    Raises:
        HTTPException: 401 when the token cannot be decoded or verified with
            ``SECRET_KEY`` / ``ALGORITHM``.
    """
    try:
        return jwt.decode(credentials.credentials, SECRET_KEY, algorithms=[ALGORITHM])
    except jwt.PyJWTError as exc:
        # Only token problems map to 401; unrelated failures are left to
        # propagate so they are not disguised as authentication errors.
        raise HTTPException(
            status_code=401,
            detail="Unauthorized",
            headers={"WWW-Authenticate": "Bearer"},
        ) from exc
|
|
| |
# ASGI application object that the route decorators attach to.
app = FastAPI()

# Tokenizer and model are created once, at import time, and shared by all
# requests. dtype and device placement are delegated to the library ("auto").
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto",
)
|
|
class ChatMessage(BaseModel):
    """One entry of a chat conversation as sent by the client."""

    # Speaker label handed to the chat template; not restricted to a fixed set here.
    role: str
    # Text of the message.
    content: str
|
|
class ChatCompletionRequest(BaseModel):
    """Request body of the chat completion endpoint."""

    # Conversation so far, in the order it should be fed to the chat template.
    messages: list[ChatMessage]
    # Upper bound on newly generated tokens. Must be positive: a zero or
    # negative budget cannot be generated against, so it is rejected during
    # request validation instead of failing later inside the handler.
    max_tokens: int = Field(default=100, gt=0)
|
|
@app.get("/")
def read_root():
    """Return a fixed status message showing that the service is reachable."""
    status = {"message": "Qwen OpenAI-style API is running with .env auth"}
    return status
|
|
# Only one generation runs at a time on the shared tokenizer/model, exactly as
# when generation ran inline on the event loop; the lock keeps that guarantee
# now that the work happens in worker threads.
_GENERATION_LOCK = threading.Lock()


def _eos_token_ids() -> set[int]:
    """Collect the token ids that the tokenizer or the model treat as end-of-sequence."""
    generation_config = getattr(model, "generation_config", None)
    candidates = (
        getattr(tokenizer, "eos_token_id", None),
        getattr(generation_config, "eos_token_id", None),
    )
    eos_ids: set[int] = set()
    for candidate in candidates:
        if candidate is None:
            continue
        if isinstance(candidate, (list, tuple)):
            eos_ids.update(candidate)
        else:
            eos_ids.add(candidate)
    return eos_ids


def _generate_reply(chat_msgs: list[dict], max_new_tokens: int) -> tuple[str, int, int, str]:
    """Run one blocking generation and return (text, prompt_tokens, completion_tokens, finish_reason)."""
    with _GENERATION_LOCK:
        prompt = tokenizer.apply_chat_template(
            chat_msgs, tokenize=False, add_generation_prompt=True
        )
        model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
        output_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

        # The output repeats the prompt; keep only the newly generated tail.
        prompt_tokens = int(model_inputs.input_ids.shape[1])
        new_ids = output_ids[0][prompt_tokens:]
        text = tokenizer.batch_decode([new_ids], skip_special_tokens=True)[0]

    completion_tokens = int(new_ids.shape[0])
    # "length" means the token budget ran out before an end-of-sequence token
    # was produced; anything else is a natural stop.
    ended_on_eos = completion_tokens > 0 and int(new_ids[-1]) in _eos_token_ids()
    hit_limit = completion_tokens >= max_new_tokens
    finish_reason = "length" if hit_limit and not ended_on_eos else "stop"
    return text, prompt_tokens, completion_tokens, finish_reason


@app.post("/v1/chat/completions")
async def chat_generate(request: ChatCompletionRequest, user=Depends(verify_token)):
    """Generate an assistant reply for the given conversation.

    Args:
        request: Conversation messages and the generation token budget.
        user: Decoded token claims; unused in the body, but declaring the
            dependency is what enforces authentication on this route.

    Returns:
        An OpenAI-style ``chat.completion`` dictionary with a single choice.
    """
    chat_msgs = [msg.dict() for msg in request.messages]

    # Generation is blocking work; run it in a worker thread so the event
    # loop stays free to serve other requests in the meantime.
    reply, prompt_tokens, completion_tokens, finish_reason = await asyncio.to_thread(
        _generate_reply, chat_msgs, request.max_tokens
    )

    return {
        # Random suffix: a timestamp-only id collides for requests in the same second.
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": MODEL_NAME,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": reply},
            "finish_reason": finish_reason,
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
|
|