from fastapi import FastAPI, HTTPException, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

app = FastAPI()

# NOTE(review): "google/flan-t5-small" is a T5 *encoder-decoder* model;
# AutoModelForCausalLM.from_pretrained will raise for it. Use a causal LM id
# here (e.g. the commented alternatives below) or switch to
# AutoModelForSeq2SeqLM for T5-family models — confirm which model is intended.
model_id = "google/flan-t5-small"  # Replace with your model here
# "unsloth/mistral-7b-v0.2-bnb-4bit"
# "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,   # half precision to cut memory; requires GPU/compatible HW
    device_map="auto",           # let accelerate place layers across available devices
)


@app.post("/generate")
async def generate(request: Request):
    """Generate a completion for the JSON body's "prompt" field.

    Expects: {"prompt": "<text>"}. Returns: {"output": "<generated text>"}.
    Raises 400 if the prompt is missing or empty.
    """
    data = await request.json()
    prompt = data.get("prompt", "").strip()
    if not prompt:
        # Fail fast instead of feeding an empty string to the model.
        raise HTTPException(status_code=400, detail="'prompt' must be a non-empty string")

    # Move tokenized inputs to the model's device; keep the attention_mask.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # inference_mode(): no autograd bookkeeping during generation.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,              # pass input_ids AND attention_mask
            max_new_tokens=100,
            use_cache=True,
            do_sample=True,        # required for temperature to have any effect
            temperature=0.7,
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"output": generated_text}