from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = None
model = None


def load_model():
    """Lazily load the tokenizer and model on first use (CPU-only)."""
    global tokenizer, model
    if tokenizer is None or model is None:
        print("🔥 Loading TinyLlama model on CPU...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
        # Note: no device_map and no torch_dtype=float16 here.
        # float32 is the safe dtype for CPU-only inference.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_REPO,
            torch_dtype=torch.float32,  # safe on CPU
            low_cpu_mem_usage=True,
        )
        # Keep the model on the CPU explicitly.
        model.to("cpu")
        print("✅ TinyLlama loaded!")


@app.get("/")
async def home():
    return {
        "message": "🚀 TinyLlama Chat API Running",
        "endpoints": ["/", "/status", "/generate"],
        "model": MODEL_REPO,
    }


@app.get("/status")
async def status():
    return {
        "status": "ok",
        "model": MODEL_REPO,
        "loaded": model is not None,
    }


class InputText(BaseModel):
    text: str


@app.post("/generate")
async def generate_text(data: InputText):
    load_model()

    # TinyLlama chat-style prompt: system / user / assistant turns.
    prompt = f"<|system|>You are a helpful assistant.<|user|>{data.text}<|assistant|>"

    inputs = tokenizer(prompt, return_tensors="pt")
    # Move input tensors to the CPU explicitly.
    inputs = {k: v.to("cpu") for k, v in inputs.items()}

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    result = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"response": result}
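
# --- Local usage sketch ------------------------------------------------------
# A minimal sketch of running and calling this API locally. The module name
# "main" and port 8000 are assumptions; adjust them to your setup.
#
# Start the server:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Call the chat endpoint:
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Explain what FastAPI is in one sentence."}'
#
# The first /generate request triggers load_model(), so expect it to be slow;
# later requests reuse the already-loaded model.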