from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Globals populated lazily on the first /generate request
tokenizer = None
model = None

def load_model():
    global tokenizer, model
    if tokenizer is None or model is None:
        print("🔥 Loading TinyLlama model on CPU...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
        # No device_map and no float16: keep full-precision weights for CPU-only inference
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_REPO,
            torch_dtype=torch.float32,  # safe on CPU
            low_cpu_mem_usage=True
        )
        print("✅ TinyLlama loaded!")
@app.get("/")
async def home():
return {
"message": "πŸš€ TinyLlama Chat API Running",
"endpoints": ["/", "/status", "/generate"],
"model": MODEL_REPO
}
@app.get("/status")
async def status():
return {
"status": "ok",
"model": MODEL_REPO,
"loaded": model is not None
}

class InputText(BaseModel):
    text: str

@app.post("/generate")
async def generate_text(data: InputText):
    load_model()
    prompt = f"<|system|>You are a helpful assistant.<|user|>{data.text}<|assistant|>"
    inputs = tokenizer(prompt, return_tensors="pt")
    # Keep everything on the CPU explicitly
    inputs = {k: v.to("cpu") for k, v in inputs.items()}
    model.to("cpu")
    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    # Decode only the newly generated tokens so the prompt is not echoed back
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return {"response": result}