from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# FastAPI application instance; route handlers below register onto it.
app = FastAPI()
# Hugging Face Hub repo id of the chat model served by this API.
MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Lazily-initialized globals: populated by load_model() on the first
# /generate request so app startup stays fast.
tokenizer = None
model = None
def load_model():
    """Lazily initialize the global tokenizer and model on first use.

    Loads TinyLlama in float32 on CPU (no device_map, no fp16) and is a
    no-op on every call after the first successful load.
    """
    global tokenizer, model
    if tokenizer is not None and model is not None:
        return  # already initialized
    print("🔥 Loading TinyLlama model on CPU...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
    # float32 is the safe dtype on CPU; low_cpu_mem_usage streams weights
    # to reduce peak RAM during load.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
    print("✅ TinyLlama loaded!")
@app.get("/")
async def home():
    """Root endpoint: service banner, available routes, and model id."""
    payload = {
        "message": "🚀 TinyLlama Chat API Running",
        "endpoints": ["/", "/status", "/generate"],
        "model": MODEL_REPO,
    }
    return payload
@app.get("/status")
async def status():
    """Health check: reports whether the model has been loaded yet."""
    loaded = model is not None
    return {"status": "ok", "model": MODEL_REPO, "loaded": loaded}
class InputText(BaseModel):
    """Request body for POST /generate."""

    # Raw user message to be embedded in the chat prompt.
    text: str
@app.post("/generate")
async def generate_text(data: InputText):
    """Generate a chat completion for the user's text.

    Lazily loads the model on the first request, samples up to 150 new
    tokens on CPU, and returns only the assistant's reply.

    Returns:
        dict with a single "response" key containing the generated text.
    """
    load_model()
    # NOTE(review): hand-rolled template approximating the TinyLlama chat
    # format; tokenizer.apply_chat_template would be more robust — confirm
    # against the model card before changing the string.
    prompt = f"<|system|>You are a helpful assistant.<|user|>{data.text}<|assistant|>"
    inputs = tokenizer(prompt, return_tensors="pt")
    # Keep everything explicitly on CPU.
    inputs = {k: v.to("cpu") for k, v in inputs.items()}
    model.to("cpu")
    # inference_mode: no autograd bookkeeping during generation.
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
    # BUG FIX: decode only the newly generated tokens. The original decoded
    # output[0] in full, so the response echoed the system/user prompt back
    # to the caller instead of returning just the assistant reply.
    prompt_len = inputs["input_ids"].shape[1]
    result = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    return {"response": result}