from fastapi import FastAPI
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
# ==========================================
# 1. SETUP & MODEL LOADING
# ==========================================
app = FastAPI()
# Hugging Face Hub model ID
MODEL_ID = "natalieparker/LumaAI-160M-v3"
# Force CPU device for deployment
DEVICE = "cpu"
try:
    print(f"🔄 Downloading and loading tokenizer from {MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    print(f"🔄 Downloading and loading model from {MODEL_ID} (CPU Optimized)...")
    # Load in float16 to roughly halve memory consumption (441MB -> 220MB)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,  # memory-efficient loading
    )
    # Move model to CPU memory
    model.to(DEVICE)
    print("✅ Model loaded successfully on CPU!")
except Exception as e:
    print(f"FATAL MODEL LOAD ERROR: {e}")
    # Set both to None so the endpoints can report the load failure
    model = None
    tokenizer = None
# ==========================================
# 2. ENDPOINTS
# ==========================================
@app.get("/")
def root():
    # Report liveness and whether the model loaded successfully
    return {"status": "LumaAI API is live", "model_loaded": model is not None}
@app.post("/generate")
def generate(prompt: str):
    if model is None:
        return {"error": "Model failed to load during startup."}
    formatted_prompt = f"User: {prompt}\nCharacter:"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
    # model.generate already runs under torch.no_grad() internally,
    # so no explicit inference context is needed here
    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.75,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Keep only the Character's reply: drop the echoed prompt and any
    # follow-on "User:" turn the model may have generated
    response = text.split("Character:")[-1].split("User:")[0].strip()
    # Tidy spacing before punctuation left by the tokenizer
    response = response.replace(" .", ".").replace(" ,", ",").replace(" ?", "?").replace(" !", "!")
    return {"response": response}