from fastapi import FastAPI, HTTPException
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os

# ==========================================
# 1. SETUP & MODEL LOADING 
# ==========================================

app = FastAPI()

# Hugging Face Hub model repository to download
MODEL_ID = "natalieparker/LumaAI-160M-v3"

# Force CPU device for deployment
DEVICE = "cpu" 

try:
    print(f"🔄 Downloading and loading tokenizer from {MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    
    print(f"🔄 Downloading and loading model from {MODEL_ID} (CPU Optimized)...")
    
    # Load in float16 to roughly halve memory use (~441 MB -> ~220 MB).
    # NOTE: some CPU kernels lack float16 support in older PyTorch builds; if
    # generation fails with "Half"-related errors, torch.bfloat16 is a safer
    # CPU dtype with a similar footprint.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True # Use memory efficient loading
    )
    # Move model to CPU memory
    model.to(DEVICE)
    print("✅ Model loaded successfully on CPU!")

except Exception as e:
    print(f"FATAL MODEL LOAD ERROR: {e}")
    # Leave both unset (None) so the endpoints can report the failure
    model = None
    tokenizer = None


# ==========================================
# 2. ENDPOINTS
# ==========================================

@app.get("/")
def root():
    # model_loaded is True only when startup loading succeeded
    return {"status": "LumaAI API is live", "model_loaded": model is not None}

@app.post("/generate")
def generate(prompt: str):
    # NOTE: a bare str parameter is read from the query string by FastAPI;
    # switch to a Pydantic request model if a JSON body is preferred.
    if model is None:
        raise HTTPException(status_code=503, detail="Model failed to load during startup.")
        
    formatted_prompt = f"User: {prompt}\nCharacter:"
    
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
    
    # No explicit torch.no_grad() is needed here: transformers' generate()
    # is already decorated with @torch.no_grad() for inference
    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.75,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Keep only the Character turn and drop any trailing User turn the model generated
    response = text.split("Character:")[-1].split("User:")[0].strip()
    # Tidy stray spaces the tokenizer may leave before punctuation
    response = response.replace(" .", ".").replace(" ,", ",").replace(" ?", "?").replace(" !", "!")

    return {"response": response}