natalieparker committed
Commit ec891af · verified · 1 Parent(s): 4d942a0

Update app.py

Files changed (1):
  1. app.py +24 -25
app.py CHANGED
@@ -9,28 +9,31 @@ import os
 
 app = FastAPI()
 
-# CRITICAL FIX: We use your Model Repository ID.
-# This tells the app to download the brain from your other Hugging Face repo.
+# Model ID is correct
 MODEL_ID = "natalieparker/LumaAI-160M-v3"
 
-# Force CPU settings for the free tier
+# Force CPU device for deployment
 DEVICE = "cpu"
 
 try:
     print(f"🔄 Downloading and loading tokenizer from {MODEL_ID}...")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
 
-    print(f"🔄 Downloading and loading model from {MODEL_ID}...")
+    print(f"🔄 Downloading and loading model from {MODEL_ID} (CPU Optimized)...")
+
+    # CRITICAL FIX: Load in float16 to halve memory consumption (441MB -> 220MB)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.float32,
-        low_cpu_mem_usage=True
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True  # memory-efficient loading
     )
+    # Move the model to CPU memory
     model.to(DEVICE)
-    print("✅ Model loaded successfully!")
+    print("✅ Model loaded successfully on CPU!")
 
 except Exception as e:
     print(f"FATAL MODEL LOAD ERROR: {e}")
+    # model and tokenizer stay None if loading fails
     model = None
     tokenizer = None
 
@@ -41,37 +44,33 @@ except Exception as e:
 
 @app.get("/")
 def root():
+    # model_loaded reports whether startup succeeded
     return {"status": "LumaAI API is live", "model_loaded": model is not None}
 
 @app.post("/generate")
 def generate(prompt: str):
     if model is None:
-        return {"error": "Model failed to load."}
+        return {"error": "Model failed to load during startup."}
 
     formatted_prompt = f"User: {prompt}\nCharacter:"
 
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
 
-    with torch.no_grad():
-        output = model.generate(
-            **inputs,
-            max_new_tokens=150,
-            temperature=0.75,
-            top_p=0.9,
-            repetition_penalty=1.2,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
-        )
+    # No explicit torch.no_grad() needed: generate() disables gradient tracking itself
+    output = model.generate(
+        **inputs,
+        max_new_tokens=150,
+        temperature=0.75,
+        top_p=0.9,
+        repetition_penalty=1.2,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
+    )
 
     text = tokenizer.decode(output[0], skip_special_tokens=True)
 
-    # Clean response
-    if "Character:" in text:
-        response = text.split("Character:")[-1]
-    else:
-        response = text
-
-    response = response.split("User:")[0].strip()
+    # Clean response: keep only the text after "Character:" and before any echoed "User:"
+    response = text.split("Character:")[-1].split("User:")[0].strip()
    response = response.replace(" .", ".").replace(" ,", ",").replace(" ?", "?").replace(" !", "!")
 
     return {"response": response}
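One caveat when reviewing the float16 change: it halves parameter storage, but PyTorch's CPU kernels for half precision are generally slower than float32, so this trades speed for memory. A minimal sketch (not part of the commit, assuming the same model repo is accessible) to check the weight footprint locally:

```python
import torch
from transformers import AutoModelForCausalLM

# Sketch: compare the in-memory weight footprint of float32 vs float16 loading.
def param_megabytes(model: torch.nn.Module) -> float:
    # Sum bytes over all parameters: numel * bytes-per-element
    return sum(p.numel() * p.element_size() for p in model.parameters()) / 1e6

for dtype in (torch.float32, torch.float16):
    model = AutoModelForCausalLM.from_pretrained(
        "natalieparker/LumaAI-160M-v3",
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    )
    print(f"{dtype}: {param_megabytes(model):.0f} MB")  # fp16 should be ~half of fp32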
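For testing after deployment: because `prompt` is declared as a bare `str` parameter rather than a Pydantic model or `Body(...)`, FastAPI expects it as a query parameter even on a POST. A hypothetical client snippet (the base URL and port 7860 are assumptions; Spaces conventionally serve on 7860):

```python
import requests

# Hypothetical base URL; adjust to wherever the Space is served.
BASE_URL = "http://localhost:7860"

# Health check: model_loaded is False if startup failed
print(requests.get(f"{BASE_URL}/").json())

# prompt travels as a query parameter, not a JSON body,
# because the endpoint declares a bare `str` argument.
r = requests.post(f"{BASE_URL}/generate", params={"prompt": "Hello there!"})
print(r.json())  # e.g. {"response": "..."}
```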