hello-ram committed on
Commit
d66d396
·
verified ·
1 Parent(s): 51f372d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -13
app.py CHANGED
@@ -5,9 +5,6 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
6
  app = FastAPI()
7
 
8
- # -------------------------------------
9
- # MODEL (FAST & SMALL)
10
- # -------------------------------------
11
  MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
12
 
13
  tokenizer = None
@@ -16,28 +13,27 @@ model = None
16
 
17
  def load_model():
18
  global tokenizer, model
 
19
  if tokenizer is None or model is None:
20
- print("🔥 Loading TinyLlama model...")
21
 
22
  tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
23
 
 
 
24
  model = AutoModelForCausalLM.from_pretrained(
25
  MODEL_REPO,
26
- torch_dtype=torch.float32, # CPU safe
27
- device_map="cpu",
28
  low_cpu_mem_usage=True
29
  )
30
 
31
- print("✅ TinyLlama loaded successfully!")
32
 
33
 
34
- # -------------------------------------
35
- # ROUTES
36
- # -------------------------------------
37
  @app.get("/")
38
  async def home():
39
  return {
40
- "message": "🚀 TinyLlama Chat API (FastAPI + HF Spaces)",
41
  "endpoints": ["/", "/status", "/generate"],
42
  "model": MODEL_REPO
43
  }
@@ -60,9 +56,13 @@ class InputText(BaseModel):
60
  async def generate_text(data: InputText):
61
  load_model()
62
 
63
- prompt = f"<|system|>You are a friendly helpful AI assistant.<|user|>{data.text}<|assistant|>"
 
 
64
 
65
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
 
66
 
67
  output = model.generate(
68
  **inputs,
 
5
 
6
  app = FastAPI()
7
 
 
 
 
8
  MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
9
 
10
  tokenizer = None
 
13
 
14
  def load_model():
15
  global tokenizer, model
16
+
17
  if tokenizer is None or model is None:
18
+ print("🔥 Loading TinyLlama model on CPU...")
19
 
20
  tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
21
 
22
+ # ❗ NO device_map
23
+ # ❗ NO torch_dtype=float16
24
  model = AutoModelForCausalLM.from_pretrained(
25
  MODEL_REPO,
26
+ torch_dtype=torch.float32, # safe CPU
 
27
  low_cpu_mem_usage=True
28
  )
29
 
30
+ print("✅ TinyLlama loaded!")
31
 
32
 
 
 
 
33
  @app.get("/")
34
  async def home():
35
  return {
36
+ "message": "🚀 TinyLlama Chat API Running",
37
  "endpoints": ["/", "/status", "/generate"],
38
  "model": MODEL_REPO
39
  }
 
56
  async def generate_text(data: InputText):
57
  load_model()
58
 
59
+ prompt = f"<|system|>You are a helpful assistant.<|user|>{data.text}<|assistant|>"
60
+
61
+ inputs = tokenizer(prompt, return_tensors="pt")
62
 
63
+ # Move to CPU explicitly
64
+ inputs = {k: v.to("cpu") for k, v in inputs.items()}
65
+ model.to("cpu")
66
 
67
  output = model.generate(
68
  **inputs,