hello-ram committed on
Commit
a0aaa19
·
verified ·
1 Parent(s): 8004c59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -14
app.py CHANGED
@@ -5,6 +5,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
6
  app = FastAPI()
7
 
 
 
 
8
  MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
9
 
10
  tokenizer = None
@@ -14,25 +17,29 @@ model = None
14
  def load_model():
15
  global tokenizer, model
16
  if tokenizer is None or model is None:
17
- print("🔥 Loading model now (lazy load)... This will take time but only once.")
18
 
19
  tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
20
 
21
  model = AutoModelForCausalLM.from_pretrained(
22
  MODEL_REPO,
23
- dtype=torch.float16,
24
- device_map="cpu", # force CPU for Spaces
25
  low_cpu_mem_usage=True
26
  )
27
 
28
- print("✅ Model loaded successfully!")
29
 
30
 
 
 
 
31
  @app.get("/")
32
- async def root():
33
  return {
34
- "message": "🚀 FastAPI MPT Model Running on Hugging Face Spaces",
35
- "endpoints": ["/", "/status", "/generate"]
 
36
  }
37
 
38
 
@@ -41,7 +48,7 @@ async def status():
41
  return {
42
  "status": "ok",
43
  "model": MODEL_REPO,
44
- "model_loaded": model is not None
45
  }
46
 
47
 
@@ -51,17 +58,19 @@ class InputText(BaseModel):
51
 
52
  @app.post("/generate")
53
  async def generate_text(data: InputText):
54
-
55
- # Load model ONLY when first request happens
56
  load_model()
57
 
58
- inputs = tokenizer(data.text, return_tensors="pt").to(model.device)
 
 
59
 
60
  output = model.generate(
61
  **inputs,
62
  max_new_tokens=150,
63
- temperature=0.7
 
 
64
  )
65
 
66
- text = tokenizer.decode(output[0], skip_special_tokens=True)
67
- return {"response": text}
 
5
 
6
  app = FastAPI()
7
 
8
+ # -------------------------------------
9
+ # MODEL (FAST & SMALL)
10
+ # -------------------------------------
11
  MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
12
 
13
  tokenizer = None
 
17
  def load_model():
18
  global tokenizer, model
19
  if tokenizer is None or model is None:
20
+ print("🔥 Loading TinyLlama model...")
21
 
22
  tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
23
 
24
  model = AutoModelForCausalLM.from_pretrained(
25
  MODEL_REPO,
26
+ torch_dtype=torch.float32, # CPU safe
27
+ device_map="cpu",
28
  low_cpu_mem_usage=True
29
  )
30
 
31
+ print("✅ TinyLlama loaded successfully!")
32
 
33
 
34
+ # -------------------------------------
35
+ # ROUTES
36
+ # -------------------------------------
37
  @app.get("/")
38
+ async def home():
39
  return {
40
+ "message": "🚀 TinyLlama Chat API (FastAPI + HF Spaces)",
41
+ "endpoints": ["/", "/status", "/generate"],
42
+ "model": MODEL_REPO
43
  }
44
 
45
 
 
48
  return {
49
  "status": "ok",
50
  "model": MODEL_REPO,
51
+ "loaded": model is not None
52
  }
53
 
54
 
 
58
 
59
  @app.post("/generate")
60
  async def generate_text(data: InputText):
 
 
61
  load_model()
62
 
63
+ prompt = f"<|system|>You are a friendly helpful AI assistant.<|user|>{data.text}<|assistant|>"
64
+
65
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
66
 
67
  output = model.generate(
68
  **inputs,
69
  max_new_tokens=150,
70
+ temperature=0.7,
71
+ top_p=0.9,
72
+ do_sample=True
73
  )
74
 
75
+ result = tokenizer.decode(output[0], skip_special_tokens=True)
76
+ return {"response": result}