Spaces:

hello-ram
/

unsolth-fast-api

Sleeping

App Files Files Community

hello-ram committed on Nov 14, 2025

Commit

a0aaa19

verified ·

1 Parent(s): 8004c59

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -14

app.py CHANGED Viewed

@@ -5,6 +5,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 app = FastAPI()
 MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 tokenizer = None
@@ -14,25 +17,29 @@ model = None
 def load_model():
     global tokenizer, model
     if tokenizer is None or model is None:
-        print("🔥 Loading model now (lazy load)... This will take time but only once.")
         tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_REPO,
-            dtype=torch.float16,
-            device_map="cpu",   # force CPU for Spaces
             low_cpu_mem_usage=True
         )
-        print("✅ Model loaded successfully!")
 @app.get("/")
-async def root():
     return {
-        "message": "🚀 FastAPI MPT Model Running on Hugging Face Spaces",
-        "endpoints": ["/", "/status", "/generate"]
     }
@@ -41,7 +48,7 @@ async def status():
     return {
         "status": "ok",
         "model": MODEL_REPO,
-        "model_loaded": model is not None
     }
@@ -51,17 +58,19 @@ class InputText(BaseModel):
 @app.post("/generate")
 async def generate_text(data: InputText):
-    # Load model ONLY when first request happens
     load_model()
-    inputs = tokenizer(data.text, return_tensors="pt").to(model.device)
     output = model.generate(
         **inputs,
         max_new_tokens=150,
-        temperature=0.7
     )
-    text = tokenizer.decode(output[0], skip_special_tokens=True)
-    return {"response": text}

# FastAPI application instance; the @app.get / @app.post handlers in this file register on it.
app = FastAPI()
+# -------------------------------------
+# MODEL (FAST & SMALL)
+# -------------------------------------
 # Model identifier handed to from_pretrained() for both the tokenizer and the model.
 MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 # Stays None until load_model() assigns the loaded tokenizer.
 tokenizer = None
 def load_model():
     """Populate the module-level ``tokenizer`` and ``model`` on first use.

     Returns immediately when both globals are already set, so only the
     first call (or a call after a failed load) does the actual loading.
     """
     global tokenizer, model

     # Guard clause: nothing to do once both objects exist.
     if tokenizer is not None and model is not None:
         return

     print("🔥 Loading TinyLlama model...")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)

     # Keyword options for the model load, kept together for readability.
     load_options = {
         "torch_dtype": torch.float32,  # CPU safe
         "device_map": "cpu",
         "low_cpu_mem_usage": True,
     }
     model = AutoModelForCausalLM.from_pretrained(MODEL_REPO, **load_options)
     print("✅ TinyLlama loaded successfully!")
+# -------------------------------------
+# ROUTES
+# -------------------------------------
 @app.get("/")
 async def home():
     # Landing route: reports a banner message, the available endpoints and
     # the configured model id. Documented with a comment rather than a
     # docstring so the route's generated API description stays as it was.
     service_info = dict(
         message="🚀 TinyLlama Chat API (FastAPI + HF Spaces)",
         endpoints=["/", "/status", "/generate"],
         model=MODEL_REPO,
     )
     return service_info
     return {
         "status": "ok",
         "model": MODEL_REPO,
+        "loaded": model is not None
     }
 @app.post("/generate")
 async def generate_text(data: InputText):
     """Generate an assistant reply to ``data.text``.

     The model and tokenizer are loaded on the first call via
     ``load_model()``.

     Returns:
         A dict ``{"response": <text>}`` containing only the newly generated
         reply (the prompt is not echoed back).
     """
     load_model()

     # Render the conversation with the tokenizer's own chat template rather
     # than hand-writing the role markers, so turn separators and
     # end-of-turn tokens match the format the chat checkpoint expects.
     messages = [
         {"role": "system", "content": "You are a friendly helpful AI assistant."},
         {"role": "user", "content": data.text},
     ]
     prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True,
     )

     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     prompt_length = inputs["input_ids"].shape[-1]

     # NOTE(review): generate() is a blocking call inside an async handler,
     # so other requests wait while it runs. Left as-is because moving it to
     # a worker thread would also require guarding the lazy load above.
     output = model.generate(
         **inputs,
         max_new_tokens=150,
         temperature=0.7,
         top_p=0.9,
         do_sample=True,
     )

     # For a causal LM the returned sequence starts with the prompt tokens;
     # slice them off so only the continuation is decoded and returned.
     new_tokens = output[0][prompt_length:]
     result = tokenizer.decode(new_tokens, skip_special_tokens=True)
     return {"response": result}