Spaces:

hello-ram
/

unsolth-fast-api

Sleeping

hello-ram committed on Nov 14, 2025

Commit

ae5b614

verified ·

1 Parent(s): d398613

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,36 +5,38 @@ import torch
 app = FastAPI()
-# ---- Load your HF model repo ----
 MODEL_REPO = "hello-ram/mpt-model"
-print("Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
-print("Loading model...")
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_REPO,
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-# ---------- ROUTES -------------
 @app.get("/")
 async def root():
     return {
-        "message": "🚀 FastAPI MPT Model Running on Hugging Face Spaces",
-        "endpoints": ["/", "/status", "/generate"]
     }
 @app.get("/status")
 async def status():
-    return {
-        "status": "ok",
-        "model": MODEL_REPO,
-        "device": str(model.device),
-        "torch_dtype": str(model.dtype)
-    }
 class InputText(BaseModel):
@@ -43,8 +45,9 @@ class InputText(BaseModel):
 @app.post("/generate")
 async def generate_text(data: InputText):
-    inputs = tokenizer(data.text, return_tensors="pt").to(model.device)
     output = model.generate(
         **inputs,
         max_new_tokens=200,

 app = FastAPI()
 MODEL_REPO = "hello-ram/mpt-model"
+tokenizer = None
+model = None
+def load_model():
+    global tokenizer, model
+    if tokenizer is None:
+        print("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
+    if model is None:
+        print("Loading model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_REPO,
+            dtype=torch.float16,
+            device_map="auto"
+        )
 @app.get("/")
 async def root():
     return {
+        "message": "🚀 FastAPI MPT Model Running",
+        "routes": ["/", "/status", "/generate"]
     }
 @app.get("/status")
 async def status():
+    return {"status": "ok"}
 class InputText(BaseModel):
 @app.post("/generate")
 async def generate_text(data: InputText):
+    load_model()
+    inputs = tokenizer(data.text, return_tensors="pt").to(model.device)
     output = model.generate(
         **inputs,
         max_new_tokens=200,