Spaces:

Trigger82
/

Work

Sleeping

Trigger82 committed on May 31, 2025

Commit

7d7624b

verified ·

1 Parent(s): 1a2f674

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,23 +1,24 @@
 # app.py
-import os
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from fastapi import FastAPI
-# Ensure cache env vars point to writable directory (same as Dockerfile)
-home = os.environ.get("HOME", "/home/user")
-cache_dir = os.path.join(home, ".cache", "huggingface")
-os.makedirs(cache_dir, exist_ok=True)
-os.environ["HF_HOME"] = cache_dir
-os.environ["TRANSFORMERS_CACHE"] = cache_dir
-model_id = "rasyosef/Phi-1_5-Instruct-v0.1"
-model = AutoModelForCausalLM.from_pretrained(model_id)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
 app = FastAPI()
 @app.get("/chat")
 def chat(query: str):
-    # Compose chat-format prompt (system + user) for Phi-1.5
     prompt = (
         "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
         "<|im_start|>user\n" + query + "<|im_end|>"
@@ -25,8 +26,9 @@ def chat(query: str):
     )
     inputs = tokenizer(prompt, return_tensors="pt")
     outputs = model.generate(**inputs, max_new_tokens=200)
-    # Decode only the newly generated tokens (skip input tokens)
     response = tokenizer.decode(
-        outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True
     )
     return {"answer": response.strip()}

 # app.py
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from fastapi import FastAPI
+# Model ID on Hugging Face
+MODEL_ID = "rasyosef/Phi-1_5-Instruct-v0.1"
+# Load tokenizer and model from local cache (pre-downloaded in Docker build)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
 app = FastAPI()
 @app.get("/chat")
 def chat(query: str):
+    """
+    GET /chat?query=Your+question
+    Returns JSON: {"answer": "...model’s reply..."}
+    """
+    # Build the instruction‐style prompt expected by Phi‐1.5 Instruct
     prompt = (
         "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
         "<|im_start|>user\n" + query + "<|im_end|>"
     )
     inputs = tokenizer(prompt, return_tensors="pt")
     outputs = model.generate(**inputs, max_new_tokens=200)
+    # Only decode newly generated tokens (skip the “prompt” tokens)
     response = tokenizer.decode(
+        outputs[0][inputs.input_ids.shape[-1]:],
+        skip_special_tokens=True
     )
     return {"answer": response.strip()}