sachiniyer committed on
Commit
b9a427c
·
verified ·
1 Parent(s): 33c92b9

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. __pycache__/backend.cpython-312.pyc +0 -0
  2. app.py +45 -41
  3. backend.py +84 -0
  4. deploy.py +67 -0
__pycache__/backend.cpython-312.pyc ADDED
Binary file (4.03 kB). View file
 
app.py CHANGED
@@ -1,6 +1,7 @@
 
 
1
  import gradio as gr
2
- import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer
4
 
5
  MODEL_IDS = [
6
  "sachiniyer/SmolLM2-DPO-Schwinn-SmolLM2-Base",
@@ -11,48 +12,42 @@ MODEL_IDS = [
11
  "sachiniyer/DeepSeek-R1-QLoRA-Finetuned",
12
  ]
13
 
14
- # Load all models
15
- models = {}
16
- for model_id in MODEL_IDS:
17
- print(f"Loading model: {model_id}")
18
- tokenizer = AutoTokenizer.from_pretrained(model_id)
19
- model = AutoModelForCausalLM.from_pretrained(
20
- model_id,
21
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
22
- device_map="auto" if torch.cuda.is_available() else None,
23
- )
24
- models[model_id] = {"model": model, "tokenizer": tokenizer}
25
- print(f"Loaded: {model_id}")
26
 
27
 
28
  def make_respond_fn(model_id: str):
29
  def respond(message: str, history: list[tuple[str, str]]) -> str:
30
- tokenizer = models[model_id]["tokenizer"]
31
- model = models[model_id]["model"]
32
-
33
- # Build conversation from history
34
- conversation = ""
35
- for user_msg, assistant_msg in history:
36
- conversation += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
37
- conversation += f"User: {message}\nAssistant:"
38
-
39
- inputs = tokenizer(conversation, return_tensors="pt")
40
- if torch.cuda.is_available():
41
- inputs = inputs.to("cuda")
42
-
43
- outputs = model.generate(
44
- **inputs,
45
- max_new_tokens=256,
46
- do_sample=True,
47
- temperature=0.7,
48
- top_p=0.9,
49
- pad_token_id=tokenizer.eos_token_id,
50
- )
51
-
52
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
53
- # Extract only the new assistant response
54
- response = response.split("Assistant:")[-1].strip()
55
- return response
56
 
57
  return respond
58
 
@@ -62,6 +57,10 @@ with gr.Blocks(title="posttraining-practice") as demo:
62
  gr.Markdown("# posttraining-practice")
63
  gr.Markdown("Chat with different fine-tuned models")
64
 
 
 
 
 
65
  with gr.Tabs():
66
  for model_id in MODEL_IDS:
67
  short_name = model_id.split("/")[-1]
@@ -71,5 +70,10 @@ with gr.Blocks(title="posttraining-practice") as demo:
71
  description=f"Chatting with: {model_id}",
72
  )
73
 
 
 
 
 
74
  if __name__ == "__main__":
75
- demo.launch()
 
 
1
+ import os
2
+
3
  import gradio as gr
4
+ import requests
 
5
 
6
  MODEL_IDS = [
7
  "sachiniyer/SmolLM2-DPO-Schwinn-SmolLM2-Base",
 
12
  "sachiniyer/DeepSeek-R1-QLoRA-Finetuned",
13
  ]
14
 
15
+ # Modal endpoint URL - set this after deploying backend.py
16
+ MODAL_ENDPOINT = os.environ.get("MODAL_ENDPOINT", "")
17
+ # API key for authenticating with Modal backend
18
+ MODEL_SITE_API_KEY = os.environ.get("MODEL_SITE_API_KEY", "")
19
+ # Password for Gradio login (any username accepted)
20
+ SITE_PASSWORD = os.environ.get("SITE_PASSWORD", "")
 
 
 
 
 
 
21
 
22
 
23
  def make_respond_fn(model_id: str):
24
  def respond(message: str, history: list[tuple[str, str]]) -> str:
25
+ if not MODAL_ENDPOINT:
26
+ return "Error: MODAL_ENDPOINT environment variable not set"
27
+
28
+ try:
29
+ response = requests.post(
30
+ MODAL_ENDPOINT,
31
+ headers={"X-API-Key": MODEL_SITE_API_KEY},
32
+ json={
33
+ "model_id": model_id,
34
+ "message": message,
35
+ "history": history,
36
+ },
37
+ timeout=120, # Cold start can take a while
38
+ )
39
+ response.raise_for_status()
40
+ data = response.json()
41
+
42
+ if "error" in data:
43
+ return f"Error: {data['error']}"
44
+
45
+ return data.get("response", "No response received")
46
+
47
+ except requests.exceptions.Timeout:
48
+ return "Error: Request timed out. The model may be starting up, please try again."
49
+ except requests.exceptions.RequestException as e:
50
+ return f"Error: {e}"
51
 
52
  return respond
53
 
 
57
  gr.Markdown("# posttraining-practice")
58
  gr.Markdown("Chat with different fine-tuned models")
59
 
60
+ missing = [v for v in ["MODAL_ENDPOINT", "MODEL_SITE_API_KEY", "SITE_PASSWORD"] if not os.environ.get(v)]
61
+ if missing:
62
+ gr.Markdown(f"⚠️ **Warning:** Missing secrets: {', '.join(missing)}")
63
+
64
  with gr.Tabs():
65
  for model_id in MODEL_IDS:
66
  short_name = model_id.split("/")[-1]
 
70
  description=f"Chatting with: {model_id}",
71
  )
72
 
73
+ def check_password(username: str, password: str) -> bool:
74
+ return password == SITE_PASSWORD
75
+
76
+
77
  if __name__ == "__main__":
78
+ auth = check_password if SITE_PASSWORD else None
79
+ demo.launch(auth=auth)
backend.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import modal
4
+ from fastapi import Header
5
+
6
+ MODEL_IDS = [
7
+ "sachiniyer/SmolLM2-DPO-Schwinn-SmolLM2-Base",
8
+ "sachiniyer/SmolLM2-DPO-Schwinn-gpt-5-mini-base",
9
+ "sachiniyer/Qwen2.5-0.5B-DPO-Schwinn",
10
+ "sachiniyer/SmolLM2-FT-SFT-Learning",
11
+ "sachiniyer/DeepSeek-R1-LoRA-Finetuned",
12
+ "sachiniyer/DeepSeek-R1-QLoRA-Finetuned",
13
+ ]
14
+
15
+ image = (
16
+ modal.Image.debian_slim(python_version="3.12")
17
+ .pip_install("torch", "transformers", "accelerate", "fastapi")
18
+ )
19
+
20
+ app = modal.App("posttraining-chat", image=image)
21
+
22
+
23
+ @app.cls(
24
+ gpu="T4",
25
+ scaledown_window=60,
26
+ secrets=[modal.Secret.from_dotenv()],
27
+ )
28
+ class Inference:
29
+ @modal.enter()
30
+ def load_models(self):
31
+ import torch
32
+ from transformers import AutoModelForCausalLM, AutoTokenizer
33
+
34
+ self.models = {}
35
+ for model_id in MODEL_IDS:
36
+ print(f"Loading model: {model_id}")
37
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
38
+ model = AutoModelForCausalLM.from_pretrained(
39
+ model_id,
40
+ torch_dtype=torch.float16,
41
+ device_map="auto",
42
+ )
43
+ self.models[model_id] = {"model": model, "tokenizer": tokenizer}
44
+ print(f"Loaded: {model_id}")
45
+
46
+ @modal.fastapi_endpoint(method="POST")
47
+ def generate(self, request: dict, x_api_key: str | None = Header(None)) -> dict:
48
+ import torch
49
+
50
+ expected_key = os.environ.get("MODEL_SITE_API_KEY")
51
+ if not expected_key or x_api_key != expected_key:
52
+ return {"error": "Unauthorized - invalid API key"}
53
+
54
+ model_id = request.get("model_id", MODEL_IDS[0])
55
+ message = request.get("message", "")
56
+ history = request.get("history", [])
57
+
58
+ if model_id not in self.models:
59
+ return {"error": f"Model {model_id} not found"}
60
+
61
+ tokenizer = self.models[model_id]["tokenizer"]
62
+ model = self.models[model_id]["model"]
63
+
64
+ conversation = ""
65
+ for user_msg, assistant_msg in history:
66
+ conversation += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
67
+ conversation += f"User: {message}\nAssistant:"
68
+
69
+ inputs = tokenizer(conversation, return_tensors="pt").to("cuda")
70
+
71
+ with torch.no_grad():
72
+ outputs = model.generate(
73
+ **inputs,
74
+ max_new_tokens=256,
75
+ do_sample=True,
76
+ temperature=0.7,
77
+ top_p=0.9,
78
+ pad_token_id=tokenizer.eos_token_id,
79
+ )
80
+
81
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
82
+ response = response.split("Assistant:")[-1].strip()
83
+
84
+ return {"response": response}
deploy.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Deploy the chat site: Modal backend + HuggingFace Space + secrets."""
3
+
4
+ import os
5
+ import re
6
+ import subprocess
7
+ import sys
8
+
9
+ from dotenv import load_dotenv
10
+ from huggingface_hub import HfApi
11
+
12
+ load_dotenv()
13
+
14
+
15
+ def main():
16
+ # Check required env vars
17
+ api_key = os.environ.get("MODEL_SITE_API_KEY")
18
+ site_password = os.environ.get("SITE_PASSWORD")
19
+ if not api_key or not site_password:
20
+ sys.exit("ERROR: MODEL_SITE_API_KEY and SITE_PASSWORD must be set in .env")
21
+
22
+ # Deploy Modal backend
23
+ print("Deploying Modal backend...")
24
+ result = subprocess.run(
25
+ ["uv", "run", "modal", "deploy", "site/backend.py"],
26
+ capture_output=True,
27
+ text=True,
28
+ )
29
+ print(result.stdout + result.stderr)
30
+
31
+ match = re.search(r"https://[^\s]+\.modal\.run", result.stdout + result.stderr)
32
+ if not match:
33
+ sys.exit("ERROR: Could not find Modal endpoint URL")
34
+ modal_endpoint = match.group(0)
35
+
36
+ # Generate requirements and deploy to HuggingFace
37
+ print("Deploying to HuggingFace Spaces (select 'cpu-basic')...")
38
+ result = subprocess.run(
39
+ ["uv", "export", "--group", "site", "--no-hashes", "--no-dev"],
40
+ capture_output=True,
41
+ text=True,
42
+ )
43
+ with open("site/requirements.txt", "w") as f:
44
+ f.write(result.stdout)
45
+
46
+ subprocess.run(
47
+ ["uv", "run", "--group", "site", "gradio", "deploy",
48
+ "--title", "posttraining-practice", "--app-file", "app.py"],
49
+ cwd="site",
50
+ )
51
+ os.remove("site/requirements.txt")
52
+
53
+ # Set secrets
54
+ space_id = input("Space ID (e.g., sachiniyer/posttraining-practice): ").strip()
55
+ if not space_id:
56
+ sys.exit("ERROR: Space ID required")
57
+
58
+ api = HfApi()
59
+ api.add_space_secret(repo_id=space_id, key="MODAL_ENDPOINT", value=modal_endpoint)
60
+ api.add_space_secret(repo_id=space_id, key="MODEL_SITE_API_KEY", value=api_key)
61
+ api.add_space_secret(repo_id=space_id, key="SITE_PASSWORD", value=site_password)
62
+
63
+ print(f"Done! https://huggingface.co/spaces/{space_id}")
64
+
65
+
66
+ if __name__ == "__main__":
67
+ main()