pierreramez committed on
Commit
d840997
·
verified ·
1 Parent(s): 65db64e
Files changed (1) hide show
  1. app.py +73 -91
app.py CHANGED
@@ -1,16 +1,5 @@
1
  """
2
  Enhanced FastAPI Backend with Feedback Management
3
- --------------------------------------------------
4
- New endpoints for production continuous learning workflow:
5
- - GET /download-feedback: Download feedback for training
6
- - POST /clear-feedback: Clear feedback after training
7
- - GET /correction-count: Monitor training readiness
8
- - POST /reload-adapter: Hot reload new model without restart
9
-
10
- Deploy to HuggingFace Spaces (FREE):
11
- 1. Create new Space: "YourUsername/chatbot-api"
12
- 2. Select: SDK = "Docker"
13
- 3. Upload: app.py, requirements.txt, Dockerfile, README.md
14
  """
15
 
16
  from fastapi import FastAPI, HTTPException
@@ -23,6 +12,7 @@ from pathlib import Path
23
  import torch
24
  from transformers import AutoTokenizer, AutoModelForCausalLM
25
  from peft import PeftModel
 
26
 
27
  app = FastAPI(
28
  title="Personalized Chatbot API",
@@ -114,39 +104,58 @@ class ModelManager:
114
  if adapter_path:
115
  print(f"With adapter: {adapter_path}")
116
 
 
117
  self._device = "cuda" if torch.cuda.is_available() else "cpu"
118
  print(f"Using device: {self._device}")
119
 
120
- self._tokenizer = AutoTokenizer.from_pretrained(
121
- model_name,
122
- trust_remote_code=True
123
- )
 
 
 
 
 
124
 
125
  if self._tokenizer.pad_token is None:
126
  self._tokenizer.pad_token = self._tokenizer.eos_token
127
 
128
- if use_4bit and torch.cuda.is_available():
129
- from transformers import BitsAndBytesConfig
130
-
131
- bnb_config = BitsAndBytesConfig(
132
- load_in_4bit=True,
133
- bnb_4bit_quant_type="nf4",
134
- bnb_4bit_compute_dtype=torch.float16,
135
- bnb_4bit_use_double_quant=True,
136
- )
137
-
138
- base_model = AutoModelForCausalLM.from_pretrained(
139
- model_name,
140
- quantization_config=bnb_config,
141
- device_map="auto",
142
- trust_remote_code=True,
143
- torch_dtype=torch.float16,
144
- )
 
 
 
 
 
 
 
 
 
 
145
  else:
 
146
  base_model = AutoModelForCausalLM.from_pretrained(
147
  model_name,
148
- device_map="auto",
149
  trust_remote_code=True,
 
 
150
  )
151
 
152
  if adapter_path and (isinstance(adapter_path, str) and adapter_path.strip()):
@@ -155,7 +164,7 @@ class ModelManager:
155
  self._model = PeftModel.from_pretrained(
156
  base_model,
157
  adapter_path,
158
- torch_dtype=torch.float16
159
  )
160
  self._current_adapter = adapter_path
161
  print(f"✅ Adapter loaded successfully")
@@ -218,6 +227,10 @@ class ModelManager:
218
  skip_special_tokens=True
219
  ).strip()
220
 
 
 
 
 
221
  return reply
222
 
223
 
@@ -225,6 +238,7 @@ class FeedbackManager:
225
  """Manages feedback storage and statistics."""
226
  def __init__(self, feedback_file: str = "data/feedback.jsonl"):
227
  self.feedback_file = Path(feedback_file)
 
228
  self.feedback_file.parent.mkdir(parents=True, exist_ok=True)
229
 
230
  def save_interaction(
@@ -295,9 +309,16 @@ async def startup_event():
295
  print("Starting up...")
296
 
297
  model_manager.initialize(
298
- model_name="meta-llama/Llama-3.2-1B-Instruct",
299
- adapter_path=None, # Update this after training: "username/adapter-v1"
300
- use_4bit=True
 
 
 
 
 
 
 
301
  )
302
 
303
  print("Ready to serve!")
@@ -310,6 +331,7 @@ async def root():
310
  "message": "Personalized Chatbot API v2.0",
311
  "version": "2.0.0",
312
  "current_adapter": model_manager._current_adapter,
 
313
  "endpoints": {
314
  "chat": "POST /chat",
315
  "feedback": "POST /feedback",
@@ -358,6 +380,7 @@ async def chat(request: ChatRequest):
358
  )
359
 
360
  except Exception as e:
 
361
  raise HTTPException(status_code=500, detail=str(e))
362
 
363
 
@@ -365,10 +388,15 @@ async def chat(request: ChatRequest):
365
  async def submit_feedback(request: FeedbackRequest):
366
  """Submit correction for a model response."""
367
  try:
368
- with open(feedback_manager.feedback_file, "r", encoding="utf-8") as f:
369
- lines = f.readlines()
 
 
 
 
370
 
371
  found = False
 
372
  for i in range(len(lines) - 1, -1, -1):
373
  try:
374
  record = json.loads(lines[i])
@@ -395,6 +423,7 @@ async def submit_feedback(request: FeedbackRequest):
395
  message="Feedback recorded successfully"
396
  )
397
  else:
 
398
  feedback_manager.save_interaction(
399
  user_input=request.user_input,
400
  model_reply=request.model_reply,
@@ -420,11 +449,7 @@ async def get_stats():
420
 
421
  @app.get("/correction-count", response_model=CorrectionCountResponse)
422
  async def get_correction_count():
423
- """
424
- Get count of corrections for training readiness monitoring.
425
-
426
- Use this to check if you have enough corrections to train.
427
- """
428
  if not feedback_manager.feedback_file.exists():
429
  return CorrectionCountResponse(
430
  corrections=0,
@@ -454,21 +479,7 @@ async def get_correction_count():
454
 
455
  @app.get("/download-feedback", response_model=DownloadFeedbackResponse)
456
  async def download_feedback():
457
- """
458
- Download feedback file for training.
459
-
460
- Use this endpoint to download feedback from production backend
461
- to your training notebook.
462
-
463
- Example:
464
- ```python
465
- response = requests.get(f"{API_URL}/download-feedback")
466
- feedback_data = response.json()
467
-
468
- with open(HITL_FILE, 'w') as f:
469
- f.write(feedback_data["content"])
470
- ```
471
- """
472
  if not feedback_manager.feedback_file.exists():
473
  return DownloadFeedbackResponse(
474
  content="",
@@ -487,17 +498,7 @@ async def download_feedback():
487
 
488
  @app.post("/clear-feedback")
489
  async def clear_feedback():
490
- """
491
- Clear feedback file after training.
492
-
493
- Call this after you've downloaded feedback and completed training
494
- to start collecting fresh feedback for the next training cycle.
495
-
496
- Example:
497
- ```python
498
- requests.post(f"{API_URL}/clear-feedback")
499
- ```
500
- """
501
  try:
502
  if feedback_manager.feedback_file.exists():
503
  feedback_manager.feedback_file.unlink()
@@ -516,20 +517,7 @@ async def clear_feedback():
516
 
517
  @app.post("/reload-adapter")
518
  async def reload_adapter(request: ReloadAdapterRequest):
519
- """
520
- Hot reload model with new adapter without restarting the Space.
521
-
522
- This allows you to deploy new models without downtime.
523
-
524
- Example:
525
- ```python
526
- # After training and pushing to HF Hub
527
- requests.post(
528
- f"{API_URL}/reload-adapter",
529
- json={"adapter_path": "username/adapter-v2"}
530
- )
531
- ```
532
- """
533
  try:
534
  model_manager.initialize(
535
  model_name="meta-llama/Llama-3.2-1B-Instruct",
@@ -550,10 +538,4 @@ async def reload_adapter(request: ReloadAdapterRequest):
550
 
551
  if __name__ == "__main__":
552
  import uvicorn
553
-
554
- uvicorn.run(
555
- "app:app",
556
- host="0.0.0.0",
557
- port=7860,
558
- reload=True
559
- )
 
1
  """
2
  Enhanced FastAPI Backend with Feedback Management
 
 
 
 
 
 
 
 
 
 
 
3
  """
4
 
5
  from fastapi import FastAPI, HTTPException
 
12
  import torch
13
  from transformers import AutoTokenizer, AutoModelForCausalLM
14
  from peft import PeftModel
15
+ import os
16
 
17
  app = FastAPI(
18
  title="Personalized Chatbot API",
 
104
  if adapter_path:
105
  print(f"With adapter: {adapter_path}")
106
 
107
+ # Check for GPU
108
  self._device = "cuda" if torch.cuda.is_available() else "cpu"
109
  print(f"Using device: {self._device}")
110
 
111
+ try:
112
+ self._tokenizer = AutoTokenizer.from_pretrained(
113
+ model_name,
114
+ trust_remote_code=True
115
+ )
116
+ except Exception as e:
117
+ print(f"Error loading tokenizer: {e}")
118
+ print("Did you set HF_TOKEN in Settings > Secrets?")
119
+ raise e
120
 
121
  if self._tokenizer.pad_token is None:
122
  self._tokenizer.pad_token = self._tokenizer.eos_token
123
 
124
+ # CRITICAL FIX: Only try 4-bit if we actually have a GPU
125
+ if use_4bit and self._device == "cuda":
126
+ print("🚀 GPU detected: Loading in 4-bit mode")
127
+ try:
128
+ from transformers import BitsAndBytesConfig
129
+
130
+ bnb_config = BitsAndBytesConfig(
131
+ load_in_4bit=True,
132
+ bnb_4bit_quant_type="nf4",
133
+ bnb_4bit_compute_dtype=torch.float16,
134
+ bnb_4bit_use_double_quant=True,
135
+ )
136
+
137
+ base_model = AutoModelForCausalLM.from_pretrained(
138
+ model_name,
139
+ quantization_config=bnb_config,
140
+ device_map="auto",
141
+ trust_remote_code=True,
142
+ torch_dtype=torch.float16,
143
+ )
144
+ except ImportError:
145
+ print("⚠️ bitsandbytes not installed. Falling back to standard loading.")
146
+ base_model = AutoModelForCausalLM.from_pretrained(
147
+ model_name,
148
+ device_map="auto",
149
+ trust_remote_code=True,
150
+ )
151
  else:
152
+ print(f"⚠️ Using {self._device} (No GPU or use_4bit=False). Loading standard model.")
153
  base_model = AutoModelForCausalLM.from_pretrained(
154
  model_name,
155
+ device_map=self._device,
156
  trust_remote_code=True,
157
+ # Use float32 for CPU stability
158
+ torch_dtype=torch.float32 if self._device == "cpu" else torch.float16
159
  )
160
 
161
  if adapter_path and (isinstance(adapter_path, str) and adapter_path.strip()):
 
164
  self._model = PeftModel.from_pretrained(
165
  base_model,
166
  adapter_path,
167
+ torch_dtype=torch.float16 if self._device == "cuda" else torch.float32
168
  )
169
  self._current_adapter = adapter_path
170
  print(f"✅ Adapter loaded successfully")
 
227
  skip_special_tokens=True
228
  ).strip()
229
 
230
+ # Remove the system/user prompt if it leaked into response
231
+ if "assistant" in reply.lower() and len(reply.split("assistant")) > 1:
232
+ reply = reply.split("assistant")[-1].strip()
233
+
234
  return reply
235
 
236
 
 
238
  """Manages feedback storage and statistics."""
239
  def __init__(self, feedback_file: str = "data/feedback.jsonl"):
240
  self.feedback_file = Path(feedback_file)
241
+ # Ensure directory exists (Handled by Dockerfile too, but good safety)
242
  self.feedback_file.parent.mkdir(parents=True, exist_ok=True)
243
 
244
  def save_interaction(
 
309
  print("Starting up...")
310
 
311
  model_manager.initialize(
312
+ # 1. The Base Model (The heavy lifter)
313
+ # We use the official Llama 3.2 3B Instruct as the foundation
314
+ model_name="meta-llama/Llama-3.2-3B-Instruct",
315
+
316
+ # 2. Adapter (The personalization)
317
+ adapter_path="pierreramez/Llama-3.2-3B-Instruct-bnb-4bit_finetuned",
318
+
319
+ # 3. CPU Optimization
320
+ # Must be False for the free CPU tier
321
+ use_4bit=False
322
  )
323
 
324
  print("Ready to serve!")
 
331
  "message": "Personalized Chatbot API v2.0",
332
  "version": "2.0.0",
333
  "current_adapter": model_manager._current_adapter,
334
+ "device": model_manager._device,
335
  "endpoints": {
336
  "chat": "POST /chat",
337
  "feedback": "POST /feedback",
 
380
  )
381
 
382
  except Exception as e:
383
+ print(f"Error during chat: {e}")
384
  raise HTTPException(status_code=500, detail=str(e))
385
 
386
 
 
388
  async def submit_feedback(request: FeedbackRequest):
389
  """Submit correction for a model response."""
390
  try:
391
+ # Optimistic feedback update: try to find existing entry
392
+ if feedback_manager.feedback_file.exists():
393
+ with open(feedback_manager.feedback_file, "r", encoding="utf-8") as f:
394
+ lines = f.readlines()
395
+ else:
396
+ lines = []
397
 
398
  found = False
399
+ # Search backwards to find the most recent matching interaction
400
  for i in range(len(lines) - 1, -1, -1):
401
  try:
402
  record = json.loads(lines[i])
 
423
  message="Feedback recorded successfully"
424
  )
425
  else:
426
+ # If not found (e.g., app restarted), just append new record
427
  feedback_manager.save_interaction(
428
  user_input=request.user_input,
429
  model_reply=request.model_reply,
 
449
 
450
  @app.get("/correction-count", response_model=CorrectionCountResponse)
451
  async def get_correction_count():
452
+ """Get count of corrections for training readiness monitoring."""
 
 
 
 
453
  if not feedback_manager.feedback_file.exists():
454
  return CorrectionCountResponse(
455
  corrections=0,
 
479
 
480
  @app.get("/download-feedback", response_model=DownloadFeedbackResponse)
481
  async def download_feedback():
482
+ """Download feedback file for training."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  if not feedback_manager.feedback_file.exists():
484
  return DownloadFeedbackResponse(
485
  content="",
 
498
 
499
  @app.post("/clear-feedback")
500
  async def clear_feedback():
501
+ """Clear feedback file after training."""
 
 
 
 
 
 
 
 
 
 
502
  try:
503
  if feedback_manager.feedback_file.exists():
504
  feedback_manager.feedback_file.unlink()
 
517
 
518
  @app.post("/reload-adapter")
519
  async def reload_adapter(request: ReloadAdapterRequest):
520
+ """Hot reload model with new adapter."""
 
 
 
 
 
 
 
 
 
 
 
 
 
521
  try:
522
  model_manager.initialize(
523
  model_name="meta-llama/Llama-3.2-1B-Instruct",
 
538
 
539
if __name__ == "__main__":
    import os

    import uvicorn

    # reload=True is a development-only feature: uvicorn forks a file
    # watcher and restarts the worker whenever source files change, which
    # drops the in-memory model and doubles memory/startup cost. Inside
    # the HF Spaces Docker container we want a plain single-process
    # server, so the reloader is opt-in via UVICORN_RELOAD=1.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=os.environ.get("UVICORN_RELOAD", "0") == "1",
    )