Nhughes09 committed on
Commit · 4f30320 · 1 Parent(s): bc2d859
Switch to llama-cpp-python with TinyLlama for HF Spaces cloud hosting
Files changed:
- app.py  +68 -256
- requirements.txt  +3 -3
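In short: the previous app.py forwarded every chat message to a local Ollama server, which is not available on a HuggingFace Space, while the new app.py downloads a quantized TinyLlama GGUF from the Hub and runs it in-process on CPU with llama-cpp-python. A minimal sketch of that pattern, using the repo and filename pinned in the diff below (the prompt and generation settings here are illustrative, not the app's exact values):

    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    # Fetch the quantized GGUF once; hf_hub_download caches it between calls.
    path = hf_hub_download(
        repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
        filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    )

    # Load the model in-process; no separate model server is required.
    llm = Llama(model_path=path, n_ctx=2048, verbose=False)

    out = llm("User: Hello!\nAssistant:", max_tokens=64, stop=["User:"])
    print(out["choices"][0]["text"].strip())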
app.py CHANGED

@@ -1,292 +1,104 @@
-# app.py -
 import gradio as gr
-import requests
 import logging
 import sys
 import traceback
-import json
 from datetime import datetime
 
-#
-
-
-
-
-
-        'DEBUG': '\033[94m', # Blue
-        'INFO': '\033[92m', # Green
-        'WARNING': '\033[93m', # Yellow
-        'ERROR': '\033[91m', # Red
-        'CRITICAL': '\033[95m', # Magenta
-        'RESET': '\033[0m' # Reset
-    }
-
-    def format(self, record):
-        color = self.COLORS.get(record.levelname, self.COLORS['RESET'])
-        reset = self.COLORS['RESET']
-        record.levelname = f"{color}{record.levelname:8}{reset}"
-        return super().format(record)
-
-# Configure logging
-handler = logging.StreamHandler(sys.stdout)
-handler.setFormatter(ColoredFormatter(
-    "%(asctime)s | %(levelname)s | [%(funcName)s:%(lineno)d] %(message)s"
-))
 logger = logging.getLogger("CHATBOT")
-logger.setLevel(logging.DEBUG)
-logger.addHandler(handler)
-logger.propagate = False
-
-# Silence noisy libraries
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("httpcore").setLevel(logging.WARNING)
-logging.getLogger("gradio").setLevel(logging.WARNING)
 
-
-
-
-    logger.info("=" * 70)
 
-
-
-
-    logger.info("-" * 50)
 
-
-# STARTUP
-# ============================================================================
-banner("OLLAMA CHATBOT v3.0 - ULTRA LOGGING MODE")
-logger.info(f"Timestamp: {datetime.now().isoformat()}")
-logger.info(f"Python: {sys.version}")
-logger.info(f"Gradio: {gr.__version__}")
-
-# ============================================================================
-# OLLAMA CONFIGURATION
-# ============================================================================
-OLLAMA_URL = "http://localhost:11434"
-MODEL = "llama3.2:3b"
-
-section("OLLAMA CONNECTION TEST")
 try:
-
-
-    logger.info(f"Status Code: {response.status_code}")
-
-    if response.status_code == 200:
-        models = [m["name"] for m in response.json().get("models", [])]
-        logger.info(f"SUCCESS! Found {len(models)} models:")
-        for m in models:
-            marker = " <<<< SELECTED" if m == MODEL else ""
-            logger.info(f" - {m}{marker}")
-
-        if MODEL in models:
-            logger.info(f"Model {MODEL} is available!")
-        else:
-            logger.warning(f"Model {MODEL} NOT FOUND - may cause errors")
-    else:
-        logger.error(f"Ollama error: {response.status_code}")
 except Exception as e:
-    logger.error(f"
-
-
-# ============================================================================
-# CONVERSATION MEMORY
-# ============================================================================
-# Store conversation history globally
-conversation_history = []
 
-
-
-
-
-
-
-
 
-#
-# MAIN CHAT FUNCTION
-# ============================================================================
 request_count = 0
 
-def
-
-    Handle chat with ULTRA detailed logging.
-
-    This function:
-    1. Logs everything about the incoming request
-    2. Builds the prompt from conversation history
-    3. Calls Ollama API
-    4. Logs everything about the response
-    5. Returns the AI's response
-    """
-    global request_count, conversation_history
     request_count += 1
-
-
-    # ===== PHASE 1: LOG INCOMING REQUEST =====
-    section(f"{req_id} - NEW MESSAGE RECEIVED")
-    logger.info(f"[{req_id}] ┌─────────────────────────────────────────")
-    logger.info(f"[{req_id}] │ USER MESSAGE: {message}")
-    logger.info(f"[{req_id}] │ Message Length: {len(message)} chars")
-    logger.info(f"[{req_id}] │ Timestamp: {datetime.now().isoformat()}")
-    logger.info(f"[{req_id}] └─────────────────────────────────────────")
 
-
-    logger.info(f"[{req_id}] GRADIO HISTORY ANALYSIS:")
-    logger.info(f"[{req_id}] - Type: {type(history)}")
-    logger.info(f"[{req_id}] - Length: {len(history) if history else 0}")
 
-    if
-
-        logger.debug(f"[{req_id}] - Item[{i}]: type={type(item).__name__}")
-        if isinstance(item, dict):
-            logger.debug(f"[{req_id}] role={item.get('role')}, content_len={len(str(item.get('content', '')))}")
-        elif isinstance(item, (list, tuple)):
-            logger.debug(f"[{req_id}] tuple/list with {len(item)} elements")
-        else:
-            logger.debug(f"[{req_id}] value={str(item)[:100]}")
 
     try:
-        #
-
-
-        prompt_parts = ["You are a helpful AI assistant. Be friendly and conversational.\n"]
-
-        # Process history (handle multiple formats)
         if history:
-            for
-
-
-
-
-
-
-
-
-
-
-
-                # Old Gradio format
-                user_msg = str(item[0]) if item[0] else ""
-                bot_msg = str(item[1]) if item[1] else ""
-                if user_msg:
-                    prompt_parts.append(f"User: {user_msg}")
-                    logger.debug(f"[{req_id}] Added user message ({len(user_msg)} chars)")
-                if bot_msg:
-                    prompt_parts.append(f"Assistant: {bot_msg}")
-                    logger.debug(f"[{req_id}] Added assistant message ({len(bot_msg)} chars)")
-                else:
-                    logger.warning(f"[{req_id}] Skipping unknown history format: {type(item)}")
-            except Exception as e:
-                logger.error(f"[{req_id}] Error processing history item {i}: {e}")
-
-        # Add current message
-        prompt_parts.append(f"User: {message}")
-        prompt_parts.append("Assistant:")
-
-        full_prompt = "\n".join(prompt_parts)
-
-        logger.info(f"[{req_id}] PROMPT BUILT:")
-        logger.info(f"[{req_id}] - Total Parts: {len(prompt_parts)}")
-        logger.info(f"[{req_id}] - Total Length: {len(full_prompt)} chars")
-        logger.debug(f"[{req_id}] - Full Prompt:\n{full_prompt}")
-
-        # ===== PHASE 4: CALL OLLAMA =====
-        logger.info(f"[{req_id}] CALLING OLLAMA API...")
-        logger.info(f"[{req_id}] - URL: {OLLAMA_URL}/api/generate")
-        logger.info(f"[{req_id}] - Model: {MODEL}")
-        logger.info(f"[{req_id}] - Stream: False")
 
-
 
-
-
-            "prompt": full_prompt,
-            "stream": False,
-            "options": {
-                "temperature": 0.7,
-                "num_predict": 500
-            }
-        }
 
-
 
-
-
-            json=payload,
-            timeout=120
-        )
 
-
 
-        # ===== PHASE 5: LOG RESPONSE =====
-        logger.info(f"[{req_id}] RESPONSE RECEIVED:")
-        logger.info(f"[{req_id}] - Status Code: {response.status_code}")
-        logger.info(f"[{req_id}] - Time Elapsed: {elapsed:.2f} seconds")
-        logger.info(f"[{req_id}] - Response Size: {len(response.text)} bytes")
-
-        if response.status_code == 200:
-            result = response.json()
-
-            ai_response = result.get("response", "")
-            total_duration = result.get("total_duration", 0) / 1_000_000_000
-            eval_count = result.get("eval_count", 0)
-            prompt_eval_count = result.get("prompt_eval_count", 0)
-
-            logger.info(f"[{req_id}] OLLAMA STATS:")
-            logger.info(f"[{req_id}] - Prompt Tokens: {prompt_eval_count}")
-            logger.info(f"[{req_id}] - Response Tokens: {eval_count}")
-            logger.info(f"[{req_id}] - Total Duration: {total_duration:.2f}s")
-
-            logger.info(f"[{req_id}] AI RESPONSE:")
-            logger.info(f"[{req_id}] - Length: {len(ai_response)} chars")
-            logger.info(f"[{req_id}] - Preview: {ai_response[:200]}...")
-
-            # Save to global history
-            conversation_history.append({"role": "user", "content": message})
-            conversation_history.append({"role": "assistant", "content": ai_response})
-
-            logger.info(f"[{req_id}] SUCCESS! Returning response to user.")
-            return ai_response.strip()
-
-        else:
-            logger.error(f"[{req_id}] OLLAMA ERROR!")
-            logger.error(f"[{req_id}] - Status: {response.status_code}")
-            logger.error(f"[{req_id}] - Body: {response.text[:500]}")
-            return f"Error: Ollama returned status {response.status_code}\n\nDetails: {response.text[:200]}"
-
-    except requests.exceptions.ConnectionError as e:
-        logger.error(f"[{req_id}] CONNECTION ERROR!")
-        logger.error(f"[{req_id}] - Error: {e}")
-        logger.error(f"[{req_id}] - Is Ollama running? Try: ollama serve")
-        return "Error: Cannot connect to Ollama. Please run: ollama serve"
-
-    except requests.exceptions.Timeout:
-        logger.error(f"[{req_id}] TIMEOUT!")
-        logger.error(f"[{req_id}] - Request took longer than 120 seconds")
-        return "Error: Request timed out after 120 seconds"
-
     except Exception as e:
-        logger.error(f"[{
-        logger.error(
-
-        logger.error(f"[{req_id}] - Traceback:\n{traceback.format_exc()}")
-        return f"Error: {type(e).__name__}: {e}\n\nCheck terminal logs for full traceback."
-
-# ============================================================================
-# GRADIO UI
-# ============================================================================
-section("BUILDING GRADIO UI")
 
 demo = gr.ChatInterface(
-    fn=
-    title="
-    description=
-    examples=["Hello!", "What is
 )
 
-
-logger.info("Watch this terminal for detailed logs of every message!")
 
 if __name__ == "__main__":
     demo.launch()
+# app.py - HuggingFace Spaces Chatbot with Local LLM
 import gradio as gr
 import logging
 import sys
 import traceback
 from datetime import datetime
+from huggingface_hub import hf_hub_download
 
+# Logging setup
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)-8s | %(message)s",
+    handlers=[logging.StreamHandler(sys.stdout)]
+)
 logger = logging.getLogger("CHATBOT")
 
+logger.info("=" * 60)
+logger.info(" CPU CHATBOT - HUGGINGFACE SPACES EDITION")
+logger.info("=" * 60)
 
+# Model config
+MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
 
+logger.info(f"Downloading model: {MODEL_FILE}")
 try:
+    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, cache_dir="/tmp/models")
+    logger.info(f"Model path: {model_path}")
 except Exception as e:
+    logger.error(f"Download failed: {e}")
+    model_path = None
 
+# Load model
+llm = None
+if model_path:
+    try:
+        from llama_cpp import Llama
+        logger.info("Loading model into memory (30-60 sec)...")
+        llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2, n_batch=128, verbose=False)
+        logger.info("MODEL LOADED!")
+    except Exception as e:
+        logger.error(f"Load failed: {e}")
 
+# Chat function
 request_count = 0
 
+def chat_with_ai(message, history):
+    global request_count
     request_count += 1
+    rid = f"REQ-{request_count:04d}"
 
+    logger.info(f"[{rid}] User: {message}")
 
+    if llm is None:
+        return "Error: Model not loaded. Check logs."
 
     try:
+        # Build prompt
+        prompt = "You are a helpful AI assistant.\n\n"
         if history:
+            for item in history:
+                if isinstance(item, dict):
+                    r = item.get("role", "")
+                    c = str(item.get("content", ""))
+                    if r == "user":
+                        prompt += f"User: {c}\n"
+                    elif r == "assistant":
+                        prompt += f"Assistant: {c}\n"
+                elif isinstance(item, (list, tuple)) and len(item) >= 2:
+                    prompt += f"User: {item[0]}\n"
+                    if item[1]:
+                        prompt += f"Assistant: {item[1]}\n"
 
+        prompt += f"User: {message}\nAssistant:"
 
+        logger.info(f"[{rid}] Generating response...")
+        start = datetime.now()
 
+        output = llm(prompt, max_tokens=256, stop=["User:", "\n\n"], echo=False)
 
+        elapsed = (datetime.now() - start).total_seconds()
+        response = output["choices"][0]["text"].strip()
 
+        logger.info(f"[{rid}] Response in {elapsed:.1f}s: {response[:100]}...")
+        return response
 
     except Exception as e:
+        logger.error(f"[{rid}] Error: {e}")
+        logger.error(traceback.format_exc())
+        return f"Error: {e}"
 
+# Gradio UI
+logger.info("Building Gradio UI...")
 demo = gr.ChatInterface(
+    fn=chat_with_ai,
+    title="CPU Chatbot",
+    description="**Powered by TinyLlama 1.1B** - Runs entirely on HuggingFace's servers!",
+    examples=["Hello!", "What is AI?", "Tell me a joke"],
 )
 
+logger.info("READY!")
 
 if __name__ == "__main__":
     demo.launch()
requirements.txt CHANGED

@@ -1,3 +1,3 @@
-gradio==4.
-
-
+gradio==4.44.0
+llama-cpp-python==0.2.90
+huggingface_hub>=0.20.0