Nhughes09 committed
Commit 4f30320 · Parent: bc2d859

Switch to llama-cpp-python with TinyLlama for HF Spaces cloud hosting

Files changed (2)
  1. app.py +68 -256
  2. requirements.txt +3 -3
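In outline, the new app.py drops the local Ollama server and runs inference in-process: it downloads a quantized TinyLlama GGUF from the Hub at startup, loads it with llama-cpp-python, and generates on CPU. A minimal sketch of that pattern, reusing the repo, file, and parameters from the committed code:

```python
# Minimal sketch of the pattern app.py now uses: fetch a quantized GGUF
# from the Hub, load it in-process, and generate on CPU (no Ollama server).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model_path = hf_hub_download(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2, verbose=False)

out = llm("User: Hello!\nAssistant:", max_tokens=64, stop=["User:"])
print(out["choices"][0]["text"].strip())
```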
app.py CHANGED
@@ -1,292 +1,104 @@
- # app.py - ULTRA-ROBUST Ollama Chatbot with MAXIMUM Logging
  import gradio as gr
- import requests
  import logging
  import sys
  import traceback
- import json
  from datetime import datetime

- # ============================================================================
- # ULTRA-DETAILED LOGGING SYSTEM
- # ============================================================================
- class ColoredFormatter(logging.Formatter):
-     """Custom formatter with colors for terminal output."""
-     COLORS = {
-         'DEBUG': '\033[94m',     # Blue
-         'INFO': '\033[92m',      # Green
-         'WARNING': '\033[93m',   # Yellow
-         'ERROR': '\033[91m',     # Red
-         'CRITICAL': '\033[95m',  # Magenta
-         'RESET': '\033[0m'       # Reset
-     }
-
-     def format(self, record):
-         color = self.COLORS.get(record.levelname, self.COLORS['RESET'])
-         reset = self.COLORS['RESET']
-         record.levelname = f"{color}{record.levelname:8}{reset}"
-         return super().format(record)
-
- # Configure logging
- handler = logging.StreamHandler(sys.stdout)
- handler.setFormatter(ColoredFormatter(
-     "%(asctime)s | %(levelname)s | [%(funcName)s:%(lineno)d] %(message)s"
- ))
  logger = logging.getLogger("CHATBOT")
- logger.setLevel(logging.DEBUG)
- logger.addHandler(handler)
- logger.propagate = False
-
- # Silence noisy libraries
- logging.getLogger("httpx").setLevel(logging.WARNING)
- logging.getLogger("httpcore").setLevel(logging.WARNING)
- logging.getLogger("gradio").setLevel(logging.WARNING)

- def banner(text):
-     logger.info("=" * 70)
-     logger.info(f" {text}")
-     logger.info("=" * 70)

- def section(text):
-     logger.info("-" * 50)
-     logger.info(f" >> {text}")
-     logger.info("-" * 50)

- # ============================================================================
- # STARTUP
- # ============================================================================
- banner("OLLAMA CHATBOT v3.0 - ULTRA LOGGING MODE")
- logger.info(f"Timestamp: {datetime.now().isoformat()}")
- logger.info(f"Python: {sys.version}")
- logger.info(f"Gradio: {gr.__version__}")
-
- # ============================================================================
- # OLLAMA CONFIGURATION
- # ============================================================================
- OLLAMA_URL = "http://localhost:11434"
- MODEL = "llama3.2:3b"
-
- section("OLLAMA CONNECTION TEST")
  try:
-     logger.info(f"Connecting to {OLLAMA_URL}...")
-     response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
-     logger.info(f"Status Code: {response.status_code}")
-
-     if response.status_code == 200:
-         models = [m["name"] for m in response.json().get("models", [])]
-         logger.info(f"SUCCESS! Found {len(models)} models:")
-         for m in models:
-             marker = " <<<< SELECTED" if m == MODEL else ""
-             logger.info(f" - {m}{marker}")
-
-         if MODEL in models:
-             logger.info(f"Model {MODEL} is available!")
-         else:
-             logger.warning(f"Model {MODEL} NOT FOUND - may cause errors")
-     else:
-         logger.error(f"Ollama error: {response.status_code}")
  except Exception as e:
-     logger.error(f"Cannot connect to Ollama: {e}")
-     logger.error(">>> RUN: ollama serve <<<")
-
- # ============================================================================
- # CONVERSATION MEMORY
- # ============================================================================
- # Store conversation history globally
- conversation_history = []

- def log_history():
-     """Log the current conversation history."""
-     logger.debug(f"Current history has {len(conversation_history)} messages:")
-     for i, msg in enumerate(conversation_history):
-         role = msg.get('role', '?')
-         content = msg.get('content', '')[:50]
-         logger.debug(f" [{i}] {role}: {content}...")

- # ============================================================================
- # MAIN CHAT FUNCTION
- # ============================================================================
  request_count = 0

- def chat_with_ollama(message, history):
-     """
-     Handle chat with ULTRA detailed logging.
-
-     This function:
-     1. Logs everything about the incoming request
-     2. Builds the prompt from conversation history
-     3. Calls Ollama API
-     4. Logs everything about the response
-     5. Returns the AI's response
-     """
-     global request_count, conversation_history
      request_count += 1
-     req_id = f"REQ-{request_count:04d}"
-
-     # ===== PHASE 1: LOG INCOMING REQUEST =====
-     section(f"{req_id} - NEW MESSAGE RECEIVED")
-     logger.info(f"[{req_id}] ┌─────────────────────────────────────────")
-     logger.info(f"[{req_id}] │ USER MESSAGE: {message}")
-     logger.info(f"[{req_id}] │ Message Length: {len(message)} chars")
-     logger.info(f"[{req_id}] │ Timestamp: {datetime.now().isoformat()}")
-     logger.info(f"[{req_id}] └─────────────────────────────────────────")

-     # ===== PHASE 2: LOG HISTORY FROM GRADIO =====
-     logger.info(f"[{req_id}] GRADIO HISTORY ANALYSIS:")
-     logger.info(f"[{req_id}] - Type: {type(history)}")
-     logger.info(f"[{req_id}] - Length: {len(history) if history else 0}")

-     if history:
-         for i, item in enumerate(history):
-             logger.debug(f"[{req_id}] - Item[{i}]: type={type(item).__name__}")
-             if isinstance(item, dict):
-                 logger.debug(f"[{req_id}] role={item.get('role')}, content_len={len(str(item.get('content', '')))}")
-             elif isinstance(item, (list, tuple)):
-                 logger.debug(f"[{req_id}] tuple/list with {len(item)} elements")
-             else:
-                 logger.debug(f"[{req_id}] value={str(item)[:100]}")

      try:
-         # ===== PHASE 3: BUILD PROMPT =====
-         logger.info(f"[{req_id}] BUILDING PROMPT...")
-
-         prompt_parts = ["You are a helpful AI assistant. Be friendly and conversational.\n"]
-
-         # Process history (handle multiple formats)
          if history:
-             for i, item in enumerate(history):
-                 try:
-                     if isinstance(item, dict):
-                         # New Gradio format
-                         role = item.get("role", "unknown")
-                         content = str(item.get("content", ""))
-                         if role == "user":
-                             prompt_parts.append(f"User: {content}")
-                         elif role == "assistant":
-                             prompt_parts.append(f"Assistant: {content}")
-                         logger.debug(f"[{req_id}] Added {role} message ({len(content)} chars)")
-                     elif isinstance(item, (list, tuple)) and len(item) >= 2:
-                         # Old Gradio format
-                         user_msg = str(item[0]) if item[0] else ""
-                         bot_msg = str(item[1]) if item[1] else ""
-                         if user_msg:
-                             prompt_parts.append(f"User: {user_msg}")
-                             logger.debug(f"[{req_id}] Added user message ({len(user_msg)} chars)")
-                         if bot_msg:
-                             prompt_parts.append(f"Assistant: {bot_msg}")
-                             logger.debug(f"[{req_id}] Added assistant message ({len(bot_msg)} chars)")
-                     else:
-                         logger.warning(f"[{req_id}] Skipping unknown history format: {type(item)}")
-                 except Exception as e:
-                     logger.error(f"[{req_id}] Error processing history item {i}: {e}")
-
-         # Add current message
-         prompt_parts.append(f"User: {message}")
-         prompt_parts.append("Assistant:")
-
-         full_prompt = "\n".join(prompt_parts)
-
-         logger.info(f"[{req_id}] PROMPT BUILT:")
-         logger.info(f"[{req_id}] - Total Parts: {len(prompt_parts)}")
-         logger.info(f"[{req_id}] - Total Length: {len(full_prompt)} chars")
-         logger.debug(f"[{req_id}] - Full Prompt:\n{full_prompt}")
-
-         # ===== PHASE 4: CALL OLLAMA =====
-         logger.info(f"[{req_id}] CALLING OLLAMA API...")
-         logger.info(f"[{req_id}] - URL: {OLLAMA_URL}/api/generate")
-         logger.info(f"[{req_id}] - Model: {MODEL}")
-         logger.info(f"[{req_id}] - Stream: False")

-         start_time = datetime.now()

-         payload = {
-             "model": MODEL,
-             "prompt": full_prompt,
-             "stream": False,
-             "options": {
-                 "temperature": 0.7,
-                 "num_predict": 500
-             }
-         }

-         logger.debug(f"[{req_id}] Request payload: {json.dumps(payload, indent=2)[:500]}...")

-         response = requests.post(
-             f"{OLLAMA_URL}/api/generate",
-             json=payload,
-             timeout=120
-         )

-         elapsed = (datetime.now() - start_time).total_seconds()

-         # ===== PHASE 5: LOG RESPONSE =====
-         logger.info(f"[{req_id}] RESPONSE RECEIVED:")
-         logger.info(f"[{req_id}] - Status Code: {response.status_code}")
-         logger.info(f"[{req_id}] - Time Elapsed: {elapsed:.2f} seconds")
-         logger.info(f"[{req_id}] - Response Size: {len(response.text)} bytes")
-
-         if response.status_code == 200:
-             result = response.json()
-
-             ai_response = result.get("response", "")
-             total_duration = result.get("total_duration", 0) / 1_000_000_000
-             eval_count = result.get("eval_count", 0)
-             prompt_eval_count = result.get("prompt_eval_count", 0)
-
-             logger.info(f"[{req_id}] OLLAMA STATS:")
-             logger.info(f"[{req_id}] - Prompt Tokens: {prompt_eval_count}")
-             logger.info(f"[{req_id}] - Response Tokens: {eval_count}")
-             logger.info(f"[{req_id}] - Total Duration: {total_duration:.2f}s")
-
-             logger.info(f"[{req_id}] AI RESPONSE:")
-             logger.info(f"[{req_id}] - Length: {len(ai_response)} chars")
-             logger.info(f"[{req_id}] - Preview: {ai_response[:200]}...")
-
-             # Save to global history
-             conversation_history.append({"role": "user", "content": message})
-             conversation_history.append({"role": "assistant", "content": ai_response})
-
-             logger.info(f"[{req_id}] SUCCESS! Returning response to user.")
-             return ai_response.strip()
-
-         else:
-             logger.error(f"[{req_id}] OLLAMA ERROR!")
-             logger.error(f"[{req_id}] - Status: {response.status_code}")
-             logger.error(f"[{req_id}] - Body: {response.text[:500]}")
-             return f"Error: Ollama returned status {response.status_code}\n\nDetails: {response.text[:200]}"
-
-     except requests.exceptions.ConnectionError as e:
-         logger.error(f"[{req_id}] CONNECTION ERROR!")
-         logger.error(f"[{req_id}] - Error: {e}")
-         logger.error(f"[{req_id}] - Is Ollama running? Try: ollama serve")
-         return "Error: Cannot connect to Ollama. Please run: ollama serve"
-
-     except requests.exceptions.Timeout:
-         logger.error(f"[{req_id}] TIMEOUT!")
-         logger.error(f"[{req_id}] - Request took longer than 120 seconds")
-         return "Error: Request timed out after 120 seconds"
-
      except Exception as e:
-         logger.error(f"[{req_id}] UNEXPECTED ERROR!")
-         logger.error(f"[{req_id}] - Type: {type(e).__name__}")
-         logger.error(f"[{req_id}] - Error: {e}")
-         logger.error(f"[{req_id}] - Traceback:\n{traceback.format_exc()}")
-         return f"Error: {type(e).__name__}: {e}\n\nCheck terminal logs for full traceback."
-
- # ============================================================================
- # GRADIO UI
- # ============================================================================
- section("BUILDING GRADIO UI")

  demo = gr.ChatInterface(
-     fn=chat_with_ollama,
-     title="🤖 CPU Chatbot v3.0",
-     description=f"**Powered by Ollama** ({MODEL})\n\n✅ Conversation Memory | 📋 Ultra Logging | 🔧 Local AI",
-     examples=["Hello!", "What is Python?", "Tell me a joke", "Remember my name is Nick"],
  )

- banner("READY! Open http://127.0.0.1:7860")
- logger.info("Watch this terminal for detailed logs of every message!")

  if __name__ == "__main__":
      demo.launch()
 
+ # app.py - HuggingFace Spaces Chatbot with Local LLM
  import gradio as gr
  import logging
  import sys
  import traceback
  from datetime import datetime
+ from huggingface_hub import hf_hub_download

+ # Logging setup
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s | %(levelname)-8s | %(message)s",
+     handlers=[logging.StreamHandler(sys.stdout)]
+ )
  logger = logging.getLogger("CHATBOT")

+ logger.info("=" * 60)
+ logger.info(" CPU CHATBOT - HUGGINGFACE SPACES EDITION")
+ logger.info("=" * 60)

+ # Model config
+ MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+ MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

+ logger.info(f"Downloading model: {MODEL_FILE}")
  try:
+     model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, cache_dir="/tmp/models")
+     logger.info(f"Model path: {model_path}")
  except Exception as e:
+     logger.error(f"Download failed: {e}")
+     model_path = None

+ # Load model
+ llm = None
+ if model_path:
+     try:
+         from llama_cpp import Llama
+         logger.info("Loading model into memory (30-60 sec)...")
+         llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2, n_batch=128, verbose=False)
+         logger.info("MODEL LOADED!")
+     except Exception as e:
+         logger.error(f"Load failed: {e}")

+ # Chat function
  request_count = 0

+ def chat_with_ai(message, history):
+     global request_count
      request_count += 1
+     rid = f"REQ-{request_count:04d}"

+     logger.info(f"[{rid}] User: {message}")

+     if llm is None:
+         return "Error: Model not loaded. Check logs."

      try:
+         # Build prompt
+         prompt = "You are a helpful AI assistant.\n\n"
          if history:
+             for item in history:
+                 if isinstance(item, dict):
+                     r = item.get("role", "")
+                     c = str(item.get("content", ""))
+                     if r == "user":
+                         prompt += f"User: {c}\n"
+                     elif r == "assistant":
+                         prompt += f"Assistant: {c}\n"
+                 elif isinstance(item, (list, tuple)) and len(item) >= 2:
+                     prompt += f"User: {item[0]}\n"
+                     if item[1]:
+                         prompt += f"Assistant: {item[1]}\n"

+         prompt += f"User: {message}\nAssistant:"

+         logger.info(f"[{rid}] Generating response...")
+         start = datetime.now()

+         output = llm(prompt, max_tokens=256, stop=["User:", "\n\n"], echo=False)

+         elapsed = (datetime.now() - start).total_seconds()
+         response = output["choices"][0]["text"].strip()

+         logger.info(f"[{rid}] Response in {elapsed:.1f}s: {response[:100]}...")
+         return response

      except Exception as e:
+         logger.error(f"[{rid}] Error: {e}")
+         logger.error(traceback.format_exc())
+         return f"Error: {e}"

+ # Gradio UI
+ logger.info("Building Gradio UI...")
  demo = gr.ChatInterface(
+     fn=chat_with_ai,
+     title="CPU Chatbot",
+     description="**Powered by TinyLlama 1.1B** - Runs entirely on HuggingFace's servers!",
+     examples=["Hello!", "What is AI?", "Tell me a joke"],
  )

+ logger.info("READY!")

  if __name__ == "__main__":
      demo.launch()
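One caveat on the new prompt construction: chat_with_ai concatenates plain "User:"/"Assistant:" turns, while TinyLlama-1.1B-Chat was trained on a Zephyr-style chat template. llama-cpp-python's create_chat_completion can apply the template instead; a hedged sketch of that alternative, assuming the GGUF embeds chat-template metadata:

```python
# Alternative to the hand-rolled prompt: let llama-cpp-python apply the
# model's chat template. Assumes `llm` is the Llama instance loaded above
# and that the GGUF carries template metadata (an assumption, not verified here).
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What is AI?"},
]
out = llm.create_chat_completion(messages=messages, max_tokens=256, temperature=0.7)
print(out["choices"][0]["message"]["content"].strip())
```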
requirements.txt CHANGED
@@ -1,3 +1,3 @@
- gradio==4.19.2
- huggingface_hub==0.22.2
- requests
+ gradio==4.44.0
+ llama-cpp-python==0.2.90
+ huggingface_hub>=0.20.0
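Note: llama-cpp-python is typically distributed as a source package, so the Space's first build compiles it and can take several minutes. A quick local sanity check that the pinned stack installs and imports together:

```python
# Smoke test for the pinned requirements; run locally before pushing.
# Assumes: pip install gradio==4.44.0 llama-cpp-python==0.2.90 "huggingface_hub>=0.20.0"
import gradio
import huggingface_hub
import llama_cpp

print("gradio:", gradio.__version__)
print("huggingface_hub:", huggingface_hub.__version__)
print("llama_cpp:", llama_cpp.__version__)
```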