Stanley03 committed on
Commit
2892625
·
verified ·
1 Parent(s): 2a0d2d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -66
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py - OPTIMIZED TEXT-ONLY VERSION FOR HUGGING FACE SPACES
2
  from flask import Flask, request, jsonify
3
  from flask_cors import CORS
4
  import torch
@@ -20,11 +20,12 @@ ON_SPACES = os.environ.get('SPACE_ID') is not None
20
  logger.info(f"πŸš€ Running on Hugging Face Spaces: {ON_SPACES}")
21
 
22
  # ============================================================================
23
- # ULTRA-FAST QWEN MODEL LOADING
24
  # ============================================================================
25
 
26
- # Use the smallest Qwen model available
27
  MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
 
28
 
29
  model = None
30
  tokenizer = None
@@ -32,7 +33,7 @@ model_loaded = False
32
  model_loading = False
33
 
34
  def load_model_fast():
35
- """Fast model loading optimized for Spaces"""
36
  global model, tokenizer, model_loaded, model_loading
37
 
38
  if model_loading or model_loaded:
@@ -43,13 +44,13 @@ def load_model_fast():
43
  try:
44
  logger.info(f"πŸ”„ Loading {MODEL_NAME}...")
45
 
46
- # Import only when needed
47
  from transformers import AutoTokenizer, AutoModelForCausalLM
48
 
49
- # Load tokenizer first
50
  tokenizer = AutoTokenizer.from_pretrained(
51
  MODEL_NAME,
52
- trust_remote_code=True,
53
  padding_side="left"
54
  )
55
 
@@ -57,16 +58,16 @@ def load_model_fast():
57
  if tokenizer.pad_token is None:
58
  tokenizer.pad_token = tokenizer.eos_token
59
 
60
- # Load model with minimal settings
61
  model = AutoModelForCausalLM.from_pretrained(
62
  MODEL_NAME,
63
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
64
  device_map="auto" if torch.cuda.is_available() else None,
65
- trust_remote_code=True,
66
  low_cpu_mem_usage=True,
67
  )
68
 
69
- # If no GPU, move to CPU explicitly
70
  if not torch.cuda.is_available():
71
  model = model.to("cpu")
72
  logger.info("πŸ“± Model moved to CPU")
@@ -75,11 +76,60 @@ def load_model_fast():
75
 
76
  model.eval()
77
  model_loaded = True
78
- logger.info("βœ… Model loaded successfully!")
 
 
 
 
79
 
80
  except Exception as e:
81
- logger.error(f"❌ Model loading failed: {str(e)[:200]}")
82
- model_loaded = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  finally:
84
  model_loading = False
85
 
@@ -88,29 +138,36 @@ def load_model_fast():
88
  # ============================================================================
89
 
90
  def generate_quick(user_message, max_tokens=256):
91
- """Ultra-fast generation with minimal overhead"""
92
  if not model_loaded:
93
- return "Model is still loading, please wait a few seconds and try again..."
94
 
95
  try:
96
  # Truncate long messages
97
  if len(user_message) > 1000:
98
  user_message = user_message[:1000]
99
 
100
- # Format the prompt for Qwen chat template
101
  messages = [
102
- {"role": "system", "content": "You are Stanley AI, a helpful and knowledgeable assistant. Keep responses concise and helpful."},
 
 
 
103
  {"role": "user", "content": user_message}
104
  ]
105
 
106
- # Apply chat template
107
- text = tokenizer.apply_chat_template(
108
- messages,
109
- tokenize=False,
110
- add_generation_prompt=True
111
- )
 
 
 
 
112
 
113
- # Tokenize with truncation
114
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
115
 
116
  # Move to device
@@ -125,25 +182,49 @@ def generate_quick(user_message, max_tokens=256):
125
  temperature=0.7,
126
  do_sample=True,
127
  top_p=0.9,
128
- top_k=50,
129
  repetition_penalty=1.1,
130
- pad_token_id=tokenizer.eos_token_id,
131
  eos_token_id=tokenizer.eos_token_id,
132
- use_cache=True, # Important for speed
133
- attention_mask=inputs.get("attention_mask", None),
134
  )
135
 
136
- # Decode only new tokens
137
- response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  return response.strip()
140
 
141
  except Exception as e:
142
  logger.error(f"Generation error: {e}")
143
- return f"I encountered an error. Please try again or rephrase your question."
 
 
 
 
 
 
 
 
 
144
 
145
  # ============================================================================
146
- # SIMPLE CACHE SYSTEM
147
  # ============================================================================
148
 
149
  response_cache = {}
@@ -158,28 +239,37 @@ def cache_response(query, response):
158
  """Cache response"""
159
  key = query.lower().strip()[:80]
160
  if len(response_cache) >= CACHE_SIZE:
161
- # Remove oldest
162
  response_cache.pop(next(iter(response_cache)))
163
  response_cache[key] = response
164
 
165
  # ============================================================================
166
- # FLASK ROUTES - TEXT ONLY
167
  # ============================================================================
168
 
169
  @app.route('/')
170
  def home():
171
  return jsonify({
172
  "name": "Stanley AI",
173
- "version": "4.1",
174
  "model": MODEL_NAME,
175
  "status": "ready" if model_loaded else "loading",
176
  "platform": "huggingface-spaces",
177
  "endpoints": {
178
  "chat": "POST /api/chat",
179
  "status": "GET /api/status",
180
- "test": "GET /api/test"
 
181
  },
182
- "note": "Fast text generation with Qwen 0.5B model"
 
 
 
 
 
 
 
 
 
183
  })
184
 
185
  @app.route('/api/chat', methods=['POST', 'GET'])
@@ -188,11 +278,11 @@ def chat():
188
  start_time = time.time()
189
 
190
  try:
191
- # Handle both POST and GET
192
  if request.method == 'POST':
193
  data = request.get_json()
194
  if not data:
195
- return jsonify({"error": "No JSON data provided"}), 400
196
  user_message = data.get('message', '')
197
  else:
198
  user_message = request.args.get('message', 'Hello')
@@ -200,20 +290,23 @@ def chat():
200
  if not user_message:
201
  return jsonify({"error": "No message provided"}), 400
202
 
203
- # If model is not loaded yet
 
 
 
 
 
 
 
 
204
  if not model_loaded:
205
- # Start loading if not already loading
206
- if not model_loading:
207
- thread = threading.Thread(target=load_model_fast, daemon=True)
208
- thread.start()
209
-
210
  return jsonify({
211
- "response": "Model is loading... Please wait a few seconds and try again.",
212
  "status": "loading",
213
  "response_time": round(time.time() - start_time, 3)
214
  })
215
 
216
- # Check cache first
217
  cached = get_cached_response(user_message)
218
  if cached:
219
  logger.info("πŸ“¦ Using cached response")
@@ -245,13 +338,13 @@ def chat():
245
  except Exception as e:
246
  logger.error(f"Chat error: {e}")
247
  return jsonify({
248
- "error": f"Error: {str(e)[:200]}",
249
  "status": "error"
250
  }), 500
251
 
252
  @app.route('/api/status')
253
  def status():
254
- """Health check"""
255
  return jsonify({
256
  "model_loaded": model_loaded,
257
  "model_loading": model_loading,
@@ -259,17 +352,16 @@ def status():
259
  "device": str(model.device) if model_loaded else "none",
260
  "cache_size": len(response_cache),
261
  "timestamp": time.time(),
262
- "on_spaces": ON_SPACES,
263
- "memory": f"{torch.cuda.memory_allocated() / 1024**2:.1f} MB" if torch.cuda.is_available() and model_loaded else "CPU mode"
264
  })
265
 
266
  @app.route('/api/test')
267
  def test():
268
- """Quick test endpoint"""
269
  if not model_loaded:
270
  return jsonify({
271
  "status": "model_not_loaded",
272
- "message": "Model is still loading. Try /api/chat endpoint in a few seconds."
273
  })
274
 
275
  test_query = "Hello, who are you?"
@@ -280,30 +372,30 @@ def test():
280
  return jsonify({
281
  "test": "success",
282
  "query": test_query,
283
- "response_preview": response[:200] + "..." if len(response) > 200 else response,
284
  "response_time": time_taken,
285
  "model": MODEL_NAME
286
  })
287
 
288
- @app.route('/api/health')
289
- def health():
290
- """Simple health check for Spaces"""
291
  return jsonify({
292
- "status": "healthy",
293
- "timestamp": time.time()
 
 
294
  })
295
 
296
  # ============================================================================
297
- # STARTUP
298
  # ============================================================================
299
 
300
- # Start model loading in background when app starts
301
  if ON_SPACES:
302
- logger.info("Starting model load in background thread...")
303
  thread = threading.Thread(target=load_model_fast, daemon=True)
304
  thread.start()
305
  else:
306
- # Load immediately for local testing
307
  load_model_fast()
308
 
309
  # ============================================================================
@@ -312,14 +404,13 @@ else:
312
 
313
  if __name__ == '__main__':
314
  print("=" * 50)
315
- print("πŸš€ STANLEY AI - Hugging Face Spaces Edition")
316
  print(f"πŸ“¦ Model: {MODEL_NAME}")
317
  print(f"🌍 Platform: {'Hugging Face Spaces' if ON_SPACES else 'Local'}")
318
  print(f"⚑ Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
319
- print(f"πŸ“Š Model Status: {'Loaded' if model_loaded else 'Loading...'}")
320
  print("=" * 50)
321
 
322
- # Run app
323
  port = int(os.environ.get('PORT', 7860))
324
  app.run(
325
  debug=False,
 
1
+ # app.py - WORKING QWEN MODEL FOR HUGGING FACE SPACES
2
  from flask import Flask, request, jsonify
3
  from flask_cors import CORS
4
  import torch
 
20
  logger.info(f"πŸš€ Running on Hugging Face Spaces: {ON_SPACES}")
21
 
22
  # ============================================================================
23
+ # USE QWEN 0.5B WITH PROPER CONFIGURATION
24
  # ============================================================================
25
 
26
+ # Qwen 0.5B Model - will work with trust_remote_code
27
  MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
28
+ # Alternative: "Qwen/Qwen2.5-Coder-0.5B-Instruct" if the main one fails
29
 
30
  model = None
31
  tokenizer = None
 
33
  model_loading = False
34
 
35
  def load_model_fast():
36
+ """Load Qwen model with proper configuration"""
37
  global model, tokenizer, model_loaded, model_loading
38
 
39
  if model_loading or model_loaded:
 
44
  try:
45
  logger.info(f"πŸ”„ Loading {MODEL_NAME}...")
46
 
47
+ # Import transformers
48
  from transformers import AutoTokenizer, AutoModelForCausalLM
49
 
50
+ # IMPORTANT: Qwen requires trust_remote_code=True
51
  tokenizer = AutoTokenizer.from_pretrained(
52
  MODEL_NAME,
53
+ trust_remote_code=True, # REQUIRED for Qwen
54
  padding_side="left"
55
  )
56
 
 
58
  if tokenizer.pad_token is None:
59
  tokenizer.pad_token = tokenizer.eos_token
60
 
61
+ # Load model with trust_remote_code
62
  model = AutoModelForCausalLM.from_pretrained(
63
  MODEL_NAME,
64
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
65
  device_map="auto" if torch.cuda.is_available() else None,
66
+ trust_remote_code=True, # REQUIRED for Qwen
67
  low_cpu_mem_usage=True,
68
  )
69
 
70
+ # Move to CPU if no GPU
71
  if not torch.cuda.is_available():
72
  model = model.to("cpu")
73
  logger.info("πŸ“± Model moved to CPU")
 
76
 
77
  model.eval()
78
  model_loaded = True
79
+ logger.info(f"βœ… Model {MODEL_NAME} loaded successfully!")
80
+
81
+ # Test the model with a simple prompt
82
+ test_response = generate_quick("Hello", max_tokens=50)
83
+ logger.info(f"πŸ§ͺ Test successful: {test_response[:50]}...")
84
 
85
  except Exception as e:
86
+ logger.error(f"❌ Qwen model loading failed: {str(e)[:200]}")
87
+
88
+ # Try alternative Qwen model
89
+ try:
90
+ logger.info("πŸ”„ Trying alternative Qwen model...")
91
+ ALTERNATIVE_MODEL = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
92
+
93
+ tokenizer = AutoTokenizer.from_pretrained(
94
+ ALTERNATIVE_MODEL,
95
+ trust_remote_code=True,
96
+ )
97
+
98
+ model = AutoModelForCausalLM.from_pretrained(
99
+ ALTERNATIVE_MODEL,
100
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
101
+ device_map="auto" if torch.cuda.is_available() else None,
102
+ trust_remote_code=True,
103
+ low_cpu_mem_usage=True,
104
+ )
105
+
106
+ if not torch.cuda.is_available():
107
+ model = model.to("cpu")
108
+
109
+ model.eval()
110
+ model_loaded = True
111
+ logger.info(f"βœ… Alternative model {ALTERNATIVE_MODEL} loaded!")
112
+
113
+ except Exception as e2:
114
+ logger.error(f"❌ All Qwen models failed: {e2}")
115
+ # Fallback to a simple model
116
+ try:
117
+ logger.info("πŸ”„ Falling back to GPT-2...")
118
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
119
+
120
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
121
+ model = GPT2LMHeadModel.from_pretrained("gpt2")
122
+
123
+ if not torch.cuda.is_available():
124
+ model = model.to("cpu")
125
+
126
+ model.eval()
127
+ model_loaded = True
128
+ logger.info("βœ… GPT-2 fallback loaded!")
129
+
130
+ except Exception as e3:
131
+ logger.error(f"❌ Even GPT-2 failed: {e3}")
132
+ model_loaded = False
133
  finally:
134
  model_loading = False
135
 
 
138
  # ============================================================================
139
 
140
  def generate_quick(user_message, max_tokens=256):
141
+ """Generate response using Qwen model"""
142
  if not model_loaded:
143
+ return "πŸ”„ Stanley AI is starting up... Please wait a moment and try again!"
144
 
145
  try:
146
  # Truncate long messages
147
  if len(user_message) > 1000:
148
  user_message = user_message[:1000]
149
 
150
+ # Format for Qwen chat template
151
  messages = [
152
+ {
153
+ "role": "system",
154
+ "content": "You are Stanley AI, an advanced AI assistant created by Stanley Samwel Owino. You are helpful, knowledgeable, and incorporate Kiswahili phrases when appropriate."
155
+ },
156
  {"role": "user", "content": user_message}
157
  ]
158
 
159
+ # Apply Qwen chat template
160
+ try:
161
+ text = tokenizer.apply_chat_template(
162
+ messages,
163
+ tokenize=False,
164
+ add_generation_prompt=True
165
+ )
166
+ except:
167
+ # Fallback simple format
168
+ text = f"Human: {user_message}\nAssistant:"
169
 
170
+ # Tokenize
171
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
172
 
173
  # Move to device
 
182
  temperature=0.7,
183
  do_sample=True,
184
  top_p=0.9,
 
185
  repetition_penalty=1.1,
186
+ pad_token_id=tokenizer.pad_token_id,
187
  eos_token_id=tokenizer.eos_token_id,
188
+ use_cache=True,
 
189
  )
190
 
191
+ # Decode response
192
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
193
+
194
+ # Extract just the assistant's response
195
+ if "Assistant:" in response:
196
+ response = response.split("Assistant:")[-1].strip()
197
+ elif "assistant:" in response:
198
+ response = response.split("assistant:")[-1].strip()
199
+
200
+ # Add Kiswahili touch if relevant
201
+ if should_add_kiswahili(user_message):
202
+ kiswahili_phrases = [
203
+ "\n\nAsante sana kwa swali lako!",
204
+ "\n\nKaribu sana!",
205
+ "\n\nHakuna matata!",
206
+ "\n\nPoa sana!"
207
+ ]
208
+ import random
209
+ response += random.choice(kiswahili_phrases)
210
 
211
  return response.strip()
212
 
213
  except Exception as e:
214
  logger.error(f"Generation error: {e}")
215
+ return f"Samahani (Sorry)! I encountered an error: {str(e)[:100]}. Please try again."
216
+
217
+ def should_add_kiswahili(message):
218
+ """Check if we should add Kiswahili to response"""
219
+ kiswahili_keywords = [
220
+ 'swahili', 'kiswahili', 'hakuna matata', 'asante', 'jambo',
221
+ 'habari', 'rafiki', 'simba', 'africa', 'kenya', 'tanzania',
222
+ 'lion king', 'mufasa', 'nala', 'east africa', 'cultural'
223
+ ]
224
+ return any(keyword in message.lower() for keyword in kiswahili_keywords)
225
 
226
  # ============================================================================
227
+ # CACHE SYSTEM
228
  # ============================================================================
229
 
230
  response_cache = {}
 
239
  """Cache response"""
240
  key = query.lower().strip()[:80]
241
  if len(response_cache) >= CACHE_SIZE:
 
242
  response_cache.pop(next(iter(response_cache)))
243
  response_cache[key] = response
244
 
245
  # ============================================================================
246
+ # FLASK ROUTES
247
  # ============================================================================
248
 
249
  @app.route('/')
250
  def home():
251
  return jsonify({
252
  "name": "Stanley AI",
253
+ "version": "5.0",
254
  "model": MODEL_NAME,
255
  "status": "ready" if model_loaded else "loading",
256
  "platform": "huggingface-spaces",
257
  "endpoints": {
258
  "chat": "POST /api/chat",
259
  "status": "GET /api/status",
260
+ "test": "GET /api/test",
261
+ "health": "GET /health"
262
  },
263
+ "note": "Qwen 0.5B model with Kiswahili support"
264
+ })
265
+
266
+ @app.route('/health')
267
+ def health():
268
+ """Health check for Spaces"""
269
+ return jsonify({
270
+ "status": "healthy",
271
+ "model_loaded": model_loaded,
272
+ "timestamp": time.time()
273
  })
274
 
275
  @app.route('/api/chat', methods=['POST', 'GET'])
 
278
  start_time = time.time()
279
 
280
  try:
281
+ # Get message
282
  if request.method == 'POST':
283
  data = request.get_json()
284
  if not data:
285
+ return jsonify({"error": "No JSON data"}), 400
286
  user_message = data.get('message', '')
287
  else:
288
  user_message = request.args.get('message', 'Hello')
 
290
  if not user_message:
291
  return jsonify({"error": "No message provided"}), 400
292
 
293
+ logger.info(f"πŸ“© Message: {user_message[:50]}...")
294
+
295
+ # Start model loading if not started
296
+ if not model_loaded and not model_loading:
297
+ thread = threading.Thread(target=load_model_fast, daemon=True)
298
+ thread.start()
299
+ logger.info("πŸ”„ Started model loading")
300
+
301
+ # If model still loading
302
  if not model_loaded:
 
 
 
 
 
303
  return jsonify({
304
+ "response": "πŸ”„ Stanley AI is warming up... Please wait a moment and try again!",
305
  "status": "loading",
306
  "response_time": round(time.time() - start_time, 3)
307
  })
308
 
309
+ # Check cache
310
  cached = get_cached_response(user_message)
311
  if cached:
312
  logger.info("πŸ“¦ Using cached response")
 
338
  except Exception as e:
339
  logger.error(f"Chat error: {e}")
340
  return jsonify({
341
+ "error": "Error processing request",
342
  "status": "error"
343
  }), 500
344
 
345
  @app.route('/api/status')
346
  def status():
347
+ """Status endpoint"""
348
  return jsonify({
349
  "model_loaded": model_loaded,
350
  "model_loading": model_loading,
 
352
  "device": str(model.device) if model_loaded else "none",
353
  "cache_size": len(response_cache),
354
  "timestamp": time.time(),
355
+ "on_spaces": ON_SPACES
 
356
  })
357
 
358
  @app.route('/api/test')
359
  def test():
360
+ """Test endpoint"""
361
  if not model_loaded:
362
  return jsonify({
363
  "status": "model_not_loaded",
364
+ "message": "Model is still loading. Try in a few seconds."
365
  })
366
 
367
  test_query = "Hello, who are you?"
 
372
  return jsonify({
373
  "test": "success",
374
  "query": test_query,
375
+ "response": response,
376
  "response_time": time_taken,
377
  "model": MODEL_NAME
378
  })
379
 
380
+ @app.route('/api/stats')
381
+ def stats():
382
+ """Statistics endpoint"""
383
  return jsonify({
384
+ "uptime": time.time(),
385
+ "cache_hits": "N/A",
386
+ "total_requests": "N/A",
387
+ "average_response_time": "N/A"
388
  })
389
 
390
  # ============================================================================
391
+ # START MODEL LOADING
392
  # ============================================================================
393
 
 
394
  if ON_SPACES:
395
+ logger.info("πŸš€ Starting Qwen model load in background...")
396
  thread = threading.Thread(target=load_model_fast, daemon=True)
397
  thread.start()
398
  else:
 
399
  load_model_fast()
400
 
401
  # ============================================================================
 
404
 
405
  if __name__ == '__main__':
406
  print("=" * 50)
407
+ print("πŸš€ STANLEY AI - Qwen 0.5B Edition")
408
  print(f"πŸ“¦ Model: {MODEL_NAME}")
409
  print(f"🌍 Platform: {'Hugging Face Spaces' if ON_SPACES else 'Local'}")
410
  print(f"⚑ Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
411
+ print(f"πŸ“Š Status: {'Ready' if model_loaded else 'Loading...'}")
412
  print("=" * 50)
413
 
 
414
  port = int(os.environ.get('PORT', 7860))
415
  app.run(
416
  debug=False,