Stanley03 committed on
Commit
2892625
·
verified ·
1 Parent(s): 2a0d2d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -66
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py - OPTIMIZED TEXT-ONLY VERSION FOR HUGGING FACE SPACES
2
  from flask import Flask, request, jsonify
3
  from flask_cors import CORS
4
  import torch
@@ -20,11 +20,12 @@ ON_SPACES = os.environ.get('SPACE_ID') is not None
20
  logger.info(f"πŸš€ Running on Hugging Face Spaces: {ON_SPACES}")
21
 
22
  # ============================================================================
23
- # ULTRA-FAST QWEN MODEL LOADING
24
  # ============================================================================
25
 
26
- # Use the smallest Qwen model available
27
  MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
 
28
 
29
  model = None
30
  tokenizer = None
@@ -32,7 +33,7 @@ model_loaded = False
32
  model_loading = False
33
 
34
  def load_model_fast():
35
- """Fast model loading optimized for Spaces"""
36
  global model, tokenizer, model_loaded, model_loading
37
 
38
  if model_loading or model_loaded:
@@ -43,13 +44,13 @@ def load_model_fast():
43
  try:
44
  logger.info(f"πŸ”„ Loading {MODEL_NAME}...")
45
 
46
- # Import only when needed
47
  from transformers import AutoTokenizer, AutoModelForCausalLM
48
 
49
- # Load tokenizer first
50
  tokenizer = AutoTokenizer.from_pretrained(
51
  MODEL_NAME,
52
- trust_remote_code=True,
53
  padding_side="left"
54
  )
55
 
@@ -57,16 +58,16 @@ def load_model_fast():
57
  if tokenizer.pad_token is None:
58
  tokenizer.pad_token = tokenizer.eos_token
59
 
60
- # Load model with minimal settings
61
  model = AutoModelForCausalLM.from_pretrained(
62
  MODEL_NAME,
63
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
64
  device_map="auto" if torch.cuda.is_available() else None,
65
- trust_remote_code=True,
66
  low_cpu_mem_usage=True,
67
  )
68
 
69
- # If no GPU, move to CPU explicitly
70
  if not torch.cuda.is_available():
71
  model = model.to("cpu")
72
  logger.info("πŸ“± Model moved to CPU")
@@ -75,11 +76,60 @@ def load_model_fast():
75
 
76
  model.eval()
77
  model_loaded = True
78
- logger.info("βœ… Model loaded successfully!")
 
 
 
 
79
 
80
  except Exception as e:
81
- logger.error(f"❌ Model loading failed: {str(e)[:200]}")
82
- model_loaded = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  finally:
84
  model_loading = False
85
 
@@ -88,29 +138,36 @@ def load_model_fast():
88
  # ============================================================================
89
 
90
  def generate_quick(user_message, max_tokens=256):
91
- """Ultra-fast generation with minimal overhead"""
92
  if not model_loaded:
93
- return "Model is still loading, please wait a few seconds and try again..."
94
 
95
  try:
96
  # Truncate long messages
97
  if len(user_message) > 1000:
98
  user_message = user_message[:1000]
99
 
100
- # Format the prompt for Qwen chat template
101
  messages = [
102
- {"role": "system", "content": "You are Stanley AI, a helpful and knowledgeable assistant. Keep responses concise and helpful."},
 
 
 
103
  {"role": "user", "content": user_message}
104
  ]
105
 
106
- # Apply chat template
107
- text = tokenizer.apply_chat_template(
108
- messages,
109
- tokenize=False,
110
- add_generation_prompt=True
111
- )
 
 
 
 
112
 
113
- # Tokenize with truncation
114
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
115
 
116
  # Move to device
@@ -125,25 +182,49 @@ def generate_quick(user_message, max_tokens=256):
125
  temperature=0.7,
126
  do_sample=True,
127
  top_p=0.9,
128
- top_k=50,
129
  repetition_penalty=1.1,
130
- pad_token_id=tokenizer.eos_token_id,
131
  eos_token_id=tokenizer.eos_token_id,
132
- use_cache=True, # Important for speed
133
- attention_mask=inputs.get("attention_mask", None),
134
  )
135
 
136
- # Decode only new tokens
137
- response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  return response.strip()
140
 
141
  except Exception as e:
142
  logger.error(f"Generation error: {e}")
143
- return f"I encountered an error. Please try again or rephrase your question."
 
 
 
 
 
 
 
 
 
144
 
145
  # ============================================================================
146
- # SIMPLE CACHE SYSTEM
147
  # ============================================================================
148
 
149
  response_cache = {}
@@ -158,28 +239,37 @@ def cache_response(query, response):
158
  """Cache response"""
159
  key = query.lower().strip()[:80]
160
  if len(response_cache) >= CACHE_SIZE:
161
- # Remove oldest
162
  response_cache.pop(next(iter(response_cache)))
163
  response_cache[key] = response
164
 
165
  # ============================================================================
166
- # FLASK ROUTES - TEXT ONLY
167
  # ============================================================================
168
 
169
  @app.route('/')
170
  def home():
171
  return jsonify({
172
  "name": "Stanley AI",
173
- "version": "4.1",
174
  "model": MODEL_NAME,
175
  "status": "ready" if model_loaded else "loading",
176
  "platform": "huggingface-spaces",
177
  "endpoints": {
178
  "chat": "POST /api/chat",
179
  "status": "GET /api/status",
180
- "test": "GET /api/test"
 
181
  },
182
- "note": "Fast text generation with Qwen 0.5B model"
 
 
 
 
 
 
 
 
 
183
  })
184
 
185
  @app.route('/api/chat', methods=['POST', 'GET'])
@@ -188,11 +278,11 @@ def chat():
188
  start_time = time.time()
189
 
190
  try:
191
- # Handle both POST and GET
192
  if request.method == 'POST':
193
  data = request.get_json()
194
  if not data:
195
- return jsonify({"error": "No JSON data provided"}), 400
196
  user_message = data.get('message', '')
197
  else:
198
  user_message = request.args.get('message', 'Hello')
@@ -200,20 +290,23 @@ def chat():
200
  if not user_message:
201
  return jsonify({"error": "No message provided"}), 400
202
 
203
- # If model is not loaded yet
 
 
 
 
 
 
 
 
204
  if not model_loaded:
205
- # Start loading if not already loading
206
- if not model_loading:
207
- thread = threading.Thread(target=load_model_fast, daemon=True)
208
- thread.start()
209
-
210
  return jsonify({
211
- "response": "Model is loading... Please wait a few seconds and try again.",
212
  "status": "loading",
213
  "response_time": round(time.time() - start_time, 3)
214
  })
215
 
216
- # Check cache first
217
  cached = get_cached_response(user_message)
218
  if cached:
219
  logger.info("πŸ“¦ Using cached response")
@@ -245,13 +338,13 @@ def chat():
245
  except Exception as e:
246
  logger.error(f"Chat error: {e}")
247
  return jsonify({
248
- "error": f"Error: {str(e)[:200]}",
249
  "status": "error"
250
  }), 500
251
 
252
  @app.route('/api/status')
253
  def status():
254
- """Health check"""
255
  return jsonify({
256
  "model_loaded": model_loaded,
257
  "model_loading": model_loading,
@@ -259,17 +352,16 @@ def status():
259
  "device": str(model.device) if model_loaded else "none",
260
  "cache_size": len(response_cache),
261
  "timestamp": time.time(),
262
- "on_spaces": ON_SPACES,
263
- "memory": f"{torch.cuda.memory_allocated() / 1024**2:.1f} MB" if torch.cuda.is_available() and model_loaded else "CPU mode"
264
  })
265
 
266
  @app.route('/api/test')
267
  def test():
268
- """Quick test endpoint"""
269
  if not model_loaded:
270
  return jsonify({
271
  "status": "model_not_loaded",
272
- "message": "Model is still loading. Try /api/chat endpoint in a few seconds."
273
  })
274
 
275
  test_query = "Hello, who are you?"
@@ -280,30 +372,30 @@ def test():
280
  return jsonify({
281
  "test": "success",
282
  "query": test_query,
283
- "response_preview": response[:200] + "..." if len(response) > 200 else response,
284
  "response_time": time_taken,
285
  "model": MODEL_NAME
286
  })
287
 
288
- @app.route('/api/health')
289
- def health():
290
- """Simple health check for Spaces"""
291
  return jsonify({
292
- "status": "healthy",
293
- "timestamp": time.time()
 
 
294
  })
295
 
296
  # ============================================================================
297
- # STARTUP
298
  # ============================================================================
299
 
300
- # Start model loading in background when app starts
301
  if ON_SPACES:
302
- logger.info("Starting model load in background thread...")
303
  thread = threading.Thread(target=load_model_fast, daemon=True)
304
  thread.start()
305
  else:
306
- # Load immediately for local testing
307
  load_model_fast()
308
 
309
  # ============================================================================
@@ -312,14 +404,13 @@ else:
312
 
313
  if __name__ == '__main__':
314
  print("=" * 50)
315
- print("πŸš€ STANLEY AI - Hugging Face Spaces Edition")
316
  print(f"πŸ“¦ Model: {MODEL_NAME}")
317
  print(f"🌍 Platform: {'Hugging Face Spaces' if ON_SPACES else 'Local'}")
318
  print(f"⚑ Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
319
- print(f"πŸ“Š Model Status: {'Loaded' if model_loaded else 'Loading...'}")
320
  print("=" * 50)
321
 
322
- # Run app
323
  port = int(os.environ.get('PORT', 7860))
324
  app.run(
325
  debug=False,
 
1
+ # app.py - WORKING QWEN MODEL FOR HUGGING FACE SPACES
2
  from flask import Flask, request, jsonify
3
  from flask_cors import CORS
4
  import torch
 
20
  logger.info(f"πŸš€ Running on Hugging Face Spaces: {ON_SPACES}")
21
 
22
  # ============================================================================
23
+ # USE QWEN 0.5B WITH PROPER CONFIGURATION
24
  # ============================================================================
25
 
26
+ # Qwen 0.5B Model - will work with trust_remote_code
27
  MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
28
+ # Alternative: "Qwen/Qwen2.5-Coder-0.5B-Instruct" if the main one fails
29
 
30
  model = None
31
  tokenizer = None
 
33
  model_loading = False
34
 
35
  def load_model_fast():
36
+ """Load Qwen model with proper configuration"""
37
  global model, tokenizer, model_loaded, model_loading
38
 
39
  if model_loading or model_loaded:
 
44
  try:
45
  logger.info(f"πŸ”„ Loading {MODEL_NAME}...")
46
 
47
+ # Import transformers
48
  from transformers import AutoTokenizer, AutoModelForCausalLM
49
 
50
+ # IMPORTANT: Qwen requires trust_remote_code=True
51
  tokenizer = AutoTokenizer.from_pretrained(
52
  MODEL_NAME,
53
+ trust_remote_code=True, # REQUIRED for Qwen
54
  padding_side="left"
55
  )
56
 
 
58
  if tokenizer.pad_token is None:
59
  tokenizer.pad_token = tokenizer.eos_token
60
 
61
+ # Load model with trust_remote_code
62
  model = AutoModelForCausalLM.from_pretrained(
63
  MODEL_NAME,
64
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
65
  device_map="auto" if torch.cuda.is_available() else None,
66
+ trust_remote_code=True, # REQUIRED for Qwen
67
  low_cpu_mem_usage=True,
68
  )
69
 
70
+ # Move to CPU if no GPU
71
  if not torch.cuda.is_available():
72
  model = model.to("cpu")
73
  logger.info("πŸ“± Model moved to CPU")
 
76
 
77
  model.eval()
78
  model_loaded = True
79
+ logger.info(f"βœ… Model {MODEL_NAME} loaded successfully!")
80
+
81
+ # Test the model with a simple prompt
82
+ test_response = generate_quick("Hello", max_tokens=50)
83
+ logger.info(f"πŸ§ͺ Test successful: {test_response[:50]}...")
84
 
85
  except Exception as e:
86
+ logger.error(f"❌ Qwen model loading failed: {str(e)[:200]}")
87
+
88
+ # Try alternative Qwen model
89
+ try:
90
+ logger.info("πŸ”„ Trying alternative Qwen model...")
91
+ ALTERNATIVE_MODEL = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
92
+
93
+ tokenizer = AutoTokenizer.from_pretrained(
94
+ ALTERNATIVE_MODEL,
95
+ trust_remote_code=True,
96
+ )
97
+
98
+ model = AutoModelForCausalLM.from_pretrained(
99
+ ALTERNATIVE_MODEL,
100
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
101
+ device_map="auto" if torch.cuda.is_available() else None,
102
+ trust_remote_code=True,
103
+ low_cpu_mem_usage=True,
104
+ )
105
+
106
+ if not torch.cuda.is_available():
107
+ model = model.to("cpu")
108
+
109
+ model.eval()
110
+ model_loaded = True
111
+ logger.info(f"βœ… Alternative model {ALTERNATIVE_MODEL} loaded!")
112
+
113
+ except Exception as e2:
114
+ logger.error(f"❌ All Qwen models failed: {e2}")
115
+ # Fallback to a simple model
116
+ try:
117
+ logger.info("πŸ”„ Falling back to GPT-2...")
118
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
119
+
120
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
121
+ model = GPT2LMHeadModel.from_pretrained("gpt2")
122
+
123
+ if not torch.cuda.is_available():
124
+ model = model.to("cpu")
125
+
126
+ model.eval()
127
+ model_loaded = True
128
+ logger.info("βœ… GPT-2 fallback loaded!")
129
+
130
+ except Exception as e3:
131
+ logger.error(f"❌ Even GPT-2 failed: {e3}")
132
+ model_loaded = False
133
  finally:
134
  model_loading = False
135
 
 
138
  # ============================================================================
139
 
140
  def generate_quick(user_message, max_tokens=256):
141
+ """Generate response using Qwen model"""
142
  if not model_loaded:
143
+ return "πŸ”„ Stanley AI is starting up... Please wait a moment and try again!"
144
 
145
  try:
146
  # Truncate long messages
147
  if len(user_message) > 1000:
148
  user_message = user_message[:1000]
149
 
150
+ # Format for Qwen chat template
151
  messages = [
152
+ {
153
+ "role": "system",
154
+ "content": "You are Stanley AI, an advanced AI assistant created by Stanley Samwel Owino. You are helpful, knowledgeable, and incorporate Kiswahili phrases when appropriate."
155
+ },
156
  {"role": "user", "content": user_message}
157
  ]
158
 
159
+ # Apply Qwen chat template
160
+ try:
161
+ text = tokenizer.apply_chat_template(
162
+ messages,
163
+ tokenize=False,
164
+ add_generation_prompt=True
165
+ )
166
+ except:
167
+ # Fallback simple format
168
+ text = f"Human: {user_message}\nAssistant:"
169
 
170
+ # Tokenize
171
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
172
 
173
  # Move to device
 
182
  temperature=0.7,
183
  do_sample=True,
184
  top_p=0.9,
 
185
  repetition_penalty=1.1,
186
+ pad_token_id=tokenizer.pad_token_id,
187
  eos_token_id=tokenizer.eos_token_id,
188
+ use_cache=True,
 
189
  )
190
 
191
+ # Decode response
192
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
193
+
194
+ # Extract just the assistant's response
195
+ if "Assistant:" in response:
196
+ response = response.split("Assistant:")[-1].strip()
197
+ elif "assistant:" in response:
198
+ response = response.split("assistant:")[-1].strip()
199
+
200
+ # Add Kiswahili touch if relevant
201
+ if should_add_kiswahili(user_message):
202
+ kiswahili_phrases = [
203
+ "\n\nAsante sana kwa swali lako!",
204
+ "\n\nKaribu sana!",
205
+ "\n\nHakuna matata!",
206
+ "\n\nPoa sana!"
207
+ ]
208
+ import random
209
+ response += random.choice(kiswahili_phrases)
210
 
211
  return response.strip()
212
 
213
  except Exception as e:
214
  logger.error(f"Generation error: {e}")
215
+ return f"Samahani (Sorry)! I encountered an error: {str(e)[:100]}. Please try again."
216
+
217
+ def should_add_kiswahili(message):
218
+ """Check if we should add Kiswahili to response"""
219
+ kiswahili_keywords = [
220
+ 'swahili', 'kiswahili', 'hakuna matata', 'asante', 'jambo',
221
+ 'habari', 'rafiki', 'simba', 'africa', 'kenya', 'tanzania',
222
+ 'lion king', 'mufasa', 'nala', 'east africa', 'cultural'
223
+ ]
224
+ return any(keyword in message.lower() for keyword in kiswahili_keywords)
225
 
226
  # ============================================================================
227
+ # CACHE SYSTEM
228
  # ============================================================================
229
 
230
  response_cache = {}
 
239
  """Cache response"""
240
  key = query.lower().strip()[:80]
241
  if len(response_cache) >= CACHE_SIZE:
 
242
  response_cache.pop(next(iter(response_cache)))
243
  response_cache[key] = response
244
 
245
  # ============================================================================
246
+ # FLASK ROUTES
247
  # ============================================================================
248
 
249
  @app.route('/')
250
  def home():
251
  return jsonify({
252
  "name": "Stanley AI",
253
+ "version": "5.0",
254
  "model": MODEL_NAME,
255
  "status": "ready" if model_loaded else "loading",
256
  "platform": "huggingface-spaces",
257
  "endpoints": {
258
  "chat": "POST /api/chat",
259
  "status": "GET /api/status",
260
+ "test": "GET /api/test",
261
+ "health": "GET /health"
262
  },
263
+ "note": "Qwen 0.5B model with Kiswahili support"
264
+ })
265
+
266
+ @app.route('/health')
267
+ def health():
268
+ """Health check for Spaces"""
269
+ return jsonify({
270
+ "status": "healthy",
271
+ "model_loaded": model_loaded,
272
+ "timestamp": time.time()
273
  })
274
 
275
  @app.route('/api/chat', methods=['POST', 'GET'])
 
278
  start_time = time.time()
279
 
280
  try:
281
+ # Get message
282
  if request.method == 'POST':
283
  data = request.get_json()
284
  if not data:
285
+ return jsonify({"error": "No JSON data"}), 400
286
  user_message = data.get('message', '')
287
  else:
288
  user_message = request.args.get('message', 'Hello')
 
290
  if not user_message:
291
  return jsonify({"error": "No message provided"}), 400
292
 
293
+ logger.info(f"πŸ“© Message: {user_message[:50]}...")
294
+
295
+ # Start model loading if not started
296
+ if not model_loaded and not model_loading:
297
+ thread = threading.Thread(target=load_model_fast, daemon=True)
298
+ thread.start()
299
+ logger.info("πŸ”„ Started model loading")
300
+
301
+ # If model still loading
302
  if not model_loaded:
 
 
 
 
 
303
  return jsonify({
304
+ "response": "πŸ”„ Stanley AI is warming up... Please wait a moment and try again!",
305
  "status": "loading",
306
  "response_time": round(time.time() - start_time, 3)
307
  })
308
 
309
+ # Check cache
310
  cached = get_cached_response(user_message)
311
  if cached:
312
  logger.info("πŸ“¦ Using cached response")
 
338
  except Exception as e:
339
  logger.error(f"Chat error: {e}")
340
  return jsonify({
341
+ "error": "Error processing request",
342
  "status": "error"
343
  }), 500
344
 
345
  @app.route('/api/status')
346
  def status():
347
+ """Status endpoint"""
348
  return jsonify({
349
  "model_loaded": model_loaded,
350
  "model_loading": model_loading,
 
352
  "device": str(model.device) if model_loaded else "none",
353
  "cache_size": len(response_cache),
354
  "timestamp": time.time(),
355
+ "on_spaces": ON_SPACES
 
356
  })
357
 
358
  @app.route('/api/test')
359
  def test():
360
+ """Test endpoint"""
361
  if not model_loaded:
362
  return jsonify({
363
  "status": "model_not_loaded",
364
+ "message": "Model is still loading. Try in a few seconds."
365
  })
366
 
367
  test_query = "Hello, who are you?"
 
372
  return jsonify({
373
  "test": "success",
374
  "query": test_query,
375
+ "response": response,
376
  "response_time": time_taken,
377
  "model": MODEL_NAME
378
  })
379
 
380
+ @app.route('/api/stats')
381
+ def stats():
382
+ """Statistics endpoint"""
383
  return jsonify({
384
+ "uptime": time.time(),
385
+ "cache_hits": "N/A",
386
+ "total_requests": "N/A",
387
+ "average_response_time": "N/A"
388
  })
389
 
390
  # ============================================================================
391
+ # START MODEL LOADING
392
  # ============================================================================
393
 
 
394
  if ON_SPACES:
395
+ logger.info("πŸš€ Starting Qwen model load in background...")
396
  thread = threading.Thread(target=load_model_fast, daemon=True)
397
  thread.start()
398
  else:
 
399
  load_model_fast()
400
 
401
  # ============================================================================
 
404
 
405
  if __name__ == '__main__':
406
  print("=" * 50)
407
+ print("πŸš€ STANLEY AI - Qwen 0.5B Edition")
408
  print(f"πŸ“¦ Model: {MODEL_NAME}")
409
  print(f"🌍 Platform: {'Hugging Face Spaces' if ON_SPACES else 'Local'}")
410
  print(f"⚑ Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
411
+ print(f"πŸ“Š Status: {'Ready' if model_loaded else 'Loading...'}")
412
  print("=" * 50)
413
 
 
414
  port = int(os.environ.get('PORT', 7860))
415
  app.run(
416
  debug=False,