Update app.py
app.py
CHANGED
@@ -6,6 +6,7 @@ import time
 import logging
 import os
 import json

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -24,15 +25,20 @@ logger.info(f"Running on Hugging Face Spaces: {ON_SPACES}")

 # Use the smallest Qwen model available
 MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
-# Or even smaller alternative: "Qwen/Qwen2.5-Coder-0.5B-Instruct"

 model = None
 tokenizer = None
 model_loaded = False

 def load_model_fast():
     """Fast model loading optimized for Spaces"""
-    global model, tokenizer, model_loaded

     try:
         logger.info(f"Loading {MODEL_NAME}...")
@@ -71,13 +77,11 @@ def load_model_fast():
         model_loaded = True
         logger.info("Model loaded successfully!")

-        # Test a quick generation
-        test_response = generate_quick("Hello")
-        logger.info(f"Test generation: {test_response[:50]}...")
-
     except Exception as e:
         logger.error(f"Model loading failed: {str(e)[:200]}")
         model_loaded = False

 # ============================================================================
 # OPTIMIZED GENERATION FUNCTIONS
@@ -86,12 +90,16 @@ def load_model_fast():
 def generate_quick(user_message, max_tokens=256):
     """Ultra-fast generation with minimal overhead"""
     if not model_loaded:
-        return "Model is still loading, please wait..."

     try:
         # Format the prompt for Qwen chat template
         messages = [
-            {"role": "system", "content": "You are Stanley AI, a helpful assistant."},
             {"role": "user", "content": user_message}
         ]

@@ -102,7 +110,7 @@ def generate_quick(user_message, max_tokens=256):
             add_generation_prompt=True
         )

-        # Tokenize
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

         # Move to device
@@ -132,74 +140,23 @@ def generate_quick(user_message, max_tokens=256):

     except Exception as e:
         logger.error(f"Generation error: {e}")
-        return f"I encountered an error
-
-def generate_streaming(user_message, max_tokens=256):
-    """Streaming response for better UX"""
-    if not model_loaded:
-        yield "data: Model is still loading, please wait...\n\n"
-        return
-
-    try:
-        # Format prompt
-        messages = [
-            {"role": "system", "content": "You are Stanley AI, a helpful assistant."},
-            {"role": "user", "content": user_message}
-        ]
-
-        text = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
-        device = model.device
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-
-        # Generate token by token
-        with torch.no_grad():
-            generated = inputs['input_ids'].clone()
-            for _ in range(max_tokens):
-                outputs = model(
-                    input_ids=generated,
-                    attention_mask=torch.ones_like(generated) if 'attention_mask' not in inputs else None,
-                    use_cache=True
-                )
-
-                # Get next token
-                next_token_logits = outputs.logits[:, -1, :]
-                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
-
-                # Check for eos
-                if next_token.item() == tokenizer.eos_token_id:
-                    break
-
-                # Decode and yield
-                generated = torch.cat([generated, next_token], dim=-1)
-                token_text = tokenizer.decode(next_token[0], skip_special_tokens=True)
-
-                yield f"data: {json.dumps({'token': token_text})}\n\n"
-
-    except Exception as e:
-        logger.error(f"Streaming error: {e}")
-        yield f"data: {json.dumps({'error': str(e)[:100]})}\n\n"

 # ============================================================================
-# CACHE SYSTEM
 # ============================================================================

 response_cache = {}
-CACHE_SIZE =

 def get_cached_response(query):
     """Get response from cache"""
-    key = query.lower().strip()[:
     return response_cache.get(key)

 def cache_response(query, response):
     """Cache response"""
-    key = query.lower().strip()[:
     if len(response_cache) >= CACHE_SIZE:
         # Remove oldest
         response_cache.pop(next(iter(response_cache)))
@@ -212,22 +169,22 @@ def cache_response(query, response):
 @app.route('/')
 def home():
     return jsonify({
-        "name": "Stanley AI
-        "version": "4.
         "model": MODEL_NAME,
         "status": "ready" if model_loaded else "loading",
-        "
         "endpoints": {
-            "chat": "/api/chat",
-            "
-            "
         },
-        "note": "
     })

 @app.route('/api/chat', methods=['POST', 'GET'])
 def chat():
-    """Main chat endpoint
     start_time = time.time()

     try:
@@ -243,6 +200,19 @@ def chat():
         if not user_message:
             return jsonify({"error": "No message provided"}), 400

         # Check cache first
         cached = get_cached_response(user_message)
         if cached:
@@ -251,7 +221,8 @@ def chat():
                 "response": cached,
                 "cached": True,
                 "response_time": round(time.time() - start_time, 3),
-                "model": MODEL_NAME
             })

         # Generate response
@@ -278,76 +249,61 @@ def chat():
             "status": "error"
         }), 500

-@app.route('/api/chat/stream')
-def chat_stream():
-    """Streaming chat endpoint"""
-    user_message = request.args.get('message', 'Hello')
-
-    def generate():
-        for token in generate_streaming(user_message):
-            yield token
-        yield "data: [DONE]\n\n"
-
-    return app.response_class(
-        generate(),
-        mimetype='text/event-stream',
-        headers={
-            'Cache-Control': 'no-cache',
-            'X-Accel-Buffering': 'no'
-        }
-    )
-
 @app.route('/api/status')
 def status():
     """Health check"""
     return jsonify({
         "model_loaded": model_loaded,
         "model_name": MODEL_NAME,
         "device": str(model.device) if model_loaded else "none",
         "cache_size": len(response_cache),
         "timestamp": time.time(),
-        "
     })

 @app.route('/api/test')
 def test():
     """Quick test endpoint"""
-
-
-
-
-        "Explain quantum computing simply"
-    ]
-
-    results = []
-    for query in test_queries[:2]: # Test only 2 to be fast
-        start = time.time()
-        response = generate_quick(query, max_tokens=100)
-        time_taken = round(time.time() - start, 3)
-        results.append({
-            "query": query,
-            "response": response[:100] + "..." if len(response) > 100 else response,
-            "time": time_taken
         })

     return jsonify({
-        "
-        "
     })

 # ============================================================================
-# STARTUP
 # ============================================================================

-
-
-"
-
-
-
-#
-if not ON_SPACES:
-    logger.info("Local mode - loading model immediately")
     load_model_fast()

 # ============================================================================
@@ -356,19 +312,13 @@ if not ON_SPACES:

 if __name__ == '__main__':
     print("=" * 50)
-    print("STANLEY AI -
     print(f"Model: {MODEL_NAME}")
     print(f"Platform: {'Hugging Face Spaces' if ON_SPACES else 'Local'}")
-    print(f"
     print("=" * 50)

-    # Load model in background thread to avoid timeout
-    import threading
-    if ON_SPACES and not model_loaded:
-        print("Loading model in background thread...")
-        thread = threading.Thread(target=load_model_fast, daemon=True)
-        thread.start()
-
     # Run app
     port = int(os.environ.get('PORT', 7860))
     app.run(
@@ -6,6 +6,7 @@ import time
 import logging
 import os
 import json
+import threading

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -24,15 +25,20 @@ logger.info(f"Running on Hugging Face Spaces: {ON_SPACES}")

 # Use the smallest Qwen model available
 MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

 model = None
 tokenizer = None
 model_loaded = False
+model_loading = False

 def load_model_fast():
     """Fast model loading optimized for Spaces"""
+    global model, tokenizer, model_loaded, model_loading
+
+    if model_loading or model_loaded:
+        return
+
+    model_loading = True

     try:
         logger.info(f"Loading {MODEL_NAME}...")
@@ -71,13 +77,11 @@ def load_model_fast():
         model_loaded = True
         logger.info("Model loaded successfully!")

     except Exception as e:
         logger.error(f"Model loading failed: {str(e)[:200]}")
         model_loaded = False
+    finally:
+        model_loading = False

 # ============================================================================
 # OPTIMIZED GENERATION FUNCTIONS
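Taken together, the flags added above turn model loading into a guarded, non-blocking step: the first caller flips model_loading, later callers bail out immediately, and the Spaces startup path runs the whole thing in a daemon thread. A condensed, self-contained sketch of that pattern follows; the slow download is stubbed with a sleep, everything else mirrors the diff.

import threading
import time

model = None
model_loaded = False
model_loading = False

def load_model_fast():
    """Guarded loader mirroring the flags in this commit (real load step stubbed out)."""
    global model, model_loaded, model_loading
    if model_loading or model_loaded:
        return                      # a concurrent or earlier call already handled it
    model_loading = True
    try:
        time.sleep(2)               # stand-in for the real tokenizer/model download
        model = object()
        model_loaded = True
    finally:
        model_loading = False

# Fire-and-forget, as the Spaces startup path does; early requests just read the flags.
threading.Thread(target=load_model_fast, daemon=True).start()
print("loaded:", model_loaded, "loading:", model_loading)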
@@ -86,12 +90,16 @@ def load_model_fast():
 def generate_quick(user_message, max_tokens=256):
     """Ultra-fast generation with minimal overhead"""
     if not model_loaded:
+        return "Model is still loading, please wait a few seconds and try again..."

     try:
+        # Truncate long messages
+        if len(user_message) > 1000:
+            user_message = user_message[:1000]
+
         # Format the prompt for Qwen chat template
         messages = [
+            {"role": "system", "content": "You are Stanley AI, a helpful and knowledgeable assistant. Keep responses concise and helpful."},
             {"role": "user", "content": user_message}
         ]

@@ -102,7 +110,7 @@ def generate_quick(user_message, max_tokens=256):
             add_generation_prompt=True
         )

+        # Tokenize with truncation
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

         # Move to device
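The hunks above show only fragments of generate_quick, but they follow the standard transformers chat-template flow. The diff does not show the actual from_pretrained or generate calls, so those settings below are assumptions; the apply_chat_template and tokenizer arguments are taken from the diff itself. A minimal end-to-end sketch:

# Sketch of the chat-template path generate_quick() relies on; load/generate settings
# here are assumptions, not the app's exact code.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)

messages = [
    {"role": "system", "content": "You are Stanley AI, a helpful and knowledgeable assistant. Keep responses concise and helpful."},
    {"role": "user", "content": "Hello, who are you?"},
]

# Render the chat template to a prompt string, then tokenize with truncation, as the diff shows.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)

# Decode only the newly generated tokens, skipping the prompt.
reply = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(reply)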
@@ -132,74 +140,23 @@ def generate_quick(user_message, max_tokens=256):

     except Exception as e:
         logger.error(f"Generation error: {e}")
+        return f"I encountered an error. Please try again or rephrase your question."

 # ============================================================================
+# SIMPLE CACHE SYSTEM
 # ============================================================================

 response_cache = {}
+CACHE_SIZE = 50

 def get_cached_response(query):
     """Get response from cache"""
+    key = query.lower().strip()[:80]
     return response_cache.get(key)

 def cache_response(query, response):
     """Cache response"""
+    key = query.lower().strip()[:80]
     if len(response_cache) >= CACHE_SIZE:
         # Remove oldest
         response_cache.pop(next(iter(response_cache)))
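With these lines the cache becomes a small FIFO keyed on the first 80 characters of the lowercased, stripped query; because Python dicts preserve insertion order, popping the first key evicts the oldest entry. The line that actually stores the response falls outside the shown hunk, so the final statement of cache_response below is an assumed completion:

# FIFO cache behaviour implied by the diff; the final store line is assumed.
response_cache = {}
CACHE_SIZE = 50

def cache_response(query, response):
    key = query.lower().strip()[:80]
    if len(response_cache) >= CACHE_SIZE:
        # Dicts keep insertion order, so the first key is the oldest entry.
        response_cache.pop(next(iter(response_cache)))
    response_cache[key] = response   # assumed completion, not shown in the hunk

def get_cached_response(query):
    return response_cache.get(query.lower().strip()[:80])

cache_response("What is Flask?", "Flask is a Python web framework.")
print(get_cached_response("  WHAT IS FLASK?  "))  # cache hit: same key after lower().strip()[:80]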
@@ -212,22 +169,22 @@ def cache_response(query, response):
 @app.route('/')
 def home():
     return jsonify({
+        "name": "Stanley AI",
+        "version": "4.1",
         "model": MODEL_NAME,
         "status": "ready" if model_loaded else "loading",
+        "platform": "huggingface-spaces",
         "endpoints": {
+            "chat": "POST /api/chat",
+            "status": "GET /api/status",
+            "test": "GET /api/test"
         },
+        "note": "Fast text generation with Qwen 0.5B model"
     })

 @app.route('/api/chat', methods=['POST', 'GET'])
 def chat():
+    """Main chat endpoint"""
     start_time = time.time()

     try:
@@ -243,6 +200,19 @@ def chat():
         if not user_message:
             return jsonify({"error": "No message provided"}), 400

+        # If model is not loaded yet
+        if not model_loaded:
+            # Start loading if not already loading
+            if not model_loading:
+                thread = threading.Thread(target=load_model_fast, daemon=True)
+                thread.start()
+
+            return jsonify({
+                "response": "Model is loading... Please wait a few seconds and try again.",
+                "status": "loading",
+                "response_time": round(time.time() - start_time, 3)
+            })
+
         # Check cache first
         cached = get_cached_response(user_message)
         if cached:
@@ -251,7 +221,8 @@ def chat():
                 "response": cached,
                 "cached": True,
                 "response_time": round(time.time() - start_time, 3),
+                "model": MODEL_NAME,
+                "tokens": len(cached.split())
             })

         # Generate response
@@ -278,76 +249,61 @@ def chat():
             "status": "error"
         }), 500

 @app.route('/api/status')
 def status():
     """Health check"""
     return jsonify({
         "model_loaded": model_loaded,
+        "model_loading": model_loading,
         "model_name": MODEL_NAME,
         "device": str(model.device) if model_loaded else "none",
         "cache_size": len(response_cache),
         "timestamp": time.time(),
+        "on_spaces": ON_SPACES,
+        "memory": f"{torch.cuda.memory_allocated() / 1024**2:.1f} MB" if torch.cuda.is_available() and model_loaded else "CPU mode"
     })

 @app.route('/api/test')
 def test():
     """Quick test endpoint"""
+    if not model_loaded:
+        return jsonify({
+            "status": "model_not_loaded",
+            "message": "Model is still loading. Try /api/chat endpoint in a few seconds."
         })

+    test_query = "Hello, who are you?"
+    start = time.time()
+    response = generate_quick(test_query, max_tokens=100)
+    time_taken = round(time.time() - start, 3)
+
+    return jsonify({
+        "test": "success",
+        "query": test_query,
+        "response_preview": response[:200] + "..." if len(response) > 200 else response,
+        "response_time": time_taken,
+        "model": MODEL_NAME
+    })
+
+@app.route('/api/health')
+def health():
+    """Simple health check for Spaces"""
     return jsonify({
+        "status": "healthy",
+        "timestamp": time.time()
     })
 # ============================================================================
+# STARTUP
 # ============================================================================

+# Start model loading in background when app starts
+if ON_SPACES:
+    logger.info("Starting model load in background thread...")
+    thread = threading.Thread(target=load_model_fast, daemon=True)
+    thread.start()
+else:
+    # Load immediately for local testing
     load_model_fast()

 # ============================================================================
@@ -356,19 +312,13 @@ if not ON_SPACES:

 if __name__ == '__main__':
     print("=" * 50)
+    print("STANLEY AI - Hugging Face Spaces Edition")
     print(f"Model: {MODEL_NAME}")
     print(f"Platform: {'Hugging Face Spaces' if ON_SPACES else 'Local'}")
+    print(f"Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
+    print(f"Model Status: {'Loaded' if model_loaded else 'Loading...'}")
     print("=" * 50)

     # Run app
     port = int(os.environ.get('PORT', 7860))
     app.run(
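For completeness, a hypothetical client for the endpoints this commit exposes. The base URL is a placeholder, the "message" JSON key is an assumption (the hunk that parses the request body is not shown), and the retry loop simply reacts to the "loading" status the new /api/chat path returns while the model warms up in the background:

# Hypothetical client sketch; URL and request schema are assumptions.
import time
import requests

BASE_URL = "http://localhost:7860"  # or the public Space URL

def ask(message, retries=5, wait=5.0):
    for _ in range(retries):
        r = requests.post(f"{BASE_URL}/api/chat", json={"message": message}, timeout=120)
        data = r.json()
        if data.get("status") == "loading":
            time.sleep(wait)        # model still loading in the background thread
            continue
        return data.get("response")
    raise RuntimeError("model did not finish loading in time")

print(requests.get(f"{BASE_URL}/api/status").json())
print(ask("Hello, who are you?"))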