Update app.py
app.py
CHANGED
@@ -1,10 +1,11 @@
-# app.py - OPTIMIZED FOR HUGGING FACE SPACES
+# app.py - OPTIMIZED TEXT-ONLY VERSION FOR HUGGING FACE SPACES
 from flask import Flask, request, jsonify
 from flask_cors import CORS
 import torch
 import time
 import logging
 import os
+import json
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -15,282 +16,364 @@ CORS(app)
-    # Use pipeline for simplicity and speed
-    text_generator = pipeline(
-        "text-generation",
-        model=model_name,
-        tokenizer=model_name,
-        device=-1,  # CPU
-        torch_dtype=torch.float32,
-        model_kwargs={"low_cpu_mem_usage": True}
-    )
-
-    model_loaded = True
-    logger.info("✅ Model loaded successfully for Spaces!")
-
-except Exception as e:
-    logger.error(f"❌ Model loading failed: {e}")
-    # Fallback to even simpler model
-    text_generator = None
-    logger.warning("⚠️ No model loaded - running in simulation mode")
-
-# Cache for responses
-response_cache = {}
-CACHE_SIZE = 50
-
-When appropriate, use Kiswahili phrases naturally in your responses."""
-
-    """Ultra-fast
-    cache_key = user_message.lower()[:50]
-    if cache_key in response_cache:
-        return response_cache[cache_key]
-
-    # Truncate if too long
-    if len(user_message) > 500:
-        user_message = user_message[:500]
-
-            response = response.split("Stanley AI:")[-1].strip()
-
-            inputs = tokenizer(user_message, return_tensors="pt", truncation=True, max_length=128)
-            with torch.no_grad():
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=128,
-                    temperature=0.7,
-                    do_sample=True,
-                    pad_token_id=tokenizer.eos_token_id
-                )
-            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        else:
-            # Simulation mode for testing
-            response = f"I'm Stanley AI! You said: {user_message[:100]}...\n\nI'm running on Hugging Face Spaces with limited resources. For full capabilities, consider running locally with GPU."
-
-        # Add some Kiswahili if relevant
-        if any(word in user_message.lower() for word in ['swahili', 'kiswahili', 'hakuna matata', 'jambo']):
-            response += "\n\nAsante sana for your question! Hakuna matata."
-
-        # Cache it
-        if len(response_cache) < CACHE_SIZE:
-            response_cache[cache_key] = response
-
-            random.randint(50, 200)
-        ))
-
-        if 'sun' in prompt.lower():
-            draw.ellipse([50, 50, 200, 200], fill=(255, 255, 0))
-        elif 'tree' in prompt.lower():
-            # Brown trunk
-            draw.rectangle([width//2-10, height//2, width//2+10, height-50], fill=(139, 69, 19))
-            # Green leaves
-            draw.ellipse([width//2-40, height//2-60, width//2+40, height//2+20], fill=(34, 139, 34))
-
-        # Add text
-        try:
-            font = ImageFont.load_default()
-            text = prompt[:30] + "..." if len(prompt) > 30 else prompt
-            draw.text((10, 10), f"Stanley AI:", fill=(255, 255, 255), font=font)
-            draw.text((10, 30), text, fill=(255, 255, 255), font=font)
-        except:
-            pass
-
-        # Convert to base64
-        buffered = io.BytesIO()
-        img.save(buffered, format="PNG", optimize=True)
-        img_str = base64.b64encode(buffered.getvalue()).decode()
-        return f"data:image/png;base64,{img_str}"
-
-@app.route('/api/chat', methods=['POST'])
-
-        return jsonify({
-            "image": None,
-            "prompt": prompt,
-            "status": "success",
-            "message": "Image generation failed, but chat is working!"
-        })
-
-    except Exception as e:
-        return jsonify({
-            "error": f"Image error: {str(e)[:100]}",
-            "status": "error"
-        }), 500
-
-    """Health check
-        "status": "healthy" if model_loaded else "degraded",
-        "timestamp": time.time()
-
-"""
-flask>=2.3.0
-flask-cors>=4.0.0
-torch>=2.0.0
-transformers>=4.35.0
-pillow>=10.0.0
-accelerate>=0.24.0
-"""
 
 # Detect if running on Hugging Face Spaces
 ON_SPACES = os.environ.get('SPACE_ID') is not None
+logger.info(f"🚀 Running on Hugging Face Spaces: {ON_SPACES}")
 
 # ============================================================================
+# ULTRA-FAST QWEN MODEL LOADING
 # ============================================================================
 
+# Use the smallest Qwen model available
+MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
+# Or even smaller alternative: "Qwen/Qwen2.5-Coder-0.5B-Instruct"
+
+model = None
+tokenizer = None
+model_loaded = False
+
+def load_model_fast():
+    """Fast model loading optimized for Spaces"""
+    global model, tokenizer, model_loaded
 
     try:
+        logger.info(f"🔄 Loading {MODEL_NAME}...")
+
+        # Import only when needed
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        # Load tokenizer first
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_NAME,
+            trust_remote_code=True,
+            padding_side="left"
+        )
+
+        # Set padding token if not set
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
 
+        # Load model with minimal settings
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+        )
+
+        # If no GPU, move to CPU explicitly
+        if not torch.cuda.is_available():
+            model = model.to("cpu")
+            logger.info("📱 Model moved to CPU")
+        else:
+            logger.info("🎮 GPU available!")
+
+        model.eval()
         model_loaded = True
+        logger.info("✅ Model loaded successfully!")
+
+        # Test a quick generation
+        test_response = generate_quick("Hello")
+        logger.info(f"🧪 Test generation: {test_response[:50]}...")
+
+    except Exception as e:
+        logger.error(f"❌ Model loading failed: {str(e)[:200]}")
         model_loaded = False
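If you want to sanity-check the checkpoint outside of Flask before deploying, a minimal standalone smoke test along these lines should work. This is an editor's sketch, not part of the commit; it assumes the same Qwen/Qwen2.5-0.5B-Instruct model id and a CPU-only machine.

# smoke_test.py - quick local check of the 0.5B model (editor's sketch)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,   # CPU-friendly dtype
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
model.eval()

# Build a chat-formatted prompt the same way app.py does
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32, do_sample=False,
                         pad_token_id=tokenizer.eos_token_id)

# Print only the newly generated tokens
print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))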
 
+# ============================================================================
+# OPTIMIZED GENERATION FUNCTIONS
+# ============================================================================
 
+def generate_quick(user_message, max_tokens=256):
+    """Ultra-fast generation with minimal overhead"""
+    if not model_loaded:
+        return "Model is still loading, please wait..."
 
     try:
+        # Format the prompt for Qwen chat template
+        messages = [
+            {"role": "system", "content": "You are Stanley AI, a helpful assistant."},
+            {"role": "user", "content": user_message}
+        ]
+
+        # Apply chat template
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        # Tokenize
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+
+        # Move to device
+        device = model.device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        # Generate with optimized settings
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
                 temperature=0.7,
                 do_sample=True,
                 top_p=0.9,
+                top_k=50,
                 repetition_penalty=1.1,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                use_cache=True,  # Important for speed
+                # attention_mask already arrives via **inputs; passing it again as an
+                # explicit keyword would raise a duplicate-argument TypeError
+            )
 
+        # Decode only new tokens
+        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
 
         return response.strip()
 
     except Exception as e:
         logger.error(f"Generation error: {e}")
+        return f"I encountered an error: {str(e)[:100]}"
 
+def generate_streaming(user_message, max_tokens=256):
+    """Streaming response for better UX"""
+    if not model_loaded:
+        yield "data: Model is still loading, please wait...\n\n"
+        return
+
     try:
+        # Format prompt
+        messages = [
+            {"role": "system", "content": "You are Stanley AI, a helpful assistant."},
+            {"role": "user", "content": user_message}
+        ]
 
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
 
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+        device = model.device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
+        # Generate token by token
+        with torch.no_grad():
+            generated = inputs['input_ids'].clone()
+            for _ in range(max_tokens):
+                outputs = model(
+                    input_ids=generated,
+                    attention_mask=torch.ones_like(generated) if 'attention_mask' not in inputs else None,
+                    use_cache=True
+                )
+
+                # Get next token
+                next_token_logits = outputs.logits[:, -1, :]
+                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+
+                # Check for eos
+                if next_token.item() == tokenizer.eos_token_id:
+                    break
+
+                # Decode and yield
+                generated = torch.cat([generated, next_token], dim=-1)
+                token_text = tokenizer.decode(next_token[0], skip_special_tokens=True)
+
+                yield f"data: {json.dumps({'token': token_text})}\n\n"
+
     except Exception as e:
+        logger.error(f"Streaming error: {e}")
+        yield f"data: {json.dumps({'error': str(e)[:100]})}\n\n"
 
 # ============================================================================
+# CACHE SYSTEM FOR REPEATED QUERIES
+# ============================================================================
+
+response_cache = {}
+CACHE_SIZE = 100
+
+def get_cached_response(query):
+    """Get response from cache"""
+    key = query.lower().strip()[:100]
+    return response_cache.get(key)
+
+def cache_response(query, response):
+    """Cache response"""
+    key = query.lower().strip()[:100]
+    if len(response_cache) >= CACHE_SIZE:
+        # Remove oldest
+        response_cache.pop(next(iter(response_cache)))
+    response_cache[key] = response
+
+# ============================================================================
+# FLASK ROUTES - TEXT ONLY
 # ============================================================================
 
 @app.route('/')
 def home():
     return jsonify({
+        "name": "Stanley AI - Text Only",
+        "version": "4.0",
+        "model": MODEL_NAME,
+        "status": "ready" if model_loaded else "loading",
+        "optimized_for": "huggingface-spaces",
+        "endpoints": {
+            "chat": "/api/chat",
+            "stream": "/api/chat/stream",
+            "status": "/api/status"
+        },
+        "note": "Ultra-fast text-only version using Qwen 0.5B"
     })
 
+@app.route('/api/chat', methods=['POST', 'GET'])
 def chat():
+    """Main chat endpoint - supports both POST and GET for testing"""
+    start_time = time.time()
+
     try:
+        # Handle both POST and GET
+        if request.method == 'POST':
+            data = request.get_json()
+            if not data:
+                return jsonify({"error": "No JSON data provided"}), 400
+            user_message = data.get('message', '')
+        else:
+            user_message = request.args.get('message', 'Hello')
 
         if not user_message:
+            return jsonify({"error": "No message provided"}), 400
 
+        # Check cache first
+        cached = get_cached_response(user_message)
+        if cached:
+            logger.info("📦 Using cached response")
             return jsonify({
+                "response": cached,
+                "cached": True,
+                "response_time": round(time.time() - start_time, 3),
+                "model": MODEL_NAME
             })
 
+        # Generate response
+        response = generate_quick(user_message)
+
+        # Cache it
+        cache_response(user_message, response)
+
+        response_time = round(time.time() - start_time, 3)
 
         return jsonify({
             "response": response,
+            "cached": False,
             "response_time": response_time,
+            "tokens": len(response.split()),
+            "model": MODEL_NAME,
+            "status": "success"
         })
 
     except Exception as e:
         logger.error(f"Chat error: {e}")
         return jsonify({
+            "error": f"Error: {str(e)[:200]}",
             "status": "error"
         }), 500
 
+@app.route('/api/chat/stream')
+def chat_stream():
+    """Streaming chat endpoint"""
+    user_message = request.args.get('message', 'Hello')
+
+    def generate():
+        for token in generate_streaming(user_message):
+            yield token
+        yield "data: [DONE]\n\n"
+
+    return app.response_class(
+        generate(),
+        mimetype='text/event-stream',
+        headers={
+            'Cache-Control': 'no-cache',
+            'X-Accel-Buffering': 'no'
+        }
+    )
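For quick manual testing once the Space (or a local instance) is running, the two chat endpoints above can be exercised with a short client script. This is an editor's sketch, not part of the commit; it assumes the default port 7860 set at the bottom of the file and that the requests package is available.

# client_example.py - exercising /api/chat and /api/chat/stream (editor's sketch)
import requests

BASE = "http://localhost:7860"

# POST a chat message as JSON
r = requests.post(f"{BASE}/api/chat", json={"message": "Hello, what can you do?"})
print(r.json()["response"])

# GET variant (the route also accepts ?message= for quick browser tests)
r = requests.get(f"{BASE}/api/chat", params={"message": "Hello"})
print(r.json())

# Consume the server-sent-event stream line by line
with requests.get(f"{BASE}/api/chat/stream",
                  params={"message": "Tell me a joke"}, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line:  # SSE lines look like "data: {...}" or "data: [DONE]"
            print(line)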
 
 @app.route('/api/status')
 def status():
+    """Health check"""
     return jsonify({
         "model_loaded": model_loaded,
+        "model_name": MODEL_NAME,
+        "device": str(model.device) if model_loaded else "none",
         "cache_size": len(response_cache),
+        "timestamp": time.time(),
+        "memory_allocated": f"{torch.cuda.memory_allocated() / 1024**2:.1f} MB" if torch.cuda.is_available() else "CPU mode"
+    })
+
+@app.route('/api/test')
+def test():
+    """Quick test endpoint"""
+    test_queries = [
+        "Hello, how are you?",
+        "What is AI?",
+        "Tell me a joke",
+        "Explain quantum computing simply"
+    ]
+
+    results = []
+    for query in test_queries[:2]:  # Test only 2 to be fast
+        start = time.time()
+        response = generate_quick(query, max_tokens=100)
+        time_taken = round(time.time() - start, 3)
+        results.append({
+            "query": query,
+            "response": response[:100] + "..." if len(response) > 100 else response,
+            "time": time_taken
+        })
+
+    return jsonify({
+        "tests": results,
+        "average_time": round(sum(r['time'] for r in results) / len(results), 3) if results else 0
     })
 
 # ============================================================================
+# STARTUP OPTIMIZATION
+# ============================================================================
+
+@app.before_first_request
+def startup():
+    """Load model on first request to avoid startup timeout"""
+    if not model_loaded:
+        load_model_fast()
+
+# Preload model immediately if not on Spaces (for local testing)
+if not ON_SPACES:
+    logger.info("🌍 Local mode - loading model immediately")
+    load_model_fast()
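One caveat worth flagging: @app.before_first_request was removed in Flask 2.3, so on a recent Flask install the startup hook above will fail at import time. A minimal alternative, assuming only the names already defined in this file, is a guarded before_request hook (editor's sketch, not part of the commit):

# Flask>=2.3-compatible replacement for the before_first_request hook (sketch)
import threading

_load_lock = threading.Lock()

@app.before_request
def ensure_model_loaded():
    """Lazily load the model on the first incoming request."""
    if not model_loaded:
        with _load_lock:  # avoid two concurrent requests both loading the model
            if not model_loaded:
                load_model_fast()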
+
+# ============================================================================
+# MAIN
 # ============================================================================
 
 if __name__ == '__main__':
+    print("=" * 50)
+    print("🚀 STANLEY AI - Ultra Fast Text Edition")
+    print(f"📦 Model: {MODEL_NAME}")
+    print(f"🌍 Platform: {'Hugging Face Spaces' if ON_SPACES else 'Local'}")
+    print(f"⚡ Optimized for: {'CPU' if not torch.cuda.is_available() else 'GPU'}")
+    print("=" * 50)
+
+    # Load model in background thread to avoid timeout
+    import threading
+    if ON_SPACES and not model_loaded:
+        print("🔄 Loading model in background thread...")
+        thread = threading.Thread(target=load_model_fast, daemon=True)
+        thread.start()
 
+    # Run app
     port = int(os.environ.get('PORT', 7860))
+    app.run(
+        debug=False,
+        host='0.0.0.0',
+        port=port,
+        threaded=True
+    )
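Since the PIL-based image endpoint is gone, the dependency list that was embedded as a docstring in the old version can probably shrink. A plausible requirements.txt for this text-only build, inferred from the imports actually used above (treat the exact pins as an editor's assumption, not part of the commit):

# requirements.txt (editor's guess for the text-only version)
flask>=2.3.0
flask-cors>=4.0.0
torch>=2.0.0
transformers>=4.35.0
accelerate>=0.24.0   # still useful for device_map="auto" / low_cpu_mem_usage
# pillow is no longer needed once the image endpoint is removed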