Spaces:

Gaston895
/

chat

Sleeping

App Files Files Community

Gaston895 commited on Jan 12

Commit

4f57881

verified ·

1 Parent(s): 756e842

🔧 Fix model loading using proven app_gunicorn.py approach - no pipeline, direct generation

Browse files

Files changed (1) hide show

app.py +73 -110

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ CPU-optimized version for Modal deployment
 from flask import Flask, request, jsonify, render_template_string
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import os
 import logging
 import json
@@ -38,7 +38,6 @@ app = Flask(__name__)
 # Global variables
 model = None
 tokenizer = None
-chat_pipeline = None
 executor = ThreadPoolExecutor(max_workers=2)
 def cleanup_memory():
@@ -848,140 +847,104 @@ Focus on quantitative metrics and actionable insights."""
 langgraph_processor = LangGraphProcessor()
 def load_model():
-    """Load the model and tokenizer from Gaston895/Aegisecon1 repository using optimized pipeline approach"""
     global model, tokenizer, chat_pipeline
     try:
         logger.info("Loading model and tokenizer from Hugging Face...")
-        # Model configuration
-        MODEL_NAME = "Gaston895/Aegisecon1"
-        DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-        logger.info(f"Loading tokenizer from {MODEL_NAME}...")
-        # Load tokenizer with optimizations
         tokenizer = AutoTokenizer.from_pretrained(
-            MODEL_NAME,
             trust_remote_code=True,
-            use_fast=True  # Use fast tokenizer for speed
         )
-        # Set pad token if not exists
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        logger.info(f"Loading model from {MODEL_NAME}...")
-        # Load model with memory-optimized settings
         model = AutoModelForCausalLM.from_pretrained(
-            MODEL_NAME,
-            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,  # Use float16 for memory efficiency
-            device_map="auto" if DEVICE == "cuda" else None,
             trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            use_cache=True,  # Enable KV cache for faster generation
-            attn_implementation="flash_attention_2" if DEVICE == "cuda" else None  # Use flash attention if available
         )
-        # Optimize model for inference
-        model.eval()
-        if hasattr(model, 'gradient_checkpointing_disable'):
-            model.gradient_checkpointing_disable()
-        # Create pipeline with optimized settings
-        chat_pipeline = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            device=0 if DEVICE == "cuda" else -1,
-            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-            model_kwargs={
-                "use_cache": True,
-                "do_sample": True,
-                "pad_token_id": tokenizer.eos_token_id
-            }
-        )
         logger.info("Model loaded successfully from HF repository!")
-        logger.info(f"Device: {DEVICE}")
         logger.info(f"Model dtype: {next(model.parameters()).dtype}")
-        # Clear any initialization memory
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
         return True
     except Exception as e:
-        logger.error(f"Failed to load model: {str(e)}")
-        # Clear memory on failure
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        return False
 def generate_response(prompt, temperature=0.7):
-    """Generate response using the loaded model pipeline with optimizations"""
     try:
-        if not chat_pipeline:
             return "Model is still loading, please wait a moment and try again..."
-        # Truncate input prompt if too long to save memory
-        max_input_length = 800
-        if len(prompt) > max_input_length:
-            prompt = prompt[:max_input_length] + "..."
-        # Format the prompt efficiently
-        formatted_prompt = f"User: {prompt}\nAssistant:"
-        # Optimized generation parameters for speed and memory
-        response = chat_pipeline(
-            formatted_prompt,
-            max_new_tokens=128,  # Reduced for faster generation
-            temperature=temperature,
-            do_sample=True,
-            top_p=0.9,  # Nucleus sampling for better quality
-            top_k=50,   # Limit vocabulary for speed
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-            truncation=True,
-            return_full_text=False,  # Only return new tokens
-            clean_up_tokenization_spaces=True
-        )
-        # Extract response efficiently
-        if response and len(response) > 0:
-            generated_text = response[0].get('generated_text', '')
-            # Clean up the response
-            if "Assistant:" in generated_text:
-                assistant_response = generated_text.split("Assistant:")[-1].strip()
-            else:
-                assistant_response = generated_text.replace(formatted_prompt, "").strip()
-            # Remove any remaining prompt artifacts
-            if assistant_response.startswith("User:"):
-                lines = assistant_response.split('\n')
-                assistant_response = '\n'.join([line for line in lines if not line.strip().startswith("User:")])
-            # Ensure response isn't empty
-            if not assistant_response.strip():
-                assistant_response = "I understand your question. Let me provide an economic analysis based on the available data."
-            return assistant_response.strip()
-        else:
-            return "I'm processing your request. Please try again in a moment."
-    except torch.cuda.OutOfMemoryError:
-        # Handle CUDA OOM gracefully
-        logger.error("CUDA out of memory during generation")
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
-        return "System is under high load. Please try a shorter question."
     except Exception as e:
         logger.error(f"Error generating response: {str(e)}")
-        # Clear any potential memory issues
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        return "I'm experiencing technical difficulties. Please try again shortly."
 # HTML template (same as before)
 HTML_TEMPLATE = """
@@ -1324,7 +1287,7 @@ def load_model_manual():
         return jsonify({
             'success': success,
-            'model_loaded': chat_pipeline is not None,
             'tokenizer_loaded': tokenizer is not None,
             'message': 'Model loaded successfully' if success else 'Model loading failed'
         })
@@ -1360,7 +1323,7 @@ def health():
     """Health check endpoint"""
     return jsonify({
         'status': 'healthy',
-        'model_loaded': chat_pipeline is not None,
         'tokenizer_loaded': tokenizer is not None,
         'langgraph_available': LANGGRAPH_AVAILABLE,
         'processing_mode': 'langgraph' if LANGGRAPH_AVAILABLE else 'simplified'
@@ -1395,11 +1358,11 @@ else:
     logger.info("Production mode: Loading model during module import...")
     logger.info(f"LangGraph available: {LANGGRAPH_AVAILABLE}")
-    # Load model immediately
-    logger.info("Loading model from Gaston895/Aegisecon1...")
     model_loaded = load_model()
     if model_loaded:
         logger.info("✅ Model loaded successfully for production!")
     else:
-        logger.error("❌ Model failed to load in production mode!")

 from flask import Flask, request, jsonify, render_template_string
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
 import logging
 import json
 # Global variables
 model = None
 tokenizer = None
 executor = ThreadPoolExecutor(max_workers=2)
 def cleanup_memory():
 langgraph_processor = LangGraphProcessor()
 def load_model():
+    """Load the model and tokenizer from Gaston895/Aegisecon1 repository using the working approach"""
     global model, tokenizer, chat_pipeline
     try:
         logger.info("Loading model and tokenizer from Hugging Face...")
+        # Load from the deployed model repository
+        model_repo = "Gaston895/Aegisecon1"
+        logger.info(f"Loading tokenizer from {model_repo}...")
         tokenizer = AutoTokenizer.from_pretrained(
+            model_repo,
             trust_remote_code=True,
+            use_auth_token=False
         )
+        logger.info(f"Loading model from {model_repo}...")
         model = AutoModelForCausalLM.from_pretrained(
+            model_repo,
+            torch_dtype=torch.float16,  # Use float16 for better compatibility
+            device_map="cpu",           # Force CPU for HF Spaces compatibility
             trust_remote_code=True,
+            use_auth_token=False,
+            low_cpu_mem_usage=True
         )
+        # Don't create pipeline - use direct model generation like the working version
+        chat_pipeline = None  # Set to None to indicate we're using direct generation
         logger.info("Model loaded successfully from HF repository!")
+        logger.info(f"Model device: {next(model.parameters()).device}")
         logger.info(f"Model dtype: {next(model.parameters()).dtype}")
         return True
     except Exception as e:
+        logger.error(f"Error loading model from HF: {str(e)}")
+        # Try alternative loading method
+        try:
+            logger.info("Trying alternative loading method...")
+            tokenizer = AutoTokenizer.from_pretrained(
+                "Qwen/Qwen2-1.5B",  # Fallback to base model
+                trust_remote_code=True
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                "Qwen/Qwen2-1.5B",
+                torch_dtype=torch.float16,
+                device_map="cpu",
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            )
+            chat_pipeline = None
+            logger.info("Fallback model loaded successfully!")
+            return True
+        except Exception as e2:
+            logger.error(f"Fallback loading also failed: {str(e2)}")
+            return False
 def generate_response(prompt, temperature=0.7):
+    """Generate response using direct model generation (like the working app_gunicorn.py)"""
     try:
+        if model is None or tokenizer is None:
             return "Model is still loading, please wait a moment and try again..."
+        # Economics-focused system prompt (like the working version)
+        system_prompt = """You are AEGIS Economics AI, an expert economic analyst and policy advisor.
+        Provide clear, accurate, and insightful responses about economics, finance, markets, and policy.
+        Focus on practical analysis and actionable insights."""
+        full_prompt = f"{system_prompt}\n\nUser: {prompt}\nAssistant:"
+        # Tokenize input (like the working version)
+        inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=1024)
+        # Generate response (like the working version)
+        with torch.no_grad():
+            outputs = model.generate(
+                inputs.input_ids,
+                max_new_tokens=256,  # Same as working version
+                temperature=temperature,
+                do_sample=True,
+                pad_token_id=tokenizer.eos_token_id,
+                repetition_penalty=1.1,
+                no_repeat_ngram_size=3
+            )
+        # Decode response (like the working version)
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Extract only the assistant's response (like the working version)
+        if "Assistant:" in response:
+            response = response.split("Assistant:")[-1].strip()
+        return response
     except Exception as e:
         logger.error(f"Error generating response: {str(e)}")
+        return "I apologize, but I'm having trouble processing your request right now. Please try again in a moment."
 # HTML template (same as before)
 HTML_TEMPLATE = """
         return jsonify({
             'success': success,
+            'model_loaded': model is not None,
             'tokenizer_loaded': tokenizer is not None,
             'message': 'Model loaded successfully' if success else 'Model loading failed'
         })
     """Health check endpoint"""
     return jsonify({
         'status': 'healthy',
+        'model_loaded': model is not None,
         'tokenizer_loaded': tokenizer is not None,
         'langgraph_available': LANGGRAPH_AVAILABLE,
         'processing_mode': 'langgraph' if LANGGRAPH_AVAILABLE else 'simplified'
     logger.info("Production mode: Loading model during module import...")
     logger.info(f"LangGraph available: {LANGGRAPH_AVAILABLE}")
+    # Try to load model, but don't fail if it doesn't work (like the working version)
+    logger.info("Attempting to load model...")
     model_loaded = load_model()
     if model_loaded:
         logger.info("✅ Model loaded successfully for production!")
     else:
+        logger.warning("⚠️ Model failed to load, but server will start anyway. Model can be loaded via /load_model_manual endpoint.")