Gaston895 committed on
Commit
756e842
·
verified ·
1 Parent(s): 77bf462

🚀 Memory and speed optimizations: faster generation, better memory management

Browse files
Files changed (1) hide show
  1. app.py +114 -25
app.py CHANGED
@@ -41,6 +41,18 @@ tokenizer = None
41
  chat_pipeline = None
42
  executor = ThreadPoolExecutor(max_workers=2)
43
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  @dataclass
45
  class TechScores:
46
  """Technology threat scores structure"""
@@ -836,7 +848,7 @@ Focus on quantitative metrics and actionable insights."""
836
  langgraph_processor = LangGraphProcessor()
837
 
838
  def load_model():
839
- """Load the model and tokenizer from Gaston895/Aegisecon1 repository using pipeline approach"""
840
  global model, tokenizer, chat_pipeline
841
 
842
  try:
@@ -847,68 +859,129 @@ def load_model():
847
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
848
 
849
  logger.info(f"Loading tokenizer from {MODEL_NAME}...")
850
- # Load tokenizer
851
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
 
 
 
 
 
 
 
 
852
 
853
  logger.info(f"Loading model from {MODEL_NAME}...")
854
- # Load model with appropriate settings
855
  model = AutoModelForCausalLM.from_pretrained(
856
  MODEL_NAME,
857
- torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
858
  device_map="auto" if DEVICE == "cuda" else None,
859
  trust_remote_code=True,
860
- low_cpu_mem_usage=True
 
 
861
  )
862
 
863
- # Create pipeline
 
 
 
 
 
864
  chat_pipeline = pipeline(
865
  "text-generation",
866
  model=model,
867
  tokenizer=tokenizer,
868
  device=0 if DEVICE == "cuda" else -1,
869
- torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32
 
 
 
 
 
870
  )
871
 
872
  logger.info("Model loaded successfully from HF repository!")
 
 
 
 
 
 
 
873
  return True
874
 
875
  except Exception as e:
876
  logger.error(f"Failed to load model: {str(e)}")
 
 
 
877
  return False
878
 
879
  def generate_response(prompt, temperature=0.7):
880
- """Generate response using the loaded model pipeline"""
881
  try:
882
  if not chat_pipeline:
883
  return "Model is still loading, please wait a moment and try again..."
884
 
885
- # Format the prompt
 
 
 
 
 
886
  formatted_prompt = f"User: {prompt}\nAssistant:"
887
 
888
- # Generate response
889
  response = chat_pipeline(
890
  formatted_prompt,
891
- max_new_tokens=256, # Use only max_new_tokens to avoid conflict
892
  temperature=temperature,
893
  do_sample=True,
 
 
894
  pad_token_id=tokenizer.eos_token_id,
895
- truncation=True
 
 
 
896
  )
897
 
898
- # Extract the generated text
899
- generated_text = response[0]['generated_text']
900
-
901
- # Extract only the assistant's response
902
- if "Assistant:" in generated_text:
903
- assistant_response = generated_text.split("Assistant:")[-1].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
904
  else:
905
- assistant_response = generated_text.replace(formatted_prompt, "").strip()
906
 
907
- return assistant_response
 
 
 
 
908
 
909
  except Exception as e:
910
  logger.error(f"Error generating response: {str(e)}")
911
- return f"Error: {str(e)}"
 
 
 
912
 
913
  # HTML template (same as before)
914
  HTML_TEMPLATE = """
@@ -1157,7 +1230,7 @@ def home():
1157
 
1158
  @app.route('/process_tech_scores', methods=['POST'])
1159
  def process_tech_scores():
1160
- """Process technology scores through LangGraph pipeline"""
1161
  try:
1162
  data = request.get_json()
1163
 
@@ -1174,8 +1247,16 @@ def process_tech_scores():
1174
 
1175
  logger.info(f"Processing tech scores: {tech_scores.to_dict()}")
1176
 
1177
- # Process through LangGraph
1178
- langgraph_result = langgraph_processor.process_tech_scores(tech_scores)
 
 
 
 
 
 
 
 
1179
 
1180
  if not langgraph_result['success']:
1181
  return jsonify({'success': False, 'error': 'LangGraph processing failed'})
@@ -1183,10 +1264,17 @@ def process_tech_scores():
1183
  # Get the optimized prompt from LangGraph
1184
  final_prompt = langgraph_result['final_prompt']
1185
 
 
 
 
 
1186
  # Generate final analysis using AEGIS Economics AI
1187
  logger.info("Generating final analysis with AEGIS Economics AI...")
1188
  final_analysis = generate_response(final_prompt)
1189
 
 
 
 
1190
  return jsonify({
1191
  'success': True,
1192
  'processing_steps': langgraph_result.get('processing_steps', []),
@@ -1197,6 +1285,7 @@ def process_tech_scores():
1197
 
1198
  except Exception as e:
1199
  logger.error(f"Error in tech score processing: {str(e)}")
 
1200
  return jsonify({'success': False, 'error': str(e)}), 500
1201
 
1202
  @app.route('/chat', methods=['POST'])
 
41
  chat_pipeline = None
42
  executor = ThreadPoolExecutor(max_workers=2)
43
 
44
def cleanup_memory():
    """Release cached GPU memory and trigger Python garbage collection.

    Best-effort: any failure is logged as a warning rather than raised,
    because memory cleanup must never break the request path that calls it.

    Returns:
        None.
    """
    try:
        # Collect Python garbage FIRST so tensors kept alive only by
        # collectable reference cycles are actually freed before we ask
        # the CUDA caching allocator to release its cached blocks.
        # (The original collected after empty_cache, which left those
        # blocks still allocated when the cache was flushed.)
        import gc
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # Wait for outstanding kernels so freed memory is reusable.
            torch.cuda.synchronize()
    except Exception as e:
        logger.warning(f"Memory cleanup warning: {e}")
56
  @dataclass
57
  class TechScores:
58
  """Technology threat scores structure"""
 
848
  langgraph_processor = LangGraphProcessor()
849
 
850
  def load_model():
851
+ """Load the model and tokenizer from Gaston895/Aegisecon1 repository using optimized pipeline approach"""
852
  global model, tokenizer, chat_pipeline
853
 
854
  try:
 
859
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
860
 
861
  logger.info(f"Loading tokenizer from {MODEL_NAME}...")
862
+ # Load tokenizer with optimizations
863
+ tokenizer = AutoTokenizer.from_pretrained(
864
+ MODEL_NAME,
865
+ trust_remote_code=True,
866
+ use_fast=True # Use fast tokenizer for speed
867
+ )
868
+
869
+ # Set pad token if not exists
870
+ if tokenizer.pad_token is None:
871
+ tokenizer.pad_token = tokenizer.eos_token
872
 
873
  logger.info(f"Loading model from {MODEL_NAME}...")
874
+ # Load model with memory-optimized settings
875
  model = AutoModelForCausalLM.from_pretrained(
876
  MODEL_NAME,
877
+ torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32, # Use float16 for memory efficiency
878
  device_map="auto" if DEVICE == "cuda" else None,
879
  trust_remote_code=True,
880
+ low_cpu_mem_usage=True,
881
+ use_cache=True, # Enable KV cache for faster generation
882
+ attn_implementation="flash_attention_2" if DEVICE == "cuda" else None # Use flash attention if available
883
  )
884
 
885
+ # Optimize model for inference
886
+ model.eval()
887
+ if hasattr(model, 'gradient_checkpointing_disable'):
888
+ model.gradient_checkpointing_disable()
889
+
890
+ # Create pipeline with optimized settings
891
  chat_pipeline = pipeline(
892
  "text-generation",
893
  model=model,
894
  tokenizer=tokenizer,
895
  device=0 if DEVICE == "cuda" else -1,
896
+ torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
897
+ model_kwargs={
898
+ "use_cache": True,
899
+ "do_sample": True,
900
+ "pad_token_id": tokenizer.eos_token_id
901
+ }
902
  )
903
 
904
  logger.info("Model loaded successfully from HF repository!")
905
+ logger.info(f"Device: {DEVICE}")
906
+ logger.info(f"Model dtype: {next(model.parameters()).dtype}")
907
+
908
+ # Clear any initialization memory
909
+ if torch.cuda.is_available():
910
+ torch.cuda.empty_cache()
911
+
912
  return True
913
 
914
  except Exception as e:
915
  logger.error(f"Failed to load model: {str(e)}")
916
+ # Clear memory on failure
917
+ if torch.cuda.is_available():
918
+ torch.cuda.empty_cache()
919
  return False
920
 
921
def generate_response(prompt, temperature=0.7):
    """Generate a chat completion for *prompt* with the loaded pipeline.

    Args:
        prompt: User text. Truncated to 800 characters to bound memory
            use during generation.
        temperature: Sampling temperature forwarded to the pipeline.

    Returns:
        The assistant's reply as a string, or a human-readable fallback
        message when the model is not ready or generation fails. Never
        raises: all errors are converted to fallback strings.
    """
    try:
        if not chat_pipeline:
            return "Model is still loading, please wait a moment and try again..."

        # Bound the input size so very long prompts cannot blow up memory.
        max_input_length = 800
        if len(prompt) > max_input_length:
            prompt = prompt[:max_input_length] + "..."

        formatted_prompt = f"User: {prompt}\nAssistant:"

        # Generation tuned for latency/memory: short outputs, nucleus +
        # top-k sampling, and return_full_text=False so only the newly
        # generated tokens come back (the prompt is not echoed).
        response = chat_pipeline(
            formatted_prompt,
            max_new_tokens=128,  # reduced for faster generation
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            truncation=True,
            return_full_text=False,
            clean_up_tokenization_spaces=True,
        )

        if not response:
            return "I'm processing your request. Please try again in a moment."

        generated_text = response[0].get('generated_text', '')

        # Defensive cleanup: with return_full_text=False the prompt should
        # not appear in the output, but strip a leading "Assistant:" or an
        # echoed prompt if the model emits one anyway.
        if "Assistant:" in generated_text:
            assistant_response = generated_text.split("Assistant:")[-1].strip()
        else:
            assistant_response = generated_text.replace(formatted_prompt, "").strip()

        # Drop any hallucinated follow-up "User:" turns.
        if assistant_response.startswith("User:"):
            lines = assistant_response.split('\n')
            assistant_response = '\n'.join(
                line for line in lines if not line.strip().startswith("User:")
            )

        # Never hand an empty reply back to the UI.
        if not assistant_response.strip():
            assistant_response = "I understand your question. Let me provide an economic analysis based on the available data."

        return assistant_response.strip()

    except torch.cuda.OutOfMemoryError:
        logger.error("CUDA out of memory during generation")
        # Fix: the original used a conditional *expression* as a statement
        # (`empty_cache() if cuda else None`); use a plain guard, matching
        # the generic handler below.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return "System is under high load. Please try a shorter question."

    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        # Clear any potential memory left behind by the failed call.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return "I'm experiencing technical difficulties. Please try again shortly."
985
 
986
  # HTML template (same as before)
987
  HTML_TEMPLATE = """
 
1230
 
1231
  @app.route('/process_tech_scores', methods=['POST'])
1232
  def process_tech_scores():
1233
+ """Process technology scores through LangGraph pipeline with memory optimization"""
1234
  try:
1235
  data = request.get_json()
1236
 
 
1247
 
1248
  logger.info(f"Processing tech scores: {tech_scores.to_dict()}")
1249
 
1250
+ # Clean memory before processing
1251
+ cleanup_memory()
1252
+
1253
+ # Process through LangGraph with timeout
1254
+ try:
1255
+ langgraph_result = langgraph_processor.process_tech_scores(tech_scores)
1256
+ except Exception as e:
1257
+ logger.error(f"LangGraph processing failed: {e}")
1258
+ # Fallback to simplified processing
1259
+ langgraph_result = langgraph_processor._simplified_processing(tech_scores)
1260
 
1261
  if not langgraph_result['success']:
1262
  return jsonify({'success': False, 'error': 'LangGraph processing failed'})
 
1264
  # Get the optimized prompt from LangGraph
1265
  final_prompt = langgraph_result['final_prompt']
1266
 
1267
+ # Truncate prompt if too long to save memory
1268
+ if len(final_prompt) > 1000:
1269
+ final_prompt = final_prompt[:1000] + "... [truncated for efficiency]"
1270
+
1271
  # Generate final analysis using AEGIS Economics AI
1272
  logger.info("Generating final analysis with AEGIS Economics AI...")
1273
  final_analysis = generate_response(final_prompt)
1274
 
1275
+ # Clean memory after processing
1276
+ cleanup_memory()
1277
+
1278
  return jsonify({
1279
  'success': True,
1280
  'processing_steps': langgraph_result.get('processing_steps', []),
 
1285
 
1286
  except Exception as e:
1287
  logger.error(f"Error in tech score processing: {str(e)}")
1288
+ cleanup_memory() # Clean memory on error
1289
  return jsonify({'success': False, 'error': str(e)}), 500
1290
 
1291
  @app.route('/chat', methods=['POST'])