Gaston895 committed on
Commit
a9b927e
·
verified ·
1 Parent(s): bd9c577

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -46
app.py CHANGED
@@ -2,36 +2,50 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import re
 
 
5
 
6
  # Global variables for model and tokenizer
7
  model = None
8
  tokenizer = None
 
9
 
10
  def load_model():
11
- """Load the model and tokenizer"""
12
- global model, tokenizer
13
 
14
  try:
15
  print("Loading AEGIS Conduct Economic Analysis Model...")
16
 
17
- # Load tokenizer and model directly from root (no subfolder needed)
18
  tokenizer = AutoTokenizer.from_pretrained(
19
  "Gaston895/aegisconduct",
20
  trust_remote_code=True
21
  )
22
 
 
23
  model = AutoModelForCausalLM.from_pretrained(
24
  "Gaston895/aegisconduct",
25
- torch_dtype=torch.bfloat16,
26
  device_map="auto",
27
- trust_remote_code=True
 
 
 
28
  )
29
 
 
 
 
 
 
30
  print("Model loaded successfully!")
 
31
  return True
32
 
33
  except Exception as e:
34
  print(f"Error loading model: {e}")
 
35
  return False
36
 
37
  def format_response(text):
@@ -45,41 +59,45 @@ def format_response(text):
45
 
46
  return text
47
 
48
- def generate_response(message, history, temperature=0.7, max_tokens=512):
49
- """Generate response from the model"""
50
- global model, tokenizer
51
 
52
- if model is None or tokenizer is None:
53
- return "Model not loaded. Please wait for initialization to complete."
54
 
55
  try:
56
- # Build conversation context
57
  conversation = ""
58
- for user_msg, assistant_msg in history:
 
 
 
59
  conversation += f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"
60
 
61
  # Add current message
62
  conversation += f"User: {message}\nAssistant:"
63
 
64
- # Tokenize input
65
- inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=2048)
66
 
67
  # Move to device
68
  if torch.cuda.is_available():
69
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
70
 
71
- # Generate response
72
  with torch.no_grad():
73
  outputs = model.generate(
74
  **inputs,
75
  max_new_tokens=max_tokens,
76
  temperature=temperature,
77
  do_sample=True,
78
- top_p=0.95,
79
- top_k=40,
80
- repetition_penalty=1.05,
81
  pad_token_id=tokenizer.eos_token_id,
82
- eos_token_id=tokenizer.eos_token_id
 
83
  )
84
 
85
  # Decode response
@@ -91,6 +109,10 @@ def generate_response(message, history, temperature=0.7, max_tokens=512):
91
  # Format and clean response
92
  response = format_response(response)
93
 
 
 
 
 
94
  return response
95
 
96
  except Exception as e:
@@ -109,14 +131,8 @@ def chat_interface(message, history, temperature, max_tokens):
109
 
110
  return history, ""
111
 
112
- # Load model on startup
113
- print("Initializing AEGIS Conduct Chat Interface...")
114
- model_loaded = load_model()
115
-
116
  # Create Gradio interface
117
- with gr.Blocks(
118
- title="AEGIS Conduct - Economic Analysis Chat"
119
- ) as demo:
120
 
121
  gr.Markdown("""
122
  # 🤖 AEGIS Conduct - Economic Analysis Chat
@@ -127,49 +143,43 @@ with gr.Blocks(
127
  - **128k Context**: Extended memory for detailed conversations
128
 
129
  Ask questions about economics, finance, market analysis, policy impacts, and more!
130
- """)
131
 
132
- if not model_loaded:
133
- gr.Markdown("⚠️ **Model Loading Error**: Please refresh the page or contact support.")
134
 
135
  with gr.Row():
136
  with gr.Column(scale=4):
137
  chatbot = gr.Chatbot(
138
- height=500,
139
- show_label=False,
140
- container=True
141
  )
142
 
143
  msg = gr.Textbox(
144
  placeholder="Ask me about economics, finance, markets, or any analytical question...",
145
- show_label=False,
146
- container=False,
147
- scale=7
148
  )
149
 
150
  with gr.Row():
151
- submit_btn = gr.Button("Send", variant="primary", scale=1)
152
- clear_btn = gr.Button("Clear Chat", scale=1)
153
 
154
  with gr.Column(scale=1):
155
  gr.Markdown("### Settings")
156
 
157
  temperature = gr.Slider(
158
  minimum=0.1,
159
- maximum=2.0,
160
  value=0.7,
161
  step=0.1,
162
- label="Temperature",
163
- info="Controls randomness (0.1=focused, 2.0=creative)"
164
  )
165
 
166
  max_tokens = gr.Slider(
167
  minimum=50,
168
- maximum=1024,
169
- value=512,
170
  step=50,
171
- label="Max Response Length",
172
- info="Maximum tokens in response"
173
  )
174
 
175
  gr.Markdown("""
@@ -179,6 +189,11 @@ with gr.Blocks(
179
  - What are the risks of high national debt?
180
  - How do interest rates affect the stock market?
181
  - Think deeply: What causes economic recessions?
 
 
 
 
 
182
  """)
183
 
184
  # Event handlers
@@ -186,6 +201,10 @@ with gr.Blocks(
186
  return chat_interface(message, history, temp, max_tok)
187
 
188
  def clear_chat():
 
 
 
 
189
  return [], ""
190
 
191
  # Bind events
@@ -206,11 +225,14 @@ with gr.Blocks(
206
  outputs=[chatbot, msg]
207
  )
208
 
 
 
 
 
209
  # Launch configuration
210
  if __name__ == "__main__":
211
  demo.launch(
212
  server_name="0.0.0.0",
213
  server_port=7860,
214
- share=False,
215
- theme=gr.themes.Soft()
216
  )
 
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import re
5
+ import gc
6
+ import os
7
 
8
  # Global variables for model and tokenizer
9
  model = None
10
  tokenizer = None
11
+ model_loaded = False
12
 
13
def load_model():
    """Load the tokenizer and model, preferring an 8-bit quantized load.

    Falls back to a plain fp16 load when 8-bit loading is unavailable
    (e.g. bitsandbytes missing, or no CUDA device — `load_in_8bit` and the
    GPU key in `max_memory` both require one).  Sets the module-level
    `model`, `tokenizer`, and `model_loaded` globals.

    Returns:
        bool: True when the model and tokenizer loaded successfully,
        False otherwise (the error is printed, not raised).
    """
    global model, tokenizer, model_loaded

    try:
        print("Loading AEGIS Conduct Economic Analysis Model...")

        tokenizer = AutoTokenizer.from_pretrained(
            "Gaston895/aegisconduct",
            trust_remote_code=True,
        )

        # Keyword arguments shared by both load attempts.
        common_kwargs = {
            "device_map": "auto",
            "trust_remote_code": True,
            "low_cpu_mem_usage": True,
        }

        try:
            # First attempt: 8-bit quantized load to roughly halve memory.
            # NOTE(review): requires bitsandbytes and a CUDA device; the
            # `0` key in max_memory refers to GPU 0.
            model = AutoModelForCausalLM.from_pretrained(
                "Gaston895/aegisconduct",
                torch_dtype=torch.float16,
                load_in_8bit=True,
                max_memory={0: "6GB", "cpu": "8GB"},
                **common_kwargs,
            )
        except Exception as quant_err:
            # Fallback: unquantized half-precision load, so CPU-only hosts
            # (or hosts without bitsandbytes) still come up instead of
            # failing the whole app.
            print(f"8-bit load failed ({quant_err}); retrying without quantization...")
            model = AutoModelForCausalLM.from_pretrained(
                "Gaston895/aegisconduct",
                torch_dtype=torch.float16,
                **common_kwargs,
            )

        # Release transient allocations made during loading.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print("Model loaded successfully!")
        model_loaded = True
        return True

    except Exception as e:
        # Best-effort startup: report the failure and let the UI surface
        # a "model is loading" message instead of crashing.
        print(f"Error loading model: {e}")
        model_loaded = False
        return False
50
 
51
  def format_response(text):
 
59
 
60
  return text
61
 
62
+ def generate_response(message, history, temperature=0.7, max_tokens=256):
63
+ """Generate response from the model with memory optimization"""
64
+ global model, tokenizer, model_loaded
65
 
66
+ if not model_loaded or model is None or tokenizer is None:
67
+ return "Model is loading... Please wait a moment and try again."
68
 
69
  try:
70
+ # Build conversation context (keep it shorter for memory)
71
  conversation = ""
72
+ # Only use last 3 exchanges to save memory
73
+ recent_history = history[-3:] if len(history) > 3 else history
74
+
75
+ for user_msg, assistant_msg in recent_history:
76
  conversation += f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"
77
 
78
  # Add current message
79
  conversation += f"User: {message}\nAssistant:"
80
 
81
+ # Tokenize input with length limit
82
+ inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=1024)
83
 
84
  # Move to device
85
  if torch.cuda.is_available():
86
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
87
 
88
+ # Generate response with memory-efficient settings
89
  with torch.no_grad():
90
  outputs = model.generate(
91
  **inputs,
92
  max_new_tokens=max_tokens,
93
  temperature=temperature,
94
  do_sample=True,
95
+ top_p=0.9,
96
+ top_k=50,
97
+ repetition_penalty=1.1,
98
  pad_token_id=tokenizer.eos_token_id,
99
+ eos_token_id=tokenizer.eos_token_id,
100
+ use_cache=True
101
  )
102
 
103
  # Decode response
 
109
  # Format and clean response
110
  response = format_response(response)
111
 
112
+ # Clean up GPU memory after generation
113
+ if torch.cuda.is_available():
114
+ torch.cuda.empty_cache()
115
+
116
  return response
117
 
118
  except Exception as e:
 
131
 
132
  return history, ""
133
 
 
 
 
 
134
  # Create Gradio interface
135
+ with gr.Blocks(title="AEGIS Conduct - Economic Analysis Chat") as demo:
 
 
136
 
137
  gr.Markdown("""
138
  # 🤖 AEGIS Conduct - Economic Analysis Chat
 
143
  - **128k Context**: Extended memory for detailed conversations
144
 
145
  Ask questions about economics, finance, market analysis, policy impacts, and more!
 
146
 
147
+ **Note**: This is a memory-optimized version for better performance.
148
+ """)
149
 
150
  with gr.Row():
151
  with gr.Column(scale=4):
152
  chatbot = gr.Chatbot(
153
+ height=400,
154
+ show_label=False
 
155
  )
156
 
157
  msg = gr.Textbox(
158
  placeholder="Ask me about economics, finance, markets, or any analytical question...",
159
+ show_label=False
 
 
160
  )
161
 
162
  with gr.Row():
163
+ submit_btn = gr.Button("Send", variant="primary")
164
+ clear_btn = gr.Button("Clear Chat")
165
 
166
  with gr.Column(scale=1):
167
  gr.Markdown("### Settings")
168
 
169
  temperature = gr.Slider(
170
  minimum=0.1,
171
+ maximum=1.5,
172
  value=0.7,
173
  step=0.1,
174
+ label="Temperature"
 
175
  )
176
 
177
  max_tokens = gr.Slider(
178
  minimum=50,
179
+ maximum=512,
180
+ value=256,
181
  step=50,
182
+ label="Max Response Length"
 
183
  )
184
 
185
  gr.Markdown("""
 
189
  - What are the risks of high national debt?
190
  - How do interest rates affect the stock market?
191
  - Think deeply: What causes economic recessions?
192
+
193
+ ### Memory Optimization
194
+ - Responses are limited to 256 tokens by default
195
+ - Only recent conversation history is used
196
+ - Model uses 8-bit quantization for efficiency
197
  """)
198
 
199
  # Event handlers
 
201
  return chat_interface(message, history, temp, max_tok)
202
 
203
def clear_chat():
    """Reset the conversation, reclaiming memory while we're at it.

    Returns an empty history list and an empty textbox value, the pair
    Gradio expects for the chatbot and message components.
    """
    # Clearing a long chat is a natural point to give memory back:
    # collect unreachable Python objects, then drop any cached CUDA
    # allocations if a GPU is present.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    empty_history, empty_message = [], ""
    return empty_history, empty_message
209
 
210
  # Bind events
 
225
  outputs=[chatbot, msg]
226
  )
227
 
228
# Initialize the model once at import time, before any request arrives.
print("Initializing AEGIS Conduct Chat Interface...")
load_model()

if __name__ == "__main__":
    # Serve on all interfaces at the port the hosting environment expects;
    # no public share link is created.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)