import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import gc
import os

# Global variables for model and tokenizer, shared by all request handlers.
model = None
tokenizer = None
model_loaded = False

# Hub repository the app serves.
MODEL_ID = "Gaston895/aegisconduct"


def load_model():
    """Load the model and tokenizer optimized for CPU.

    Sets the module-level ``model``/``tokenizer``/``model_loaded`` globals.
    Returns True on success, False if both the primary and fallback loading
    paths fail.
    """
    global model, tokenizer, model_loaded
    try:
        print("Loading AEGIS Conduct Economic Analysis Model for CPU...")

        # Load tokenizer first
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            trust_remote_code=True
        )

        # Load model for CPU inference. float32 is used deliberately:
        # many CPU kernels are not implemented for float16, so half
        # precision would raise at generation time on CPU-only hosts.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float32,
            device_map="cpu",  # Force CPU usage
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        # Force garbage collection to release loading-time buffers.
        gc.collect()

        print("Model loaded successfully on CPU!")
        model_loaded = True
        return True

    except Exception as e:
        print(f"Error loading model: {e}")
        # Fallback to basic loading (library defaults, no device_map).
        try:
            print("Trying fallback loading method...")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            print("Model loaded with fallback method!")
            model_loaded = True
            return True
        except Exception as e2:
            print(f"Fallback also failed: {e2}")
            model_loaded = False
            return False


def format_response(text):
    """Clean and format the model response.

    Removes ``<think>...</think>`` reasoning blocks (the UI advertises a
    "Thinking Mode"; its traces must be hidden from the chat), collapses
    runs of blank lines, and trims surrounding whitespace.
    """
    # NOTE: the previous pattern r'.*?' only matched the empty string and
    # therefore removed nothing; match the literal think tags instead.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    # Clean up extra whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()


def generate_response(message, history, temperature=0.7, max_tokens=128):
    """Generate a reply from the model, optimized for CPU.

    Args:
        message: Current user message.
        history: List of (user, assistant) message pairs.
        temperature: Sampling temperature.
        max_tokens: Cap on newly generated tokens.

    Returns:
        The assistant's reply string, or a human-readable error message.
    """
    global model, tokenizer, model_loaded

    if not model_loaded or model is None or tokenizer is None:
        return "Model is loading... Please wait a moment and try again."

    try:
        # Build conversation context (keep it very short for CPU):
        # only the last 2 exchanges, to bound memory and latency.
        conversation = ""
        recent_history = history[-2:] if len(history) > 2 else history
        for user_msg, assistant_msg in recent_history:
            conversation += f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"

        # Add current message
        conversation += f"User: {message}\nAssistant:"

        # Tokenize input with a strict length limit for CPU.
        inputs = tokenizer(conversation, return_tensors="pt",
                           truncation=True, max_length=512)

        # Generate with CPU-friendly sampling settings.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                top_k=50,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1  # single-beam sampling (no beam search) for speed
            )

        # Decode only the newly generated tokens. Slicing the token tensor
        # is robust where string slicing (len(conversation)) is not: decode
        # does not reproduce the prompt byte-for-byte, and truncation may
        # have shortened the encoded prompt.
        prompt_len = inputs["input_ids"].shape[1]
        new_tokens = outputs[0][prompt_len:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Format and clean response
        response = format_response(response)

        # Clean up memory after generation
        gc.collect()

        return response if response else "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."

    except Exception as e:
        return f"Error generating response: {str(e)}. Please try a shorter question."


def chat_interface(message, history, temperature, max_tokens):
    """Main chat interface function: append one exchange to history."""
    if not message.strip():
        return history, ""

    # Generate response
    response = generate_response(message, history, temperature, max_tokens)

    # Add to history (Gradio tuple format) and clear the input box.
    history.append((message, response))
    return history, ""


# Create Gradio interface
with gr.Blocks(title="AEGIS Conduct - Economic Analysis Chat") as demo:
    gr.Markdown(""" # 🤖 AEGIS Conduct - Economic Analysis Chat Chat with an AI model specialized in economic and financial analysis. 
 This model features: - **Thinking Mode**: Automatic activation for complex reasoning - **Economic Expertise**: Specialized knowledge in finance, markets, and policy - **CPU Optimized**: Running efficiently on CPU hardware Ask questions about economics, finance, market analysis, policy impacts, and more! **Note**: This is a CPU-optimized version. Responses may take a moment to generate. """)

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=400,
                show_label=False
            )
            msg = gr.Textbox(
                placeholder="Ask me about economics, finance, markets... (keep questions concise for faster responses)",
                show_label=False
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")

        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            max_tokens = gr.Slider(
                minimum=32,
                maximum=256,
                value=128,
                step=32,
                label="Max Response Length"
            )
            gr.Markdown(""" ### Example Questions - What causes inflation? - Explain interest rates - How do markets work? - What is GDP? - Define recession ### CPU Optimization - Responses limited to 128 tokens for speed - Only recent conversation used - Optimized for CPU processing - Keep questions concise """)

    # Event handlers
    def submit_message(message, history, temp, max_tok):
        """Thin wrapper so Gradio binds (msg, chatbot, sliders) to the chat."""
        return chat_interface(message, history, temp, max_tok)

    def clear_chat():
        """Reset the chat; force garbage collection while we're at it."""
        gc.collect()
        return [], ""

    # Bind events: button click and Enter-in-textbox share one handler.
    submit_btn.click(
        submit_message,
        inputs=[msg, chatbot, temperature, max_tokens],
        outputs=[chatbot, msg]
    )

    msg.submit(
        submit_message,
        inputs=[msg, chatbot, temperature, max_tokens],
        outputs=[chatbot, msg]
    )

    clear_btn.click(
        clear_chat,
        outputs=[chatbot, msg]
    )

# Load model on startup
print("Initializing AEGIS Conduct Chat Interface...")
load_model()

# Launch configuration
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )