import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import gc
import os

# Global variables for model and tokenizer, shared by all request handlers.
model = None
tokenizer = None
model_loaded = False

# Hub repository the app serves.
MODEL_ID = "Gaston895/aegisconduct"


def load_model():
    """Load the model and tokenizer optimized for CPU.

    Sets the module-level ``model``/``tokenizer``/``model_loaded`` globals.
    Returns True on success, False if both the primary and fallback loading
    paths fail.
    """
    global model, tokenizer, model_loaded
    try:
        print("Loading AEGIS Conduct Economic Analysis Model for CPU...")

        # Load tokenizer first
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            trust_remote_code=True
        )

        # Load model for CPU inference. float32 is used deliberately:
        # many CPU kernels are not implemented for float16, so half
        # precision would raise at generation time on CPU-only hosts.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float32,
            device_map="cpu",  # Force CPU usage
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        # Force garbage collection to release loading-time buffers.
        gc.collect()

        print("Model loaded successfully on CPU!")
        model_loaded = True
        return True

    except Exception as e:
        print(f"Error loading model: {e}")
        # Fallback to basic loading (library defaults, no device_map).
        try:
            print("Trying fallback loading method...")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            print("Model loaded with fallback method!")
            model_loaded = True
            return True
        except Exception as e2:
            print(f"Fallback also failed: {e2}")
            model_loaded = False
            return False


def format_response(text):
    """Clean and format the model response.

    Removes ``<think>...</think>`` reasoning blocks (the UI advertises a
    "Thinking Mode"; its traces must be hidden from the chat), collapses
    runs of blank lines, and trims surrounding whitespace.
    """
    # NOTE: the previous pattern r'.*?' only matched the empty string and
    # therefore removed nothing; match the literal think tags instead.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    # Clean up extra whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()


def generate_response(message, history, temperature=0.7, max_tokens=128):
    """Generate a reply from the model, optimized for CPU.

    Args:
        message: Current user message.
        history: List of (user, assistant) message pairs.
        temperature: Sampling temperature.
        max_tokens: Cap on newly generated tokens.

    Returns:
        The assistant's reply string, or a human-readable error message.
    """
    global model, tokenizer, model_loaded

    if not model_loaded or model is None or tokenizer is None:
        return "Model is loading... Please wait a moment and try again."

    try:
        # Build conversation context (keep it very short for CPU):
        # only the last 2 exchanges, to bound memory and latency.
        conversation = ""
        recent_history = history[-2:] if len(history) > 2 else history
        for user_msg, assistant_msg in recent_history:
            conversation += f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"

        # Add current message
        conversation += f"User: {message}\nAssistant:"

        # Tokenize input with a strict length limit for CPU.
        inputs = tokenizer(conversation, return_tensors="pt",
                           truncation=True, max_length=512)

        # Generate with CPU-friendly sampling settings.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                top_k=50,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1  # single-beam sampling (no beam search) for speed
            )

        # Decode only the newly generated tokens. Slicing the token tensor
        # is robust where string slicing (len(conversation)) is not: decode
        # does not reproduce the prompt byte-for-byte, and truncation may
        # have shortened the encoded prompt.
        prompt_len = inputs["input_ids"].shape[1]
        new_tokens = outputs[0][prompt_len:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Format and clean response
        response = format_response(response)

        # Clean up memory after generation
        gc.collect()

        return response if response else "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."

    except Exception as e:
        return f"Error generating response: {str(e)}. Please try a shorter question."


def chat_interface(message, history, temperature, max_tokens):
    """Main chat interface function: append one exchange to history."""
    if not message.strip():
        return history, ""

    # Generate response
    response = generate_response(message, history, temperature, max_tokens)

    # Add to history (Gradio tuple format) and clear the input box.
    history.append((message, response))
    return history, ""


# Create Gradio interface
with gr.Blocks(title="AEGIS Conduct - Economic Analysis Chat") as demo:
    gr.Markdown(""" # 🤖 AEGIS Conduct - Economic Analysis Chat Chat with an AI model specialized in economic and financial analysis. 
 This model features: - **Thinking Mode**: Automatic activation for complex reasoning - **Economic Expertise**: Specialized knowledge in finance, markets, and policy - **CPU Optimized**: Running efficiently on CPU hardware Ask questions about economics, finance, market analysis, policy impacts, and more! **Note**: This is a CPU-optimized version. Responses may take a moment to generate. """)

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=400,
                show_label=False
            )
            msg = gr.Textbox(
                placeholder="Ask me about economics, finance, markets... (keep questions concise for faster responses)",
                show_label=False
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")

        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            max_tokens = gr.Slider(
                minimum=32,
                maximum=256,
                value=128,
                step=32,
                label="Max Response Length"
            )
            gr.Markdown(""" ### Example Questions - What causes inflation? - Explain interest rates - How do markets work? - What is GDP? - Define recession ### CPU Optimization - Responses limited to 128 tokens for speed - Only recent conversation used - Optimized for CPU processing - Keep questions concise """)

    # Event handlers
    def submit_message(message, history, temp, max_tok):
        """Thin wrapper so Gradio binds (msg, chatbot, sliders) to the chat."""
        return chat_interface(message, history, temp, max_tok)

    def clear_chat():
        """Reset the chat; force garbage collection while we're at it."""
        gc.collect()
        return [], ""

    # Bind events: button click and Enter-in-textbox share one handler.
    submit_btn.click(
        submit_message,
        inputs=[msg, chatbot, temperature, max_tokens],
        outputs=[chatbot, msg]
    )

    msg.submit(
        submit_message,
        inputs=[msg, chatbot, temperature, max_tokens],
        outputs=[chatbot, msg]
    )

    clear_btn.click(
        clear_chat,
        outputs=[chatbot, msg]
    )

# Load model on startup
print("Initializing AEGIS Conduct Chat Interface...")
load_model()

# Launch configuration
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )