# simple / app.py
# Uploaded by Gaston895 - commit "Upload app.py" (94156c7, verified)
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import gc
import os
# Shared inference state. load_model() assigns all three through `global`;
# the response generator reads them to decide whether it can run.
model, tokenizer = None, None
model_loaded = False
def load_model():
    """Load the tokenizer and model for CPU inference.

    A first attempt loads the tokenizer and then the model with float16
    weights pinned to the CPU. If anything in that attempt raises, a second
    attempt loads the model with default dtype/device settings. The fallback
    also (re)loads the tokenizer when the first attempt did not get that far,
    so a "loaded" state always means both objects are usable.

    Side effects:
        Assigns the module-level ``model``, ``tokenizer`` and ``model_loaded``.

    Returns:
        bool: True if both tokenizer and model were loaded, False otherwise.
    """
    global model, tokenizer, model_loaded

    repo_id = "Gaston895/aegisconduct"

    try:
        print("Loading AEGIS Conduct Economic Analysis Model for CPU...")

        # Load tokenizer first
        tokenizer = AutoTokenizer.from_pretrained(
            repo_id,
            trust_remote_code=True
        )

        # Load model optimized for CPU
        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype=torch.float16,  # Use float16 for memory efficiency
            device_map="cpu",  # Force CPU usage
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        # Release temporaries created while loading the checkpoint
        gc.collect()

        print("Model loaded successfully on CPU!")
        model_loaded = True
        return True
    except Exception as e:
        print(f"Error loading model: {e}")

    # Fallback to basic loading
    try:
        print("Trying fallback loading method...")

        # The primary attempt may have failed before (or while) creating the
        # tokenizer. Without this, the fallback could report success while
        # tokenizer is still None, leaving the app permanently "loading".
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                repo_id,
                trust_remote_code=True
            )

        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        print("Model loaded with fallback method!")
        model_loaded = True
        return True
    except Exception as e2:
        print(f"Fallback also failed: {e2}")
        model_loaded = False
        return False
def format_response(text):
    """Tidy raw model output before it is shown to the user.

    Removes every ``<thinking>...</thinking>`` section (tags included, even
    when the section spans several lines), squeezes any run of blank or
    whitespace-only lines down to a single empty line, and trims leading and
    trailing whitespace.

    Args:
        text: Raw decoded model output.

    Returns:
        str: The cleaned text.
    """
    # DOTALL lets '.' cross newlines so multi-line thinking sections match.
    thinking_section = re.compile(r'<thinking>.*?</thinking>', re.DOTALL)
    blank_line_run = re.compile(r'\n\s*\n')

    visible = thinking_section.sub('', text)
    return blank_line_run.sub('\n\n', visible).strip()
def generate_response(message, history, temperature=0.7, max_tokens=128):
    """Generate the assistant's reply to ``message`` with the loaded model.

    Builds a plain-text prompt from the last two (user, assistant) exchanges
    in ``history`` plus the new message, samples a continuation, and returns
    only the newly generated text after cleaning it with ``format_response``.

    Args:
        message: The user's current message.
        history: Sequence of ``(user_msg, assistant_msg)`` pairs, oldest
            first. ``None`` is treated as an empty history.
        temperature: Sampling temperature passed to ``model.generate``.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        str: The cleaned reply, a fixed notice when the model is not loaded
        yet or the reply is empty, or an error description if generation
        raised. Exceptions are reported in the return value, not propagated.
    """
    if not model_loaded or model is None or tokenizer is None:
        return "Model is loading... Please wait a moment and try again."

    try:
        # Only the last 2 exchanges are kept to bound prompt size on CPU.
        recent_history = (history or [])[-2:]

        turns = [
            f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"
            for user_msg, assistant_msg in recent_history
        ]
        conversation = "".join(turns) + f"User: {message}\nAssistant:"

        # Tokenize input with strict length limit for CPU.
        # NOTE(review): truncation removes tokens from the tokenizer's
        # configured truncation side; if that is the right-hand side
        # (presumably the default - confirm), an over-long prompt loses the
        # current question rather than old history.
        inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=512)
        prompt_length = inputs["input_ids"].shape[-1]

        # Generate response with CPU-optimized settings
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),  # UI sliders may deliver floats
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                top_k=50,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1  # Single beam (no beam search); sampling is still on
            )

        # The output sequence starts with the prompt tokens. Slice them off by
        # token count: cutting the decoded string at len(conversation) is only
        # correct when decoding reproduces the prompt character-for-character,
        # which fails after truncation or tokenizer text normalisation.
        new_tokens = outputs[0][prompt_length:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Format and clean response
        response = format_response(response)

        # Clean up memory after generation
        gc.collect()

        return response if response else "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."
    except Exception as e:
        return f"Error generating response: {str(e)}. Please try a shorter question."
def chat_interface(message, history, temperature, max_tokens):
    """Handle one chat submission and return the updated UI state.

    A message that is empty or whitespace-only is ignored. Otherwise a reply
    is generated and the ``(message, reply)`` pair is appended to ``history``
    in place.

    Args:
        message: Text the user submitted.
        history: List of ``(user, assistant)`` pairs shown in the chat.
        temperature: Sampling temperature forwarded to ``generate_response``.
        max_tokens: Token budget forwarded to ``generate_response``.

    Returns:
        tuple: ``(history, "")`` - the chat history and an empty string that
        clears the input box.
    """
    if message.strip():
        reply = generate_response(message, history, temperature, max_tokens)
        history.append((message, reply))

    # The second element resets the textbox whether or not a reply was made.
    return history, ""
# Build the Gradio UI: intro text, a chat column, a settings column, and the
# event wiring that connects them to chat_interface().
with gr.Blocks(title="AEGIS Conduct - Economic Analysis Chat") as demo:
    # Intro banner. The text stays at column 0 so the string content is
    # exactly what the original passes to gr.Markdown.
    gr.Markdown("""
# 🤖 AEGIS Conduct - Economic Analysis Chat
Chat with an AI model specialized in economic and financial analysis. This model features:
- **Thinking Mode**: Automatic activation for complex reasoning
- **Economic Expertise**: Specialized knowledge in finance, markets, and policy
- **CPU Optimized**: Running efficiently on CPU hardware
Ask questions about economics, finance, market analysis, policy impacts, and more!
**Note**: This is a CPU-optimized version. Responses may take a moment to generate.
""")

    with gr.Row():
        # Main column: conversation view, input box, action buttons.
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(show_label=False, height=400)
            msg = gr.Textbox(
                show_label=False,
                placeholder="Ask me about economics, finance, markets... (keep questions concise for faster responses)",
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")

        # Side column: generation settings and usage hints.
        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            temperature = gr.Slider(
                label="Temperature",
                value=0.7,
                minimum=0.1,
                maximum=1.0,
                step=0.1,
            )
            max_tokens = gr.Slider(
                label="Max Response Length",
                value=128,
                minimum=32,
                maximum=256,
                step=32,
            )
            gr.Markdown("""
### Example Questions
- What causes inflation?
- Explain interest rates
- How do markets work?
- What is GDP?
- Define recession
### CPU Optimization
- Responses limited to 128 tokens for speed
- Only recent conversation used
- Optimized for CPU processing
- Keep questions concise
""")

    # Event handlers
    def submit_message(message, history, temp, max_tok):
        """Forward a submission to chat_interface and return its result."""
        updated = chat_interface(message, history, temp, max_tok)
        return updated

    def clear_chat():
        """Return an empty history and an empty textbox value."""
        # Force garbage collection when clearing
        gc.collect()
        return [], ""

    # The Send button and pressing Enter in the textbox share one handler;
    # registration order (button first, then textbox) matches the original.
    for register in (submit_btn.click, msg.submit):
        register(
            submit_message,
            inputs=[msg, chatbot, temperature, max_tokens],
            outputs=[chatbot, msg],
        )

    clear_btn.click(clear_chat, outputs=[chatbot, msg])
# Load model on startup. This runs at import time, before the server starts.
print("Initializing AEGIS Conduct Chat Interface...")
if not load_model():
    # load_model() signals failure through its return value. The UI still
    # starts, but every request will get the "Model is loading..." notice,
    # so make the real state visible in the logs.
    print("WARNING: model could not be loaded; the chat UI will start but cannot generate responses.")

# Launch configuration: listen on all interfaces, port 7860, no public share
# link. Only runs when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )