File size: 7,947 Bytes
e32498e a9b927e e32498e a9b927e e32498e 2367226 a9b927e e32498e 2367226 e32498e a9b927e e32498e f4cab5c e32498e 2367226 e32498e 94156c7 2367226 a9b927e 94156c7 e32498e a9b927e 2367226 a9b927e e32498e 2367226 e32498e 2367226 a9b927e e32498e a9b927e e32498e 2367226 e32498e 2367226 a9b927e e32498e 2367226 e32498e 2367226 e32498e a9b927e e32498e a9b927e 2367226 e32498e 2367226 a9b927e 2367226 e32498e 2367226 e32498e a9b927e e32498e 2367226 e32498e 2367226 a9b927e f4cab5c e32498e a9b927e e32498e 2367226 a9b927e e32498e a9b927e e32498e 2367226 e32498e a9b927e e32498e 2367226 a9b927e e32498e 2367226 a9b927e 2367226 e32498e a9b927e e32498e a9b927e f4cab5c e32498e f4cab5c a9b927e f4cab5c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import gc
import os
# Global variables for model and tokenizer
# Shared module state: written by load_model(), read by generate_response().
model = None  # AutoModelForCausalLM instance once loading succeeds
tokenizer = None  # AutoTokenizer instance once loading succeeds
model_loaded = False  # True only after a successful load_model() call
def load_model():
    """Load the AEGIS Conduct model and tokenizer for CPU inference.

    Populates the module-level ``model``, ``tokenizer`` and ``model_loaded``
    globals. Tries a CPU-tuned load first, then a plain fallback load.

    Returns:
        bool: True if either loading path succeeded, False otherwise.
    """
    global model, tokenizer, model_loaded
    try:
        print("Loading AEGIS Conduct Economic Analysis Model for CPU...")
        # Load tokenizer first so a tokenizer failure aborts before the
        # far more expensive model download.
        tokenizer = AutoTokenizer.from_pretrained(
            "Gaston895/aegisconduct",
            trust_remote_code=True
        )
        # Load model optimized for CPU. Use float32: several ops have no
        # float16 CPU kernels (generation can raise "not implemented for
        # 'Half'") and half precision is slower on CPU anyway.
        model = AutoModelForCausalLM.from_pretrained(
            "Gaston895/aegisconduct",
            torch_dtype=torch.float32,
            device_map="cpu",  # Force CPU usage
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        model.eval()  # inference mode: disables dropout and friends
        # Release temporaries created during checkpoint loading.
        gc.collect()
        print("Model loaded successfully on CPU!")
        model_loaded = True
        return True
    except Exception as e:
        print(f"Error loading model: {e}")
        # Fallback: let transformers pick its own defaults for dtype/device.
        try:
            print("Trying fallback loading method...")
            model = AutoModelForCausalLM.from_pretrained(
                "Gaston895/aegisconduct",
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            model.eval()
            print("Model loaded with fallback method!")
            model_loaded = True
            return True
        except Exception as e2:
            print(f"Fallback also failed: {e2}")
            model_loaded = False
            return False
def format_response(text):
    """Clean and format a raw model response for display.

    Removes <thinking>...</thinking> reasoning blocks — including an
    unterminated <thinking> block, which happens whenever generation is
    cut off by the max-token limit before the closing tag is emitted —
    collapses runs of blank lines, and trims surrounding whitespace.

    Args:
        text: Raw decoded model output.

    Returns:
        The cleaned response string (possibly empty).
    """
    # Remove complete thinking blocks.
    text = re.sub(r'<thinking>.*?</thinking>', '', text, flags=re.DOTALL)
    # Remove an unterminated thinking block: with max_new_tokens=128 the
    # closing tag is often never generated, which previously leaked the
    # raw reasoning text into the chat.
    text = re.sub(r'<thinking>.*\Z', '', text, flags=re.DOTALL)
    # Collapse multiple blank lines into a single blank line.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = text.strip()
    return text
def generate_response(message, history, temperature=0.7, max_tokens=128):
    """Generate a model response for *message*, optimized for CPU.

    Args:
        message: Current user message.
        history: Chat history as a list of (user, assistant) string pairs.
        temperature: Sampling temperature for generation.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The assistant's reply, or a user-facing status/error string.
    """
    global model, tokenizer, model_loaded
    if not model_loaded or model is None or tokenizer is None:
        return "Model is loading... Please wait a moment and try again."
    try:
        # Build conversation context. Only the last 2 exchanges are kept
        # to bound prompt length and processing time on CPU.
        conversation = ""
        for user_msg, assistant_msg in history[-2:]:
            conversation += f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"
        # Add current message
        conversation += f"User: {message}\nAssistant:"
        # Strict input-length cap keeps CPU latency manageable.
        inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=512)
        prompt_len = inputs["input_ids"].shape[1]
        # Generate response with CPU-optimized settings
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                top_k=50,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1  # no beam search, for speed (sampling, not greedy)
            )
        # Decode only the newly generated tokens. Slicing the decoded
        # string by len(conversation) is unreliable: detokenization does
        # not reproduce the prompt text exactly, and truncation to 512
        # tokens can make the decoded prefix shorter than the prompt.
        new_tokens = outputs[0][prompt_len:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        # Format and clean response
        response = format_response(response)
        # Release generation temporaries promptly.
        gc.collect()
        return response if response else "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."
    except Exception as e:
        return f"Error generating response: {str(e)}. Please try a shorter question."
def chat_interface(message, history, temperature, max_tokens):
    """Process one chat submission.

    Blank or whitespace-only messages are ignored. Returns the updated
    history plus an empty string, which clears the input textbox in the UI.
    """
    if message.strip():
        bot_reply = generate_response(message, history, temperature, max_tokens)
        history.append((message, bot_reply))
    return history, ""
# Create Gradio interface: chat column (scale=4) on the left, settings
# sidebar (scale=1) on the right. Component creation order matters for
# layout, so this block is wiring-sensitive.
with gr.Blocks(title="AEGIS Conduct - Economic Analysis Chat") as demo:
    # Intro banner shown above the chat widgets.
    gr.Markdown("""
# 🤖 AEGIS Conduct - Economic Analysis Chat
Chat with an AI model specialized in economic and financial analysis. This model features:
- **Thinking Mode**: Automatic activation for complex reasoning
- **Economic Expertise**: Specialized knowledge in finance, markets, and policy
- **CPU Optimized**: Running efficiently on CPU hardware
Ask questions about economics, finance, market analysis, policy impacts, and more!
**Note**: This is a CPU-optimized version. Responses may take a moment to generate.
""")
    with gr.Row():
        with gr.Column(scale=4):
            # Chat transcript; holds (user, assistant) message pairs.
            chatbot = gr.Chatbot(
                height=400,
                show_label=False
            )
            # Free-text user input; submitting it also triggers generation.
            msg = gr.Textbox(
                placeholder="Ask me about economics, finance, markets... (keep questions concise for faster responses)",
                show_label=False
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")
        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            # Sampling temperature, forwarded to generate_response().
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            # Per-reply cap on new tokens (max_new_tokens in generation).
            max_tokens = gr.Slider(
                minimum=32,
                maximum=256,
                value=128,
                step=32,
                label="Max Response Length"
            )
            gr.Markdown("""
### Example Questions
- What causes inflation?
- Explain interest rates
- How do markets work?
- What is GDP?
- Define recession
### CPU Optimization
- Responses limited to 128 tokens for speed
- Only recent conversation used
- Optimized for CPU processing
- Keep questions concise
""")
    # Event handlers
    def submit_message(message, history, temp, max_tok):
        """Thin adapter binding the UI inputs to chat_interface."""
        return chat_interface(message, history, temp, max_tok)
    def clear_chat():
        """Reset the transcript and the input textbox."""
        # Force garbage collection when clearing
        gc.collect()
        return [], ""
    # Bind events: button click and textbox Enter share the same handler;
    # both write the updated history to the chatbot and clear the textbox.
    submit_btn.click(
        submit_message,
        inputs=[msg, chatbot, temperature, max_tokens],
        outputs=[chatbot, msg]
    )
    msg.submit(
        submit_message,
        inputs=[msg, chatbot, temperature, max_tokens],
        outputs=[chatbot, msg]
    )
    clear_btn.click(
        clear_chat,
        outputs=[chatbot, msg]
    )
# Load model on startup (import time) so the first request does not block
# on the download; on failure, model_loaded stays False and the chat
# returns a "still loading" message instead of crashing.
print("Initializing AEGIS Conduct Chat Interface...")
load_model()
# Launch configuration
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (container/Spaces friendly)
        server_port=7860,  # Gradio's conventional default port
        share=False  # no public gradio.live tunnel
    )