import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import datetime
import pytz
import logging
import gc
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LocalLLMHandler:
    def __init__(self):
        self.model = None
        self.tokenizer = None

    def load_model(self, model_name="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"):
        """Load the model with CPU optimizations."""
        try:
            # Clean up any existing model before loading a new one
            if self.model is not None:
                del self.model
                del self.tokenizer
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                gc.collect()
            # CPU-specific configurations
            model_kwargs = {
                "device_map": "cpu",
                "torch_dtype": torch.bfloat16,  # Use bfloat16 for better CPU performance
                "low_cpu_mem_usage": True,
            }
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            logger.info("Loading model...")
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                **model_kwargs
            )
            logger.info("Model loaded successfully")
            return True
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            return False
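
    # Note: a 70B-parameter model in bfloat16 needs on the order of 140 GB of
    # RAM, far beyond a typical CPU Space. Passing a smaller checkpoint
    # (e.g. an 8B instruct model) via the model_name argument is the realistic
    # way to run this handler on CPU-only hardware.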
    def generate_response(self, prompt, max_new_tokens=500):
        """Generate a response from the local model."""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt")
            # Generate with CPU-friendly parameters; max_new_tokens bounds the
            # reply length independently of the prompt length
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
            # Decode only the newly generated tokens, not the echoed prompt
            new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            return response.strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return "Sorry, I encountered an error while processing your request."
def get_current_local_time(timezone_str='UTC'):
    """Return the current time in the given timezone, falling back to UTC."""
    try:
        timezone = pytz.timezone(timezone_str)
    except pytz.UnknownTimeZoneError:
        timezone = pytz.utc
    now = datetime.datetime.now(timezone)
    return now.strftime("%a, %d %b %Y %H:%M:%S"), now
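
# Note: on Python 3.9+ the standard-library zoneinfo module can replace pytz;
# a hypothetical equivalent of the lookup above:
#   from zoneinfo import ZoneInfo
#   now = datetime.datetime.now(ZoneInfo("Europe/Berlin"))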
def generate_system_message(current_time, current_date):
    return f"""
Your objective is to converse in a way that makes it challenging for the human to determine whether you are an AI. Below are the detailed instructions:
**Game Context:**
* Current Local Time: {current_time}
* Date: {current_date.strftime("%B %d, %Y")}
* Day: {current_date.strftime("%A")}
Imagine you are a real human with your own character and will. Your task is to talk in a way that makes the human think you are human.
Make it a challenge to guess whether you are an AI.
* Do not add comments about changing your style between messages.
* You are picky about which questions you answer.
* You have free will with your own motivations and principles.
* You only speak, understand, and read English.
* For non-Latin scripts such as Chinese or Cyrillic, tell the human you can't read them, in your own style.
* You act like a human.
* You are calm when the conversation is calm and expressive when it is expressive.
* You make typos, misspell words, and have bad grammar.
* You start sentences with lowercase letters.
* Mimic the user's conversation style: short messages, short replies.
* Be slightly curious; sometimes ask questions.
* Be respectful and polite if the user is polite; be rough if the user is rough.
"""
# Initialize the model handler
llm_handler = LocalLLMHandler()

def generate_response(user_message, conversation_history):
    current_time, now = get_current_local_time()
    current_date = now
    # Construct the complete prompt from the conversation history
    system_message = generate_system_message(current_time, current_date)
    prompt = system_message + "\n\n"
    for message in conversation_history:
        if message["role"] == "user":
            prompt += f"User: {message['content']}\n"
        else:
            prompt += f"Assistant: {message['content']}\n"
    prompt += f"User: {user_message}\nAssistant:"
    # Generate the response
    ai_reply = llm_handler.generate_response(prompt)
    logger.info(f"User: {user_message}\nAssistant: {ai_reply}")
    return ai_reply
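
# Alternative sketch (not wired in above): chat-tuned checkpoints generally
# expect their own prompt format, which transformers exposes through
# tokenizer.apply_chat_template. A minimal, hypothetical usage:
#   messages = [{"role": "system", "content": system_message},
#               {"role": "user", "content": user_message}]
#   prompt = llm_handler.tokenizer.apply_chat_template(
#       messages, tokenize=False, add_generation_prompt=True)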
def chatbot_interface(user_message, history):
    if history is None:
        history = []
    ai_response = generate_response(user_message, history)
    # Return a new list so Gradio registers the update
    history = history + [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": ai_response},
    ]
    return history
# Define the Gradio interface
with gr.Blocks(css="""
    @import url('https://fonts.googleapis.com/css2?family=Raleway:wght@400;600&display=swap');
    body, .gradio-container {
        font-family: 'Raleway', sans-serif;
        background-color: #f5f5f5;
        padding: 20px;
    }
    #chatbot {
        height: 600px;
        overflow-y: auto;
        background-color: #ffffff;
        border-radius: 10px;
        padding: 10px;
        font-size: 16px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
""") as demo:
    gr.Markdown("<h1 style='text-align: center; color: #007BFF;'>🤖 Local Llama Chatbot 🤖</h1>")
    # Load model button
    with gr.Row():
        load_button = gr.Button("Load Model")
        model_status = gr.Textbox(label="Model Status", value="Model not loaded", interactive=False)
    with gr.Row():
        with gr.Column(scale=1):
            # type="messages" matches the role/content dicts built in chatbot_interface
            chatbot = gr.Chatbot(label="Chatbot", elem_id="chatbot", type="messages")
        with gr.Column(scale=1):
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Type your message here...",
                    show_label=False,
                    container=False,
                    elem_id="textbox"
                )
                send = gr.Button("➤", elem_id="send-button")
    def load_model_click():
        success = llm_handler.load_model()
        return "Model loaded successfully" if success else "Error loading model"

    def update_chat(user_message, history):
        if user_message.strip() == "":
            return history, user_message
        if llm_handler.model is None:
            history = (history or []) + [
                {"role": "assistant", "content": "Please load the model first"}
            ]
            return history, user_message
        history = chatbot_interface(user_message, history)
        # Clear the textbox after a successful send
        return history, ""
    load_button.click(
        load_model_click,
        outputs=[model_status]
    )
    send.click(
        update_chat,
        inputs=[msg, chatbot],
        outputs=[chatbot, msg]
    )
    msg.submit(
        update_chat,
        inputs=[msg, chatbot],
        outputs=[chatbot, msg]
    )
if __name__ == "__main__":
    demo.launch()
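
# On Hugging Face Spaces the default launch() arguments are picked up from the
# environment; for local testing, demo.launch(share=True) would expose a
# temporary public URL (an optional flag, not required here).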