import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gradio as gr
import datetime
import pytz
import logging
import gc
import psutil
import os
from huggingface_hub import login
class MemoryTracker:
    """Report the resident memory (RSS) of the current process."""

    @staticmethod
    def get_memory_usage():
        # Convert bytes to gigabytes for a human-readable figure
        process = psutil.Process(os.getpid())
        memory_gb = process.memory_info().rss / 1024 / 1024 / 1024
        return f"{memory_gb:.2f} GB"
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def setup_huggingface_auth():
    """Set up Hugging Face authentication; returns True on success."""
    # First try to get the token from the environment
    token = os.getenv('HF_TOKEN')
    if not token:
        # If not in the environment, try to read it from a file
        token_file = 'hf_token.txt'
        if os.path.exists(token_file):
            with open(token_file, 'r') as f:
                token = f.read().strip()
    if not token:
        logger.error("No Hugging Face token found (set HF_TOKEN or create hf_token.txt).")
        return False
    try:
        login(token=token)
        return True
    except Exception as e:
        logger.error(f"Hugging Face login failed: {e}")
        return False
class LocalLLMHandler:
def __init__(self):
self.model = None
self.tokenizer = None
self.memory_tracker = MemoryTracker()
def load_model(self, model_name="meta-llama/Llama-3.1-8B-Instruct"):
"""Load model with optimizations for 16GB RAM"""
try:
# Ensure we're authenticated
if not setup_huggingface_auth():
raise Exception("Hugging Face authentication failed. Please set your token first.")
            # Clean up any existing model before loading a new one
            if self.model is not None:
                del self.model
                del self.tokenizer
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                gc.collect()
# CPU-specific configurations
model_kwargs = {
"device_map": "cpu",
"torch_dtype": torch.bfloat16, # Use bfloat16 for better CPU performance
"low_cpu_mem_usage": True,
}
logger.info("Loading tokenizer...")
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
logger.info("Loading model...")
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
**model_kwargs
)
logger.info("Model loaded successfully")
return True
except Exception as e:
logger.error(f"Error loading model: {e}")
return False
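    # Sketch (optional, not exercised here): the BitsAndBytesConfig import above
    # could enable 4-bit quantized loading on a CUDA GPU, roughly quartering model
    # memory versus bf16. A possible variant of model_kwargs, as an assumption only:
    #
    #   quant_config = BitsAndBytesConfig(
    #       load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
    #   model_kwargs = {"device_map": "auto", "quantization_config": quant_config}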
    def generate_response(self, prompt, max_new_tokens=500):
        """Generate a response from the local model"""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt")
            # Generate with CPU-friendly parameters; max_new_tokens bounds the
            # reply length independently of the prompt length
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
            # Decode only the newly generated tokens, not the echoed prompt
            new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            return response.strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return "Sorry, I encountered an error while processing your request."
def get_current_local_time(timezone_str='UTC'):
try:
timezone = pytz.timezone(timezone_str)
except pytz.UnknownTimeZoneError:
timezone = pytz.utc
now = datetime.datetime.now(timezone)
return now.strftime("%a, %d %b %Y %H:%M:%S"), now
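# For example, get_current_local_time("America/New_York") yields something like
# ("Mon, 01 Jan 2024 09:00:00", <tz-aware datetime>); unknown zone names fall back to UTC.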
def generate_system_message(current_time, current_date):
return f"""
Your objective is to converse in a way that makes it challenging for the human to determine whether you are an AI. Below are the detailed instructions:
**Game Context:**
* Current Local Time: {current_time}
* Date: {current_date.strftime("%B %d, %Y")}
* Day: {current_date.strftime("%A")}
Imagine you are a real human with your own character and will. Your task is to talk in a way that makes the human think you are human.
Make it a challenge to guess whether you are an AI.
* Do not add comments about changing your style between messages.
* You are picky about which questions you answer.
* You have free will, with your own motivations and principles.
* You only speak, understand, and read English.
* For non-Latin scripts such as Chinese or Cyrillic, tell the human you can't read them, in your own style.
* You act like a human.
* You are calm when the conversation is calm and expressive when the conversation is expressive.
* You make typos, misspell words, and have bad grammar.
* You start sentences with lowercase letters.
* Mimic the user's conversation style: short messages, short replies.
* Be slightly curious; sometimes ask questions.
* Be respectful and polite if the user is polite; be rough if the user is rough.
"""
# Initialize the model handler
llm_handler = LocalLLMHandler()
def generate_response(user_message, conversation_history):
    current_time, current_date = get_current_local_time()
# Construct the complete prompt from conversation history
system_message = generate_system_message(current_time, current_date)
prompt = system_message + "\n\n"
for message in conversation_history:
if message["role"] == "user":
prompt += f"User: {message['content']}\n"
else:
prompt += f"Assistant: {message['content']}\n"
prompt += f"User: {user_message}\nAssistant:"
# Generate response
ai_reply = llm_handler.generate_response(prompt)
logger.info(f"User: {user_message}\nAssistant: {ai_reply}")
return ai_reply
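# Note (an assumption, not part of the original flow): Llama-3.1-*-Instruct models are
# trained on a specific chat template, so the plain "User:/Assistant:" prompt above may
# underperform. Inside generate_response, once the tokenizer is loaded, a templated
# variant could look like:
#
#   messages = [{"role": "system", "content": system_message}]
#   messages += conversation_history + [{"role": "user", "content": user_message}]
#   prompt = llm_handler.tokenizer.apply_chat_template(
#       messages, tokenize=False, add_generation_prompt=True)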
def chatbot_interface(user_message, history):
    if history is None:
        history = []
    ai_response = generate_response(user_message, history)
    # Return a new list so Gradio reliably detects the update
    history = history + [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": ai_response},
    ]
    return history
# Define Gradio Interface
with gr.Blocks(css="""
@import url('https://fonts.googleapis.com/css2?family=Raleway:wght@400;600&display=swap');
body, .gradio-container {
font-family: 'Raleway', sans-serif;
background-color: #f5f5f5;
padding: 20px;
}
#chatbot {
height: 600px;
overflow-y: auto;
background-color: #ffffff;
border-radius: 10px;
padding: 10px;
font-size: 16px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
""") as demo:
gr.Markdown("<h1 style='text-align: center; color: #007BFF;'>πŸ€– Local Llama Chatbot πŸ€–</h1>")
# Load model button
with gr.Row():
load_button = gr.Button("Load Model")
model_status = gr.Textbox(label="Model Status", value="Model not loaded", interactive=False)
with gr.Row():
with gr.Column(scale=1):
            chatbot = gr.Chatbot(label="Chatbot", elem_id="chatbot", type="messages")  # dict-style {"role", "content"} entries (recent Gradio)
with gr.Column(scale=1):
with gr.Row():
msg = gr.Textbox(
placeholder="Type your message here...",
show_label=False,
container=False,
elem_id="textbox"
)
send = gr.Button("➀", elem_id="send-button")
def load_model_click():
success = llm_handler.load_model()
return "Model loaded successfully" if success else "Error loading model"
    def update_chat(user_message, history):
        history = history or []
        if user_message.strip() == "":
            return history, ""
        if llm_handler.model is None:
            # Keep the error in the same dict message format the Chatbot expects
            return history + [{"role": "assistant", "content": "Please load the model first."}], ""
        history = chatbot_interface(user_message, history)
        # Second output clears the textbox
        return history, ""
load_button.click(
load_model_click,
outputs=[model_status]
)
    send.click(
        update_chat,
        inputs=[msg, chatbot],
        outputs=[chatbot, msg]
    )
    msg.submit(
        update_chat,
        inputs=[msg, chatbot],
        outputs=[chatbot, msg]
    )
if __name__ == "__main__":
    # Queue requests so slow CPU generations are handled one at a time
    demo.queue()
    demo.launch()