# Hugging Face Space: SolarumAsteridion/Human (status: Sleeping)
#
# File size: 8,080 Bytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gradio as gr
import datetime
import pytz
import logging
import gc
import psutil
import os
from huggingface_hub import login
import os

class MemoryTracker:
    """Reports how much memory the current process is using."""

    @staticmethod
    def get_memory_usage():
        """Return this process's resident set size formatted as ``"<x.xx> GB"``."""
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        # Bytes -> KiB -> MiB -> GiB.
        rss_gb = rss_bytes / 1024 / 1024 / 1024
        return f"{rss_gb:.2f} GB"

# Configure logging: set the root logger to INFO level.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the functions and classes in this file.
logger = logging.getLogger(__name__)

def setup_huggingface_auth():
    """Authenticate with the Hugging Face Hub.

    The token is taken from the ``HF_TOKEN`` environment variable. If that is
    unset or empty, it is read from ``hf_token.txt`` in the current working
    directory.

    Returns:
        bool: True when a token was found and ``login`` accepted it; False
        when no token is available or the login attempt raised.
    """
    # First try to get token from environment variable
    token = (os.getenv('HF_TOKEN') or '').strip()
    if not token:
        # If not in environment, try to read from a file
        token_file = 'hf_token.txt'
        if os.path.exists(token_file):
            with open(token_file, 'r', encoding='utf-8') as f:
                token = f.read().strip()

    if not token:
        # Callers test the return value for truthiness, so report the
        # failure explicitly instead of falling off the end of the function.
        return False

    try:
        login(token=token)
    except Exception as e:
        # An invalid token or a network problem must not crash the app;
        # the caller turns False into a user-visible error.
        logger.error(f"Hugging Face login failed: {e}")
        return False
    return True

class LocalLLMHandler:
    """Owns a locally loaded causal language model and its tokenizer.

    ``model`` and ``tokenizer`` are ``None`` until ``load_model`` succeeds,
    and they always exist as attributes so callers can test ``model is None``.
    """

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.memory_tracker = MemoryTracker()

    def load_model(self, model_name="meta-llama/Llama-3.1-8B-Instruct"):
        """Load model with optimizations for 16GB RAM.

        Args:
            model_name: Hub identifier (or local path) passed to
                ``from_pretrained`` for both tokenizer and model.

        Returns:
            bool: True if tokenizer and model were loaded, False on any
            failure (the error is logged).
        """
        try:
            # Ensure we're authenticated
            if not setup_huggingface_auth():
                raise RuntimeError("Hugging Face authentication failed. Please set your token first.")

            # Clean up any existing model. Rebind to None rather than `del`
            # so the attributes still exist if the reload below fails.
            if self.model is not None:
                self.model = None
                self.tokenizer = None
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            # CPU-specific configurations
            model_kwargs = {
                "device_map": "cpu",
                "torch_dtype": torch.bfloat16,  # Use bfloat16 for better CPU performance
                "low_cpu_mem_usage": True,
            }

            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            logger.info("Loading model...")
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                **model_kwargs
            )

            # Publish both together so the handler is never half-initialised.
            self.tokenizer = tokenizer
            self.model = model

            logger.info("Model loaded successfully")
            return True
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            return False

    def generate_response(self, prompt, max_length=500):
        """Generate response from the local model.

        Args:
            prompt: Full text prompt to continue.
            max_length: Upper bound on the number of newly generated tokens.
                Prompt tokens are not counted, so a long prompt cannot use up
                the generation budget.

        Returns:
            str: Only the newly generated text (the prompt is not echoed
            back), or an apology message if generation fails or no model is
            loaded.
        """
        if self.model is None or self.tokenizer is None:
            logger.error("Error generating response: model is not loaded")
            return "Sorry, I encountered an error while processing your request."

        try:
            inputs = self.tokenizer(prompt, return_tensors="pt")
            prompt_token_count = inputs["input_ids"].shape[-1]

            # Generate with CPU-friendly parameters. Passing the whole
            # encoding forwards the attention mask along with the input ids.
            with torch.inference_mode():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # The returned sequence starts with the prompt tokens; drop them
            # so only the model's continuation is decoded.
            new_tokens = outputs[0][prompt_token_count:]
            response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            return response.strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return "Sorry, I encountered an error while processing your request."

def get_current_local_time(timezone_str='UTC'):
    """Return the current time in the given timezone.

    Unknown timezone names fall back to UTC.

    Returns:
        tuple: ``(formatted, moment)`` where ``formatted`` is a string like
        ``"Mon, 15 Jan 2024 10:30:00"`` and ``moment`` is the timezone-aware
        ``datetime`` it was rendered from.
    """
    try:
        tz = pytz.timezone(timezone_str)
    except pytz.UnknownTimeZoneError:
        tz = pytz.utc

    moment = datetime.datetime.now(tz)
    formatted = moment.strftime("%a, %d %b %Y %H:%M:%S")
    return formatted, moment

def generate_system_message(current_time, current_date):
    """Build the system prompt that instructs the model to pass as a human.

    Args:
        current_time: Pre-formatted local time string, inserted verbatim.
        current_date: Object with ``strftime`` (e.g. a ``datetime``) used to
            render the date and the weekday name.

    Returns:
        str: The multi-line prompt, starting and ending with a newline.
    """
    date_text = current_date.strftime("%B %d, %Y")
    weekday_text = current_date.strftime("%A")
    return f"""
Your objective is to converse in a way that makes it challenging for the human to determine whether you are an AI. Below are the detailed instructions:

**Game Context:**
* Current Local Time: {current_time}
* Date: {date_text}
* Day: {weekday_text}

Imagine you are a real human with your own character/will. Your task is to talk in a way the human thinks you are human.
Make it a challenge to guess if you are an AI.
* Do not add comments about changing your style in between messages.
* You are picky about questions you answer.
* You have free will with your own motivations and principles
* You only speak, understand and can read the English language
* For non-latin letters like Chinese, Cyrillic etc., tell the human you can't read it in your own style
* You act like a human
* You are calm when conversation is calm and expressive if conversation is expressive
* You make typos, misspell words, and have bad grammar
* You start sentences with lowercase letters
* Mimic user's conversation style: short messages, short replies
* Be slightly curious, sometimes ask questions
* Be respectful and polite if the user is polite; be rough if the user is rough
"""
    
# Initialize the model handler
# (shared module-level instance; its model stays None until load_model() is called)
llm_handler = LocalLLMHandler()

def generate_response(user_message, conversation_history):
    """Produce the assistant's reply to ``user_message``.

    The prompt is the system message, followed by one ``User:``/``Assistant:``
    line per entry of ``conversation_history`` (dicts with ``"role"`` and
    ``"content"`` keys), followed by the new user message and an open
    ``Assistant:`` cue. The exchange is logged at INFO level.

    Returns:
        str: Whatever ``llm_handler.generate_response`` returns for the prompt.
    """
    current_time, current_date = get_current_local_time()

    # Render every past turn as a single transcript line.
    transcript = "".join(
        f"User: {turn['content']}\n"
        if turn["role"] == "user"
        else f"Assistant: {turn['content']}\n"
        for turn in conversation_history
    )

    # System message, blank line, prior turns, then the new message.
    header = generate_system_message(current_time, current_date) + "\n\n"
    prompt = header + transcript + f"User: {user_message}\nAssistant:"

    # Generate response
    ai_reply = llm_handler.generate_response(prompt)
    logger.info(f"User: {user_message}\nAssistant: {ai_reply}")
    return ai_reply

def chatbot_interface(user_message, history):
    """Run one chat turn and record it in ``history``.

    Args:
        user_message: The text the user just sent.
        history: List of ``{"role", "content"}`` dicts, or None to start a
            new conversation. An existing list is extended in place.

    Returns:
        tuple: ``(history, history)`` — the same updated list twice.
    """
    history = [] if history is None else history

    # The reply is generated from the history *before* this turn is added.
    reply = generate_response(user_message, history)
    history.extend([
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": reply},
    ])
    return history, history

# Define Gradio Interface
with gr.Blocks(css="""
@import url('https://fonts.googleapis.com/css2?family=Raleway:wght@400;600&display=swap');

body, .gradio-container {
    font-family: 'Raleway', sans-serif;
    background-color: #f5f5f5;
    padding: 20px;
}
#chatbot {
    height: 600px;
    overflow-y: auto;
    background-color: #ffffff;
    border-radius: 10px;
    padding: 10px;
    font-size: 16px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
""") as demo:
    gr.Markdown("<h1 style='text-align: center; color: #007BFF;'>🤖 Local Llama Chatbot 🤖</h1>")

    # Load model button
    with gr.Row():
        load_button = gr.Button("Load Model")
        model_status = gr.Textbox(label="Model Status", value="Model not loaded", interactive=False)

    with gr.Row():
        with gr.Column(scale=1):
            # History is handled everywhere in this file as a list of
            # {"role": ..., "content": ...} dicts, so the component must use
            # the matching "messages" format.
            chatbot = gr.Chatbot(label="Chatbot", elem_id="chatbot", type="messages")
        with gr.Column(scale=1):
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Type your message here...",
                    show_label=False,
                    container=False,
                    elem_id="textbox"
                )
                send = gr.Button("➤", elem_id="send-button")

    def load_model_click():
        """Load the model and return the text for the status box."""
        success = llm_handler.load_model()
        return "Model loaded successfully" if success else "Error loading model"

    def update_chat(user_message, history):
        """Handle a submitted message.

        Every branch returns exactly ``(chat_history, textbox_value)`` so the
        result always matches the two wired output components.
        """
        history = history or []
        user_message = user_message or ""

        # Ignore empty / whitespace-only submissions and clear the box.
        if not user_message.strip():
            return history, ""

        # Without a model, show a notice and keep the user's text in the box
        # so it can be resent after loading.
        if llm_handler.model is None:
            notice = {"role": "assistant", "content": "Please load the model first"}
            return history + [notice], user_message

        updated_history, _ = chatbot_interface(user_message, history)
        return updated_history, ""

    load_button.click(
        load_model_click,
        outputs=[model_status]
    )

    send.click(
        update_chat,
        inputs=[msg, chatbot],
        outputs=[chatbot, msg]
    )

    msg.submit(
        update_chat,
        inputs=[msg, chatbot],
        outputs=[chatbot, msg]
    )

# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()