#!/usr/bin/env python3

# Automatic dependency installation
import sys
import subprocess
import importlib

def install_package(pip_name, import_name=None):
    """Install a package via pip if it is not already importable.

    pip package names and import names can differ (for example, the pip
    package 'llama-cpp-python' is imported as 'llama_cpp'), so both names
    are accepted.
    """
    try:
        importlib.import_module(import_name or pip_name)
    except ImportError:
        print(f"{pip_name} not found. Installing...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_name])

# Required packages as (pip name, import name) pairs
REQUIRED_PACKAGES = [
    ('gradio', 'gradio'),
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('huggingface_hub', 'huggingface_hub'),
    ('llama-cpp-python', 'llama_cpp'),
]

# Install any missing packages
for pip_name, import_name in REQUIRED_PACKAGES:
    install_package(pip_name, import_name)
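
# Note: installing dependencies at import time keeps the script self-contained,
# but it is fragile for deployment; a pinned requirements.txt (exact versions
# are a project choice, not prescribed here) is the more reproducible option.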

# Now import the installed packages
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
import os

# Efficient GGUF model download and loading
def download_and_load_model(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF", 
    filename="unsloth.Q4_K_M.gguf"
):
    """
    Download GGUF model from HuggingFace if not exists
    
    Args:
        repo_id (str): HuggingFace repository ID
        filename (str): Specific GGUF model filename
    
    Returns:
        tuple: Loaded model and model path
    """
    try:
        # Try to import llama-cpp directly to ensure it's available
        from llama_cpp import Llama
    except ImportError:
        print("Critical error: llama-cpp-python could not be imported.")
        sys.exit(1)
    
    # Determine download directory (use a cache directory)
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    os.makedirs(cache_dir, exist_ok=True)
    
    # Check if model already exists
    model_path = os.path.join(cache_dir, filename)
    
    if not os.path.exists(model_path):
        print(f"Downloading {filename} from {repo_id}...")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id, 
                filename=filename, 
                local_dir=cache_dir,
                # Deprecated and ignored by recent huggingface_hub releases;
                # kept for compatibility with older versions
                local_dir_use_symlinks=False
            )
        except Exception as e:
            print(f"Error downloading model: {e}")
            sys.exit(1)
    
    print(f"Using model at: {model_path}")
    
    # Initialize the model with CPU-oriented settings
    model = Llama(
        model_path=model_path,
        n_ctx=2048,  # Context window size
        n_batch=512,  # Batch size for prompt processing
        n_threads=max(torch.get_num_threads() // 2, 1),  # Use half of available threads
        n_gpu_layers=0,  # 0 keeps every layer on the CPU (-1 would offload all layers to GPU)
        seed=-1,  # Random seed
        verbose=True  # Verbose llama.cpp logging while the model loads
    )
    
    return model, model_path
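
# A minimal smoke test (illustrative only, not part of the app flow): the
# returned Llama object can be called directly, which helps when debugging
# outside the Gradio UI.
#
#   model, path = download_and_load_model()
#   out = model("Q: What is 2 + 2?\nA:", max_tokens=16)
#   print(out['choices'][0]['text'])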

# Global model initialization
try:
    llm_model, MODEL_PATH = download_and_load_model()
except Exception as e:
    print(f"Fatal error initializing model: {e}")
    sys.exit(1)

def respond(
    message,
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
):
    """
    Generate a response using the GGUF model
    
    Args:
        message (str): User's input message
        history (list): Chat history
        system_message (str): System prompt
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling probability threshold
    
    Returns:
        str: Generated response
    """
    # Prepare the full prompt with the system message up front
    full_prompt = system_message + "\n\n"

    # Add chat history; depending on version and configuration, Gradio passes
    # either (user, assistant) tuples or {"role": ..., "content": ...} dicts
    for entry in history:
        if isinstance(entry, dict):
            role = "User" if entry.get("role") == "user" else "Assistant"
            full_prompt += f"{role}: {entry.get('content', '')}\n"
        else:
            user, assistant = entry
            if user:
                full_prompt += f"User: {user}\n"
            if assistant:
                full_prompt += f"Assistant: {assistant}\n"

    # Add the current message and cue the assistant's turn
    full_prompt += f"User: {message}\n"
    full_prompt += "Assistant: "
    
    # Generate a completion for the assembled prompt
    try:
        response = llm_model(
            prompt=full_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stop=["User:"]  # Stop before the model writes the user's next turn
        )['choices'][0]['text']

        return response.strip()
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {e}"

# Create Gradio interface with updated configuration
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ]
)
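
# If the app needs to be reachable from other machines, launch() also accepts
# server_name/server_port (values here are illustrative):
#
#   demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)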

if __name__ == "__main__":
    # Optional: Add some system checks
    print(f"Available CPU threads: {torch.get_num_threads()}")
    print(f"Model path: {MODEL_PATH}")
    
    # Launch the Gradio interface with compatible parameters
    demo.launch(
        show_api=False,  # Disable API endpoint
        share=False     # Do not create public URL
    )