#!/usr/bin/env python3

# Automatic dependency installation
import sys
import subprocess
import importlib

def install_package(pip_name, import_name=None):
    """Install a package via pip if it is not already importable.

    pip package names and import names can differ (for example, the pip
    package 'llama-cpp-python' is imported as 'llama_cpp'), so both names
    are accepted.
    """
    try:
        importlib.import_module(import_name or pip_name)
    except ImportError:
        print(f"{pip_name} not found. Installing...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_name])

# Required packages as (pip name, import name) pairs
REQUIRED_PACKAGES = [
    ('gradio', 'gradio'),
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('huggingface_hub', 'huggingface_hub'),
    ('llama-cpp-python', 'llama_cpp'),
]

# Install any missing packages
for pip_name, import_name in REQUIRED_PACKAGES:
    install_package(pip_name, import_name)
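
# Note: installing dependencies at import time keeps the script self-contained,
# but it is fragile for deployment; a pinned requirements.txt (exact versions
# are a project choice, not prescribed here) is the more reproducible option.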

# Now import the installed packages
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
import os

# Efficient GGUF model download and loading
def download_and_load_model(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF", 
    filename="unsloth.Q4_K_M.gguf"
):
    """
    Download GGUF model from HuggingFace if not exists
    
    Args:
        repo_id (str): HuggingFace repository ID
        filename (str): Specific GGUF model filename
    
    Returns:
        tuple: Loaded model and model path
    """
    try:
        # Try to import llama-cpp directly to ensure it's available
        from llama_cpp import Llama
    except ImportError:
        print("Critical error: llama-cpp-python could not be imported.")
        sys.exit(1)
    
    # Determine download directory (use a cache directory)
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    os.makedirs(cache_dir, exist_ok=True)
    
    # Check if model already exists
    model_path = os.path.join(cache_dir, filename)
    
    if not os.path.exists(model_path):
        print(f"Downloading {filename} from {repo_id}...")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id, 
                filename=filename, 
                local_dir=cache_dir,
                # Deprecated and ignored by recent huggingface_hub releases;
                # kept for compatibility with older versions
                local_dir_use_symlinks=False
            )
        except Exception as e:
            print(f"Error downloading model: {e}")
            sys.exit(1)
    
    print(f"Using model at: {model_path}")
    
    # Initialize the model with CPU-oriented settings
    model = Llama(
        model_path=model_path,
        n_ctx=2048,  # Context window size
        n_batch=512,  # Batch size for prompt processing
        n_threads=max(torch.get_num_threads() // 2, 1),  # Use half of available threads
        n_gpu_layers=0,  # 0 keeps every layer on the CPU (-1 would offload all layers to GPU)
        seed=-1,  # Random seed
        verbose=True  # Verbose llama.cpp logging while the model loads
    )
    
    return model, model_path
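
# A minimal smoke test (illustrative only, not part of the app flow): the
# returned Llama object can be called directly, which helps when debugging
# outside the Gradio UI.
#
#   model, path = download_and_load_model()
#   out = model("Q: What is 2 + 2?\nA:", max_tokens=16)
#   print(out['choices'][0]['text'])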

# Global model initialization
try:
    llm_model, MODEL_PATH = download_and_load_model()
except Exception as e:
    print(f"Fatal error initializing model: {e}")
    sys.exit(1)

def respond(
    message,
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
):
    """
    Generate a response using the GGUF model
    
    Args:
        message (str): User's input message
        history (list): Chat history
        system_message (str): System prompt
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling probability threshold
    
    Returns:
        str: Generated response
    """
    # Prepare the full prompt with the system message up front
    full_prompt = system_message + "\n\n"

    # Add chat history; depending on version and configuration, Gradio passes
    # either (user, assistant) tuples or {"role": ..., "content": ...} dicts
    for entry in history:
        if isinstance(entry, dict):
            role = "User" if entry.get("role") == "user" else "Assistant"
            full_prompt += f"{role}: {entry.get('content', '')}\n"
        else:
            user, assistant = entry
            if user:
                full_prompt += f"User: {user}\n"
            if assistant:
                full_prompt += f"Assistant: {assistant}\n"

    # Add the current message and cue the assistant's turn
    full_prompt += f"User: {message}\n"
    full_prompt += "Assistant: "
    
    # Generate a completion for the assembled prompt
    try:
        response = llm_model(
            prompt=full_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stop=["User:"]  # Stop before the model writes the user's next turn
        )['choices'][0]['text']

        return response.strip()
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {e}"

# Create Gradio interface with updated configuration
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ]
)
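
# If the app needs to be reachable from other machines, launch() also accepts
# server_name/server_port (values here are illustrative):
#
#   demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)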

if __name__ == "__main__":
    # Optional: Add some system checks
    print(f"Available CPU threads: {torch.get_num_threads()}")
    print(f"Model path: {MODEL_PATH}")
    
    # Launch the Gradio interface with compatible parameters
    demo.launch(
        show_api=False,  # Disable API endpoint
        share=False     # Do not create public URL
    )