#!/usr/bin/env python3
# Automatic dependency installation
import sys
import subprocess
import importlib


def install_package(package, import_name=None):
    """Install a package with pip if it cannot be imported.

    `import_name` handles packages whose pip name differs from the module
    they provide (e.g. llama-cpp-python installs the `llama_cpp` module).
    """
    try:
        importlib.import_module(import_name or package)
    except ImportError:
        print(f"{package} not found. Installing...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])


# List of required packages as (pip name, import name) pairs
REQUIRED_PACKAGES = [
    ('gradio', 'gradio'),
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('huggingface_hub', 'huggingface_hub'),
    ('llama-cpp-python', 'llama_cpp'),
]

# Install required packages
for package, import_name in REQUIRED_PACKAGES:
    install_package(package, import_name)
# Now import the installed packages
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
import os


# Efficient GGUF model download and loading
def download_and_load_model(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
    filename="unsloth.Q4_K_M.gguf"
):
    """
    Download the GGUF model from the Hugging Face Hub if it is not already
    cached, then load it with llama-cpp-python.

    Args:
        repo_id (str): HuggingFace repository ID
        filename (str): Specific GGUF model filename

    Returns:
        tuple: Loaded model and model path
    """
    try:
        # Try to import llama-cpp directly to ensure it's available
        from llama_cpp import Llama
    except ImportError:
        print("Critical error: llama-cpp-python could not be imported.")
        sys.exit(1)
    # Determine download directory (use a cache directory)
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    os.makedirs(cache_dir, exist_ok=True)

    # Check if model already exists
    model_path = os.path.join(cache_dir, filename)
    if not os.path.exists(model_path):
        print(f"Downloading {filename} from {repo_id}...")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=cache_dir,
                local_dir_use_symlinks=False
            )
        except Exception as e:
            print(f"Error downloading model: {e}")
            sys.exit(1)

    print(f"Using model at: {model_path}")
    # Initialize the model with optimized CPU settings
    model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window size
        n_batch=512,      # Batch size for prompt processing
        n_threads=max(torch.get_num_threads() // 2, 1),  # Use half of available threads
        n_gpu_layers=0,   # CPU only: do not offload any layers to a GPU
        seed=-1,          # Random seed
        verbose=True      # Enable verbose logging while the model loads
    )
    return model, model_path

# Global model initialization
try:
    llm_model, MODEL_PATH = download_and_load_model()
except Exception as e:
    print(f"Fatal error initializing model: {e}")
    sys.exit(1)

def respond(
    message,
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
):
    """
    Generate a response using the GGUF model.

    Args:
        message (str): User's input message
        history (list): Chat history
        system_message (str): System prompt
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling probability threshold

    Returns:
        str: Generated response
    """
    # Prepare the full prompt with system message and history
    full_prompt = system_message + "\n\n"

    # Add chat history
    for user, assistant in history:
        if user:
            full_prompt += f"User: {user}\n"
        if assistant:
            full_prompt += f"Assistant: {assistant}\n"

    # Add current message
    full_prompt += f"User: {message}\n"
    full_prompt += "Assistant: "
    # Generate response
    try:
        # Use the most basic generation method (plain text completion)
        response = llm_model(
            prompt=full_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens
        )['choices'][0]['text']
        return response
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {e}"

# Create Gradio interface with updated configuration
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ]
)
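# The additional_inputs values are passed to respond() after (message, history),
# in the order listed above, so they map to system_message, max_tokens,
# temperature and top_p.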
if __name__ == "__main__":
    # Optional: Add some system checks
    print(f"Available CPU threads: {torch.get_num_threads()}")
    print(f"Model path: {MODEL_PATH}")

    # Launch the Gradio interface with compatible parameters
    demo.launch(
        show_api=False,  # Disable API endpoint
        share=False      # Do not create public URL
    )
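# Run the script directly with Python; Gradio serves on http://127.0.0.1:7860 by default.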