import warnings

warnings.filterwarnings("ignore")

import os
import json
import subprocess
import sys
from typing import List, Tuple

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from huggingface_hub import hf_hub_download
import gradio as gr

from logger import logging
from exception import CustomExceptionHandling
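

# Create the local models directory (if needed) and download the GGUF model files from the Hugging Face Hub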
if not os.path.exists("./models"):
    os.makedirs("./models")

hf_hub_download(
    repo_id="bartowski/Dolphin3.0-Llama3.2-1B-GGUF",
    filename="Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf",
    local_dir="./models",
)
hf_hub_download(
    repo_id="MaziyarPanahi/WizardLM-2-7B-GGUF",
    filename="WizardLM-2-7B.Q6_K.gguf",
    local_dir="./models",
)
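

# Title and description shown at the top of the Gradio interface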
title = "Dolphin🐬 Llama.cpp"

description = """**[Dolphin 3.0](https://huggingface.co/collections/cognitivecomputations/dolphin-30-677ab47f73d7ff66743979a3)** is a powerful, general-purpose local AI model designed for coding, math, and various other tasks, aiming to be a local alternative to models like ChatGPT and Claude.

This interactive chat interface allows you to experiment with the [`Dolphin3.0-Llama3.2-1B`](https://huggingface.co/cognitivecomputations/Dolphin3.0-Llama3.2-1B) and [`WizardLM-2-7B`](https://huggingface.co/MaziyarPanahi/WizardLM-2-7B-GGUF) models using various prompts and generation parameters.

Users can select different model variants (GGUF format), set the system prompt, and observe generated responses in real time.

Key generation parameters, such as `temperature`, `max_tokens`, and `top_k`, are exposed below for tuning model behavior."""
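
# Globals that cache the loaded model and its name, so weights are reloaded only when the selection changes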
llm = None
llm_model = None


def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str = "WizardLM-2-7B.Q6_K.gguf",
    system_message: str = "You are a helpful assistant.",
    max_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.95,
    top_k: int = 40,
    repeat_penalty: float = 1.1,
):
|
|
""" |
|
|
Respond to a message using the Dolphin-3 model via Llama.cpp. |
|
|
|
|
|
Args: |
|
|
- message (str): The message to respond to. |
|
|
- history (List[Tuple[str, str]]): The chat history. |
|
|
- model (str): The model to use. |
|
|
- system_message (str): The system message to use. |
|
|
- max_tokens (int): The maximum number of tokens to generate. |
|
|
- temperature (float): The temperature of the model. |
|
|
- top_p (float): The top-p of the model. |
|
|
- top_k (int): The top-k of the model. |
|
|
- repeat_penalty (float): The repetition penalty of the model. |
|
|
|
|
|
Returns: |
|
|
str: The response to the message. |
|
|
""" |
    try:
        global llm
        global llm_model

        if model is None:
            model = "WizardLM-2-7B.Q6_K.gguf"
        if llm is None or llm_model != model:
            model_path = f"models/{model}"
            if not os.path.exists(model_path):
                yield f"Error: Model file not found at {model_path}. Please check your model path."
                return

            llm = Llama(
                model_path=model_path,
                flash_attn=False,
                n_gpu_layers=0,
                n_batch=8,
                n_ctx=2048,
                n_threads=8,
                n_threads_batch=8,
            )
            llm_model = model

        provider = LlamaCppPythonProvider(llm)
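
        # Create the agent with the user-supplied system prompt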
        agent = LlamaCppAgent(
            provider,
            system_prompt=f"{system_message}",
            predefined_messages_formatter_type=MessagesFormatterType.CHATML,
            debug_output=True,
        )
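
        # Configure the sampling settings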
        settings = provider.get_provider_default_settings()
        settings.temperature = temperature
        settings.top_k = top_k
        settings.top_p = top_p
        settings.max_tokens = max_tokens
        settings.repeat_penalty = repeat_penalty
        settings.stream = True
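
        # Rebuild the chat history from previous user/assistant turns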
        messages = BasicChatHistory()

        for msn in history:
            user = {"role": Roles.user, "content": msn[0]}
            assistant = {"role": Roles.assistant, "content": msn[1]}
            messages.add_message(user)
            messages.add_message(assistant)
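
        # Request a streaming response from the agent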
        stream = agent.get_chat_response(
            message,
            llm_sampling_settings=settings,
            chat_history=messages,
            returns_streaming_generator=True,
            print_output=False,
        )

        logging.info("Response stream generated successfully")
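
        # Accumulate streamed tokens and yield the growing response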
        outputs = ""
        for output in stream:
            outputs += output
            yield outputs

    except Exception as e:
        raise CustomExceptionHandling(e, sys) from e
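

# Build the Gradio chat interface around the respond generator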
demo = gr.ChatInterface(
    respond,
    examples=[
        ["What is the capital of France?"],
        ["Tell me something about artificial intelligence."],
        ["What is gravity?"],
    ],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Dropdown(
            choices=[
                "Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf",
                "WizardLM-2-7B.Q6_K.gguf",
            ],
            value="WizardLM-2-7B.Q6_K.gguf",
            label="Model",
            info="Select the AI model to use for chat",
        ),
        gr.Textbox(
            value="You are Dolphin, a helpful AI assistant focused on accurate and ethical responses.",
            label="System Prompt",
            info="Define the AI assistant's personality and behavior",
            lines=2,
        ),
        gr.Slider(
            minimum=512,
            maximum=2048,
            value=1024,
            step=1,
            label="Max Tokens",
            info="Maximum length of response (higher = longer replies)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Creativity level (higher = more creative, lower = more focused)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
            info="Nucleus sampling threshold",
        ),
        gr.Slider(
            minimum=1,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
            info="Limit vocabulary choices to the top K tokens",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition Penalty",
            info="Penalize repeated words (higher = less repetition)",
        ),
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=gr.Chatbot(scale=1, show_copy_button=True, resizable=True),
    flagging_mode="never",
    editable=True,
    cache_examples=False,
)
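

# Launch the Gradio app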
if __name__ == "__main__":
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
        show_api=False,
    )