# Hugging Face Space application file (Space status at capture: paused; file size: 7,372 bytes).
import spaces
import json
import os
import glob
import subprocess
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download, list_repo_files
from model_loader import MODEL_DROPDOWN_CHOICES, MODEL_FILE_MAPPING
# --- Global configuration and variables ---
# Lazily loaded model handle and the file path it was loaded from; both are
# rebound by respond() whenever a different model is requested.
llm = llm_model = None

# Custom stylesheet handed to the Gradio interface. Written as adjacent string
# literals (one CSS rule per line); they concatenate into a single string
# without any separators.
css = (
    ".bubble-wrap { padding-top: calc(var(--spacing-xl) * 3) !important;}"
    ".message-row { justify-content: space-evenly !important; width: 100% !important; max-width: 100% !important; margin: calc(var(--spacing-xl)) 0 !important; padding: 0 calc(var(--spacing-xl) * 3) !important;}"
    ".flex-wrap.user { border-bottom-right-radius: var(--radius-lg) !important;}"
    ".flex-wrap.bot { border-bottom-left-radius: var(--radius-lg) !important;}"
    ".message.user{ padding: 10px;}"
    ".message.bot{ text-align: right; width: 100%; padding: 10px; border-radius: 10px;}"
    ".message-bubble-border { border-radius: 6px !important;}"
    ".message-buttons { justify-content: flex-end !important;}"
    ".message-buttons-left { align-self: end !important;}"
    ".message-buttons-bot, .message-buttons-user { right: 10px !important; left: auto !important; bottom: 2px !important;}"
    ".dark.message-bubble-border { border-color: #343140 !important;}"
    ".dark.user { background: #1e1c26 !important;}"
    ".dark.assistant.dark, .dark.pending.dark { background: #16141c !important;}"
)
def get_messages_formatter_type(model_name):
    """Choose the prompt formatter for a model based on its display name.

    The lookup is a case-sensitive substring test, checked in this order:
    "Llama" selects LLAMA_3, "Mistral" selects MISTRAL, and "GLM" or
    "Granite" select CHATML. Any other name also gets CHATML, after a
    notice is printed to stdout.
    """
    if "Llama" in model_name:
        return MessagesFormatterType.LLAMA_3
    if "Mistral" in model_name:
        return MessagesFormatterType.MISTRAL

    # Known ChatML families and unknown models share the same formatter;
    # only the unknown case is reported.
    is_known_chatml = any(tag in model_name for tag in ("GLM", "Granite"))
    if not is_known_chatml:
        print("Formatter type not found, trying default")
    return MessagesFormatterType.CHATML
# ----------------------------------------------------------------------
## Main Response Function for ChatInterface
# ----------------------------------------------------------------------
# NOTE(review): presumably requests a GPU for up to 90 seconds per call --
# confirm against the `spaces` package documentation.
@spaces.GPU(duration=90)
def respond(
    message,
    history: list[dict[str, str]],
    selected_model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Stream a chat reply to ``message`` from the selected model.

    This is a generator. Each yielded value is the full reply text
    accumulated so far, so a consumer can simply display the latest value.
    Failures (unknown model, model load error) are reported by yielding a
    single error string and then stopping.

    Args:
        message: The new user message to answer.
        history: Earlier turns as dicts with ``role`` and ``content`` keys.
            Entries whose role is not ``"user"`` are treated as assistant
            turns.
        selected_model_name: Key into ``MODEL_FILE_MAPPING``; also used to
            pick the prompt formatter.
        system_message: System prompt given to the agent.
        max_tokens: Copied to the sampling settings' ``max_tokens``.
        temperature: Copied to the sampling settings' ``temperature``.
        top_p: Copied to the sampling settings' ``top_p``.
        top_k: Copied to the sampling settings' ``top_k``.
        repeat_penalty: Copied to the sampling settings' ``repeat_penalty``.

    Yields:
        str: The cumulative response text, or one error message.
    """
    global llm
    global llm_model

    model_file_path = MODEL_FILE_MAPPING.get(selected_model_name)
    if not model_file_path:
        # Errors must be yielded: because this function is a generator, a
        # value passed to `return` ends up in StopIteration and is never
        # shown to the user.
        yield f"Error: Model file for '{selected_model_name}' not found. Has the download completed?"
        return

    chat_template = get_messages_formatter_type(selected_model_name)

    # Reuse the cached model unless a different file was requested.
    if llm is None or llm_model != model_file_path:
        print(f"Loading new model: {model_file_path}")
        try:
            llm = Llama(
                model_path=model_file_path,
                flash_attn=True,
                n_gpu_layers=81,
                n_batch=1024,
                n_ctx=8192,
            )
            llm_model = model_file_path
        except Exception as e:
            # Boundary handler: surface any load failure in the chat window
            # instead of crashing the request.
            yield f"Error during loading of Llama model '{selected_model_name}' ({model_file_path}): {e}"
            return

    provider = LlamaCppPythonProvider(llm)
    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )

    # Start from the provider defaults and override with the UI values.
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Rebuild the conversation in the agent's own history container.
    messages = BasicChatHistory()
    for turn in history:
        role = Roles.user if turn.get('role') == 'user' else Roles.assistant
        messages.add_message({'role': role, 'content': turn.get('content', '')})

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Emit the growing text after every chunk so the reply renders
    # incrementally.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs
# HTML shown in the empty chat window. Adjacent string literals (one per
# element) concatenate into a single line of markup.
PLACEHOLDER = (
    '<div class="message-bubble-border" style="display:flex; max-width: 600px; border-radius: 6px; border-width: 1px; border-color: #e5e7eb; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); backdrop-filter: blur(10px);">'
    ' <div style="padding: .5rem 1.5rem;display: flex;flex-direction: column;justify-content: space-evenly;">'
    ' <h2 style="text-align: left; font-size: 1.5rem; font-weight: 700; margin-bottom: 0.5rem;">llama.cpp based quantized gguf inference</h2>'
    ' <p style="text-align: left; font-size: 16px; line-height: 1.5; margin-bottom: 15px;">This space hosts an Advanced model.</p>'
    ' </div></div>'
)
# --- Gradio Components (Dynamically populated) ---
# Pre-select the first configured model; None means nothing is available.
if MODEL_DROPDOWN_CHOICES:
    default_model = MODEL_DROPDOWN_CHOICES[0]
else:
    default_model = None

model_dropdown = gr.Dropdown(label="Model", choices=MODEL_DROPDOWN_CHOICES, value=default_model)

# Default system prompt, split across adjacent literals for readability.
system_textbox = gr.Textbox(
    label="System message",
    value=(
        "You are a deep thinking AI, you may use extremely long chains of thought "
        "to deeply consider the problem and deliberate with yourself via systematic "
        "reasoning processes to help come to a correct solution prior to answering. "
        "You should enclose your thoughts and internal monologue inside <think> "
        "</think> tags, and then provide your solution or response to the problem."
    ),
)

# Sampling controls; their values are passed to respond() as additional inputs.
max_tokens_slider = gr.Slider(
    label="Max tokens",
    minimum=1,
    maximum=4096,
    step=1,
    value=2048,
)
temperature_slider = gr.Slider(
    label="Temperature",
    minimum=0.1,
    maximum=4.0,
    step=0.1,
    value=0.7,
)
top_p_slider = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.95)
top_k_slider = gr.Slider(label="Top-k", minimum=0, maximum=100, step=1, value=40)
repeat_penalty_slider = gr.Slider(label="Repetition penalty", minimum=0.0, maximum=2.0, step=0.1, value=1.1)
# Chat window and theme are built first (in the same order as they were
# previously evaluated inline) so the ChatInterface call stays short.
_chat_window = gr.Chatbot(placeholder=PLACEHOLDER, height=450, type="messages", label=False)

_dark_overrides = {
    "body_background_fill_dark": "#16141c",
    "block_background_fill_dark": "#16141c",
    "block_border_width": "1px",
    "block_title_background_fill_dark": "#1e1c26",
    "input_background_fill_dark": "#292733",
    "button_secondary_background_fill_dark": "#24212b",
    "border_color_accent_dark": "#343140",
    "border_color_primary_dark": "#343140",
    "background_fill_secondary_dark": "#16141c",
    "color_accent_soft_dark": "transparent",
    "code_background_fill_dark": "#292733",
}
_app_theme = gr.themes.Soft(
    primary_hue="violet",
    secondary_hue="violet",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"],
).set(**_dark_overrides)

# The additional inputs are passed to respond() after (message, history),
# in the order listed here.
demo = gr.ChatInterface(
    respond,
    type="messages",
    chatbot=_chat_window,
    additional_inputs=[
        model_dropdown,
        system_textbox,
        max_tokens_slider,
        temperature_slider,
        top_p_slider,
        top_k_slider,
        repeat_penalty_slider,
    ],
    theme=_app_theme,
    css=css,
    description="Advanced model",
)
if __name__ == "__main__":
    # Only serve the UI when at least one model choice was configured.
    if not default_model:
        print("Could not load any models or configure. App will not start.")
    else:
        demo.launch()