import gradio as gr
from llama_cpp import Llama

# 1. Path to your GGUF file inside the Space repository
# MODEL_PATH = "simonper/fine-tuned-gguf-modal1/Llama-3.2-1B.Q8_0.gguf"  # <- change if your file is named differently
llm = Llama.from_pretrained(
    repo_id="simonper/fine-tuned-gguf-modal1",
    filename="Llama-3.2-1B.Q8_0.gguf",
)
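
# Llama.from_pretrained downloads the GGUF from the Hugging Face Hub at startup
# (it needs huggingface_hub installed, presumably via the Space's requirements.txt)
# and forwards extra keyword arguments such as n_ctx or n_threads on to Llama().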
"""
# 2. Load the GGUF model once at startup
llm = Llama(
model_path=MODEL_PATH,
n_ctx=4096, # context length, adjust if needed
n_threads=8, # tweak based on CPU in the Space
n_gpu_layers=0, # 0 = pure CPU, >0 if GPU layers are available
)
"""


def build_prompt(system_message: str, history: list[dict], user_message: str) -> str:
    """
    Simple instruction-style prompt builder for GGUF/llama.cpp.
    You can make this fancier or closer to Llama 3's official format if you want.
    """
    lines = []
    if system_message:
        lines.append(f"System: {system_message}\n")
    for turn in history:
        role = turn["role"]
        content = turn["content"]
        if role == "user":
            lines.append(f"User: {content}")
        elif role == "assistant":
            lines.append(f"Assistant: {content}")
    lines.append(f"User: {user_message}")
    lines.append("Assistant:")
    return "\n".join(lines)


def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # 3. Build a text prompt from system + history + new message
    prompt = build_prompt(system_message, history, message)

    # 4. Call llama.cpp model
    output = llm(
        prompt,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stop=["User:", "System:"],  # stop when next user/system turn would start
    )
    reply = output["choices"][0]["text"].strip()
    return reply
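

# If you want token-by-token streaming like the old InferenceClient version kept
# below, here is a minimal sketch (not wired into the UI): llama-cpp-python accepts
# stream=True on the completion call and yields chunks whose text fragment sits in
# chunk["choices"][0]["text"]. The name respond_streaming is ours, not the library's.
def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    prompt = build_prompt(system_message, history, message)
    partial = ""
    for chunk in llm(
        prompt,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stop=["User:", "System:"],
        stream=True,
    ):
        partial += chunk["choices"][0]["text"]
        yield partial  # gr.ChatInterface streams successive yields to the chat window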


# 5. Gradio UI
chatbot = gr.ChatInterface(
    respond,
    type="messages",  # history comes in as [{"role": "...", "content": "..."}]
    additional_inputs=[
        gr.Textbox(
            value="You are a friendly chatbot.",
            label="System message",
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
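
# The Gradio SDK on Spaces looks for a top-level Blocks/Interface named `demo`,
# so expose the ChatInterface under that name.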
demo = chatbot

if __name__ == "__main__":
    demo.launch()


# Old UI implementation
'''
import gradio as gr
from huggingface_hub import InferenceClient
def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """
    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
    """
    client = InferenceClient(token=hf_token.token, model="meta-llama/Meta-Llama-3-8B")

    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    response = ""
    for message in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        choices = message.choices
        token = ""
        if len(choices) and choices[0].delta.content:
            token = choices[0].delta.content
        response += token
        yield response
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()

if __name__ == "__main__":
    demo.launch()
'''