# Hugging Face Space app file (page status when captured: Build error; file size: 2,769 bytes).
import os
import copy
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Python 3.13 removed ``audioop`` from the standard library. Probe the stdlib
# module name first, then ``audioop_lts``; if neither imports, only print a
# warning so that start-up continues.
for _module_name in ("audioop", "audioop_lts"):
    try:
        audioop = __import__(_module_name)
    except ImportError:
        continue
    break
else:
    print("Warning: audioop not found. If Gradio fails to load, install 'audioop-lts'.")
del _module_name  # keep the probe variable out of the module namespace
# Step 1: fetch the GGUF weights from the Hugging Face Hub.
# The repository and file name can be overridden through the REPO_ID and
# MODEL_FILE environment variables; the defaults are
#   repo: unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF
#   file: NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf
model_path = hf_hub_download(
    filename=os.getenv("MODEL_FILE", "NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf"),
    repo_id=os.getenv("REPO_ID", "unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF"),
)

# Step 2: load the downloaded file into a llama.cpp model shared by all requests.
llm = Llama(
    # -1 uses all available GPU layers; set to 0 for CPU-only inference.
    n_gpu_layers=-1,
    n_ctx=2048,
    model_path=model_path,
)
def _content_text(content):
    """Flatten one chat message's ``content`` value to plain text."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): assumes structured content is a list of parts whose
        # textual pieces live under a "text" key; non-text parts are skipped.
        # Confirm against the installed Gradio version.
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return "" if content is None else str(content)


def _history_pairs(history):
    """Normalise chat history to a list of ``(user, assistant)`` pairs.

    Accepts either pair-style history (``[user, assistant]`` sequences) or
    messages-style history (dicts with ``"role"`` and ``"content"`` keys).
    """
    pairs = []
    pending_user = None
    for entry in history or []:
        if not isinstance(entry, dict):
            # Pair-style entry: pass the two values through untouched.
            pairs.append((entry[0], entry[1]))
            continue
        text = _content_text(entry.get("content"))
        role = entry.get("role")
        if role == "user":
            # Consecutive user messages are merged into a single turn.
            pending_user = text if pending_user is None else f"{pending_user}\n{text}"
        elif role == "assistant":
            pairs.append(("" if pending_user is None else pending_user, text))
            pending_user = None
        # Any other role is ignored; a trailing user message with no reply is
        # dropped because the current turn arrives separately as ``message``.
    return pairs


def generate_text(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the model's reply to ``message`` for the chat UI.

    Args:
        message: The user's newest message.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            messages-style dicts with ``"role"``/``"content"`` keys.
        system_message: System instruction placed in the ``<<SYS>>`` section.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to the model.
        top_p: Nucleus-sampling threshold passed to the model.

    Yields:
        The reply text accumulated so far, once per streamed chunk, so each
        yielded value is the full partial reply rather than a delta.
    """
    # Llama-2 style [INST] / <<SYS>> prompt template.
    # NOTE(review): whether this matches the chat template the configured
    # model was trained with is not verified here - confirm.
    prompt_parts = [f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "]
    for user_text, assistant_text in _history_pairs(history):
        prompt_parts.append(f"{user_text} [/INST] {assistant_text} </s><s> [INST] ")
    prompt_parts.append(f"{message} [/INST] ")
    input_prompt = "".join(prompt_parts)

    chunks = llm(
        input_prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=max_tokens,
        stop=[
            "[/INST]",
            "</s>",
            "<|endoftext|>",
            "USER:",
            "ASSISTANT:",
        ],
        stream=True,
    )

    reply = ""
    for chunk in chunks:
        # Chunks are only read, so no defensive copy is needed.
        reply += chunk["choices"][0]["text"]
        yield reply
# Step 3: build the chat UI. The extra inputs are handed to generate_text
# after (message, history), in the order listed: system message, max new
# tokens, temperature, top-p.
demo = gr.ChatInterface(
    fn=generate_text,
    title="NVIDIA Nemotron-3 Nano (Llama-cpp)",
    description="Running NVIDIA Nemotron-3-Nano-4B via llama-cpp-python",
    additional_inputs=[
        gr.Textbox(label="System message", value="You are a helpful and friendly AI assistant."),
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
        gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.95),
    ],
    examples=[
        ["How to setup a human base on Mars? Give short answer."],
        ["Explain theory of relativity to me like I’m 8 years old."],
        ["What is 9,000 * 9,000?"],
    ],
    cache_examples=False,
)

# Serve on all interfaces at port 7860 when run as a script.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")