# Gradio chat demo that serves a GGUF model through llama-cpp-python.
import os
import copy
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Python 3.13 removed `audioop` from the standard library.
# This block only probes for an importable `audioop` module and prints a hint
# when none is found; it does not install or patch anything itself.
# NOTE(review): it runs after `import gradio` above, so it cannot rescue a
# Gradio import that has already failed for lack of `audioop` -- confirm
# whether the probe should move ahead of that import.
# NOTE(review): the `audioop-lts` distribution is believed to expose its module
# under the plain name `audioop`, which would make the `audioop_lts` fallback
# unreachable -- verify against the installed package.
try:
    import audioop
except ImportError:
    try:
        # Fallback module name; on success it is bound to the same `audioop` name.
        import audioop_lts as audioop
    except ImportError:
        # Neither module is importable: warn but keep going (best effort).
        print("Warning: audioop not found. If Gradio fails to load, install 'audioop-lts'.")

# 1. Download the model file from the Hugging Face Hub.
# Defaults: repo unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF,
#           file NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf.
# Both can be overridden with the REPO_ID / MODEL_FILE environment variables.
model_path = hf_hub_download(
    repo_id=os.environ.get("REPO_ID", "unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF"),
    filename=os.environ.get("MODEL_FILE", "NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf"),
)

# 2. Initialize the Llama model.
# The context size and GPU offload are configurable the same way as the model
# location, so deployments do not have to edit this file:
#   N_CTX         context window in tokens (default 2048). The UI's
#                 "Max new tokens" slider also goes up to 2048, so prompt plus
#                 completion can exceed the default window; raise N_CTX if
#                 long generations are needed.
#   N_GPU_LAYERS  -1 uses all available GPU layers (default), 0 is CPU only.
llm = Llama(
    model_path=model_path,
    n_ctx=int(os.environ.get("N_CTX", "2048")),
    n_gpu_layers=int(os.environ.get("N_GPU_LAYERS", "-1")),
)

def _content_to_text(content):
    """Flatten one chat message's ``content`` value into plain text."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): assumes structured content is a sequence of strings
        # and/or {"text": ...} blocks -- confirm against the installed Gradio.
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "".join(pieces)
    return "" if content is None else str(content)


def _iter_turns(history):
    """Yield ``(user_text, assistant_text)`` pairs from either history layout."""
    pending_user = None
    for entry in history or ():
        if not isinstance(entry, dict):
            # Pair layout: (user, assistant). Passed through untouched.
            yield entry[0], entry[1]
            continue

        text = _content_to_text(entry.get("content"))
        role = entry.get("role")
        if role == "user":
            if pending_user is not None:
                # Two user messages in a row: close the first with no reply.
                yield pending_user, ""
            pending_user = text
        elif role == "assistant":
            yield ("" if pending_user is None else pending_user), text
            pending_user = None

    if pending_user is not None:
        yield pending_user, ""


def generate_text(
    message,
    history: list,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The new user message.
        history: Earlier turns. Either a list of ``(user, assistant)`` pairs
            or a list of ``{"role": ..., "content": ...}`` dicts; ``None`` or
            an empty list means no prior turns.
        system_message: Text placed in the ``<<SYS>>`` section of the prompt.
        max_tokens: Passed to the model as ``max_tokens``.
        temperature: Passed to the model as ``temperature``.
        top_p: Passed to the model as ``top_p``.

    Yields:
        The response text accumulated so far, once per streamed chunk.
    """
    # Llama-2 style "[INST] <<SYS>> ..." prompt (this is not ChatML).
    # NOTE(review): whether this matches the loaded model's own chat template
    # cannot be told from this file -- confirm, or use the chat-completion API.
    prompt_parts = [f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "]
    for user_text, assistant_text in _iter_turns(history):
        prompt_parts.append(
            f"{user_text} [/INST] {assistant_text} </s><s> [INST] "
        )
    prompt_parts.append(f"{message} [/INST] ")
    input_prompt = "".join(prompt_parts)

    output = llm(
        input_prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=max_tokens,
        stop=[
            "[/INST]",
            "</s>",
            "<|endoftext|>",
            "USER:",
            "ASSISTANT:",
        ],
        stream=True,
    )

    response = ""
    for chunk in output:
        # Each chunk is only read, so no defensive copy is needed.
        response += chunk["choices"][0]["text"]
        yield response

# 3. Assemble the Gradio chat UI.
# Example prompts offered in the interface (each entry is a one-item list).
_example_prompts = [
    ['How to setup a human base on Mars? Give short answer.'],
    ['Explain theory of relativity to me like I’m 8 years old.'],
    ['What is 9,000 * 9,000?'],
]

# Extra controls, listed in the order generate_text receives them after
# (message, history): system_message, max_tokens, temperature, top_p.
_system_message_box = gr.Textbox(
    value="You are a helpful and friendly AI assistant.",
    label="System message",
)
_max_tokens_slider = gr.Slider(
    minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"
)
_temperature_slider = gr.Slider(
    minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"
)
_top_p_slider = gr.Slider(
    minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"
)

demo = gr.ChatInterface(
    generate_text,
    title="NVIDIA Nemotron-3 Nano (Llama-cpp)",
    description="Running NVIDIA Nemotron-3-Nano-4B via llama-cpp-python",
    examples=_example_prompts,
    cache_examples=False,
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)

# Start the web server only when run as a script: all interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )