File size: 1,652 Bytes
134a248
d575a02
 
134a248
50e8850
d575a02
8481230
 
134a248
 
50e8850
d575a02
 
50e8850
 
 
d575a02
 
134a248
50e8850
d575a02
50e8850
 
 
 
 
 
d575a02
 
 
 
50e8850
 
 
 
 
 
 
 
 
d575a02
50e8850
d575a02
50e8850
 
 
61887df
50e8850
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the uncensored model weights (only downloads on the first run;
# hf_hub_download returns the local cache path on subsequent runs).
# NOTE(review): confirm this exact filename exists in the repo — GGUF repos
# usually publish files with a quantization suffix (e.g. *-Q4_K_M.gguf).
model_path = hf_hub_download(
    repo_id="VibeStudio/Nidum-Llama-3.2-3B-Uncensored-GGUF",
    filename="Nidum-Llama-3.2-3B-Uncensored-GGUF.gguf"
)

# Load the model on CPU (n_gpu_layers=0 forces CPU-only inference).
llm = Llama(
    model_path=model_path,
    n_ctx=8192,        # large context window
    n_batch=128,       # reduced batch size to avoid stalls on CPU
    n_gpu_layers=0,    # 0 = CPU only (essential on the free/basic tier)
    verbose=False
)

# Simple chat template (works well with Llama 3 instruct models).
def format_prompt(message, history):
    """Build a Llama 3 chat prompt from the conversation so far.

    Args:
        message: the new user message (str).
        history: prior turns, either as legacy Gradio ``(user, assistant)``
            tuples or as OpenAI-style ``{"role": ..., "content": ...}`` dicts
            (newer Gradio ``ChatInterface`` sends the latter).

    Returns:
        A single prompt string ending with an open assistant header so the
        model generates the next reply.
    """
    prompt = "<|begin_of_text|>"
    for turn in history:
        if isinstance(turn, dict):
            # Gradio "messages" format: one dict per role turn.
            prompt += (
                f"<|start_header_id|>{turn['role']}<|end_header_id|>"
                f"\n{turn['content']}<|eot_id|>"
            )
        else:
            # Legacy tuple format: (user_text, assistant_text) per exchange.
            user, assistant = turn
            prompt += f"<|start_header_id|>user<|end_header_id|>\n{user}<|eot_id|>"
            prompt += f"<|start_header_id|>assistant<|end_header_id|>\n{assistant}<|eot_id|>"
    prompt += f"<|start_header_id|>user<|end_header_id|>\n{message}<|eot_id|>"
    # Leave the assistant header open for the model to complete.
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n"
    return prompt

def chat(message, history):
    """Gradio chat callback: generate one assistant reply on CPU."""
    completion = llm(
        format_prompt(message, history),
        max_tokens=2048,        # generous budget so replies are not truncated
        temperature=0.7,
        top_p=0.95,
        repeat_penalty=1.15,    # discourages repetitive output
        # no stop=["<|eot_id|>"]: avoids cutting a reply mid-sentence
    )
    text = completion["choices"][0]["text"]
    return text.strip()

# Gradio UI: wire the chat callback into a hosted chat interface and serve it.
# Fix: the title/description strings were UTF-8 mojibake ("B谩sica", "s贸");
# restore the intended accented Portuguese characters.
gr.ChatInterface(
    chat,
    title="Llama 3.2 3B Uncensored (CPU Básica)",
    description="IA sem censura rodando só em CPU gratuita! Respostas em ~10-20s."
).launch()