import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Download the uncensored model (only on the first run)
model_path = hf_hub_download(
    repo_id="VibeStudio/Nidum-Llama-3.2-3B-Uncensored-GGUF",
    filename="Nidum-Llama-3.2-3B-Uncensored-GGUF.gguf"
)
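# Note: hf_hub_download caches the file locally (by default under
# ~/.cache/huggingface/hub), so later launches reuse the local copy
# instead of downloading the model again.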
# Load the model on CPU (n_gpu_layers=0 forces CPU-only inference)
llm = Llama(
    model_path=model_path,
    n_ctx=8192,       # large context window
    n_batch=128,      # reduced to avoid stalls on CPU
    n_gpu_layers=0,   # 0 = CPU only (essential on the free basic tier)
    verbose=False
)
# Simple chat template (works well with Llama 3)
def format_prompt(message, history):
    # The official Llama 3 format puts a blank line after each <|end_header_id|>
    prompt = "<|begin_of_text|>"
    for user, assistant in history:
        prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>"
        prompt += f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant}<|eot_id|>"
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"
    return prompt
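# A minimal sketch of what format_prompt produces (hypothetical one-turn history):
#   format_prompt("And in Celsius?", [("How hot is the Sun?", "About 10,000 F at the surface.")])
# emits the prior user turn, the prior assistant turn, then the new user turn,
# each wrapped in the header/<|eot_id|> markers above, and ends with an open
# assistant header so the model continues from there.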
def chat(message, history):
    prompt = format_prompt(message, history)
    output = llm(
        prompt,
        max_tokens=2048,      # raised so answers are not truncated
        temperature=0.7,
        top_p=0.95,
        repeat_penalty=1.15   # discourages repetition
        # stop=["<|eot_id|>"] removed so answers are not cut off mid-sentence
    )
    return output["choices"][0]["text"].strip()
# Gradio interface
gr.ChatInterface(
    chat,
    title="Llama 3.2 3B Uncensored (Basic CPU)",
    description="Uncensored AI running on a free CPU only! Responses in ~10-20s."
).launch()
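# A rough sketch of how to run this locally (package names taken from the
# imports above; the filename app.py is an assumption):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py
# Then open the local URL Gradio prints (http://127.0.0.1:7860 by default).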