| | import gradio as gr |
| | from huggingface_hub import hf_hub_download |
| | from llama_cpp import Llama |
| |
|
| | |
# Download (or reuse the cached copy of) the GGUF weights from the
# Hugging Face Hub; hf_hub_download returns the local file path.
model_path = hf_hub_download(
    filename="Nidum-Llama-3.2-3B-Uncensored-GGUF.gguf",
    repo_id="VibeStudio/Nidum-Llama-3.2-3B-Uncensored-GGUF",
)
# Instantiate the llama.cpp model for CPU-only inference.
llm = Llama(
    verbose=False,     # keep llama.cpp's load/infer logging quiet
    n_gpu_layers=0,    # offload nothing: pure CPU execution
    n_batch=128,       # small prompt-processing batch to fit modest RAM
    n_ctx=8192,        # context window in tokens
    model_path=model_path,
)
def format_prompt(message, history):
    """Build a Llama-3 chat prompt from past turns plus the new user message.

    *history* is a sequence of ``(user, assistant)`` text pairs. The result
    starts with ``<|begin_of_text|>`` and ends with an open assistant header
    so the model generates the assistant's next reply.
    """
    def _turn(role, text):
        # One complete, closed chat turn in Llama-3 header format.
        return f"<|start_header_id|>{role}<|end_header_id|>\n{text}<|eot_id|>"

    pieces = ["<|begin_of_text|>"]
    for past_user, past_assistant in history:
        pieces.append(_turn("user", past_user))
        pieces.append(_turn("assistant", past_assistant))
    pieces.append(_turn("user", message))
    # Open header with no <|eot_id|>: the model completes from here.
    pieces.append("<|start_header_id|>assistant<|end_header_id|>\n")
    return "".join(pieces)
def chat(message, history):
    """Gradio chat handler: generate one assistant reply.

    Parameters
    ----------
    message : str
        The new user message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns, as supplied by gr.ChatInterface.

    Returns
    -------
    str
        The model's reply text, stripped of surrounding whitespace.
    """
    prompt = format_prompt(message, history)
    output = llm(
        prompt,
        max_tokens=2048,
        temperature=0.7,
        top_p=0.95,
        repeat_penalty=1.15,
        # Fix: halt generation at the Llama-3 end-of-turn token. Without an
        # explicit stop sequence the model can run past its own reply and
        # fabricate additional user/assistant turns in a single response.
        stop=["<|eot_id|>"],
    )
    return output["choices"][0]["text"].strip()
# Build the chat UI around the handler and start the Gradio server.
demo = gr.ChatInterface(
    fn=chat,
    title="Llama 3.2 3B Uncensored (CPU Básica)",
    description="IA sem censura rodando só em CPU gratuita! Respostas em ~10-20s.",
)
demo.launch()