|
|
import torch |
|
|
import gradio as gr |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face Hub identifier; resolved (and downloaded on first run)
# by the from_pretrained calls below.
MODEL_ID = "Qwen/Qwen3-1.7B"


print("🔄 Carregando tokenizer...")


tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


print("🧠 Carregando modelo (FP16, CPU)...")


# NOTE(review): float16 on CPU is an unusual combination — many CPU
# kernels lack half-precision support, so generation may be slow or
# raise depending on the installed torch version. float32 or bfloat16
# is the usual CPU choice; confirm fp16 actually works here.
model = AutoModelForCausalLM.from_pretrained(

    MODEL_ID,

    torch_dtype=torch.float16,

    device_map="cpu",

    low_cpu_mem_usage=True

)

# Inference only: disables dropout and other training-mode behavior.
model.eval()

print("✅ Modelo carregado!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Base system prompt prepended to every conversation. Any user-supplied
# system prompt is appended after this text before prompt assembly.
AXION_CORE_PROMPT = """

You are AxionChat, an intelligent, honest and precise AI assistant.

Be clear and structured.

Never hallucinate facts.

If you do not know something, say so.

Stay concise unless depth is requested.

"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_prompt(system_prompt, history, user_input, show_reasoning, history_window=4):
    """Assemble the plain-text prompt sent to the model.

    Args:
        system_prompt: Instructions placed at the top of the prompt.
        history: List of (user_message, assistant_message) pairs.
        user_input: The new user message for this turn.
        show_reasoning: When True, prefix the user message with a short
            "think first" instruction.
        history_window: Number of most recent turns to replay. Previously
            hard-coded to 4; kept as the default for backward compatibility.

    Returns:
        The full prompt string, ending in "Assistant:" so the model
        continues with the assistant's reply.
    """
    prompt = system_prompt.strip() + "\n\n"

    # Replay only the most recent turns to keep the prompt short.
    for u, a in history[-history_window:]:
        prompt += f"User: {u}\nAssistant: {a}\n"

    if show_reasoning:
        user_input = "Think briefly, then answer clearly:\n" + user_input

    prompt += f"User: {user_input}\nAssistant:"
    return prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def chat(
    user_input,
    chat_history,
    user_system_prompt,
    temperature,
    max_tokens,
    show_reasoning
):
    """Handle one chat turn: build the prompt, generate, update history.

    Args:
        user_input: Text typed by the user; blank input is ignored.
        chat_history: Gradio state — list of (user, assistant) pairs,
            mutated in place and also returned.
        user_system_prompt: Optional extra system instructions appended
            to the built-in AXION_CORE_PROMPT.
        temperature: Sampling temperature for generation.
        max_tokens: Requested max new tokens (capped at 128).
        show_reasoning: Forwarded to build_prompt.

    Returns:
        (updated_history, "") — the empty string clears the input box.
    """
    # Ignore blank submissions without touching the model.
    if not user_input.strip():
        return chat_history, ""

    system_prompt = AXION_CORE_PROMPT
    if user_system_prompt.strip():
        system_prompt += "\n" + user_system_prompt.strip()

    prompt = build_prompt(
        system_prompt,
        chat_history,
        user_input,
        show_reasoning
    )

    inputs = tokenizer(prompt, return_tensors="pt")

    # Inference only — no autograd graph needed.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=min(int(max_tokens), 128),
            temperature=float(temperature),
            top_p=0.9,
            do_sample=True
        )

    # BUGFIX: decode only the newly generated tokens. The previous
    # approach — decoding the full sequence and taking
    # decoded.split("Assistant:")[-1] — truncated the reply whenever the
    # model's own output contained the string "Assistant:".
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        output[0][prompt_len:], skip_special_tokens=True
    ).strip()

    chat_history.append((user_input, response))
    return chat_history, ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: wires the widgets to chat() and launches the app.
with gr.Blocks(title="AxionChat-v2") as demo:

    gr.Markdown("# 🧠 AxionChat-v2")

    gr.Markdown("Chat experimental focado em clareza, honestidade e velocidade.")

    # Conversation display (list of (user, assistant) pairs).
    chatbot = gr.Chatbot(height=420)

    # Message entry box; cleared after each send (chat() returns "").
    user_input = gr.Textbox(

        placeholder="Digite sua mensagem...",

        label="Mensagem"

    )

    # Optional extra system prompt, appended to AXION_CORE_PROMPT.
    user_system = gr.Textbox(

        placeholder="System prompt opcional (personalidade, tom, estilo...)",

        label="System Prompt"

    )

    # Generation controls; chat() caps max tokens at 128 regardless.
    with gr.Row():

        temperature = gr.Slider(0.1, 1.2, value=0.7, label="Temperatura")

        max_tokens = gr.Slider(32, 256, value=96, step=32, label="Máx. tokens")

    # When checked, prefixes the user message with a "think first" cue.
    show_reasoning = gr.Checkbox(

        label="Mostrar raciocínio (mais lento)",

        value=False

    )

    send = gr.Button("Enviar 🚀")

    # Per-session conversation history, shared with the chatbot display.
    state = gr.State([])

    # Argument order must match chat()'s parameter order.
    send.click(

        chat,

        inputs=[

            user_input,

            state,

            user_system,

            temperature,

            max_tokens,

            show_reasoning

        ],

        outputs=[chatbot, user_input]

    )


demo.launch()
|
|
|
|
|
|
|
|
|