File size: 2,753 Bytes
8891bca
6e7ae54
 
8891bca
7a55e65
 
 
34902a4
9a3be1e
8891bca
6e7ae54
 
8891bca
 
7a55e65
02f0452
7a55e65
 
6e7ae54
 
 
 
 
 
 
 
 
 
480ba11
02f0452
6e7ae54
7a55e65
6e7ae54
 
 
 
8891bca
 
7a55e65
02f0452
7a55e65
02f0452
 
8891bca
02f0452
 
 
 
 
 
 
 
8891bca
6e7ae54
7a55e65
 
6e7ae54
8891bca
7a55e65
 
 
02f0452
7a55e65
 
6e7ae54
7a55e65
02f0452
 
 
7a55e65
 
6e7ae54
7a55e65
6e7ae54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a55e65
 
02f0452
 
 
 
7a55e65
 
 
 
 
 
 
 
02f0452
 
7a55e65
 
 
02f0452
 
7a55e65
 
 
 
02f0452
6e7ae54
7a55e65
 
02f0452
7a55e65
 
 
 
 
 
 
6e7ae54
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# =========================
# CONFIG
# =========================
# Hugging Face Hub model id loaded by load_model().
MODEL_ID = "AxionLab-Co/DogeAI-v2.0-4B-Reasoning"
MAX_NEW_TOKENS = 256  # keep small: fewer tokens = less CPU time per request = fewer timeouts

# Lazily populated by load_model() on the first chat request, then reused.
tokenizer = None
model = None


# =========================
# LOAD MODEL (LAZY + SAFE)
# =========================
def load_model():
    """Load the tokenizer and model on first use, caching them in module globals.

    Returns:
        tuple: the ``(tokenizer, model)`` pair. The heavy download and
        initialization happen only once; later calls hit the cache.
    """
    global tokenizer, model

    # Already initialized — reuse the cached pair.
    if model is not None:
        return tokenizer, model

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

    # CPU-only Space: full float32 weights, loaded memory-frugally.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
    model.eval()

    return tokenizer, model


# =========================
# PROMPT (CPU-FRIENDLY)
# =========================
def build_prompt(user_input: str) -> str:
    return f"""You are DogeAI-v2.0-4B-Reasoning.

Think step by step internally.
Do not reveal your full chain-of-thought.
Provide a clear final answer with a short explanation.

If the user speaks Brazilian Portuguese:
- use Brazilian slang lightly
- keep the Doge vibe 🐕🇧🇷
- stay serious and logical

User:
{user_input}

Assistant:
"""


# =========================
# CHAT FUNCTION (SSE-SAFE)
# =========================
def chat(user_input):
    """Generate a reply as a two-step stream.

    Yields:
        str: first an immediate "thinking" placeholder (keeps the SSE
        connection alive during slow CPU generation), then the model's
        final answer with the echoed prompt stripped off.
    """
    tokenizer, model = load_model()

    # Yield right away so the SSE stream stays alive while the CPU works.
    yield "🤔 DogeAI está pensando... segura aí..."

    prompt = build_prompt(user_input)

    inputs = tokenizer(
        prompt,
        return_tensors="pt"
    )

    # Fix: many causal-LM tokenizers define no pad token; generate() then
    # warns and may behave inconsistently. Fall back to EOS as pad.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=pad_id
        )

    text = tokenizer.decode(
        output[0],
        skip_special_tokens=True
    )

    # decode() echoes the whole prompt; everything after the first
    # "Assistant:" marker is the generated answer.
    response = text.split("Assistant:", 1)[-1].strip()

    yield response


# =========================
# GRADIO UI
# =========================
with gr.Blocks(title="DogeAI-v2.0-4B-Reasoning") as demo:
    # Header shown above the controls.
    gr.Markdown(
        "# 🐕 DogeAI-v2.0-4B-Reasoning\n"
        "**4B reasoning model rodando em CPU no HF Space**\n\n"
        "Pensamento explícito interno, resposta clara externa."
    )

    question = gr.Textbox(
        label="Pergunta",
        placeholder="Pergunta que exige raciocínio de verdade...",
        lines=4,
    )
    answer = gr.Textbox(
        label="Resposta do DogeAI",
        lines=14,
    )

    # chat is a generator, so Gradio streams each yield into the output box.
    think_button = gr.Button("Pensar 🧠🐕")
    think_button.click(fn=chat, inputs=question, outputs=answer)

demo.launch()