File size: 5,881 Bytes
d16c194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81f99f5
d16c194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9b1d28
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""SafeClaw Coder LoRA — public Gradio demo on a free CPU Space.

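Loads the base model named by the ``BASE_MODEL`` environment variable
(default ``Qwen/Qwen2.5-Coder-0.5B``) and attaches the LoRA adapter named by
``ADAPTER_ID`` (default ``vladpp91/Tett``) on top; if the adapter cannot be
loaded, the app falls back to the bare base model. Trained as a SafeClaw fine-tuning PoC on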
``bigcode/the-stack-smol-xl`` (python + javascript + typescript + go + rust).

Note: this Space runs on a free CPU instance (no GPU). A 1.5B model is
slow on CPU — expect ~30 s for short prompts to a few minutes for longer
ones. For production-grade latency, switch the Space hardware to a GPU
tier (e.g. ZeroGPU or a paid GPU upgrade).
"""

from __future__ import annotations

import os
import threading
import time
from typing import Any

import gradio as gr
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)

# --- Configuration -----------------------------------------------------------
# Base model and adapter repo ids can be overridden through the environment.
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-0.5B")
ADAPTER_ID = os.environ.get("ADAPTER_ID", "vladpp91/Tett")
# `or None` normalizes an unset *or empty* HF_TOKEN to None.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

# Choices offered in the "Target language" dropdown.
LANGUAGES = ["python", "javascript", "typescript", "go", "rust"]
# Initial value of the editable system-prompt textbox.
DEFAULT_SYSTEM = (
    "You are SafeClaw Coder, a privacy-first coding assistant fine-tuned "
    "on a small multilingual code corpus. Reply with concise, runnable code."
)

# --- Model loading (runs once, at import time) -------------------------------
print(f"[startup] base={BASE_MODEL} adapter={ADAPTER_ID}")

# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run — confirm it is actually required for the configured BASE_MODEL.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL, trust_remote_code=True, token=HF_TOKEN
)
# generate() is given pad_token_id below, so make sure one exists by reusing
# the EOS token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Weights are loaded in bfloat16.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)
# Try to wrap the base model with the LoRA adapter. Any failure is reported on
# stdout and in `adapter_status` (rendered in the page header), and the app
# keeps running on the unmodified base model.
try:
    model: Any = PeftModel.from_pretrained(base, ADAPTER_ID, token=HF_TOKEN)
    adapter_status = f"LoRA `{ADAPTER_ID}` loaded on top of `{BASE_MODEL}`."
except Exception as exc:  # pragma: no cover — Space-only fallback path
    print(f"[startup] adapter load failed ({exc}); falling back to base model")
    model = base
    adapter_status = (
        f"⚠️ Failed to load adapter `{ADAPTER_ID}`: {exc}. "
        f"Running raw `{BASE_MODEL}`."
    )

# Switch to inference mode (the app never trains).
model.eval()


def _format_prompt(message: str, language: str, system_prompt: str) -> str:
    """Render one request as a chat-template prompt string.

    The conversation has two turns: a system turn holding ``system_prompt``
    and a user turn holding ``message`` preceded by a
    ``// language: <language>`` header line. The tokenizer's chat template is
    applied without tokenizing, with the generation prompt appended.
    """
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {
        "role": "user",
        "content": "// language: {}\n{}".format(language, message),
    }
    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    return rendered


def _generate_stream(
    message: str,
    language: str,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
):
    """Stream a completion for ``message`` as progressively longer Markdown.

    ``model.generate`` runs in a background thread and pushes decoded text
    into a ``TextIteratorStreamer``; this generator consumes the streamer and,
    after every received chunk, yields the full text so far wrapped in a
    ``language`` code fence plus a character-count / elapsed-time footer.

    Args:
        message: The user's request.
        language: Target language; used in the prompt header and code fence.
        system_prompt: System turn of the chat prompt.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: ``0`` selects greedy decoding; any positive value
            enables sampling with ``top_p=0.95``.

    Yields:
        Markdown strings, each containing the whole output accumulated so far.

    Raises:
        RuntimeError: If ``model.generate`` fails; the original exception is
            chained as the cause.
    """
    prompt = _format_prompt(message, language, system_prompt)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    streamer = TextIteratorStreamer(
        tokenizer, skip_special_tokens=True, skip_prompt=True
    )

    temperature = float(temperature)
    sampling = temperature > 0
    gen_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        do_sample=sampling,
        pad_token_id=tokenizer.pad_token_id,
    )
    if sampling:
        # Sampling knobs are only meaningful (and only warning-free) when
        # do_sample is on; greedy decoding ignores them.
        gen_kwargs.update(temperature=max(temperature, 1e-5), top_p=0.95)

    failures: list[BaseException] = []

    def _run_generation() -> None:
        # If generate() raises, the streamer never receives its end-of-stream
        # signal and the consumer loop below would block forever. Record the
        # error and close the stream explicitly so the request can fail fast.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    thread = threading.Thread(target=_run_generation, daemon=True)
    thread.start()

    accumulated = ""
    # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
    started_at = time.monotonic()
    for new_text in streamer:
        accumulated += new_text
        elapsed = time.monotonic() - started_at
        yield (
            f"```{language}\n{accumulated}\n```\n\n"
            f"_{len(accumulated)} chars · {elapsed:.1f}s elapsed (CPU is slow)._"
        )

    thread.join()
    if failures:
        raise RuntimeError("text generation failed") from failures[0]


# Build the single-page UI. Components are laid out in the order they are
# created inside the Blocks context.
with gr.Blocks(title="SafeClaw Coder LoRA") as demo:
    # Page header; `adapter_status` reports whether the LoRA adapter loaded or
    # the app fell back to the raw base model.
    gr.Markdown(
        f"""
        # SafeClaw Coder LoRA — public CPU demo

        {adapter_status}

        Trained as a SafeClaw fine-tuning PoC on
        [`bigcode/the-stack-smol-xl`](https://huggingface.co/datasets/bigcode/the-stack-smol-xl)
        (python + javascript + typescript + go + rust).

        ⚠️ This Space runs on a **free CPU instance**. Generation is slow
        (~30 s for short answers, several minutes for long ones). The
        LoRA adapter only changes ~0.6% of the parameters — the
        underlying coding ability still comes from the Qwen base model.
        """
    )

    # Free-text request from the user.
    with gr.Row():
        prompt = gr.Textbox(
            lines=4,
            placeholder="e.g. Write a function that returns the n-th Fibonacci number.",
            label="Prompt",
        )

    # Generation controls: target language, output length cap, temperature.
    with gr.Row():
        language = gr.Dropdown(
            LANGUAGES, value="python", label="Target language"
        )
        max_new_tokens = gr.Slider(
            minimum=32,
            maximum=512,
            value=128,
            step=16,
            label="max_new_tokens (smaller = faster on CPU)",
        )
        # A temperature of 0.0 makes _generate_stream use greedy decoding.
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.5,
            value=0.2,
            step=0.05,
            label="temperature",
        )

    # System prompt is editable but collapsed by default.
    with gr.Accordion("System prompt", open=False):
        system_prompt = gr.Textbox(value=DEFAULT_SYSTEM, lines=3, label=" ")

    # Rendered as Markdown because _generate_stream yields a fenced code block.
    output = gr.Markdown(label="Output")

    # The input order here must match _generate_stream's parameter order.
    submit = gr.Button("Generate", variant="primary")
    submit.click(
        _generate_stream,
        inputs=[prompt, language, system_prompt, max_new_tokens, temperature],
        outputs=output,
    )

    # One example per supported language; each fills only the prompt and the
    # language dropdown.
    gr.Examples(
        examples=[
            ["Write a quicksort that operates in-place on a list of integers.", "python"],
            ["Implement a debounce helper.", "javascript"],
            ["Define a generic LRU cache.", "typescript"],
            ["Write an HTTP middleware that adds a request ID header.", "go"],
            ["Implement a binary search tree with insert and search.", "rust"],
        ],
        inputs=[prompt, language],
    )


if __name__ == "__main__":
    # Enable the request queue (at most 8 waiting requests), then serve on all
    # interfaces at port 7860.
    queued_demo = demo.queue(max_size=8)
    queued_demo.launch(server_name="0.0.0.0", server_port=7860)