vladpp91's picture
Bind Gradio to 0.0.0.0:7860 for HF Space proxy
a9b1d28 verified
"""SafeClaw Coder LoRA — public Gradio demo on a free CPU Space.
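Loads the base model named by the ``BASE_MODEL`` environment variable
(default ``Qwen/Qwen2.5-Coder-0.5B``) and attaches the LoRA adapter named by
``ADAPTER_ID`` (default ``vladpp91/Tett``) on top. If the adapter fails to
load, the demo falls back to the raw base model. Trained as a SafeClaw
fine-tuning PoC on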
``bigcode/the-stack-smol-xl`` (python + javascript + typescript + go + rust).
Note: this Space runs on a free CPU instance (no GPU). A 1.5B model is
slow on CPU — expect ~30 s for short prompts to a few minutes for longer
ones. For production-grade latency, switch the Space hardware to a GPU
tier (e.g. ZeroGPU or a paid GPU upgrade).
"""
from __future__ import annotations
import os
import threading
import time
from typing import Any
import gradio as gr
import torch
from peft import PeftModel
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TextIteratorStreamer,
)
# Model identifiers and the optional Hub token are read from the environment,
# each model id falling back to a built-in default when the variable is unset.
BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-Coder-0.5B")
ADAPTER_ID = os.getenv("ADAPTER_ID", "vladpp91/Tett")
# An unset or empty HF_TOKEN collapses to None.
HF_TOKEN = os.getenv("HF_TOKEN") or None

# Choices offered by the "Target language" dropdown.
LANGUAGES = "python javascript typescript go rust".split()

# Initial value of the editable system prompt.
DEFAULT_SYSTEM = " ".join(
    (
        "You are SafeClaw Coder, a privacy-first coding assistant fine-tuned",
        "on a small multilingual code corpus. Reply with concise, runnable code.",
    )
)

print(f"[startup] base={BASE_MODEL} adapter={ADAPTER_ID}")
# Tokenizer and base weights are both loaded from BASE_MODEL.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    token=HF_TOKEN,
    trust_remote_code=True,
)
# Reuse EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    token=HF_TOKEN,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)

# Try to wrap the base model with the LoRA adapter; on any failure keep serving
# the raw base model and record the reason so the UI can display it.
model: Any
try:
    model = PeftModel.from_pretrained(base, ADAPTER_ID, token=HF_TOKEN)
except Exception as exc:  # pragma: no cover — Space-only fallback path
    print(f"[startup] adapter load failed ({exc}); falling back to base model")
    model = base
    adapter_status = (
        f"⚠️ Failed to load adapter `{ADAPTER_ID}`: {exc}. "
        f"Running raw `{BASE_MODEL}`."
    )
else:
    adapter_status = f"LoRA `{ADAPTER_ID}` loaded on top of `{BASE_MODEL}`."

# Inference only: switch the module to evaluation mode.
model.eval()
def _format_prompt(message: str, language: str, system_prompt: str) -> str:
    """Render the chat-template prompt string for one request.

    The user turn is ``message`` preceded by a ``// language: <language>``
    line; ``system_prompt`` becomes the system turn. The tokenizer's chat
    template is applied without tokenizing and with the generation prompt
    appended.
    """
    user_turn = "// language: {}\n{}".format(language, message)
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_turn),
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )
    return rendered
def _generate_stream(
    message: str,
    language: str,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
):
    """Stream a generated completion as progressively longer Markdown.

    ``model.generate`` runs on a background thread while this generator
    drains the ``TextIteratorStreamer`` and, for every new chunk, yields all
    text produced so far wrapped in a code fence tagged with ``language``,
    followed by a character-count / elapsed-time status line.

    Args:
        message: The user's request text.
        language: Target language; used in the prompt and as the fence tag.
        system_prompt: System message passed to the chat template.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: Sampling temperature; ``0`` selects greedy decoding.

    Yields:
        Markdown strings, each containing the full text generated so far.

    Raises:
        gr.Error: If ``model.generate`` raised on the worker thread.
    """
    prompt = _format_prompt(message, language, system_prompt)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    streamer = TextIteratorStreamer(
        tokenizer, skip_special_tokens=True, skip_prompt=True
    )

    temperature = float(temperature)
    do_sample = temperature > 0
    gen_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
    )
    if do_sample:
        # Sampling knobs are only meaningful (and only passed) when sampling;
        # greedy decoding ignores them.
        gen_kwargs.update(temperature=temperature, top_p=0.95)

    failures = []

    def _run_generation():
        """Run generation; on failure record the error and unblock the reader."""
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:  # re-raised on the consumer side below
            failures.append(exc)
            # generate() did not finish, so it never signalled end-of-stream;
            # do it here or the loop below would wait on the queue forever.
            streamer.end()

    thread = threading.Thread(target=_run_generation, daemon=True)
    thread.start()

    accumulated = ""
    # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
    started_at = time.monotonic()
    for new_text in streamer:
        accumulated += new_text
        elapsed = time.monotonic() - started_at
        yield (
            f"```{language}\n{accumulated}\n```\n\n"
            f"_{len(accumulated)} chars · {elapsed:.1f}s elapsed (CPU is slow)._"
        )
    thread.join()

    if failures:
        # Surface the worker-thread error in the UI instead of ending silently.
        raise gr.Error(f"Generation failed: {failures[0]}") from failures[0]
# Build the Gradio UI. Components render in the order they are created, so the
# statement order below is the page layout.
with gr.Blocks(title="SafeClaw Coder LoRA") as demo:
    # Header: title, adapter load status computed at startup, and the CPU
    # latency warning.
    gr.Markdown(
        f"""
# SafeClaw Coder LoRA — public CPU demo
{adapter_status}
Trained as a SafeClaw fine-tuning PoC on
[`bigcode/the-stack-smol-xl`](https://huggingface.co/datasets/bigcode/the-stack-smol-xl)
(python + javascript + typescript + go + rust).
⚠️ This Space runs on a **free CPU instance**. Generation is slow
(~30 s for short answers, several minutes for long ones). The
LoRA adapter only changes ~0.6% of the parameters — the
underlying coding ability still comes from the Qwen base model.
"""
    )
    # Free-text request from the user.
    with gr.Row():
        prompt = gr.Textbox(
            lines=4,
            placeholder="e.g. Write a function that returns the n-th Fibonacci number.",
            label="Prompt",
        )
    # Generation controls.
    # NOTE(review): SOURCE lost its indentation; the three controls are assumed
    # to share this row — confirm against the deployed layout.
    with gr.Row():
        language = gr.Dropdown(
            LANGUAGES, value="python", label="Target language"
        )
        max_new_tokens = gr.Slider(
            minimum=32,
            maximum=512,
            value=128,
            step=16,
            label="max_new_tokens (smaller = faster on CPU)",
        )
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.5,
            value=0.2,
            step=0.05,
            label="temperature",
        )
    # System prompt is editable but collapsed by default.
    with gr.Accordion("System prompt", open=False):
        system_prompt = gr.Textbox(value=DEFAULT_SYSTEM, lines=3, label=" ")
    output = gr.Markdown(label="Output")
    submit = gr.Button("Generate", variant="primary")
    # The input order here must match the parameter order of _generate_stream.
    submit.click(
        _generate_stream,
        inputs=[prompt, language, system_prompt, max_new_tokens, temperature],
        outputs=output,
    )
    # Each example row supplies a (prompt, language) pair for the two inputs.
    gr.Examples(
        examples=[
            ["Write a quicksort that operates in-place on a list of integers.", "python"],
            ["Implement a debounce helper.", "javascript"],
            ["Define a generic LRU cache.", "typescript"],
            ["Write an HTTP middleware that adds a request ID header.", "go"],
            ["Implement a binary search tree with insert and search.", "rust"],
        ],
        inputs=[prompt, language],
    )
if __name__ == "__main__":
    # Enable the request queue with at most 8 waiting requests, then serve on
    # all interfaces at port 7860.
    demo.queue(max_size=8)
    demo.launch(server_port=7860, server_name="0.0.0.0")