"""SafeClaw Coder LoRA — public Gradio demo on a free CPU Space. Loads ``Qwen/Qwen2.5-Coder-1.5B`` and attaches the LoRA adapter ``vladpp91/Tett`` on top. Trained as a SafeClaw fine-tuning PoC on ``bigcode/the-stack-smol-xl`` (python + javascript + typescript + go + rust). Note: this Space runs on a free CPU instance (no GPU). A 1.5B model is slow on CPU — expect ~30 s for short prompts to a few minutes for longer ones. For production-grade latency, switch the Space hardware to a GPU tier (e.g. ZeroGPU or a paid GPU upgrade). """ from __future__ import annotations import os import threading import time from typing import Any import gradio as gr import torch from peft import PeftModel from transformers import ( AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, ) BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-0.5B") ADAPTER_ID = os.environ.get("ADAPTER_ID", "vladpp91/Tett") HF_TOKEN = os.environ.get("HF_TOKEN") or None LANGUAGES = ["python", "javascript", "typescript", "go", "rust"] DEFAULT_SYSTEM = ( "You are SafeClaw Coder, a privacy-first coding assistant fine-tuned " "on a small multilingual code corpus. Reply with concise, runnable code." ) print(f"[startup] base={BASE_MODEL} adapter={ADAPTER_ID}") tokenizer = AutoTokenizer.from_pretrained( BASE_MODEL, trust_remote_code=True, token=HF_TOKEN ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token base = AutoModelForCausalLM.from_pretrained( BASE_MODEL, trust_remote_code=True, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, token=HF_TOKEN, ) try: model: Any = PeftModel.from_pretrained(base, ADAPTER_ID, token=HF_TOKEN) adapter_status = f"LoRA `{ADAPTER_ID}` loaded on top of `{BASE_MODEL}`." except Exception as exc: # pragma: no cover — Space-only fallback path print(f"[startup] adapter load failed ({exc}); falling back to base model") model = base adapter_status = ( f"⚠️ Failed to load adapter `{ADAPTER_ID}`: {exc}. " f"Running raw `{BASE_MODEL}`." ) model.eval() def _format_prompt(message: str, language: str, system_prompt: str) -> str: user_block = f"// language: {language}\n{message}" messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_block}, ] return tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) def _generate_stream( message: str, language: str, system_prompt: str, max_new_tokens: int, temperature: float, ): prompt = _format_prompt(message, language, system_prompt) inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024) streamer = TextIteratorStreamer( tokenizer, skip_special_tokens=True, skip_prompt=True ) gen_kwargs = dict( **inputs, streamer=streamer, max_new_tokens=int(max_new_tokens), do_sample=temperature > 0, temperature=max(float(temperature), 1e-5), top_p=0.95, pad_token_id=tokenizer.pad_token_id, ) thread = threading.Thread(target=model.generate, kwargs=gen_kwargs, daemon=True) thread.start() accumulated = "" started_at = time.time() for new_text in streamer: accumulated += new_text elapsed = time.time() - started_at yield ( f"```{language}\n{accumulated}\n```\n\n" f"_{len(accumulated)} chars · {elapsed:.1f}s elapsed (CPU is slow)._" ) thread.join() with gr.Blocks(title="SafeClaw Coder LoRA") as demo: gr.Markdown( f""" # SafeClaw Coder LoRA — public CPU demo {adapter_status} Trained as a SafeClaw fine-tuning PoC on [`bigcode/the-stack-smol-xl`](https://huggingface.co/datasets/bigcode/the-stack-smol-xl) (python + javascript + typescript + go + rust). ⚠️ This Space runs on a **free CPU instance**. Generation is slow (~30 s for short answers, several minutes for long ones). The LoRA adapter only changes ~0.6% of the parameters — the underlying coding ability still comes from the Qwen base model. """ ) with gr.Row(): prompt = gr.Textbox( lines=4, placeholder="e.g. Write a function that returns the n-th Fibonacci number.", label="Prompt", ) with gr.Row(): language = gr.Dropdown( LANGUAGES, value="python", label="Target language" ) max_new_tokens = gr.Slider( minimum=32, maximum=512, value=128, step=16, label="max_new_tokens (smaller = faster on CPU)", ) temperature = gr.Slider( minimum=0.0, maximum=1.5, value=0.2, step=0.05, label="temperature", ) with gr.Accordion("System prompt", open=False): system_prompt = gr.Textbox(value=DEFAULT_SYSTEM, lines=3, label=" ") output = gr.Markdown(label="Output") submit = gr.Button("Generate", variant="primary") submit.click( _generate_stream, inputs=[prompt, language, system_prompt, max_new_tokens, temperature], outputs=output, ) gr.Examples( examples=[ ["Write a quicksort that operates in-place on a list of integers.", "python"], ["Implement a debounce helper.", "javascript"], ["Define a generic LRU cache.", "typescript"], ["Write an HTTP middleware that adds a request ID header.", "go"], ["Implement a binary search tree with insert and search.", "rust"], ], inputs=[prompt, language], ) if __name__ == "__main__": demo.queue(max_size=8).launch(server_name="0.0.0.0", server_port=7860)