Spaces:
Runtime error
Runtime error
| """SafeClaw Coder LoRA — public Gradio demo on a free CPU Space. | |
| Loads the base model named by ``BASE_MODEL`` (default ``Qwen/Qwen2.5-Coder-0.5B``) and attaches the LoRA adapter | |
| ``vladpp91/Tett`` on top. Trained as a SafeClaw fine-tuning PoC on | |
| ``bigcode/the-stack-smol-xl`` (python + javascript + typescript + go + rust). | |
| Note: this Space runs on a free CPU instance (no GPU). A 1.5B model is | |
| slow on CPU — expect ~30 s for short prompts to a few minutes for longer | |
| ones. For production-grade latency, switch the Space hardware to a GPU | |
| tier (e.g. ZeroGPU or a paid GPU upgrade). | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import threading | |
| import time | |
| from typing import Any | |
| import gradio as gr | |
| import torch | |
| from peft import PeftModel | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| TextIteratorStreamer, | |
| ) | |
# --- Configuration -----------------------------------------------------------
# Model identifiers can be overridden through environment variables; the
# literals below are the defaults used when a variable is not set.
BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-Coder-0.5B")
ADAPTER_ID = os.getenv("ADAPTER_ID", "vladpp91/Tett")

# An unset *or empty* HF_TOKEN is normalised to None.
HF_TOKEN = os.getenv("HF_TOKEN", "") or None

# Choices offered in the "Target language" dropdown.
LANGUAGES = [
    "python",
    "javascript",
    "typescript",
    "go",
    "rust",
]

# Initial content of the editable "System prompt" textbox.
DEFAULT_SYSTEM = (
    "You are SafeClaw Coder, a privacy-first coding assistant "
    "fine-tuned on a small multilingual code corpus. "
    "Reply with concise, runnable code."
)
# --- Startup: load tokenizer, base model and (best effort) the LoRA adapter ---
print(f"[startup] base={BASE_MODEL} adapter={ADAPTER_ID}")

tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    token=HF_TOKEN,
    trust_remote_code=True,
)
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# `model` is either the adapter-wrapped model or, if the adapter cannot be
# loaded, the plain base model. `adapter_status` is the Markdown line shown in
# the page header describing which of the two is active.
model: Any
try:
    model = PeftModel.from_pretrained(base, ADAPTER_ID, token=HF_TOKEN)
except Exception as exc:  # pragma: no cover — Space-only fallback path
    # Deliberate best-effort: keep the demo usable with the raw base model.
    print(f"[startup] adapter load failed ({exc}); falling back to base model")
    model = base
    adapter_status = (
        f"⚠️ Failed to load adapter `{ADAPTER_ID}`: {exc}. "
        f"Running raw `{BASE_MODEL}`."
    )
else:
    adapter_status = f"LoRA `{ADAPTER_ID}` loaded on top of `{BASE_MODEL}`."

model.eval()
def _format_prompt(message: str, language: str, system_prompt: str) -> str:
    """Render a system + user exchange through the tokenizer's chat template.

    The user turn is prefixed with a ``// language: <language>`` line so the
    requested target language travels with the request text.

    Args:
        message: The user's request.
        language: Target language name placed in the prefix line.
        system_prompt: Content of the system turn.

    Returns:
        The untokenized prompt string, ending with the generation prompt
        appended by ``apply_chat_template``.
    """
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content="// language: {}\n{}".format(language, message)),
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )
    return rendered
def _generate_stream(
    message: str,
    language: str,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
):
    """Stream a completion for *message* as progressively longer Markdown.

    Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``; this generator consumes the streamer and, after
    every received chunk, yields the full text so far wrapped in a fenced code
    block plus a character-count / elapsed-time footer.

    Args:
        message: The user's request. A blank message short-circuits with a
            hint instead of starting a generation.
        language: Target language; used in the prompt prefix and as the
            code-fence info string.
        system_prompt: System turn content passed to ``_format_prompt``.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: ``> 0`` enables sampling (with ``top_p=0.95``); ``0``
            selects greedy decoding.

    Yields:
        Markdown strings, each containing everything generated so far.

    Raises:
        gr.Error: If ``model.generate`` fails in the worker thread.
    """
    # Nothing to generate from; avoid occupying the queue with an empty job.
    if not (message or "").strip():
        yield "_Please enter a prompt first._"
        return

    prompt = _format_prompt(message, language, system_prompt)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    streamer = TextIteratorStreamer(
        tokenizer, skip_special_tokens=True, skip_prompt=True
    )

    gen_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        pad_token_id=tokenizer.pad_token_id,
    )
    if temperature > 0:
        # Sampling-only knobs are passed only when sampling is enabled, so
        # greedy runs do not carry unused temperature/top_p settings.
        gen_kwargs.update(
            do_sample=True,
            temperature=max(float(temperature), 1e-5),
            top_p=0.95,
        )
    else:
        gen_kwargs["do_sample"] = False

    failures = []

    def _run_generation() -> None:
        """Run ``model.generate`` and always unblock the consumer on failure."""
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:
            # Without this the streamer never receives its stop signal and the
            # consuming loop below would block indefinitely.
            failures.append(exc)
            streamer.end()

    thread = threading.Thread(target=_run_generation, daemon=True)
    thread.start()

    accumulated = ""
    # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
    started_at = time.monotonic()
    for new_text in streamer:
        accumulated += new_text
        elapsed = time.monotonic() - started_at
        yield (
            f"```{language}\n{accumulated}\n```\n\n"
            f"_{len(accumulated)} chars · {elapsed:.1f}s elapsed (CPU is slow)._"
        )
    thread.join()

    if failures:
        # Surface the worker's error in the UI instead of ending silently.
        raise gr.Error(f"Generation failed: {failures[0]}") from failures[0]
# --- UI definition -----------------------------------------------------------
# Components are created in display order: header, prompt, generation controls,
# collapsible system prompt, output area, submit button, clickable examples.
with gr.Blocks(title="SafeClaw Coder LoRA") as demo:
    # Header; `adapter_status` reports whether the LoRA adapter was loaded.
    gr.Markdown(
        value=f"""
# SafeClaw Coder LoRA — public CPU demo
{adapter_status}
Trained as a SafeClaw fine-tuning PoC on
[`bigcode/the-stack-smol-xl`](https://huggingface.co/datasets/bigcode/the-stack-smol-xl)
(python + javascript + typescript + go + rust).
⚠️ This Space runs on a **free CPU instance**. Generation is slow
(~30 s for short answers, several minutes for long ones). The
LoRA adapter only changes ~0.6% of the parameters — the
underlying coding ability still comes from the Qwen base model.
"""
    )

    with gr.Row():
        prompt = gr.Textbox(
            label="Prompt",
            placeholder="e.g. Write a function that returns the n-th Fibonacci number.",
            lines=4,
        )

    with gr.Row():
        language = gr.Dropdown(
            choices=LANGUAGES,
            value="python",
            label="Target language",
        )
        max_new_tokens = gr.Slider(
            label="max_new_tokens (smaller = faster on CPU)",
            value=128,
            minimum=32,
            maximum=512,
            step=16,
        )
        temperature = gr.Slider(
            label="temperature",
            value=0.2,
            minimum=0.0,
            maximum=1.5,
            step=0.05,
        )

    with gr.Accordion(label="System prompt", open=False):
        system_prompt = gr.Textbox(label=" ", value=DEFAULT_SYSTEM, lines=3)

    output = gr.Markdown(label="Output")
    submit = gr.Button(value="Generate", variant="primary")

    # `_generate_stream` is a generator, so `output` is updated on every yield.
    submit.click(
        fn=_generate_stream,
        inputs=[prompt, language, system_prompt, max_new_tokens, temperature],
        outputs=output,
    )

    # Each example fills the prompt box and the language dropdown.
    gr.Examples(
        inputs=[prompt, language],
        examples=[
            ["Write a quicksort that operates in-place on a list of integers.", "python"],
            ["Implement a debounce helper.", "javascript"],
            ["Define a generic LRU cache.", "typescript"],
            ["Write an HTTP middleware that adds a request ID header.", "go"],
            ["Implement a binary search tree with insert and search.", "rust"],
        ],
    )
if __name__ == "__main__":
    # Allow at most 8 queued requests, then serve on all interfaces, port 7860.
    demo.queue(max_size=8)
    demo.launch(server_name="0.0.0.0", server_port=7860)