"""Gradio front-end for Lumen: a LoRA-tuned Qwen2.5-Coder chat assistant."""

from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Base checkpoint and the LoRA adapter fine-tuned on top of it.
BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
LORA_REPO = "alxstuff/Lumen-7b-v2"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

print("Loading base model...")
# fp16 weights with device_map="auto" let accelerate place shards across the
# available GPU(s)/CPU; low_cpu_mem_usage avoids materializing a full copy in RAM.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

print("Loading LoRA adapter...")
model.load_adapter(LORA_REPO)
model.eval()
print("✅ Lumen ready!")
def chat(message, history):
    """Stream a Lumen reply for a Gradio ChatInterface turn.

    Builds a ChatML prompt from the conversation so far, launches generation
    on a background thread, and yields the growing response as tokens arrive.

    Args:
        message: The latest user message (str).
        history: Prior turns. Supports both Gradio formats: legacy "tuples"
            ``[(user, assistant), ...]`` and the newer "messages" format
            ``[{"role": ..., "content": ...}, ...]``.

    Yields:
        str: The assistant response accumulated so far (Gradio re-renders
        the full string on each yield).
    """
    prompt = "<|im_start|>system\nYou are Lumen, an expert AI coding assistant built by TheAlxLabs. You write clean, efficient code and explain it clearly.<|im_end|>\n"
    for turn in history:
        if isinstance(turn, dict):
            # Gradio "messages" format: one dict per message.
            prompt += f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
        else:
            # Legacy "tuples" format: one (user, assistant) pair per turn.
            user, assistant = turn
            prompt += f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n{assistant}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(target=model.generate, kwargs={
        **inputs,
        "streamer": streamer,
        "max_new_tokens": 1024,
        "temperature": 0.2,
        "do_sample": True,
        # Avoid the per-call "pad_token_id not set" warning from generate().
        "pad_token_id": tokenizer.eos_token_id,
    })
    thread.start()
    response = ""
    for token in streamer:
        response += token
        yield response
    # The streamer is exhausted once generate() finishes; reap the thread.
    thread.join()
# Wire the streaming chat function into a Gradio chat UI and start the server.
demo = gr.ChatInterface(
    fn=chat,
    title="⚡ Lumen — AI Coding Assistant",
    description="Local-first AI coding assistant by TheAlxLabs.",
    examples=[
        "Write a Python function to reverse a linked list",
        "Explain what this does: `[x for x in range(10) if x % 2 == 0]`",
        "Fix this bug: TypeError: 'NoneType' object is not subscriptable",
    ],
)
demo.launch()