"""Lumen — a streaming Gradio chat UI around Qwen2.5-Coder-7B with a LoRA adapter."""
import gradio as gr
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
LORA_REPO = "alxstuff/Lumen-7b-v2"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

print("Loading LoRA adapter...")
model.load_adapter(LORA_REPO)
model.eval()
print("✅ Lumen ready!")


def _history_pairs(history):
    """Normalize Gradio chat history into (user, assistant) tuples.

    Gradio delivers history either as ``[(user, assistant), ...]`` (legacy
    ``type="tuples"``) or as ``[{"role": ..., "content": ...}, ...]``
    (``type="messages"``, the Gradio 5 default). The original code only
    handled tuples and crashed with ValueError on the dict format; support
    both so the app works across Gradio versions.
    """
    if not history:
        return []
    if isinstance(history[0], dict):
        pairs, pending_user = [], None
        for msg in history:
            if msg.get("role") == "user":
                pending_user = msg.get("content", "")
            elif msg.get("role") == "assistant" and pending_user is not None:
                pairs.append((pending_user, msg.get("content", "")))
                pending_user = None
        return pairs
    return [tuple(turn) for turn in history]


def chat(message, history):
    """Generate a streamed reply to *message* given prior *history*.

    Builds a ChatML prompt by hand (Qwen's <|im_start|>/<|im_end|> format),
    runs ``model.generate`` in a background thread, and yields the
    cumulative response text as tokens arrive — the shape Gradio's
    ChatInterface expects from a generator ``fn``.
    """
    prompt = "<|im_start|>system\nYou are Lumen, an expert AI coding assistant built by TheAlxLabs. You write clean, efficient code and explain it clearly.<|im_end|>\n"
    for user, assistant in _history_pairs(history):
        prompt += f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n{assistant}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    thread = Thread(
        target=model.generate,
        kwargs={
            **inputs,
            "streamer": streamer,
            "max_new_tokens": 1024,
            "temperature": 0.2,
            "do_sample": True,
            # Explicit pad id avoids the per-call "pad_token_id not set" warning.
            "pad_token_id": tokenizer.eos_token_id,
        },
    )
    thread.start()
    try:
        response = ""
        for token in streamer:
            response += token
            yield response
    finally:
        # Reap the generation thread so each request doesn't leak a thread
        # (the original never joined it).
        thread.join()


gr.ChatInterface(
    fn=chat,
    title="⚡ Lumen — AI Coding Assistant",
    description="Local-first AI coding assistant by TheAlxLabs.",
    examples=[
        "Write a Python function to reverse a linked list",
        "Explain what this does: `[x for x in range(10) if x % 2 == 0]`",
        "Fix this bug: TypeError: 'NoneType' object is not subscriptable",
    ],
).launch()