import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
LORA_PATH = "./"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token

# Base model (CPU)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,
    device_map={"": "cpu"},
    low_cpu_mem_usage=True,
)

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, LORA_PATH)
model.eval()


def chat(user_prompt, max_tokens, temperature):
    prompt = f"""You are a lab assistant. Answer in **Markdown** format.
Use headings, bullet points, and code blocks when appropriate.

Question: {user_prompt}

Answer:
"""
    inputs = tokenizer(prompt, return_tensors="pt")

    gen_kwargs = dict(
        max_new_tokens=int(max_tokens),
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Greedy decoding (do_sample=False) is fastest on CPU; switch to sampling
    # only when the user raises the Temperature slider above its minimum, so
    # the slider actually takes effect instead of being silently ignored.
    if temperature > 0.1:
        gen_kwargs.update(do_sample=True, temperature=float(temperature))
    else:
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        output = model.generate(**inputs, **gen_kwargs)

    # Strip the prompt tokens so only the newly generated answer is returned
    generated = output[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)


# Gradio UI
demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(lines=5, label="Prompt"),
        gr.Slider(32, 512, value=256, step=32, label="Max tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs=gr.Markdown(label="Answer"),
    title="DeepSeek Lab Assistant (LoRA)",
)

if __name__ == "__main__":
    demo.launch()
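
# Quick sanity check without the web UI (a sketch, assuming this file is
# saved as app.py and the LoRA adapter files live next to it in LORA_PATH;
# importing the module loads the model but does not call demo.launch()):
#
#   python -c "from app import chat; print(chat('Summarize PCR in 3 bullets.', 128, 0.1))"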