import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

model_name = "HuggingFaceH4/zephyr-7b-beta"
adapter_path = "zephyr_lora_adapter"

# Load the tokenizer saved alongside the LoRA adapter so any tokens added during
# fine-tuning match what the adapter was trained with.
tokenizer = AutoTokenizer.from_pretrained(adapter_path)

# 4-bit NF4 quantization with double quantization keeps the 7B base model within
# consumer-GPU memory; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Attach the fine-tuned LoRA weights on top of the quantized base model.
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval()


def solve_math(question, max_tokens=512):
    # Zephyr's chat format: the trailing assistant marker cues the model to answer.
    prompt = f"<|user|>\n{question}\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,  # greedy decoding for reproducible step-by-step solutions
            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens; slicing off the prompt is more robust
    # than splitting the decoded text on the assistant marker.
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


demo = gr.Interface(
    fn=solve_math,
    inputs=gr.Textbox(lines=5, label="Enter math problem"),
    outputs=gr.Textbox(label="Solution"),
    title="Math Solver (Zephyr Fine-Tuned)",
    description="This app uses a fine-tuned LLM to solve school-level math problems step by step.",
)

demo.launch()