Spaces: Runtime error

Update app.py

app.py CHANGED
@@ -1,91 +1,61 @@
 import gradio as gr
+from unsloth import FastLanguageModel
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from peft import PeftModel
-import time

 # ----------------------------
-#
+# Load LoRA-finetuned model
 # ----------------------------
-
-
+max_seq_length = 1024
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="umarfarzan/my-finetuned-model2-lora",
+    max_seq_length=max_seq_length,
+    dtype=None,
+    load_in_4bit=True  # still works on CPU with int4 quantization
+)

-
-
-@torch.inference_mode()
-def load_model():
-    print("Loading base model...")
-    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-    model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        device_map={"": device},
-        torch_dtype=torch.float32
-    )
-    print("Applying LoRA weights...")
-    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, device_map={"": device})
-    model.eval()
-    print("✅ Model loaded successfully!")
-    return model, tokenizer
-
-model, tokenizer = load_model()
+FastLanguageModel.for_inference(model)

 # ----------------------------
-#
+# Inference function
 # ----------------------------
-
-
+alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
 ### Instruction:
-{
+{}
+
+### Input:
+{}
+
 ### Response:
-"""
-    inputs = tokenizer([prompt_text], return_tensors="pt").to(device)
+{}"""

-
+def generate_response(instruction, input_text=""):
+    prompt = alpaca_prompt.format(instruction, input_text, "")
+    inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
     outputs = model.generate(
         **inputs,
-        max_new_tokens=
-        temperature=
-        top_p=
+        max_new_tokens=512,
+        temperature=0.7,
+        top_p=0.9,
         do_sample=True,
         use_cache=True
     )
-
-    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
-
-    if "### Response:" in generated_text:
-        response = generated_text.split("### Response:")[-1].strip()
-    else:
-        response = generated_text
-
-    return response, f"⏱️ Generated in {gen_time:.2f} seconds"
+    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

 # ----------------------------
-#
+# Gradio UI
 # ----------------------------
-examples = [
-    ["Design a 1-week training program 'The Leader's Blueprint' for mid-level managers and team leads."],
-    ["Create a 3-day workshop on effective communication for remote teams."],
-    ["Develop a 5-day leadership bootcamp for new managers."],
-    ["Design a half-day data-driven decision-making session for executives."],
-    ["Create a 2-week onboarding program for new software engineers."]
-]
-
 with gr.Blocks() as demo:
-    gr.
-    instruction_input = gr.Textbox(label="
-
-
-
-
-
-
-
-
-        generate_training_program,
-        inputs=[instruction_input, max_tokens_slider, temperature_slider, top_p_slider],
-        outputs=[output_text, generation_info]
+    gr.Markdown("## LoRA Qwen2.5-7B Demo (CPU)")
+    instruction_input = gr.Textbox(label="Instruction", lines=3)
+    context_input = gr.Textbox(label="Input (Optional)", lines=2)
+    output_box = gr.Textbox(label="Output", lines=10)
+    submit_btn = gr.Button("Generate")
+
+    submit_btn.click(
+        generate_response,
+        inputs=[instruction_input, context_input],
+        outputs=output_box
     )

-
-
-demo.launch(server_name="0.0.0.0", server_port=7860)
+demo.launch()
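
One behavioral difference worth noting: the removed generate function stripped everything before the "### Response:" marker before returning, while the new generate_response returns the full decoded text, prompt included. A minimal post-processing sketch that reapplies the old split logic (the helper name is ours, not part of app.py):

def extract_response(generated_text: str) -> str:
    # Keep only the text after the last "### Response:" marker,
    # mirroring the split the previous version of app.py performed.
    if "### Response:" in generated_text:
        return generated_text.split("### Response:")[-1].strip()
    return generated_text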
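
The removed version also shipped a list of example prompts that the new Blocks UI drops. If they are still wanted, Gradio's gr.Examples component can populate the instruction box; a sketch, assuming the widget names from the new app.py and reusing the old example strings:

# Inside `with gr.Blocks() as demo:`, after the widgets are defined.
gr.Examples(
    examples=[
        ["Design a 1-week training program 'The Leader's Blueprint' for mid-level managers and team leads."],
        ["Create a 3-day workshop on effective communication for remote teams."],
        ["Develop a 5-day leadership bootcamp for new managers."],
        ["Design a half-day data-driven decision-making session for executives."],
        ["Create a 2-week onboarding program for new software engineers."],
    ],
    inputs=[instruction_input],
)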
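
Since the Space currently reports a runtime error, the inference path can be smoke-tested without the UI by calling generate_response once at the bottom of app.py; a sketch (the __main__ guard is our addition, app.py currently calls demo.launch() unconditionally):

if __name__ == "__main__":
    # Exercise the model once before starting the Gradio server.
    print(generate_response(
        "Create a 2-week onboarding program for new software engineers."
    ))
    demo.launch()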