umarfarzan committed on
Commit
4e98186
·
verified ·
1 Parent(s): 0c7210a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -68
app.py CHANGED
@@ -1,91 +1,61 @@
1
  import gradio as gr
 
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
4
- from peft import PeftModel
5
- import time
6
 
7
  # ----------------------------
8
- # 🔹 Load base model + LoRA weights
9
  # ----------------------------
10
- BASE_MODEL = "unsloth/qwen2.5-7b" # Original base model
11
- LORA_WEIGHTS = "umarfarzan/my-finetuned-model2-lora"
 
 
 
 
 
12
 
13
- device = "cuda" if torch.cuda.is_available() else "cpu"
14
-
15
- @torch.inference_mode()
16
- def load_model():
17
- print("Loading base model...")
18
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
19
- model = AutoModelForCausalLM.from_pretrained(
20
- BASE_MODEL,
21
- device_map={"": device},
22
- torch_dtype=torch.float32
23
- )
24
- print("Applying LoRA weights...")
25
- model = PeftModel.from_pretrained(model, LORA_WEIGHTS, device_map={"": device})
26
- model.eval()
27
- print("✅ Model loaded successfully!")
28
- return model, tokenizer
29
-
30
- model, tokenizer = load_model()
31
 
32
  # ----------------------------
33
- # 🔹 Generation function
34
  # ----------------------------
35
- def generate_training_program(instruction, max_tokens=500, temperature=0.7, top_p=0.9):
36
- prompt_text = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
37
  ### Instruction:
38
- {instruction}
 
 
 
 
39
  ### Response:
40
- """
41
- inputs = tokenizer([prompt_text], return_tensors="pt").to(device)
42
 
43
- start_time = time.time()
 
 
44
  outputs = model.generate(
45
  **inputs,
46
- max_new_tokens=max_tokens,
47
- temperature=temperature,
48
- top_p=top_p,
49
  do_sample=True,
50
  use_cache=True
51
  )
52
- gen_time = time.time() - start_time
53
- generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
54
-
55
- if "### Response:" in generated_text:
56
- response = generated_text.split("### Response:")[-1].strip()
57
- else:
58
- response = generated_text
59
-
60
- return response, f"⏱️ Generated in {gen_time:.2f} seconds"
61
 
62
  # ----------------------------
63
- # 🔹 Gradio UI
64
  # ----------------------------
65
- examples = [
66
- ["Design a 1-week training program 'The Leader's Blueprint' for mid-level managers and team leads."],
67
- ["Create a 3-day workshop on effective communication for remote teams."],
68
- ["Develop a 5-day leadership bootcamp for new managers."],
69
- ["Design a half-day data-driven decision-making session for executives."],
70
- ["Create a 2-week onboarding program for new software engineers."]
71
- ]
72
-
73
  with gr.Blocks() as demo:
74
- gr.HTML("<h1 style='text-align:center'>🎯 AI Training Program Generator</h1>")
75
- instruction_input = gr.Textbox(label="📝 Training Program Description", lines=5)
76
- max_tokens_slider = gr.Slider(100, 8000, value=500, step=100, label="Max Output Length")
77
- temperature_slider = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity (Temperature)")
78
- top_p_slider = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Diversity (Top-p)")
79
- generate_btn = gr.Button("🚀 Generate Training Program")
80
- output_text = gr.Textbox(label="📋 Generated Training Program", lines=25, show_copy_button=True)
81
- generation_info = gr.Textbox(label="ℹ️ Generation Info", interactive=False, show_label=False)
82
-
83
- generate_btn.click(
84
- generate_training_program,
85
- inputs=[instruction_input, max_tokens_slider, temperature_slider, top_p_slider],
86
- outputs=[output_text, generation_info]
87
  )
88
 
89
- gr.Examples(examples=examples, inputs=instruction_input)
90
-
91
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
+ from unsloth import FastLanguageModel
3
  import torch
 
 
 
4
 
5
  # ----------------------------
6
+ # Load LoRA-finetuned model
7
  # ----------------------------
8
+ max_seq_length = 1024
9
+ model, tokenizer = FastLanguageModel.from_pretrained(
10
+ model_name="umarfarzan/my-finetuned-model2-lora",
11
+ max_seq_length=max_seq_length,
12
+ dtype=None,
13
+ load_in_4bit=True # still works on CPU with int4 quantization
14
+ )
15
 
16
+ FastLanguageModel.for_inference(model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  # ----------------------------
19
+ # Inference function
20
  # ----------------------------
21
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
22
+
23
  ### Instruction:
24
+ {}
25
+
26
+ ### Input:
27
+ {}
28
+
29
  ### Response:
30
+ {}"""
 
31
 
32
+ def generate_response(instruction, input_text=""):
33
+ prompt = alpaca_prompt.format(instruction, input_text, "")
34
+ inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
35
  outputs = model.generate(
36
  **inputs,
37
+ max_new_tokens=512,
38
+ temperature=0.7,
39
+ top_p=0.9,
40
  do_sample=True,
41
  use_cache=True
42
  )
43
+ return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
 
 
 
 
 
 
 
 
44
 
45
# ----------------------------
# Gradio UI
# ----------------------------
# Two text inputs (instruction + optional context), one output box, and a
# button wired to generate_response.
with gr.Blocks() as demo:
    gr.Markdown("## LoRA Qwen2.5-7B Demo (CPU)")
    instruction_box = gr.Textbox(label="Instruction", lines=3)
    context_box = gr.Textbox(label="Input (Optional)", lines=2)
    result_box = gr.Textbox(label="Output", lines=10)
    run_button = gr.Button("Generate")

    run_button.click(
        generate_response,
        inputs=[instruction_box, context_box],
        outputs=result_box,
    )

demo.launch()