Sumit404 committed on
Commit
ba22c0b
·
verified ·
1 Parent(s): 9593f2c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -0
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from unsloth import FastLanguageModel
3
+ from peft import PeftModel
4
+ import torch
5
+
6
# --- Model & tokenizer setup -------------------------------------------------
# Configuration for the 4-bit quantized base model.
max_seq_length = 4096      # context window passed to unsloth
dtype = None               # let unsloth pick the dtype automatically
load_in_4bit = True        # bnb 4-bit quantization

# Load the quantized Llama-3.2 base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attach the fine-tuned LoRA adapters on top of the base weights.
LORA_ADAPTER_PATH = "Sumit404/Llama-3.2-3B-Instruct-bnb-4bit-finetuned"  # Replace with your repo ID
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)

# Install the Llama-3.2 chat template on the tokenizer so prompts are
# formatted the same way they were during fine-tuning.
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(tokenizer, chat_template="llama-3.2")
# No dedicated pad token on this tokenizer; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token

# Switch unsloth's fast inference path on.
FastLanguageModel.for_inference(model)
31
def generate_text(prompt):
    """Generate a reply from the fine-tuned model for a single user prompt.

    Args:
        prompt: The user's question/instruction as a plain string.

    Returns:
        The assistant's decoded reply (newly generated tokens only).
    """
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True yields a proper attention_mask alongside input_ids.
    # The previous hand-rolled mask (inputs != tokenizer.pad_token_id) was
    # wrong because pad_token is set to eos_token earlier in this file, so
    # genuine EOS-family tokens in the prompt were masked out as padding.
    # Also follow the model's actual device instead of hard-coding "cuda",
    # so the app still runs on CPU-only hosts.
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        padding=True,
    ).to(model.device)

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=128,  # Increased output length for potentially longer answers
        use_cache=True,
        temperature=0.6,
        min_p=0.1,
    )

    # Decode only the tokens generated after the prompt. The old code
    # searched the decoded text for "<|start_header_id|>assistant<|end_header_id|>",
    # but skip_special_tokens=True strips those special tokens, so the search
    # never matched and the whole conversation (user prompt included) was
    # returned. Slicing by prompt length extracts the assistant reply reliably.
    prompt_len = encoded["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
59
+
60
# --- Gradio UI ---------------------------------------------------------------
# Wire the generation function into a simple text-in / text-out interface.
prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here...")

interface = gr.Interface(
    fn=generate_text,
    inputs=prompt_box,
    outputs="text",
    title="Fine-tuned Llama-3.2 Instruct Model",
    description="Ask a question to the fine-tuned model.",
)

# share=True creates a public tunnel URL (needed when running in Colab).
interface.launch(share=True)