Remostart committed
Commit 68ff241 · verified · 1 Parent(s): 2ad0f54

Fix tokenizer size mismatch
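The fine-tuned repo apparently ships a tokenizer whose vocabulary size differs from the base Llama-3.2-3B-Instruct embedding matrix, so attaching the PEFT adapter failed with a size mismatch; the change below resizes the base model's token embeddings to len(tokenizer) before loading the adapter (a minimal sketch of that check follows the diff).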

Files changed (1)
app.py  +41 -19
app.py CHANGED
@@ -2,35 +2,57 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 import torch
+import os
+from huggingface_hub import login
 
-# Replace with your model repository ID
-model_repo_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"
+# Authenticate with Hugging Face
+hf_token = os.getenv("HF_TOKEN")
+login(token=hf_token)
 
-# Load the tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_repo_id)
+# Model repository IDs
+base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
+peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"
 
-# Load the base model and apply the PEFT adapter
+# Load the tokenizer from the fine-tuned model
+tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
+
+# Load the base model
 base_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.2-3B-Instruct",
+    base_model_id,
     torch_dtype=torch.float16,
-    device_map="auto"
+    device_map="auto",
+    token=hf_token,
+    low_cpu_mem_usage=True,
+    trust_remote_code=True
 )
-model = PeftModel.from_pretrained(base_model, model_repo_id)
 
-# Define the prediction function
-def predict(text):
-    inputs = tokenizer(text, return_tensors="pt").to("cuda")
-    outputs = model.generate(**inputs, max_length=100)  # Adjust parameters as needed
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+# Resize the base model's embeddings to match the fine-tuned tokenizer
+base_model.resize_token_embeddings(len(tokenizer))
+
+# Load the PEFT adapter
+model = PeftModel.from_pretrained(base_model, peft_model_id, token=hf_token)
+
+# Define the prediction function with chat template
+def predict(text, max_length=100):
+    try:
+        messages = [{"role": "user", "content": text}]
+        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
+        outputs = model.generate(inputs, max_length=max_length)
+        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+    except Exception as e:
+        return f"Error during inference: {str(e)}"
 
-# Create Gradio interface
+# Create Gradio interface for ZeroGPU
 demo = gr.Interface(
     fn=predict,
-    inputs=gr.Textbox(label="Input Text"),
+    inputs=[
+        gr.Textbox(label="Input Text"),
+        gr.Slider(label="Max Length", minimum=50, maximum=500, value=100, step=1)
+    ],
     outputs=gr.Textbox(label="Model Output"),
-    title="My Model Demo",
-    description="Test the fine-tuned model hosted on Hugging Face."
+    title="LearnPlutus Demo",
+    description="Test the fine-tuned Llama-3.2-3B-Instruct model on ZeroGPU.",
+    allow_flagging="never"
 )
 
-# Launch the app
-demo.launch()
+demo.launch(server_name="0.0.0.0", server_port=7860)
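
For reference, a minimal sketch of why the resize is needed, assuming the repository IDs from the diff and an HF_TOKEN in the environment; the explicit size comparison and variable names are illustrative and not part of app.py:

import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

hf_token = os.getenv("HF_TOKEN")

# Tokenizer from the fine-tuned repo, weights from the base checkpoint (IDs taken from the diff).
tokenizer = AutoTokenizer.from_pretrained("ubiodee/Plutuslearn-Llama-3.2-3B-Instruct", token=hf_token)
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct", torch_dtype=torch.float16, token=hf_token
)

vocab_size = len(tokenizer)                                      # tokens the fine-tuned tokenizer defines
embed_rows = base_model.get_input_embeddings().weight.shape[0]   # rows in the base embedding matrix

# If the two differ, loading the adapter can fail with a size-mismatch error on the
# embedding weights; resizing first is exactly what this commit adds to app.py.
if vocab_size != embed_rows:
    base_model.resize_token_embeddings(vocab_size)

model = PeftModel.from_pretrained(base_model, "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct", token=hf_token)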