Remostart committed
Commit ffa43a7 · verified · Parent: a3a1052

Update app.py

Files changed (1): app.py (+11 -10)
app.py CHANGED
@@ -4,7 +4,7 @@ from peft import PeftModel
 import torch
 import os
 from huggingface_hub import login
-import spaces  # Required for @spaces.GPU decorator
+import spaces  # Required for ZeroGPU
 
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
@@ -12,33 +12,34 @@ login(token=hf_token)
 
 # Model repository IDs
 base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
-peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"  # Replace with your fine-tuned model repo (e.g., ubiodee/my-finetuned-model)
+peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"  # Replace with your model repo (e.g., ubiodee/my-finetuned-model)
 
 # Load the tokenizer from the fine-tuned model
 tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
 
-# Load the base model (ZeroGPU handles device placement automatically)
+# Load the base model
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_id,
     torch_dtype=torch.float16,
-    device_map="auto",  # Let ZeroGPU/accelerate handle GPU placement
+    device_map="auto",  # Use GPU for ZeroGPU
     token=hf_token,
     low_cpu_mem_usage=True,
     trust_remote_code=True
 )
-base_model.resize_token_embeddings(len(tokenizer))  # Fix vocabulary mismatch
+base_model.resize_token_embeddings(len(tokenizer))
 
 # Load the PEFT adapter
 model = PeftModel.from_pretrained(base_model, peft_model_id, token=hf_token)
 
-# Decorate the prediction function with @spaces.GPU to trigger GPU allocation
-@spaces.GPU(duration=120)  # 120s max runtime; adjust if your inferences are longer/shorter (default: 60s)
+# Define the prediction function with GPU support
+@spaces.GPU(duration=120)  # Allocate GPU for 120s per inference
 def predict(text, max_length=100):
     try:
         messages = [{"role": "user", "content": text}]
         inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
-        # No explicit .to("cuda") needed; device_map="auto" handles it
-        outputs = model.generate(inputs, max_length=max_length)
+        # Explicitly move inputs to GPU
+        inputs = {key: val.to("cuda:0") for key, val in inputs.items()}
+        outputs = model.generate(**inputs, max_length=max_length)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
         return f"Error during inference: {str(e)}"
@@ -53,7 +54,7 @@ demo = gr.Interface(
     outputs=gr.Textbox(label="Model Output"),
     title="LearnPlutus Demo",
     description="Test the fine-tuned Llama-3.2-3B-Instruct model on ZeroGPU.",
-    flagging_mode="never"  # Updated for Gradio compatibility
+    flagging_mode="never"
 )
 
 # Launch with ZeroGPU settings
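
A note on the resize_token_embeddings() call kept by the second hunk: it exists because the fine-tuned tokenizer can carry more tokens than the base checkpoint (an added PAD token is the usual case), and the embedding matrix must match the tokenizer's vocabulary before the adapter loads. A guarded version, as a sketch; the size check is an illustrative addition, not part of the commit:

# Resize only when the tokenizer and the embedding table actually disagree.
# The length check is an illustrative addition, not part of the commit.
embedding_size = base_model.get_input_embeddings().weight.shape[0]
if len(tokenizer) != embedding_size:
    base_model.resize_token_embeddings(len(tokenizer))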
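
One thing worth flagging in the new predict(): by default, tokenizer.apply_chat_template(..., return_tensors="pt") returns a tensor of input IDs rather than a dict, so the added comprehension over inputs.items() would raise an AttributeError at inference time. A minimal corrected sketch, assuming a transformers version that supports return_dict=True; the add_generation_prompt flag and the switch to max_new_tokens are suggested adjustments, not part of the commit:

@spaces.GPU(duration=120)
def predict(text, max_length=100):
    try:
        messages = [{"role": "user", "content": text}]
        # return_dict=True yields {"input_ids": ..., "attention_mask": ...}
        # instead of a bare tensor, so .items() works as intended.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        # Follow the model's actual placement instead of hard-coding "cuda:0";
        # device_map="auto" decides where the weights live.
        inputs = {key: val.to(model.device) for key, val in inputs.items()}
        # max_new_tokens counts only generated tokens, so a long prompt
        # cannot silently eat the generation budget.
        outputs = model.generate(**inputs, max_new_tokens=max_length)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error during inference: {str(e)}"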
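
For context, the gr.Interface block touched by the last hunk is only partially visible. A hypothetical reconstruction of the surrounding wiring; only the four keyword arguments shown in the hunk are confirmed by the diff, while fn, inputs, and the launch call are assumptions:

import gradio as gr

demo = gr.Interface(
    fn=predict,                             # assumption: wires up the predict() above
    inputs=gr.Textbox(label="Input Text"),  # assumption: a single text input
    outputs=gr.Textbox(label="Model Output"),
    title="LearnPlutus Demo",
    description="Test the fine-tuned Llama-3.2-3B-Instruct model on ZeroGPU.",
    flagging_mode="never",  # replaces the older allow_flagging in recent Gradio
)

if __name__ == "__main__":
    demo.launch()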