Remostart committed
Commit 4b57ecb · verified · 1 Parent(s): ffa43a7

Update app.py

Files changed (1)
  1. app.py +15 -12
app.py CHANGED
@@ -4,7 +4,7 @@ from peft import PeftModel
 import torch
 import os
 from huggingface_hub import login
-import spaces  # Required for ZeroGPU
+import spaces

 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
@@ -12,7 +12,7 @@ login(token=hf_token)

 # Model repository IDs
 base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
-peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"  # Replace with your model repo (e.g., ubiodee/my-finetuned-model)
+peft_model_id = "ubiodee/<your-model-repo>"  # Replace with your model repo

 # Load the tokenizer from the fine-tuned model
 tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
@@ -21,7 +21,7 @@ tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_id,
     torch_dtype=torch.float16,
-    device_map="auto",  # Use GPU for ZeroGPU
+    device_map="auto",
     token=hf_token,
     low_cpu_mem_usage=True,
     trust_remote_code=True
@@ -31,14 +31,18 @@ base_model.resize_token_embeddings(len(tokenizer))
 # Load the PEFT adapter
 model = PeftModel.from_pretrained(base_model, peft_model_id, token=hf_token)

-# Define the prediction function with GPU support
-@spaces.GPU(duration=120)  # Allocate GPU for 120s per inference
+# Define the prediction function with proper device handling
+@spaces.GPU(duration=120)
 def predict(text, max_length=100):
     try:
         messages = [{"role": "user", "content": text}]
         inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
-        # Explicitly move inputs to GPU
-        inputs = {key: val.to("cuda:0") for key, val in inputs.items()}
+        # Move inputs to GPU if they are a dictionary of tensors
+        if isinstance(inputs, dict):
+            inputs = {key: val.to("cuda:0") for key, val in inputs.items()}
+        else:
+            # If inputs is a single tensor (unlikely but for robustness)
+            inputs = inputs.to("cuda:0")
         outputs = model.generate(**inputs, max_length=max_length)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
@@ -57,11 +61,10 @@ demo = gr.Interface(
     flagging_mode="never"
 )

-# Launch with ZeroGPU settings
+# Launch the app
 demo.launch(
     server_name="0.0.0.0",
     server_port=7860,
-    ssr_mode=False,
-    share=True,  # Enable public URL
-    debug=True  # Enable debug mode for detailed logs
-)
+    share=True,
+    debug=True
+)
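One caveat worth flagging on the new device-handling branch: by default, tokenizer.apply_chat_template(..., return_tensors="pt") returns a bare tensor of token ids rather than a dict, so the else branch above would still hand a plain tensor to model.generate(**inputs), which expects a mapping. A minimal sketch of the same function that sidesteps this, assuming transformers >= 4.38 (where apply_chat_template accepts return_dict=True) and reusing the tokenizer, model, and spaces names from the file:

@spaces.GPU(duration=120)  # ZeroGPU: allocate a GPU for up to 120 s per call
def predict(text, max_length=100):
    try:
        messages = [{"role": "user", "content": text}]
        # return_dict=True yields {"input_ids": ..., "attention_mask": ...},
        # so generate(**inputs) always receives a mapping;
        # add_generation_prompt=True appends the assistant turn header,
        # which instruct models expect before generating.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
        inputs = {key: val.to(model.device) for key, val in inputs.items()}
        # max_new_tokens counts only generated tokens; max_length would also
        # count the prompt and can leave no room for a reply on long inputs.
        outputs = model.generate(**inputs, max_new_tokens=max_length)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error: {str(e)}"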
 
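For context, the hunks only show the tail of the gr.Interface block; the components sit outside the diff context, so the wiring below is a hypothetical reconstruction. The Textbox components and labels are assumptions; only fn=predict, flagging_mode="never", and the launch arguments are implied by the code above. flagging_mode is the Gradio 5 name for the older allow_flagging parameter.

import gradio as gr  # assumed near the top of app.py, outside the shown hunks

# Hypothetical reconstruction: input/output components are assumptions,
# only flagging_mode="never" and the launch settings appear in the diff.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Prompt", lines=4),
    outputs=gr.Textbox(label="Response"),
    flagging_mode="never",
)

demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,   # Spaces serve their own public URL, so this has no effect there
    debug=True,
)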