FadQ committed
Commit 80ac91b · verified · 1 Parent(s): 3a2f449

add offload

Files changed (1): app.py +15 -6
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from peft import PeftModel
 import torch
-
 import os
 
 # Make sure a Hugging Face token is provided (if the repo is private)
@@ -12,11 +11,20 @@ hf_token = os.getenv('HF_TOKEN')
 base_model = "google/gemma-2b-it"
 adapter_model = "FadQ/gemma-2b-diary-consultaton-chatbot"
 
-# Load the base model
-model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16, device_map="auto")
+# Load the base model with offloading
+model = AutoModelForCausalLM.from_pretrained(
+    base_model,
+    torch_dtype=torch.float16,
+    device_map="auto",
+    offload_folder="offload"  # add a folder for offloading to disk
+)
 
-# Load the PEFT adapter
-model = PeftModel.from_pretrained(model, adapter_model)
+# Load the PEFT adapter with offloading
+model = PeftModel.from_pretrained(
+    model,
+    adapter_model,
+    offload_folder="offload"
+)
 
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model)
@@ -26,7 +34,8 @@ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
 
 def predict(input_text):
     inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-    output = model.generate(**inputs, max_length=150)
+    with torch.no_grad():  # avoid unnecessary memory usage
+        output = model.generate(**inputs, max_length=150)
     return tokenizer.decode(output[0], skip_special_tokens=True)
 
 # Create Gradio interface
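
For context, here is a minimal standalone sketch of the loading pattern this commit introduces. It assumes the accelerate package is installed (transformers requires it whenever device_map="auto" is combined with offload_folder); the sample prompt and the model.device lookup are illustrative assumptions of this sketch, not part of the commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = "google/gemma-2b-it"
adapter_model = "FadQ/gemma-2b-diary-consultaton-chatbot"

# device_map="auto" lets accelerate place layers across GPU and CPU;
# layers that fit nowhere are spilled to the offload folder on disk.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload",
)
model = PeftModel.from_pretrained(model, adapter_model, offload_folder="offload")

tokenizer = AutoTokenizer.from_pretrained(base_model)

# With offloading, the embedding layer is not guaranteed to sit on "cuda",
# so moving inputs to model.device is safer than a hard-coded .to("cuda")
# (an adjustment assumed by this sketch, not made by the commit).
prompt = "I had a stressful day at work."  # hypothetical example input
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():  # inference only: no gradient buffers are allocated
    output = model.generate(**inputs, max_length=150)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Disk offloading trades generation speed for the ability to load a model that exceeds available GPU and CPU memory, while torch.no_grad() stops generate from tracking gradients, which is the memory pressure this commit targets. One caveat: depending on the transformers version, passing device=0 to pipeline() for a model already dispatched with device_map="auto" is rejected, so the unchanged pipeline call in app.py is worth double-checking.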