Ghaithhmz committed on
Commit
f183b54
·
1 Parent(s): 9f1e682

Optimize for 16GB CPU: Enable 4-bit quantization and low memory loading

Browse files
Files changed (2) hide show
  1. app.py +16 -16
  2. requirements.txt +2 -2
app.py CHANGED
@@ -42,44 +42,44 @@ def load_model():
42
  # Check if it's an adapter (has adapter_config.json)
43
  is_adapter = os.path.exists(os.path.join(local_model_path, "adapter_config.json"))
44
 
45
- # Quantization only works on GPU
46
- bnb_config = None
47
- if device == "cuda":
48
  bnb_config = BitsAndBytesConfig(
49
  load_in_4bit=True,
50
  bnb_4bit_quant_type="nf4",
51
- bnb_4bit_compute_dtype=torch.float16,
 
52
  )
 
 
 
53
 
54
  try:
 
 
 
 
 
 
 
55
  if is_adapter:
56
  print(f"Loading Base Model ({device})...")
57
  base_model = AutoModelForCausalLM.from_pretrained(
58
  BASE_MODEL_ID,
59
- quantization_config=bnb_config,
60
- device_map="auto" if device == "cuda" else None,
61
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
62
- trust_remote_code=True
63
  )
64
  print("Loading Adapter...")
65
  model = PeftModel.from_pretrained(base_model, local_model_path)
66
- # Use base model tokenizer
67
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
68
 
69
  else:
70
  print(f"Loading Full Merged Model ({device})...")
71
  model = AutoModelForCausalLM.from_pretrained(
72
  local_model_path,
73
- quantization_config=bnb_config,
74
- device_map="auto" if device == "cuda" else None,
75
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
76
- trust_remote_code=True
77
  )
78
  tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
79
 
80
- if device == "cpu":
81
- model = model.to("cpu")
82
-
83
  tokenizer.pad_token = tokenizer.eos_token
84
  return model, tokenizer
85
 
 
42
  # Check if it's an adapter (has adapter_config.json)
43
  is_adapter = os.path.exists(os.path.join(local_model_path, "adapter_config.json"))
44
 
45
+ # Quantization Config (Now enabling for both CPU and GPU if possible)
46
+ try:
 
47
  bnb_config = BitsAndBytesConfig(
48
  load_in_4bit=True,
49
  bnb_4bit_quant_type="nf4",
50
+ bnb_4bit_compute_dtype=torch.float16 if device == "cuda" else torch.bfloat16,
51
+ bnb_4bit_use_double_quant=True,
52
  )
53
+ except Exception as e:
54
+ print(f"⚠️ BitsAndBytes not fully supported on this CPU environment, falling back: {e}")
55
+ bnb_config = None
56
 
57
  try:
58
+ common_kwargs = {
59
+ "quantization_config": bnb_config,
60
+ "low_cpu_mem_usage": True, # Critical for 16GB limit
61
+ "trust_remote_code": True,
62
+ "device_map": "auto" # Let accelerate handle the placement
63
+ }
64
+
65
  if is_adapter:
66
  print(f"Loading Base Model ({device})...")
67
  base_model = AutoModelForCausalLM.from_pretrained(
68
  BASE_MODEL_ID,
69
+ **common_kwargs
 
 
 
70
  )
71
  print("Loading Adapter...")
72
  model = PeftModel.from_pretrained(base_model, local_model_path)
 
73
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
74
 
75
  else:
76
  print(f"Loading Full Merged Model ({device})...")
77
  model = AutoModelForCausalLM.from_pretrained(
78
  local_model_path,
79
+ **common_kwargs
 
 
 
80
  )
81
  tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
82
 
 
 
 
83
  tokenizer.pad_token = tokenizer.eos_token
84
  return model, tokenizer
85
 
requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
  gradio==5.15.0
2
  huggingface_hub
3
  torch
 
 
4
  transformers
5
  peft
6
- accelerate
7
- bitsandbytes
8
  sentencepiece
 
1
  gradio==5.15.0
2
  huggingface_hub
3
  torch
4
+ bitsandbytes>=0.43.0
5
+ accelerate
6
  transformers
7
  peft
 
 
8
  sentencepiece