MatthewStroud committed on
Commit 7b88a5a · verified · 1 Parent(s): a888ed5

Update src/llm/local_model_client.py

Files changed (1)
  1. src/llm/local_model_client.py +3 -1
src/llm/local_model_client.py CHANGED
@@ -1,4 +1,5 @@
  """Local Be.FM model client - OPTIMIZED for MPS performance"""
+ """Local Be.FM model client - OPTIMIZED for MPS performance"""

  import os
  import gc
@@ -99,10 +100,11 @@ class LocalModelClient:
          print(f" [3/5] Loading base model ({self.base_model_id})...")
          if self.device == "cuda":
              # Use 8-bit quantization to fit in 15GB GPU (saves ~50% memory)
+             # Don't use device_map="auto" - it causes meta tensor issues
              self._model = AutoModelForCausalLM.from_pretrained(
                  self.base_model_id,
                  load_in_8bit=True,
-                 device_map="auto",
+                 device_map={"": 0},  # Load everything on GPU 0, avoiding meta tensors
                  token=hf_token,
              )
          else:
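
For context, here is a minimal standalone sketch of the load path after this change, assuming transformers with bitsandbytes installed and a single CUDA GPU. The model id and HF_TOKEN environment variable below are placeholders for illustration, not the repo's actual values (the client loads whatever self.base_model_id resolves to):

    # Minimal sketch of the post-change load path, not the repo's LocalModelClient.
    # Assumes: transformers + bitsandbytes installed, one CUDA GPU available.
    import os

    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "some-org/some-base-model"   # placeholder for self.base_model_id
    hf_token = os.environ.get("HF_TOKEN")   # placeholder token source

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

    # device_map={"": 0} assigns every module to cuda:0 up front, so the 8-bit
    # loader is not left with weights on the meta device the way device_map="auto"
    # can be on this single-GPU setup (per the commit comment above).
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        load_in_8bit=True,      # bitsandbytes 8-bit quantization (~50% memory)
        device_map={"": 0},     # all modules pinned to GPU 0
        token=hf_token,
    )

The empty-string key in the device_map is accelerate's convention for "the whole model", so this pins the entire module tree to device 0 instead of letting the automatic planner shard or offload it.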