MatthewStroud committed on
Commit 7b88a5a · verified · 1 Parent(s): a888ed5

Update src/llm/local_model_client.py

Files changed (1)
  1. src/llm/local_model_client.py +3 -1
src/llm/local_model_client.py CHANGED
@@ -1,4 +1,5 @@
  """Local Be.FM model client - OPTIMIZED for MPS performance"""
+ """Local Be.FM model client - OPTIMIZED for MPS performance"""

  import os
  import gc
@@ -99,10 +100,11 @@ class LocalModelClient:
          print(f" [3/5] Loading base model ({self.base_model_id})...")
          if self.device == "cuda":
              # Use 8-bit quantization to fit in 15GB GPU (saves ~50% memory)
+             # Don't use device_map="auto" - it causes meta tensor issues
              self._model = AutoModelForCausalLM.from_pretrained(
                  self.base_model_id,
                  load_in_8bit=True,
-                 device_map="auto",
+                 device_map={"": 0},  # Load everything on GPU 0, avoiding meta tensors
                  token=hf_token,
              )
          else:
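
For context, here is a minimal standalone sketch of the load path after this change, assuming transformers with bitsandbytes installed and a single CUDA GPU. The model id and HF_TOKEN environment variable below are placeholders for illustration, not the repo's actual values (the client loads whatever self.base_model_id resolves to):

    # Minimal sketch of the post-change load path, not the repo's LocalModelClient.
    # Assumes: transformers + bitsandbytes installed, one CUDA GPU available.
    import os

    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "some-org/some-base-model"   # placeholder for self.base_model_id
    hf_token = os.environ.get("HF_TOKEN")   # placeholder token source

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

    # device_map={"": 0} assigns every module to cuda:0 up front, so the 8-bit
    # loader is not left with weights on the meta device the way device_map="auto"
    # can be on this single-GPU setup (per the commit comment above).
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        load_in_8bit=True,      # bitsandbytes 8-bit quantization (~50% memory)
        device_map={"": 0},     # all modules pinned to GPU 0
        token=hf_token,
    )

The empty-string key in the device_map is accelerate's convention for "the whole model", so this pins the entire module tree to device 0 instead of letting the automatic planner shard or offload it.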