Spaces:
Sleeping
Sleeping
Update src/llm/local_model_client.py
Browse files
src/llm/local_model_client.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
"""Local Be.FM model client - OPTIMIZED for MPS performance"""
|
|
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import gc
|
|
@@ -99,10 +100,11 @@ class LocalModelClient:
|
|
| 99 |
print(f" [3/5] Loading base model ({self.base_model_id})...")
|
| 100 |
if self.device == "cuda":
|
| 101 |
# Use 8-bit quantization to fit in 15GB GPU (saves ~50% memory)
|
|
|
|
| 102 |
self._model = AutoModelForCausalLM.from_pretrained(
|
| 103 |
self.base_model_id,
|
| 104 |
load_in_8bit=True,
|
| 105 |
-
device_map="auto",
|
| 106 |
token=hf_token,
|
| 107 |
)
|
| 108 |
else:
|
|
|
|
| 1 |
"""Local Be.FM model client - OPTIMIZED for MPS performance"""
|
| 2 |
+
"""Local Be.FM model client - OPTIMIZED for MPS performance"""
|
| 3 |
|
| 4 |
import os
|
| 5 |
import gc
|
|
|
|
| 100 |
print(f" [3/5] Loading base model ({self.base_model_id})...")
|
| 101 |
if self.device == "cuda":
|
| 102 |
# Use 8-bit quantization to fit in 15GB GPU (saves ~50% memory)
|
| 103 |
+
# Don't use device_map="auto" - it causes meta tensor issues
|
| 104 |
self._model = AutoModelForCausalLM.from_pretrained(
|
| 105 |
self.base_model_id,
|
| 106 |
load_in_8bit=True,
|
| 107 |
+
device_map={"": 0}, # Load everything on GPU 0, avoiding meta tensors
|
| 108 |
token=hf_token,
|
| 109 |
)
|
| 110 |
else:
|