Spaces:
Sleeping
Sleeping
updated app.py
Browse files
app.py
CHANGED
|
@@ -119,7 +119,7 @@ class ModelManager:
|
|
| 119 |
|
| 120 |
# GPU check for 4-bit loading
|
| 121 |
if use_4bit and self._device == "cuda":
|
| 122 |
-
print("
|
| 123 |
try:
|
| 124 |
from transformers import BitsAndBytesConfig
|
| 125 |
|
|
@@ -138,14 +138,14 @@ class ModelManager:
|
|
| 138 |
torch_dtype=torch.float16,
|
| 139 |
)
|
| 140 |
except ImportError:
|
| 141 |
-
print("
|
| 142 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 143 |
model_name,
|
| 144 |
device_map="auto",
|
| 145 |
trust_remote_code=True,
|
| 146 |
)
|
| 147 |
else:
|
| 148 |
-
print(f"
|
| 149 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 150 |
model_name,
|
| 151 |
device_map=self._device,
|
|
@@ -162,9 +162,9 @@ class ModelManager:
|
|
| 162 |
torch_dtype=torch.float16 if self._device == "cuda" else torch.float32
|
| 163 |
)
|
| 164 |
self._current_adapter = adapter_path
|
| 165 |
-
print(f"
|
| 166 |
except Exception as e:
|
| 167 |
-
print(f"
|
| 168 |
print(" Using base model without adapter")
|
| 169 |
self._model = base_model
|
| 170 |
self._current_adapter = None
|
|
@@ -308,13 +308,13 @@ async def startup_event():
|
|
| 308 |
print("Starting up...")
|
| 309 |
|
| 310 |
model_manager.initialize(
|
| 311 |
-
# 1. The Base Model
|
| 312 |
model_name="meta-llama/Llama-3.2-3B-Instruct",
|
| 313 |
|
| 314 |
-
# 2. Adapter
|
| 315 |
adapter_path="pierreramez/Llama-3.2-3B-Instruct-bnb-4bit_finetuned",
|
| 316 |
|
| 317 |
-
# 3. CPU Optimization
|
| 318 |
use_4bit=False
|
| 319 |
)
|
| 320 |
|
|
@@ -487,7 +487,7 @@ async def reload_adapter(request: ReloadAdapterRequest):
|
|
| 487 |
"""Hot reload model."""
|
| 488 |
try:
|
| 489 |
model_manager.initialize(
|
| 490 |
-
model_name="meta-llama/Llama-3.2-
|
| 491 |
adapter_path=request.adapter_path,
|
| 492 |
use_4bit=False
|
| 493 |
)
|
|
@@ -498,3 +498,4 @@ async def reload_adapter(request: ReloadAdapterRequest):
|
|
| 498 |
if __name__ == "__main__":
|
| 499 |
import uvicorn
|
| 500 |
uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
|
|
|
|
|
|
| 119 |
|
| 120 |
# GPU check for 4-bit loading
|
| 121 |
if use_4bit and self._device == "cuda":
|
| 122 |
+
print("GPU detected: Loading in 4-bit mode")
|
| 123 |
try:
|
| 124 |
from transformers import BitsAndBytesConfig
|
| 125 |
|
|
|
|
| 138 |
torch_dtype=torch.float16,
|
| 139 |
)
|
| 140 |
except ImportError:
|
| 141 |
+
print("bitsandbytes not installed. Falling back to standard loading.")
|
| 142 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 143 |
model_name,
|
| 144 |
device_map="auto",
|
| 145 |
trust_remote_code=True,
|
| 146 |
)
|
| 147 |
else:
|
| 148 |
+
print(f"Using {self._device} (No GPU or use_4bit=False). Loading standard model.")
|
| 149 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 150 |
model_name,
|
| 151 |
device_map=self._device,
|
|
|
|
| 162 |
torch_dtype=torch.float16 if self._device == "cuda" else torch.float32
|
| 163 |
)
|
| 164 |
self._current_adapter = adapter_path
|
| 165 |
+
print(f"Adapter loaded successfully")
|
| 166 |
except Exception as e:
|
| 167 |
+
print(f"Could not load adapter: {e}")
|
| 168 |
print(" Using base model without adapter")
|
| 169 |
self._model = base_model
|
| 170 |
self._current_adapter = None
|
|
|
|
| 308 |
print("Starting up...")
|
| 309 |
|
| 310 |
model_manager.initialize(
|
| 311 |
+
# 1. The Base Model
|
| 312 |
model_name="meta-llama/Llama-3.2-3B-Instruct",
|
| 313 |
|
| 314 |
+
# 2. Adapter
|
| 315 |
adapter_path="pierreramez/Llama-3.2-3B-Instruct-bnb-4bit_finetuned",
|
| 316 |
|
| 317 |
+
# 3. CPU Optimization
|
| 318 |
use_4bit=False
|
| 319 |
)
|
| 320 |
|
|
|
|
| 487 |
"""Hot reload model."""
|
| 488 |
try:
|
| 489 |
model_manager.initialize(
|
| 490 |
+
model_name="meta-llama/Llama-3.2-3B-Instruct",
|
| 491 |
adapter_path=request.adapter_path,
|
| 492 |
use_4bit=False
|
| 493 |
)
|
|
|
|
| 498 |
if __name__ == "__main__":
|
| 499 |
import uvicorn
|
| 500 |
uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
|
| 501 |
+
|