pierreramez committed on
Commit
8f7da87
·
verified ·
1 Parent(s): d1dc38a

updated app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -119,7 +119,7 @@ class ModelManager:
119
 
120
  # GPU check for 4-bit loading
121
  if use_4bit and self._device == "cuda":
122
- print("🚀 GPU detected: Loading in 4-bit mode")
123
  try:
124
  from transformers import BitsAndBytesConfig
125
 
@@ -138,14 +138,14 @@ class ModelManager:
138
  torch_dtype=torch.float16,
139
  )
140
  except ImportError:
141
- print("⚠️ bitsandbytes not installed. Falling back to standard loading.")
142
  base_model = AutoModelForCausalLM.from_pretrained(
143
  model_name,
144
  device_map="auto",
145
  trust_remote_code=True,
146
  )
147
  else:
148
- print(f"⚠️ Using {self._device} (No GPU or use_4bit=False). Loading standard model.")
149
  base_model = AutoModelForCausalLM.from_pretrained(
150
  model_name,
151
  device_map=self._device,
@@ -162,9 +162,9 @@ class ModelManager:
162
  torch_dtype=torch.float16 if self._device == "cuda" else torch.float32
163
  )
164
  self._current_adapter = adapter_path
165
- print(f"Adapter loaded successfully")
166
  except Exception as e:
167
- print(f"⚠️ Could not load adapter: {e}")
168
  print(" Using base model without adapter")
169
  self._model = base_model
170
  self._current_adapter = None
@@ -308,13 +308,13 @@ async def startup_event():
308
  print("Starting up...")
309
 
310
  model_manager.initialize(
311
- # 1. The Base Model (The heavy lifter)
312
  model_name="meta-llama/Llama-3.2-3B-Instruct",
313
 
314
- # 2. Adapter (The personalization) - YOUR SPECIFIC REPO
315
  adapter_path="pierreramez/Llama-3.2-3B-Instruct-bnb-4bit_finetuned",
316
 
317
- # 3. CPU Optimization (Must be False for free tier)
318
  use_4bit=False
319
  )
320
 
@@ -487,7 +487,7 @@ async def reload_adapter(request: ReloadAdapterRequest):
487
  """Hot reload model."""
488
  try:
489
  model_manager.initialize(
490
- model_name="meta-llama/Llama-3.2-1B-Instruct",
491
  adapter_path=request.adapter_path,
492
  use_4bit=False
493
  )
@@ -498,3 +498,4 @@ async def reload_adapter(request: ReloadAdapterRequest):
498
  if __name__ == "__main__":
499
  import uvicorn
500
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
 
 
119
 
120
  # GPU check for 4-bit loading
121
  if use_4bit and self._device == "cuda":
122
+ print("GPU detected: Loading in 4-bit mode")
123
  try:
124
  from transformers import BitsAndBytesConfig
125
 
 
138
  torch_dtype=torch.float16,
139
  )
140
  except ImportError:
141
+ print("bitsandbytes not installed. Falling back to standard loading.")
142
  base_model = AutoModelForCausalLM.from_pretrained(
143
  model_name,
144
  device_map="auto",
145
  trust_remote_code=True,
146
  )
147
  else:
148
+ print(f"Using {self._device} (No GPU or use_4bit=False). Loading standard model.")
149
  base_model = AutoModelForCausalLM.from_pretrained(
150
  model_name,
151
  device_map=self._device,
 
162
  torch_dtype=torch.float16 if self._device == "cuda" else torch.float32
163
  )
164
  self._current_adapter = adapter_path
165
+ print(f"Adapter loaded successfully")
166
  except Exception as e:
167
+ print(f"Could not load adapter: {e}")
168
  print(" Using base model without adapter")
169
  self._model = base_model
170
  self._current_adapter = None
 
308
  print("Starting up...")
309
 
310
  model_manager.initialize(
311
+ # 1. The Base Model
312
  model_name="meta-llama/Llama-3.2-3B-Instruct",
313
 
314
+ # 2. Adapter
315
  adapter_path="pierreramez/Llama-3.2-3B-Instruct-bnb-4bit_finetuned",
316
 
317
+ # 3. CPU Optimization
318
  use_4bit=False
319
  )
320
 
 
487
  """Hot reload model."""
488
  try:
489
  model_manager.initialize(
490
+ model_name="meta-llama/Llama-3.2-3B-Instruct",
491
  adapter_path=request.adapter_path,
492
  use_4bit=False
493
  )
 
498
  if __name__ == "__main__":
499
  import uvicorn
500
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
501
+