jdesiree committed on
Commit
89e465f
·
verified ·
1 Parent(s): f2731c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -12
app.py CHANGED
@@ -285,9 +285,9 @@ Your goal is to be an educational partner who empowers students to succeed throu
285
  class Phi3MiniEducationalLLM(Runnable):
286
  """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer and proper cache handling"""
287
 
288
- def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool = False):
289
  super().__init__()
290
- logger.info(f"Loading Phi-3-mini model: {model_path} (use_4bit={use_4bit})")
291
  start_Loading_Model_time = time.perf_counter()
292
  current_time = datetime.now()
293
 
@@ -302,24 +302,25 @@ class Phi3MiniEducationalLLM(Runnable):
302
  )
303
 
304
  if use_4bit:
 
305
  quant_config = BitsAndBytesConfig(
306
  load_in_4bit=True,
307
- bnb_4bit_compute_dtype=torch.float16,
308
  bnb_4bit_use_double_quant=True,
309
- bnb_4bit_quant_type="nf4",
310
- llm_int8_threshold=0.0,
311
  llm_int8_skip_modules=["lm_head"]
312
  )
313
 
314
  self.model = AutoModelForCausalLM.from_pretrained(
315
  model_path,
316
  quantization_config=quant_config,
317
- device_map="auto",
318
- torch_dtype=torch.float16,
319
  trust_remote_code=True,
320
  low_cpu_mem_usage=True,
321
  token=hf_token,
322
- # Use eager attention for better compatibility in HF Spaces
323
  attn_implementation="eager"
324
  )
325
  else:
@@ -342,15 +343,15 @@ class Phi3MiniEducationalLLM(Runnable):
342
  self.streamer = None
343
 
344
  def _load_optimized_model(self, model_path: str):
345
- """Optimized model loading for Phi-3-mini with proper cache support."""
346
  self.model = AutoModelForCausalLM.from_pretrained(
347
  model_path,
348
- torch_dtype=torch.float16, # Use float16 to save memory
349
- device_map="auto", # Let transformers decide placement
350
  trust_remote_code=True,
351
  low_cpu_mem_usage=True,
352
  token=hf_token,
353
- # Use eager attention for better compatibility in HF Spaces
354
  attn_implementation="eager"
355
  )
356
 
@@ -951,12 +952,18 @@ def clear_chat():
951
  """Clear the chat history."""
952
  return [], ""
953
 
 
 
 
 
954
  def warmup_agent():
955
  """Warm up the agent with a test query to preload everything."""
956
  start_agent_warmup_time = time.perf_counter()
957
  current_time = datetime.now()
958
 
959
  logger.info("Warming up Phi-3-mini LangGraph agent with test query...")
 
 
960
  try:
961
  current_agent = get_agent()
962
 
@@ -964,6 +971,9 @@ def warmup_agent():
964
  test_response = current_agent.chat("Hello, this is a warmup test.")
965
  logger.info(f"Phi-3-mini LangGraph agent warmup completed successfully! Test response length: {len(test_response)} chars")
966
 
 
 
 
967
  end_agent_warmup_time = time.perf_counter()
968
  agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
969
  log_metric(f"Agent warmup time: {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
285
  class Phi3MiniEducationalLLM(Runnable):
286
  """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer and proper cache handling"""
287
 
288
+ def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool = True):
289
  super().__init__()
290
+ logger.info(f"Loading Phi-3-mini model: {model_path} (use_4bit={use_4bit}) for CPU")
291
  start_Loading_Model_time = time.perf_counter()
292
  current_time = datetime.now()
293
 
 
302
  )
303
 
304
  if use_4bit:
305
+ # CPU-optimized 4-bit quantization configuration
306
  quant_config = BitsAndBytesConfig(
307
  load_in_4bit=True,
308
+ bnb_4bit_compute_dtype=torch.float32, # Use float32 for CPU compatibility
309
  bnb_4bit_use_double_quant=True,
310
+ bnb_4bit_quant_type="nf4", # NF4 is optimal for normally distributed weights
311
+ llm_int8_threshold=6.0, # Default threshold for outlier detection
312
  llm_int8_skip_modules=["lm_head"]
313
  )
314
 
315
  self.model = AutoModelForCausalLM.from_pretrained(
316
  model_path,
317
  quantization_config=quant_config,
318
+ device_map="cpu", # Force CPU placement
319
+ dtype=torch.float32, # Use float32 for CPU
320
  trust_remote_code=True,
321
  low_cpu_mem_usage=True,
322
  token=hf_token,
323
+ # Use eager attention for better compatibility
324
  attn_implementation="eager"
325
  )
326
  else:
 
343
  self.streamer = None
344
 
345
  def _load_optimized_model(self, model_path: str):
346
+ """Optimized model loading for Phi-3-mini with proper CPU support."""
347
  self.model = AutoModelForCausalLM.from_pretrained(
348
  model_path,
349
+ dtype=torch.float32, # Use float32 for CPU compatibility
350
+ device_map="cpu", # Force CPU placement
351
  trust_remote_code=True,
352
  low_cpu_mem_usage=True,
353
  token=hf_token,
354
+ # Use eager attention for better compatibility
355
  attn_implementation="eager"
356
  )
357
 
 
952
  """Clear the chat history."""
953
  return [], ""
954
 
955
+ def log_cpu_memory_usage():
956
+ """Placeholder for CPU/memory logging function."""
957
+ pass
958
+
959
  def warmup_agent():
960
  """Warm up the agent with a test query to preload everything."""
961
  start_agent_warmup_time = time.perf_counter()
962
  current_time = datetime.now()
963
 
964
  logger.info("Warming up Phi-3-mini LangGraph agent with test query...")
965
+ log_cpu_memory_usage() # Log usage before warmup
966
+
967
  try:
968
  current_agent = get_agent()
969
 
 
971
  test_response = current_agent.chat("Hello, this is a warmup test.")
972
  logger.info(f"Phi-3-mini LangGraph agent warmup completed successfully! Test response length: {len(test_response)} chars")
973
 
974
+ # Log usage after warmup
975
+ log_cpu_memory_usage()
976
+
977
  end_agent_warmup_time = time.perf_counter()
978
  agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
979
  log_metric(f"Agent warmup time: {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")