Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -285,9 +285,9 @@ Your goal is to be an educational partner who empowers students to succeed throu
|
|
| 285 |
class Phi3MiniEducationalLLM(Runnable):
|
| 286 |
"""LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer and proper cache handling"""
|
| 287 |
|
| 288 |
-
def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool =
|
| 289 |
super().__init__()
|
| 290 |
-
logger.info(f"Loading Phi-3-mini model: {model_path} (use_4bit={use_4bit})")
|
| 291 |
start_Loading_Model_time = time.perf_counter()
|
| 292 |
current_time = datetime.now()
|
| 293 |
|
|
@@ -302,24 +302,25 @@ class Phi3MiniEducationalLLM(Runnable):
|
|
| 302 |
)
|
| 303 |
|
| 304 |
if use_4bit:
|
|
|
|
| 305 |
quant_config = BitsAndBytesConfig(
|
| 306 |
load_in_4bit=True,
|
| 307 |
-
bnb_4bit_compute_dtype=torch.
|
| 308 |
bnb_4bit_use_double_quant=True,
|
| 309 |
-
bnb_4bit_quant_type="nf4",
|
| 310 |
-
llm_int8_threshold=
|
| 311 |
llm_int8_skip_modules=["lm_head"]
|
| 312 |
)
|
| 313 |
|
| 314 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 315 |
model_path,
|
| 316 |
quantization_config=quant_config,
|
| 317 |
-
device_map="
|
| 318 |
-
|
| 319 |
trust_remote_code=True,
|
| 320 |
low_cpu_mem_usage=True,
|
| 321 |
token=hf_token,
|
| 322 |
-
# Use eager attention for better compatibility
|
| 323 |
attn_implementation="eager"
|
| 324 |
)
|
| 325 |
else:
|
|
@@ -342,15 +343,15 @@ class Phi3MiniEducationalLLM(Runnable):
|
|
| 342 |
self.streamer = None
|
| 343 |
|
| 344 |
def _load_optimized_model(self, model_path: str):
|
| 345 |
-
"""Optimized model loading for Phi-3-mini with proper
|
| 346 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 347 |
model_path,
|
| 348 |
-
|
| 349 |
-
device_map="
|
| 350 |
trust_remote_code=True,
|
| 351 |
low_cpu_mem_usage=True,
|
| 352 |
token=hf_token,
|
| 353 |
-
# Use eager attention for better compatibility
|
| 354 |
attn_implementation="eager"
|
| 355 |
)
|
| 356 |
|
|
@@ -951,12 +952,18 @@ def clear_chat():
|
|
| 951 |
"""Clear the chat history."""
|
| 952 |
return [], ""
|
| 953 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 954 |
def warmup_agent():
|
| 955 |
"""Warm up the agent with a test query to preload everything."""
|
| 956 |
start_agent_warmup_time = time.perf_counter()
|
| 957 |
current_time = datetime.now()
|
| 958 |
|
| 959 |
logger.info("Warming up Phi-3-mini LangGraph agent with test query...")
|
|
|
|
|
|
|
| 960 |
try:
|
| 961 |
current_agent = get_agent()
|
| 962 |
|
|
@@ -964,6 +971,9 @@ def warmup_agent():
|
|
| 964 |
test_response = current_agent.chat("Hello, this is a warmup test.")
|
| 965 |
logger.info(f"Phi-3-mini LangGraph agent warmup completed successfully! Test response length: {len(test_response)} chars")
|
| 966 |
|
|
|
|
|
|
|
|
|
|
| 967 |
end_agent_warmup_time = time.perf_counter()
|
| 968 |
agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
|
| 969 |
log_metric(f"Agent warmup time: {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
|
|
|
|
| 285 |
class Phi3MiniEducationalLLM(Runnable):
|
| 286 |
"""LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer and proper cache handling"""
|
| 287 |
|
| 288 |
+
def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool = True):
|
| 289 |
super().__init__()
|
| 290 |
+
logger.info(f"Loading Phi-3-mini model: {model_path} (use_4bit={use_4bit}) for CPU")
|
| 291 |
start_Loading_Model_time = time.perf_counter()
|
| 292 |
current_time = datetime.now()
|
| 293 |
|
|
|
|
| 302 |
)
|
| 303 |
|
| 304 |
if use_4bit:
|
| 305 |
+
# CPU-optimized 4-bit quantization configuration
|
| 306 |
quant_config = BitsAndBytesConfig(
|
| 307 |
load_in_4bit=True,
|
| 308 |
+
bnb_4bit_compute_dtype=torch.float32, # Use float32 for CPU compatibility
|
| 309 |
bnb_4bit_use_double_quant=True,
|
| 310 |
+
bnb_4bit_quant_type="nf4", # NF4 is optimal for normally distributed weights
|
| 311 |
+
llm_int8_threshold=6.0, # Default threshold for outlier detection
|
| 312 |
llm_int8_skip_modules=["lm_head"]
|
| 313 |
)
|
| 314 |
|
| 315 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 316 |
model_path,
|
| 317 |
quantization_config=quant_config,
|
| 318 |
+
device_map="cpu", # Force CPU placement
|
| 319 |
+
dtype=torch.float32, # Use float32 for CPU
|
| 320 |
trust_remote_code=True,
|
| 321 |
low_cpu_mem_usage=True,
|
| 322 |
token=hf_token,
|
| 323 |
+
# Use eager attention for better compatibility
|
| 324 |
attn_implementation="eager"
|
| 325 |
)
|
| 326 |
else:
|
|
|
|
| 343 |
self.streamer = None
|
| 344 |
|
| 345 |
def _load_optimized_model(self, model_path: str):
|
| 346 |
+
"""Optimized model loading for Phi-3-mini with proper CPU support."""
|
| 347 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 348 |
model_path,
|
| 349 |
+
dtype=torch.float32, # Use float32 for CPU compatibility
|
| 350 |
+
device_map="cpu", # Force CPU placement
|
| 351 |
trust_remote_code=True,
|
| 352 |
low_cpu_mem_usage=True,
|
| 353 |
token=hf_token,
|
| 354 |
+
# Use eager attention for better compatibility
|
| 355 |
attn_implementation="eager"
|
| 356 |
)
|
| 357 |
|
|
|
|
| 952 |
"""Clear the chat history."""
|
| 953 |
return [], ""
|
| 954 |
|
| 955 |
+
def log_cpu_memory_usage():
|
| 956 |
+
"""Placeholder for CPU/memory logging function."""
|
| 957 |
+
pass
|
| 958 |
+
|
| 959 |
def warmup_agent():
|
| 960 |
"""Warm up the agent with a test query to preload everything."""
|
| 961 |
start_agent_warmup_time = time.perf_counter()
|
| 962 |
current_time = datetime.now()
|
| 963 |
|
| 964 |
logger.info("Warming up Phi-3-mini LangGraph agent with test query...")
|
| 965 |
+
log_cpu_memory_usage() # Log usage before warmup
|
| 966 |
+
|
| 967 |
try:
|
| 968 |
current_agent = get_agent()
|
| 969 |
|
|
|
|
| 971 |
test_response = current_agent.chat("Hello, this is a warmup test.")
|
| 972 |
logger.info(f"Phi-3-mini LangGraph agent warmup completed successfully! Test response length: {len(test_response)} chars")
|
| 973 |
|
| 974 |
+
# Log usage after warmup
|
| 975 |
+
log_cpu_memory_usage()
|
| 976 |
+
|
| 977 |
end_agent_warmup_time = time.perf_counter()
|
| 978 |
agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
|
| 979 |
log_metric(f"Agent warmup time: {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
|