Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -261,46 +261,52 @@ class Qwen25SmallLLM(Runnable):
|
|
| 261 |
"""LLM class that properly inherits from Runnable for LangChain compatibility"""
|
| 262 |
|
| 263 |
def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
|
|
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
|
|
|
|
|
|
| 293 |
end_Loading_Model_time = time.perf_counter()
|
| 294 |
Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
|
| 295 |
-
|
| 296 |
log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
|
| 305 |
def _load_fallback_model(self, model_path: str):
|
| 306 |
"""Fallback if quantization fails."""
|
|
|
|
| 261 |
"""LLM class that properly inherits from Runnable for LangChain compatibility"""
|
| 262 |
|
| 263 |
def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
    """Load the tokenizer and (optionally 4-bit quantized) causal LM.

    Args:
        model_path: Hugging Face model id or local checkpoint path.
        use_4bit: when True, attempt an NF4 bitsandbytes quantized load;
            when False — or if the quantized load raises — delegate to
            ``_load_fallback_model``.

    Side effects:
        Sets ``self.tokenizer`` and ``self.model``; logs the wall-clock
        load time via ``log_metric``.
    """
    super().__init__()
    logger.info(f"Loading model: {model_path} (use_4bit={use_4bit})")
    start_Loading_Model_time = time.perf_counter()
    current_time = datetime.now()

    used_fallback = False
    try:
        # Load tokenizer first; if this raises we fall back too, matching
        # the original behavior of the broad except below.
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        if use_4bit:
            quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                # FIX: compute dtype now matches the `dtype` passed to
                # from_pretrained below. The previous code mixed
                # float16 compute with bfloat16 weights, which are
                # different half-precision formats.
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                # int8 knobs are inert for a 4-bit load but kept for
                # backward compatibility with the original config.
                llm_int8_threshold=0.0,
                llm_int8_skip_modules=["lm_head"],
            )
            # Quantized load with the updated `dtype` parameter
            # (replaces the deprecated `torch_dtype`).
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                quantization_config=quant_config,
                device_map="auto",
                dtype=torch.bfloat16,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )
        else:
            self._load_fallback_model(model_path)
    except Exception as e:
        logger.warning(f"Quantized load failed, falling back: {e}")
        self._load_fallback_model(model_path)
        used_fallback = True

    # FIX: single timing/metric path — the original duplicated these
    # three lines verbatim in both the success and the except branch.
    end_Loading_Model_time = time.perf_counter()
    Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
    suffix = " (fallback)" if used_fallback else ""
    log_metric(f"Model Load time{suffix}: {Loading_Model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")

    # Ensure a pad token exists — some checkpoints ship without one,
    # which breaks batched generation.
    if self.tokenizer.pad_token is None:
        self.tokenizer.pad_token = self.tokenizer.eos_token
| 311 |
def _load_fallback_model(self, model_path: str):
|
| 312 |
"""Fallback if quantization fails."""
|