jdesiree committed on
Commit
79d5341
·
verified ·
1 Parent(s): e13b10c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -43
app.py CHANGED
@@ -261,52 +261,52 @@ class Qwen25SmallLLM(Runnable):
261
  """LLM class that properly inherits from Runnable for LangChain compatibility"""
262
 
263
  def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
264
- super().__init__()
265
- logger.info(f"Loading model: {model_path} (use_4bit={use_4bit})")
266
- start_Loading_Model_time = time.perf_counter()
267
- current_time = datetime.now()
268
 
269
- try:
270
- # Load tokenizer
271
- self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
272
-
273
- if use_4bit:
274
- quant_config = BitsAndBytesConfig(
275
- load_in_4bit=True,
276
- bnb_4bit_compute_dtype=torch.float16,
277
- bnb_4bit_use_double_quant=True,
278
- bnb_4bit_quant_type="nf4",
279
- llm_int8_threshold=0.0,
280
- llm_int8_skip_modules=["lm_head"]
281
- )
282
 
283
- # Try quantized load with updated dtype parameter
284
- self.model = AutoModelForCausalLM.from_pretrained(
285
- model_path,
286
- quantization_config=quant_config,
287
- device_map="auto",
288
- dtype=torch.bfloat16,
289
- trust_remote_code=True,
290
- low_cpu_mem_usage=True
291
- )
292
- else:
293
- self._load_fallback_model(model_path)
 
 
 
 
 
294
 
295
- # Success path - log timing
296
- end_Loading_Model_time = time.perf_counter()
297
- Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
298
- log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
299
-
300
- except Exception as e:
301
- logger.warning(f"Quantized load failed, falling back: {e}")
302
- self._load_fallback_model(model_path)
303
- end_Loading_Model_time = time.perf_counter()
304
- Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
305
- log_metric(f"Model Load time (fallback): {Loading_Model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
306
-
307
- # Ensure pad token
308
- if self.tokenizer.pad_token is None:
309
- self.tokenizer.pad_token = self.tokenizer.eos_token
310
 
311
  def _load_fallback_model(self, model_path: str):
312
  """Fallback if quantization fails."""
 
261
  """LLM class that properly inherits from Runnable for LangChain compatibility"""
262
 
263
  def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
264
+ super().__init__()
265
+ logger.info(f"Loading model: {model_path} (use_4bit={use_4bit})")
266
+ start_Loading_Model_time = time.perf_counter()
267
+ current_time = datetime.now()
268
 
269
+ try:
270
+ # Load tokenizer
271
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
272
+
273
+ if use_4bit:
274
+ quant_config = BitsAndBytesConfig(
275
+ load_in_4bit=True,
276
+ bnb_4bit_compute_dtype=torch.float16,
277
+ bnb_4bit_use_double_quant=True,
278
+ bnb_4bit_quant_type="nf4",
279
+ llm_int8_threshold=0.0,
280
+ llm_int8_skip_modules=["lm_head"]
281
+ )
282
 
283
+ # Try quantized load with updated dtype parameter
284
+ self.model = AutoModelForCausalLM.from_pretrained(
285
+ model_path,
286
+ quantization_config=quant_config,
287
+ device_map="auto",
288
+ dtype=torch.bfloat16,
289
+ trust_remote_code=True,
290
+ low_cpu_mem_usage=True
291
+ )
292
+ else:
293
+ self._load_fallback_model(model_path)
294
+
295
+ # Success path - log timing
296
+ end_Loading_Model_time = time.perf_counter()
297
+ Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
298
+ log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
299
 
300
+ except Exception as e:
301
+ logger.warning(f"Quantized load failed, falling back: {e}")
302
+ self._load_fallback_model(model_path)
303
+ end_Loading_Model_time = time.perf_counter()
304
+ Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
305
+ log_metric(f"Model Load time (fallback): {Loading_Model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
306
+
307
+ # Ensure pad token
308
+ if self.tokenizer.pad_token is None:
309
+ self.tokenizer.pad_token = self.tokenizer.eos_token
 
 
 
 
 
310
 
311
  def _load_fallback_model(self, model_path: str):
312
  """Fallback if quantization fails."""