jdesiree committed on
Commit
613dbea
·
verified ·
1 Parent(s): 578ef70

Re-added Quantization

Browse files
Files changed (1) hide show
  1. app.py +75 -57
app.py CHANGED
@@ -25,7 +25,7 @@ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
25
  from langchain_core.runnables import Runnable
26
  from langchain_core.runnables.utils import Input, Output
27
 
28
- from transformers import AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
29
  import torch
30
  import time
31
  import warnings
@@ -292,28 +292,38 @@ Rather than providing complete solutions, you should:
292
  Your goal is to be an educational partner who empowers students to succeed through understanding."""
293
 
294
  # --- Updated LLM Class with Phi-3-mini ---
 
295
  class Phi3MiniEducationalLLM(Runnable):
296
- """LLM class optimized for Microsoft Phi-3-mini-4k-instruct without quantization"""
297
 
298
  def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct"):
299
  super().__init__()
300
- logger.info(f"Loading Phi-3-mini model: {model_path}")
301
  start_Loading_Model_time = time.perf_counter()
302
  current_time = datetime.now()
303
 
304
  self.model_name = model_path
305
 
306
  try:
307
- # Load tokenizer
308
  self.tokenizer = AutoTokenizer.from_pretrained(
309
  model_path,
310
  trust_remote_code=True,
311
- token=hf_token
 
 
 
 
 
 
 
 
 
312
  )
313
 
314
- # Store model path instead of loading model immediately
315
  self.model_path = model_path
316
- self.model = None # Load model lazily in GPU methods
317
 
318
  except Exception as e:
319
  logger.error(f"Failed to initialize Phi-3-mini model {model_path}: {e}")
@@ -326,16 +336,24 @@ class Phi3MiniEducationalLLM(Runnable):
326
  self.streamer = None
327
 
328
  def _load_model_if_needed(self):
329
- """Load model only when needed inside GPU context"""
330
  if self.model is None:
331
- self.model = AutoModelForCausalLM.from_pretrained(
332
- self.model_path,
333
- torch_dtype=torch.float16,
334
- trust_remote_code=True,
335
- low_cpu_mem_usage=True,
336
- token=hf_token,
337
- attn_implementation="eager"
338
- )
 
 
 
 
 
 
 
 
339
  return self.model
340
 
341
  def _format_chat_template(self, prompt: str) -> str:
@@ -357,80 +375,82 @@ class Phi3MiniEducationalLLM(Runnable):
357
  # Fallback to manual Phi-3 format
358
  return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
359
 
360
- @spaces.GPU(duration=60)
361
  def invoke(self, input: Input, config=None) -> Output:
362
- """Main invoke method optimized for Phi-3-mini"""
363
  start_invoke_time = time.perf_counter()
364
  current_time = datetime.now()
365
 
366
- # Handle both string and dict inputs for flexibility
367
  if isinstance(input, dict):
368
  prompt = input.get('input', str(input))
369
  else:
370
  prompt = str(input)
371
 
372
  try:
 
 
 
373
  # Format using Phi-3 chat template
374
  text = self._format_chat_template(prompt)
375
-
376
  inputs = self.tokenizer(
377
  text,
378
  return_tensors="pt",
379
  padding=True,
380
  truncation=True,
381
- max_length=3072 # Leave room for generation within 4k context
382
  )
383
-
384
  # Move inputs to model device
385
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
386
-
387
- # Generate with the model
388
  with torch.no_grad():
389
- outputs = self.model.generate(
390
  **inputs,
391
- max_new_tokens=800, # Increased for comprehensive responses
392
  do_sample=True,
393
- temperature=0.7, # Good balance for educational content
394
  top_p=0.9,
395
  top_k=50,
396
  repetition_penalty=1.1,
397
  pad_token_id=self.tokenizer.eos_token_id,
398
  early_stopping=True,
399
- use_cache=False,
400
  past_key_values=None
401
  )
402
-
403
  # Decode only new tokens
404
  new_tokens = outputs[0][len(inputs.input_ids[0]):]
405
  result = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
406
 
407
  end_invoke_time = time.perf_counter()
408
  invoke_time = end_invoke_time - start_invoke_time
409
- log_metric(f"LLM Invoke time: {invoke_time:0.4f} seconds. Input length: {len(prompt)} chars. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
410
 
411
  return result if result else "I'm still learning how to respond to that properly."
412
-
413
  except Exception as e:
414
- logger.error(f"Generation error: {e}")
415
  end_invoke_time = time.perf_counter()
416
  invoke_time = end_invoke_time - start_invoke_time
417
  log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
418
  return f"[Error generating response: {str(e)}]"
419
 
420
- @spaces.GPU(duration=120)
421
  def stream_generate(self, input: Input, config=None):
422
- """Streaming generation using TextIteratorStreamer with loop detection and early escape."""
423
  start_stream_time = time.perf_counter()
424
  current_time = datetime.now()
425
- logger.info("Starting stream_generate with TextIteratorStreamer and loop detection...")
426
-
427
  if isinstance(input, dict):
428
  prompt = input.get('input', str(input))
429
  else:
430
  prompt = str(input)
431
-
432
  try:
433
- # Load model inside GPU context
434
  model = self._load_model_if_needed()
435
 
436
  # Clear GPU cache
@@ -438,7 +458,7 @@ class Phi3MiniEducationalLLM(Runnable):
438
  torch.cuda.empty_cache()
439
 
440
  text = self._format_chat_template(prompt)
441
-
442
  inputs = self.tokenizer(
443
  text,
444
  return_tensors="pt",
@@ -446,18 +466,18 @@ class Phi3MiniEducationalLLM(Runnable):
446
  truncation=True,
447
  max_length=3072
448
  )
449
-
450
- # Move inputs to model device - now model is not None
451
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
452
-
453
  # Initialize TextIteratorStreamer
454
  streamer = TextIteratorStreamer(
455
  self.tokenizer,
456
  skip_prompt=True,
457
  skip_special_tokens=True
458
  )
459
-
460
- # Generation parameters
461
  generation_kwargs = {
462
  **inputs,
463
  "max_new_tokens": 800,
@@ -471,15 +491,15 @@ class Phi3MiniEducationalLLM(Runnable):
471
  "use_cache": False,
472
  "past_key_values": None
473
  }
474
-
475
  # Start generation in background
476
  generation_thread = threading.Thread(
477
- target=model.generate, # Use the loaded model
478
  kwargs=generation_kwargs
479
  )
480
  generation_thread.start()
481
-
482
- # Track outputs
483
  generated_text = ""
484
  token_history = []
485
  loop_window = 20
@@ -492,39 +512,37 @@ class Phi3MiniEducationalLLM(Runnable):
492
 
493
  generated_text += new_text
494
 
495
- # Tokenize and track
496
  tokens = self.tokenizer.tokenize(new_text)
497
  token_history.extend(tokens)
498
 
499
- # Check for loops
500
  if len(token_history) >= 2 * loop_window:
501
  recent = token_history[-loop_window:]
502
  prev = token_history[-2*loop_window:-loop_window]
503
  overlap = sum(1 for r, p in zip(recent, prev) if r == p)
504
 
505
  if overlap >= loop_threshold:
506
- logger.warning(f"Looping detected (overlap: {overlap}/{loop_window}). Aborting generation.")
507
  yield "[Looping detected — generation stopped early]"
508
  break
509
 
510
  yield generated_text
511
  except Exception as e:
512
- logger.error(f"Error in streaming iteration: {e}")
513
  yield f"[Streaming error: {str(e)}]"
514
 
515
  generation_thread.join()
516
 
517
  end_stream_time = time.perf_counter()
518
  stream_time = end_stream_time - start_stream_time
519
- log_metric(f"LLM Stream time: {stream_time:0.4f} seconds. Generated length: {len(generated_text)} chars. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
520
- logger.info(f"Stream generation completed: {len(generated_text)} chars in {stream_time:.2f}s")
521
 
522
  except Exception as e:
523
- logger.error(f"Streaming generation error: {e}")
524
  end_stream_time = time.perf_counter()
525
  stream_time = end_stream_time - start_stream_time
526
  log_metric(f"LLM Stream time (error): {stream_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
527
- yield f"[Error in streaming generation: {str(e)}]"
528
 
529
 
530
  @property
 
25
  from langchain_core.runnables import Runnable
26
  from langchain_core.runnables.utils import Input, Output
27
 
28
+ from transformers import AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM, BitsAndBytesConfig
29
  import torch
30
  import time
31
  import warnings
 
292
  Your goal is to be an educational partner who empowers students to succeed through understanding."""
293
 
294
  # --- Updated LLM Class with Phi-3-mini ---
295
+
296
  class Phi3MiniEducationalLLM(Runnable):
297
+ """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with 4-bit quantization"""
298
 
299
  def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct"):
300
  super().__init__()
301
+ logger.info(f"Loading Phi-3-mini model with 4-bit quantization: {model_path}")
302
  start_Loading_Model_time = time.perf_counter()
303
  current_time = datetime.now()
304
 
305
  self.model_name = model_path
306
 
307
  try:
308
+ # Load tokenizer (can be done on CPU)
309
  self.tokenizer = AutoTokenizer.from_pretrained(
310
  model_path,
311
  trust_remote_code=True,
312
+ token=hf_token,
313
+ use_fast=False
314
+ )
315
+
316
+ # Configure 4-bit quantization
317
+ self.quantization_config = BitsAndBytesConfig(
318
+ load_in_4bit=True,
319
+ bnb_4bit_compute_dtype=torch.bfloat16,
320
+ bnb_4bit_quant_type="nf4", # NormalFloat 4-bit
321
+ bnb_4bit_use_double_quant=True, # Nested quantization for extra savings
322
  )
323
 
324
+ # Store model path - model will be loaded inside GPU context
325
  self.model_path = model_path
326
+ self.model = None
327
 
328
  except Exception as e:
329
  logger.error(f"Failed to initialize Phi-3-mini model {model_path}: {e}")
 
336
  self.streamer = None
337
 
338
  def _load_model_if_needed(self):
339
+ """Load model with 4-bit quantization only when needed inside GPU context"""
340
  if self.model is None:
341
+ logger.info("Loading model with 4-bit quantization...")
342
+ try:
343
+ self.model = AutoModelForCausalLM.from_pretrained(
344
+ self.model_path,
345
+ quantization_config=self.quantization_config,
346
+ torch_dtype=torch.bfloat16,
347
+ trust_remote_code=True,
348
+ low_cpu_mem_usage=True,
349
+ token=hf_token,
350
+ attn_implementation="eager",
351
+ device_map="auto"
352
+ )
353
+ logger.info(f"Model loaded successfully. Memory footprint reduced to ~2.2GB with 4-bit quantization")
354
+ except Exception as e:
355
+ logger.error(f"Failed to load quantized model: {e}")
356
+ raise
357
  return self.model
358
 
359
  def _format_chat_template(self, prompt: str) -> str:
 
375
  # Fallback to manual Phi-3 format
376
  return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
377
 
378
+ @spaces.GPU(duration=180)
379
  def invoke(self, input: Input, config=None) -> Output:
380
+ """Main invoke method optimized for 4-bit quantized Phi-3-mini"""
381
  start_invoke_time = time.perf_counter()
382
  current_time = datetime.now()
383
 
 
384
  if isinstance(input, dict):
385
  prompt = input.get('input', str(input))
386
  else:
387
  prompt = str(input)
388
 
389
  try:
390
+ # Load model inside GPU context
391
+ model = self._load_model_if_needed()
392
+
393
  # Format using Phi-3 chat template
394
  text = self._format_chat_template(prompt)
395
+
396
  inputs = self.tokenizer(
397
  text,
398
  return_tensors="pt",
399
  padding=True,
400
  truncation=True,
401
+ max_length=3072
402
  )
403
+
404
  # Move inputs to model device
405
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
406
+
407
+ # Generate with optimized parameters for quantized model
408
  with torch.no_grad():
409
+ outputs = model.generate(
410
  **inputs,
411
+ max_new_tokens=800,
412
  do_sample=True,
413
+ temperature=0.7,
414
  top_p=0.9,
415
  top_k=50,
416
  repetition_penalty=1.1,
417
  pad_token_id=self.tokenizer.eos_token_id,
418
  early_stopping=True,
419
+ use_cache=False, # Disable cache for compatibility
420
  past_key_values=None
421
  )
422
+
423
  # Decode only new tokens
424
  new_tokens = outputs[0][len(inputs.input_ids[0]):]
425
  result = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
426
 
427
  end_invoke_time = time.perf_counter()
428
  invoke_time = end_invoke_time - start_invoke_time
429
+ log_metric(f"LLM Invoke time (4-bit): {invoke_time:0.4f} seconds. Input length: {len(prompt)} chars. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
430
 
431
  return result if result else "I'm still learning how to respond to that properly."
432
+
433
  except Exception as e:
434
+ logger.error(f"Generation error with 4-bit model: {e}")
435
  end_invoke_time = time.perf_counter()
436
  invoke_time = end_invoke_time - start_invoke_time
437
  log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
438
  return f"[Error generating response: {str(e)}]"
439
 
440
+ @spaces.GPU(duration=240)
441
  def stream_generate(self, input: Input, config=None):
442
+ """Streaming generation with 4-bit quantized model"""
443
  start_stream_time = time.perf_counter()
444
  current_time = datetime.now()
445
+ logger.info("Starting stream_generate with 4-bit quantized model...")
446
+
447
  if isinstance(input, dict):
448
  prompt = input.get('input', str(input))
449
  else:
450
  prompt = str(input)
451
+
452
  try:
453
+ # Load quantized model inside GPU context
454
  model = self._load_model_if_needed()
455
 
456
  # Clear GPU cache
 
458
  torch.cuda.empty_cache()
459
 
460
  text = self._format_chat_template(prompt)
461
+
462
  inputs = self.tokenizer(
463
  text,
464
  return_tensors="pt",
 
466
  truncation=True,
467
  max_length=3072
468
  )
469
+
470
+ # Move inputs to model device
471
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
472
+
473
  # Initialize TextIteratorStreamer
474
  streamer = TextIteratorStreamer(
475
  self.tokenizer,
476
  skip_prompt=True,
477
  skip_special_tokens=True
478
  )
479
+
480
+ # Generation parameters optimized for 4-bit
481
  generation_kwargs = {
482
  **inputs,
483
  "max_new_tokens": 800,
 
491
  "use_cache": False,
492
  "past_key_values": None
493
  }
494
+
495
  # Start generation in background
496
  generation_thread = threading.Thread(
497
+ target=model.generate,
498
  kwargs=generation_kwargs
499
  )
500
  generation_thread.start()
501
+
502
+ # Stream results with loop detection
503
  generated_text = ""
504
  token_history = []
505
  loop_window = 20
 
512
 
513
  generated_text += new_text
514
 
515
+ # Loop detection logic
516
  tokens = self.tokenizer.tokenize(new_text)
517
  token_history.extend(tokens)
518
 
 
519
  if len(token_history) >= 2 * loop_window:
520
  recent = token_history[-loop_window:]
521
  prev = token_history[-2*loop_window:-loop_window]
522
  overlap = sum(1 for r, p in zip(recent, prev) if r == p)
523
 
524
  if overlap >= loop_threshold:
525
+ logger.warning(f"Looping detected with 4-bit model. Stopping generation.")
526
  yield "[Looping detected — generation stopped early]"
527
  break
528
 
529
  yield generated_text
530
  except Exception as e:
531
+ logger.error(f"Error in 4-bit streaming iteration: {e}")
532
  yield f"[Streaming error: {str(e)}]"
533
 
534
  generation_thread.join()
535
 
536
  end_stream_time = time.perf_counter()
537
  stream_time = end_stream_time - start_stream_time
538
+ log_metric(f"LLM Stream time (4-bit): {stream_time:0.4f} seconds. Generated length: {len(generated_text)} chars. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
539
 
540
  except Exception as e:
541
+ logger.error(f"4-bit streaming generation error: {e}")
542
  end_stream_time = time.perf_counter()
543
  stream_time = end_stream_time - start_stream_time
544
  log_metric(f"LLM Stream time (error): {stream_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
545
+ yield f"[Error in 4-bit streaming generation: {str(e)}]"
546
 
547
 
548
  @property