jdesiree committed on
Commit
393c789
·
verified ·
1 Parent(s): e4fcfe9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -98
app.py CHANGED
@@ -301,7 +301,7 @@ class Phi2EducationalLLM(Runnable):
301
  model_path,
302
  quantization_config=quant_config,
303
  device_map="auto",
304
- dtype=torch.float16,
305
  trust_remote_code=True,
306
  low_cpu_mem_usage=True
307
  )
@@ -328,7 +328,7 @@ class Phi2EducationalLLM(Runnable):
328
  """Optimized model loading for 16GB RAM systems."""
329
  self.model = AutoModelForCausalLM.from_pretrained(
330
  model_path,
331
- dtype=torch.float16, # Use float16 to save memory
332
  device_map="cpu", # Force CPU for stability
333
  trust_remote_code=True,
334
  low_cpu_mem_usage=True,
@@ -345,7 +345,7 @@ class Phi2EducationalLLM(Runnable):
345
 
346
  self.model = AutoModelForCausalLM.from_pretrained(
347
  fallback_model,
348
- dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
349
  device_map="cpu",
350
  trust_remote_code=True,
351
  low_cpu_mem_usage=True
@@ -373,8 +373,8 @@ class Phi2EducationalLLM(Runnable):
373
  except:
374
  # Fallback for models without chat template support
375
  if "phi" in self.model_name.lower():
376
- # Phi-2 format
377
- text = f"Instruct: {SYSTEM_PROMPT}\n\nUser: {prompt}\nOutput:"
378
  else:
379
  # Generic format for other models
380
  text = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
@@ -414,129 +414,116 @@ class Phi2EducationalLLM(Runnable):
414
  return f"[Error generating response: {str(e)}]"
415
 
416
  def stream_generate(self, input: Input, config=None):
417
- """Streaming generation method for real-time response display."""
418
- import time
419
- from datetime import datetime
420
-
421
  start_stream_time = time.perf_counter()
422
  current_time = datetime.now()
423
-
424
- # --- Debug Start ---
425
  logger.info("Starting stream_generate...")
426
- logger.debug(f"Input received: {input}")
427
- # -------------------
428
-
429
- # Handle input
430
  if isinstance(input, dict):
431
  prompt = input.get('input', str(input))
432
  else:
433
  prompt = str(input)
434
-
435
  try:
436
- # === Configurable Generation Parameters ===
437
- temperature = config.get("temperature", 0.7) if config else 0.7
438
- top_k = config.get("top_k", 50) if config else 50
439
- top_p = config.get("top_p", 0.9) if config else 0.9
440
- max_new_tokens = config.get("max_new_tokens", 600) if config else 600
441
- timeout_seconds = config.get("timeout_seconds", 15) if config else 15
442
-
443
- # === Prompt Construction ===
444
  try:
445
  messages = [
446
  {"role": "system", "content": SYSTEM_PROMPT},
447
  {"role": "user", "content": prompt}
448
  ]
449
  text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
450
  except Exception as e:
451
  logger.warning(f"Failed to use chat template: {e}")
452
  if "phi" in self.model_name.lower():
453
  text = f"Instruct: {SYSTEM_PROMPT}\n\nUser: {prompt}\nOutput:"
 
454
  else:
455
  text = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
 
456
 
457
- # === Tokenize ===
458
- inputs = self.tokenizer(
459
- [text],
460
- return_tensors="pt",
461
- padding=True,
462
- truncation=True,
463
- max_length=self.tokenizer.model_max_length
464
- )
465
  if torch.cuda.is_available():
466
  inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
467
 
468
- input_ids = inputs["input_ids"]
469
- attention_mask = inputs["attention_mask"]
470
- input_length = input_ids.shape[1]
471
-
472
- # === Streaming Generation ===
473
  generated_tokens = []
474
- past_key_values = None
475
- eos_token_id = self.tokenizer.eos_token_id
476
- start_time = time.time()
477
-
478
  logger.info("Beginning token-by-token generation...")
479
-
 
 
 
 
480
  for step in range(max_new_tokens):
481
- if time.time() - start_time > timeout_seconds:
482
- logger.warning("Timeout reached. Ending stream.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  break
484
-
485
- with torch.no_grad():
486
- model_inputs = {
487
- "attention_mask": attention_mask,
488
- "use_cache": True
489
- }
490
-
491
- if past_key_values is None:
492
- model_inputs["input_ids"] = input_ids
493
- else:
494
- model_inputs["input_ids"] = next_token
495
- model_inputs["past_key_values"] = past_key_values
496
-
497
- outputs = self.model(**model_inputs)
498
- logits = outputs.logits[:, -1, :]
499
- past_key_values = outputs.past_key_values
500
-
501
- # Sampling
502
- logits = logits / temperature
503
- filtered_logits = self._top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
504
- probs = torch.nn.functional.softmax(filtered_logits, dim=-1)
505
- next_token = torch.multinomial(probs, num_samples=1)
506
-
507
- token_id = next_token.item()
508
- logger.debug(f"Step {step}: Token ID = {token_id}")
509
-
510
- if eos_token_id is not None and token_id == eos_token_id:
511
- logger.info("EOS token encountered. Ending generation.")
512
- break
513
-
514
- generated_tokens.append(token_id)
515
-
516
- # Decode efficiently
517
- new_text = self.tokenizer.decode([token_id], skip_special_tokens=True)
518
- yield new_text
519
-
520
- # Optional heuristic: stop on sentence-ending punctuation
521
- if new_text.strip().endswith(('.', '?', '!')):
522
- logger.info("Sentence-ending punctuation hit. Ending early.")
523
- break
524
-
525
- # Prepare for next step
526
- input_ids = next_token
527
- attention_mask = torch.cat([
528
- attention_mask,
529
- torch.ones((1, 1), dtype=attention_mask.dtype, device=attention_mask.device)
530
- ], dim=-1)
531
-
532
- # Final output logging
533
  final_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
534
- logger.info(f"Streaming complete. Tokens generated: {len(generated_tokens)}")
535
-
 
 
 
 
536
  end_stream_time = time.perf_counter()
537
  stream_time = end_stream_time - start_stream_time
538
  log_metric(f"LLM Stream time: {stream_time:0.4f} seconds. Tokens generated: {len(generated_tokens)}. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
539
-
 
540
  except Exception as e:
541
  logger.error(f"Streaming generation error: {e}")
542
  end_stream_time = time.perf_counter()
@@ -879,7 +866,7 @@ mathjax_config = '''
879
  window.MathJax = {
880
  tex: {
881
  inlineMath: [['\\\\(', '\\\\)']],
882
- displayMath: [['$', '$'], ['\\\\[', '\\\\]']],
883
  packages: {'[+]': ['ams']}
884
  },
885
  svg: {fontCache: 'global'},
@@ -1077,7 +1064,7 @@ def create_interface():
1077
 
1078
  with gr.Column(elem_classes=["main-container"]):
1079
  # Title Section
1080
- gr.HTML('<div class="title-header"><h1>🎓 Mimir 🎓</h1></div>')
1081
 
1082
  # Chat Section
1083
  with gr.Row():
 
301
  model_path,
302
  quantization_config=quant_config,
303
  device_map="auto",
304
+ torch_dtype=torch.float16,
305
  trust_remote_code=True,
306
  low_cpu_mem_usage=True
307
  )
 
328
  """Optimized model loading for 16GB RAM systems."""
329
  self.model = AutoModelForCausalLM.from_pretrained(
330
  model_path,
331
+ torch_dtype=torch.float16, # Use float16 to save memory
332
  device_map="cpu", # Force CPU for stability
333
  trust_remote_code=True,
334
  low_cpu_mem_usage=True,
 
345
 
346
  self.model = AutoModelForCausalLM.from_pretrained(
347
  fallback_model,
348
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
349
  device_map="cpu",
350
  trust_remote_code=True,
351
  low_cpu_mem_usage=True
 
373
  except:
374
  # Fallback for models without chat template support
375
  if "phi" in self.model_name.lower():
376
+ # Phi-2 proper format
377
+ text = f"{SYSTEM_PROMPT}\n\nQuestion: {prompt}\nAnswer:"
378
  else:
379
  # Generic format for other models
380
  text = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
 
414
  return f"[Error generating response: {str(e)}]"
415
 
416
  def stream_generate(self, input: Input, config=None):
417
+ """Streaming generation method for real-time response display"""
 
 
 
418
  start_stream_time = time.perf_counter()
419
  current_time = datetime.now()
 
 
420
  logger.info("Starting stream_generate...")
421
+
422
+ # Handle both string and dict inputs for flexibility
 
 
423
  if isinstance(input, dict):
424
  prompt = input.get('input', str(input))
425
  else:
426
  prompt = str(input)
427
+
428
  try:
429
+ # Prepare input text with better error handling
 
 
 
 
 
 
 
430
  try:
431
  messages = [
432
  {"role": "system", "content": SYSTEM_PROMPT},
433
  {"role": "user", "content": prompt}
434
  ]
435
  text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
436
+ logger.info("Successfully used chat template")
437
  except Exception as e:
438
  logger.warning(f"Failed to use chat template: {e}")
439
  if "phi" in self.model_name.lower():
440
  text = f"Instruct: {SYSTEM_PROMPT}\n\nUser: {prompt}\nOutput:"
441
+ logger.info("Using Phi-2 format")
442
  else:
443
  text = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
444
+ logger.info("Using generic format")
445
 
446
+ inputs = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=1024)
 
 
 
 
 
 
 
447
  if torch.cuda.is_available():
448
  inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
449
 
450
+ # Initialize for streaming
 
 
 
 
451
  generated_tokens = []
452
+ max_new_tokens = 600
 
 
 
453
  logger.info("Beginning token-by-token generation...")
454
+
455
+ # Generate token by token
456
+ current_input_ids = inputs.input_ids
457
+ current_attention_mask = inputs.attention_mask
458
+
459
  for step in range(max_new_tokens):
460
+ try:
461
+ with torch.no_grad():
462
+ outputs = self.model(
463
+ input_ids=current_input_ids,
464
+ attention_mask=current_attention_mask,
465
+ use_cache=True
466
+ )
467
+
468
+ # Get next token probabilities
469
+ next_token_logits = outputs.logits[:, -1, :]
470
+
471
+ # Apply temperature and sampling
472
+ next_token_logits = next_token_logits / 0.7
473
+
474
+ # Apply top-k and top-p filtering
475
+ filtered_logits = self._top_k_top_p_filtering(next_token_logits, top_k=50, top_p=0.9)
476
+
477
+ # Sample next token
478
+ probs = torch.nn.functional.softmax(filtered_logits, dim=-1)
479
+ next_token = torch.multinomial(probs, num_samples=1)
480
+
481
+ # Check for end of sequence
482
+ if next_token.item() == self.tokenizer.eos_token_id:
483
+ logger.info(f"Reached EOS token at step {step}")
484
+ break
485
+
486
+ # Add to generated tokens
487
+ generated_tokens.append(next_token.item())
488
+
489
+ # Decode and yield partial result every few tokens for efficiency
490
+ if step % 3 == 0 or step < 10: # Yield more frequently at start
491
+ partial_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
492
+ if partial_text.strip(): # Only yield non-empty text
493
+ yield partial_text
494
+
495
+ # Safety checks to prevent infinite loops
496
+ if step > 10 and len(generated_tokens) == 0:
497
+ logger.error("No tokens generated after 10 steps, breaking")
498
+ break
499
+
500
+ if step > 50 and len(partial_text.strip()) < 10:
501
+ logger.warning("Very little text generated, continuing...")
502
+
503
+ # Update input for next iteration
504
+ current_input_ids = torch.cat([current_input_ids, next_token], dim=-1)
505
+ current_attention_mask = torch.cat([
506
+ current_attention_mask,
507
+ torch.ones((1, 1), dtype=current_attention_mask.dtype, device=current_attention_mask.device)
508
+ ], dim=-1)
509
+
510
+ except Exception as e:
511
+ logger.error(f"Error in generation step {step}: {e}")
512
  break
513
+
514
+ # Final result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  final_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
516
+ if final_text:
517
+ yield final_text
518
+ else:
519
+ logger.error("No final text generated")
520
+ yield "I'm having trouble generating a response. Please try again."
521
+
522
  end_stream_time = time.perf_counter()
523
  stream_time = end_stream_time - start_stream_time
524
  log_metric(f"LLM Stream time: {stream_time:0.4f} seconds. Tokens generated: {len(generated_tokens)}. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
525
+ logger.info(f"Stream generation completed: {len(generated_tokens)} tokens in {stream_time:.2f}s")
526
+
527
  except Exception as e:
528
  logger.error(f"Streaming generation error: {e}")
529
  end_stream_time = time.perf_counter()
 
866
  window.MathJax = {
867
  tex: {
868
  inlineMath: [['\\\\(', '\\\\)']],
869
+ displayMath: [['$$', '$$'], ['\\\\[', '\\\\]']],
870
  packages: {'[+]': ['ams']}
871
  },
872
  svg: {fontCache: 'global'},
 
1064
 
1065
  with gr.Column(elem_classes=["main-container"]):
1066
  # Title Section
1067
+ gr.HTML('<div class="title-header"><h1> Mimir 🎓</h1></div>')
1068
 
1069
  # Chat Section
1070
  with gr.Row():