Spaces:

jdesiree
/

Mimir

Sleeping

App Files Files Community

jdesiree committed on Sep 1, 2025

Commit

05be05b

verified ·

1 Parent(s): 9a5ee2e

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -173

app.py CHANGED Viewed

@@ -235,215 +235,94 @@ logger = logging.getLogger(__name__)
 class Qwen25SmallLLM(LLM):
     model: Any = None
     tokenizer: Any = None
     def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
         super().__init__()
-        logger.info(f"Loading model with BitsAndBytes quantization: {model_path}")
-        # Configure BitsAndBytes quantization
-        if use_4bit:
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for better performance
-                bnb_4bit_use_double_quant=True,         # Double quantization for additional memory savings
-                bnb_4bit_quant_type="nf4"               # Normal Float 4-bit quantization
-            )
-            logger.info("Using 4-bit quantization with BitsAndBytes")
-        else:
-            quantization_config = BitsAndBytesConfig(
-                load_in_8bit=True,
-                llm_int8_enable_fp32_cpu_offload=True   # Offload to CPU if needed
-            )
-            logger.info("Using 8-bit quantization with BitsAndBytes")
         try:
             # Load tokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_path,
-                trust_remote_code=True
-            )
-            # Load model with quantization
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_path,
-                quantization_config=quantization_config,
-                device_map="auto",  # Automatically distribute across available devices
-                torch_dtype=torch.bfloat16,  # Use bfloat16 for memory efficiency
-                trust_remote_code=True,
-                low_cpu_mem_usage=True,  # Reduce CPU memory usage during loading
-                max_memory={0: "15GB"} if torch.cuda.is_available() else None  # Limit GPU memory usage
-            )
-            # Ensure pad token is set
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-            logger.info("Model loaded successfully with BitsAndBytes quantization")
-        except Exception as e:
-            logger.error(f"Failed to load model with quantization: {e}")
-            logger.info("Falling back to standard loading...")
-            # Fallback to standard loading if quantization fails
-            self._load_fallback_model(model_path)
-    def _load_fallback_model(self, model_path: str):
-        """Fallback method to load model without quantization if needed."""
-        try:
             self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_path,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto" if torch.cuda.is_available() else None,
                 trust_remote_code=True,
                 low_cpu_mem_usage=True
             )
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-            logger.info("Model loaded with fallback method")
-        except Exception as e:
-            logger.error(f"Fallback model loading also failed: {e}")
-            raise e
-    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
-        """Generate text response using the quantized local model."""
-        try:
-            # Format the conversation
-            messages = [
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt}
-            ]
-            # Apply chat template
-            text = self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True
-            )
-            # Tokenize with proper padding
-            model_inputs = self.tokenizer(
-                [text],
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=2048  # Limit input length to prevent memory issues
-            )
-            # Move to model device if available
-            if torch.cuda.is_available():
-                model_inputs = {k: v.to(self.model.device) for k, v in model_inputs.items()}
-            # Generate with memory-efficient settings
-            with torch.no_grad():
-                generated_ids = self.model.generate(
-                    **model_inputs,
-                    max_new_tokens=800,  # Reduced for memory efficiency
-                    do_sample=True,
-                    temperature=0.7,
-                    top_p=0.9,
-                    top_k=50,
-                    repetition_penalty=1.1,
-                    pad_token_id=self.tokenizer.eos_token_id,
-                    use_cache=True,  # Enable KV cache for efficiency
-                    attention_mask=model_inputs.get('attention_mask', None)
-                )
-            # Decode response (only new tokens)
-            generated_ids = [
-                output_ids[len(input_ids):]
-                for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-            ]
-            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            # Clean up GPU memory
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            return response.strip()
-        except torch.cuda.OutOfMemoryError:
-            logger.error("GPU out of memory during generation")
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            return "I apologize, but I'm experiencing memory constraints. Please try a shorter message or restart the application."
         except Exception as e:
-            logger.error(f"Error in model generation: {e}")
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            return f"I apologize, but I encountered an error while generating a response: {str(e)}"
-    @property
-    def _llm_type(self) -> str:
-        return "qwen25_small_quantized"
-    model: Any = None
-    tokenizer: Any = None
-    def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct"):
-        super().__init__()
-        logger.info(f"Loading model: {model_path}")
-        # Load tokenizer and model
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             device_map="auto" if torch.cuda.is_available() else None,
-            trust_remote_code=True
         )
-        logger.info("Model loaded successfully")
     def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
-        """Generate text response using the local model."""
         try:
-            # Format the conversation
             messages = [
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt}
             ]
-            # Apply chat template
-            text = self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True
-            )
-            # Tokenize
-            model_inputs = self.tokenizer([text], return_tensors="pt")
             if torch.cuda.is_available():
-                model_inputs = model_inputs.to(self.model.device)
-            # Generate
             with torch.no_grad():
-                generated_ids = self.model.generate(
-                    **model_inputs,
-                    max_new_tokens=1000,
                     do_sample=True,
                     temperature=0.7,
                     top_p=0.9,
                     pad_token_id=self.tokenizer.eos_token_id
                 )
-            # Decode response
-            generated_ids = [
-                output_ids[len(input_ids):]
-                for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-            ]
-            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            return response.strip()
         except Exception as e:
-            logger.error(f"Error in model generation: {e}")
-            return f"I apologize, but I encountered an error while generating a response: {str(e)}"
     @property
     def _llm_type(self) -> str:
         return "qwen25_small"
 # Example of how the AI should use the tool
 def example_usage_for_ai():
     """
@@ -495,6 +374,38 @@ def get_agent():
         agent = create_langchain_agent()
     return agent
 # --- UI: MathJax Configuration ---
 mathjax_config = '''
 <script>

 class Qwen25SmallLLM(LLM):
+    """LangChain LLM wrapper around a locally loaded causal language model.
+
+    The model is first loaded with BitsAndBytes quantization (4-bit or 8-bit,
+    depending on ``use_4bit``); if that load raises, an unquantized load is
+    attempted instead.
+    """
+    # Both attributes are set by __init__.
     model: Any = None
     tokenizer: Any = None
     def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
+        """Load the tokenizer and the model for ``model_path``.
+
+        Args:
+            model_path: Model identifier or path passed to ``from_pretrained``.
+            use_4bit: If true, request 4-bit NF4 quantization; otherwise 8-bit.
+
+        Raises:
+            Exception: Whatever the tokenizer load or the fallback model load
+                raises; only the quantized model load is caught and retried.
+        """
         super().__init__()
+        logger.info(f"Loading model: {model_path} (use_4bit={use_4bit})")
+        # Load the tokenizer outside the try-block below: a tokenizer failure is
+        # not a quantization failure, and the fallback path does not load a
+        # tokenizer, so swallowing it would only defer the crash to the
+        # pad-token check with a misleading AttributeError on None.
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         try:
+            if use_4bit:
+                quant_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.bfloat16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4"
+                )
+                logger.info("Using 4-bit quantization with BitsAndBytes")
+            else:
+                quant_config = BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_enable_fp32_cpu_offload=True
+                )
+                logger.info("Using 8-bit quantization with BitsAndBytes")
+            # Try quantized load
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_path,
+                quantization_config=quant_config,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
                 trust_remote_code=True,
                 low_cpu_mem_usage=True
             )
         except Exception as e:
+            logger.warning(f"Quantized load failed, falling back: {e}")
+            self._load_fallback_model(model_path)
+        # Ensure pad token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+    def _load_fallback_model(self, model_path: str):
+        """Load the model without quantization and store it on ``self.model``.
+
+        Uses float16 with ``device_map="auto"`` when CUDA is available, and
+        float32 with no device map otherwise. Errors are not caught here.
+        """
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True
         )
     def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        """Generate a reply to ``prompt`` using the chat template.
+
+        Args:
+            prompt: User message; it is sent together with ``SYSTEM_PROMPT``.
+            stop: Optional stop sequences. The reply is cut at the earliest
+                occurrence of any of them.
+
+        Returns:
+            The stripped generated text, or a bracketed error string if any
+            exception is raised during generation (the error is also logged).
+        """
         try:
             messages = [
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt}
             ]
+            text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=2048)
             if torch.cuda.is_available():
+                inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
             with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=800,
                     do_sample=True,
                     temperature=0.7,
                     top_p=0.9,
+                    top_k=50,
+                    repetition_penalty=1.1,
                     pad_token_id=self.tokenizer.eos_token_id
                 )
+            # Index by key, not attribute: after the CUDA move above `inputs` is
+            # a plain dict, and `inputs.input_ids` would raise AttributeError.
+            prompt_ids = inputs["input_ids"]
+            # Keep only the tokens generated after the prompt.
+            new_tokens = [out[len(inp):] for inp, out in zip(prompt_ids, outputs)]
+            response = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
+            if stop:
+                # Honour caller-supplied stop sequences by truncating at the
+                # earliest one that appears; empty strings are ignored.
+                cut_points = [idx for idx in (response.find(seq) for seq in stop if seq) if idx != -1]
+                if cut_points:
+                    response = response[:min(cut_points)]
+            return response.strip()
         except Exception as e:
+            logger.error(f"Generation error: {e}")
+            return f"[Error generating response: {str(e)}]"
     @property
     def _llm_type(self) -> str:
+        """Return the identifier string for this LLM implementation."""
         return "qwen25_small"
 # Example of how the AI should use the tool
 def example_usage_for_ai():
     """
         agent = create_langchain_agent()
     return agent
+def create_langchain_agent():
+    """Build and return the conversational LangChain agent.
+
+    Creates the local LLM, a windowed conversation memory and the tool list,
+    then wires them together with ``initialize_agent``. Any exception is
+    logged and re-raised unchanged.
+    """
+    try:
+        # Model first: it is the step that does the heavy loading work.
+        language_model = Qwen25SmallLLM(model_path="Qwen/Qwen2.5-3B-Instruct")
+        # Windowed memory; k=5 keeps the last 5 exchanges.
+        history = ConversationBufferWindowMemory(
+            k=5,
+            return_messages=True,
+            memory_key="chat_history",
+        )
+        # Only the educational graph tool is registered.
+        toolbox = [create_educational_graph_tool()]
+        return initialize_agent(
+            llm=language_model,
+            tools=toolbox,
+            memory=history,
+            agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
+            handle_parsing_errors=True,
+            verbose=True,
+        )
+    except Exception as err:
+        logger.error(f"Error creating LangChain agent: {err}")
+        raise
 # --- UI: MathJax Configuration ---
 mathjax_config = '''
 <script>