future-html

Sleeping

App Files Files Community

aadya1762 committed on Mar 16, 2025

Commit

5160420

1 Parent(s): 06a0392

increase cache limit -> fewer recompilations by pytorch

Browse files

Files changed (5) hide show

app.py +39 -19
gemmademo/__init__.py +6 -1
gemmademo/_chat.py +16 -9
gemmademo/_model.py +48 -42
gemmademo/_utils.py +1 -0

app.py CHANGED Viewed

@@ -5,11 +5,17 @@
 # Add a button to clear the chat history.
 import streamlit as st
-from gemmademo import HuggingFaceGemmaModel, StreamlitChat, PromptManager, huggingface_login
 import os
 import sys
 import subprocess
 def main():
     # Page configuration
     st.set_page_config(page_title="Gemma Chat Demo", layout="wide")
@@ -25,7 +31,7 @@ def main():
     # Sidebar for login and configuration
     with st.sidebar:
         st.title("Gemma Chat Configuration")
         # Login section
         st.subheader("Login")
         if not st.session_state.authenticated:
@@ -42,31 +48,31 @@ def main():
             if st.button("Logout"):
                 st.session_state.authenticated = False
                 st.rerun()
         # Model selection
         st.subheader("Model Selection")
         model_options = list(HuggingFaceGemmaModel.AVAILABLE_MODELS.keys())
         selected_model = st.selectbox(
             "Select Gemma Model",
             model_options,
-            index=model_options.index(st.session_state.selected_model)
         )
         if selected_model != st.session_state.selected_model:
             st.session_state.selected_model = selected_model
             st.rerun()
         # Task selection
         st.subheader("Task Selection")
         task_options = ["Question Answering", "Text Generation", "Code Completion"]
         selected_task = st.selectbox(
             "Select Task",
             task_options,
-            index=task_options.index(st.session_state.selected_task)
         )
         if selected_task != st.session_state.selected_task:
             st.session_state.selected_task = selected_task
             st.rerun()
         # Clear chat history button
         if st.button("Clear Chat History"):
             if "chat_instance" in st.session_state:
@@ -76,37 +82,51 @@ def main():
     # Main content area
     if st.session_state.authenticated:
         # Initialize model with the selected configuration
-        model_name = HuggingFaceGemmaModel.AVAILABLE_MODELS[st.session_state.selected_model]["name"]
         model = HuggingFaceGemmaModel(name=model_name)
         # Load model (will use cached version if available)
         with st.spinner(f"Loading {model_name}..."):
             model.load_model(device_map="auto")
         # Initialize prompt manager with selected task
         prompt_manager = PromptManager(task=st.session_state.selected_task)
         # Initialize chat interface
         chat = StreamlitChat(model=model, prompt_manager=prompt_manager)
         st.session_state.chat_instance = chat
         # Run the chat interface
         chat.run()
     else:
-        st.info("Please login with your Hugging Face token in the sidebar to start chatting.")
 if __name__ == "__main__":
     # Check if the script is being run directly with Python
     # If so, launch Streamlit programmatically
-    if not os.environ.get('STREAMLIT_RUN_APP'):
-        os.environ['STREAMLIT_RUN_APP'] = '1'
         # Get the current script path
         script_path = os.path.abspath(__file__)
         # Launch streamlit run with port 7860 and headless mode
-        cmd = [sys.executable, "-m", "streamlit", "run", script_path,
-               "--server.port", "7860",
-               "--server.address", "0.0.0.0",
-               "--server.headless", "true"]
         subprocess.run(cmd)
     else:
         # Normal Streamlit execution

 # Add a button to clear the chat history.
 import streamlit as st
+from gemmademo import (
+    HuggingFaceGemmaModel,
+    StreamlitChat,
+    PromptManager,
+    huggingface_login,
+)
 import os
 import sys
 import subprocess
 def main():
     # Page configuration
     st.set_page_config(page_title="Gemma Chat Demo", layout="wide")
     # Sidebar for login and configuration
     with st.sidebar:
         st.title("Gemma Chat Configuration")
         # Login section
         st.subheader("Login")
         if not st.session_state.authenticated:
             if st.button("Logout"):
                 st.session_state.authenticated = False
                 st.rerun()
         # Model selection
         st.subheader("Model Selection")
         model_options = list(HuggingFaceGemmaModel.AVAILABLE_MODELS.keys())
         selected_model = st.selectbox(
             "Select Gemma Model",
             model_options,
+            index=model_options.index(st.session_state.selected_model),
         )
         if selected_model != st.session_state.selected_model:
             st.session_state.selected_model = selected_model
             st.rerun()
         # Task selection
         st.subheader("Task Selection")
         task_options = ["Question Answering", "Text Generation", "Code Completion"]
         selected_task = st.selectbox(
             "Select Task",
             task_options,
+            index=task_options.index(st.session_state.selected_task),
         )
         if selected_task != st.session_state.selected_task:
             st.session_state.selected_task = selected_task
             st.rerun()
         # Clear chat history button
         if st.button("Clear Chat History"):
             if "chat_instance" in st.session_state:
     # Main content area
     if st.session_state.authenticated:
         # Initialize model with the selected configuration
+        model_name = HuggingFaceGemmaModel.AVAILABLE_MODELS[
+            st.session_state.selected_model
+        ]["name"]
         model = HuggingFaceGemmaModel(name=model_name)
         # Load model (will use cached version if available)
         with st.spinner(f"Loading {model_name}..."):
             model.load_model(device_map="auto")
         # Initialize prompt manager with selected task
         prompt_manager = PromptManager(task=st.session_state.selected_task)
         # Initialize chat interface
         chat = StreamlitChat(model=model, prompt_manager=prompt_manager)
         st.session_state.chat_instance = chat
         # Run the chat interface
         chat.run()
     else:
+        st.info(
+            "Please login with your Hugging Face token in the sidebar to start chatting."
+        )
 if __name__ == "__main__":
     # Check if the script is being run directly with Python
     # If so, launch Streamlit programmatically
+    if not os.environ.get("STREAMLIT_RUN_APP"):
+        os.environ["STREAMLIT_RUN_APP"] = "1"
         # Get the current script path
         script_path = os.path.abspath(__file__)
         # Launch streamlit run with port 7860 and headless mode
+        cmd = [
+            sys.executable,
+            "-m",
+            "streamlit",
+            "run",
+            script_path,
+            "--server.port",
+            "7860",
+            "--server.address",
+            "0.0.0.0",
+            "--server.headless",
+            "true",
+        ]
         subprocess.run(cmd)
     else:
         # Normal Streamlit execution

gemmademo/__init__.py CHANGED Viewed

@@ -3,4 +3,9 @@ from ._model import HuggingFaceGemmaModel
 from ._prompts import PromptManager
 from ._utils import huggingface_login
-__all__ = ["StreamlitChat", "HuggingFaceGemmaModel", "PromptManager", "huggingface_login"]

 from ._prompts import PromptManager
 from ._utils import huggingface_login
+__all__ = [
+    "StreamlitChat",
+    "HuggingFaceGemmaModel",
+    "PromptManager",
+    "huggingface_login",
+]

gemmademo/_chat.py CHANGED Viewed

@@ -2,23 +2,25 @@ import streamlit as st
 from ._model import HuggingFaceGemmaModel
 from ._prompts import PromptManager
 class StreamlitChat:
     """
     A class that handles the chat interface for the Gemma model.
     Features:
     ✅ A Streamlit-based chatbot UI.
     ✅ Maintains chat history across reruns.
     ✅ Uses Gemma (Hugging Face) model for generating responses.
     ✅ Formats user inputs before sending them to the model.
     """
     def __init__(self, model: HuggingFaceGemmaModel, prompt_manager: PromptManager):
         self.model = model
         self.prompt_manager = prompt_manager
     def run(self):
         self._chat()
     def _chat(self):
         st.title("Using model : " + self.model.get_model_name())
         self._build_states()
@@ -27,25 +29,30 @@ class StreamlitChat:
         for message in st.session_state.messages:
             with st.chat_message(message["role"]):
                 st.markdown(message["content"])
         # React to user input
         if prompt := st.chat_input("What is up?"):
-            prompt = prompt.replace("\n", "  \n") # Only double spaced backslash is rendered in streamlit for newlines.
             with st.chat_message("User"):
                 st.markdown(prompt)
             st.session_state.messages.append({"role": "User", "content": prompt})
-            prompt = self.prompt_manager.get_prompt(user_input=st.session_state.messages[-1]["content"])
-            response = self.model.generate_response(prompt).replace("\n", "  \n") # Only double spaced backslash is rendered in streamlit for newlines.
             with st.chat_message("Gemma"):
                 st.markdown(response)
             st.session_state.messages.append({"role": "Gemma", "content": response})
     def _build_states(self):
         # Initialize chat history
         if "messages" not in st.session_state:
             st.session_state.messages = []
     def clear_history(self):
         st.session_state.messages = []

 from ._model import HuggingFaceGemmaModel
 from ._prompts import PromptManager
 class StreamlitChat:
     """
     A class that handles the chat interface for the Gemma model.
     Features:
     ✅ A Streamlit-based chatbot UI.
     ✅ Maintains chat history across reruns.
     ✅ Uses Gemma (Hugging Face) model for generating responses.
     ✅ Formats user inputs before sending them to the model.
     """
     def __init__(self, model: HuggingFaceGemmaModel, prompt_manager: PromptManager):
         self.model = model
         self.prompt_manager = prompt_manager
     def run(self):
         self._chat()
     def _chat(self):
         st.title("Using model : " + self.model.get_model_name())
         self._build_states()
         for message in st.session_state.messages:
             with st.chat_message(message["role"]):
                 st.markdown(message["content"])
         # React to user input
         if prompt := st.chat_input("What is up?"):
+            prompt = prompt.replace(
+                "\n", "  \n"
+            )  # Streamlit's Markdown renders a line break only when the newline is preceded by two spaces.
             with st.chat_message("User"):
                 st.markdown(prompt)
             st.session_state.messages.append({"role": "User", "content": prompt})
+            prompt = self.prompt_manager.get_prompt(
+                user_input=st.session_state.messages[-1]["content"]
+            )
+            response = self.model.generate_response(prompt).replace(
+                "\n", "  \n"
+            )  # Streamlit's Markdown renders a line break only when the newline is preceded by two spaces.
             with st.chat_message("Gemma"):
                 st.markdown(response)
             st.session_state.messages.append({"role": "Gemma", "content": response})
     def _build_states(self):
         # Initialize chat history
         if "messages" not in st.session_state:
             st.session_state.messages = []
     def clear_history(self):
         st.session_state.messages = []

gemmademo/_model.py CHANGED Viewed

@@ -3,17 +3,24 @@ import torch
 from typing import Dict, Optional
 import streamlit as st
-torch.classes.__path__ = [] # add this line to manually set it to empty.
 def load_model(name: str, device_map: str = "cpu"):
     """
     Model loading function that loads the model without caching
     """
     import torch._dynamo
     torch._dynamo.config.suppress_errors = True
     tokenizer = AutoTokenizer.from_pretrained(name)
     model = AutoModelForCausalLM.from_pretrained(
         name,
         torch_dtype=torch.bfloat16,
@@ -24,7 +31,7 @@ def load_model(name: str, device_map: str = "cpu"):
         use_cache=True,
         load_in_8bit=True,
     )
     pipe = pipeline(
         "text-generation",
         model=model,
@@ -36,11 +43,12 @@ def load_model(name: str, device_map: str = "cpu"):
         max_new_tokens=512,
         pad_token_id=tokenizer.eos_token_id,
         eos_token_id=tokenizer.eos_token_id,
-        return_full_text=False
     )
     return tokenizer, model, pipe
 class HuggingFaceGemmaModel:
     """
     A class for the Hugging Face Gemma model. Handles model selection, loading, and inference.
@@ -49,7 +57,7 @@ class HuggingFaceGemmaModel:
     Example
     -------
     Select Gemma 2B, 7B etc.
     Additional Information:
     ----------------------
     Complete Information: https://huggingface.co/google/gemma-2b
@@ -60,40 +68,40 @@ class HuggingFaceGemmaModel:
     - google/gemma-7b (7B parameters, base)
     - google/gemma-7b-it (7B parameters, instruction-tuned)
     """
     AVAILABLE_MODELS: Dict[str, Dict] = {
         "gemma-2b": {
             "name": "google/gemma-2b",
             "description": "2B parameters, base model",
-            "type": "base"
         },
         "gemma-2b-it": {
             "name": "google/gemma-2b-it",
             "description": "2B parameters, instruction-tuned",
-            "type": "instruct"
         },
         "gemma-7b": {
             "name": "google/gemma-7b",
             "description": "7B parameters, base model",
-            "type": "base"
         },
         "gemma-7b-it": {
             "name": "google/gemma-7b-it",
             "description": "7B parameters, instruction-tuned",
-            "type": "instruct"
-        }
     }
     def __init__(self, name: str = "google/gemma-2b"):
         self.name = name
         self.model = None
         self.tokenizer = None
         self.pipeline = None
     def load_model(self, device_map: str = "cpu"):
         """
         Load the model using session state
         Args:
             device_map: Device mapping strategy (should be "cpu" for CPU-only inference)
         """
@@ -101,85 +109,83 @@ class HuggingFaceGemmaModel:
         model_key = f"gemma_model_{self.name}"
         tokenizer_key = f"gemma_tokenizer_{self.name}"
         pipeline_key = f"gemma_pipeline_{self.name}"
         # Check if model is already loaded in session state
-        if (model_key not in st.session_state or
-            tokenizer_key not in st.session_state or
-            pipeline_key not in st.session_state):
             # Show loading indicator
             with st.spinner(f"Loading {self.name}..."):
                 tokenizer, model, pipe = load_model(self.name, device_map)
                 # Store in session state
                 st.session_state[tokenizer_key] = tokenizer
                 st.session_state[model_key] = model
                 st.session_state[pipeline_key] = pipe
         # Get model from session state
         self.tokenizer = st.session_state[tokenizer_key]
         self.model = st.session_state[model_key]
         self.pipeline = st.session_state[pipeline_key]
         return self
     def generate_response(
-        self,
-        prompt: str,
         max_length: int = 512,
         temperature: float = 0.7,
         num_return_sequences: int = 1,
-        **kwargs
     ) -> str:
         """
         Generate a response using the text generation pipeline
         Args:
             prompt: Input text
             max_length: Maximum number of new tokens to generate
             temperature: Sampling temperature (higher = more creative)
             num_return_sequences: Number of responses to generate
             **kwargs: Additional generation parameters for the pipeline
         Returns:
             str: Generated response
         """
         if not self.pipeline:
             self.load_model()
         # Update generation config with any provided kwargs
         generation_config = {
             "max_new_tokens": max_length,
             "temperature": temperature,
             "num_return_sequences": num_return_sequences,
             "do_sample": True,
-            **kwargs
         }
         # Generate response using the pipeline
-        outputs = self.pipeline(
-            prompt,
-            **generation_config
-        )
         # Extract the generated text
         if num_return_sequences == 1:
             response = outputs[0]["generated_text"]
         else:
             # Join multiple sequences if requested
             response = "\n---\n".join(output["generated_text"] for output in outputs)
         return response.strip()
     def get_model_info(self) -> Dict:
         """Return information about the model"""
         return {
             "name": self.name,
             "loaded": self.model is not None,
-            "pipeline_ready": self.pipeline is not None
         }
     def get_model_name(self) -> str:
         """Return the name of the model"""
         return self.name

 from typing import Dict, Optional
 import streamlit as st
+torch.classes.__path__ = (
+    []
+)  # add this line to manually set it to empty. If not done, this throws a warning.
 def load_model(name: str, device_map: str = "cpu"):
     """
     Model loading function that loads the model without caching
     """
     import torch._dynamo
+    torch._dynamo.config.suppress_errors = True  # On compile errors, fall back to eager execution instead of raising
+    torch._dynamo.config.cache_size_limit = 64  # Increase cache limit
+    torch._dynamo.config.force_inference_mode = True  # Reduce recompilations
     torch._dynamo.config.suppress_errors = True
     tokenizer = AutoTokenizer.from_pretrained(name)
     model = AutoModelForCausalLM.from_pretrained(
         name,
         torch_dtype=torch.bfloat16,
         use_cache=True,
         load_in_8bit=True,
     )
     pipe = pipeline(
         "text-generation",
         model=model,
         max_new_tokens=512,
         pad_token_id=tokenizer.eos_token_id,
         eos_token_id=tokenizer.eos_token_id,
+        return_full_text=False,
     )
     return tokenizer, model, pipe
 class HuggingFaceGemmaModel:
     """
     A class for the Hugging Face Gemma model. Handles model selection, loading, and inference.
     Example
     -------
     Select Gemma 2B, 7B etc.
     Additional Information:
     ----------------------
     Complete Information: https://huggingface.co/google/gemma-2b
     - google/gemma-7b (7B parameters, base)
     - google/gemma-7b-it (7B parameters, instruction-tuned)
     """
     AVAILABLE_MODELS: Dict[str, Dict] = {
         "gemma-2b": {
             "name": "google/gemma-2b",
             "description": "2B parameters, base model",
+            "type": "base",
         },
         "gemma-2b-it": {
             "name": "google/gemma-2b-it",
             "description": "2B parameters, instruction-tuned",
+            "type": "instruct",
         },
         "gemma-7b": {
             "name": "google/gemma-7b",
             "description": "7B parameters, base model",
+            "type": "base",
         },
         "gemma-7b-it": {
             "name": "google/gemma-7b-it",
             "description": "7B parameters, instruction-tuned",
+            "type": "instruct",
+        },
     }
     def __init__(self, name: str = "google/gemma-2b"):
         self.name = name
         self.model = None
         self.tokenizer = None
         self.pipeline = None
     def load_model(self, device_map: str = "cpu"):
         """
         Load the model using session state
         Args:
             device_map: Device mapping strategy (should be "cpu" for CPU-only inference)
         """
         model_key = f"gemma_model_{self.name}"
         tokenizer_key = f"gemma_tokenizer_{self.name}"
         pipeline_key = f"gemma_pipeline_{self.name}"
         # Check if model is already loaded in session state
+        if (
+            model_key not in st.session_state
+            or tokenizer_key not in st.session_state
+            or pipeline_key not in st.session_state
+        ):
             # Show loading indicator
             with st.spinner(f"Loading {self.name}..."):
                 tokenizer, model, pipe = load_model(self.name, device_map)
                 # Store in session state
                 st.session_state[tokenizer_key] = tokenizer
                 st.session_state[model_key] = model
                 st.session_state[pipeline_key] = pipe
         # Get model from session state
         self.tokenizer = st.session_state[tokenizer_key]
         self.model = st.session_state[model_key]
         self.pipeline = st.session_state[pipeline_key]
         return self
     def generate_response(
+        self,
+        prompt: str,
         max_length: int = 512,
         temperature: float = 0.7,
         num_return_sequences: int = 1,
+        **kwargs,
     ) -> str:
         """
         Generate a response using the text generation pipeline
         Args:
             prompt: Input text
             max_length: Maximum number of new tokens to generate
             temperature: Sampling temperature (higher = more creative)
             num_return_sequences: Number of responses to generate
             **kwargs: Additional generation parameters for the pipeline
         Returns:
             str: Generated response
         """
         if not self.pipeline:
             self.load_model()
         # Update generation config with any provided kwargs
         generation_config = {
             "max_new_tokens": max_length,
             "temperature": temperature,
             "num_return_sequences": num_return_sequences,
             "do_sample": True,
+            **kwargs,
         }
         # Generate response using the pipeline
+        outputs = self.pipeline(prompt, **generation_config)
         # Extract the generated text
         if num_return_sequences == 1:
             response = outputs[0]["generated_text"]
         else:
             # Join multiple sequences if requested
             response = "\n---\n".join(output["generated_text"] for output in outputs)
         return response.strip()
     def get_model_info(self) -> Dict:
         """Return information about the model"""
         return {
             "name": self.name,
             "loaded": self.model is not None,
+            "pipeline_ready": self.pipeline is not None,
         }
     def get_model_name(self) -> str:
         """Return the name of the model"""
         return self.name

gemmademo/_utils.py CHANGED Viewed

@@ -3,4 +3,5 @@ def huggingface_login(token: str):
     Login to Hugging Face using the token
     """
     from huggingface_hub import login
     login(token=token)

     Login to Hugging Face using the token
     """
     from huggingface_hub import login
     login(token=token)