add sliders
Files changed:
- gemmademo/_chat.py    +72 -5
- gemmademo/_model.py   +18 -11
- gemmademo/_prompts.py  +8 -38
gemmademo/_chat.py  CHANGED

@@ -33,13 +33,18 @@ class GradioChat:
         if model_name in self.models_cache:
             return self.models_cache[model_name]

-        model = LlamaCppGemmaModel(name=model_name).load_model()
+        model = LlamaCppGemmaModel(name=model_name).load_model(
+            system_prompt=self.prompt_manager.get_system_prompt()
+        )
         self.models_cache[model_name] = model
+        self.current_model_name = model_name
         return model

     def _load_task(self, task_name: str):
         """Loads the task dynamically when switching tasks."""
-        return PromptManager(task=task_name)
+        self.current_task_name = task_name
+        self.prompt_manager = PromptManager(task=task_name)
+        return

     def _chat(self):
         def chat_fn(message, history, selected_model, selected_task):

@@ -49,18 +54,22 @@

             # Reload model if changed, using cache when possible
             if selected_model != self.current_model_name:
-                self.current_model_name = selected_model
                 self.model = self._load_model(selected_model)
                 # Clear message history when model changes
                 self.model.messages = []

             # Reload task if changed
             if selected_task != self.current_task_name:
-                self.current_task_name = selected_task
-                self.prompt_manager = self._load_task(selected_task)
+                self._load_task(selected_task)
                 # Clear message history when task changes
                 if self.model:
                     self.model.messages = []
+                    self.model.messages = [
+                        {
+                            "role": "system",
+                            "content": self.prompt_manager.get_system_prompt(),
+                        }
+                    ]

             # Generate response using updated model & prompt manager
             prompt = self.prompt_manager.get_prompt(user_input=message)

@@ -137,6 +146,64 @@
             task_dropdown.change(
                 _update_examples, task_dropdown, examples_list.dataset
             )
+            temperature_slider = gr.Slider(
+                minimum=0.1, maximum=2, value=1.0, label="Temperature"
+            )
+            gr.Markdown(
+                "**Temperature:** Controls the randomness of the model's output. Lower values make the output more deterministic."
+            )
+            temperature_slider.change(
+                fn=lambda temp: setattr(self.model, "temperature", temp),
+                inputs=temperature_slider,
+            )
+
+            top_p_slider = gr.Slider(
+                minimum=0.1, maximum=1.0, value=0.9, label="Top P"
+            )
+            gr.Markdown(
+                "**Top P:** Limits the sampling to a subset of the most probable tokens. Lower values make the output more focused."
+            )
+            top_p_slider.change(
+                fn=lambda top_p: setattr(self.model, "top_p", top_p),
+                inputs=top_p_slider,
+            )
+
+            top_k_slider = gr.Slider(
+                minimum=1, maximum=100, value=50, label="Top K"
+            )
+            gr.Markdown(
+                "**Top K:** Limits the sampling to the top K most probable tokens. Lower values make the output more focused."
+            )
+            top_k_slider.change(
+                fn=lambda top_k: setattr(self.model, "top_k", top_k),
+                inputs=top_k_slider,
+            )
+
+            repetition_penalty_slider = gr.Slider(
+                minimum=1.0, maximum=2.0, value=1.0, label="Repetition Penalty"
+            )
+            gr.Markdown(
+                "**Repetition Penalty:** Penalizes repeated tokens to reduce repetition in the output."
+            )
+            repetition_penalty_slider.change(
+                fn=lambda penalty: setattr(
+                    self.model, "repeat_penalty", penalty
+                ),
+                inputs=repetition_penalty_slider,
+            )
+
+            max_tokens_slider = gr.Slider(
+                minimum=512, maximum=2048, value=1024, label="Max Tokens"
+            )
+            gr.Markdown(
+                "**Max Tokens:** Sets the maximum number of tokens the model can generate in a single response."
+            )
+            max_tokens_slider.change(
+                fn=lambda max_tokens: setattr(
+                    self.model, "max_tokens", max_tokens
+                ),
+                inputs=max_tokens_slider,
+            )

         demo.launch()
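The slider wiring above follows the usual Gradio event pattern: each gr.Slider's change event pushes the new value onto the loaded model object, so the next generation call picks it up without reloading anything. A minimal, self-contained sketch of that pattern follows; DemoModel and its attribute names are illustrative stand-ins, not the Space's classes.

# Minimal sketch of the slider -> model-attribute pattern used above.
# DemoModel is an illustrative stand-in, not part of the Space.
import gradio as gr


class DemoModel:
    """Holds sampling settings that a generation call would read."""

    def __init__(self):
        self.temperature = 1.0
        self.top_p = 0.9


model = DemoModel()

with gr.Blocks() as demo:
    temperature_slider = gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Temperature")
    top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top P")

    # Each .change() handler copies the new slider value onto the model object;
    # nothing is rendered back, so no outputs are declared.
    temperature_slider.change(
        fn=lambda t: setattr(model, "temperature", t),
        inputs=temperature_slider,
    )
    top_p_slider.change(
        fn=lambda p: setattr(model, "top_p", p),
        inputs=top_p_slider,
    )

if __name__ == "__main__":
    demo.launch()

Because the handlers only mutate Python state, the next generation request sees the updated values without reloading the model; the five sliders added in this commit rely on the same behaviour.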
gemmademo/_model.py  CHANGED

@@ -50,7 +50,14 @@ class LlamaCppGemmaModel:
         self.model = None  # Instance of Llama from llama.cpp
         self.messages = []

-    def load_model(self, n_ctx: int = 2048, n_gpu_layers: int = 0):
+        # Model response generation attributes
+        self.max_tokens = 512
+        self.temperature = 0.7
+        self.top_p = 0.95
+        self.top_k = 40
+        self.repeat_penalty = 1.1
+
+    def load_model(self, n_ctx: int = 2048, n_gpu_layers: int = 0, system_prompt=""):
         """
         Load the model. If the model file does not exist, it will be downloaded.
         Uses caching to avoid reloading models unnecessarily.

@@ -94,6 +101,8 @@ class LlamaCppGemmaModel:

         _threads = min(2, os.cpu_count() or 1)

+        _sys_prompt = {"role": "system", "content": system_prompt}
+
         self.model = Llama(
             model_path=model_path,
             n_threads=_threads,

@@ -102,8 +111,11 @@ class LlamaCppGemmaModel:
             n_gpu_layers=n_gpu_layers,
             n_batch=8,
             verbose=False,
+            chat_format="chatml",
         )

+        self.messages.append(_sys_prompt)
+
         # Cache the model for future use
         LlamaCppGemmaModel._model_cache[cache_key] = self.model
         return self

@@ -111,11 +123,6 @@ class LlamaCppGemmaModel:
     def generate_response(
         self,
         prompt: str,
-        max_tokens: int = 512,
-        temperature: float = 0.7,
-        top_p: float = 0.95,
-        top_k: int = 40,
-        repeat_penalty: float = 1.1,
     ):
         """
         Generate a response using the llama.cpp model with optimized parameters.

@@ -138,11 +145,11 @@ class LlamaCppGemmaModel:

         response_stream = self.model.create_chat_completion(
             messages=self.messages,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repeat_penalty=repeat_penalty,
+            max_tokens=self.max_tokens,
+            temperature=self.temperature,
+            top_p=self.top_p,
+            top_k=self.top_k,
+            repeat_penalty=self.repeat_penalty,
             stream=True,
         )
         self.messages.append({"role": "assistant", "content": ""})
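For reference, a hedged sketch of how per-instance sampling values feed llama-cpp-python's streaming chat API, roughly what generate_response does after this change. The model path, messages, and scalar values below are placeholders; Llama and create_chat_completion are the real llama-cpp-python entry points.

# Sketch of streaming generation with llama-cpp-python, mirroring generate_response.
from llama_cpp import Llama

llm = Llama(
    model_path="models/gemma.gguf",  # placeholder path to a GGUF file
    n_ctx=2048,
    n_threads=2,
    chat_format="chatml",
    verbose=False,
)

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Name three uses of llama.cpp."},
]

# Per-instance sampling settings, kept as plain scalars.
max_tokens, temperature, top_p, top_k, repeat_penalty = 512, 0.7, 0.95, 40, 1.1

stream = llm.create_chat_completion(
    messages=messages,
    max_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    repeat_penalty=repeat_penalty,
    stream=True,
)

# With stream=True, chunks arrive in OpenAI-style delta format; collect the text.
reply = ""
for chunk in stream:
    delta = chunk["choices"][0]["delta"]
    reply += delta.get("content") or ""
print(reply)

Keeping these values on the instance instead of in the generate_response signature is what lets the new sliders in _chat.py adjust sampling between turns while the cached Llama object stays loaded.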
gemmademo/_prompts.py  CHANGED

@@ -6,48 +6,18 @@ class PromptManager:
     various tasks such as Question Answering, Text Generation, and Code Completion.
     It raises a ValueError if an unsupported task is specified.
     """
+
     def __init__(self, task):
         self.task = task

     def get_prompt(self, user_input):
+        return user_input
+
+    def get_system_prompt(self):
+        """Returns the system prompt based on the specified task."""
         if self.task == "Question Answering":
-            return self.get_question_answering_prompt(user_input)
+            return "You are a helpful AI assistant. Answer questions concisely and accurately."
         elif self.task == "Text Generation":
-            return self.get_text_generation_prompt(user_input)
+            return "You are a creative AI writer. Generate engaging and coherent text based on the input."
         elif self.task == "Code Completion":
-            return self.get_code_completion_prompt(user_input)
+            return "You are a coding assistant. Complete code snippets correctly without explanations."
-        else:
-            raise ValueError(f"Task {self.task} not supported")
-
-    def get_question_answering_prompt(self, user_input):
-        """
-        Format user input for question answering task
-        """
-        prompt = f"""You are a helpful AI assistant. Answer the following question accurately and concisely.
-Only answer the question, do not provide any other information.
-
-Question: {user_input}
-
-Answer:"""
-        return prompt
-
-    def get_text_generation_prompt(self, user_input):
-        """
-        Format user input for text generation task
-        """
-        prompt = f"""Continue the following text in a coherent and engaging way:
-Only continue the text, do not provide any other information.
-
-{user_input}
-
-Continuation:"""
-        return prompt
-
-    def get_code_completion_prompt(self, user_input):
-        """
-        Format user input for code completion task
-        """
-        prompt = f"""Complete the following code snippet directly with proper syntax and
-without explanations or extra text:
-{user_input}"""
-        return prompt