Spaces:

nvidia
/

kvpress

Running on Zero

App Files Community

simjeg committed on Feb 4, 2025

Commit

f55d759

verified ·

1 Parent(s): ff1672e

Add Qwen2.5-7B-Instruct-1M

Browse files

Files changed (1) hide show

app.py +21 -16

app.py CHANGED Viewed

@@ -24,13 +24,17 @@ press_dict = {
     "TOVAPress": TOVAPress,
 }
 @spaces.GPU
-def process_request(url, question, press_name, compression_ratio):
     """ """
     if press_name not in press_dict:
-        return f"Invalid press type selected: {press_name}", -1, -1
     # Fetch the Wikipedia article
     try:
@@ -45,8 +49,8 @@ def process_request(url, question, press_name, compression_ratio):
         # Initialize the press
         press = press_dict[press_name](compression_ratio)
-        num_tokens = pipe.tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
-        pred_answer = pipe(context, question=question, press=press)["answer"]
         return pred_answer, num_tokens, int(num_tokens * (1 - compression_ratio))
     except Exception as e:
@@ -61,13 +65,12 @@ def gradio_interface():
         gr.Markdown(
             """
             # Wikipedia Article Question Answering with kvpress
-            This demo uses the llama 3.1 8B Instruct model to answer questions about any given Wikipedia article.
             Under the hood, [kvpress](https://github.com/NVIDIA/kvpress) *compresses the key-value (KV) cache* associated with the article, helping reduce memory usage and accelerate decoding.
             **How to use:**
-            1. Enter a Wikipedia article URL
             2. Type your question
-            3. Select a press type and the desired compression ratio
             4. Press "Submit" to see the answer, along with token statistics before and after compression
             """
         )
@@ -77,10 +80,17 @@ def gradio_interface():
             question_input = gr.Textbox(label="Question", placeholder="Type your question here")
         with gr.Row():
             press_selector = gr.Dropdown(
                 choices=list(press_dict.keys()),
                 value="ExpectedAttentionPress",
-                label="Select Press Type",
             )
             compression_slider = gr.Slider(minimum=0.0, maximum=0.9, step=0.1, value=0.5, label="Compression Ratio")
@@ -104,7 +114,7 @@ def gradio_interface():
                     "ExpectedAttentionPress",
                     0.5,
                 ],
-                                [
                     "https://en.wikipedia.org/wiki/World_Chess_Championship_2024",
                     "On which move did the world chess championship end?",
                     "ExpectedAttentionPress",
@@ -116,7 +126,7 @@ def gradio_interface():
         submit_button.click(
             process_request,
-            inputs=[url_input, question_input, press_selector, compression_slider],
             outputs=[output, output_num_tokens, output_compressed_num_tokens],
         )
@@ -125,11 +135,6 @@ def gradio_interface():
 if __name__ == "__main__":
-    # Load pipeline
-    device = "cuda:0"
-    ckpt = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-    pipe = pipeline("kv-press-text-generation", model=ckpt, device=device, torch_dtype="auto")
     # Launch demo
     demo = gradio_interface()
     demo.launch()

     "TOVAPress": TOVAPress,
 }
+pipe_dict = dict(
+    (ckpt, pipeline("kv-press-text-generation", model=ckpt, device="cuda:0", torch_dtype="auto"))
+    for ckpt in ["meta-llama/Meta-Llama-3.1-8B-Instruct", "Qwen/Qwen2.5-7B-Instruct-1M"]
+)
 @spaces.GPU
+def process_request(url, question, press_name, pipe_name, compression_ratio):
     """ """
     if press_name not in press_dict:
+        return f"Invalid press selected: {press_name}", -1, -1
     # Fetch the Wikipedia article
     try:
         # Initialize the press
         press = press_dict[press_name](compression_ratio)
+        num_tokens = pipe_dict[pipe_name].tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
+        pred_answer = pipe_dict[pipe_name](context, question=question, press=press)["answer"]
         return pred_answer, num_tokens, int(num_tokens * (1 - compression_ratio))
     except Exception as e:
         gr.Markdown(
             """
             # Wikipedia Article Question Answering with kvpress
+            This demo uses the llama 3.1 8B Instruct model to answer questions about any given Wikipedia article.
             Under the hood, [kvpress](https://github.com/NVIDIA/kvpress) *compresses the key-value (KV) cache* associated with the article, helping reduce memory usage and accelerate decoding.
             **How to use:**
+            1. Enter a Wikipedia article URL
             2. Type your question
+            3. Select a model, a press and the desired compression ratio
             4. Press "Submit" to see the answer, along with token statistics before and after compression
             """
         )
             question_input = gr.Textbox(label="Question", placeholder="Type your question here")
         with gr.Row():
+            pipe_selector = gr.Dropdown(
+                choices=list(pipe_dict.keys()),
+                value="meta-llama/Meta-Llama-3.1-8B-Instruct",
+                label="Select Model",
+            )
             press_selector = gr.Dropdown(
                 choices=list(press_dict.keys()),
                 value="ExpectedAttentionPress",
+                label="Select Press",
             )
             compression_slider = gr.Slider(minimum=0.0, maximum=0.9, step=0.1, value=0.5, label="Compression Ratio")
                     "ExpectedAttentionPress",
                     0.5,
                 ],
+                [
                     "https://en.wikipedia.org/wiki/World_Chess_Championship_2024",
                     "On which move did the world chess championship end?",
                     "ExpectedAttentionPress",
         submit_button.click(
             process_request,
+            inputs=[url_input, question_input, press_selector, pipe_selector, compression_slider],
             outputs=[output, output_num_tokens, output_compressed_num_tokens],
         )
 if __name__ == "__main__":
     # Launch demo
     demo = gradio_interface()
     demo.launch()