salomonsky committed
Commit 02f697f · verified · 1 Parent(s): 8f514e3

Update app.py

Files changed (1)
  1. app.py +12 -8
app.py CHANGED
@@ -1,6 +1,9 @@
+import concurrent.futures
 import gradio as gr
+from dogpile.cache import make_region
 from huggingface_hub import InferenceClient
-import concurrent.futures
+
+cache = make_region().configure('dogpile.cache.memory', thread_local=True)
 
 system_prompt = ""
 system_prompt_sent = False
@@ -36,6 +39,10 @@ def generate(prompt, history, temperature=0.9, max_new_tokens=4096, top_p=0.95,
     )
 
     formatted_prompt = format_prompt(prompt, history)
+    cache_key = f"generate:{formatted_prompt}:{temperature}:{max_new_tokens}:{top_p}:{repetition_penalty}"
+    cached_response = cache.get(cache_key)
+    if cached_response is not None:
+        return cached_response
 
     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=True)
     output = ""
@@ -44,17 +51,14 @@ def generate(prompt, history, temperature=0.9, max_new_tokens=4096, top_p=0.95,
         output += response.token.text
         yield output
 
+    cache.set(cache_key, output)
+
     return output
 
-def run_chatbot(prompt, history, temperature, max_new_tokens, top_p, repetition_penalty):
-    global client
-    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
-    with concurrent.futures.ProcessPoolExecutor() as executor:
-        result = executor.submit(generate, prompt, history, temperature, max_new_tokens, top_p, repetition_penalty)
-        return result
+client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 
 chat_interface = gr.ChatInterface(
-    fn=run_chatbot,
+    fn=generate,
     chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=False, likeable=False, layout="vertical", height=900),
     concurrency_limit=9,
     theme="soft",
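
The change drops the run_chatbot wrapper (which submitted generate to a concurrent.futures.ProcessPoolExecutor) in favour of a module-level InferenceClient plus an in-process dogpile.cache memory region, keyed on the formatted prompt and the sampling parameters. A minimal, self-contained sketch of that get/compute/set pattern follows; region, expensive_call and cached_generate are illustrative names, not part of app.py, and the sketch compares against dogpile's NO_VALUE sentinel, which region.get returns on a miss.

from dogpile.cache import make_region
from dogpile.cache.api import NO_VALUE

# In-memory backend: cached entries live only as long as this process.
region = make_region().configure("dogpile.cache.memory")

def expensive_call(prompt: str, temperature: float) -> str:
    # Stand-in for the real model call (client.text_generation in app.py).
    return f"response to {prompt!r} at temperature {temperature}"

def cached_generate(prompt: str, temperature: float) -> str:
    key = f"generate:{prompt}:{temperature}"
    hit = region.get(key)  # returns the NO_VALUE sentinel when the key is absent
    if hit is not NO_VALUE:
        return hit
    result = expensive_call(prompt, temperature)
    region.set(key, result)
    return result

print(cached_generate("hello", 0.9))  # computed and stored
print(cached_generate("hello", 0.9))  # served from the memory region

Because the backend is plain per-process memory, cached responses are not shared across worker processes and do not survive an app restart.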