Commit 28f7799 · add Yi-6B-200K
Parent(s): 87be2eb

app.py CHANGED
@@ -106,7 +106,7 @@ system_message = system_message.replace("CURRENT_DATE", str(datetime.date.today(
 
 # MISTRAL ONLY
 default_system_understand_message = (
-    "I understand, I am a
+    "I understand, I am a ##LLM_MODEL### chatbot with speech by Coqui team."
 )
 system_understand_message = os.environ.get(
     "SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
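This change replaces a hardcoded model name with a ##LLM_MODEL### placeholder that the generate_local hunk further down fills in per selected model. A minimal sketch of that substitution, using the placeholder token from this diff and one of the values generate_local assigns below:

# Sketch of the placeholder substitution introduced by this commit.
# The token ##LLM_MODEL### is from the diff; "Yi" is one of the values
# the new generate_local branch assigns.
default_system_understand_message = (
    "I understand, I am a ##LLM_MODEL### chatbot with speech by Coqui team."
)
msg = default_system_understand_message.replace("##LLM_MODEL###", "Yi")
print(msg)  # I understand, I am a Yi chatbot with speech by Coqui team.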
@@ -132,24 +132,26 @@ ROLE_PROMPTS["AI Beard The Pirate"]= pirate_system_message
 
 
 
-### WILL USE LOCAL MISTRAL OR ZEPHYR
+### WILL USE LOCAL MISTRAL OR ZEPHYR OR YI
+### While zephyr and yi will use half GPU to fit all into 16GB, XTTS will use at most 5GB VRAM
 
 from huggingface_hub import hf_hub_download
 print("Downloading LLM")
-
-
-print("Downloading Zephyr")
+print("Downloading Zephyr 7B beta")
 #Zephyr
 hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
-# use new gguf format
 zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf"
 
-print("Downloading Mistral")
+print("Downloading Mistral 7B Instruct")
 #Mistral
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
-# use new gguf format
 mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
 
+print("Downloading Yi-6B-200k")
+#Yi-6B-200K
+hf_hub_download(repo_id="TheBloke/Yi-6B-GGUF", local_dir=".", filename="yi-6b-200k.Q5_K_M.gguf")
+yi_model_path="./yi-6b-200k.Q5_K_M.gguf"
+
 
 from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
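Two asides on the download block. First, the Yi hunk pulls filename yi-6b-200k.Q5_K_M.gguf from repo TheBloke/Yi-6B-GGUF, while the model description later in this diff links TheBloke/Yi-6B-200K-GGUF; the 200K filename normally lives in the latter repo, so one of the two repo ids looks unintended. Second, hf_hub_download returns the local path of the fetched file, so the hardcoded *_model_path assignments could capture the return value instead; a sketch, not what the commit does:

from huggingface_hub import hf_hub_download

# hf_hub_download returns the path of the downloaded file, so the model
# path need not be hardcoded. Repo id and filename are copied from the diff.
yi_model_path = hf_hub_download(
    repo_id="TheBloke/Yi-6B-GGUF",
    local_dir=".",
    filename="yi-6b-200k.Q5_K_M.gguf",
)
print(yi_model_path)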
@@ -164,7 +166,10 @@ print("Running LLM Mistral")
 llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 
 print("Running LLM Zephyr")
-llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-
+llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-5,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+
+print("Running Yi LLM")
+llm_zephyr = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS-5,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 
 
 # Mistral formatter
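Note that both additions in this hunk assign to llm_zephyr, so the second Llama() call loads the Yi weights over the Zephyr handle and the Zephyr model becomes unreachable. A sketch with a separate handle (the name llm_yi is hypothetical, not in the commit; the keyword arguments, including GPU_LAYERS-5, and the path variables are copied from the diff):

from llama_cpp import Llama

# Hypothetical variant: one handle per model instead of rebinding llm_zephyr.
llm_zephyr = Llama(model_path=zephyr_model_path, n_gpu_layers=GPU_LAYERS-5,
                   max_new_tokens=256, context_window=4096, n_ctx=4096,
                   n_batch=128, verbose=LLAMA_VERBOSE)
llm_yi = Llama(model_path=yi_model_path, n_gpu_layers=GPU_LAYERS-5,
               max_new_tokens=256, context_window=4096, n_ctx=4096,
               n_batch=128, verbose=LLAMA_VERBOSE)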
@@ -230,8 +235,15 @@ def generate_local(
         formatted_prompt = format_prompt_zephyr(prompt, history,system_message=sys_message)
         llm = llm_zephyr
     else:
-
-
+        if "yi" in llm_model.lower():
+            llm_provider= "01.ai"
+            llm_model = "Yi"
+        else:
+            llm_provider= "Mistral"
+            llm_model = "Mistral"
+        sys_message= system_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
+        sys_system_understand_message = system_understand_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
+        formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
         llm = llm_mistral
 
 
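Worth spelling out: the new else-branch only relabels the system messages. When "yi" appears in the model name, llm_provider/llm_model are set to 01.ai/Yi, but the prompt is still built with format_prompt_mistral and the handle is still llm_mistral, so both the Yi and Mistral choices run the Mistral weights. A condensed restatement of the branch (the wrapper function is illustrative; system_message and system_understand_message come from earlier in the file):

def resolve_system_messages(llm_model: str):
    # Condensed restatement of the else-branch added in this hunk.
    if "yi" in llm_model.lower():
        llm_provider, llm_model = "01.ai", "Yi"
    else:
        llm_provider, llm_model = "Mistral", "Mistral"
    sys_message = (system_message
                   .replace("##LLM_MODEL###", llm_model)
                   .replace("##LLM_MODEL_PROVIDER###", llm_provider))
    sys_understand = (system_understand_message
                      .replace("##LLM_MODEL###", llm_model)
                      .replace("##LLM_MODEL_PROVIDER###", llm_provider))
    return sys_message, sys_understand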
@@ -680,9 +692,11 @@ EXAMPLES = [
     [[],"AI Assistant","Speak in French, tell me how are you doing?"],
     [[],"AI Assistant","Antworten Sie mir von nun an auf Deutsch"],
     [[],"AI Beard The Pirate","Who are you?"],
+    [[],"AI Beard The Pirate","告诉我你的冒险经历"],
+
 ]
 
-MODELS = ["Mistral","Zephyr"]
+MODELS = ["Mistral 7B Instruct","Zephyr 7B Beta","Yi 6B"]
 
 OTHER_HTML=f"""<div>
 <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
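For the non-English examples above: "Antworten Sie mir von nun an auf Deutsch" is German for "From now on, answer me in German", and the newly added "告诉我你的冒险经历" is Chinese for "Tell me about your adventures", presumably added to exercise Yi's English/Chinese bilingual training.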
@@ -699,7 +713,7 @@ with gr.Blocks(title=title) as demo:
     with gr.Row():
         model_selected = gr.Dropdown(
             label="Select Instuct LLM Model to Use",
-            info="Zephyr
+            info="Mistral, Zephyr, Yi : 5-bit GGUF models are preloaded",
             choices=MODELS,
             max_choices=1,
             value=MODELS[0],
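The dropdown's selected string appears to flow into generate_local's llm_model argument, so the display names in MODELS must keep the substrings ("zephyr", "yi") that the dispatch above matches on. A standalone sketch of the widget as configured in this diff:

import gradio as gr

MODELS = ["Mistral 7B Instruct", "Zephyr 7B Beta", "Yi 6B"]

with gr.Blocks() as demo:
    # Same Dropdown configuration as the diff; the selection string is later
    # matched on the "zephyr"/"yi" substrings in generate_local.
    model_selected = gr.Dropdown(
        label="Select Instruct LLM Model to Use",  # the diff spells this "Instuct"
        info="Mistral, Zephyr, Yi : 5-bit GGUF models are preloaded",
        choices=MODELS,
        max_choices=1,
        value=MODELS[0],
    )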
@@ -789,7 +803,8 @@ It relies on following models :
 Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
 LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model, GGUF Q5_K_M quantized version used locally via llama_cpp[huggingface_hub](TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
 LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
-
+LLM Yi : [Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/Yi-6B-200K-GGUF).
+Text to Speech : [Coqui's XTTS V2](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
 - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml