Update app.py
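Rewrites the draft app into a working voice-to-voice pipeline: the per-model endpoint constants are collapsed into a shared BASE_URL and HEADERS, a system prompt lets the model request images via ##IMG: ... :IMG## tags (extracted by a new tagger() helper), and the Whisper speech-to-text, Mistral-Nemo chat, Bark text-to-speech, and Flux image-generation steps are implemented end to end behind a Gradio UI.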
app.py CHANGED
@@ -1,9 +1,3 @@
-#from huggingfaceinferenceclient import HuggingFaceInferenceClient
-#from outpaintprocessor import DynamicImageOutpainter
-#from aivideopipeline import AIImageVideoPipeline
-#from mmig import MultiModelImageGenerator
-
-
 import os
 import requests
 from PIL import Image
@@ -12,100 +6,161 @@ from huggingface_hub import InferenceClient
 from IPython.display import Audio, display
 import gradio as gr

+# Tokens for Hugging Face API
 read_token = os.getenv('HF_READ')
 write_token = os.getenv('HF_WRITE')
-#chatmodel
-chatmodel="mistralai/Mistral-Nemo-Instruct-2407"
-# Whisper for Speech-to-Text
-WHISPER_API_URL = "https://api-inference.huggingface.co/models/distil-whisper/distil-large-v2"
-WHISPER_HEADERS = {"Authorization": "Bearer " + read_token}
-# Bark for Text-to-Speech
-BARK_API_URL = "https://api-inference.huggingface.co/models/suno/bark"
-BARK_HEADERS = {"Authorization": "Bearer "+read_token}
-# Flux for Image Generation
-FLUX_API_URL = "https://api-inference.huggingface.co/models/enhanceaiteam/Flux-uncensored"
-FLUX_HEADERS = {"Authorization": "Bearer "+read_token}

-
-
-
-
-
-
-
-
-
-
-# Chatbot Logic with Hugging Face InferenceClient
+# Model configurations
+HEADERS = {"Authorization": f"Bearer {read_token}"}
+BASE_URL='https://api-inference.huggingface.co/models/'
+CHAT_MODEL = "mistralai/Mistral-Nemo-Instruct-2407"
+WHISPER_API_URL = "distil-whisper/distil-large-v2"
+BARK_API_URL = "suno/bark"
+FLUX_API_URL = "enhanceaiteam/Flux-uncensored"
+
+# Initialize Hugging Face Inference Client
 client = InferenceClient(api_key=read_token)

+# Chatbot system prompt
+system_prompt = """
+You are an empathetic and knowledgeable AI assistant designed to engage in meaningful conversations,
+assist with tasks, and provide accurate information.
+You can also generate vivid visuals!
+To request an image, include a description between the IMG tags, like this:
+##IMG: A serene forest at dawn with a golden glow:IMG##
+"""
+
+chat_history = []
+
+def tagger(bot_response):
+    """
+    Extract tags from the bot response and return the filtered response text and tags.
+
+    Args:
+        bot_response (str): The full response text from the chatbot.
+
+    Returns:
+        tuple: A tuple containing:
+            - filtered_response (str): The response text with tags removed.
+            - tags (dict): A dictionary of extracted tags and their values.
+    """
+    import re
+
+    tags = {}
+    filtered_response = bot_response
+
+    # Match patterns like ##IMG: ... :IMG##
+    img_pattern = r"##IMG:(.+?):IMG##"
+    img_matches = re.findall(img_pattern, bot_response)
+
+    if img_matches:
+        tags['images'] = img_matches
+        # Remove image tags from the response text
+        filtered_response = re.sub(img_pattern, "", filtered_response).strip()
+
+    # Additional tags can be added here as needed
+    # For example, if you want to support ##AUDIO: ... :AUDIO## tags:
+    # audio_pattern = r"##AUDIO:(.+?):AUDIO##"
+    # audio_matches = re.findall(audio_pattern, bot_response)
+    # if audio_matches:
+    #     tags['audio'] = audio_matches
+    #     filtered_response = re.sub(audio_pattern, "", filtered_response).strip()
+
+    return filtered_response, tags
+
+def speech_to_text(filename):
+    """Convert speech to text using Whisper API."""
+    try:
+        with open(filename, "rb") as f:
+            data = f.read()
+        response = requests.post(BASE_URL+WHISPER_API_URL, headers=HEADERS, data=data)
+        if response.status_code == 200:
+            return response.json().get("text", "Could not recognize speech")
+        print(f"Whisper Error: {response.status_code} - {response.text}")
+    except Exception as e:
+        print(f"Exception in speech_to_text: {e}")
+    return None
+
 def chatbot_logic(input_text):
-
+    """Generate a response from the chatbot and handle tags."""
+    global chat_history
+    chat_history.append({"role": "user", "content": input_text})
+    messages = [{"role": "system", "content": system_prompt}] + chat_history
+
     try:
         completion = client.chat.completions.create(
-            model=
-            messages=messages,
+            model=CHAT_MODEL,
+            messages=messages,
             max_tokens=500
         )
-
-
-
-
+        response_text = completion.choices[0].message["content"]
+
+        # Use tagger to process tags and clean response text
+        response_text, tags = tagger(response_text)
+        chat_history.append({"role": "assistant", "content": response_text})
+
+        # Extract image prompt from tags if present
+        image_prompt = tags.get("images")[0] if "images" in tags else None

+        return response_text, image_prompt
+    except Exception as e:
+        print(f"Chatbot Error: {e}")
+        return None, None

 def text_to_speech(text):
-
-
-
-
-
-        print(f"Error: {response.status_code} - {response.text}")
-
+    """Convert text to speech using Bark API."""
+    try:
+        response = requests.post(BASE_URL+BARK_API_URL, headers=HEADERS, json={"inputs": text})
+        if response.status_code == 200:
+            return response.content
+        print(f"Bark Error: {response.status_code} - {response.text}")
+    except Exception as e:
+        print(f"Exception in text_to_speech: {e}")
+    return None

 def generate_image(prompt):
-
-
-
-
-
-
-
-
-
-
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if not audio_output:
-        return f"Error synthesizing response: {response_text}", None, None
-
-    # Step 4: Image Generation
-    generated_image = generate_image(response_text)
-
-    return response_text, Audio(audio_output, autoplay=True), generated_image
+    """Generate an image using the Flux API."""
+    try:
+        response = requests.post(BASE_URL+FLUX_API_URL, headers=HEADERS, json={"inputs": prompt})
+        if response.status_code == 200:
+            return Image.open(BytesIO(response.content))
+        print(f"Flux Error: {response.status_code} - {response.text}")
+    except Exception as e:
+        print(f"Exception in generate_image: {e}")
+    return None
+
+def process_chat(audio_file):
+    """Process user input, generate response, and optionally create media."""
+    # Step 1: Speech-to-text
+    recognized_text = speech_to_text(audio_file)
+    if not recognized_text:
+        return "Speech recognition failed.", None, None
+
+    # Step 2: Chatbot response
+    response_text, image_prompt = chatbot_logic(recognized_text)
+    if not response_text:
+        return "Failed to generate chatbot response.", None, None
+
+    # Step 3: Text-to-speech
+    audio_response = text_to_speech(response_text)

-
-
+    # Step 4: Optional image generation
+    generated_image = generate_image(image_prompt) if image_prompt else None
+
+    return response_text, Audio(audio_response, autoplay=True), generated_image
+
+def create_ui():
+    """Build and launch the Gradio interface."""
+    with gr.Blocks(title="Enhanced Voice-to-Voice Chatbot with Images") as ui:
+        gr.Markdown("## Voice-to-Voice AI Chatbot\nTalk to the AI and see its responses, including images it generates!")

         audio_input = gr.Audio(source="upload", type="filepath", label="Input Audio File")
-        submit_button = gr.Button("
+        submit_button = gr.Button("Submit")

         with gr.Row():
-            chatbot_response = gr.Textbox(label="Chatbot Response", lines=
-
+            chatbot_response = gr.Textbox(label="Chatbot Response", lines=4)
         with gr.Row():
-            audio_output = gr.Audio(label="
+            audio_output = gr.Audio(label="Audio Response")
             image_output = gr.Image(label="Generated Image")

         submit_button.click(
@@ -117,6 +172,5 @@ def create_ui():

     return ui

-# Run the Gradio Interface
 if __name__ == "__main__":
     create_ui().launch(debug=True)
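For quick verification, the ##IMG tag protocol that the system prompt introduces can be exercised on its own. A minimal sketch reusing the exact pattern from tagger(); the sample response string is invented for illustration:

import re

# Non-greedy match between ##IMG: and :IMG##, same pattern as tagger()
img_pattern = r"##IMG:(.+?):IMG##"

sample = "Here you go! ##IMG: A serene forest at dawn with a golden glow:IMG##"
prompts = re.findall(img_pattern, sample)          # [' A serene forest at dawn with a golden glow']
cleaned = re.sub(img_pattern, "", sample).strip()  # 'Here you go!'

Note that the captured prompt keeps its leading space, so stripping each match before handing it to generate_image() would be a reasonable refinement.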