Mood_Based_Music_Recommender

Sleeping

App Files Community

syedmudassir16 committed on Sep 24, 2024

Commit

9c835dd

verified ·

1 Parent(s): dc40641

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -441

app.py CHANGED Viewed

@@ -1,217 +1,48 @@
-from huggingface_hub import InferenceClient
-from transformers import pipeline
-import gradio as gr
-import edge_tts
-import tempfile
 import os
-import wave
 import io
-import asyncio
-import emoji
-# Initialize the inference client with your Hugging Face token
 client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
-# Initialize the ASR pipeline
-asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
-# Define the description for the Gradio interface
-DESCRIPTION = """# <center><b>Mood-Based Music Recommender⚡</b></center>
-### <center>Hi! I'm a music recommender app.
-### <center>What kind of music do you want to listen to, or how are you feeling today?</center>
-"""
-def speech_to_text(speech_path):
-    """Converts speech to text using the ASR pipeline."""
-    return asr(speech_path)["text"]
-def classify_mood(input_string):
-    """Classifies the mood based on keywords in the input string."""
-    input_string = input_string.lower()
-    mood_words = {"happy", "sad", "instrumental", "party"}
-    for word in mood_words:
-        if word in input_string:
-            return word, True
-    return None, False
 def format_prompt(message, history):
-    """Formats the prompt including fixed instructions and conversation history."""
     fixed_prompt = """
-            You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user’s mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".
-            Note: Do not write anything else other than the classified mood if classified.
-            Note: If any question or any user text cannot be classified, follow up with a question to know the user's mood until you classify the mood.
-            Note: Mood should be classified only from any of these 4 classes {Happy, Sad, Instrumental, Party}, if not any of these 4 then continue with a follow-up question until you classify the mood.
-            Note: if user asks something like i need a coffee then do not classify the mood directly and ask more follow-up questions as asked in examples.
-            Examples
-            User: What is C programming?
-            LLM Response: C programming is a programming language. How are you feeling now after knowing the answer?
-            User: Can I get a coffee?
-            LLM Response: It sounds like you're in need of a little pick-me-up. How are you feeling right now? Are you looking for something upbeat, something to relax to, or maybe some instrumental music while you enjoy your coffee?
-            User: I feel like rocking
-            LLM Response: Party
-            User: I'm feeling so energetic today!
-            LLM Response: Happy
-            User: I'm feeling down today.
-            LLM Response: Sad
-            User: I'm ready to have some fun tonight!
-            LLM Response: Party
-            User: I need some background music while I am stuck in traffic.
-            LLM Response: Instrumental
-            User: Hi
-            LLM Response: Hi, how are you doing?
-            User: Feeling okay only.
-            LLM Response: Are you having a good day?
-            User: I don't know
-            LLM Response: Do you want to listen to some relaxing music?
-            User: No
-            LLM Response: How about listening to some rock and roll music?
-            User: Yes
-            LLM Response: Party
-            User: Where do I find an encyclopedia?
-            LLM Response: You can find it in any of the libraries or on the Internet. Does this answer make you happy?
-            User: I need a coffee
-            LLM Response: It sounds like you're in need of a little pick-me-up. How are you feeling right now? Are you looking for something upbeat, something to relax to, or maybe some instrumental music while you enjoy your coffee?
-            User: I just got promoted at work!
-            LLM Response: Happy
-            User: Today is my birthday!
-            LLM Response: Happy
-            User: I won a prize in the lottery.
-            LLM Response: Happy
-            User: I am so excited about my vacation next week!
-            LLM Response: Happy
-            User: I aced my exams!
-            LLM Response: Happy
-            User: I had a wonderful time with my family today.
-            LLM Response: Happy
-            User: I just finished a great workout!
-            LLM Response: Happy
-            User: I am feeling really good about myself today.
-            LLM Response: Happy
-            User: I finally finished my project and it was a success!
-            LLM Response: Happy
-            User: I just heard my favorite song on the radio.
-            LLM Response: Happy
-            User: My pet passed away yesterday.
-            LLM Response: Sad
-            User: I lost my job today.
-            LLM Response: Sad
-            User: I'm feeling really lonely.
-            LLM Response: Sad
-            User: I didn't get the results I wanted.
-            LLM Response: Sad
-            User: I had a fight with my best friend.
-            LLM Response: Sad
-            User: I'm feeling really overwhelmed with everything.
-            LLM Response: Sad
-            User: I just got some bad news.
-            LLM Response: Sad
-            User: I'm missing my family.
-            LLM Response: Sad
-            User: I am feeling really down today.
-            LLM Response: Sad
-            User: Nothing seems to be going right.
-            LLM Response: Sad
-            User: I need some music while I study.
-            LLM Response: Instrumental
-            User: I want to listen to something soothing while I work.
-            LLM Response: Instrumental
-            User: Do you have any recommendations for background music?
-            LLM Response: Instrumental
-            User: I'm looking for some relaxing tunes.
-            LLM Response: Instrumental
-            User: I need some music to focus on my tasks.
-            LLM Response: Instrumental
-            User: Can you suggest some ambient music for meditation?
-            LLM Response: Instrumental
-            User: What's good for background music during reading?
-            LLM Response: Instrumental
-            User: I need some calm music to help me sleep.
-            LLM Response: Instrumental
-            User: I prefer instrumental music while cooking.
-            LLM Response: Instrumental
-            User: What's the best music to play while doing yoga?
-            LLM Response: Instrumental
-            User: Let's have a blast tonight!
-            LLM Response: Party
-            User: I'm in the mood to dance!
-            LLM Response: Party
-            User: I want to celebrate all night long!
-            LLM Response: Party
-            User: Time to hit the club!
-            LLM Response: Party
-            User: I feel like partying till dawn.
-            LLM Response: Party
-            User: Let's get this party started!
-            LLM Response: Party
-            User: I'm ready to party hard tonight.
-            LLM Response: Party
-            User: I'm in the mood for some loud music and dancing!
-            LLM Response: Party
-            User: Tonight's going to be epic!
-            LLM Response: Party
-            User: Lets turn up the music and have some fun!
-            LLM Response: Party
-"""  # Include your fixed prompt and instructions here
-    prompt = f"{fixed_prompt}"
     for user_prompt, bot_response in history:
-        prompt += f"\nUser: {user_prompt}\nLLM Response: {bot_response}"
     prompt += f"\nUser: {message}\nLLM Response:"
     return prompt
-def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
     temperature = float(temperature)
     if temperature < 1e-2:
         temperature = 1e-2
@@ -219,7 +50,7 @@ def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, r
     generate_kwargs = dict(
         temperature=temperature,
-        max_new_tokens=2048,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
         do_sample=True,
@@ -238,253 +69,83 @@ def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, r
             playlist_message = f"Playing {mood.capitalize()} playlist for you!"
             return playlist_message
     return output
-def generate_llm_output(
-        prompt,
-        history,
-        llm,
-        temperature=0.8,
-        max_tokens=256,
-        top_p=0.95,
-        stop_words=["<s>","[/INST]", "</s>"]
-    ):
-        temperature = float(temperature)
-        if temperature < 1e-2:
-            temperature = 1e-2
-        top_p = float(top_p)
-        generate_kwargs = dict(
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=top_p,
-            stop=stop_words
-        )
-        formatted_prompt = format_prompt(prompt, history)
-        try:
-            print("LLM Input:", formatted_prompt)
-            # Local GGUF
-            stream = llm(
-                formatted_prompt,
-                **generate_kwargs,
-                stream=True,
-            )
-            output = ""
-            for response in stream:
-                character= response["choices"][0]["text"]
-                if character in stop_words:
-                    # end of context
-                    return
-                if emoji.is_emoji(character):
-                    # Bad emoji not a meaning messes chat from next lines
-                    return
-                output += response["choices"][0]["text"]
-                yield output
-        except Exception as e:
-            print("Unhandled Exception: ", str(e))
-            gr.Warning("Unfortunately Mistral is unable to process")
-            output = "I do not know what happened but I could not understand you ."
-        return output
-def get_sentence(history, client):
-    history = [["", None]] if history is None else history
-    history[-1][1] = ""
-    sentence_list = []
-    sentence_hash_list = []
-    text_to_generate = ""
-    stored_sentence = None
-    stored_sentence_hash = None
-    for character in generate_llm_output(history[-1][0], history[:-1], client):
-        history[-1][1] = character.replace("<|assistant|>","")
-        # It is coming word by word
-        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
-        if len(text_to_generate) > 1:
-            dif = len(text_to_generate) - len(sentence_list)
-            if dif == 1 and len(sentence_list) != 0:
-                continue
-            if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
-                continue
-            # All this complexity due to trying append first short sentence to next one for proper language auto-detect
-            if stored_sentence is not None and stored_sentence_hash is None and dif>1:
-                #means we consumed stored sentence and should look at next sentence to generate
-                sentence = text_to_generate[len(sentence_list)+1]
-            elif stored_sentence is not None and len(text_to_generate)>2 and stored_sentence_hash is not None:
-                print("Appending stored")
-                sentence = stored_sentence + text_to_generate[len(sentence_list)+1]
-                stored_sentence_hash = None
-            else:
-                sentence = text_to_generate[len(sentence_list)]
-            # too short sentence just append to next one if there is any
-            # this is for proper language detection
-            if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None:
-                if sentence[-1] in [".","!","?"]:
-                    if stored_sentence_hash != hash(sentence):
-                        stored_sentence = sentence
-                        stored_sentence_hash = hash(sentence)
-                        print("Storing:",stored_sentence)
-                        continue
-            sentence_hash = hash(sentence)
-            if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
-                continue
-            if sentence_hash not in sentence_hash_list:
-                sentence_hash_list.append(sentence_hash)
-                sentence_list.append(sentence)
-                print("New Sentence: ", sentence)
-                yield (sentence, history)
-    # return that final sentence token
     try:
-        last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1]
-        sentence_hash = hash(last_sentence)
-        if sentence_hash not in sentence_hash_list:
-            if stored_sentence is not None and stored_sentence_hash is not None:
-                last_sentence = stored_sentence + last_sentence
-                stored_sentence = stored_sentence_hash = None
-                print("Last Sentence with stored:",last_sentence)
-            sentence_hash_list.append(sentence_hash)
-            sentence_list.append(last_sentence)
-            print("Last Sentence: ", last_sentence)
-            yield (last_sentence, history)
-    except:
-        print("ERROR on last sentence history is :", history)
-def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
-    """Creates a WAV header for the audio chunk."""
-    wav_buf = io.BytesIO()
-    with wave.open(wav_buf, "wb") as vfout:
-        vfout.setnchannels(channels)
-        vfout.setsampwidth(sample_width)
-        vfout.setframerate(sample_rate)
-        vfout.writeframes(frame_input)
-    wav_buf.seek(0)
-    return wav_buf.read()
-async def process_speech(speech_file_path):
-    """Processes speech input to text and then calls generate."""
-    text = speech_to_text(speech_file_path)
-    reply = generate(text, history="")
-    communicate = edge_tts.Communicate(reply)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-        tmp_path = tmp_file.name
-        await communicate.save(tmp_path)
-    return tmp_path
-async def handle_speech_generation(speech_file_path, chatbot_history, chatbot_voice):
-    if speech_file_path != "":
-        speech_path = await process_speech(speech_file_path)
-        return speech_file_path, chatbot_history, speech_path
-    return "", chatbot_history, None
-async def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
-    yield "", chatbot_history, wave_header_chunk()
-    if initial_greeting:
-        for _, sentence in chatbot_history:
-            result = await handle_speech_generation(sentence, chatbot_history, chatbot_voice)
-            if result:
-                yield result
-    else:
-        for sentence, chatbot_history in get_sentence(chatbot_history, client):
-            result = await handle_speech_generation(sentence, chatbot_history, chatbot_voice)
-            if result:
-                yield result
-def wrap_async_generator(coro, *args):
-    async def run_async_gen():
-        results = []
-        async for item in coro(*args):
-            results.append(item)
-        return results
-    return asyncio.run(run_async_gen())
-# Gradio interface setup
-with gr.Blocks(css="style.css") as demo:
-    gr.Markdown(DESCRIPTION)
-    chatbot = gr.Chatbot(
-        # value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")],  # Initial greeting from the chatbot
-        elem_id="chatbot",
-        avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
-        bubble_full_width=False,
-    )
-    VOICES = ["female", "male"]
-    with gr.Row():
-        chatbot_voice = gr.Dropdown(
-            label="Voice of the Chatbot",
-            info="How should Chatbot talk like",
-            choices=VOICES,
-            multiselect=False,
-            value=VOICES[0],
-        )
     with gr.Row():
-        txt_box = gr.Textbox(
-            scale=3,
-            show_label=False,
-            placeholder="Enter text and press enter, or speak to your microphone",
-            container=False,
-            interactive=True,
-        )
-        audio_record = gr.Audio(sources="microphone", type="filepath", scale=4)
-    with gr.Row():
-        sentence = gr.Textbox(visible=False)
-        audio_playback = gr.Audio(
-            value=None,
-            label="Generated audio response",
-            streaming=True,
-            autoplay=True,
-            interactive=False,
-            show_label=True,
-        )
-    def add_text(chatbot_history, text):
-        chatbot_history = [] if chatbot_history is None else chatbot_history
-        chatbot_history = chatbot_history + [(text, None)]
-        return chatbot_history, gr.update(value="", interactive=False)
-    async def add_audio(chatbot_history, audio_path):
-        chatbot_history = [] if chatbot_history is None else chatbot_history
-        response = speech_to_text(audio_path)
-        text = response.strip()
-        chatbot_history = chatbot_history + [(text, None)]
-        return chatbot_history, gr.update(value="", interactive=False)
-    txt_msg = txt_box.submit(fn=add_text, inputs=[chatbot, txt_box], outputs=[chatbot, txt_box], queue=False
-                             ).then(lambda *args: wrap_async_generator(generate_speech, *args), inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
-    txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)
-    audio_msg = audio_record.stop_recording(fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, txt_box], queue=False
-                                            ).then(lambda *args: wrap_async_generator(generate_speech, *args), inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
-    audio_msg.then(fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)), inputs=None, outputs=[txt_box, audio_record], queue=False)
-    FOOTNOTE = """
-            This Space demonstrates how to speak to an llm chatbot, based solely on open accessible models.
-            It relies on the following models :
-            - Speech to Text Model: [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) an ASR model, to transcribe recorded audio to text.
-            - Large Language Model: [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) a LLM to generate the chatbot responses.
-            - Text to Speech Model: [edge-tts](https://pypi.org/project/edge-tts/) a TTS model, to generate the voice of the chatbot.
-            Note:
-            - Responses generated by chat model should not be assumed correct or taken serious, as this is a demonstration example only
-            - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
-    gr.Markdown(FOOTNOTE)
-    demo.load(lambda *args: wrap_async_generator(generate_speech, *args), inputs=[chatbot, chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
-demo.queue().launch(debug=True, share=True)

 import os
+import gradio as gr
+import whisper
+from gtts import gTTS
 import io
+from huggingface_hub import InferenceClient
+# Initialize the Hugging Face Inference Client
 client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
+# Load the Whisper model
+model = whisper.load_model("base")
 def format_prompt(message, history):
     fixed_prompt = """
+    You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user's mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".
+    Note: Do not write anything else other than the classified mood if classified.
+    Note: If any question or any user text cannot be classified, follow up with a question to know the user's mood until you classify the mood.
+    Note: Mood should be classified only from any of these 4 classes {Happy, Sad, Instrumental, Party}, if not any of these 4 then continue with a follow-up question until you classify the mood.
+    Note: if user asks something like i need a coffee then do not classify the mood directly and ask more follow-up questions as asked in examples.
+    [Examples omitted for brevity]
+    """
+    prompt = f"<s>{fixed_prompt}"
     for user_prompt, bot_response in history:
+        prompt += f"\n User:{user_prompt}\n LLM Response:{bot_response}"
     prompt += f"\nUser: {message}\nLLM Response:"
     return prompt
def classify_mood(input_string):
    """Find a supported mood keyword inside *input_string*.

    The check is a case-insensitive substring test, so the keyword may be
    embedded in a longer sentence or word.

    Args:
        input_string: Text to inspect. ``None`` or an empty string is treated
            as containing no mood.

    Returns:
        ``(mood, True)`` where ``mood`` is one of ``"happy"``, ``"sad"``,
        ``"instrumental"`` or ``"party"`` when a keyword is present, otherwise
        ``(None, False)``. If several keywords occur, the one listed first in
        that order wins.
    """
    if not input_string:
        return None, False
    text = input_string.lower()
    # A tuple rather than a set: iterating a set of strings follows hash
    # order, which can differ between interpreter runs, so an input naming
    # two moods could be classified differently from one run to the next.
    mood_words = ("happy", "sad", "instrumental", "party")
    for word in mood_words:
        if word in text:
            return word, True
    return None, False
+def generate(
+    prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0,
+):
     temperature = float(temperature)
     if temperature < 1e-2:
         temperature = 1e-2
     generate_kwargs = dict(
         temperature=temperature,
+        max_new_tokens=max_new_tokens,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
         do_sample=True,
             playlist_message = f"Playing {mood.capitalize()} playlist for you!"
             return playlist_message
     return output
def process_audio(audio_file):
    """Transcribe a recording, get the recommender's reply, and voice it.

    Args:
        audio_file: Path of the recorded or uploaded audio (the Gradio audio
            input is configured with ``type="filepath"``). ``None`` or an
            empty value means nothing was provided.

    Returns:
        A 3-tuple ``(transcription, response, response_audio_path)``. On any
        failure the tuple is ``(error message, "", None)`` so the three UI
        outputs can always be filled.
    """
    import tempfile  # function-scope import keeps this block self-contained

    if not audio_file:
        # Submit was pressed without a recording: report it directly instead
        # of surfacing whatever error the transcriber raises for a missing path.
        return "An error occurred: no audio was provided", "", None

    try:
        # Transcribe the audio using Whisper
        text = model.transcribe(audio_file)["text"]
        # Generate a response using the existing generate function
        response = generate(text, [])
        # Convert the response text to speech, streaming it straight into a
        # uniquely named file: a single fixed filename would be overwritten
        # whenever two requests are processed around the same time.
        mp3_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        try:
            with mp3_file:
                gTTS(response).write_to_fp(mp3_file)
        except Exception:
            # Do not leave an empty or partial file behind when synthesis fails.
            os.remove(mp3_file.name)
            raise
        return text, response, mp3_file.name
    except Exception as e:
        # UI boundary: turn any failure into a message shown in the
        # transcription box rather than an unhandled error.
        return f"An error occurred: {e}", "", None
# Create the Gradio interface with customized UI.
# Layout: one row with two columns -- audio input and Submit button on the
# left; transcription, recommendation text and spoken reply on the right.
with gr.Blocks(css="""
    .gradio-container {
        font-family: Arial, sans-serif;
        background-color: #f0f4c3;
        border-radius: 10px;
        padding: 20px;
        box-shadow: 0 4px 12px rgba(0,0,0,0.2);
        text-align: center;
    }
    .gradio-input, .gradio-output {
        border-radius: 6px;
        border: 1px solid #ddd;
        padding: 10px;
    }
    .gradio-button {
        background-color: #ff7043;
        color: white;
        border-radius: 6px;
        border: none;
        padding: 10px 20px;
        font-size: 16px;
        cursor: pointer;
    }
    .gradio-button:hover {
        background-color: #e64a19;
    }
    .gradio-title {
        font-size: 28px;
        font-weight: bold;
        margin-bottom: 20px;
        color: #37474f;
    }
    .gradio-description {
        font-size: 16px;
        margin-bottom: 20px;
        color: #616161;
    }
""") as demo:
    gr.Markdown("# Voice-Enabled Mood-Based Music Recommender")
    gr.Markdown("Upload an audio file or use the microphone to interact with the mood-based music recommender. The system will transcribe your audio, analyze your mood, and provide a spoken recommendation.")
    with gr.Row():
        with gr.Column():
            # Both input methods are enabled so the widget matches its label
            # and the description above.
            # NOTE(review): `sources` (plural, a list) is assumed to be the
            # keyword accepted by the installed Gradio release -- confirm
            # against the Space's pinned Gradio version.
            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload Audio or Use Microphone")
            submit_button = gr.Button("Submit")
        with gr.Column():
            transcription = gr.Textbox(label="Transcription", placeholder="Your speech will be transcribed here", lines=3)
            response_text = gr.Textbox(label="Recommendation", placeholder="The mood-based recommendation will appear here", lines=3)
            response_audio = gr.Audio(label="Audio Response", type="filepath")
    # process_audio returns (transcription, response, audio path), which fill
    # the three output components in this order.
    submit_button.click(fn=process_audio, inputs=audio_input, outputs=[transcription, response_text, response_audio])
demo.launch(share=True)