Spaces:

EarthSpeciesProject
/

NatureLM-Audio

Running on Zero

App Files Community

Updating app.py to be up to date with UI changes

by dianekim - opened Aug 22, 2025

base: refs/heads/main

←

from: refs/pr/4

Discussion Files changed

+300

-161

Files changed (1) hide show

app.py +300 -161

app.py CHANGED Viewed

@@ -287,7 +287,7 @@ def main(
     vireo_audio = assets_dir / "yell-YELLWarblingVireoMammoth20150614T29ms.mp3"
     examples = {
-        "Caption the audio (Lazuli Bunting)": [
             str(laz_audio),
             "What is the common name for the focal species in the audio?",
         ],
@@ -299,17 +299,37 @@ def main(
             str(robin_audio),
             "Caption the audio, using the scientific name for any animal species.",
         ],
-        "Caption the audio (Warbling Vireo)": [str(vireo_audio), "Caption the audio."],
     }
     with gr.Blocks(
-        title="NatureLM-audio",
         theme=gr.themes.Base(
             primary_hue="blue", font=[gr.themes.GoogleFont("Noto Sans")]
         ),
     ) as app:
         header = gr.HTML("""
-        <div style="display: flex; align-items: center; gap: 12px;"><h2 style="margin: 0;">NatureLM-audio<span style="font-size: 0.55em; color: #28a745; background: #e6f4ea; padding: 2px 6px; border-radius: 4px; margin-left: 8px; display: inline-block; vertical-align: top;">BETA</span></h2></div>
         """)
@@ -322,50 +342,25 @@ def main(
                 #     label="Model Status",
                 #     interactive=False,
                 #     visible=True,
-                # )
                 with gr.Column(visible=True) as onboarding_message:
                     gr.HTML(
                         """
-                    <div style="
-                        background: transparent;
-                        border: 1px solid #e5e7eb;
-                        border-radius: 8px;
-                        padding: 16px 20px;
-                        display: flex;
-                        align-items: center;
-                        justify-content: space-between;
-                        margin-bottom: 16px;
-                        margin-left: 0;
-                        margin-right: 0;
-                        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
-                    ">
                         <div style="display: flex; padding: 0px; align-items: center; flex: 1;">
                             <div style="font-size: 20px; margin-right: 12px;">👋</div>
                             <div style="flex: 1;">
                                 <div style="font-size: 16px; font-weight: 600; color: #374151; margin-bottom: 4px;">Welcome to NatureLM-audio!</div>
-                                <div style="font-size: 14px; color: #6b7280; line-height: 1.4;">Upload your first audio file below or try a sample from our library.</div>
                             </div>
                         </div>
-                        <a href="https://www.earthspecies.org/blog" target="_blank" style="
-                            padding: 6px 12px;
-                            border-radius: 6px;
-                            font-size: 13px;
-                            font-weight: 500;
-                            cursor: pointer;
-                            border: none;
-                            background: #3b82f6;
-                            color: white;
-                            text-decoration: none;
-                            display: inline-block;
-                            transition: background 0.2s ease;
-                        "
-                        onmouseover="this.style.background='#2563eb';"
-                        onmouseout="this.style.background='#3b82f6';"
-                        >View Tutorial</a>
                     </div>
                     """,
                         padding=False,
-                    )
                 with gr.Column(visible=True) as upload_section:
                     audio_input = gr.Audio(
                         type="filepath",
@@ -373,17 +368,17 @@ def main(
                         interactive=True,
                         sources=["upload"],
                     )
-                with gr.Group(visible=False) as chat:
                     plotter = gr.Plot(
                         get_spectrogram(torch.zeros(1, SAMPLE_RATE)),
                         label="Spectrogram",
                         visible=False,
                         elem_id="spectrogram-plot",
                     )
                     task_dropdown = gr.Dropdown(
                         [
-                            "What are the common names for the species in the audio, if any?",
-                            "Caption the audio.",
                             "Caption the audio, using the scientific name for any animal species.",
                             "Caption the audio, using the common name for any animal species.",
                             "What is the scientific name for the focal species in the audio?",
@@ -394,13 +389,15 @@ def main(
                             "What call types are heard from the focal species in the audio?",
                             "What is the life stage of the focal species in the audio?",
                         ],
-                        label="Pre-configured Tasks",
-                        allow_custom_value=True,
-                        info="Select a task or enter a custom query below",
-                        value=None,
-                    )
                     chatbot = gr.Chatbot(
                         elem_id="chatbot",
                         type="messages",
                         label="Chat",
                         render_markdown=False,
@@ -413,130 +410,272 @@ def main(
                             "other",
                         ],
                         resizeable=True,
                     )
-                    gr.Markdown("### Your Query")
-                    def validate_and_submit(chatbot_history, chat_input):
-                        if not chat_input or not chat_input.strip():
-                            gr.Warning("Please enter a query before sending.")
-                            return chatbot_history, chat_input
-                        updated_history = add_user_query(chatbot_history, chat_input)
-                        return updated_history, ""
-                    def update_current_audio(audio_input):
-                        # if this audio_input is the same as the CURRENT_AUDIO, set it None
-                        # else update CURRENT_AUDIO
-                        global CURRENT_AUDIO
-                        if audio_input != CURRENT_AUDIO:
-                            CURRENT_AUDIO = audio_input
-                    chat_input = gr.Textbox(
-                        placeholder="Enter a query and press Shift+Enter to send",
-                        type="text",
-                        label="Custom query",
-                        lines=2,
-                        show_label=True,
-                        container=False,
-                        submit_btn="Send",
-                        elem_id="chat-input",
-                    )
-                    # if task_dropdown is selected, set chat_input to that value
-                    def set_query(task):
-                        if task:
-                            return gr.update(value=task)
-                        return gr.update(value="")
-                    task_dropdown.select(
-                        fn=set_query,
-                        inputs=[task_dropdown],
-                        outputs=[chat_input],
-                    )
-                    clear_button = gr.ClearButton(
-                        components=[chatbot, chat_input, audio_input, plotter],
-                        visible=False,
-                    )
-                    def start_chat_interface(audio_path):
-                        return (
-                            gr.update(visible=False),  # hide onboarding message
-                            gr.update(visible=True),  # show upload section
-                            gr.update(visible=True),  # show chat box
-                            gr.update(visible=True),  # show plotter
-                        )
-                    # When audio added, set spectrogram
-                    audio_input.change(
-                        fn=start_chat_interface,
-                        inputs=[audio_input],
-                        outputs=[onboarding_message, upload_section, chat, plotter],
-                    ).then(
-                        fn=update_current_audio,
-                        inputs=[audio_input],
-                        outputs=[],
-                    ).then(
-                        fn=make_spectrogram_figure,
-                        inputs=[audio_input],
-                        outputs=[plotter],
-                    )
-                    # When submit clicked first:
-                    # 1. Validate and add user query to chat history
-                    # 2. Get response from model
-                    # 3. Clear the chat input box
-                    # 4. Show clear button
-                    chat_input.submit(
-                        validate_and_submit,
-                        inputs=[chatbot, chat_input],
-                        outputs=[chatbot, chat_input],
-                    ).then(
-                        get_response,
-                        inputs=[chatbot, audio_input],
-                        outputs=[chatbot],
-                    ).then(
-                        lambda: gr.update(visible=True),  # Show clear button
-                        None,
-                        [clear_button],
                     )
-                    clear_button.click(
-                        lambda: gr.ClearButton(visible=False), None, [clear_button]
-                    )
-            with gr.Tab("Sample Library"):
-                gr.Markdown("## Sample Library\n\nExplore example audio files below.")
-                gr.Examples(
-                    list(examples.values()),
-                    [audio_input, chat_input],
-                    [audio_input, chat_input],
-                    example_labels=list(examples.keys()),
-                    examples_per_page=20,
                 )
-            with gr.Tab("💡 Help"):
-                gr.Markdown("## User Guide")  # to fill out
-                gr.Markdown("## Share Feedback")  # to fill out
-                gr.Markdown("## FAQs")  # to fill out
-            app.css = """
-            .welcome-banner {
-                background: transparent !important;
-                border: 1px solid #e5e7eb !important;
-                border-radius: 8px !important;
-                padding: 16px 20px !important;
-                margin-bottom: 16px !important;
-                box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
-            }
-            .welcome-banner > div {
-                background: transparent !important;
-            }
-            .welcome-banner button {
-                margin: 0 4px !important;
-            }
             """
     return app

     vireo_audio = assets_dir / "yell-YELLWarblingVireoMammoth20150614T29ms.mp3"
     examples = {
+        "Identifying Focal Species (Lazuli Bunting)": [
             str(laz_audio),
             "What is the common name for the focal species in the audio?",
         ],
             str(robin_audio),
             "Caption the audio, using the scientific name for any animal species.",
         ],
+        "Caption the audio (Warbling Vireo)": [
+            str(vireo_audio),
+            "Caption the audio."
+        ],
+        "Speaker Count (Lazuli Bunting)": [
+            str(laz_audio),
+            "How many individuals are vocalizing in this audio?",
+        ],
+        "Caption the audio (Green Tree Frog)": [
+            str(frog_audio),
+            "Caption the audio, using the common name for any animal species.",
+        ],
+        "Caption the audio (American Robin)": [
+            str(robin_audio),
+            "Caption the audio, using the scientific name for any animal species.",
+        ],
+        "Caption the audio (Warbling Vireo)": [
+            str(vireo_audio),
+            "Caption the audio."
+        ],
     }
     with gr.Blocks(
+        title="NatureLM-audio",
         theme=gr.themes.Base(
             primary_hue="blue", font=[gr.themes.GoogleFont("Noto Sans")]
         ),
+        css="styles.css",
     ) as app:
         header = gr.HTML("""
+        <div style="display: flex; align-items: center; gap: 12px;"><img src="https://huggingface.co/spaces/EarthSpeciesProject/NatureLM-Audio/resolve/main/assets/esp_logo.png" style="height: 40px; width: auto;"><h2 style="margin: 0;">NatureLM-audio<span style="font-size: 0.55em; color: #28a745; background: #e6f4ea; padding: 2px 6px; border-radius: 4px; margin-left: 8px; display: inline-block; vertical-align: top;">BETA</span></h2></div>
         """)
                 #     label="Model Status",
                 #     interactive=False,
                 #     visible=True,
+                # )
                 with gr.Column(visible=True) as onboarding_message:
                     gr.HTML(
                         """
+                    <div class="banner">
                         <div style="display: flex; padding: 0px; align-items: center; flex: 1;">
                             <div style="font-size: 20px; margin-right: 12px;">👋</div>
                             <div style="flex: 1;">
                                 <div style="font-size: 16px; font-weight: 600; color: #374151; margin-bottom: 4px;">Welcome to NatureLM-audio!</div>
+                                <div style="font-size: 14px; color: #6b7280; line-height: 1.4;">Upload your first audio file or select a pre-loaded example below.</div>
                             </div>
                         </div>
+                        <a href="https://www.earthspecies.org/blog" target="_blank" class="link-btn">View Tutorial</a>
                     </div>
                     """,
                         padding=False,
+                    )
                 with gr.Column(visible=True) as upload_section:
                     audio_input = gr.Audio(
                         type="filepath",
                         interactive=True,
                         sources=["upload"],
                     )
+                with gr.Accordion(label="Toggle Spectrogram", open=False, visible=False) as spectrogram:
                     plotter = gr.Plot(
                         get_spectrogram(torch.zeros(1, SAMPLE_RATE)),
                         label="Spectrogram",
                         visible=False,
                         elem_id="spectrogram-plot",
                     )
+                with gr.Column(visible=False) as tasks:
                     task_dropdown = gr.Dropdown(
                         [
+                            "What are the common names for the species in the audio, if any?",
                             "Caption the audio, using the scientific name for any animal species.",
                             "Caption the audio, using the common name for any animal species.",
                             "What is the scientific name for the focal species in the audio?",
                             "What call types are heard from the focal species in the audio?",
                             "What is the life stage of the focal species in the audio?",
                         ],
+                        label="Pre-Loaded Tasks",
+                        info="Select a task, or write your own prompt below.",
+                        allow_custom_value=False,
+                        value=None,
+                    )
+                with gr.Group(visible=False) as chat:
                     chatbot = gr.Chatbot(
                         elem_id="chatbot",
+                        height=250,
                         type="messages",
                         label="Chat",
                         render_markdown=False,
                             "other",
                         ],
                         resizeable=True,
+                    )
+                    with gr.Column() as text:
+                        chat_input = gr.Textbox(
+                            placeholder="Type your message and press Enter to send",
+                            type="text",
+                            lines=1,
+                            show_label=False,
+                            submit_btn="Send",
+                            container=False,
+                            autofocus=True,
+                            elem_id="chat-input",
+                        )
+                with gr.Column() as examples_section:
+                    gr.Examples(
+                        list(examples.values()),
+                        [audio_input, chat_input],
+                        [audio_input, chat_input],
+                        example_labels=list(examples.keys()),
+                        examples_per_page=20,
                     )
+                def validate_and_submit(chatbot_history, chat_input):
+                    if not chat_input or not chat_input.strip():
+                        gr.Warning("Please enter a question or message before sending.")
+                        return chatbot_history, chat_input
+                    updated_history = add_user_query(chatbot_history, chat_input)
+                    return updated_history, ""
+                def update_current_audio(audio_input):
+                    global CURRENT_AUDIO
+                    if audio_input != CURRENT_AUDIO:
+                        CURRENT_AUDIO = audio_input
+                clear_button = gr.ClearButton(
+                    components=[chatbot, chat_input, audio_input, plotter],
+                    visible=False,
+                )
+                # if task_dropdown is selected, set chat_input to that value
+                def set_query(task):
+                    if task:
+                        return gr.update(value=task)
+                    return gr.update(value="")
+                task_dropdown.select(
+                    fn=set_query,
+                    inputs=[task_dropdown],
+                    outputs=[chat_input],
+                )
+                def start_chat_interface(audio_path):
+                    return (
+                        gr.update(visible=False), # hide onboarding message
+                        gr.update(visible=True),  # show upload section
+                        gr.update(visible=True),  # show spectrogram
+                        gr.update(visible=True),  # show tasks
+                        gr.update(visible=True),  # show chat box
+                        gr.update(visible=True),  # show plotter
                     )
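+                # When the audio input changes: hide the onboarding message and show the
+                # spectrogram, task, and chat sections; update CURRENT_AUDIO; then set
+                # the spectrogram plot.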
+                audio_input.change(
+                    fn=start_chat_interface,
+                    inputs=[audio_input],
+                    outputs=[onboarding_message, upload_section, spectrogram, tasks, chat, plotter],
+                ).then(
+                    fn=update_current_audio,
+                    inputs=[audio_input],
+                    outputs=[],
+                ).then(
+                    fn=make_spectrogram_figure,
+                    inputs=[audio_input],
+                    outputs=[plotter],
+                )
+                # When a query is submitted:
+                # 1. Validate it (an empty query only raises a warning); otherwise add it
+                #    to the chat history and clear the chat input box
+                # 2. Get the response from the model
+                # 3. Show the clear button
+                chat_input.submit(
+                    validate_and_submit,
+                    inputs=[chatbot, chat_input],
+                    outputs=[chatbot, chat_input],
+                ).then(
+                    get_response,
+                    inputs=[chatbot, audio_input],
+                    outputs=[chatbot],
+                ).then(
+                    lambda: gr.update(visible=True),  # Show clear button
+                    None,
+                    [clear_button],
+                )
+                clear_button.click(
+                    lambda: gr.ClearButton(visible=False), None, [clear_button]
                 )
+            with gr.Tab("Sample Library"):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("### Download Sample Audio")
+                        gr.Markdown(
+                            """Feel free to explore these sample audio files. To download, click the button in the top-right corner of each audio file, or click **Download All**. You can also find large collections of publicly available animal sounds on
+                            [xeno-canto](https://xeno-canto.org/explore/taxonomy) and the [Watkins Marine Mammal Sound Database](https://whoicf2.whoi.edu/science/B/whalesounds/index.cfm).""")
+                        samples=[
+                            ("assets/Lazuli_Bunting_yell-YELLLAZB20160625SM303143.m4a", "Lazuli Bunting"),
+                            ("assets/nri-GreenTreeFrogEvergladesNP.mp3", "Green Tree Frog"),
+                            ("assets/Eastern Gray Squirrel - Sciurus carolinensis.wav", "Eastern Gray Squirrel"),
+                            ("assets/Gray Wolf - Canis lupus italicus.m4a", "Gray Wolf"),
+                            ("assets/Humpback Whale - Megaptera novaeangliae.wav", "Humpback Whale"),
+                            ("assets/Walrus - Odobenus rosmarus.wav", "Walrus"),
+                        ]
+                        for row_i in range(0, len(samples), 3):
+                            with gr.Row():
+                                for filepath, label in samples[row_i:row_i+3]:
+                                    with gr.Column():
+                                        gr.Audio(
+                                            filepath,
+                                            label=label,
+                                            type="filepath",
+                                            show_download_button=True
+                                        )
+                        with gr.Row():
+                            gr.HTML("""<center>
+                                <a href="assets/Sample Audio Files NatureLM_audio.zip" download class="download-btn">Download All</a></center>
+                            """
+                            )
+            with gr.Tab("💡 Help"):
+                gr.HTML("""
+                        <div class="guide-section">
+                            <h3>Getting Started</h3>
+                            <ol style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
+                                <li style="margin-bottom: 8px;"><strong>Upload your audio</strong> - Click the upload area or drag and drop your audio file containing animal vocalizations.</li>
+                                <li style="margin-bottom: 8px;"><strong>Trim your audio (if needed)</strong> - Try to keep your audio to 10 seconds or less.</li>
+                                <li style="margin-bottom: 8px;"><strong>View the Spectrogram (optional)</strong> - You can easily view/hide the spectrogram of your audio for closer analysis.</li>
+                                <li style="margin-bottom: 8px;"><strong>Select a task or write your own</strong> - Select an option from pre-loaded tasks. This will auto-fill the text box with a prompt, so all you have to do is hit Send. Or, type a custom prompt directly into the chat.</li>
+                                <li style="margin-bottom: 0;"><strong>Send and Analyze Audio</strong> - Click "Send" or press Enter to begin processing your audio. Ask follow-up questions or press "Clear" to start a new conversation.</li>
+                            </ol>
+                        <p></p>
+                        </div>
+                        <div class="guide-section">
+                            <h3>Tips & Tricks</h3>
+                                <b>Prompting Best Practices</b>
+                                <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
+                                    <li>Be specific about what you want to know (e.g., "What species made this call?" vs "Analyze this audio")</li>
+                                    <li>Mention the context if known (geographic area/location, time of day or year, habitat type)</li>
+                                    <li>[TO ADD: examples of classification prompts that do and don't work well]</li>
+                                </ul>
+                                <b>Audio Files</b>
+                                <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
+                                    <li>Supported formats: .wav, .mp3, .aac, .flac, .ogg,  .webm,  .midi, .aiff,  .wma, .opus, .amr</li>
+                                    <li>If you are uploading an .mp4, please check that it is not an MPEG-4 Movie file. </li>
+                                    <li>For best results, use high-quality recordings with minimal background noise.</li>
+                                </ul>
+                        </div>
+                            <div class="guide-section">
+                                    <h3>Learn More</h3>
+                                    <ul style="margin-top: 12px; padding-left: 20px; color: #6b7280; font-size: 14px; line-height: 1.6;">
+                                        <li>Read our <a href="https://earthspecies.org/blog" target="_blank">recent blog post</a> with a step-by-step tutorial</li>
+                                        <li>Check out the <a href="https://openreview.net/forum?id=hJVdwBpWjt" target="_blank">published paper</a> for a deeper technical dive on NatureLM-audio.</li>
+                                        <li>Visit the <a href="https://earthspecies.github.io/naturelm-audio-demo/" target="_blank">NatureLM-audio Demo Page</a> for additional context, a demo video, and more examples of the model in action.</li>
+                                        <li>Sign up for our <a href="https://forms.gle/WjrbmFhKkzmEgwvY7" target="_blank">closed beta waitlist</a>, if you’re interested in testing upcoming features like longer audio files and batch processing.</li>
+                                    </ul>
+                            </div>
+                            <div class="guide-section">
+                                    <h4>Help us improve the model!</h4>
+                                    <p>Found an issue or have suggestions? Please join us on <a href="https://earthspeciesproject.discourse.group/" target="_blank">Discourse</a> to share any feedback, questions, bug reports, or other ideas. Your input helps make NatureLM-audio better for everyone.</p>
+                            </div>
+                        """)
+            app.css = """
+                #chat-input {
+                    background: white;
+                    padding: 10px;
+                    min-height: 44px;
+                    display: flex;
+                    align-items: center;
+                }
+                #chat-input textarea {
+                    background: white;
+                    flex: 1;
+                }
+                #chat-input .submit-button {
+                    padding: 10px;
+                    margin: 2px 6px;
+                    align-self: center;
+                }
+                #spectrogram-plot {
+                    padding: 12px;
+                    margin: 12px;
+                }
+                .banner {
+                    background: transparent;
+                    border: 1px solid #e5e7eb;
+                    border-radius: 8px;
+                    padding: 16px 20px;
+                    display: flex;
+                    align-items: center;
+                    justify-content: space-between;
+                    margin-bottom: 16px;
+                    margin-left: 0;
+                    margin-right: 0;
+                    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
+                }
+                .link-btn {
+                    padding: 6px 12px;
+                    border-radius: 6px;
+                    font-size: 13px;
+                    font-weight: 500;
+                    cursor: pointer;
+                    border: none;
+                    background: #3b82f6;
+                    color: white;
+                    text-decoration: none;
+                    display: inline-block;
+                    transition: background 0.2s ease;
+                }
+                .link-btn:hover {
+                    background: #2563eb;
+                }
+                .download-btn {
+                    padding: 10px 20px;
+                    border-radius: 6px;
+                    font-size: 13px;
+                    font-weight: 500;
+                    cursor: pointer;
+                    border: none;
+                    background: #3b82f6;
+                    color: white;
+                    text-decoration: none;
+                    display: block;
+                    text-align: center;
+                    transition: background 0.2s ease;
+                    width: 200px;
+                    box-sizing: border-box;
+                }
+                .download-btn:hover {
+                    background: #2563eb;
+                }
+                .guide-section {
+                    margin-bottom: 32px;
+                    background: white;
+                    border-radius: 8px;
+                    padding: 14px;
+                    border: 1px solid #e5e7eb;
+                }
+                .guide-section h3 {
+                    color: #1f2937;
+                    margin-top: 4px;
+                    margin-bottom: 16px;
+                    border-bottom: 1px solid #e5e7eb;
+                    padding-bottom: 12px;
+                }
+                .guide-section h4 {
+                    color: #1f2937;
+                    margin-top: 4px;
+                }
             """
     return app