Spaces:

nvidia
/

audio-flamingo-next

Running on Zero

App Files Files Community

SreyanG-NVIDIA committed on 25 days ago

Commit

773b366

1 Parent(s): 2deca47

Add AudioFlamingoNext

Browse files

Files changed (4) hide show

README.md +6 -5
app.py +546 -0
packages.txt +4 -0
requirements.txt +10 -0

README.md CHANGED Viewed

@@ -1,13 +1,14 @@
 ---
 title: Audio Flamingo Next
-emoji: 📊
-colorFrom: indigo
-colorTo: gray
 sdk: gradio
-sdk_version: 6.11.0
 app_file: app.py
 pinned: false
-short_description: Audio Flamingo Next
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Audio Flamingo Next
+emoji: 🔊
+colorFrom: blue
+colorTo: green
 sdk: gradio
+sdk_version: 5.49.1
+python_version: 3.12
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,546 @@

+import shutil
+import gradio as gr
+import spaces
+import yt_dlp
+import os
+import tempfile
+import re
+import subprocess
+import socket
+import time
+import atexit
+import torch
+from transformers import AutoModel, AutoProcessor
# SOCKS proxy URL passed to yt-dlp; stays None unless the SSH tunnel below comes up.
PROXY_URL = None
# Handle of the background ssh process, kept so it can be terminated at interpreter exit.
_tunnel_proc = None
+def _write_temp_key_and_kh(key_str, kh_line):
+    key_clean = key_str.replace("\r\n", "\n").replace("\r", "\n")
+    if not key_clean.endswith("\n"):
+        key_clean += "\n"
+    keyf = tempfile.NamedTemporaryFile("w", delete=False)
+    keyf.write(key_clean)
+    keyf.flush()
+    os.chmod(keyf.name, 0o600)
+    keyf.close()
+    khf = tempfile.NamedTemporaryFile("w", delete=False)
+    khf.write(kh_line.strip() + "\n")
+    khf.flush()
+    khf.close()
+    return keyf.name, khf.name
+def _validate_private_key(path):
+    if not shutil.which("ssh-keygen"):
+        return True
+    try:
+        subprocess.check_output(["ssh-keygen", "-y", "-f", path], stderr=subprocess.STDOUT)
+        return True
+    except subprocess.CalledProcessError:
+        return False
def _ensure_local_socks_tunnel():
    """Start a background ``ssh -D`` SOCKS tunnel and publish it via ``PROXY_URL``.

    Configuration comes from the environment: ``SSH_SERVER``, ``SSH_PORT``
    (default ``"22"``), ``SSH_PRIVATE_KEY`` and ``SSH_HOSTKEY``. The function
    is a no-op when ``PROXY_URL`` is already set, when a required variable or
    the ``ssh`` binary is missing, or when the key fails validation.

    On success ``PROXY_URL`` is set to ``"socks5h://127.0.0.1:1080"``. If the
    ssh process exits or the local listener never becomes reachable,
    ``PROXY_URL`` stays ``None``. The temp key/known_hosts files are removed
    on every failure path and at interpreter exit.
    """
    global PROXY_URL, _tunnel_proc
    if PROXY_URL:
        return
    srv = os.getenv("SSH_SERVER")
    port = os.getenv("SSH_PORT", "22")
    key = os.getenv("SSH_PRIVATE_KEY")
    hk = os.getenv("SSH_HOSTKEY")
    if not (srv and key and hk and shutil.which("ssh")):
        return
    key_path, kh_path = _write_temp_key_and_kh(key, hk)

    def _remove_secrets():
        # Best-effort: the files may already be gone.
        for path in (key_path, kh_path):
            try:
                os.unlink(path)
            except OSError:
                pass

    if not _validate_private_key(key_path):
        _remove_secrets()
        return
    cmd = [
        "ssh", "-NT", "-p", port, "-i", key_path,
        "-D", "127.0.0.1:1080",
        "-o", "IdentitiesOnly=yes",
        "-o", "ExitOnForwardFailure=yes",
        "-o", "BatchMode=yes",
        "-o", "StrictHostKeyChecking=yes",
        "-o", f"UserKnownHostsFile={kh_path}",
        "-o", "GlobalKnownHostsFile=/dev/null",
        "-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=3",
        srv,
    ]
    with open("/tmp/ssh_tunnel.log", "w") as lf:
        _tunnel_proc = subprocess.Popen(
            cmd, stdin=subprocess.DEVNULL, stdout=lf, stderr=lf
        )

    def _shutdown():
        if _tunnel_proc and _tunnel_proc.poll() is None:
            _tunnel_proc.terminate()
        _remove_secrets()

    atexit.register(_shutdown)

    # Poll until the local SOCKS listener accepts connections or ssh exits.
    for _ in range(40):
        if _tunnel_proc.poll() is not None:
            _remove_secrets()
            return
        try:
            socket.create_connection(("127.0.0.1", 1080), 0.5).close()
        except OSError:
            time.sleep(0.25)
        else:
            PROXY_URL = "socks5h://127.0.0.1:1080"
            return

    # The listener never came up; PROXY_URL stays None, so a running tunnel
    # would be unused. Stop it and drop the key material now.
    _shutdown()
# Import-time side effect: try to start the optional SOCKS tunnel so PROXY_URL
# is settled before any download is attempted.
_ensure_local_socks_tunnel()
# Repository link used by the GitHub badge in HERO_BADGES.
REPO_URL = "https://github.com/afnext-umd-nvidia/afnext-umd-nvidia.github.io"
# Checkpoint passed to AutoProcessor / AutoModel.from_pretrained further down.
MODEL_ID = "nvidia/audio-flamingo-next-hf"
# Logo, headline and tagline interpolated into the hero HTML of the page.
HERO_IMAGE_URL = "https://afnext-umd-nvidia.github.io/logo.webp"
HERO_TITLE = "Audio Flamingo Next: Next-Generation Open Audio-Language Models for Speech, Sound, and Music"
HERO_SUBTITLE = "Upload audio or paste a YouTube URL and ask about speech, environmental sounds, music, timestamps, speakers, or long-form events. Audio Flamingo Next gives detailed answers."
# Static HTML fragment: author list, affiliations and contact addresses.
HERO_AUTHORS = """
<div style="margin-top: 8px; margin-bottom: 4px; padding: 8px 20px; text-align: center; max-width: 900px; margin-inline: auto;">
  <p style="font-size: 0.95rem; line-height: 1.6; margin-bottom: 10px;">
    <strong>Authors:</strong> Sreyan Ghosh<sup>1,2</sup>, Arushi Goel<sup>1</sup>, Kaousheik Jayakumar<sup>2</sup>, Lasha Koroshinadze<sup>2</sup>, Nishit Anand<sup>2</sup>, Zhifeng Kong<sup>1</sup>, Siddharth Gururani<sup>1</sup>, Sang-gil Lee<sup>1</sup>, Jaehyeon Kim<sup>1</sup>, Aya Aljafari<sup>1</sup>, Chao-Han Huck Yang<sup>1</sup>, Sungwon Kim<sup>1</sup>, Ramani Duraiswami<sup>2</sup>, Dinesh Manocha<sup>2</sup>, Mohammad Shoeybi<sup>1</sup>, Bryan Catanzaro<sup>1</sup>, Ming-Yu Liu<sup>1</sup>, Wei Ping<sup>1</sup>
  </p>
  <p style="font-size: 0.88rem; opacity: 0.75; margin-bottom: 8px;">
    <sup>1</sup>NVIDIA, CA, USA | <sup>2</sup>University of Maryland, College Park, USA
  </p>
  <p style="font-size: 0.85rem; opacity: 0.7; margin-bottom: 0;">
    <strong>Correspondence:</strong> <a href="mailto:sreyang@umd.edu" style="color: inherit; text-decoration: underline;">sreyang@umd.edu</a>, <a href="mailto:arushig@nvidia.com" style="color: inherit; text-decoration: underline;">arushig@nvidia.com</a>
  </p>
</div>
"""
# Badge row (project page, GitHub, model checkpoint). This is an f-string, but
# only REPO_URL is interpolated; the checkpoint link is written out literally.
HERO_BADGES = f"""
<div style="display: flex; justify-content: center; margin-top: 6px; align-items: center;">
  <div style="display: flex; justify-content: center; flex-wrap: wrap; gap: 8px;">
    <a href="https://afnext-umd-nvidia.github.io/"><img src="https://img.shields.io/badge/Project%20Page-AF--Next-0F766E" alt="Project Page"></a>
    <a href="{REPO_URL}"><img src='https://img.shields.io/badge/GitHub-AF--Next-0E7490' alt="GitHub"></a>
    <a href="https://huggingface.co/nvidia/audio-flamingo-next-hf">
      <img src="https://img.shields.io/badge/🤗-Model%20Checkpoint-ED5A22.svg" alt="Model Checkpoint">
    </a>
  </div>
</div>
"""
# Custom stylesheet handed to gr.Blocks(css=...). The class names defined here
# (hero, hero__logo, hero__title, hero__subtitle, tab-nav, panel-row,
# glass-card, accent-button, footer-note) are the ones the layout code attaches
# through elem_classes or inline HTML.
APP_CSS = """
:root {
  --font-sans: ui-sans-serif, system-ui, sans-serif,
               "Apple Color Emoji", "Segoe UI Emoji",
               "Segoe UI Symbol", "Noto Color Emoji";
  --font-mono: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas,
               "Liberation Mono", "Courier New", monospace;
  --app-font: var(--font-sans);
}
body {
  font-family: var(--app-font);
}
.gradio-container {
  font-family: var(--app-font);
  max-width: 80rem !important;   /* Tailwind max-w-7xl (1280px) */
  width: 100%;
  margin-inline: auto;           /* mx-auto */
  padding-inline: 1rem;          /* px-4 */
  padding-bottom: 64px;
}
.hero {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 12px;
  padding: 24px 24px 32px;
  text-align: center;
}
.hero__logo {
  width: 112px;
  height: 112px;
  border-radius: 50%;
  box-shadow: 0 12px 40px rgba(0, 0, 0, 0.15);
}
.hero__title {
  font-size: clamp(2.4rem, 5.4vw, 3.2rem);
  font-weight: 700;
  line-height: 1.5;
  letter-spacing: -0.01em;
  background: linear-gradient(120deg, #0f766e 0%, #14b8a6 45%, #22c55e 100%);
  -webkit-background-clip: text;
  background-clip: text;
  color: transparent;
}
.hero__subtitle {
  max-width: none;
  font-size: 1.08rem;
  opacity: 0.8;
}
.tab-nav {
  border-radius: 18px;
  border: 1px solid var(--border-color-primary);
  padding: 6px;
  margin: 0 18px 12px;
}
.tab-nav button {
  border-radius: 12px !important;
}
.tab-nav button[aria-selected="true"] {
  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
}
.panel-row {
  gap: 24px !important;
  align-items: stretch;
  flex-wrap: wrap;
}
.glass-card {
  border: 1px solid var(--border-color-primary);
  border-radius: 26px;
  padding: 28px;
  box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1);
  display: flex;
  flex-direction: column;
  gap: 18px;
}
/* Glass card content styling */
.glass-card .gradio-input,
.glass-card .gradio-output {
  /* Let Gradio handle default styling */
}
.glass-card label {
  font-weight: 600;
  letter-spacing: 0.01em;
}
/* Text input styling */
.glass-card textarea {
  border-radius: 18px !important;
}
.glass-card textarea:focus {
  box-shadow: 0 0 0 3px rgba(0, 123, 255, 0.25) !important;
}
/* Audio component fix */
.glass-card [data-testid="Audio"] .wrap {
  /* Let Gradio handle default styling */
}
/* YouTube embed styling */
.glass-card [data-testid="HTML"] {
  margin: 12px 0;
}
/* Load button styling */
.glass-card button[variant="secondary"] {
  border-radius: 12px !important;
  font-weight: 500 !important;
}
/* Action button styling */
.accent-button {
  background: linear-gradient(120deg, #0f766e 0%, #14b8a6 45%, #22c55e 100%) !important;
  border-radius: 14px !important;
  box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15);
  color: #ffffff !important;
  font-weight: 600 !important;
  letter-spacing: 0.01em;
  padding: 0.85rem 1.5rem !important;
  transition: transform 0.18s ease, box-shadow 0.18s ease;
}
.accent-button:hover {
  transform: translateY(-2px);
  box-shadow: 0 8px 25px rgba(0, 0, 0, 0.2);
}
.accent-button:active {
  transform: translateY(0px);
  box-shadow: 0 4px 15px rgba(0, 0, 0, 0.15);
}
.footer-note {
  text-align: center;
  opacity: 0.6;
  margin-top: 28px;
  font-size: 0.95rem;
}
"""
# Rows for gr.Examples: each entry is [YouTube URL, prompt], in the same order
# as the inputs=[youtube_url, prompt_in] list it is bound to.
EXAMPLE_YOUTUBE_PROMPTS = [
    [
        "https://youtu.be/ko70cExuzZM",
        "Describe everything audible in this clip, including speech, environmental sounds, music, pacing, and overall structure.",
    ],
    [
        "https://youtu.be/iywaBOMvYLI",
        "Write a timestamped summary of what happens throughout this recording.",
    ],
    [
        "https://youtu.be/_mTRvJ9fugM",
        "What are the main sound events in this clip, and how do they evolve over time?",
    ],
]
# Processor and model are loaded once at import time and reused by infer().
processor = AutoProcessor.from_pretrained(MODEL_ID)
# bfloat16 on CUDA, float32 on CPU.
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
DEVICE_MAP = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    device_map=DEVICE_MAP,
).eval()
# Disabled alternative loading call, kept by the author for reference:
# model = AutoModel.from_pretrained(MODEL_ID, device_map="auto").eval()
# Maps a YouTube URL to (downloaded mp3 path, video title); filled by
# download_youtube_audio().
_youtube_cache = {}
def clear_youtube_cache():
    """Clear the YouTube audio cache and delete cached files.

    Every cache entry stores a file path whose parent directory is removed
    as a whole, whether or not the file itself still exists. Removal is
    best-effort: directories that cannot be deleted are skipped silently.
    The in-memory cache is always emptied.
    """
    for file_path, _title in list(_youtube_cache.values()):
        cache_dir = os.path.dirname(file_path)
        # Guard against a bare filename, whose dirname is the empty string.
        if cache_dir:
            shutil.rmtree(cache_dir, ignore_errors=True)
    _youtube_cache.clear()
def truncate_title(title, max_length=50):
    """Truncate long titles with ellipsis to prevent UI wrapping.

    Args:
        title: Text to shorten; ``None`` is treated as an empty title.
        max_length: Maximum length of the returned string, ellipsis included.

    Returns:
        ``title`` unchanged when it fits, otherwise a string of at most
        ``max_length`` characters ending in ``"..."``. For ``max_length`` of
        3 or less only (part of) the ellipsis is returned.
    """
    title = "" if title is None else str(title)
    if len(title) <= max_length:
        return title
    if max_length <= 3:
        # No room for any title text; a negative slice here would otherwise
        # produce a result longer than the limit.
        return "..."[: max(max_length, 0)]
    return title[: max_length - 3] + "..."
# Video IDs are matched as exactly 11 characters from [A-Za-z0-9_-]. Keeping
# the class this tight means the captured ID is safe to place in a URL or an
# HTML attribute.
_YT_ID = r"([A-Za-z0-9_-]{11})"
# Optional scheme and www/m/music subdomain. The lookbehind rejects look-alike
# hosts such as "notyoutube.com".
_YT_HOST = r"(?<![\w.-])(?:https?://)?(?:(?:www|m|music)\.)?"
_YOUTUBE_ID_PATTERNS = tuple(
    re.compile(_YT_HOST + tail + _YT_ID)
    for tail in (
        # v= may appear after other query parameters.
        r"youtube\.com/watch\?(?:[^#\s]*&)?v=",
        r"youtu\.be/",
        r"youtube\.com/(?:embed|v|shorts|live)/",
        r"youtube-nocookie\.com/embed/",
    )
)


def extract_youtube_id(url):
    """Extract YouTube video ID from various YouTube URL formats.

    Recognises ``watch?v=``, ``youtu.be/``, ``/embed/``, ``/v/``, ``/shorts/``
    and ``/live/`` links, plus ``youtube-nocookie.com/embed/``.

    Args:
        url: Candidate URL; non-string or empty values yield ``None``.

    Returns:
        The 11-character video ID, or ``None`` when no pattern matches.
    """
    if not isinstance(url, str) or not url:
        return None
    for pattern in _YOUTUBE_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None
def generate_youtube_embed(url, title="YouTube Video"):
    """Generate YouTube embed HTML from URL.

    Args:
        url: YouTube link; the video ID is taken from it with
            ``extract_youtube_id``.
        title: Text for the iframe ``title`` attribute. It is HTML-escaped
            before interpolation.

    Returns:
        An HTML fragment with a responsive 16:9 iframe, or ``""`` when no
        video ID can be extracted from ``url``.
    """
    from html import escape

    video_id = extract_youtube_id(url)
    if not video_id:
        return ""
    # Escape quotes too: the value is placed inside a double-quoted attribute.
    safe_title = escape(str(title), quote=True)
    embed_html = f"""
    <div style="position: relative; width: 100%; height: 0; padding-bottom: 56.25%; border-radius: 12px; overflow: hidden; box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);">
        <iframe
            style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
            src="https://www.youtube.com/embed/{video_id}"
            title="{safe_title}"
            frameborder="0"
            allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
            referrerpolicy="strict-origin-when-cross-origin"
            allowfullscreen>
        </iframe>
    </div>
    """
    return embed_html
# Compiled once at import; the pattern is the same one the function has
# always used to accept or reject a URL.
_YOUTUBE_URL_RE = re.compile(
    r"(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/"
    r"(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})"
)


def download_youtube_audio(url, force_reload=False):
    """Download audio from YouTube URL and return the file path.

    The audio is extracted to mp3 inside a fresh temp directory and the
    result is remembered in ``_youtube_cache`` under ``url``.

    Args:
        url: YouTube link to download.
        force_reload: When true, drop any cached copy for ``url`` (deleting
            its directory) and download again.

    Returns:
        ``(file_path, status_message)``. ``file_path`` is ``None`` on any
        failure; the function does not raise, because its message is shown
        directly in the UI.
    """
    temp_dir = None
    keep_temp_dir = False
    try:
        if not url or not _YOUTUBE_URL_RE.match(url):
            return None, "❌ Invalid YouTube URL format"

        cached = _youtube_cache.get(url)
        if cached and not force_reload:
            cached_path, cached_title = cached
            if os.path.exists(cached_path):
                return cached_path, f"✅ Using cached: {truncate_title(cached_title)}"
        if cached and force_reload:
            old_dir = os.path.dirname(cached[0])
            if old_dir:
                shutil.rmtree(old_dir, ignore_errors=True)
            del _youtube_cache[url]

        temp_dir = tempfile.mkdtemp()
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": os.path.join(temp_dir, "%(title)s.%(ext)s"),
            "postprocessors": [
                {
                    "key": "FFmpegExtractAudio",
                    "preferredcodec": "mp3",
                    "preferredquality": "128",
                }
            ],
            "noplaylist": True,
        }
        if PROXY_URL:
            ydl_opts["proxy"] = PROXY_URL
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # One call both fetches the metadata and performs the download,
            # instead of resolving the video twice.
            info = ydl.extract_info(url, download=True)
        title = (info or {}).get("title") or "Unknown"

        for name in os.listdir(temp_dir):
            if name.endswith(".mp3"):
                file_path = os.path.join(temp_dir, name)
                _youtube_cache[url] = (file_path, title)
                # The cache now owns the directory.
                keep_temp_dir = True
                return file_path, f"✅ Downloaded: {truncate_title(title)}"
        return None, "❌ Failed to download audio file"
    except Exception as e:
        return None, f"❌ Download error: {str(e)}"
    finally:
        # Do not leave partial downloads behind on any failure path.
        if temp_dir is not None and not keep_temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
@spaces.GPU
def infer(audio_path, youtube_url, prompt_text):
    """Answer ``prompt_text`` about an uploaded audio file or a YouTube video.

    An uploaded file takes precedence over the YouTube URL.

    Args:
        audio_path: Path of the uploaded/recorded audio, or a falsy value.
        youtube_url: YouTube link used when no file is given; ``None`` and
            blank strings count as "not provided".
        prompt_text: Question or instruction for the model; ``None`` is sent
            as an empty prompt.

    Returns:
        A string for the output textbox: the status line followed by the
        model's answer, or a ``❌`` message when input is missing, the
        download fails, or any exception occurs.
    """
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        # API clients may send null instead of an empty string.
        url = (youtube_url or "").strip()
        final_audio_path = None
        status_message = ""
        if audio_path:
            final_audio_path = audio_path
            status_message = "✅ Using audio file"
        elif url:
            final_audio_path, status_message = download_youtube_audio(url)
            if not final_audio_path:
                return status_message
        else:
            return "❌ Please either upload an audio file or provide a YouTube URL."
        conversations = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text or ""},
                        {"type": "audio", "path": final_audio_path},
                    ],
                }
            ]
        ]
        # NOTE: If `conversations` includes audio, apply_chat_template() decodes via load_audio()
        # to MONO float32 at 16 kHz by default. We omit `sampling_rate`, so the 16k default is used.
        # Processor assumes mono 1-D audio; stereo would require code changes. No audio ⇒ no effect here.
        batch = processor.apply_chat_template(
            conversations,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        ).to(model.device)
        # Match the audio features to the model's parameter dtype.
        batch["input_features"] = batch["input_features"].to(model.dtype)
        gen_ids = model.generate(**batch, max_new_tokens=4096, repetition_penalty=1.2)
        # Drop the prompt tokens so only newly generated text is decoded.
        inp_len = batch["input_ids"].shape[1]
        new_tokens = gen_ids[:, inp_len:]
        texts = processor.batch_decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        result = texts[0] if texts else ""
        return f"{status_message}\n\n{result}"
    except Exception as e:
        return f"❌ Error: {str(e)}"
def load_youtube_audio(youtube_url):
    """Load YouTube audio into the Audio component and generate video embed.

    Args:
        youtube_url: Link typed by the user; ``None`` and blank strings are
            rejected with a prompt to enter a URL.

    Returns:
        ``(audio_path, status_message, embed_html)``. ``audio_path`` is
        ``None`` when the download failed; the embed HTML is still returned
        in that case.
    """
    url = (youtube_url or "").strip()
    if not url:
        return None, "❌ Please enter a YouTube URL", ""
    embed_html = generate_youtube_embed(url)
    # force_reload=True: an explicit "Load" click always re-downloads.
    audio_path, message = download_youtube_audio(url, force_reload=True)
    return (audio_path or None), message, embed_html
# Page layout: hero header, an input card (audio upload or YouTube URL plus
# prompt), an output card, and a footer.
with gr.Blocks(css=APP_CSS, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="emerald")) as demo:
    gr.HTML(
        f"""
        <div class="hero">
          <img src="{HERO_IMAGE_URL}" alt="Audio Flamingo Next logo" class="hero__logo" />
          <h1 class="hero__title">{HERO_TITLE}</h1>
          <p class="hero__subtitle">{HERO_SUBTITLE}</p>
          {HERO_AUTHORS}
          {HERO_BADGES}
        </div>
        """
    )
    # NOTE(review): this gr.Tabs holds a gr.Row directly, with no gr.Tab
    # child. Left as-is; confirm it renders as intended on the pinned Gradio.
    with gr.Tabs(elem_classes="tab-nav"):
        with gr.Row(elem_classes="panel-row"):
            with gr.Column(elem_classes=["glass-card"]):
                gr.Markdown("### Audio Input")
                audio_in = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Upload Audio File",
                    show_label=True,
                )
                gr.Markdown("**OR**")
                youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...", info="Paste any YouTube URL - we'll extract high-quality audio automatically")
                load_btn = gr.Button("🔄 Load Audio", variant="secondary", size="sm")
                # Hidden until the first "Load Audio" click.
                status_text = gr.Textbox(label="Status", interactive=False, visible=False)
                youtube_embed = gr.HTML(label="Video Preview", visible=False)
                prompt_in = gr.Textbox(
                    label="Prompt",
                    value="Describe everything audible in this clip, including speech, environmental sounds, music, pacing, and overall structure.",
                    placeholder="Ask a question about the audio…",
                    lines=6,
                )
                gr.Examples(
                    examples=EXAMPLE_YOUTUBE_PROMPTS,
                    inputs=[youtube_url, prompt_in],
                    label="Example Prompts",
                )
                btn = gr.Button("Generate Answer", elem_classes="accent-button")
            with gr.Column(elem_classes=["glass-card"]):
                out = gr.Textbox(
                    label="Model Response",
                    lines=25,
                    placeholder="Model answers will appear here with detailed audio understanding…",
                )
        # Step 1 clears the player and reveals the status box with a progress
        # message. Value and visibility go in one update so status_text is
        # listed only once in outputs. Step 2 downloads; step 3 shows the embed.
        load_btn.click(
            lambda: (None, gr.update(value="🔄 Loading audio...", visible=True)),
            outputs=[audio_in, status_text],
        ).then(
            fn=load_youtube_audio, inputs=[youtube_url], outputs=[audio_in, status_text, youtube_embed]
        ).then(lambda: gr.update(visible=True), outputs=[youtube_embed])
        btn.click(fn=infer, inputs=[audio_in, youtube_url, prompt_in], outputs=out)
    gr.HTML(
        """
        <div class="footer-note">
          © 2026 Audio Flamingo Next | Powered by 🤗 Transformers + Gradio
        </div>
        """
    )
if __name__ == "__main__":
    # No share=True: a hosted Space already has a public URL, and on a local
    # run it would expose the app through a public tunnel.
    demo.launch()

packages.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+ffmpeg
+libsndfile1
+git
+openssh-client

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+git+https://github.com/lashahub/transformers.git@add_AudioFlamingoNext
+accelerate
+torch
+torchaudio
+librosa
+soundfile
+yt-dlp
+gradio==5.49.1
+pysocks