SreyanG-NVIDIA committed on
Commit
773b366
·
1 Parent(s): 2deca47

Add AudioFlamingoNext

Browse files
Files changed (4) hide show
  1. README.md +6 -5
  2. app.py +546 -0
  3. packages.txt +4 -0
  4. requirements.txt +10 -0
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
  title: Audio Flamingo Next
3
- emoji: 📊
4
- colorFrom: indigo
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.11.0
 
8
  app_file: app.py
9
  pinned: false
10
- short_description: Audio Flamingo Next
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Audio Flamingo Next
3
+ emoji: 🔊
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.49.1
8
+ python_version: 3.12
9
  app_file: app.py
10
  pinned: false
11
+ license: apache-2.0
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ import gradio as gr
3
+ import spaces
4
+ import yt_dlp
5
+ import os
6
+ import tempfile
7
+ import re
8
+ import subprocess
9
+ import socket
10
+ import time
11
+ import atexit
12
+ import torch
13
+ from transformers import AutoModel, AutoProcessor
14
+
15
+ PROXY_URL = None
16
+ _tunnel_proc = None
17
+
18
+
19
def _write_temp_key_and_kh(key_str, kh_line):
    """Write an SSH private key and a known-hosts line to two temporary files.

    CRLF and bare CR line endings in ``key_str`` are normalized to LF and a
    trailing newline is appended when missing. ``kh_line`` is stripped and
    written as a single newline-terminated line.

    Both files are created with ``delete=False``, so they outlive this call;
    the caller is responsible for removing them.

    Returns:
        A ``(key_path, known_hosts_path)`` tuple of file names.
    """
    key_clean = key_str.replace("\r\n", "\n").replace("\r", "\n")
    if not key_clean.endswith("\n"):
        key_clean += "\n"

    # Context managers guarantee the handles are closed even if a write fails.
    with tempfile.NamedTemporaryFile("w", delete=False) as keyf:
        # Restrict permissions before the secret is written to disk.
        os.chmod(keyf.name, 0o600)
        keyf.write(key_clean)

    with tempfile.NamedTemporaryFile("w", delete=False) as khf:
        khf.write(kh_line.strip() + "\n")

    return keyf.name, khf.name
33
+
34
+
35
def _validate_private_key(path):
    """Return whether ``ssh-keygen`` can derive a public key from ``path``.

    When ``ssh-keygen`` is not on ``PATH`` the check is skipped and ``True``
    is returned. Any failure to run the tool (non-zero exit, timeout, or an
    OS-level error launching it) is reported as ``False`` rather than raised,
    because this runs at import time and must not crash the app.
    """
    if not shutil.which("ssh-keygen"):
        return True
    try:
        subprocess.check_output(
            ["ssh-keygen", "-y", "-f", path],
            stderr=subprocess.STDOUT,
            # Never let the tool wait on interactive input.
            stdin=subprocess.DEVNULL,
            timeout=10,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
        return False
    return True
43
+
44
+
45
def _ensure_local_socks_tunnel():
    """Start an SSH dynamic (SOCKS) forward on 127.0.0.1:1080, best effort.

    Configuration is read from the ``SSH_SERVER``, ``SSH_PORT`` (default
    ``"22"``), ``SSH_PRIVATE_KEY`` and ``SSH_HOSTKEY`` environment variables.
    If any required value or the ``ssh`` binary is missing, or the key fails
    validation, the function returns without doing anything.

    On success the module-level ``PROXY_URL`` is set to
    ``"socks5h://127.0.0.1:1080"`` and ``_tunnel_proc`` holds the ``ssh``
    process. The temporary key / known-hosts files are removed when the
    tunnel cannot be started and again at interpreter exit.
    """
    global PROXY_URL, _tunnel_proc
    if PROXY_URL:
        return
    srv = os.getenv("SSH_SERVER")
    port = os.getenv("SSH_PORT", "22")
    key = os.getenv("SSH_PRIVATE_KEY")
    hk = os.getenv("SSH_HOSTKEY")
    if not (srv and key and hk and shutil.which("ssh")):
        return
    key_path, kh_path = _write_temp_key_and_kh(key, hk)

    def _remove_key_material():
        # Best-effort removal; the files may already be gone.
        for path in (key_path, kh_path):
            try:
                os.remove(path)
            except OSError:
                pass

    if not _validate_private_key(key_path):
        _remove_key_material()
        return
    cmd = [
        "ssh", "-NT", "-p", port, "-i", key_path,
        "-D", "127.0.0.1:1080",
        "-o", "IdentitiesOnly=yes",
        "-o", "ExitOnForwardFailure=yes",
        "-o", "BatchMode=yes",
        "-o", "StrictHostKeyChecking=yes",
        "-o", f"UserKnownHostsFile={kh_path}",
        "-o", "GlobalKnownHostsFile=/dev/null",
        "-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=3",
        srv,
    ]
    with open("/tmp/ssh_tunnel.log", "w") as lf:
        _tunnel_proc = subprocess.Popen(cmd, stdout=lf, stderr=lf)
    proc = _tunnel_proc

    def _shutdown():
        if proc.poll() is None:
            proc.terminate()
        _remove_key_material()

    # Registered before polling so every exit path below is covered.
    atexit.register(_shutdown)

    # Poll up to 40 times for the local SOCKS port to accept connections.
    for _ in range(40):
        if proc.poll() is not None:
            # ssh exited (details are in /tmp/ssh_tunnel.log); no tunnel.
            _remove_key_material()
            return
        try:
            socket.create_connection(("127.0.0.1", 1080), 0.5).close()
        except OSError:
            time.sleep(0.25)
        else:
            PROXY_URL = "socks5h://127.0.0.1:1080"
            break
82
+
83
+
84
+ _ensure_local_socks_tunnel()
85
+
86
+
87
+ REPO_URL = "https://github.com/afnext-umd-nvidia/afnext-umd-nvidia.github.io"
88
+ MODEL_ID = "nvidia/audio-flamingo-next-hf"
89
+ HERO_IMAGE_URL = "https://afnext-umd-nvidia.github.io/logo.webp"
90
+ HERO_TITLE = "Audio Flamingo Next: Next-Generation Open Audio-Language Models for Speech, Sound, and Music"
91
+ HERO_SUBTITLE = "Upload audio or paste a YouTube URL and ask about speech, environmental sounds, music, timestamps, speakers, or long-form events. Audio Flamingo Next gives detailed answers."
92
+ HERO_AUTHORS = """
93
+ <div style="margin-top: 8px; margin-bottom: 4px; padding: 8px 20px; text-align: center; max-width: 900px; margin-inline: auto;">
94
+ <p style="font-size: 0.95rem; line-height: 1.6; margin-bottom: 10px;">
95
+ <strong>Authors:</strong> Sreyan Ghosh<sup>1,2</sup>, Arushi Goel<sup>1</sup>, Kaousheik Jayakumar<sup>2</sup>, Lasha Koroshinadze<sup>2</sup>, Nishit Anand<sup>2</sup>, Zhifeng Kong<sup>1</sup>, Siddharth Gururani<sup>1</sup>, Sang-gil Lee<sup>1</sup>, Jaehyeon Kim<sup>1</sup>, Aya Aljafari<sup>1</sup>, Chao-Han Huck Yang<sup>1</sup>, Sungwon Kim<sup>1</sup>, Ramani Duraiswami<sup>2</sup>, Dinesh Manocha<sup>2</sup>, Mohammad Shoeybi<sup>1</sup>, Bryan Catanzaro<sup>1</sup>, Ming-Yu Liu<sup>1</sup>, Wei Ping<sup>1</sup>
96
+ </p>
97
+ <p style="font-size: 0.88rem; opacity: 0.75; margin-bottom: 8px;">
98
+ <sup>1</sup>NVIDIA, CA, USA | <sup>2</sup>University of Maryland, College Park, USA
99
+ </p>
100
+ <p style="font-size: 0.85rem; opacity: 0.7; margin-bottom: 0;">
101
+ <strong>Correspondence:</strong> <a href="mailto:sreyang@umd.edu" style="color: inherit; text-decoration: underline;">sreyang@umd.edu</a>, <a href="mailto:arushig@nvidia.com" style="color: inherit; text-decoration: underline;">arushig@nvidia.com</a>
102
+ </p>
103
+ </div>
104
+ """
105
+ HERO_BADGES = f"""
106
+ <div style="display: flex; justify-content: center; margin-top: 6px; align-items: center;">
107
+ <div style="display: flex; justify-content: center; flex-wrap: wrap; gap: 8px;">
108
+ <a href="https://afnext-umd-nvidia.github.io/"><img src="https://img.shields.io/badge/Project%20Page-AF--Next-0F766E" alt="Project Page"></a>
109
+ <a href="{REPO_URL}"><img src='https://img.shields.io/badge/GitHub-AF--Next-0E7490' alt="GitHub"></a>
110
+ <a href="https://huggingface.co/nvidia/audio-flamingo-next-hf">
111
+ <img src="https://img.shields.io/badge/🤗-Model%20Checkpoint-ED5A22.svg" alt="Model Checkpoint">
112
+ </a>
113
+ </div>
114
+ </div>
115
+ """
116
+ APP_CSS = """
117
+ :root {
118
+ --font-sans: ui-sans-serif, system-ui, sans-serif,
119
+ "Apple Color Emoji", "Segoe UI Emoji",
120
+ "Segoe UI Symbol", "Noto Color Emoji";
121
+ --font-mono: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas,
122
+ "Liberation Mono", "Courier New", monospace;
123
+
124
+ --app-font: var(--font-sans);
125
+ }
126
+
127
+ body {
128
+ font-family: var(--app-font);
129
+ }
130
+
131
+ .gradio-container {
132
+ font-family: var(--app-font);
133
+ max-width: 80rem !important; /* Tailwind max-w-7xl (1280px) */
134
+ width: 100%;
135
+ margin-inline: auto; /* mx-auto */
136
+ padding-inline: 1rem; /* px-4 */
137
+ padding-bottom: 64px;
138
+ }
139
+
140
+ .hero {
141
+ display: flex;
142
+ flex-direction: column;
143
+ align-items: center;
144
+ gap: 12px;
145
+ padding: 24px 24px 32px;
146
+ text-align: center;
147
+ }
148
+
149
+ .hero__logo {
150
+ width: 112px;
151
+ height: 112px;
152
+ border-radius: 50%;
153
+ box-shadow: 0 12px 40px rgba(0, 0, 0, 0.15);
154
+ }
155
+
156
+ .hero__title {
157
+ font-size: clamp(2.4rem, 5.4vw, 3.2rem);
158
+ font-weight: 700;
159
+ line-height: 1.5;
160
+ letter-spacing: -0.01em;
161
+ background: linear-gradient(120deg, #0f766e 0%, #14b8a6 45%, #22c55e 100%);
162
+ -webkit-background-clip: text;
163
+ background-clip: text;
164
+ color: transparent;
165
+ }
166
+
167
+ .hero__subtitle {
168
+ max-width: none;
169
+ font-size: 1.08rem;
170
+ opacity: 0.8;
171
+ }
172
+
173
+ .tab-nav {
174
+ border-radius: 18px;
175
+ border: 1px solid var(--border-color-primary);
176
+ padding: 6px;
177
+ margin: 0 18px 12px;
178
+ }
179
+
180
+ .tab-nav button {
181
+ border-radius: 12px !important;
182
+ }
183
+
184
+ .tab-nav button[aria-selected="true"] {
185
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
186
+ }
187
+
188
+ .panel-row {
189
+ gap: 24px !important;
190
+ align-items: stretch;
191
+ flex-wrap: wrap;
192
+ }
193
+
194
+ .glass-card {
195
+ border: 1px solid var(--border-color-primary);
196
+ border-radius: 26px;
197
+ padding: 28px;
198
+ box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1);
199
+ display: flex;
200
+ flex-direction: column;
201
+ gap: 18px;
202
+ }
203
+
204
+ /* Glass card content styling */
205
+ .glass-card .gradio-input,
206
+ .glass-card .gradio-output {
207
+ /* Let Gradio handle default styling */
208
+ }
209
+
210
+ .glass-card label {
211
+ font-weight: 600;
212
+ letter-spacing: 0.01em;
213
+ }
214
+
215
+ /* Text input styling */
216
+ .glass-card textarea {
217
+ border-radius: 18px !important;
218
+ }
219
+
220
+ .glass-card textarea:focus {
221
+ box-shadow: 0 0 0 3px rgba(0, 123, 255, 0.25) !important;
222
+ }
223
+
224
+ /* Audio component fix */
225
+ .glass-card [data-testid="Audio"] .wrap {
226
+ /* Let Gradio handle default styling */
227
+ }
228
+
229
+ /* YouTube embed styling */
230
+ .glass-card [data-testid="HTML"] {
231
+ margin: 12px 0;
232
+ }
233
+
234
+ /* Load button styling */
235
+ .glass-card button[variant="secondary"] {
236
+ border-radius: 12px !important;
237
+ font-weight: 500 !important;
238
+ }
239
+
240
+ /* Action button styling */
241
+ .accent-button {
242
+ background: linear-gradient(120deg, #0f766e 0%, #14b8a6 45%, #22c55e 100%) !important;
243
+ border-radius: 14px !important;
244
+ box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15);
245
+ color: #ffffff !important;
246
+ font-weight: 600 !important;
247
+ letter-spacing: 0.01em;
248
+ padding: 0.85rem 1.5rem !important;
249
+ transition: transform 0.18s ease, box-shadow 0.18s ease;
250
+ }
251
+
252
+ .accent-button:hover {
253
+ transform: translateY(-2px);
254
+ box-shadow: 0 8px 25px rgba(0, 0, 0, 0.2);
255
+ }
256
+
257
+ .accent-button:active {
258
+ transform: translateY(0px);
259
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.15);
260
+ }
261
+
262
+ .footer-note {
263
+ text-align: center;
264
+ opacity: 0.6;
265
+ margin-top: 28px;
266
+ font-size: 0.95rem;
267
+ }
268
+ """
269
+ EXAMPLE_YOUTUBE_PROMPTS = [
270
+ [
271
+ "https://youtu.be/ko70cExuzZM",
272
+ "Describe everything audible in this clip, including speech, environmental sounds, music, pacing, and overall structure.",
273
+ ],
274
+ [
275
+ "https://youtu.be/iywaBOMvYLI",
276
+ "Write a timestamped summary of what happens throughout this recording.",
277
+ ],
278
+ [
279
+ "https://youtu.be/_mTRvJ9fugM",
280
+ "What are the main sound events in this clip, and how do they evolve over time?",
281
+ ],
282
+ ]
283
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
284
+ DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
285
+ DEVICE_MAP = "cuda" if torch.cuda.is_available() else "cpu"
286
+ model = AutoModel.from_pretrained(
287
+ MODEL_ID,
288
+ torch_dtype=DTYPE,
289
+ device_map=DEVICE_MAP,
290
+ ).eval()
291
+ # model = AutoModel.from_pretrained(MODEL_ID, device_map="auto").eval()
292
+
293
+ _youtube_cache = {}
294
+
295
+
296
def clear_youtube_cache():
    """Delete every cached download directory and empty ``_youtube_cache``.

    Each cache value is a ``(file_path, title)`` tuple; the directory that
    contains ``file_path`` is removed recursively. Removal is best effort:
    filesystem errors are ignored so one bad entry cannot block the rest.
    """
    for file_path, _title in _youtube_cache.values():
        temp_dir = os.path.dirname(file_path)
        if temp_dir and os.path.isdir(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
    _youtube_cache.clear()
308
+
309
+
310
def truncate_title(title, max_length=50):
    """Truncate long titles with an ellipsis to prevent UI wrapping.

    The result is never longer than ``max_length`` characters. When
    ``max_length`` is too small to fit the three-character ellipsis, the
    title is cut hard instead. A ``None`` title is treated as empty.
    """
    title = "" if title is None else str(title)
    if len(title) <= max_length:
        return title
    if max_length <= 3:
        # No room for "..." plus content; a negative slice bound would
        # otherwise return more text than requested.
        return title[: max(max_length, 0)]
    return title[: max_length - 3] + "..."
315
+
316
+
317
# A YouTube video ID is 11 characters from [A-Za-z0-9_-]. Restricting the
# character class keeps quotes, slashes and angle brackets out of the value,
# which matters because the ID is later interpolated into embed HTML.
_YOUTUBE_ID = r"([A-Za-z0-9_-]{11})"
_YOUTUBE_ID_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # watch URLs, with ``v`` anywhere in the query string
        r"(?<![A-Za-z0-9-])youtube\.com/watch\?(?:[^#\s]*?&)?v=" + _YOUTUBE_ID,
        # short links
        r"(?<![A-Za-z0-9-])youtu\.be/" + _YOUTUBE_ID,
        # path-style URLs
        r"(?<![A-Za-z0-9-])youtube\.com/(?:embed|v|shorts|live)/" + _YOUTUBE_ID,
        r"(?<![A-Za-z0-9-])youtube-nocookie\.com/embed/" + _YOUTUBE_ID,
    )
)


def extract_youtube_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Recognizes ``youtube.com/watch?v=``, ``youtu.be/``, ``youtube.com/embed/``,
    ``/v/``, ``/shorts/``, ``/live/`` and ``youtube-nocookie.com/embed/``
    forms, with or without a scheme or subdomain.

    Returns:
        The video ID string, or ``None`` when ``url`` is empty or no pattern
        matches.
    """
    if not url:
        return None
    for pattern in _YOUTUBE_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None
332
+
333
+
334
def generate_youtube_embed(url, title="YouTube Video"):
    """Generate responsive YouTube embed HTML for ``url``.

    Args:
        url: Any URL understood by ``extract_youtube_id``.
        title: Text for the iframe ``title`` attribute.

    Returns:
        An HTML snippet containing a 16:9 iframe, or ``""`` when no video ID
        can be extracted from ``url``.
    """
    import html  # local import: only this function needs it

    video_id = extract_youtube_id(url)
    if not video_id:
        return ""

    # Both values end up inside double-quoted HTML attributes, so escape them
    # to keep user-supplied text from breaking out of the attribute.
    safe_id = html.escape(video_id, quote=True)
    safe_title = html.escape(str(title), quote=True)

    embed_html = f"""
    <div style="position: relative; width: 100%; height: 0; padding-bottom: 56.25%; border-radius: 12px; overflow: hidden; box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);">
    <iframe
    style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
    src="https://www.youtube.com/embed/{safe_id}"
    title="{safe_title}"
    frameborder="0"
    allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
    referrerpolicy="strict-origin-when-cross-origin"
    allowfullscreen>
    </iframe>
    </div>
    """
    return embed_html
354
+
355
+
356
def download_youtube_audio(url, force_reload=False):
    """Download audio from a YouTube URL as MP3 and return its file path.

    Results are memoized in ``_youtube_cache`` keyed by the exact ``url``
    string. A cached file is reused unless ``force_reload`` is true or the
    file no longer exists on disk.

    Args:
        url: YouTube URL to fetch.
        force_reload: When true, discard any cached copy and download again.

    Returns:
        ``(file_path, status_message)`` on success, ``(None, status_message)``
        on failure. Exceptions are never raised; they are reported in the
        status message.
    """
    try:
        youtube_regex = re.compile(r"(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/" r"(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})")
        if not youtube_regex.match(url):
            return None, "❌ Invalid YouTube URL format"

        cached = _youtube_cache.get(url)
        if cached is not None:
            cached_path, cached_title = cached
            if not force_reload and os.path.exists(cached_path):
                return cached_path, f"✅ Using cached: {truncate_title(cached_title)}"
            # Forced reload or stale entry: drop the old download directory.
            shutil.rmtree(os.path.dirname(cached_path), ignore_errors=True)
            del _youtube_cache[url]

        temp_dir = tempfile.mkdtemp()
        keep_dir = False
        try:
            ydl_opts = {
                "format": "bestaudio/best",
                "outtmpl": os.path.join(temp_dir, "%(title)s.%(ext)s"),
                "postprocessors": [
                    {
                        "key": "FFmpegExtractAudio",
                        "preferredcodec": "mp3",
                        "preferredquality": "128",
                    }
                ],
                "noplaylist": True,
            }
            if PROXY_URL:
                ydl_opts["proxy"] = PROXY_URL

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                # One call fetches metadata and downloads, instead of
                # resolving the URL twice.
                info = ydl.extract_info(url, download=True)
            # The "title" key may be present but None.
            title = (info or {}).get("title") or "Unknown"

            mp3_names = [name for name in os.listdir(temp_dir) if name.endswith(".mp3")]
            if not mp3_names:
                return None, "❌ Failed to download audio file"

            file_path = os.path.join(temp_dir, mp3_names[0])
            _youtube_cache[url] = (file_path, title)
            keep_dir = True
            return file_path, f"✅ Downloaded: {truncate_title(title)}"
        finally:
            # Do not leak the temp directory when the download failed.
            if not keep_dir:
                shutil.rmtree(temp_dir, ignore_errors=True)

    except Exception as e:
        return None, f"❌ Download error: {str(e)}"
413
+
414
@spaces.GPU
def infer(audio_path, youtube_url, prompt_text):
    """Answer ``prompt_text`` about an audio clip with the loaded model.

    The audio source is ``audio_path`` when given, otherwise the audio
    downloaded from ``youtube_url``. An uploaded file takes precedence over
    the URL.

    Returns:
        A string for display: the status line followed by the model's answer,
        or a message starting with "❌" when no input was given, the download
        failed, or an exception occurred.
    """
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

        # The textbox value may be None; treat that as "no URL".
        url = (youtube_url or "").strip()

        if audio_path:
            final_audio_path = audio_path
            status_message = "✅ Using audio file"
        elif url:
            final_audio_path, status_message = download_youtube_audio(url)
            if not final_audio_path:
                return status_message
        else:
            return "❌ Please either upload an audio file or provide a YouTube URL."

        conversations = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text or ""},
                        {"type": "audio", "path": final_audio_path},
                    ],
                }
            ]
        ]

        # NOTE: If `conversations` includes audio, apply_chat_template() decodes via load_audio()
        # to MONO float32 at 16 kHz by default. We omit `sampling_rate`, so the 16k default is used.
        # Processor assumes mono 1-D audio; stereo would require code changes. No audio ⇒ no effect here.
        batch = processor.apply_chat_template(
            conversations,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        ).to(model.device)
        batch["input_features"] = batch["input_features"].to(model.dtype)

        gen_ids = model.generate(**batch, max_new_tokens=4096, repetition_penalty=1.2)
        # Drop the prompt tokens so that only newly generated text is decoded.
        inp_len = batch["input_ids"].shape[1]
        new_tokens = gen_ids[:, inp_len:]
        texts = processor.batch_decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        result = texts[0] if texts else ""
        return f"{status_message}\n\n{result}"
    except Exception as e:
        return f"❌ Error: {str(e)}"
465
+
466
+
467
def load_youtube_audio(youtube_url):
    """Load YouTube audio into the Audio component and generate video embed.

    Always forces a fresh download (``force_reload=True``).

    Returns:
        ``(audio_path_or_None, status_message, embed_html)``. For an empty or
        missing URL the embed is ``""`` and the status asks for a URL.
    """
    url = (youtube_url or "").strip()
    if not url:
        return None, "❌ Please enter a YouTube URL", ""

    embed_html = generate_youtube_embed(url)
    audio_path, message = download_youtube_audio(url, force_reload=True)

    # Normalize any falsy path to None so the Audio component is cleared.
    return (audio_path or None), message, embed_html
480
+
481
+
482
# UI layout: hero header, then a two-column row (inputs on the left, model
# response on the right), then the footer.
# NOTE(review): nesting is reconstructed from a whitespace-stripped source —
# confirm the component placement against the original file.
with gr.Blocks(css=APP_CSS, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="emerald")) as demo:
    gr.HTML(
        f"""
        <div class="hero">
        <img src="{HERO_IMAGE_URL}" alt="Audio Flamingo Next logo" class="hero__logo" />
        <h1 class="hero__title">{HERO_TITLE}</h1>
        <p class="hero__subtitle">{HERO_SUBTITLE}</p>
        {HERO_AUTHORS}
        {HERO_BADGES}
        </div>
        """
    )

    with gr.Tabs(elem_classes="tab-nav"):
        with gr.Row(elem_classes="panel-row"):
            with gr.Column(elem_classes=["glass-card"]):
                gr.Markdown("### Audio Input")
                audio_in = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Upload Audio File",
                    show_label=True,
                )
                gr.Markdown("**OR**")
                youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...", info="Paste any YouTube URL - we'll extract high-quality audio automatically")
                load_btn = gr.Button("🔄 Load Audio", variant="secondary", size="sm")
                status_text = gr.Textbox(label="Status", interactive=False, visible=False)
                youtube_embed = gr.HTML(label="Video Preview", visible=False)
                prompt_in = gr.Textbox(
                    label="Prompt",
                    value="Describe everything audible in this clip, including speech, environmental sounds, music, pacing, and overall structure.",
                    placeholder="Ask a question about the audio…",
                    lines=6,
                )

                gr.Examples(
                    examples=EXAMPLE_YOUTUBE_PROMPTS,
                    inputs=[youtube_url, prompt_in],
                    label="Example Prompts",
                )

                btn = gr.Button("Generate Answer", elem_classes="accent-button")
            with gr.Column(elem_classes=["glass-card"]):
                out = gr.Textbox(
                    label="Model Response",
                    lines=25,
                    placeholder="Model answers will appear here with detailed audio understanding…",
                )

    # Step 1 clears the audio player and reveals the status box with a
    # "loading" message (one update per component; status_text is no longer
    # listed twice). Step 2 downloads the audio. Step 3 reveals the preview.
    load_btn.click(
        lambda: (None, gr.update(value="🔄 Loading audio...", visible=True)),
        outputs=[audio_in, status_text],
    ).then(
        fn=load_youtube_audio, inputs=[youtube_url], outputs=[audio_in, status_text, youtube_embed]
    ).then(lambda: gr.update(visible=True), outputs=[youtube_embed])

    btn.click(fn=infer, inputs=[audio_in, youtube_url, prompt_in], outputs=out)
    gr.HTML(
        """
        <div class="footer-note">
        © 2026 Audio Flamingo Next | Powered by 🤗 Transformers + Gradio
        </div>
        """
    )
543
+
544
+
545
+ if __name__ == "__main__":
546
+ demo.launch(share=True)
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ffmpeg
2
+ libsndfile1
3
+ git
4
+ openssh-client
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/lashahub/transformers.git@add_AudioFlamingoNext
2
+
3
+ accelerate
4
+ torch
5
+ torchaudio
6
+ librosa
7
+ soundfile
8
+ yt-dlp
9
+ gradio==5.49.1
10
+ pysocks