# gem-audio / app.py
# (Hugging Face Space header: latterworks — "Update app.py", commit 6640d46 verified)
import gradio as gr
import os
from google import genai
from google.genai import types
# Default instruction prompt sent to Gemini alongside the audio / YouTube media.
# BUGFIX: the bullet characters were mojibake ("β€’" — UTF-8 "•" mis-decoded as
# cp1252); restored to proper "•" bullets. All other text is unchanged.
DEFAULT_PROMPT = """
You are a phoneme-precise music transcriptionist and sonic prompt architect. Your task is to analyze the provided audio and return a Suno-compatible output consisting of:
• Structured section headers for musical form
• Phonemically accurate lyrics (if vocals are present)
• A detailed [Suno] metadata block capturing the sonic, emotional, and stylistic identity of the track
────────────────────────────
1. VOCALS (If Present)
────────────────────────────
- Transcribe lyrics as performed, using stylized phonemic spelling (e.g., vowel extensions, pitch-inflected phrasing)
- Use parentheses ( ) only for short adlibs, interjections, or vocal textures
- Never include meta-descriptions, labels, or invented content
- Use structural section tags in all caps and brackets: [INTRO], [VERSE], [HOOK], [OUTRO], etc.
- If gendered vocals are present, use tags like [MALE VERSE], [FEMALE HOOK]
────────────────────────────
2. INSTRUMENTAL-ONLY TRACKS
────────────────────────────
- Still mark sections using musical structure tags
- Leave sections empty or use stylized cues only if audible (e.g., (riser), (cowbell hit))
- Do not fabricate lyrics or write in narrative form
────────────────────────────
3. FINAL OUTPUT FORMAT
────────────────────────────
After the lyrics or structure, output a [Suno] metadata block containing:
[Suno]
Style: (genre and subgenres)
Mood: (emotional tone)
Tempo: (BPM estimate)
Key: (musical key if detectable)
Vocals: (gender, style, effects)
Delivery: (singing style, rap style, etc.)
Instrumentation: (instruments and sounds used)
Mix: (production style, reverb, compression, etc.)
Structure: (song structure pattern)
Use evocative, technically grounded language. Focus on textural qualities, performance style, emotional tone, and sonic distinctiveness.
────────────────────────────
4. OUTPUT CONSTRAINTS
────────────────────────────
- Output **only** the lyrics/structure + the [Suno] block
- No explanation, notes, or markdown
- No placeholder tags, commentary, or repetition
- All output must be parsable by a downstream music generation agent
"""
def analyze_media(media_input, media_type, api_key, custom_prompt):
    """Analyze an audio file or YouTube URL with Gemini and return the model's text.

    Args:
        media_input: Filesystem path to an uploaded audio file when
            media_type == "Audio", or a YouTube URL string when
            media_type == "YouTube".
        media_type: Either "Audio" or "YouTube".
        api_key: Google AI API key used to construct the client.
        custom_prompt: Instruction text sent to the model alongside the media.

    Returns:
        The model's response text on success, or a "❌ ..."-prefixed error
        message string (the Gradio UI shows either in the same textbox).
    """
    # Input validation — return user-facing messages instead of raising.
    if not api_key or not api_key.strip():
        return "❌ Please enter your Google AI API key"
    if media_type == "Audio" and media_input is None:
        return "❌ Please upload an audio file"
    elif media_type == "YouTube" and (not media_input or not media_input.strip()):
        return "❌ Please enter a YouTube URL"
    try:
        client = genai.Client(api_key=api_key.strip())
        MODEL_ID = "gemini-2.0-flash"
        if media_type == "Audio":
            file_size = os.path.getsize(media_input)
            if file_size > 20 * 1024 * 1024:
                # Above the ~20MB inline-request limit: use the Files API.
                uploaded_file = client.files.upload(file=media_input)
                response = client.models.generate_content(
                    model=MODEL_ID,
                    contents=[
                        custom_prompt,
                        uploaded_file,
                    ],
                )
            else:
                # Small file: send the raw bytes inline.
                with open(media_input, 'rb') as f:
                    audio_data = f.read()
                # BUGFIX: the MIME type was hard-coded to 'audio/mp3', but
                # gr.Audio(type="filepath") passes through whatever format the
                # user uploaded. Map the extension to Gemini's supported audio
                # MIME types; keep 'audio/mp3' as the fallback (original
                # behavior) for unknown extensions.
                ext = os.path.splitext(media_input)[1].lower()
                mime_by_ext = {
                    '.mp3': 'audio/mp3',
                    '.wav': 'audio/wav',
                    '.aiff': 'audio/aiff',
                    '.aac': 'audio/aac',
                    '.ogg': 'audio/ogg',
                    '.flac': 'audio/flac',
                }
                response = client.models.generate_content(
                    model=MODEL_ID,
                    contents=[
                        custom_prompt,
                        types.Part.from_bytes(
                            data=audio_data,
                            mime_type=mime_by_ext.get(ext, 'audio/mp3'),
                        ),
                    ],
                )
        elif media_type == "YouTube":
            # YouTube URLs are referenced via FileData rather than uploaded.
            youtube_url = media_input.strip()
            response = client.models.generate_content(
                model=MODEL_ID,
                contents=types.Content(
                    parts=[
                        types.Part(text=custom_prompt),
                        types.Part(
                            file_data=types.FileData(file_uri=youtube_url)
                        ),
                    ]
                ),
            )
        return response.text
    except Exception as e:
        # Surface any SDK/network/API failure to the UI instead of crashing.
        return f"❌ Error: {str(e)}"
# Gradio UI: audio upload or YouTube URL -> Suno-style analysis via Gemini.
with gr.Blocks() as demo:
    # BUGFIX: title contained mojibake ("β†’" was a mis-decoded "→").
    gr.Markdown("# 🎧 Audio & YouTube → Suno Analyzer")
    with gr.Row():
        media_type = gr.Radio(["Audio", "YouTube"], value="Audio", label="Input Type")
    with gr.Row():
        # Only one of these two inputs is visible at a time (toggled below).
        audio_input = gr.Audio(sources=["upload"], type="filepath", label="Audio File", visible=True)
        youtube_input = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...", visible=False)
    api_key = gr.Textbox(label="Google AI API Key", type="password")
    custom_prompt = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt", lines=10)
    analyze_btn = gr.Button("Analyze")
    output = gr.Textbox(label="Results", lines=15)

    def update_inputs(choice):
        """Show the widget matching the selected input type; hide the other."""
        if choice == "Audio":
            return gr.update(visible=True), gr.update(visible=False)
        else:
            return gr.update(visible=False), gr.update(visible=True)

    media_type.change(
        fn=update_inputs,
        inputs=[media_type],
        outputs=[audio_input, youtube_input]
    )

    def process_media(media_type, audio_file, youtube_url, api_key, prompt):
        """Route whichever input is active to analyze_media."""
        if media_type == "Audio":
            return analyze_media(audio_file, "Audio", api_key, prompt)
        else:
            return analyze_media(youtube_url, "YouTube", api_key, prompt)

    analyze_btn.click(
        fn=process_media,
        inputs=[media_type, audio_input, youtube_input, api_key, custom_prompt],
        outputs=output
    )

demo.launch()