"""Gradio app: analyze an audio file or YouTube video with Gemini and emit a
Suno-compatible transcription + metadata block.

Requires a Google AI API key supplied through the UI.
"""

import mimetypes
import os

import gradio as gr
from google import genai
from google.genai import types

# Files larger than this are uploaded via the Files API instead of inline bytes
# (20 MB threshold recommended by the Gemini docs for inline audio).
_INLINE_AUDIO_LIMIT = 20 * 1024 * 1024

# Model used for all requests.
_MODEL_ID = "gemini-2.0-flash"

DEFAULT_PROMPT = """
You are a phoneme-precise music transcriptionist and sonic prompt architect.
Your task is to analyze the provided audio and return a Suno-compatible output consisting of:
• Structured section headers for musical form
• Phonemically accurate lyrics (if vocals are present)
• A detailed [Suno] metadata block capturing the sonic, emotional, and stylistic identity of the track
────────────────────────────
1. VOCALS (If Present)
────────────────────────────
- Transcribe lyrics as performed, using stylized phonemic spelling (e.g., vowel extensions, pitch-inflected phrasing)
- Use parentheses ( ) only for short adlibs, interjections, or vocal textures
- Never include meta-descriptions, labels, or invented content
- Use structural section tags in all caps and brackets: [INTRO], [VERSE], [HOOK], [OUTRO], etc.
- If gendered vocals are present, use tags like [MALE VERSE], [FEMALE HOOK]
────────────────────────────
2. INSTRUMENTAL-ONLY TRACKS
────────────────────────────
- Still mark sections using musical structure tags
- Leave sections empty or use stylized cues only if audible (e.g., (riser), (cowbell hit))
- Do not fabricate lyrics or write in narrative form
────────────────────────────
3. FINAL OUTPUT FORMAT
────────────────────────────
After the lyrics or structure, output a [Suno] metadata block containing:
[Suno]
Style: (genre and subgenres)
Mood: (emotional tone)
Tempo: (BPM estimate)
Key: (musical key if detectable)
Vocals: (gender, style, effects)
Delivery: (singing style, rap style, etc.)
Instrumentation: (instruments and sounds used)
Mix: (production style, reverb, compression, etc.)
Structure: (song structure pattern)
Use evocative, technically grounded language. Focus on textural qualities, performance style, emotional tone, and sonic distinctiveness.
────────────────────────────
4. OUTPUT CONSTRAINTS
────────────────────────────
- Output **only** the lyrics/structure + the [Suno] block
- No explanation, notes, or markdown
- No placeholder tags, commentary, or repetition
- All output must be parsable by a downstream music generation agent
"""


def _analyze_audio(client, file_path, prompt):
    """Send one local audio file to Gemini and return the raw response.

    Large files go through the Files API; small files are sent inline with a
    MIME type guessed from the extension (the original hardcoded 'audio/mp3',
    which mislabels WAV/FLAC/OGG uploads from gr.Audio).
    """
    if os.path.getsize(file_path) > _INLINE_AUDIO_LIMIT:
        uploaded = client.files.upload(file=file_path)
        return client.models.generate_content(
            model=_MODEL_ID,
            contents=[prompt, uploaded],
        )

    with open(file_path, 'rb') as f:
        audio_data = f.read()
    # Fall back to the original default when the extension is unrecognized.
    mime_type = mimetypes.guess_type(file_path)[0] or 'audio/mp3'
    return client.models.generate_content(
        model=_MODEL_ID,
        contents=[
            prompt,
            types.Part.from_bytes(data=audio_data, mime_type=mime_type),
        ],
    )


def _analyze_youtube(client, youtube_url, prompt):
    """Ask Gemini to analyze a YouTube video by URL and return the raw response."""
    return client.models.generate_content(
        model=_MODEL_ID,
        contents=types.Content(
            parts=[
                types.Part(text=prompt),
                types.Part(file_data=types.FileData(file_uri=youtube_url)),
            ]
        ),
    )


def analyze_media(media_input, media_type, api_key, custom_prompt):
    """Validate the inputs and run the appropriate Gemini analysis.

    Args:
        media_input: file path (Audio) or URL string (YouTube).
        media_type: "Audio" or "YouTube".
        api_key: Google AI API key.
        custom_prompt: prompt text sent alongside the media.

    Returns:
        The model's text output, or a human-readable "❌ ..." error string.
    """
    if not api_key or not api_key.strip():
        return "❌ Please enter your Google AI API key"
    if media_type == "Audio" and media_input is None:
        return "❌ Please upload an audio file"
    elif media_type == "YouTube" and (not media_input or not media_input.strip()):
        return "❌ Please enter a YouTube URL"

    try:
        client = genai.Client(api_key=api_key.strip())
        if media_type == "Audio":
            response = _analyze_audio(client, media_input, custom_prompt)
        elif media_type == "YouTube":
            response = _analyze_youtube(client, media_input.strip(), custom_prompt)
        else:
            # Original code left `response` unbound here (NameError swallowed
            # by the broad except); raise something meaningful instead.
            raise ValueError(f"Unsupported media type: {media_type!r}")
        # response.text can be None when the model returns no text parts.
        return response.text or "❌ No text returned by the model"
    except Exception as e:
        return f"❌ Error: {str(e)}"


# Simple interface with both audio and YouTube support
with gr.Blocks() as demo:
    gr.Markdown("# 🎧 Audio & YouTube → Suno Analyzer")

    with gr.Row():
        media_type = gr.Radio(["Audio", "YouTube"], value="Audio", label="Input Type")
    with gr.Row():
        audio_input = gr.Audio(
            sources=["upload"], type="filepath", label="Audio File", visible=True
        )
        youtube_input = gr.Textbox(
            label="YouTube URL",
            placeholder="https://www.youtube.com/watch?v=...",
            visible=False,
        )

    api_key = gr.Textbox(label="Google AI API Key", type="password")
    custom_prompt = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt", lines=10)
    analyze_btn = gr.Button("Analyze")
    output = gr.Textbox(label="Results", lines=15)

    def update_inputs(choice):
        """Show the input widget matching the selected media type, hide the other."""
        if choice == "Audio":
            return gr.update(visible=True), gr.update(visible=False)
        return gr.update(visible=False), gr.update(visible=True)

    media_type.change(
        fn=update_inputs,
        inputs=[media_type],
        outputs=[audio_input, youtube_input],
    )

    def process_media(media_type, audio_file, youtube_url, api_key, prompt):
        """Route the clicked analysis to the correct input field."""
        if media_type == "Audio":
            return analyze_media(audio_file, "Audio", api_key, prompt)
        return analyze_media(youtube_url, "YouTube", api_key, prompt)

    analyze_btn.click(
        fn=process_media,
        inputs=[media_type, audio_input, youtube_input, api_key, custom_prompt],
        outputs=output,
    )

demo.launch()