# gem-audio / app.py
# (Hugging Face Space header: latterworks — "Update app.py", commit 6640d46 verified)
import gradio as gr
import os
from google import genai
from google.genai import types
# Default instruction prompt sent to Gemini alongside the audio / YouTube media.
# BUGFIX: the bullet characters were mojibake ("β€’" — UTF-8 "•" mis-decoded as
# cp1252); restored to proper "•" bullets. All other text is unchanged.
DEFAULT_PROMPT = """
You are a phoneme-precise music transcriptionist and sonic prompt architect. Your task is to analyze the provided audio and return a Suno-compatible output consisting of:
• Structured section headers for musical form
• Phonemically accurate lyrics (if vocals are present)
• A detailed [Suno] metadata block capturing the sonic, emotional, and stylistic identity of the track
────────────────────────────
1. VOCALS (If Present)
────────────────────────────
- Transcribe lyrics as performed, using stylized phonemic spelling (e.g., vowel extensions, pitch-inflected phrasing)
- Use parentheses ( ) only for short adlibs, interjections, or vocal textures
- Never include meta-descriptions, labels, or invented content
- Use structural section tags in all caps and brackets: [INTRO], [VERSE], [HOOK], [OUTRO], etc.
- If gendered vocals are present, use tags like [MALE VERSE], [FEMALE HOOK]
────────────────────────────
2. INSTRUMENTAL-ONLY TRACKS
────────────────────────────
- Still mark sections using musical structure tags
- Leave sections empty or use stylized cues only if audible (e.g., (riser), (cowbell hit))
- Do not fabricate lyrics or write in narrative form
────────────────────────────
3. FINAL OUTPUT FORMAT
────────────────────────────
After the lyrics or structure, output a [Suno] metadata block containing:
[Suno]
Style: (genre and subgenres)
Mood: (emotional tone)
Tempo: (BPM estimate)
Key: (musical key if detectable)
Vocals: (gender, style, effects)
Delivery: (singing style, rap style, etc.)
Instrumentation: (instruments and sounds used)
Mix: (production style, reverb, compression, etc.)
Structure: (song structure pattern)
Use evocative, technically grounded language. Focus on textural qualities, performance style, emotional tone, and sonic distinctiveness.
────────────────────────────
4. OUTPUT CONSTRAINTS
────────────────────────────
- Output **only** the lyrics/structure + the [Suno] block
- No explanation, notes, or markdown
- No placeholder tags, commentary, or repetition
- All output must be parsable by a downstream music generation agent
"""
def analyze_media(media_input, media_type, api_key, custom_prompt):
    """Analyze an audio file or YouTube URL with Gemini and return the model's text.

    Args:
        media_input: Filesystem path to an uploaded audio file when
            media_type == "Audio", or a YouTube URL string when
            media_type == "YouTube".
        media_type: Either "Audio" or "YouTube".
        api_key: Google AI API key used to construct the client.
        custom_prompt: Instruction text sent to the model alongside the media.

    Returns:
        The model's response text on success, or a "❌ ..."-prefixed error
        message string (the Gradio UI shows either in the same textbox).
    """
    # Input validation — return user-facing messages instead of raising.
    if not api_key or not api_key.strip():
        return "❌ Please enter your Google AI API key"
    if media_type == "Audio" and media_input is None:
        return "❌ Please upload an audio file"
    elif media_type == "YouTube" and (not media_input or not media_input.strip()):
        return "❌ Please enter a YouTube URL"
    try:
        client = genai.Client(api_key=api_key.strip())
        MODEL_ID = "gemini-2.0-flash"
        if media_type == "Audio":
            file_size = os.path.getsize(media_input)
            if file_size > 20 * 1024 * 1024:
                # Above the ~20MB inline-request limit: use the Files API.
                uploaded_file = client.files.upload(file=media_input)
                response = client.models.generate_content(
                    model=MODEL_ID,
                    contents=[
                        custom_prompt,
                        uploaded_file,
                    ],
                )
            else:
                # Small file: send the raw bytes inline.
                with open(media_input, 'rb') as f:
                    audio_data = f.read()
                # BUGFIX: the MIME type was hard-coded to 'audio/mp3', but
                # gr.Audio(type="filepath") passes through whatever format the
                # user uploaded. Map the extension to Gemini's supported audio
                # MIME types; keep 'audio/mp3' as the fallback (original
                # behavior) for unknown extensions.
                ext = os.path.splitext(media_input)[1].lower()
                mime_by_ext = {
                    '.mp3': 'audio/mp3',
                    '.wav': 'audio/wav',
                    '.aiff': 'audio/aiff',
                    '.aac': 'audio/aac',
                    '.ogg': 'audio/ogg',
                    '.flac': 'audio/flac',
                }
                response = client.models.generate_content(
                    model=MODEL_ID,
                    contents=[
                        custom_prompt,
                        types.Part.from_bytes(
                            data=audio_data,
                            mime_type=mime_by_ext.get(ext, 'audio/mp3'),
                        ),
                    ],
                )
        elif media_type == "YouTube":
            # YouTube URLs are referenced via FileData rather than uploaded.
            youtube_url = media_input.strip()
            response = client.models.generate_content(
                model=MODEL_ID,
                contents=types.Content(
                    parts=[
                        types.Part(text=custom_prompt),
                        types.Part(
                            file_data=types.FileData(file_uri=youtube_url)
                        ),
                    ]
                ),
            )
        return response.text
    except Exception as e:
        # Surface any SDK/network/API failure to the UI instead of crashing.
        return f"❌ Error: {str(e)}"
# Gradio UI: audio upload or YouTube URL -> Suno-style analysis via Gemini.
with gr.Blocks() as demo:
    # BUGFIX: title contained mojibake ("β†’" was a mis-decoded "→").
    gr.Markdown("# 🎧 Audio & YouTube → Suno Analyzer")
    with gr.Row():
        media_type = gr.Radio(["Audio", "YouTube"], value="Audio", label="Input Type")
    with gr.Row():
        # Only one of these two inputs is visible at a time (toggled below).
        audio_input = gr.Audio(sources=["upload"], type="filepath", label="Audio File", visible=True)
        youtube_input = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...", visible=False)
    api_key = gr.Textbox(label="Google AI API Key", type="password")
    custom_prompt = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt", lines=10)
    analyze_btn = gr.Button("Analyze")
    output = gr.Textbox(label="Results", lines=15)

    def update_inputs(choice):
        """Show the widget matching the selected input type; hide the other."""
        if choice == "Audio":
            return gr.update(visible=True), gr.update(visible=False)
        else:
            return gr.update(visible=False), gr.update(visible=True)

    media_type.change(
        fn=update_inputs,
        inputs=[media_type],
        outputs=[audio_input, youtube_input]
    )

    def process_media(media_type, audio_file, youtube_url, api_key, prompt):
        """Route whichever input is active to analyze_media."""
        if media_type == "Audio":
            return analyze_media(audio_file, "Audio", api_key, prompt)
        else:
            return analyze_media(youtube_url, "YouTube", api_key, prompt)

    analyze_btn.click(
        fn=process_media,
        inputs=[media_type, audio_input, youtube_input, api_key, custom_prompt],
        outputs=output
    )

demo.launch()