Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import gradio as gr | |
| import requests | |
| from dotenv import load_dotenv | |
| from datetime import datetime | |
| from pathlib import Path | |
| from basic_pitch.inference import predict_and_save | |
| from basic_pitch import ICASSP_2022_MODEL_PATH | |
| from music21 import converter | |
| import base64 | |
# === 1. Environment Configuration & OpenAI Client ===
# Load .env first so the os.getenv calls below see the configured values.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MUSICGEN_API_URL = os.getenv("MUSICGEN_API_URL")  # external MusicGen generation endpoint (optional; checked at call time)
VEROVIO_API_URL = os.getenv("VEROVIO_API_URL")    # external Verovio score-rendering endpoint (optional; checked at call time)
# Hard requirement: the app cannot start without an OpenAI key.
assert OPENAI_API_KEY, "β Please set OPENAI_API_KEY in your .env file"
# Use OpenAI v1 client
from openai import OpenAI
openai_client = OpenAI(api_key=OPENAI_API_KEY)
# Create output directory if it doesn't exist
Path("output").mkdir(exist_ok=True)
| # === 2. Tool Functions === | |
def generate_music_from_hum(melody_file: str, prompt: str) -> str:
    """
    Generate a music WAV from a user's humming plus a style prompt.

    Posts the humming audio and the text prompt to the external MusicGen
    API and writes the returned audio bytes under output/.

    Args:
        melody_file: Path to the user's humming WAV file.
        prompt: Text description of the desired musical style.

    Returns:
        Path of the generated WAV on success, or an error string on failure.
    """
    if not MUSICGEN_API_URL:
        return "β MUSICGEN_API_URL is not configured"
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    target_wav = f"output/generated_{ts}.wav"
    try:
        # Stream the melody file to the API together with the style prompt.
        with open(melody_file, "rb") as melody:
            resp = requests.post(
                MUSICGEN_API_URL,
                files={"melody": ("hum.wav", melody, "audio/wav")},
                data={"text": prompt},
                timeout=180,
            )
        if resp.status_code != 200:
            return f"β MusicGen API error {resp.status_code}: {resp.text}"
        # Persist the returned audio bytes for the UI to play back.
        with open(target_wav, "wb") as out:
            out.write(resp.content)
        return target_wav
    except Exception as e:
        return f"β Music generation failed: {e}"
def wav_to_musicxml(wav_path: str, timestamp: str | None = None) -> str:
    """
    Convert a WAV audio file to MusicXML using basic-pitch for pitch detection.

    Transcribes the audio to MIDI with basic-pitch, then converts that MIDI
    to MusicXML with music21.

    Args:
        wav_path: Path of the WAV file to transcribe.
        timestamp: Optional timestamp string for the output filename;
            defaults to the current time.

    Returns:
        Path of the written .musicxml file on success, or an error string
        if no MIDI was produced.
    """
    ts = timestamp or datetime.now().strftime("%Y%m%d_%H%M%S")
    # Remove stale basic-pitch MIDI output so an old file can't be picked up.
    for midi_file in Path("output").glob("*_basic_pitch.mid"):
        midi_file.unlink()
    # Generate MIDI from the WAV (basic-pitch names it <stem>_basic_pitch.mid)
    predict_and_save(
        audio_path_list=[wav_path],
        output_directory="output",
        save_midi=True,
        sonify_midi=False,
        save_model_outputs=False,
        save_notes=False,
        model_or_model_path=ICASSP_2022_MODEL_PATH
    )
    # Glob only the basic-pitch output; the previous "*.mid" pattern could
    # match unrelated MIDI files lingering in output/.
    midi_files = list(Path("output").glob("*_basic_pitch.mid"))
    if not midi_files:
        return "β Failed to generate MIDI file"
    score = converter.parse(str(midi_files[0]))
    xml_path = f"output/generated_{ts}.musicxml"
    score.write("musicxml", fp=xml_path)
    return xml_path
def render_musicxml_via_verovio_api(musicxml_path: str) -> str:
    """
    Render a MusicXML file to an SVG preview using the Verovio API.

    Posts the file to VEROVIO_API_URL, takes the "svg" field from the JSON
    response, and embeds it as a base64 data-URI <img> inside a styled <div>.

    Returns:
        HTML containing the embedded SVG, or an error string on failure.
    """
    if not VEROVIO_API_URL:
        return "β VEROVIO_API_URL is not configured"
    try:
        with open(musicxml_path, "rb") as xml_file:
            resp = requests.post(VEROVIO_API_URL, files={"file": xml_file}, timeout=120)
        if resp.status_code != 200:
            return f"β Verovio API error {resp.status_code}: {resp.text}"
        svg_markup = resp.json().get("svg", "")
        # Encode as a data URI so the SVG can be embedded in an <img> tag.
        encoded = base64.b64encode(svg_markup.encode("utf-8")).decode("utf-8")
        pieces = [
            '<div style="background:white;padding:10px;border-radius:8px;">',
            f'<img src="data:image/svg+xml;base64,{encoded}" style="width:100%;" />',
            '</div>',
        ]
        return "".join(pieces)
    except Exception as e:
        return f"β SVG rendering failed: {e}"
def generate_score_from_audio(wav_file: str) -> str:
    """
    Extract a MusicXML score from a generated music WAV file.

    Thin wrapper around wav_to_musicxml that converts any raised
    exception into an error string instead of propagating it.
    """
    try:
        result = wav_to_musicxml(wav_file)
    except Exception as e:
        return f"β Score extraction failed: {e}"
    return result
# Map of tool names to functions
# Dispatch registry used by handle_request: the keys must stay in sync
# with the tool names listed in gpt_decide_tool's system prompt.
TOOL_MAP = {
    "generate_music_from_hum": generate_music_from_hum,
    "wav_to_musicxml": wav_to_musicxml,
    "render_musicxml_via_verovio_api": render_musicxml_via_verovio_api,
    "generate_score_from_audio": generate_score_from_audio,
}
# === 3. GPT Tool Selection ===
def gpt_decide_tool(message: str, audio_path: str) -> dict:
    """
    Ask GPT to choose which tool should handle the user's request.

    Args:
        message: The user's free-text request.
        audio_path: Path of the uploaded audio file.

    Returns:
        The parsed plan dict ({"tool_name", "args", "explanation"}), or
        {"error": ...} when the model's reply is not valid JSON.
    """
    # BUGFIX: the argument names advertised to GPT must match the real
    # function signatures, because handle_request dispatches via fn(**args)
    # with the keys GPT returns. The old prompt listed wav_file /
    # musicxml_file, which raised TypeError for wav_to_musicxml(wav_path)
    # and render_musicxml_via_verovio_api(musicxml_path).
    system_prompt = """
You are an AI music assistant. The user uploads an audio file and provides a request.
Choose the most appropriate tool from the list below and respond with strict JSON:
- generate_music_from_hum(melody_file, prompt)
- wav_to_musicxml(wav_path)
- render_musicxml_via_verovio_api(musicxml_path)
- generate_score_from_audio(wav_file)
JSON format:
{
  "tool_name": "...",
  "args": { ... },
  "explanation": "Reasoning explanation"
}
"""
    user_prompt = f"User request: {message}\nAudio file path: {audio_path}"
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.2  # low temperature for deterministic tool selection
    )
    text = response.choices[0].message.content
    try:
        return json.loads(text)
    except Exception:
        return {"error": f"Failed to parse JSON from GPT response:\n{text}"}
# === 4. Main Logic & Dynamic Output Display ===
def handle_request(audio_file, user_prompt):
    """
    Gradio callback: ask GPT for a tool plan, execute it, and route the
    result to the matching output component.

    Args:
        audio_file: Filepath of the uploaded audio (from gr.Audio).
        user_prompt: The user's free-text request.

    Returns:
        A 6-tuple matching the click() outputs:
        (status, explanation, log, audio update, svg update, text update).
    """
    # Input validation
    if not audio_file or not user_prompt:
        return (
            "β Please upload an audio file and enter a request",
            "", "",
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
        )
    plan = gpt_decide_tool(user_prompt, audio_file)
    if "error" in plan:
        return (plan["error"], "", "") + (gr.update(visible=False),) * 3
    tool_name = plan["tool_name"]
    args = plan.get("args", {})
    explanation = plan.get("explanation", "")
    log = f"π§ GPT chose: {tool_name}\nπ¦ Args: {json.dumps(args, ensure_ascii=False, indent=2)}"
    fn = TOOL_MAP.get(tool_name)
    if not fn:
        return (f"β Unknown tool: {tool_name}", explanation, log) + (gr.update(visible=False),) * 3
    # BUGFIX: GPT-produced args may not match the tool's signature; an
    # unguarded fn(**args) raised TypeError and crashed the handler.
    try:
        output = fn(**args)
    except Exception as e:
        return (f"β Tool execution failed: {e}", explanation, log) + (gr.update(visible=False),) * 3
    # Determine output type and update components accordingly
    if isinstance(output, str) and output.endswith(".wav") and os.path.isfile(output):
        return (
            "β Success", explanation, log,
            gr.update(value=output, visible=True),  # Audio
            gr.update(visible=False),               # SVG
            gr.update(visible=False)                # Text
        )
    if isinstance(output, str) and output.endswith(".musicxml") and os.path.isfile(output):
        # Automatically render MusicXML to SVG
        svg_html = render_musicxml_via_verovio_api(output)
        return (
            "β Success", explanation, log,
            gr.update(visible=False),
            gr.update(value=svg_html, visible=True),
            gr.update(visible=False)
        )
    if isinstance(output, str) and output.strip().startswith("<div"):
        # Already HTML SVG
        return (
            "β Success", explanation, log,
            gr.update(visible=False),
            gr.update(value=output, visible=True),
            gr.update(visible=False)
        )
    # Otherwise treat as plain text (this includes tools' error strings)
    return (
        "β Success", explanation, log,
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(value=str(output), visible=True)
    )
# === 5. Gradio Interface ===
# NOTE: component creation order determines the page layout, and the
# outputs list below must line up index-for-index with the 6-tuple
# returned by handle_request.
with gr.Blocks(title="πΆ Vibe Jamming β Your Music Assistant") as demo:
    gr.Markdown("## π΅ Vibe Jamming β Your Music Assistant")
    with gr.Row():
        # Inputs: the user's audio file and their free-text request
        audio_input = gr.Audio(label="Upload Audio (.wav)", type="filepath")
        text_input = gr.Textbox(label="Your Request", placeholder="e.g., Generate jazz music from my humming")
    run_button = gr.Button("π Run")
    # Always-visible status / diagnostic fields
    status_box = gr.Textbox(label="Status")
    explanation_box = gr.Textbox(label="Explanation")
    log_box = gr.Textbox(label="Tool Log", lines=6)
    # Result components: handle_request makes exactly one of these visible
    audio_output = gr.Audio(label="π§ Audio Output", visible=False, type="filepath")
    svg_output = gr.HTML(label="πΌοΈ Score Preview (SVG)", visible=False)
    text_output = gr.Textbox(label="π Text Output", visible=False, lines=4)
    run_button.click(
        fn=handle_request,
        inputs=[audio_input, text_input],
        outputs=[status_box, explanation_box, log_box, audio_output, svg_output, text_output]
    )
if __name__ == "__main__":
    demo.launch()