# Midi-Composer / app.py
# Hugging Face Space by st192011 — last commit: "Update app.py" (6d220ef, verified)
import gradio as gr
import os
# --- CONFIGURATION ---
# Each entry pairs a text prompt with the pre-generated audio file it
# produced. The .wav files listed here must be uploaded to the Space.
_SAMPLES = [
    (
        "Example 1: Sonata Mix",
        "Sonata_T1_L1024.wav",
        "Caption: Sonata | Genre: ['lead-sheet', 'mozart'] | Mood: ['melodic']",
    ),
    (
        "Example 2: Energetic Pop",
        "A_fast_energetic_piano_piece_T1_L1024.wav",
        "Caption: A fast, energetic piano piece. | Genre: ['classical', 'pop'] | Mood: ['expressive']",
    ),
    (
        "Example 3: Mozart K280",
        "Sonata_in_F_Major_K280_Part_T1_L1024.wav",
        "Caption: Sonata in F Major, K280 (Part 1)\nGenre: ['classical', 'mozart']\nMood: ['elegant', 'structured']",
    ),
    (
        "Example 4: Chord Progression",
        "With_a_4_4_time_signature_it_T1_L1024.wav",
        "Caption: With a 4/4 time signature, it follows a chord progression of Abm, E, B, and F#.\nGenre: ['lead-sheet', 'mozart']\nMood: ['dark']",
    ),
]
# The UI code below expects a list of dicts with these three keys.
DATA = [
    {"label": label, "wav": wav, "prompt": prompt}
    for label, wav, prompt in _SAMPLES
]
# --- REPORT TEXT ---
# Markdown body of the "Technical Report" tab. It is rendered verbatim by
# gr.Markdown() in the UI below, so any edits to the report go here.
# NOTE(review): the emoji and fenced-code sections are part of the rendered
# output — keep the string content byte-exact when editing.
report_content = """
# ๐ŸŽน Llama-3 MIDI Composer: Technical Report
### 1. Project Overview: Cross-Modal Generation
This project demonstrates a unique approach to generative AI by repurposing a **text-based Large Language Model (Llama-3 8B)** to process and generate a completely different modality: **Symbolic Music (MIDI)**.
Unlike audio models that process raw waveforms (which are heavy and continuous), this model treats music as a **discrete language**. By aligning natural language instructions with musical data, the model learns to "translate" text descriptions directly into musical compositions.
### 2. Core Architecture & Mechanisms
#### 2.1 Overcoming the Context Window via Tokenization
A major challenge in generating music with LLMs is the sheer length of musical data. Raw audio or standard MIDI streams can easily exceed a model's context window.
* **Solution:** We utilize **MIDI Tokenization (REMI)** to compress complex musical information (pitch, velocity, duration) into a compact sequence of discrete tokens.
* **Impact:** This efficient encoding allows us to fit entire musical phrases into the **Llama-3 context window**, enabling the model to maintain structure and coherence over time, which would be impossible with raw data.
#### 2.2 Instruction Tuning (Text-to-Music Alignment)
Rather than simple "causal modeling" (predicting the next note based only on previous notes), this model is trained using **Instruction Fine-Tuning**.
* **The Link:** We utilize a dataset of MIDI files paired with rich **Captions ("Caps")** and metadata (Genre, Mood).
* **The Learning Objective:** The model is trained to embed the text instructions alongside the musical scores. This forces the model to learn the semantic relationship between a word like *"sad"* and a musical feature like *Minor Key* or *Slow Tempo*. It is not just generating music; it is following a specific command to construct music that matches the text features.
### 3. Dataset & Training Strategy
The model was fine-tuned on a curated dataset ("The Four Pillars") designed to teach different aspects of this cross-modal link:
1. **Wikifonia (Lead Sheets):** Links specific genre tags to melody/chord structures.
2. **GigaMIDI (Piano):** Links "piano" descriptors to polyphonic textures.
3. **Maestro (Classical):** Links "classical" instructions to complex harmonic structures.
4. **MidiCaps (Semantics):** Provides the deep link between descriptive natural language (captions) and musical output.
### 4. Technical Limitations & Demo Design
* **Inference Latency:** Generating tokens autoregressively is computationally intensive. To ensure a smooth user experience, this demo provides **pre-generated samples** for immediate auditing.
* **Hardware Constraints:** The model is a Proof-of-Concept trained on limited compute (Single T4 GPU). While it successfully demonstrates the text-to-music capability, longer-range coherence is limited compared to models trained on massive clusters.
### 5. How to Use the Model (Run it Yourself)
This Hugging Face Space is a **Static Showcase** for auditing the model's capabilities. Because the model requires a GPU to generate tokens efficiently, it cannot run in real-time on this free CPU space.
**To generate your own music:**
1. **Download the Adapter:** The Low-Rank Adapter (LoRA) weights are available here: [**st192011/llama3-midi-composer**](https://huggingface.co/st192011/llama3-midi-composer).
2. **Run in Colab:** You can use the provided inference notebook (available in the model card) to load the model on a free T4 GPU in Google Colab.
3. **Prompting:** Load the model using `unsloth`, and prompt it using the format:
```text
### Instruction:
Generate a piano score based on the description.
### Input:
Caption: [Your Description] | Genre: ['piano']
### Response:
```
"""
# --- CSS STYLING ---
# Custom rule injected into gr.Blocks: shrinks and monospaces the read-only
# prompt boxes in the gallery (applied via elem_classes="prompt-box").
css = (
    "\n"
    ".prompt-box { font-size: 0.9em !important; font-family: monospace; }\n"
)
# --- GRADIO UI ---
def _sample_card(item: dict) -> None:
    """Render one gallery card: bold label, read-only prompt, audio preview.

    Falls back to a placeholder note when the sample's .wav has not been
    uploaded yet, so the Space still builds without errors.
    """
    with gr.Column(variant="panel"):
        gr.Markdown(f"**{item['label']}**")
        gr.Textbox(
            value=item["prompt"],
            show_label=False,
            lines=4,
            interactive=False,
            elem_classes="prompt-box",
        )
        # Check if file exists to prevent errors during build if audio isn't uploaded yet
        if os.path.exists(item["wav"]):
            gr.Audio(value=item["wav"], label="Audio Preview", type="filepath")
        else:
            gr.Markdown("*Audio file pending upload...*")


with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# ๐ŸŽน Llama-3 MIDI Composer")
    gr.Markdown("A Text-to-MIDI generation model based on Llama-3 8B. (Static Showcase)")
    with gr.Tabs():
        # TAB 1: DEMO GALLERY
        with gr.Tab("๐ŸŽต Demo Gallery"):
            gr.Markdown("### ๐ŸŽง Generated Samples")
            gr.Markdown("These samples were generated by the model running on a T4 GPU (Temperature: 1.0, Max Tokens: 1024).")
            # Lay the samples out two per row; looping over DATA replaces the
            # four hand-copied card blocks and scales to any sample count.
            for row_start in range(0, len(DATA), 2):
                with gr.Row():
                    for item in DATA[row_start:row_start + 2]:
                        _sample_card(item)
        # TAB 2: TECHNICAL REPORT
        with gr.Tab("๐Ÿ“„ Technical Report"):
            gr.Markdown(report_content)

# Launch only when executed as a script; importing this module (as the
# Spaces runtime may do to pick up `demo`) should not start a second server.
if __name__ == "__main__":
    demo.launch()