"""Static showcase Space for the Llama-3 MIDI Composer.

Renders a two-tab Gradio app: a gallery of pre-generated audio samples
(with the prompts that produced them) and a technical report. The model
itself is NOT run here — inference requires a GPU, so this Space only
audits pre-rendered output.
"""

import os

import gradio as gr

# --- CONFIGURATION ---
# Maps each text prompt to its pre-generated audio file. The .wav files
# listed here must be uploaded to the Space alongside this script.
DATA = [
    {
        "label": "Example 1: Sonata Mix",
        "wav": "Sonata_T1_L1024.wav",
        "prompt": "Caption: Sonata | Genre: ['lead-sheet', 'mozart'] | Mood: ['melodic']",
    },
    {
        "label": "Example 2: Energetic Pop",
        "wav": "A_fast_energetic_piano_piece_T1_L1024.wav",
        "prompt": "Caption: A fast, energetic piano piece. | Genre: ['classical', 'pop'] | Mood: ['expressive']",
    },
    {
        "label": "Example 3: Mozart K280",
        "wav": "Sonata_in_F_Major_K280_Part_T1_L1024.wav",
        "prompt": "Caption: Sonata in F Major, K280 (Part 1)\nGenre: ['classical', 'mozart']\nMood: ['elegant', 'structured']",
    },
    {
        "label": "Example 4: Chord Progression",
        "wav": "With_a_4_4_time_signature_it_T1_L1024.wav",
        "prompt": "Caption: With a 4/4 time signature, it follows a chord progression of Abm, E, B, and F#.\nGenre: ['lead-sheet', 'mozart']\nMood: ['dark']",
    },
]

# --- REPORT TEXT ---
report_content = """
# 🎹 Llama-3 MIDI Composer: Technical Report

### 1. Project Overview: Cross-Modal Generation
This project demonstrates a unique approach to generative AI by repurposing a **text-based Large Language Model (Llama-3 8B)** to process and generate a completely different modality: **Symbolic Music (MIDI)**.

Unlike audio models that process raw waveforms (which are heavy and continuous), this model treats music as a **discrete language**. By aligning natural language instructions with musical data, the model learns to "translate" text descriptions directly into musical compositions.

### 2. Core Architecture & Mechanisms

#### 2.1 Overcoming the Context Window via Tokenization
A major challenge in generating music with LLMs is the sheer length of musical data. Raw audio or standard MIDI streams can easily exceed a model's context window.

* **Solution:** We utilize **MIDI Tokenization (REMI)** to compress complex musical information (pitch, velocity, duration) into a compact sequence of discrete tokens.
* **Impact:** This efficient encoding allows us to fit entire musical phrases into the **Llama-3 context window**, enabling the model to maintain structure and coherence over time, which would be impossible with raw data.

#### 2.2 Instruction Tuning (Text-to-Music Alignment)
Rather than simple "causal modeling" (predicting the next note based only on previous notes), this model is trained using **Instruction Fine-Tuning**.

* **The Link:** We utilize a dataset of MIDI files paired with rich **Captions ("Caps")** and metadata (Genre, Mood).
* **The Learning Objective:** The model is trained to embed the text instructions alongside the musical scores. This forces the model to learn the semantic relationship between a word like *"sad"* and a musical feature like *Minor Key* or *Slow Tempo*. It is not just generating music; it is following a specific command to construct music that matches the text features.

### 3. Dataset & Training Strategy
The model was fine-tuned on a curated dataset ("The Four Pillars") designed to teach different aspects of this cross-modal link:

1. **Wikifonia (Lead Sheets):** Links specific genre tags to melody/chord structures.
2. **GigaMIDI (Piano):** Links "piano" descriptors to polyphonic textures.
3. **Maestro (Classical):** Links "classical" instructions to complex harmonic structures.
4. **MidiCaps (Semantics):** Provides the deep link between descriptive natural language (captions) and musical output.

### 4. Technical Limitations & Demo Design
* **Inference Latency:** Generating tokens autoregressively is computationally intensive. To ensure a smooth user experience, this demo provides **pre-generated samples** for immediate auditing.
* **Hardware Constraints:** The model is a Proof-of-Concept trained on limited compute (Single T4 GPU). While it successfully demonstrates the text-to-music capability, longer-range coherence is limited compared to models trained on massive clusters.

### 5. How to Use the Model (Run it Yourself)
This Hugging Face Space is a **Static Showcase** for auditing the model's capabilities. Because the model requires a GPU to generate tokens efficiently, it cannot run in real-time on this free CPU space.

**To generate your own music:**
1. **Download the Adapter:** The Low-Rank Adapter (LoRA) weights are available here: [**st192011/llama3-midi-composer**](https://huggingface.co/st192011/llama3-midi-composer).
2. **Run in Colab:** You can use the provided inference notebook (available in the model card) to load the model on a free T4 GPU in Google Colab.
3. **Prompting:** Load the model using `unsloth`, and prompt it using the format:

```text
### Instruction:
Generate a piano score based on the description.

### Input:
Caption: [Your Description] | Genre: ['piano']

### Response:
```
"""

# --- CSS STYLING ---
css = """
.prompt-box { font-size: 0.9em !important; font-family: monospace; }
"""


def _sample_panel(sample: dict) -> None:
    """Render one demo-sample panel: label, read-only prompt, audio preview.

    Must be called inside an active Gradio layout context (e.g. a ``gr.Row``).
    """
    with gr.Column(variant="panel"):
        gr.Markdown(f"**{sample['label']}**")
        gr.Textbox(
            value=sample["prompt"],
            show_label=False,
            lines=4,
            interactive=False,
            elem_classes="prompt-box",
        )
        # Check if file exists to prevent errors during build if the
        # audio asset hasn't been uploaded yet.
        if os.path.exists(sample["wav"]):
            gr.Audio(value=sample["wav"], label="Audio Preview", type="filepath")
        else:
            gr.Markdown("*Audio file pending upload...*")


# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# 🎹 Llama-3 MIDI Composer")
    gr.Markdown("A Text-to-MIDI generation model based on Llama-3 8B. (Static Showcase)")

    with gr.Tabs():
        # TAB 1: DEMO GALLERY
        with gr.Tab("🎵 Demo Gallery"):
            gr.Markdown("### 🎧 Generated Samples")
            gr.Markdown(
                "These samples were generated by the model running on a T4 GPU "
                "(Temperature: 1.0, Max Tokens: 1024)."
            )
            # Lay the samples out two per row instead of repeating the
            # panel markup four times.
            for row_start in range(0, len(DATA), 2):
                with gr.Row():
                    for sample in DATA[row_start:row_start + 2]:
                        _sample_panel(sample)

        # TAB 2: TECHNICAL REPORT
        with gr.Tab("📄 Technical Report"):
            gr.Markdown(report_content)

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()