"""Static showcase Space for the Llama-3 MIDI Composer.

Renders a two-tab Gradio app: a gallery of pre-generated audio samples
(with the prompts that produced them) and a technical report. The model
itself is NOT run here — inference requires a GPU, so this Space only
audits pre-rendered output.
"""

import os

import gradio as gr

# --- CONFIGURATION ---
# Maps each text prompt to its pre-generated audio file. The .wav files
# listed here must be uploaded to the Space alongside this script.
DATA = [
    {
        "label": "Example 1: Sonata Mix",
        "wav": "Sonata_T1_L1024.wav",
        "prompt": "Caption: Sonata | Genre: ['lead-sheet', 'mozart'] | Mood: ['melodic']",
    },
    {
        "label": "Example 2: Energetic Pop",
        "wav": "A_fast_energetic_piano_piece_T1_L1024.wav",
        "prompt": "Caption: A fast, energetic piano piece. | Genre: ['classical', 'pop'] | Mood: ['expressive']",
    },
    {
        "label": "Example 3: Mozart K280",
        "wav": "Sonata_in_F_Major_K280_Part_T1_L1024.wav",
        "prompt": "Caption: Sonata in F Major, K280 (Part 1)\nGenre: ['classical', 'mozart']\nMood: ['elegant', 'structured']",
    },
    {
        "label": "Example 4: Chord Progression",
        "wav": "With_a_4_4_time_signature_it_T1_L1024.wav",
        "prompt": "Caption: With a 4/4 time signature, it follows a chord progression of Abm, E, B, and F#.\nGenre: ['lead-sheet', 'mozart']\nMood: ['dark']",
    },
]

# --- REPORT TEXT ---
report_content = """
# 🎹 Llama-3 MIDI Composer: Technical Report

### 1. Project Overview: Cross-Modal Generation
This project demonstrates a unique approach to generative AI by repurposing a **text-based Large Language Model (Llama-3 8B)** to process and generate a completely different modality: **Symbolic Music (MIDI)**.

Unlike audio models that process raw waveforms (which are heavy and continuous), this model treats music as a **discrete language**. By aligning natural language instructions with musical data, the model learns to "translate" text descriptions directly into musical compositions.

### 2. Core Architecture & Mechanisms

#### 2.1 Overcoming the Context Window via Tokenization
A major challenge in generating music with LLMs is the sheer length of musical data. Raw audio or standard MIDI streams can easily exceed a model's context window.

* **Solution:** We utilize **MIDI Tokenization (REMI)** to compress complex musical information (pitch, velocity, duration) into a compact sequence of discrete tokens.
* **Impact:** This efficient encoding allows us to fit entire musical phrases into the **Llama-3 context window**, enabling the model to maintain structure and coherence over time, which would be impossible with raw data.

#### 2.2 Instruction Tuning (Text-to-Music Alignment)
Rather than simple "causal modeling" (predicting the next note based only on previous notes), this model is trained using **Instruction Fine-Tuning**.

* **The Link:** We utilize a dataset of MIDI files paired with rich **Captions ("Caps")** and metadata (Genre, Mood).
* **The Learning Objective:** The model is trained to embed the text instructions alongside the musical scores. This forces the model to learn the semantic relationship between a word like *"sad"* and a musical feature like *Minor Key* or *Slow Tempo*. It is not just generating music; it is following a specific command to construct music that matches the text features.

### 3. Dataset & Training Strategy
The model was fine-tuned on a curated dataset ("The Four Pillars") designed to teach different aspects of this cross-modal link:

1. **Wikifonia (Lead Sheets):** Links specific genre tags to melody/chord structures.
2. **GigaMIDI (Piano):** Links "piano" descriptors to polyphonic textures.
3. **Maestro (Classical):** Links "classical" instructions to complex harmonic structures.
4. **MidiCaps (Semantics):** Provides the deep link between descriptive natural language (captions) and musical output.

### 4. Technical Limitations & Demo Design
* **Inference Latency:** Generating tokens autoregressively is computationally intensive. To ensure a smooth user experience, this demo provides **pre-generated samples** for immediate auditing.
* **Hardware Constraints:** The model is a Proof-of-Concept trained on limited compute (Single T4 GPU). While it successfully demonstrates the text-to-music capability, longer-range coherence is limited compared to models trained on massive clusters.

### 5. How to Use the Model (Run it Yourself)
This Hugging Face Space is a **Static Showcase** for auditing the model's capabilities. Because the model requires a GPU to generate tokens efficiently, it cannot run in real-time on this free CPU space.

**To generate your own music:**
1. **Download the Adapter:** The Low-Rank Adapter (LoRA) weights are available here: [**st192011/llama3-midi-composer**](https://huggingface.co/st192011/llama3-midi-composer).
2. **Run in Colab:** You can use the provided inference notebook (available in the model card) to load the model on a free T4 GPU in Google Colab.
3. **Prompting:** Load the model using `unsloth`, and prompt it using the format:

```text
### Instruction:
Generate a piano score based on the description.

### Input:
Caption: [Your Description] | Genre: ['piano']

### Response:
```
"""

# --- CSS STYLING ---
css = """
.prompt-box { font-size: 0.9em !important; font-family: monospace; }
"""


def _sample_panel(sample: dict) -> None:
    """Render one demo-sample panel: label, read-only prompt, audio preview.

    Must be called inside an active Gradio layout context (e.g. a ``gr.Row``).
    """
    with gr.Column(variant="panel"):
        gr.Markdown(f"**{sample['label']}**")
        gr.Textbox(
            value=sample["prompt"],
            show_label=False,
            lines=4,
            interactive=False,
            elem_classes="prompt-box",
        )
        # Check if file exists to prevent errors during build if the
        # audio asset hasn't been uploaded yet.
        if os.path.exists(sample["wav"]):
            gr.Audio(value=sample["wav"], label="Audio Preview", type="filepath")
        else:
            gr.Markdown("*Audio file pending upload...*")


# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# 🎹 Llama-3 MIDI Composer")
    gr.Markdown("A Text-to-MIDI generation model based on Llama-3 8B. (Static Showcase)")

    with gr.Tabs():
        # TAB 1: DEMO GALLERY
        with gr.Tab("🎵 Demo Gallery"):
            gr.Markdown("### 🎧 Generated Samples")
            gr.Markdown(
                "These samples were generated by the model running on a T4 GPU "
                "(Temperature: 1.0, Max Tokens: 1024)."
            )
            # Lay the samples out two per row instead of repeating the
            # panel markup four times.
            for row_start in range(0, len(DATA), 2):
                with gr.Row():
                    for sample in DATA[row_start:row_start + 2]:
                        _sample_panel(sample)

        # TAB 2: TECHNICAL REPORT
        with gr.Tab("📄 Technical Report"):
            gr.Markdown(report_content)

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()