github-actions committed on
Commit ·
46b55ef
0
Parent(s):
Clean sync: code and weights only.
Browse files- .gitattributes +3 -0
- .github/workflows/sync_to_hf.yml +50 -0
- .gitignore +10 -0
- README.md +72 -0
- app.py +114 -0
- environment.yml +554 -0
- notebooks/lyricloop.ipynb +0 -0
- requirements.txt +10 -0
- src/lyricloop/__init__.py +4 -0
- src/lyricloop/config.py +36 -0
- src/lyricloop/data.py +136 -0
- src/lyricloop/environment.py +64 -0
- src/lyricloop/metrics.py +105 -0
- src/lyricloop/viz.py +155 -0
.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/sync_to_hf.yml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face Hub

on:
  push:
    branches: [main]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true

      - name: Pull LFS objects
        run: |
          git lfs install
          git lfs pull

      - name: Push to Hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Strip folders that should stay on GitHub only
          rm -rf assets/
          rm -rf reports/

          # Drop the runner's temporary git history
          rm -rf .git/

          # Start a fresh, single-commit history for the HF remote
          # (GitHub's history is untouched)
          git init
          git branch -M main

          # Throwaway identity for the sync commit
          git config user.name "github-actions"
          git config user.email "actions@github.com"

          # Stage everything and commit
          git lfs install
          git add .
          git commit -m "Clean sync: code and weights only."

          # Overwrite the HF repo with the flat history
          git push --force https://lxtung95:$HF_TOKEN@huggingface.co/lxtung95/lyricloop-llm main
|
.gitignore
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.DS_Store
|
| 2 |
+
.streamlit/
|
| 3 |
+
*.pt
|
| 4 |
+
*.bin
|
| 5 |
+
__pycache__/
|
| 6 |
+
.ipynb_checkpoints/
|
| 7 |
+
data/
|
| 8 |
+
wandb/
|
| 9 |
+
models/
|
| 10 |
+
experiments/
|
README.md
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NAME
|
| 2 |
+
|
| 3 |
+
LyricLoop LLM
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
PROJECT OBJECTIVE
|
| 8 |
+
|
| 9 |
+
LyricLoop bridges the gap between semantic LLM text and professional musical phrasing. This framework fine-tunes Google's Gemma-2b-it to generate lyrics adhering to specific structures (Verse, Chorus, Bridge) and genre-specific stylings (Electronic, Pop, Rock, Hip-Hop).
|
| 10 |
+
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
LANGUAGE / STACK
|
| 14 |
+
|
| 15 |
+
Python | PyTorch, Hugging Face (Transformers, PEFT, TRL), Streamlit
|
| 16 |
+
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
TECHNICAL METHODOLOGY
|
| 20 |
+
|
| 21 |
+
- Fine-Tuning: Implemented Low-Rank Adaptation (LoRA) to specialize the model in rhythmic patterns while preserving base reasoning.
|
| 22 |
+
- Optimization: Used 4-bit Quantization (QLoRA) via bitsandbytes to reduce the memory footprint during training.
|
| 23 |
+
- Instruction Tuning: Supervised Fine-Tuning (SFT) with custom templates to enforce structural and genre constraints.
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
PROJECT STRUCTURE
|
| 28 |
+
|
| 29 |
+
- app.py: main streamlit application entry point and UI logic.
|
| 30 |
+
- src/lyricloop/: core modular package containing engine logic:
|
| 31 |
+
- config.py: global constants and path management.
|
| 32 |
+
- data.py: prompt engineering and dataset preprocessing.
|
| 33 |
+
- environment.py: hardware-aware setup (MPS/CPU/CUDA).
|
| 34 |
+
- metrics.py: inference execution and perplexity scoring.
|
| 35 |
+
- viz.py: standardized plotting and visual utilities.
|
| 36 |
+
- notebooks/: development playground, training workflows, and EDA.
|
| 37 |
+
- reports/: written technical documentation and project summaries.
|
| 38 |
+
- assets/: visual artifacts and plots used in documentation.
|
| 39 |
+
- requirements.txt: dependency management for environment parity.
|
| 40 |
+
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
DATA & SOURCE
|
| 44 |
+
|
| 45 |
+
- Corpus: 5M+ song lyrics (Genius Dataset).
|
| 46 |
+
- Metadata: Artist mapping via Pitchfork Reviews.
|
| 47 |
+
- Stack: Python, Hugging Face (Transformers, PEFT, TRL), PyTorch, and Google Colab (L4 GPU).
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
EXTERNAL RESOURCES
|
| 52 |
+
|
| 53 |
+
- Full Project Workspace (Google Drive): [Access the Notebooks & Raw Data](https://drive.google.com/drive/folders/1M5SJRaaK8OaskUgEsBupgGVN_-fQS3i4?usp=sharing)
|
| 54 |
+
- Training Environment: Google Colab (L4 GPU)
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
STUDIO GUIDE
|
| 59 |
+
|
| 60 |
+
- Run on Hugging Face lxtung95/lyricloop
|
| 61 |
+
- App URL: https://lxtung95-lyricloop.hf.space/
|
| 62 |
+
1. Details: Enter a song title and an Artist Aesthetic (e.g., Taylor Swift) to set the tone.
|
| 63 |
+
2. Genre: Select your target genre to adjust rhythmic density.
|
| 64 |
+
3. Compose: Use the Creativity (Temperature) slider to control experimental word choice.
|
| 65 |
+
4. Export: Download the final composition as a .txt file for your creative workflow.
|
| 66 |
+
|
| 67 |
+
---
|
| 68 |
+
|
| 69 |
+
SUPPORT
|
| 70 |
+
|
| 71 |
+
Visit my GitHub repository for the latest scripts and downloads:
|
| 72 |
+
https://github.com/lxtung95
|
app.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
import torch
import sys
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Make the bundled `src/` package importable regardless of the working
# directory the app is launched from.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(PROJECT_ROOT, 'src'))

# These imports require the sys.path tweak above.
from lyricloop.config import MODEL_ID, RANDOM_STATE
from lyricloop.data import build_inference_prompt, format_lyrics
from lyricloop.metrics import execute_generation
from lyricloop.environment import set_seed

# Page chrome — must run before any other st.* call.
st.set_page_config(page_title="LyricLoop v2.0", page_icon="🎤", layout="wide")
|
| 19 |
+
|
| 20 |
+
# Cached model loading
|
| 21 |
+
@st.cache_resource
def load_studio_engine():
    """Build the Gemma-2b generation engine (base model + LoRA adapters).

    Cached by Streamlit so the heavyweight download/load happens once per
    process. Targets the Hugging Face Spaces free tier: CPU only, float32.

    Returns:
        tuple: ``(model, tokenizer)`` ready for inference.
    """
    set_seed(RANDOM_STATE)

    # Gated-model access token injected via the Space's secret store.
    auth_token = st.secrets["HF_TOKEN"]

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=auth_token)
    # Reuse EOS as the pad token (model ships without one).
    tokenizer.pad_token = tokenizer.eos_token

    # Free tier has no GPU: pin everything to CPU in full precision
    # for numerical stability.
    target_device = "cpu"
    target_dtype = torch.float32

    # Base model skeleton.
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=target_dtype,
        device_map=target_device,
        token=auth_token,
    )

    # LoRA adapters live in a separate model repo on the Hub.
    adapter_repo = "lxtung95/lyricloop"
    model = PeftModel.from_pretrained(base_model, adapter_repo, token=auth_token)

    return model, tokenizer
|
| 55 |
+
|
| 56 |
+
# ---- Studio Interface ----
st.title("LyricLoop v2.0")
st.caption("Professional AI Songwriting Framework | Powered by Gemma-2b")
st.markdown("---")

# Sidebar: generation hyperparameters.
st.sidebar.header("Studio Controls")
creativity = st.sidebar.slider("Creativity (Temperature)", 0.5, 1.2, 0.85)
token_limit = st.sidebar.number_input("Max Tokens", 100, 500, 300)

# Two equal columns: inputs on the left, generated lyrics on the right.
col1, col2 = st.columns([1, 1])

with col1:
    st.subheader("Composition Details")
    genre = st.selectbox("Target Genre", ["Pop", "Rock", "Hip-hop", "Electronic", "R&B", "Country"])
    artist = st.text_input("Artist Aesthetic", placeholder="e.g., Taylor Swift")
    title = st.text_input("Song Title", placeholder="Enter your track title...")
    generate_btn = st.button("Compose Lyrics", type="primary", use_container_width=True)

with col2:
    st.subheader("Output")
    # Persistent slot so reruns update one widget instead of stacking new ones.
    output_placeholder = st.empty()

if generate_btn:
    with st.spinner("Model is writing..."):
        # Lazy-load the (cached) engine only when a generation is requested.
        model, tokenizer = load_studio_engine()

        prompt = build_inference_prompt(genre, artist, title)

        raw_output = execute_generation(
            model, tokenizer, prompt,
            max_tokens=token_limit,
            temperature=creativity,
            do_sample=True,
        )

        clean_lyrics = format_lyrics(raw_output)

        # Write into the pre-created placeholder.
        output_placeholder.text_area(
            "Final Draft",
            clean_lyrics,
            height=400,
            key="lyrics_output",  # explicit key helps mobile state persistence
        )

        st.download_button(
            "Export as TXT",
            clean_lyrics,
            file_name=f"{title}_lyrics.txt",
        )
|
environment.yml
ADDED
|
@@ -0,0 +1,554 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: ds
|
| 2 |
+
channels:
|
| 3 |
+
- defaults
|
| 4 |
+
dependencies:
|
| 5 |
+
- _anaconda_depends=2025.06=py313_openblas_2
|
| 6 |
+
- aiobotocore=2.19.0=py313hca03da5_0
|
| 7 |
+
- aiohappyeyeballs=2.4.4=py313hca03da5_0
|
| 8 |
+
- aiohttp=3.11.10=py313h80987f9_0
|
| 9 |
+
- aioitertools=0.7.1=pyhd3eb1b0_0
|
| 10 |
+
- aiosignal=1.2.0=pyhd3eb1b0_0
|
| 11 |
+
- alabaster=0.7.16=py313hca03da5_0
|
| 12 |
+
- alembic=1.17.2=py313hca03da5_0
|
| 13 |
+
- altair=5.5.0=py313hca03da5_0
|
| 14 |
+
- anaconda-catalogs=0.2.0=py313hca03da5_2
|
| 15 |
+
- annotated-types=0.6.0=py313hca03da5_0
|
| 16 |
+
- anyio=4.7.0=py313hca03da5_0
|
| 17 |
+
- appdirs=1.4.4=pyhd3eb1b0_0
|
| 18 |
+
- applaunchservices=0.3.0=py313hca03da5_0
|
| 19 |
+
- appnope=0.1.3=py313hca03da5_0
|
| 20 |
+
- appscript=1.3.0=py313h80987f9_0
|
| 21 |
+
- archspec=0.2.3=pyhd3eb1b0_0
|
| 22 |
+
- argon2-cffi=21.3.0=pyhd3eb1b0_0
|
| 23 |
+
- argon2-cffi-bindings=21.2.0=py313h80987f9_1
|
| 24 |
+
- arrow=1.3.0=py313hca03da5_0
|
| 25 |
+
- arrow-cpp=19.0.0=h0b7d223_2
|
| 26 |
+
- astroid=3.3.8=py313hca03da5_0
|
| 27 |
+
- astropy=7.0.0=py313h80987f9_0
|
| 28 |
+
- astropy-iers-data=0.2025.1.13.0.34.51=py313hca03da5_0
|
| 29 |
+
- asttokens=3.0.0=py313hca03da5_0
|
| 30 |
+
- async-lru=2.0.4=py313hca03da5_0
|
| 31 |
+
- asyncssh=2.17.0=py313hca03da5_0
|
| 32 |
+
- atomicwrites=1.4.0=py_0
|
| 33 |
+
- attrs=24.3.0=py313hca03da5_0
|
| 34 |
+
- automat=24.8.1=py313hca03da5_0
|
| 35 |
+
- autopep8=2.0.4=pyhd3eb1b0_0
|
| 36 |
+
- aws-c-auth=0.6.19=h80987f9_0
|
| 37 |
+
- aws-c-cal=0.5.20=h80987f9_0
|
| 38 |
+
- aws-c-common=0.8.5=h80987f9_0
|
| 39 |
+
- aws-c-compression=0.2.16=h80987f9_0
|
| 40 |
+
- aws-c-event-stream=0.2.15=h313beb8_0
|
| 41 |
+
- aws-c-http=0.6.25=h80987f9_0
|
| 42 |
+
- aws-c-io=0.13.10=h80987f9_0
|
| 43 |
+
- aws-c-mqtt=0.7.13=h80987f9_0
|
| 44 |
+
- aws-c-s3=0.1.51=h80987f9_0
|
| 45 |
+
- aws-c-sdkutils=0.1.6=h80987f9_0
|
| 46 |
+
- aws-checksums=0.1.13=h80987f9_0
|
| 47 |
+
- aws-crt-cpp=0.18.16=h313beb8_0
|
| 48 |
+
- aws-sdk-cpp=1.11.212=hdd7fb2f_0
|
| 49 |
+
- babel=2.16.0=py313hca03da5_0
|
| 50 |
+
- bcrypt=4.3.0=py313h2aea54e_0
|
| 51 |
+
- beautifulsoup4=4.12.3=py313hca03da5_0
|
| 52 |
+
- binaryornot=0.4.4=pyhd3eb1b0_1
|
| 53 |
+
- black=24.10.0=py313hca03da5_0
|
| 54 |
+
- blas=1.0=openblas
|
| 55 |
+
- bleach=6.2.0=py313hca03da5_0
|
| 56 |
+
- blinker=1.9.0=py313hca03da5_0
|
| 57 |
+
- blosc=1.21.3=h313beb8_0
|
| 58 |
+
- bokeh=3.6.2=py313hca03da5_0
|
| 59 |
+
- boltons=24.1.0=py313hca03da5_0
|
| 60 |
+
- boost-cpp=1.82.0=h48ca7d4_2
|
| 61 |
+
- botocore=1.36.3=py313hca03da5_0
|
| 62 |
+
- bottleneck=1.4.2=py313ha35b7ea_0
|
| 63 |
+
- brotli-python=1.0.9=py313h313beb8_9
|
| 64 |
+
- bzip2=1.0.8=h80987f9_6
|
| 65 |
+
- c-ares=1.19.1=h80987f9_0
|
| 66 |
+
- c-blosc2=2.17.1=hca023f9_0
|
| 67 |
+
- ca-certificates=2025.11.4=hca03da5_0
|
| 68 |
+
- cachetools=5.5.1=py313hca03da5_0
|
| 69 |
+
- cairo=1.18.4=h191e429_0
|
| 70 |
+
- cattrs=24.1.2=py313hca03da5_0
|
| 71 |
+
- cctools=949.0.1=hc179dcd_25
|
| 72 |
+
- cctools_osx-arm64=949.0.1=h332cad3_25
|
| 73 |
+
- certifi=2025.11.12=py313hca03da5_0
|
| 74 |
+
- cffi=1.17.1=py313h3eb5a62_1
|
| 75 |
+
- chardet=4.0.0=py313hca03da5_1003
|
| 76 |
+
- charset-normalizer=3.3.2=pyhd3eb1b0_0
|
| 77 |
+
- click=8.1.8=py313hca03da5_0
|
| 78 |
+
- cloudpickle=3.0.0=py313hca03da5_0
|
| 79 |
+
- colorama=0.4.6=py313hca03da5_0
|
| 80 |
+
- colorcet=3.1.0=py313hca03da5_0
|
| 81 |
+
- comm=0.2.1=py313hca03da5_0
|
| 82 |
+
- conda-content-trust=0.2.0=py313hca03da5_1
|
| 83 |
+
- conda-pack=0.7.1=py313hca03da5_0
|
| 84 |
+
- conda-package-handling=2.4.0=py313hca03da5_0
|
| 85 |
+
- conda-package-streaming=0.11.0=py313hca03da5_0
|
| 86 |
+
- conda-repo-cli=1.0.165=py313hca03da5_0
|
| 87 |
+
- constantly=23.10.4=py313hca03da5_0
|
| 88 |
+
- contourpy=1.3.1=py313h48ca7d4_0
|
| 89 |
+
- cookiecutter=1.7.3=pyhd3eb1b0_0
|
| 90 |
+
- cpp-expected=1.1.0=h48ca7d4_0
|
| 91 |
+
- cryptography=44.0.1=py313h8026fc7_0
|
| 92 |
+
- cssselect=1.2.0=py313hca03da5_0
|
| 93 |
+
- curl=8.12.1=h3e2b118_0
|
| 94 |
+
- cycler=0.11.0=pyhd3eb1b0_0
|
| 95 |
+
- cyrus-sasl=2.1.28=h9131b1a_1
|
| 96 |
+
- cytoolz=1.0.1=py313h80987f9_0
|
| 97 |
+
- dask=2025.2.0=py313hca03da5_0
|
| 98 |
+
- dask-core=2025.2.0=py313hca03da5_0
|
| 99 |
+
- dask-expr=2.0.0=py313hca03da5_0
|
| 100 |
+
- datashader=0.18.0=py313hca03da5_0
|
| 101 |
+
- debugpy=1.8.11=py313h313beb8_0
|
| 102 |
+
- decorator=5.1.1=pyhd3eb1b0_0
|
| 103 |
+
- defusedxml=0.7.1=pyhd3eb1b0_0
|
| 104 |
+
- deprecated=1.2.13=py313hca03da5_0
|
| 105 |
+
- diff-match-patch=20200713=pyhd3eb1b0_0
|
| 106 |
+
- dill=0.3.8=py313hca03da5_0
|
| 107 |
+
- distributed=2025.2.0=py313hca03da5_0
|
| 108 |
+
- distro=1.9.0=py313hca03da5_0
|
| 109 |
+
- dmglib=0.9.5=py313h2d4777b_1
|
| 110 |
+
- docstring-to-markdown=0.11=py313hca03da5_0
|
| 111 |
+
- docutils=0.21.2=py313hca03da5_0
|
| 112 |
+
- et_xmlfile=1.1.0=py313hca03da5_1
|
| 113 |
+
- evalidate=2.0.3=py313hca03da5_0
|
| 114 |
+
- executing=0.8.3=pyhd3eb1b0_0
|
| 115 |
+
- expat=2.7.1=h313beb8_0
|
| 116 |
+
- filelock=3.17.0=py313hca03da5_0
|
| 117 |
+
- flake8=7.1.1=py313hca03da5_0
|
| 118 |
+
- flask=3.1.0=py313hca03da5_0
|
| 119 |
+
- fmt=9.1.0=h48ca7d4_1
|
| 120 |
+
- fontconfig=2.15.0=h29935d0_0
|
| 121 |
+
- fonttools=4.55.3=py313h80987f9_0
|
| 122 |
+
- freetype=2.13.3=h47d26ad_0
|
| 123 |
+
- frozendict=2.4.2=py313hca03da5_0
|
| 124 |
+
- frozenlist=1.5.0=py313h80987f9_0
|
| 125 |
+
- fsspec=2025.3.2=py313h7eb115d_0
|
| 126 |
+
- gettext=0.21.0=hbdbcc25_2
|
| 127 |
+
- gflags=2.2.2=h313beb8_1
|
| 128 |
+
- gitdb=4.0.7=pyhd3eb1b0_0
|
| 129 |
+
- gitpython=3.1.43=py313hca03da5_0
|
| 130 |
+
- glib=2.84.2=hc880cf1_0
|
| 131 |
+
- glib-tools=2.84.2=hc880cf1_0
|
| 132 |
+
- glog=0.5.0=h313beb8_1
|
| 133 |
+
- gmp=6.3.0=h313beb8_0
|
| 134 |
+
- gmpy2=2.2.1=py313h5c1b81f_0
|
| 135 |
+
- graphite2=1.3.14=hc377ac9_1
|
| 136 |
+
- greenlet=3.1.1=py313h313beb8_0
|
| 137 |
+
- gst-plugins-base=1.14.1=h313beb8_1
|
| 138 |
+
- gstreamer=1.14.1=h80987f9_1
|
| 139 |
+
- h11=0.16.0=py313hca03da5_0
|
| 140 |
+
- h5py=3.12.1=py313h0957e0b_1
|
| 141 |
+
- harfbuzz=10.2.0=he637ebf_1
|
| 142 |
+
- hdf5=1.14.5=hd77251f_2
|
| 143 |
+
- heapdict=1.0.1=pyhd3eb1b0_0
|
| 144 |
+
- holoviews=1.20.2=py313hca03da5_0
|
| 145 |
+
- httpcore=1.0.9=py313hca03da5_0
|
| 146 |
+
- httpx=0.28.1=py313hca03da5_0
|
| 147 |
+
- hvplot=0.11.3=py313hca03da5_0
|
| 148 |
+
- hyperlink=21.0.0=pyhd3eb1b0_0
|
| 149 |
+
- icu=73.1=h313beb8_0
|
| 150 |
+
- idna=3.7=py313hca03da5_0
|
| 151 |
+
- imageio=2.37.0=py313hca03da5_0
|
| 152 |
+
- imagesize=1.4.1=py313hca03da5_0
|
| 153 |
+
- imbalanced-learn=0.13.0=py313hca03da5_0
|
| 154 |
+
- importlib-metadata=8.5.0=py313hca03da5_0
|
| 155 |
+
- incremental=24.7.2=pyhd3eb1b0_0
|
| 156 |
+
- inflection=0.5.1=py313hca03da5_1
|
| 157 |
+
- iniconfig=1.1.1=pyhd3eb1b0_0
|
| 158 |
+
- intake=2.0.7=py313hca03da5_0
|
| 159 |
+
- intervaltree=3.1.0=pyhd3eb1b0_0
|
| 160 |
+
- ipykernel=6.29.5=py313hca03da5_1
|
| 161 |
+
- ipython=8.30.0=py313hca03da5_0
|
| 162 |
+
- ipython_pygments_lexers=1.1.1=py313hca03da5_0
|
| 163 |
+
- ipywidgets=8.1.5=py313hca03da5_0
|
| 164 |
+
- isort=6.0.1=py313hca03da5_0
|
| 165 |
+
- itemadapter=0.3.0=pyhd3eb1b0_0
|
| 166 |
+
- itemloaders=1.3.2=py313hca03da5_0
|
| 167 |
+
- itsdangerous=2.2.0=py313hca03da5_0
|
| 168 |
+
- jaraco.classes=3.2.1=pyhd3eb1b0_0
|
| 169 |
+
- jaraco.context=6.0.0=py313hca03da5_0
|
| 170 |
+
- jaraco.functools=4.1.0=py313hca03da5_0
|
| 171 |
+
- jedi=0.19.2=py313hca03da5_0
|
| 172 |
+
- jellyfish=1.1.3=py313h1bd1ac0_0
|
| 173 |
+
- jinja2=3.1.6=py313hca03da5_0
|
| 174 |
+
- jinja2-time=0.2.0=pyhd3eb1b0_3
|
| 175 |
+
- jmespath=1.0.1=py313hca03da5_0
|
| 176 |
+
- joblib=1.4.2=py313hca03da5_0
|
| 177 |
+
- jpeg=9e=h80987f9_3
|
| 178 |
+
- jq=1.7.1=h80987f9_0
|
| 179 |
+
- json5=0.9.25=py313hca03da5_0
|
| 180 |
+
- jsonpatch=1.33=py313hca03da5_1
|
| 181 |
+
- jsonpointer=2.1=pyhd3eb1b0_0
|
| 182 |
+
- jsonschema=4.23.0=py313hca03da5_0
|
| 183 |
+
- jsonschema-specifications=2023.7.1=py313hca03da5_0
|
| 184 |
+
- jupyter=1.1.1=py313hca03da5_0
|
| 185 |
+
- jupyter-lsp=2.2.5=py313hca03da5_0
|
| 186 |
+
- jupyter_client=8.6.3=py313hca03da5_0
|
| 187 |
+
- jupyter_console=6.6.3=py313hca03da5_1
|
| 188 |
+
- jupyter_core=5.7.2=py313hca03da5_0
|
| 189 |
+
- jupyter_events=0.12.0=py313hca03da5_0
|
| 190 |
+
- jupyter_server=2.15.0=py313hca03da5_0
|
| 191 |
+
- jupyter_server_terminals=0.5.3=py313hca03da5_0
|
| 192 |
+
- jupyterlab=4.3.4=py313hca03da5_0
|
| 193 |
+
- jupyterlab-variableinspector=3.2.4=py313hca03da5_0
|
| 194 |
+
- jupyterlab_pygments=0.3.0=py313hca03da5_0
|
| 195 |
+
- jupyterlab_server=2.27.3=py313hca03da5_0
|
| 196 |
+
- jupyterlab_widgets=3.0.13=py313hca03da5_0
|
| 197 |
+
- keyring=25.6.0=py313hca03da5_0
|
| 198 |
+
- kiwisolver=1.4.8=py313h313beb8_0
|
| 199 |
+
- krb5=1.20.1=hf3e1bf2_1
|
| 200 |
+
- lazy_loader=0.4=py313hca03da5_0
|
| 201 |
+
- lcms2=2.16=he26ebf3_1
|
| 202 |
+
- ld64=530=hb29bf3f_25
|
| 203 |
+
- ld64_osx-arm64=530=h001ce53_25
|
| 204 |
+
- ldid=2.1.5=h20b2a84_3
|
| 205 |
+
- lerc=4.0.0=h313beb8_0
|
| 206 |
+
- libabseil=20250127.0=cxx17_h313beb8_0
|
| 207 |
+
- libarchive=3.7.7=h8f13d7a_0
|
| 208 |
+
- libboost=1.82.0=h0bc93f9_2
|
| 209 |
+
- libbrotlicommon=1.0.9=h80987f9_9
|
| 210 |
+
- libbrotlidec=1.0.9=h80987f9_9
|
| 211 |
+
- libbrotlienc=1.0.9=h80987f9_9
|
| 212 |
+
- libclang=20.1.8=default_h988c893_0
|
| 213 |
+
- libclang13=20.1.8=default_h9231c17_0
|
| 214 |
+
- libcurl=8.12.1=hde089ae_0
|
| 215 |
+
- libcxx=20.1.8=hd7fd590_1
|
| 216 |
+
- libdeflate=1.22=h80987f9_0
|
| 217 |
+
- libedit=3.1.20230828=h80987f9_0
|
| 218 |
+
- libev=4.33=h1a28f6b_1
|
| 219 |
+
- libevent=2.1.12=h02f6b3c_1
|
| 220 |
+
- libffi=3.4.4=hca03da5_1
|
| 221 |
+
- libgfortran=5.0.0=11_3_0_hca03da5_28
|
| 222 |
+
- libgfortran5=11.3.0=h009349e_28
|
| 223 |
+
- libglib=2.84.2=hdc2269c_0
|
| 224 |
+
- libgrpc=1.71.0=h62f6fdd_0
|
| 225 |
+
- libiconv=1.16=h80987f9_3
|
| 226 |
+
- liblief=0.16.4=h313beb8_0
|
| 227 |
+
- libllvm14=14.0.6=h19fdd8a_4
|
| 228 |
+
- libllvm20=20.1.8=h1701f07_0
|
| 229 |
+
- libmamba=2.0.5=h15e39b3_1
|
| 230 |
+
- libmambapy=2.0.5=py313h48ca7d4_1
|
| 231 |
+
- libmpdec=4.0.0=h80987f9_0
|
| 232 |
+
- libnghttp2=1.57.0=h62f6fdd_0
|
| 233 |
+
- libopenblas=0.3.30=hf2bb037_2
|
| 234 |
+
- libopus=1.3.1=h80987f9_1
|
| 235 |
+
- libpng=1.6.39=h80987f9_0
|
| 236 |
+
- libpq=17.4=h02f6b3c_0
|
| 237 |
+
- libprotobuf=5.29.3=h9f9f828_0
|
| 238 |
+
- libre2-11=2024.07.02=h313beb8_0
|
| 239 |
+
- libsodium=1.0.18=h1a28f6b_0
|
| 240 |
+
- libsolv=0.7.30=h514c7bf_1
|
| 241 |
+
- libspatialindex=1.9.3=hc377ac9_0
|
| 242 |
+
- libssh2=1.11.1=h3e2b118_0
|
| 243 |
+
- libthrift=0.15.0=h73c2103_2
|
| 244 |
+
- libtiff=4.7.0=h91aec0a_0
|
| 245 |
+
- libvpx=1.13.1=h313beb8_0
|
| 246 |
+
- libwebp-base=1.3.2=h80987f9_1
|
| 247 |
+
- libxml2=2.13.8=h0b34f26_0
|
| 248 |
+
- libxslt=1.1.41=hf4d3faa_0
|
| 249 |
+
- linkify-it-py=2.0.0=py313hca03da5_0
|
| 250 |
+
- llvm-openmp=14.0.6=hc6e5704_0
|
| 251 |
+
- llvmlite=0.44.0=py313heb35c27_1
|
| 252 |
+
- locket=1.0.0=py313hca03da5_0
|
| 253 |
+
- lsprotocol=2025.0.0=py313hca03da5_0
|
| 254 |
+
- lxml=5.3.0=py313h1d4350b_1
|
| 255 |
+
- lz4=4.3.2=py313h80987f9_1
|
| 256 |
+
- lz4-c=1.9.4=h313beb8_1
|
| 257 |
+
- lzo=2.10=h1a28f6b_2
|
| 258 |
+
- mako=1.3.10=py313hca03da5_0
|
| 259 |
+
- markdown=3.8=py313hca03da5_0
|
| 260 |
+
- markdown-it-py=2.2.0=py313hca03da5_1
|
| 261 |
+
- markupsafe=3.0.2=py313h80987f9_0
|
| 262 |
+
- matplotlib=3.10.0=py313hca03da5_1
|
| 263 |
+
- matplotlib-base=3.10.0=py313hb68df00_0
|
| 264 |
+
- matplotlib-inline=0.1.6=py313hca03da5_0
|
| 265 |
+
- mbedtls=3.5.1=h313beb8_1
|
| 266 |
+
- mccabe=0.7.0=pyhd3eb1b0_0
|
| 267 |
+
- mdit-py-plugins=0.3.0=py313hca03da5_0
|
| 268 |
+
- mdurl=0.1.0=py313hca03da5_0
|
| 269 |
+
- menuinst=2.2.0=py313hca03da5_1
|
| 270 |
+
- mistune=3.1.2=py313hca03da5_0
|
| 271 |
+
- more-itertools=10.3.0=py313hca03da5_0
|
| 272 |
+
- mpc=1.3.1=h80987f9_0
|
| 273 |
+
- mpfr=4.2.1=h80987f9_0
|
| 274 |
+
- mpmath=1.3.0=py313hca03da5_0
|
| 275 |
+
- msgpack-python=1.0.3=py313h48ca7d4_0
|
| 276 |
+
- multidict=6.1.0=py313h80987f9_0
|
| 277 |
+
- multipledispatch=0.6.0=py313hca03da5_0
|
| 278 |
+
- mypy=1.14.1=py313h80987f9_0
|
| 279 |
+
- mypy_extensions=1.0.0=py313hca03da5_0
|
| 280 |
+
- mysql=8.4.0=h065ec36_2
|
| 281 |
+
- mysql-common=9.3.0=h0968ce5_3
|
| 282 |
+
- mysql-libs=9.3.0=ha948bd4_3
|
| 283 |
+
- narwhals=1.31.0=py313hca03da5_1
|
| 284 |
+
- nb_conda_kernels=2.5.2=py313hca03da5_2
|
| 285 |
+
- nbclient=0.10.2=py313hca03da5_0
|
| 286 |
+
- nbconvert=7.16.6=py313hca03da5_0
|
| 287 |
+
- nbconvert-core=7.16.6=py313hca03da5_0
|
| 288 |
+
- nbconvert-pandoc=7.16.6=py313hca03da5_0
|
| 289 |
+
- nbformat=5.10.4=py313hca03da5_0
|
| 290 |
+
- ncurses=6.4=h313beb8_0
|
| 291 |
+
- nest-asyncio=1.6.0=py313hca03da5_0
|
| 292 |
+
- networkx=3.4.2=py313hca03da5_0
|
| 293 |
+
- nlohmann_json=3.11.2=h313beb8_0
|
| 294 |
+
- nltk=3.9.1=py313h0ef513f_0
|
| 295 |
+
- notebook=7.3.2=py313hca03da5_1
|
| 296 |
+
- notebook-shim=0.2.4=py313hca03da5_0
|
| 297 |
+
- numba=0.61.0=py313h313beb8_1
|
| 298 |
+
- numexpr=2.10.1=py313h5d9532f_0
|
| 299 |
+
- numpy=2.1.3=py313hf2f81dc_3
|
| 300 |
+
- numpy-base=2.1.3=py313h2a02f3f_3
|
| 301 |
+
- numpydoc=1.2=pyhd3eb1b0_0
|
| 302 |
+
- oniguruma=6.9.7.1=h1a28f6b_0
|
| 303 |
+
- openjpeg=2.5.2=hba36e21_1
|
| 304 |
+
- openldap=2.6.4=he7ef289_0
|
| 305 |
+
- openpyxl=3.1.5=py313h80987f9_1
|
| 306 |
+
- openssl=3.0.18=h9b4081a_0
|
| 307 |
+
- orc=2.1.1=h55d209b_0
|
| 308 |
+
- overrides=7.4.0=py313hca03da5_0
|
| 309 |
+
- packaging=24.2=py313hca03da5_0
|
| 310 |
+
- pandas=2.2.3=py313hcf29cfe_0
|
| 311 |
+
- pandoc=2.12=hca03da5_3
|
| 312 |
+
- pandocfilters=1.5.0=pyhd3eb1b0_0
|
| 313 |
+
- panel=1.7.0=py313hca03da5_0
|
| 314 |
+
- param=2.2.0=py313hca03da5_0
|
| 315 |
+
- parsel=1.8.1=py313hca03da5_0
|
| 316 |
+
- parso=0.8.4=py313hca03da5_0
|
| 317 |
+
- partd=1.4.2=py313hca03da5_0
|
| 318 |
+
- patch=2.7.6=h1a28f6b_1001
|
| 319 |
+
- pathspec=0.10.3=py313hca03da5_0
|
| 320 |
+
- patsy=1.0.1=py313hca03da5_0
|
| 321 |
+
- pcre2=10.42=hb066dcc_1
|
| 322 |
+
- pexpect=4.8.0=pyhd3eb1b0_3
|
| 323 |
+
- pickleshare=0.7.5=pyhd3eb1b0_1003
|
| 324 |
+
- pillow=11.1.0=py313h41ba818_1
|
| 325 |
+
- pip=25.1=pyhc872135_2
|
| 326 |
+
- pixman=0.46.4=h09dc60e_0
|
| 327 |
+
- pkce=1.0.3=py313hca03da5_0
|
| 328 |
+
- pkginfo=1.12.0=py313hca03da5_0
|
| 329 |
+
- platformdirs=4.3.7=py313hca03da5_0
|
| 330 |
+
- plotly=5.24.1=py313h7eb115d_1
|
| 331 |
+
- pluggy=1.5.0=py313hca03da5_0
|
| 332 |
+
- ply=3.11=py313hca03da5_1
|
| 333 |
+
- poyo=0.5.0=pyhd3eb1b0_0
|
| 334 |
+
- prometheus_client=0.21.1=py313hca03da5_0
|
| 335 |
+
- prompt-toolkit=3.0.43=py313hca03da5_0
|
| 336 |
+
- prompt_toolkit=3.0.43=hd3eb1b0_0
|
| 337 |
+
- propcache=0.3.1=py313h80987f9_0
|
| 338 |
+
- protego=0.4.0=py313hca03da5_0
|
| 339 |
+
- protobuf=5.29.3=py313h514c7bf_0
|
| 340 |
+
- psutil=5.9.0=py313h80987f9_1
|
| 341 |
+
- ptyprocess=0.7.0=pyhd3eb1b0_2
|
| 342 |
+
- pure_eval=0.2.2=pyhd3eb1b0_0
|
| 343 |
+
- py-cpuinfo=9.0.0=py313hca03da5_0
|
| 344 |
+
- py-lief=0.16.4=py313h313beb8_0
|
| 345 |
+
- pyarrow=19.0.0=py313h313beb8_1
|
| 346 |
+
- pyasn1=0.4.8=pyhd3eb1b0_0
|
| 347 |
+
- pyasn1-modules=0.2.8=py_0
|
| 348 |
+
- pybind11-abi=5=hd3eb1b0_0
|
| 349 |
+
- pycodestyle=2.12.1=py313hca03da5_0
|
| 350 |
+
- pycosat=0.6.6=py313h80987f9_2
|
| 351 |
+
- pycparser=2.21=pyhd3eb1b0_0
|
| 352 |
+
- pyct=0.5.0=py313hca03da5_0
|
| 353 |
+
- pycurl=7.45.6=py313h10e1ce2_0
|
| 354 |
+
- pydantic=2.10.3=py313hca03da5_0
|
| 355 |
+
- pydantic-core=2.27.1=py313h2aea54e_0
|
| 356 |
+
- pydantic-settings=2.6.1=py313hca03da5_0
|
| 357 |
+
- pydispatcher=2.0.5=py313hca03da5_3
|
| 358 |
+
- pydocstyle=6.3.0=py313hca03da5_0
|
| 359 |
+
- pyerfa=2.0.1.5=py313h80987f9_0
|
| 360 |
+
- pyflakes=3.2.0=py313hca03da5_0
|
| 361 |
+
- pygithub=2.4.0=py313hca03da5_0
|
| 362 |
+
- pygments=2.19.1=py313hca03da5_0
|
| 363 |
+
- pyjwt=2.10.1=py313hca03da5_0
|
| 364 |
+
- pylint=3.3.5=py313hca03da5_0
|
| 365 |
+
- pylint-venv=3.0.3=py313hca03da5_0
|
| 366 |
+
- pyls-spyder=0.4.0=pyhd3eb1b0_0
|
| 367 |
+
- pynacl=1.5.0=py313h80987f9_1
|
| 368 |
+
- pyobjc-core=10.1=py313h80987f9_0
|
| 369 |
+
- pyobjc-framework-cocoa=10.1=py313hb094c41_0
|
| 370 |
+
- pyobjc-framework-coreservices=10.1=py313hdd8dd1f_0
|
| 371 |
+
- pyobjc-framework-fsevents=10.1=py313hca03da5_0
|
| 372 |
+
- pyodbc=5.2.0=py313h313beb8_0
|
| 373 |
+
- pyopenssl=25.0.0=py313h9e2d7d8_0
|
| 374 |
+
- pyparsing=3.2.0=py313hca03da5_0
|
| 375 |
+
- pyqt=6.9.1=py313h9be6068_0
|
| 376 |
+
- pyqt5-sip=12.13.0=py313h80987f9_1
|
| 377 |
+
- pyqt6-sip=13.10.2=py313h45c6bc8_0
|
| 378 |
+
- pyqtwebengine=6.9.0=py313h8d0667a_0
|
| 379 |
+
- pyside6=6.9.2=py313h7961fb0_0
|
| 380 |
+
- pysocks=1.7.1=py313hca03da5_0
|
| 381 |
+
- pytables=3.10.2=py313h8397fff_2
|
| 382 |
+
- pytest=8.3.4=py313hca03da5_0
|
| 383 |
+
- python=3.13.5=h2eb94d5_100_cp313
|
| 384 |
+
- python-dateutil=2.9.0post0=py313hca03da5_2
|
| 385 |
+
- python-dotenv=1.1.0=py313hca03da5_0
|
| 386 |
+
- python-fastjsonschema=2.20.0=py313hca03da5_0
|
| 387 |
+
- python-json-logger=3.2.1=py313hca03da5_0
|
| 388 |
+
- python-libarchive-c=5.1=pyhd3eb1b0_0
|
| 389 |
+
- python-lmdb=1.6.2=py313h313beb8_0
|
| 390 |
+
- python-lsp-black=2.0.0=py313hca03da5_1
|
| 391 |
+
- python-lsp-jsonrpc=1.1.2=pyhd3eb1b0_0
|
| 392 |
+
- python-lsp-ruff=2.3.0=py313hca03da5_0
|
| 393 |
+
- python-lsp-server=1.13.1=py313h7eb115d_0
|
| 394 |
+
- python-slugify=5.0.2=pyhd3eb1b0_0
|
| 395 |
+
- python-tzdata=2025.2=pyhd3eb1b0_0
|
| 396 |
+
- python.app=3=py313h80987f9_2
|
| 397 |
+
- python_abi=3.13=0_cp313
|
| 398 |
+
- pytoolconfig=1.2.6=py313hca03da5_0
|
| 399 |
+
- pytz=2024.1=py313hca03da5_0
|
| 400 |
+
- pyuca=1.2=py313hca03da5_1
|
| 401 |
+
- pyviz_comms=3.0.2=py313hca03da5_0
|
| 402 |
+
- pywavelets=1.8.0=py313h80987f9_0
|
| 403 |
+
- pyyaml=6.0.2=py313h80987f9_0
|
| 404 |
+
- pyzmq=26.2.0=py313h313beb8_0
|
| 405 |
+
- qdarkstyle=3.2.3=pyhd3eb1b0_0
|
| 406 |
+
- qstylizer=0.2.2=py313hca03da5_0
|
| 407 |
+
- qt-main=6.9.2=h10e828f_0
|
| 408 |
+
- qt5compat=6.9.2=h6ff497d_1
|
| 409 |
+
- qtawesome=1.4.0=py313hca03da5_0
|
| 410 |
+
- qtbase=6.9.2=h32c7431_0
|
| 411 |
+
- qtbase-devel=6.9.2=h2b69d39_0
|
| 412 |
+
- qtconsole=5.7.0=py313hca03da5_0
|
| 413 |
+
- qtdeclarative=6.9.2=hfc17e28_1
|
| 414 |
+
- qtimageformats=6.9.2=h850909d_1
|
| 415 |
+
- qtpy=2.4.1=py313hca03da5_0
|
| 416 |
+
- qtshadertools=6.9.2=hfc17e28_1
|
| 417 |
+
- qtsvg=6.9.2=h310a915_1
|
| 418 |
+
- qttools=6.9.2=hd987465_0
|
| 419 |
+
- qttranslations=6.9.2=hfc17e28_1
|
| 420 |
+
- qtwebchannel=6.9.2=hfc17e28_1
|
| 421 |
+
- qtwebengine=6.9.2=h79e3840_0
|
| 422 |
+
- qtwebsockets=6.9.2=hfc17e28_1
|
| 423 |
+
- queuelib=1.6.2=py313hca03da5_0
|
| 424 |
+
- re2=2024.07.02=h48ca7d4_0
|
| 425 |
+
- readchar=4.0.5=py313hca03da5_0
|
| 426 |
+
- readline=8.2=h1a28f6b_0
|
| 427 |
+
- referencing=0.30.2=py313hca03da5_0
|
| 428 |
+
- regex=2024.11.6=py313h80987f9_0
|
| 429 |
+
- reproc=14.2.4=h313beb8_2
|
| 430 |
+
- reproc-cpp=14.2.4=h313beb8_2
|
| 431 |
+
- requests=2.32.3=py313hca03da5_1
|
| 432 |
+
- requests-file=2.1.0=py313hca03da5_0
|
| 433 |
+
- requests-toolbelt=1.0.0=py313hca03da5_0
|
| 434 |
+
- rfc3339-validator=0.1.4=py313hca03da5_0
|
| 435 |
+
- rfc3986-validator=0.1.1=py313hca03da5_0
|
| 436 |
+
- rich=13.9.4=py313hca03da5_0
|
| 437 |
+
- roman-numerals-py=3.1.0=py313hca03da5_0
|
| 438 |
+
- rope=1.13.0=py313hca03da5_0
|
| 439 |
+
- rpds-py=0.22.3=py313h2aea54e_0
|
| 440 |
+
- rtree=1.0.1=py313hca03da5_0
|
| 441 |
+
- ruamel.yaml=0.18.10=py313h80987f9_0
|
| 442 |
+
- ruamel.yaml.clib=0.2.12=py313h80987f9_0
|
| 443 |
+
- ruamel_yaml=0.17.21=py313h80987f9_0
|
| 444 |
+
- ruff=0.12.0=py313h59dbcda_0
|
| 445 |
+
- s3fs=2025.3.2=py313hca03da5_0
|
| 446 |
+
- scikit-image=0.25.0=py313h313beb8_0
|
| 447 |
+
- scikit-learn=1.6.1=py313h313beb8_0
|
| 448 |
+
- scipy=1.15.3=py313hd7edaaf_0
|
| 449 |
+
- scrapy=2.12.0=py313hca03da5_1
|
| 450 |
+
- seaborn=0.13.2=py313hca03da5_3
|
| 451 |
+
- semver=3.0.2=py313hca03da5_1
|
| 452 |
+
- send2trash=1.8.2=py313hca03da5_1
|
| 453 |
+
- sentry-sdk=2.45.0=py313hca03da5_0
|
| 454 |
+
- service_identity=24.2.0=py313hca03da5_0
|
| 455 |
+
- setuptools=80.9.0=py313hca03da5_0
|
| 456 |
+
- shellingham=1.5.0=py313hca03da5_0
|
| 457 |
+
- simdjson=3.10.1=h48ca7d4_0
|
| 458 |
+
- sip=6.12.0=py313h8740e61_0
|
| 459 |
+
- six=1.17.0=py313hca03da5_0
|
| 460 |
+
- sklearn-compat=0.1.3=py313hca03da5_0
|
| 461 |
+
- smmap=4.0.0=pyhd3eb1b0_0
|
| 462 |
+
- snappy=1.2.1=h313beb8_0
|
| 463 |
+
- sniffio=1.3.0=py313hca03da5_0
|
| 464 |
+
- snowballstemmer=2.2.0=pyhd3eb1b0_0
|
| 465 |
+
- sortedcontainers=2.4.0=pyhd3eb1b0_0
|
| 466 |
+
- soupsieve=2.5=py313hca03da5_0
|
| 467 |
+
- spdlog=1.11.0=h48ca7d4_0
|
| 468 |
+
- sphinx=8.2.3=py313h80987f9_0
|
| 469 |
+
- sphinxcontrib-applehelp=2.0.0=pyhd3eb1b0_1
|
| 470 |
+
- sphinxcontrib-devhelp=2.0.0=pyhd3eb1b0_0
|
| 471 |
+
- sphinxcontrib-htmlhelp=2.1.0=pyhd3eb1b0_0
|
| 472 |
+
- sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0
|
| 473 |
+
- sphinxcontrib-qthelp=2.0.0=pyhd3eb1b0_1
|
| 474 |
+
- sphinxcontrib-serializinghtml=2.0.0=pyhd3eb1b0_0
|
| 475 |
+
- spyder=6.1.0=py313h70421b1_1
|
| 476 |
+
- spyder-kernels=3.1.1=py313h7eb115d_0
|
| 477 |
+
- sqlalchemy=2.0.39=py313hbe2cdee_0
|
| 478 |
+
- sqlite=3.50.2=h79febb2_1
|
| 479 |
+
- stack_data=0.2.0=pyhd3eb1b0_0
|
| 480 |
+
- statsmodels=0.14.4=py313h80987f9_0
|
| 481 |
+
- streamlit=1.45.1=py313hca03da5_1
|
| 482 |
+
- superqt=0.7.6=py313hffb95fb_0
|
| 483 |
+
- sympy=1.13.3=py313hca03da5_1
|
| 484 |
+
- tabulate=0.9.0=py313hca03da5_0
|
| 485 |
+
- tapi=1100.0.11=h8754e6a_1
|
| 486 |
+
- tbb=2021.8.0=h48ca7d4_0
|
| 487 |
+
- tblib=3.1.0=py313hca03da5_0
|
| 488 |
+
- tenacity=9.0.0=py313hca03da5_0
|
| 489 |
+
- terminado=0.17.1=py313hca03da5_0
|
| 490 |
+
- text-unidecode=1.3=pyhd3eb1b0_0
|
| 491 |
+
- textdistance=4.2.1=pyhd3eb1b0_0
|
| 492 |
+
- threadpoolctl=3.5.0=py313h7eb115d_0
|
| 493 |
+
- three-merge=0.1.1=pyhd3eb1b0_0
|
| 494 |
+
- tifffile=2025.2.18=py313hca03da5_0
|
| 495 |
+
- tinycss2=1.4.0=py313hca03da5_0
|
| 496 |
+
- tk=8.6.14=h6ba3021_1
|
| 497 |
+
- tldextract=5.1.2=py313hca03da5_0
|
| 498 |
+
- toml=0.10.2=pyhd3eb1b0_0
|
| 499 |
+
- tomli=2.0.1=py313hca03da5_1
|
| 500 |
+
- tomlkit=0.13.2=py313hca03da5_0
|
| 501 |
+
- toolz=1.0.0=py313hca03da5_0
|
| 502 |
+
- tornado=6.5.1=py313h80987f9_0
|
| 503 |
+
- tqdm=4.67.1=py313h7eb115d_0
|
| 504 |
+
- traitlets=5.14.3=py313hca03da5_0
|
| 505 |
+
- truststore=0.10.0=py313hca03da5_0
|
| 506 |
+
- twisted=24.11.0=py313hca03da5_0
|
| 507 |
+
- typer=0.9.0=py313hca03da5_0
|
| 508 |
+
- typing-extensions=4.12.2=py313hca03da5_0
|
| 509 |
+
- typing_extensions=4.12.2=py313hca03da5_0
|
| 510 |
+
- tzdata=2025b=h04d1e81_0
|
| 511 |
+
- uc-micro-py=1.0.1=py313hca03da5_0
|
| 512 |
+
- ujson=5.10.0=py313h313beb8_1
|
| 513 |
+
- unidecode=1.3.8=py313hca03da5_0
|
| 514 |
+
- unixodbc=2.3.11=h1a28f6b_0
|
| 515 |
+
- urllib3=2.3.0=py313hca03da5_0
|
| 516 |
+
- utf8proc=2.6.1=h80987f9_1
|
| 517 |
+
- w3lib=2.1.2=py313hca03da5_0
|
| 518 |
+
- watchdog=4.0.2=py313h80987f9_0
|
| 519 |
+
- wcwidth=0.2.5=pyhd3eb1b0_0
|
| 520 |
+
- webencodings=0.5.1=py313hca03da5_2
|
| 521 |
+
- websocket-client=1.8.0=py313hca03da5_0
|
| 522 |
+
- werkzeug=3.1.3=py313hca03da5_0
|
| 523 |
+
- whatthepatch=1.0.2=py313hca03da5_0
|
| 524 |
+
- wheel=0.45.1=py313hca03da5_0
|
| 525 |
+
- widgetsnbextension=4.0.13=py313hca03da5_0
|
| 526 |
+
- wrapt=1.17.0=py313h80987f9_0
|
| 527 |
+
- wurlitzer=3.0.2=py313hca03da5_0
|
| 528 |
+
- xarray=2025.4.0=py313hca03da5_0
|
| 529 |
+
- xlwings=0.32.1=py313hca03da5_1
|
| 530 |
+
- xyzservices=2022.9.0=py313hca03da5_1
|
| 531 |
+
- xz=5.6.4=h80987f9_1
|
| 532 |
+
- yaml=0.2.5=h1a28f6b_0
|
| 533 |
+
- yaml-cpp=0.8.0=h313beb8_1
|
| 534 |
+
- yapf=0.40.2=py313hca03da5_0
|
| 535 |
+
- yarl=1.18.0=py313h80987f9_0
|
| 536 |
+
- zeromq=4.3.5=h313beb8_0
|
| 537 |
+
- zict=3.0.0=py313hca03da5_0
|
| 538 |
+
- zipp=3.21.0=py313hca03da5_0
|
| 539 |
+
- zlib=1.2.13=h18a0788_1
|
| 540 |
+
- zlib-ng=2.0.7=h80987f9_0
|
| 541 |
+
- zope=1.0=py313hca03da5_1
|
| 542 |
+
- zope.interface=7.1.1=py313h80987f9_0
|
| 543 |
+
- zstandard=0.23.0=py313h1a4646a_1
|
| 544 |
+
- zstd=1.5.6=hfb09047_0
|
| 545 |
+
- pip:
|
| 546 |
+
- accelerate==1.12.0
|
| 547 |
+
- hf-xet==1.2.0
|
| 548 |
+
- huggingface-hub==0.36.0
|
| 549 |
+
- peft==0.18.1
|
| 550 |
+
- safetensors==0.7.0
|
| 551 |
+
- tokenizers==0.22.2
|
| 552 |
+
- torch==2.9.1
|
| 553 |
+
- transformers==4.57.6
|
| 554 |
+
- typer-slim==0.21.1
|
notebooks/lyricloop.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
torch
|
| 3 |
+
transformers
|
| 4 |
+
peft
|
| 5 |
+
bitsandbytes
|
| 6 |
+
accelerate
|
| 7 |
+
pandas
|
| 8 |
+
matplotlib
|
| 9 |
+
seaborn
|
| 10 |
+
scikit-learn
|
src/lyricloop/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LyricLoop: A modular framework for fine-tuning and evaluating
|
| 3 |
+
LLMs on musical lyric generation and critique.
|
| 4 |
+
"""
|
src/lyricloop/config.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# -------------------------
|
| 4 |
+
# Model Configuration
|
| 5 |
+
# -------------------------
|
| 6 |
+
MODEL_ID = "google/gemma-2b-it"
|
| 7 |
+
RANDOM_STATE = 42
|
| 8 |
+
|
| 9 |
+
# -------------------------
|
| 10 |
+
# Path Management
|
| 11 |
+
# -------------------------
|
| 12 |
+
# Assumes the script is in lyricloop-llm/src/lyricloop/
|
| 13 |
+
# Go up 2 levels to reach the lyricloop-llm root
|
| 14 |
+
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
| 15 |
+
|
| 16 |
+
# Define standard subfolders
|
| 17 |
+
ASSETS_DIR = os.path.join(PROJECT_ROOT, "assets")
|
| 18 |
+
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
|
| 19 |
+
MODELS_DIR = os.path.join(PROJECT_ROOT, "models")
|
| 20 |
+
|
| 21 |
+
def ensure_dirs():
|
| 22 |
+
"""Initializes the project folder structure if it does not exist."""
|
| 23 |
+
os.makedirs(ASSETS_DIR, exist_ok=True)
|
| 24 |
+
os.makedirs(DATA_DIR, exist_ok=True)
|
| 25 |
+
os.makedirs(MODELS_DIR, exist_ok=True)
|
| 26 |
+
|
| 27 |
+
# -------------------------
|
| 28 |
+
# Global History Template
|
| 29 |
+
# -------------------------
|
| 30 |
+
def initialize_history():
|
| 31 |
+
"""Returns a fresh instance of the experiment history log."""
|
| 32 |
+
return {
|
| 33 |
+
"baseline": {"scores": [], "avg_confidence": [], "samples": {}, "metrics": {}},
|
| 34 |
+
"1.0": {"scores": [], "avg_confidence": [], "samples": {}, "metrics": {}},
|
| 35 |
+
"2.0": {"scores": [], "avg_confidence": [], "samples": {}, "metrics": {}}
|
| 36 |
+
}
|
src/lyricloop/data.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
from .config import RANDOM_STATE, ASSETS_DIR
|
| 7 |
+
|
| 8 |
+
# -------------------------
|
| 9 |
+
# Prompt Construction
|
| 10 |
+
# -------------------------
|
| 11 |
+
|
| 12 |
+
def build_critic_prompt(genre, artist, title, lyrics, max_lyric_length=300):
|
| 13 |
+
"""Constructs the instruction-tuning prompt for the Critic persona."""
|
| 14 |
+
lyrics_snippet = lyrics[:max_lyric_length]
|
| 15 |
+
|
| 16 |
+
instruction = (
|
| 17 |
+
"You are a professional music critic. Provide specific feedback on how to improve "
|
| 18 |
+
"the lyrics based on the genre and artist style. \n"
|
| 19 |
+
"Formatting Rules: \n"
|
| 20 |
+
"1. Use plain text with clear line breaks.\n"
|
| 21 |
+
"2. Ensure all song titles and words have proper spacing."
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
context = (
|
| 25 |
+
f"Target Genre: {genre}\n"
|
| 26 |
+
f"Target Artist: {artist}\n"
|
| 27 |
+
f"Target Title: {title}\n\n"
|
| 28 |
+
f"Lyrics to Evaluate:\n{lyrics_snippet}"
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
return f"<start_of_turn>user\n{instruction}\n\n{context}<end_of_turn>\n<start_of_turn>model\n"
|
| 32 |
+
|
| 33 |
+
def build_revision_prompt(genre, artist, title, draft, critiques):
|
| 34 |
+
"""Constructs the prompt for the 'Revise' step of the refinement loop."""
|
| 35 |
+
instruction = (
|
| 36 |
+
"You are an expert songwriter. Revise the provided lyrics by incorporating "
|
| 37 |
+
"the specific feedback from the critic while maintaining the genre and artist style."
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
context = (
|
| 41 |
+
f"Genre: {genre}\n"
|
| 42 |
+
f"Artist Style: {artist}\n"
|
| 43 |
+
f"Title: {title}\n\n"
|
| 44 |
+
f"Current Draft:\n{draft}\n\n"
|
| 45 |
+
f"Critic Feedback:\n{critiques}"
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
return f"<start_of_turn>user\n{instruction}\n\n{context}<end_of_turn>\n<start_of_turn>model\n"
|
| 49 |
+
|
| 50 |
+
def build_inference_prompt(genre, artist, title):
|
| 51 |
+
"""Reconstructs the prompt format used during v1.0 training."""
|
| 52 |
+
instruction = "Generate lyrics for a song based on these details."
|
| 53 |
+
input_context = f"Genre: {genre}\nArtist: {artist}\nTitle: {title}"
|
| 54 |
+
|
| 55 |
+
return (
|
| 56 |
+
f"<start_of_turn>user\n{instruction}\n\n{input_context}<end_of_turn>\n"
|
| 57 |
+
f"<start_of_turn>model\n"
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
def format_prompt(row):
|
| 61 |
+
"""Converts a dataframe row into a structured Gemma control-token prompt."""
|
| 62 |
+
instruction = "Generate lyrics for a song based on these details."
|
| 63 |
+
input_context = f"Genre: {row['tag']}\nArtist: {row['artist']}\nTitle: {row['title']}"
|
| 64 |
+
response = row['lyrics']
|
| 65 |
+
|
| 66 |
+
return (
|
| 67 |
+
f"<start_of_turn>user\n{instruction}\n\n{input_context}<end_of_turn>\n"
|
| 68 |
+
f"<start_of_turn>model\n{response}<end_of_turn>"
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
# -------------------------
|
| 72 |
+
# Text Processing
|
| 73 |
+
# -------------------------
|
| 74 |
+
|
| 75 |
+
def format_lyrics(text):
|
| 76 |
+
"""Cleans up raw model output by enforcing structural newlines and spacing."""
|
| 77 |
+
# Add double newlines before section headers like [Verse], [Chorus]
|
| 78 |
+
text = re.sub(r'(\[.*?\])', r'\n\n\1\n', text)
|
| 79 |
+
|
| 80 |
+
# Add a newline when a capital letter follows a lowercase letter immediately
|
| 81 |
+
text = re.sub(r'([a-z])([A-Z])', r'\1\n\2', text)
|
| 82 |
+
return text.strip()
|
| 83 |
+
|
| 84 |
+
# -------------------------
|
| 85 |
+
# Dataset Management
|
| 86 |
+
# -------------------------
|
| 87 |
+
|
| 88 |
+
def format_critic_training_row(row):
|
| 89 |
+
"""Standardizes raw rows into the Critic instruction-tuning format."""
|
| 90 |
+
prompt = build_critic_prompt(row.tag, row.artist, row.title, row.lyrics)
|
| 91 |
+
|
| 92 |
+
target_output = (
|
| 93 |
+
f"Genre Fit: The {row.tag} style is well-maintained.\n"
|
| 94 |
+
f"Artist Style: Matches the {row.artist} aesthetic.\n"
|
| 95 |
+
f"Improvements: Consider refining the rhythmic flow in the second verse."
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
return f"{prompt}{target_output}<eos>"
|
| 99 |
+
|
| 100 |
+
def prepare_lyric_dataset(lyrics_filename, reviews_filename, songs_per_genre=200):
|
| 101 |
+
"""Loads, cleans, and balances the dataset while exporting EDA plots."""
|
| 102 |
+
from .viz import save_figure
|
| 103 |
+
|
| 104 |
+
lyrics_path = os.path.join("data", lyrics_filename)
|
| 105 |
+
reviews_path = os.path.join("data", reviews_filename)
|
| 106 |
+
|
| 107 |
+
print(f"Loading & Cleaning Raw Data...")
|
| 108 |
+
|
| 109 |
+
lyrics_df = pd.read_csv(lyrics_path, on_bad_lines='skip')
|
| 110 |
+
reviews_df = pd.read_csv(reviews_path)
|
| 111 |
+
|
| 112 |
+
lyrics_df = lyrics_df.dropna(subset=['lyrics', 'artist', 'tag'])
|
| 113 |
+
reviews_df = reviews_df.dropna(subset=['genre', 'artist'])
|
| 114 |
+
|
| 115 |
+
lyrics_clean = lyrics_df.drop_duplicates(subset="artist")[["artist", "lyrics", "title", "tag"]]
|
| 116 |
+
merged_df = reviews_df.merge(lyrics_clean, on="artist", how="left").dropna(subset=["lyrics", "tag"])
|
| 117 |
+
|
| 118 |
+
# --- Plot 1: Raw Distribution ("Before") ---
|
| 119 |
+
plt.figure(figsize=(10, 5))
|
| 120 |
+
top_raw = merged_df['tag'].value_counts().nlargest(10)
|
| 121 |
+
sns.barplot(x=top_raw.values, y=top_raw.index, hue=top_raw.index, palette='viridis', legend=False)
|
| 122 |
+
plt.title(f"Raw Genre Distribution (n={len(merged_df):,})")
|
| 123 |
+
save_figure("eda_1_raw_distribution.png")
|
| 124 |
+
|
| 125 |
+
# Class balancing logic
|
| 126 |
+
balanced_df = merged_df.groupby("tag", group_keys=False).apply(
|
| 127 |
+
lambda x: x.sample(min(len(x), songs_per_genre), random_state=RANDOM_STATE)
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
# --- Plot 2: Balanced Distribution ("After") ---
|
| 131 |
+
plt.figure(figsize=(10, 5))
|
| 132 |
+
sns.countplot(data=balanced_df, y='tag', hue='tag', palette='magma', legend=False)
|
| 133 |
+
plt.title(f"Balanced Genre Distribution (n={len(balanced_df):,})")
|
| 134 |
+
save_figure("eda_2_balanced_distribution.png")
|
| 135 |
+
|
| 136 |
+
return balanced_df
|
src/lyricloop/environment.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
# -------------------------
|
| 6 |
+
# Replicability Logic
|
| 7 |
+
# -------------------------
|
| 8 |
+
|
| 9 |
+
def set_seed(seed=42):
|
| 10 |
+
"""
|
| 11 |
+
Sets universal random seeds to ensure deterministic results
|
| 12 |
+
across Python, NumPy, and PyTorch.
|
| 13 |
+
"""
|
| 14 |
+
# Python
|
| 15 |
+
random.seed(seed)
|
| 16 |
+
|
| 17 |
+
# NumPy
|
| 18 |
+
np.random.seed(seed)
|
| 19 |
+
|
| 20 |
+
# PyTorch
|
| 21 |
+
torch.manual_seed(seed)
|
| 22 |
+
torch.cuda.manual_seed_all(seed)
|
| 23 |
+
|
| 24 |
+
# Force deterministic algorithms to ensure GPU calculates the exact same gradients every time
|
| 25 |
+
torch.backends.cudnn.deterministic = True
|
| 26 |
+
torch.backends.cudnn.benchmark = False
|
| 27 |
+
|
| 28 |
+
print(f"Random Seed Set to: {seed}")
|
| 29 |
+
|
| 30 |
+
# -------------------------
|
| 31 |
+
# Hardware Diagnostics
|
| 32 |
+
# -------------------------
|
| 33 |
+
|
| 34 |
+
def get_device_capability():
|
| 35 |
+
"""
|
| 36 |
+
Diagnostics to ensure the GPU is ready for LLM Fine-Tuning.
|
| 37 |
+
Enables TF32 for newer NVIDIA architectures (L4).
|
| 38 |
+
"""
|
| 39 |
+
# Enable TF32 for modern Tensor Cores
|
| 40 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 41 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 42 |
+
|
| 43 |
+
if not torch.cuda.is_available():
|
| 44 |
+
raise RuntimeError("No GPU found! Go to Runtime > Change runtime type > Select NVIDIA L4.")
|
| 45 |
+
|
| 46 |
+
device = torch.device('cuda')
|
| 47 |
+
|
| 48 |
+
# Extract GPU Metadata
|
| 49 |
+
gpu_name = torch.cuda.get_device_name(0)
|
| 50 |
+
gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
|
| 51 |
+
capability = torch.cuda.get_device_capability(0)
|
| 52 |
+
bf16_support = torch.cuda.is_bf16_supported()
|
| 53 |
+
|
| 54 |
+
# Print Status Report
|
| 55 |
+
print(f"GPU Detected: {gpu_name}")
|
| 56 |
+
print(f" |-- Memory: {gpu_mem:.2f} GB")
|
| 57 |
+
print(f" |-- Compute Capability: {capability}")
|
| 58 |
+
print(f" |-- BFloat16 Support: {'Yes' if bf16_support else 'No'}")
|
| 59 |
+
|
| 60 |
+
# Professional Warning System
|
| 61 |
+
if "L4" not in gpu_name and "A100" not in gpu_name:
|
| 62 |
+
print(f"\nWarning: Using {gpu_name}. Performance may be suboptimal for Gemma fine-tuning.")
|
| 63 |
+
|
| 64 |
+
return device
|
src/lyricloop/metrics.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
# -------------------------
|
| 5 |
+
# Generation Engines
|
| 6 |
+
# -------------------------
|
| 7 |
+
|
| 8 |
+
def execute_generation(model, tokenizer, prompt, max_tokens=300, temperature=0.85, do_sample=False):
|
| 9 |
+
"""
|
| 10 |
+
A universal engine that handles GPU movement, sampling, and decoding.
|
| 11 |
+
The do_sample=False default is ideal for objective Critic tasks.
|
| 12 |
+
"""
|
| 13 |
+
model.eval()
|
| 14 |
+
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
| 15 |
+
|
| 16 |
+
with torch.no_grad():
|
| 17 |
+
outputs = model.generate(
|
| 18 |
+
**inputs,
|
| 19 |
+
max_new_tokens=max_tokens,
|
| 20 |
+
no_repeat_ngram_size=3,
|
| 21 |
+
do_sample=do_sample,
|
| 22 |
+
temperature=temperature if do_sample else None,
|
| 23 |
+
repetition_penalty=1.2,
|
| 24 |
+
pad_token_id=tokenizer.pad_token_id,
|
| 25 |
+
eos_token_id=tokenizer.eos_token_id
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
input_length = inputs.input_ids.shape[1]
|
| 29 |
+
generated_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
|
| 30 |
+
|
| 31 |
+
return generated_text.strip()
|
| 32 |
+
|
| 33 |
+
def get_token_confidences(model, tokenizer, prompt, max_tokens=50):
|
| 34 |
+
"""
|
| 35 |
+
Generates text and returns a list of (token, confidence_score) tuples.
|
| 36 |
+
Used for creating confidence heatmaps in the UI.
|
| 37 |
+
"""
|
| 38 |
+
model.eval()
|
| 39 |
+
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
| 40 |
+
|
| 41 |
+
with torch.no_grad():
|
| 42 |
+
outputs = model.generate(
|
| 43 |
+
**inputs,
|
| 44 |
+
max_new_tokens=max_tokens,
|
| 45 |
+
return_dict_in_generate=True,
|
| 46 |
+
output_scores=True,
|
| 47 |
+
do_sample=True,
|
| 48 |
+
temperature=0.8,
|
| 49 |
+
pad_token_id=tokenizer.pad_token_id
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
input_len = inputs.input_ids.shape[1]
|
| 53 |
+
gen_ids = outputs.sequences[0][input_len:]
|
| 54 |
+
|
| 55 |
+
# Calculate softmax probabilities for each generated token
|
| 56 |
+
probs = [torch.softmax(score, dim=-1)[0, tid].item() for tid, score in zip(gen_ids, outputs.scores)]
|
| 57 |
+
tokens = [tokenizer.decode(tid) for tid in gen_ids]
|
| 58 |
+
|
| 59 |
+
return list(zip(tokens, probs))
|
| 60 |
+
|
| 61 |
+
# -------------------------
|
| 62 |
+
# Evaluation Metrics
|
| 63 |
+
# -------------------------
|
| 64 |
+
|
| 65 |
+
def calculate_perplexity(model, tokenizer, text):
|
| 66 |
+
"""
|
| 67 |
+
Computes the perplexity (uncertainty) of the model for a specific text sequence.
|
| 68 |
+
Lower score = the model finds the text natural/predictable.
|
| 69 |
+
Higher score = the model finds the text confusing/alien.
|
| 70 |
+
"""
|
| 71 |
+
model.eval()
|
| 72 |
+
inputs = tokenizer(text, return_tensors="pt").to(model.device)
|
| 73 |
+
|
| 74 |
+
with torch.no_grad():
|
| 75 |
+
outputs = model(inputs.input_ids, labels=inputs.input_ids)
|
| 76 |
+
loss = outputs.loss
|
| 77 |
+
|
| 78 |
+
# Perplexity is mathematically the exponential of the cross-entropy loss
|
| 79 |
+
return torch.exp(loss).item()
|
| 80 |
+
|
| 81 |
+
# -------------------------
|
| 82 |
+
# Trainer Log Parsers
|
| 83 |
+
# -------------------------
|
| 84 |
+
|
| 85 |
+
def extract_trainer_metrics(model_trainer):
|
| 86 |
+
"""
|
| 87 |
+
Universal log parser for Hugging Face Trainer.
|
| 88 |
+
Extracts step-by-step history for plotting and final validation.
|
| 89 |
+
"""
|
| 90 |
+
logs = model_trainer.state.log_history
|
| 91 |
+
|
| 92 |
+
# Extract coordinates for plotting (Training vs Evaluation)
|
| 93 |
+
train_metrics = [{"step": x["step"], "loss": x["loss"]} for x in logs if "loss" in x]
|
| 94 |
+
eval_metrics = [{"step": x["step"], "loss": x["eval_loss"]} for x in logs if "eval_loss" in x]
|
| 95 |
+
|
| 96 |
+
final_loss = eval_metrics[-1]["loss"] if eval_metrics else None
|
| 97 |
+
|
| 98 |
+
return {
|
| 99 |
+
"train_steps": [x["step"] for x in train_metrics],
|
| 100 |
+
"train_loss": [x["loss"] for x in train_metrics],
|
| 101 |
+
"eval_steps": [x["step"] for x in eval_metrics],
|
| 102 |
+
"eval_loss": [x["loss"] for x in eval_metrics],
|
| 103 |
+
"val_loss": final_loss,
|
| 104 |
+
"perplexity": np.exp(final_loss) if final_loss else None
|
| 105 |
+
}
|
src/lyricloop/viz.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
from .config import ASSETS_DIR
|
| 6 |
+
|
| 7 |
+
# -------------------------
|
| 8 |
+
# Visualization Utilities
|
| 9 |
+
# -------------------------
|
| 10 |
+
|
| 11 |
+
def save_figure(filename):
|
| 12 |
+
"""
|
| 13 |
+
Saves the current matplotlib figure with consistent professional settings.
|
| 14 |
+
Saves to the global assets directory with 300 DPI resolution.
|
| 15 |
+
"""
|
| 16 |
+
path = os.path.join(ASSETS_DIR, filename)
|
| 17 |
+
|
| 18 |
+
# Ensure layout does not clip labels
|
| 19 |
+
plt.tight_layout()
|
| 20 |
+
|
| 21 |
+
# High resolution for documentation and reports
|
| 22 |
+
plt.savefig(path, dpi=300, bbox_inches='tight')
|
| 23 |
+
print(f" Artifact Saved: {path}")
|
| 24 |
+
|
| 25 |
+
plt.show()
|
| 26 |
+
plt.close()
|
| 27 |
+
|
| 28 |
+
# -------------------------
|
| 29 |
+
# Training Diagnostics
|
| 30 |
+
# -------------------------
|
| 31 |
+
|
| 32 |
+
def plot_learning_curves(metrics, version="v1"):
|
| 33 |
+
"""
|
| 34 |
+
Standardized learning curve plotter for loss and validation metrics.
|
| 35 |
+
"""
|
| 36 |
+
sns.set_style("whitegrid")
|
| 37 |
+
plt.figure(figsize=(12, 6))
|
| 38 |
+
|
| 39 |
+
# Training Loss
|
| 40 |
+
sns.lineplot(x=metrics["train_steps"], y=metrics["train_loss"],
|
| 41 |
+
label='Training Loss', color='#4E79A7', linewidth=2.5)
|
| 42 |
+
|
| 43 |
+
# Validation Loss (if available)
|
| 44 |
+
if metrics["eval_loss"]:
|
| 45 |
+
sns.lineplot(x=metrics["eval_steps"], y=metrics["eval_loss"],
|
| 46 |
+
label='Validation Loss', color='#E15759', linewidth=2.5, marker='o')
|
| 47 |
+
|
| 48 |
+
plt.title(f'Learning Curve: LyricLoop {version.upper()}', fontsize=16, fontweight='bold', pad=15)
|
| 49 |
+
plt.xlabel('Training Steps')
|
| 50 |
+
plt.ylabel('Loss')
|
| 51 |
+
plt.legend(frameon=True, fancybox=True, framealpha=0.9)
|
| 52 |
+
|
| 53 |
+
save_figure(f"eval_loss_curve_{version}.png")
|
| 54 |
+
|
| 55 |
+
# -------------------------
|
| 56 |
+
# Confidence & Interpretability
|
| 57 |
+
# -------------------------
|
| 58 |
+
|
| 59 |
+
def plot_token_heatmap(token_conf_pairs, title="Confidence Heatmap", filename="heatmap.png"):
|
| 60 |
+
"""Draws a text heatmap where background color represents model confidence."""
|
| 61 |
+
fig = plt.figure(figsize=(10, 4))
|
| 62 |
+
ax = fig.add_axes([0, 0, 1, 1])
|
| 63 |
+
ax.axis('off')
|
| 64 |
+
|
| 65 |
+
x, y = 0.02, 0.85
|
| 66 |
+
line_height = 0.12
|
| 67 |
+
confidences = [p[1] for p in token_conf_pairs]
|
| 68 |
+
avg_conf = np.mean(confidences) if confidences else 0
|
| 69 |
+
|
| 70 |
+
ax.text(0.02, 0.95, f"{title} (Avg: {avg_conf:.2%})",
|
| 71 |
+
fontsize=12, fontweight='bold', transform=ax.transAxes)
|
| 72 |
+
|
| 73 |
+
for t, score in token_conf_pairs:
|
| 74 |
+
# Professional Color Scale: Green (High), Orange (Medium), Red (Low)
|
| 75 |
+
if score > 0.7: bg = '#aaffaa'
|
| 76 |
+
elif score > 0.3: bg = '#ffeeba'
|
| 77 |
+
else: bg = '#ffcccc'
|
| 78 |
+
|
| 79 |
+
clean_text = t.replace('\n', '↵ ')
|
| 80 |
+
text_w = len(clean_text) * 0.015
|
| 81 |
+
|
| 82 |
+
if x + text_w > 0.95:
|
| 83 |
+
x = 0.02
|
| 84 |
+
y -= line_height
|
| 85 |
+
|
| 86 |
+
ax.text(x, y, clean_text, bbox=dict(facecolor=bg, edgecolor='none', pad=2, alpha=0.8),
|
| 87 |
+
fontfamily='monospace', fontsize=10, transform=ax.transAxes)
|
| 88 |
+
x += text_w + 0.005
|
| 89 |
+
|
| 90 |
+
save_figure(filename)
|
| 91 |
+
return avg_conf
|
| 92 |
+
|
| 93 |
+
def plot_confidence_summary(genres, scores, title="Confidence Summary", filename="conf_summary.png"):
|
| 94 |
+
"""Standardized bar chart for comparing confidence across genres."""
|
| 95 |
+
plt.figure(figsize=(11, 6))
|
| 96 |
+
x = np.arange(len(genres))
|
| 97 |
+
width = 0.35
|
| 98 |
+
palette = ['#A0A0A0', '#4E79A7', '#E15759'] # grey, blue, red
|
| 99 |
+
|
| 100 |
+
if isinstance(scores, list):
|
| 101 |
+
scores_dict = {"Model Output": scores}
|
| 102 |
+
width = 0.5
|
| 103 |
+
else:
|
| 104 |
+
scores_dict = scores
|
| 105 |
+
|
| 106 |
+
active_scores = {k: v for k, v in scores_dict.items() if len(v) == len(genres)}
|
| 107 |
+
|
| 108 |
+
for i, (label, values) in enumerate(active_scores.items()):
|
| 109 |
+
offset = (i - (len(active_scores)-1)/2) * width if len(active_scores) > 1 else 0
|
| 110 |
+
bars = plt.bar(x + offset, values, width, label=label,
|
| 111 |
+
color=palette[i % 3], edgecolor='black', alpha=0.8)
|
| 112 |
+
|
| 113 |
+
for bar in bars:
|
| 114 |
+
h = bar.get_height()
|
| 115 |
+
plt.text(bar.get_x() + bar.get_width()/2., h + 0.02, f'{h:.2f}',
|
| 116 |
+
ha='center', va='bottom', fontweight='bold', fontsize=9)
|
| 117 |
+
|
| 118 |
+
plt.title(title, fontsize=16, fontweight='bold')
|
| 119 |
+
plt.ylabel('Average Confidence Score')
|
| 120 |
+
plt.xticks(x, genres)
|
| 121 |
+
plt.ylim(0, 1.1)
|
| 122 |
+
if len(active_scores) > 1:
|
| 123 |
+
plt.legend(loc='lower right')
|
| 124 |
+
plt.grid(axis='y', linestyle='--', alpha=0.3)
|
| 125 |
+
save_figure(filename)
|
| 126 |
+
|
| 127 |
+
# -------------------------
|
| 128 |
+
# Performance Comparison
|
| 129 |
+
# -------------------------
|
| 130 |
+
|
| 131 |
+
def plot_perplexity(genres, scores_dict, title="Model Perplexity", filename="perplexity.png", use_log=False):
|
| 132 |
+
"""Global plotter for perplexity scores with support for log-scaling."""
|
| 133 |
+
plt.figure(figsize=(10, 6))
|
| 134 |
+
if use_log: plt.yscale('log')
|
| 135 |
+
|
| 136 |
+
x = np.arange(len(genres))
|
| 137 |
+
comp_colors = ['#A0A0A0', '#4E79A7'] # grey for Baseline, blue for Fine-Tuned
|
| 138 |
+
|
| 139 |
+
if len(scores_dict) == 1:
|
| 140 |
+
label = list(scores_dict.keys())[0]
|
| 141 |
+
values = list(scores_dict.values())[0]
|
| 142 |
+
bars = plt.bar(genres, values, color='#A0A0A0', edgecolor='black', alpha=0.8)
|
| 143 |
+
else:
|
| 144 |
+
width = 0.35
|
| 145 |
+
for i, (label, values) in enumerate(scores_dict.items()):
|
| 146 |
+
offset = (i - (len(scores_dict)-1)/2) * width
|
| 147 |
+
bars = plt.bar(x + offset, values, width, label=label, color=comp_colors[i % 2], edgecolor='black')
|
| 148 |
+
|
| 149 |
+
plt.title(title, fontsize=14, fontweight='bold')
|
| 150 |
+
plt.ylabel('Perplexity (Lower is Better)', fontsize=12)
|
| 151 |
+
plt.xticks(x, genres)
|
| 152 |
+
plt.grid(axis='y', linestyle='--', alpha=0.5)
|
| 153 |
+
if len(scores_dict) > 1: plt.legend()
|
| 154 |
+
|
| 155 |
+
save_figure(filename)
|