James Edmunds committed on
Commit ·
2fb2290
1
Parent(s): 5006ce5
Updated to load HF dataset into HF Space on run
Browse files- app.py +31 -1
- config/settings.py +5 -2
- src/generator/generator.py +89 -2
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import os
|
|
| 3 |
import streamlit as st
|
| 4 |
from src.generator.generator import LyricGenerator
|
| 5 |
from config.settings import Settings
|
|
|
|
| 6 |
|
| 7 |
# Set SQLite path for local development
|
| 8 |
if not Settings.is_huggingface():
|
|
@@ -11,14 +12,43 @@ if not Settings.is_huggingface():
|
|
| 11 |
def initialize_generator():
|
| 12 |
"""Initialize the generator with proper error handling"""
|
| 13 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# Initialize generator
|
|
|
|
| 15 |
st.info("Loading embeddings...")
|
| 16 |
generator = LyricGenerator()
|
| 17 |
st.success("Embeddings loaded successfully!")
|
| 18 |
return generator
|
| 19 |
|
| 20 |
except Exception as e:
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
| 22 |
return None
|
| 23 |
|
| 24 |
def main():
|
|
|
|
| 3 |
import streamlit as st
|
| 4 |
from src.generator.generator import LyricGenerator
|
| 5 |
from config.settings import Settings
|
| 6 |
+
from pathlib import Path
|
| 7 |
|
| 8 |
# Set SQLite path for local development
|
| 9 |
if not Settings.is_huggingface():
|
|
|
|
| 12 |
def initialize_generator():
    """Build the LyricGenerator, reporting failures in the UI instead of raising.

    When ``Settings.is_huggingface()`` is true this first verifies that
    ``Settings.HF_TOKEN`` is set and makes sure the embeddings directory
    returned by ``Settings.get_embeddings_path()`` exists, then constructs
    the generator. Progress is printed to stdout; failures are additionally
    shown with ``st.error``.

    Returns:
        The constructed ``LyricGenerator``, or ``None`` when the token is
        missing or any step raises.
    """
    try:
        print("\n=== Initializing Generator ===")
        # Check for HuggingFace environment requirements
        if Settings.is_huggingface():
            print("Running in HuggingFace environment")
            print("Checking environment requirements...")

            if not Settings.HF_TOKEN:
                error_msg = "HuggingFace token not found. Please set HF_TOKEN in Space secrets."
                print(f"Error: {error_msg}")
                st.error(error_msg)
                return None
            print("HF_TOKEN found in environment")

            # Ask Settings for the storage location instead of repeating the
            # literal path here, so this directory cannot drift away from the
            # one Settings reports for the HuggingFace deployment.
            storage_path = Settings.get_embeddings_path()
            print(f"Setting up persistent storage at: {storage_path}")
            storage_path.mkdir(parents=True, exist_ok=True)
            print("Storage directory created/verified")

            # mkdir() above either succeeded or raised, so the directory is
            # known to exist at this point; list it for the startup log.
            print(f"Storage directory contents: {list(storage_path.glob('**/*'))}")
        else:
            print("Running in local environment")

        # Initialize generator
        print("\nInitializing LyricGenerator...")
        st.info("Loading embeddings...")
        generator = LyricGenerator()
        st.success("Embeddings loaded successfully!")
        return generator

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user and the
        # logs, and signal it to the caller by returning None.
        error_msg = f"Failed to initialize generator: {str(e)}"
        print(f"\nError during initialization: {error_msg}")
        print(f"Error type: {type(e).__name__}")
        st.error(error_msg)
        return None
|
| 53 |
|
| 54 |
def main():
|
config/settings.py
CHANGED
|
@@ -34,6 +34,7 @@ class Settings:
|
|
| 34 |
|
| 35 |
# HuggingFace Settings
|
| 36 |
HF_SPACE = "SongLift/LyrGen2"
|
|
|
|
| 37 |
|
| 38 |
@classmethod
|
| 39 |
def is_huggingface(cls) -> bool:
|
|
@@ -43,5 +44,7 @@ class Settings:
|
|
| 43 |
@classmethod
|
| 44 |
def get_embeddings_path(cls) -> Path:
|
| 45 |
"""Get appropriate embeddings path based on deployment mode"""
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# HuggingFace Settings
|
| 36 |
HF_SPACE = "SongLift/LyrGen2"
|
| 37 |
+
HF_DATASET = "SongLift/LyrGen2_DB" # Updated dataset repo name
|
| 38 |
|
| 39 |
@classmethod
|
| 40 |
def is_huggingface(cls) -> bool:
|
|
|
|
| 44 |
@classmethod
def get_embeddings_path(cls) -> Path:
    """Return the embeddings directory for the current deployment mode.

    Returns:
        ``/data/processed/embeddings`` when ``cls.is_huggingface()`` is true,
        otherwise ``cls.EMBEDDINGS_DIR``.
    """
    on_huggingface = cls.is_huggingface()
    if not on_huggingface:
        return cls.EMBEDDINGS_DIR
    # Use persistent storage in HF Spaces
    return Path("/data/processed/embeddings")
|
src/generator/generator.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
from typing import Dict, List, Optional
|
| 2 |
from pathlib import Path
|
|
|
|
| 3 |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 4 |
from langchain_chroma import Chroma
|
| 5 |
from langchain.chains import ConversationalRetrievalChain
|
| 6 |
from langchain.prompts import PromptTemplate
|
| 7 |
-
from huggingface_hub import snapshot_download
|
| 8 |
from config.settings import Settings
|
| 9 |
|
| 10 |
|
|
@@ -27,19 +28,105 @@ class LyricGenerator:
|
|
| 27 |
# Load embeddings
|
| 28 |
self._load_embeddings()
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
def _load_embeddings(self) -> None:
|
| 31 |
"""Load existing embeddings based on environment"""
|
| 32 |
try:
|
| 33 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# Check Chroma directory structure
|
| 35 |
chroma_dir = self.embeddings_dir / "chroma"
|
| 36 |
print(f"Checking Chroma directory: {chroma_dir}")
|
|
|
|
| 37 |
if not chroma_dir.exists():
|
|
|
|
|
|
|
|
|
|
| 38 |
raise RuntimeError(f"Chroma directory not found at {chroma_dir}")
|
| 39 |
|
| 40 |
sqlite_file = chroma_dir / "chroma.sqlite3"
|
| 41 |
print(f"Checking SQLite file: {sqlite_file}")
|
| 42 |
if not sqlite_file.exists():
|
|
|
|
| 43 |
raise RuntimeError(f"Chroma database not found at {sqlite_file}")
|
| 44 |
print(f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
|
| 45 |
|
|
|
|
| 1 |
from typing import Dict, List, Optional
|
| 2 |
from pathlib import Path
|
| 3 |
+
import shutil
|
| 4 |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 5 |
from langchain_chroma import Chroma
|
| 6 |
from langchain.chains import ConversationalRetrievalChain
|
| 7 |
from langchain.prompts import PromptTemplate
|
| 8 |
+
from huggingface_hub import snapshot_download, hf_hub_download
|
| 9 |
from config.settings import Settings
|
| 10 |
|
| 11 |
|
|
|
|
| 28 |
# Load embeddings
|
| 29 |
self._load_embeddings()
|
| 30 |
|
| 31 |
+
def _setup_embeddings_from_hf(self) -> None:
    """Make sure the Chroma embeddings exist under ``self.embeddings_dir``.

    If ``<embeddings_dir>/chroma/chroma.sqlite3`` is already present nothing
    is downloaded. Otherwise the dataset repo ``Settings.HF_DATASET`` is
    fetched with ``snapshot_download`` into a scratch directory, its
    ``chroma`` sub-directory is copied to ``<embeddings_dir>/chroma``
    (replacing any existing directory of that name), and the scratch
    directory is removed.

    Raises:
        RuntimeError: if any step fails. The original exception is chained
            as ``__cause__``.
    """
    print("\n=== Setting up embeddings from HuggingFace dataset ===")
    # Single definition of the scratch location used for the download.
    download_dir = "/tmp/embeddings"
    try:
        # Create necessary directories
        chroma_dir = self.embeddings_dir / "chroma"
        print(f"Target Chroma directory: {chroma_dir}")
        print(f"Creating parent directory: {chroma_dir.parent}")
        chroma_dir.parent.mkdir(parents=True, exist_ok=True)

        # Check if embeddings already exist in persistent storage. The
        # database file existing implies its directory exists as well.
        if (chroma_dir / "chroma.sqlite3").exists():
            print("Embeddings already exist in persistent storage")
            print(f"Existing contents: {list(chroma_dir.glob('**/*'))}")
            return

        print("\nDownloading embeddings from HuggingFace dataset...")
        print(f"Dataset repo: {Settings.HF_DATASET}")
        print(f"Using temp directory: {download_dir}")

        try:
            # Download the entire chroma directory from the dataset
            try:
                temp_dir = snapshot_download(
                    repo_id=Settings.HF_DATASET,
                    repo_type="dataset",
                    token=Settings.HF_TOKEN,
                    local_dir=download_dir,
                )
                print(f"Download completed to: {temp_dir}")
            except Exception as e:
                print(f"Error during snapshot_download: {str(e)}")
                raise

            temp_chroma = Path(temp_dir) / "chroma"
            print(f"Looking for Chroma in temp dir at: {temp_chroma}")

            if not temp_chroma.exists():
                print(f"Contents of temp_dir: {list(Path(temp_dir).glob('**/*'))}")
                raise RuntimeError(
                    f"Chroma directory not found in dataset at {temp_chroma}"
                )

            print(f"Found Chroma directory in download. Contents: {list(temp_chroma.glob('**/*'))}")

            # Copy the downloaded chroma directory to persistent storage
            print("\nCopying embeddings to persistent storage...")
            if chroma_dir.exists():
                print(f"Removing existing directory: {chroma_dir}")
                shutil.rmtree(chroma_dir)
            print(f"Copying from {temp_chroma} to {chroma_dir}")
            shutil.copytree(temp_chroma, chroma_dir)
            print(f"Embeddings copied to persistent storage at {chroma_dir}")
            print(f"Persistent storage contents: {list(chroma_dir.glob('**/*'))}")
        finally:
            # Clean up temporary directory on success AND on failure, so a
            # failed download or copy does not leave the scratch copy on
            # disk. Cleanup is best-effort: a leftover scratch file must not
            # fail a setup whose embeddings were already copied into place.
            print("\nCleaning up temporary directory...")
            shutil.rmtree(download_dir, ignore_errors=True)
            print("Cleanup complete")

    except Exception as e:
        print("\n=== Error in _setup_embeddings_from_hf ===")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        print("Current directory structure:")
        try:
            # chroma_dir may be unbound if the failure happened before it
            # was assigned; the guard below keeps diagnostics from masking
            # the original error.
            print(f"Parent dir exists: {chroma_dir.parent.exists()}")
            print(f"Parent dir contents: {list(chroma_dir.parent.glob('**/*'))}")
        except Exception as dir_error:
            print(f"Error checking directories: {str(dir_error)}")
        raise RuntimeError(
            f"Failed to setup embeddings from HuggingFace: {str(e)}"
        ) from e
|
| 102 |
+
|
| 103 |
def _load_embeddings(self) -> None:
|
| 104 |
"""Load existing embeddings based on environment"""
|
| 105 |
try:
|
| 106 |
+
print("\n=== Loading Embeddings ===")
|
| 107 |
+
|
| 108 |
+
# If in HuggingFace environment, ensure embeddings are set up
|
| 109 |
+
if Settings.is_huggingface():
|
| 110 |
+
print("HuggingFace environment detected, setting up embeddings...")
|
| 111 |
+
self._setup_embeddings_from_hf()
|
| 112 |
+
else:
|
| 113 |
+
print("Local environment detected")
|
| 114 |
+
|
| 115 |
+
print(f"\nLoading vector store from: {self.embeddings_dir}")
|
| 116 |
# Check Chroma directory structure
|
| 117 |
chroma_dir = self.embeddings_dir / "chroma"
|
| 118 |
print(f"Checking Chroma directory: {chroma_dir}")
|
| 119 |
+
|
| 120 |
if not chroma_dir.exists():
|
| 121 |
+
print(f"Parent directory exists: {chroma_dir.parent.exists()}")
|
| 122 |
+
if chroma_dir.parent.exists():
|
| 123 |
+
print(f"Parent directory contents: {list(chroma_dir.parent.glob('**/*'))}")
|
| 124 |
raise RuntimeError(f"Chroma directory not found at {chroma_dir}")
|
| 125 |
|
| 126 |
sqlite_file = chroma_dir / "chroma.sqlite3"
|
| 127 |
print(f"Checking SQLite file: {sqlite_file}")
|
| 128 |
if not sqlite_file.exists():
|
| 129 |
+
print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
|
| 130 |
raise RuntimeError(f"Chroma database not found at {sqlite_file}")
|
| 131 |
print(f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
|
| 132 |
|