Spaces:

SongLift
/

LyrGen2

Sleeping

App Files Community

James Edmunds committed on Dec 15, 2024

Commit

686446c

1 Parent(s): 442b10a

Working local. Added new fixes for HF embeddings

Browse files

Files changed (8) hide show

app.py +20 -16
config/settings.py +32 -4
scripts/browse_hf_data.py +22 -0
scripts/check_chroma_settings.py +26 -0
scripts/check_hf_token.py +27 -0
scripts/display_version.py +9 -0
scripts/test_download_hf_dataset.py +73 -0
src/generator/generator.py +162 -38

app.py CHANGED Viewed

@@ -16,43 +16,47 @@ def initialize_generator():
     """Initialize the generator with proper error handling"""
     try:
         print("\n=== Initializing Generator ===")
-        # Check for HuggingFace environment requirements
         if Settings.is_huggingface():
             print("Running in HuggingFace environment")
             print("Checking environment requirements...")
             if not Settings.HF_TOKEN:
-                error_msg = "HuggingFace token not found. Please set HF_TOKEN in Space secrets."
                 print(f"Error: {error_msg}")
                 st.error(error_msg)
                 return None
-            else:
-                print("HF_TOKEN found in environment")
             # Ensure persistent storage directory exists
             storage_path = Path("/data/processed/embeddings")
-            print(f"Setting up persistent storage at: {storage_path}")
             storage_path.mkdir(parents=True, exist_ok=True)
-            print(f"Storage directory created/verified")
             if storage_path.exists():
-                print(f"Storage directory contents: {list(storage_path.glob('**/*'))}")
-        else:
-            print("Running in local environment")
-        # Debugging: Check if OpenAI API Key is set
-        Settings.debug_openai_key()
         # Initialize generator
         print("\nInitializing LyricGenerator...")
-        st.info("Loading embeddings...")
         generator = LyricGenerator()
-        st.success("Embeddings loaded successfully!")
         return generator
     except Exception as e:
-        error_msg = f"Failed to initialize generator: {str(e)}"
-        print(f"\nError during initialization: {error_msg}")
         print(f"Error type: {type(e).__name__}")
         st.error(error_msg)
         return None

     """Initialize the generator with proper error handling"""
     try:
         print("\n=== Initializing Generator ===")
         if Settings.is_huggingface():
             print("Running in HuggingFace environment")
             print("Checking environment requirements...")
+            # Debug: List contents of /data directory
+            data_dir = Path("/data")
+            if data_dir.exists():
+                print("\nContents of /data directory:")
+                for item in data_dir.rglob("*"):
+                    if item.is_file():
+                        print(f"- {item.relative_to(data_dir)} "
+                              f"({item.stat().st_size / 1024:.1f} KB)")
             if not Settings.HF_TOKEN:
+                error_msg = "HuggingFace token not found."
                 print(f"Error: {error_msg}")
                 st.error(error_msg)
                 return None
             # Ensure persistent storage directory exists
             storage_path = Path("/data/processed/embeddings")
+            print(f"\nSetting up storage at: {storage_path}")
             storage_path.mkdir(parents=True, exist_ok=True)
             if storage_path.exists():
+                print("Storage directory contents:")
+                for item in storage_path.rglob("*"):
+                    if item.is_file():
+                        print(f"- {item.relative_to(storage_path)} "
+                              f"({item.stat().st_size / 1024:.1f} KB)")
         # Initialize generator
         print("\nInitializing LyricGenerator...")
         generator = LyricGenerator()
+        st.success("Generator initialized successfully!")
         return generator
     except Exception as e:
+        error_msg = f"Initialization failed: {str(e)}"
+        print(f"\nError: {error_msg}")
         print(f"Error type: {type(e).__name__}")
         st.error(error_msg)
         return None

config/settings.py CHANGED Viewed

@@ -40,14 +40,41 @@ class Settings:
     def get_embeddings_path(cls) -> Path:
         """Get the base embeddings path"""
         if cls.is_huggingface():
-            # HuggingFace: Use absolute path in persistent storage
-            return Path("/data/processed/embeddings")
         # Local: Use project-relative path
-        return cls.BASE_DIR / "data" / "processed" / "embeddings"
     @classmethod
     def get_chroma_path(cls) -> Path:
         """Get the Chroma DB path"""
         return cls.get_embeddings_path() / "chroma"
     @classmethod
@@ -60,9 +87,10 @@ class Settings:
     @classmethod
     def get_chroma_settings(cls) -> dict:
         """Get ChromaDB settings"""
         return {
             "anonymized_telemetry": False,
-            "persist_directory": str(cls.get_chroma_path()),
             "collection_name": cls.CHROMA_COLLECTION_NAME
         }

     def get_embeddings_path(cls) -> Path:
         """Get the base embeddings path"""
         if cls.is_huggingface():
+            # In HuggingFace, first check the dataset cache
+            data_dir = Path("/data")
+            print(f"\nSearching for embeddings in: {data_dir}")
+            # Look for the most recent snapshot directory containing chroma
+            snapshot_pattern = "**/datasets--*--*/snapshots/*/chroma"
+            print(f"Using search pattern: {snapshot_pattern}")
+            snapshots = list(data_dir.glob(snapshot_pattern))
+            print(f"Found {len(snapshots)} potential snapshot directories:")
+            for snap in snapshots:
+                print(f"- {snap} (Modified: {snap.stat().st_mtime})")
+            if snapshots:
+                chosen_path = max(snapshots, key=lambda p: p.stat().st_mtime)
+                print(f"Selected most recent: {chosen_path}")
+                return chosen_path
+            print("No snapshots found, using fallback location")
+            fallback_path = data_dir / "processed/embeddings"
+            print(f"Fallback path: {fallback_path}")
+            return fallback_path
         # Local: Use project-relative path
+        embeddings_path = cls.BASE_DIR / "data" / "processed" / "embeddings"
+        print(f"Local embeddings path: {embeddings_path}")
+        return embeddings_path
     @classmethod
     def get_chroma_path(cls) -> Path:
         """Get the Chroma DB path"""
+        if cls.is_huggingface():
+            # In HuggingFace, the chroma path is the embeddings path itself
+            return cls.get_embeddings_path()
+        # Local: Use subdirectory
         return cls.get_embeddings_path() / "chroma"
     @classmethod
     @classmethod
     def get_chroma_settings(cls) -> dict:
         """Get ChromaDB settings"""
+        chroma_path = cls.get_chroma_path()
         return {
             "anonymized_telemetry": False,
+            "persist_directory": str(chroma_path),
             "collection_name": cls.CHROMA_COLLECTION_NAME
         }

scripts/browse_hf_data.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import os
 import streamlit as st
 from pathlib import Path
 def list_files_in_directory(directory):
     """List all files in the given directory and its subdirectories."""
@@ -10,9 +13,27 @@ def list_files_in_directory(directory):
             files.append(os.path.join(root, filename))
     return files
 def main():
     st.title("Embeddings File Browser")
     # Directory to browse
     directory = "/data"  # Persistent storage directory
     st.write(f"Browsing directory: {directory}")
@@ -27,5 +48,6 @@ def main():
     else:
         st.write("No files found in the directory.")
 if __name__ == "__main__":
     main()

 import os
 import streamlit as st
 from pathlib import Path
+from datasets import load_dataset
+from dotenv import load_dotenv
 def list_files_in_directory(directory):
     """List all files in the given directory and its subdirectories."""
             files.append(os.path.join(root, filename))
     return files
 def main():
     st.title("Embeddings File Browser")
+    # Load environment variables
+    load_dotenv()
+    # Retrieve the Hugging Face token
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        st.error("HF_TOKEN not found in environment variables.")
+        return
+    # Load the dataset using the token
+    try:
+        dataset = load_dataset("SongLift/LyrGen2_DB", use_auth_token=hf_token)
+        st.write("Dataset loaded successfully.")
+    except Exception as e:
+        st.error(f"Error loading dataset: {str(e)}")
+        return
     # Directory to browse
     directory = "/data"  # Persistent storage directory
     st.write(f"Browsing directory: {directory}")
     else:
         st.write("No files found in the directory.")
 if __name__ == "__main__":
     main()

scripts/check_chroma_settings.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import sys
+from pathlib import Path
+# Add the project root to Python path
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+from config.settings import Settings
+def main():
+    """Print Chroma settings and related paths"""
+    print("\nChroma Settings:")
+    print("-" * 50)
+    settings = Settings.get_chroma_settings()
+    for key, value in settings.items():
+        print(f"{key}: {value}")
+    print("\nRelated Paths:")
+    print("-" * 50)
+    print(f"Base Dir: {Settings.BASE_DIR}")
+    print(f"Embeddings Path: {Settings.get_embeddings_path()}")
+    print(f"Chroma Path: {Settings.get_chroma_path()}")
+    print(f"Is HuggingFace: {Settings.is_huggingface()}")
+if __name__ == "__main__":
+    main()

scripts/check_hf_token.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import sys
+from pathlib import Path
+# Add project root to Python path
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+from config.settings import Settings
+from huggingface_hub import HfApi
+def check_dataset_files():
+    """Check files available in the HuggingFace dataset"""
+    print(f"\nChecking dataset: {Settings.HF_DATASET}")
+    print(f"Using token: {'Present' if Settings.HF_TOKEN else 'Missing'}")
+    api = HfApi(token=Settings.HF_TOKEN)
+    try:
+        files = api.list_repo_files(Settings.HF_DATASET, repo_type="dataset")
+        print("\nFiles in dataset:")
+        for f in files:
+            print(f"- {f}")
+    except Exception as e:
+        print(f"\nError accessing dataset: {type(e).__name__}")
+        print(f"Error details: {str(e)}")
+if __name__ == "__main__":
+    check_dataset_files()

scripts/display_version.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from huggingface_hub import HfApi
+api = HfApi(token=Settings.HF_TOKEN)
+try:
+    files = api.list_repo_files(Settings.HF_DATASET)
+    print("Files in dataset:")
+    for f in files:
+        print(f"- {f}")
+except Exception as e:
+    print(f"Error: {e}")

scripts/test_download_hf_dataset.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import sys
+from pathlib import Path
+from huggingface_hub import hf_hub_download
+from dotenv import load_dotenv
+import tempfile
+import shutil
+# Add project root to Python path
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+from config.settings import Settings
+def download_chroma_files():
+    """Download Chroma files directly using hf_hub_download"""
+    try:
+        load_dotenv()
+        print(f"\nAttempting to download files from: {Settings.HF_DATASET}")
+        # Create a temporary directory for downloads
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+            print(f"\nUsing temporary directory: {temp_path}")
+            # Files to download
+            files_to_download = [
+                "chroma/chroma.sqlite3",
+                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/data_level0.bin",
+                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/header.bin",
+                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/index_metadata.pickle",
+                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/length.bin",
+                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/link_lists.bin"
+            ]
+            for file_path in files_to_download:
+                try:
+                    print(f"\nDownloading: {file_path}")
+                    local_path = hf_hub_download(
+                        repo_id=Settings.HF_DATASET,
+                        filename=file_path,
+                        repo_type="dataset",
+                        token=Settings.HF_TOKEN,
+                        cache_dir=temp_path
+                    )
+                    print(f"Downloaded to: {local_path}")
+                    # Get file size
+                    size_mb = Path(local_path).stat().st_size / (1024 * 1024)
+                    print(f"File size: {size_mb:.2f} MB")
+                except Exception as e:
+                    print(f"Error downloading {file_path}: {str(e)}")
+            print("\nFinal directory structure:")
+            def print_dir_tree(path: Path, level: int = 0):
+                indent = "  " * level
+                print(f"{indent}{path.name}/")
+                for item in path.iterdir():
+                    if item.is_file():
+                        size_mb = item.stat().st_size / (1024 * 1024)
+                        print(f"{indent}  {item.name} ({size_mb:.2f} MB)")
+                    else:
+                        print_dir_tree(item, level + 1)
+            print_dir_tree(temp_path)
+    except Exception as e:
+        print(f"\nTop-level error: {type(e).__name__}")
+        print(f"Error details: {str(e)}")
+if __name__ == "__main__":
+    download_chroma_files()

src/generator/generator.py CHANGED Viewed

@@ -5,10 +5,11 @@ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_chroma import Chroma
 from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
-from huggingface_hub import snapshot_download
 from config.settings import Settings
 from tenacity import retry, stop_after_attempt, wait_exponential
 from datasets import load_dataset
 class LyricGenerator:
@@ -48,20 +49,119 @@ class LyricGenerator:
         """Download and setup embeddings from HuggingFace dataset"""
         print("\n=== Setting up embeddings from HuggingFace dataset ===")
         try:
-            # Load the latest version of the dataset into the desired directory
-            dataset = load_dataset("SongLift/LyrGen2_DB",
-                                   split='train', cache_dir="/data")
-            print("Dataset loaded successfully into cache directory.")
-            # Verify the contents of the cache directory
-            self._list_cache_directory("/data")
         except Exception as e:
             print(f"\n=== Error in _setup_embeddings_from_hf ===")
             print(f"Error type: {type(e).__name__}")
             print(f"Error message: {str(e)}")
-            raise RuntimeError(
-                f"Failed to setup embeddings from HuggingFace: {str(e)}")
     def _list_cache_directory(self, cache_dir_path: str) -> None:
         """List the contents of the cache directory"""
@@ -82,41 +182,29 @@ class LyricGenerator:
             if Settings.is_huggingface():
                 print("HuggingFace environment detected, setting up embeddings...")
                 self._setup_embeddings_from_hf()
-                # Dynamically determine the correct chroma directory
-                chroma_dir = self._find_chroma_directory("/data")
-                if chroma_dir is None:
-                    raise RuntimeError("Chroma directory not found in any expected location.")
             else:
                 print("Local environment detected")
                 print(f"Base directory: {Settings.BASE_DIR}")
-                chroma_dir = Path("/data/processed/embeddings/chroma")  # Local environment path
-            print(f"Checking Chroma directory: {chroma_dir}")
-            print(f"Absolute path: {chroma_dir.absolute()}")
-            if not chroma_dir.exists():
                 print(
-                    f"Parent directory exists: {self.embeddings_dir.exists()}")
-                if self.embeddings_dir.exists():
-                    print(
-                        f"Parent directory contents: {list(self.embeddings_dir.glob('**/*'))}")
-                raise RuntimeError(
-                    f"Chroma directory not found at {chroma_dir}")
-            sqlite_file = chroma_dir / "chroma.sqlite3"
-            print(f"Checking SQLite file: {sqlite_file}")
-            if not sqlite_file.exists():
-                print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
-                raise RuntimeError(
-                    f"Chroma database not found at {sqlite_file}")
-            print(
-                f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
             # Load vector store using environment-aware settings
             print("Initializing Chroma with settings:")
             chroma_settings = Settings.get_chroma_settings()
             self.vector_store = Chroma(
                 persist_directory=chroma_settings["persist_directory"],
                 embedding_function=self.embeddings,
@@ -137,7 +225,7 @@ class LyricGenerator:
                 # Additional debugging for empty collection
                 print("\nDebug Information:")
                 print(f"Chroma directory structure:")
-                for item in chroma_dir.glob('**/*'):
                     print(f"  {item}")
                     if item.is_file():
                         print(
@@ -406,3 +494,39 @@ class LyricGenerator:
         except Exception as e:
             print(f"Error in generate_lyrics: {str(e)}")
             raise RuntimeError(f"Failed to generate lyrics: {str(e)}")

 from langchain_chroma import Chroma
 from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
+from huggingface_hub import snapshot_download, hf_hub_download
 from config.settings import Settings
 from tenacity import retry, stop_after_attempt, wait_exponential
 from datasets import load_dataset
+import sqlite3
 class LyricGenerator:
         """Download and setup embeddings from HuggingFace dataset"""
         print("\n=== Setting up embeddings from HuggingFace dataset ===")
         try:
+            # Create data directory if it doesn't exist
+            data_dir = Path("/data")
+            data_dir.mkdir(exist_ok=True)
+            # First download just the chroma.sqlite3 file
+            print(f"Downloading main database file...")
+            main_db = hf_hub_download(
+                repo_id=Settings.HF_DATASET,
+                filename="chroma/chroma.sqlite3",
+                repo_type="dataset",
+                token=Settings.HF_TOKEN,
+                cache_dir=data_dir
+            )
+            print(f"Main database downloaded to: {main_db}")
+            # Find the collection directory by looking at the parent directory
+            chroma_dir = Path(main_db).parent
+            # Debug: Print SQLite database contents
+            print("\nExamining Chroma database...")
+            try:
+                conn = sqlite3.connect(main_db)
+                cursor = conn.cursor()
+                # List tables
+                cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+                tables = cursor.fetchall()
+                print("Tables in database:", [t[0] for t in tables])
+                # Get collection info
+                cursor.execute("SELECT name, directory FROM collections;")
+                collections = cursor.fetchall()
+                print("\nCollections found:")
+                for name, directory in collections:
+                    print(f"- Name: {name}, Directory: {directory}")
+                conn.close()
+            except Exception as e:
+                print(f"Warning: Could not read SQLite database: {e}")
+            # Find all UUID-style directories
+            collection_dirs = list(chroma_dir.glob("*-*-*-*-*"))
+            if not collection_dirs:
+                raise RuntimeError("No collection directory found in Chroma folder")
+            if len(collection_dirs) > 1:
+                print(f"\nWarning: Multiple collection directories found: {[d.name for d in collection_dirs]}")
+                print("Using the first one found.")
+            collection_dir = collection_dirs[0]
+            collection_name = collection_dir.name
+            print(f"\nUsing collection directory: {collection_name}")
+            # Now define the files we need using the found collection name
+            files_to_download = [
+                "chroma/chroma.sqlite3",
+                f"chroma/{collection_name}/data_level0.bin",
+                f"chroma/{collection_name}/header.bin",
+                f"chroma/{collection_name}/index_metadata.pickle",
+                f"chroma/{collection_name}/length.bin",
+                f"chroma/{collection_name}/link_lists.bin"
+            ]
+            print(f"Downloading files to: {data_dir}")
+            for file_path in files_to_download[1:]:  # Skip the first file as we already have it
+                try:
+                    print(f"\nDownloading: {file_path}")
+                    local_path = hf_hub_download(
+                        repo_id=Settings.HF_DATASET,
+                        filename=file_path,
+                        repo_type="dataset",
+                        token=Settings.HF_TOKEN,
+                        cache_dir=data_dir
+                    )
+                    print(f"Downloaded to: {local_path}")
+                    # Get file size
+                    size_mb = Path(local_path).stat().st_size / (1024 * 1024)
+                    print(f"File size: {size_mb:.2f} MB")
+                except Exception as e:
+                    print(f"Error downloading {file_path}: {str(e)}")
+                    raise RuntimeError(f"Failed to download {file_path}: {str(e)}")
+            # Set the chroma directory
+            self.chroma_dir = chroma_dir
+            print(f"Using Chroma directory: {self.chroma_dir}")
+            # Verify all required files are present
+            required_files = {
+                "chroma.sqlite3",
+                f"{collection_name}/data_level0.bin",
+                f"{collection_name}/header.bin",
+                f"{collection_name}/index_metadata.pickle",
+                f"{collection_name}/length.bin",
+                f"{collection_name}/link_lists.bin"
+            }
+            found_files = {p.relative_to(self.chroma_dir).as_posix()
+                          for p in self.chroma_dir.glob("**/*") if p.is_file()}
+            missing_files = required_files - found_files
+            if missing_files:
+                raise RuntimeError(f"Missing required files: {missing_files}")
+            print("All required files downloaded and verified successfully")
         except Exception as e:
             print(f"\n=== Error in _setup_embeddings_from_hf ===")
             print(f"Error type: {type(e).__name__}")
             print(f"Error message: {str(e)}")
+            raise RuntimeError(f"Failed to setup embeddings from HuggingFace: {str(e)}")
     def _list_cache_directory(self, cache_dir_path: str) -> None:
         """List the contents of the cache directory"""
             if Settings.is_huggingface():
                 print("HuggingFace environment detected, setting up embeddings...")
                 self._setup_embeddings_from_hf()
             else:
                 print("Local environment detected")
                 print(f"Base directory: {Settings.BASE_DIR}")
+                # Verify local paths
+                if not self.chroma_dir.exists():
+                    raise RuntimeError(
+                        f"Chroma directory not found at {self.chroma_dir}")
+                sqlite_file = self.chroma_dir / "chroma.sqlite3"
+                print(f"Checking SQLite file: {sqlite_file}")
+                if not sqlite_file.exists():
+                    print(f"Directory contents: {list(self.chroma_dir.glob('**/*'))}")
+                    raise RuntimeError(
+                        f"Chroma database not found at {sqlite_file}")
                 print(
+                    f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
             # Load vector store using environment-aware settings
             print("Initializing Chroma with settings:")
             chroma_settings = Settings.get_chroma_settings()
+            print(f"Using persist directory: {chroma_settings['persist_directory']}")
             self.vector_store = Chroma(
                 persist_directory=chroma_settings["persist_directory"],
                 embedding_function=self.embeddings,
                 # Additional debugging for empty collection
                 print("\nDebug Information:")
                 print(f"Chroma directory structure:")
+                for item in self.chroma_dir.glob('**/*'):
                     print(f"  {item}")
                     if item.is_file():
                         print(
         except Exception as e:
             print(f"Error in generate_lyrics: {str(e)}")
             raise RuntimeError(f"Failed to generate lyrics: {str(e)}")
+    def _examine_sqlite_db(self, db_path: Path) -> None:
+        """Examine the contents of the SQLite database"""
+        try:
+            print(f"\nExamining SQLite database at: {db_path}")
+            conn = sqlite3.connect(db_path)
+            cursor = conn.cursor()
+            # List all tables
+            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+            tables = cursor.fetchall()
+            print("\nTables in database:")
+            for table in tables:
+                print(f"- {table[0]}")
+                # Get column info for each table
+                cursor.execute(f"PRAGMA table_info({table[0]})")
+                columns = cursor.fetchall()
+                for col in columns:
+                    print(f"  - {col[1]} ({col[2]})")
+            # Get collection info
+            print("\nCollections:")
+            cursor.execute("SELECT name, directory FROM collections;")
+            collections = cursor.fetchall()
+            for name, directory in collections:
+                print(f"- Name: {name}")
+                print(f"  Directory: {directory}")
+                # Get count of embeddings
+                cursor.execute("SELECT COUNT(*) FROM embeddings WHERE collection_id = (SELECT id FROM collections WHERE name = ?)", (name,))
+                count = cursor.fetchone()[0]
+                print(f"  Embeddings count: {count}")
+            conn.close()
+        except Exception as e:
+            print(f"Warning: Could not fully examine SQLite database: {e}")