Spaces:

SongLift
/

LyrGen2

Running

App Files Files Community

James Edmunds committed on Dec 14, 2024

Commit

0152ed4

1 Parent(s): b6d877d

Working locally, trying to fix embeddings on HF

Browse files

Files changed (5) hide show

config/settings.py +5 -1
docs/TROUBLESHOOTING.md +44 -0
scripts/process_lyrics.py +3 -1
scripts/test_embeddings.py +63 -0
src/generator/generator.py +60 -48

config/settings.py CHANGED Viewed

@@ -28,6 +28,9 @@ class Settings:
     EMBEDDING_MODEL = "text-embedding-ada-002"
     LLM_MODEL = "gpt-4"
     @classmethod
     def is_huggingface(cls) -> bool:
         """Check if running in HuggingFace environment"""
@@ -59,5 +62,6 @@ class Settings:
         """Get ChromaDB settings"""
         return {
             "anonymized_telemetry": False,
-            "persist_directory": str(cls.get_chroma_path())
         }

     EMBEDDING_MODEL = "text-embedding-ada-002"
     LLM_MODEL = "gpt-4"
+    # ChromaDB Settings
+    CHROMA_COLLECTION_NAME = "langchain"
     @classmethod
     def is_huggingface(cls) -> bool:
         """Check if running in HuggingFace environment"""
         """Get ChromaDB settings"""
         return {
             "anonymized_telemetry": False,
+            "persist_directory": str(cls.get_chroma_path()),
+            "collection_name": cls.CHROMA_COLLECTION_NAME
         }

docs/TROUBLESHOOTING.md ADDED Viewed

	@@ -0,0 +1,44 @@

+# Troubleshooting Guide
+## Embeddings Issues
+### Empty Chroma Collection (0 Documents)
+**Symptom:**
+- ChromaDB shows 0 documents despite large files being present
+- SQLite database shows records (e.g., embeddings: 233998 records)
+- Files exist and have expected sizes:
+  - chroma.sqlite3 (~576 MB)
+  - data_level0.bin (~1.3 GB)
+**Cause:**
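+Collection name mismatch between processing and loading. The processing script created the vector store without passing `collection_name`, so the embeddings were written to LangChain's default collection ("langchain"), while the generator opened a collection named "lyrics". As a result the persisted database contains two collections: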
+- "langchain" (contains the data)
+- "lyrics" (empty)
+**Solution:**
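+Always use "langchain" as the collection name in all operations (it is defined once as `Settings.CHROMA_COLLECTION_NAME` in config/settings.py; prefer referencing that constant over repeating the literal):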
+```python
+vector_store = Chroma(
+    persist_directory=str(chroma_dir),
+    embedding_function=embeddings,
+    collection_name="langchain"  # Must be "langchain"
+)
+```
+**Verification:**
+Run the test script to check collections:
+```bash
+python scripts/test_embeddings.py
+```
+Expected output:
+```
+Collection names: [Collection(name=langchain), Collection(name=lyrics)]
+Collection count: 233998  # For langchain collection
+```
+**Files to Check:**
+1. config/settings.py: CHROMA_COLLECTION_NAME
+2. src/generator/generator.py: vector_store initialization
+3. scripts/process_lyrics.py: Chroma.from_documents() call

scripts/process_lyrics.py CHANGED Viewed

@@ -35,6 +35,7 @@ class LyricsProcessor:
         self.output_dir = Path(output_dir)
         self.batch_size = batch_size
         self.embeddings = OpenAIEmbeddings()
         # Configure text splitter for lyrics
         self.text_splitter = RecursiveCharacterTextSplitter(
@@ -116,7 +117,8 @@ class LyricsProcessor:
                     vector_store = Chroma.from_documents(
                         documents=batch,
                         embedding=self.embeddings,
-                        persist_directory=str(self.output_dir / "chroma")
                     )
                 else:
                     # Add subsequent batches

         self.output_dir = Path(output_dir)
         self.batch_size = batch_size
         self.embeddings = OpenAIEmbeddings()
+        self.collection_name = "langchain"
         # Configure text splitter for lyrics
         self.text_splitter = RecursiveCharacterTextSplitter(
                     vector_store = Chroma.from_documents(
                         documents=batch,
                         embedding=self.embeddings,
+                        persist_directory=str(self.output_dir / "chroma"),
+                        collection_name=self.collection_name
                     )
                 else:
                     # Add subsequent batches

scripts/test_embeddings.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from pathlib import Path
+from langchain_openai import OpenAIEmbeddings
+from langchain_chroma import Chroma
+from dotenv import load_dotenv
+import os
+import sqlite3
+load_dotenv()
+def test_load_embeddings():
+    print("=== Testing Embeddings Load ===")
+    base_dir = Path.cwd()
+    chroma_dir = base_dir / "data" / "processed" / "embeddings" / "chroma"
+    # Test SQLite connection directly
+    print("\nTesting SQLite database:")
+    try:
+        conn = sqlite3.connect(str(chroma_dir / "chroma.sqlite3"))
+        cursor = conn.cursor()
+        # Check tables
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+        tables = cursor.fetchall()
+        print(f"Found tables: {tables}")
+        # Try to count records
+        for table in tables:
+            cursor.execute(f"SELECT COUNT(*) FROM {table[0]};")
+            count = cursor.fetchone()[0]
+            print(f"Table {table[0]}: {count} records")
+    except Exception as e:
+        print(f"SQLite Error: {str(e)}")
+    finally:
+        if 'conn' in locals():
+            conn.close()
+    # Now try ChromaDB with langchain collection
+    print("\nTesting ChromaDB load:")
+    try:
+        embeddings = OpenAIEmbeddings(
+            openai_api_key=os.getenv("OPENAI_API_KEY")
+        )
+        db = Chroma(
+            persist_directory=str(chroma_dir),
+            embedding_function=embeddings,
+            collection_name="langchain"
+        )
+        print("\nChroma instance created")
+        print(f"Collection names: {db._client.list_collections()}")
+        # Try to get collection details
+        collection = db._client.get_collection("langchain")
+        print(f"\nCollection count: {collection.count()}")
+        print(f"Collection peek: {collection.peek()}")
+    except Exception as e:
+        print(f"\nChroma Error: {str(e)}")
+if __name__ == "__main__":
+    test_load_embeddings()

src/generator/generator.py CHANGED Viewed

@@ -5,7 +5,7 @@ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_chroma import Chroma
 from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
-from huggingface_hub import snapshot_download, hf_hub_download
 from config.settings import Settings
@@ -14,25 +14,25 @@ class LyricGenerator:
         """Initialize the generator with embeddings"""
         print("Initializing LyricGenerator...")
         print(f"Deployment mode: {Settings.DEPLOYMENT_MODE}")
         # Ensure paths exist (if local)
         Settings.ensure_embedding_paths()
         # Get and log paths
         self.embeddings_dir = Settings.get_embeddings_path()
         self.chroma_dir = Settings.get_chroma_path()
         print(f"Embeddings directory: {self.embeddings_dir}")
         print(f"Chroma directory: {self.chroma_dir}")
         # Initialize OpenAI embeddings
         print("Setting up OpenAI embeddings...")
         self.embeddings = OpenAIEmbeddings(
             openai_api_key=Settings.OPENAI_API_KEY
         )
         self.vector_store = None
         self.qa_chain = None
         # Load embeddings
         self._load_embeddings()
@@ -45,13 +45,13 @@ class LyricGenerator:
             print(f"Target Chroma directory: {chroma_dir}")
             print(f"Creating parent directory: {chroma_dir.parent}")
             chroma_dir.parent.mkdir(parents=True, exist_ok=True)
             # Check if embeddings already exist in persistent storage
             if not chroma_dir.exists() or not (chroma_dir / "chroma.sqlite3").exists():
                 print("\nDownloading embeddings from HuggingFace dataset...")
                 print(f"Dataset repo: {Settings.HF_DATASET}")
                 print(f"Using temp directory: /tmp/embeddings")
                 # Download the entire chroma directory from the dataset
                 try:
                     temp_dir = snapshot_download(
@@ -64,18 +64,20 @@ class LyricGenerator:
                 except Exception as e:
                     print(f"Error during snapshot_download: {str(e)}")
                     raise
                 temp_chroma = Path(temp_dir) / "chroma"
                 print(f"Looking for Chroma in temp dir at: {temp_chroma}")
                 if not temp_chroma.exists():
-                    print(f"Contents of temp_dir: {list(Path(temp_dir).glob('**/*'))}")
                     raise RuntimeError(
                         f"Chroma directory not found in dataset at {temp_chroma}"
                     )
-                print(f"Found Chroma directory in download. Contents: {list(temp_chroma.glob('**/*'))}")
                 # Copy the downloaded chroma directory to persistent storage
                 print(f"\nCopying embeddings to persistent storage...")
                 if chroma_dir.exists():
@@ -83,9 +85,11 @@ class LyricGenerator:
                     shutil.rmtree(chroma_dir)
                 print(f"Copying from {temp_chroma} to {chroma_dir}")
                 shutil.copytree(temp_chroma, chroma_dir)
-                print(f"Embeddings copied to persistent storage at {chroma_dir}")
-                print(f"Persistent storage contents: {list(chroma_dir.glob('**/*'))}")
                 # Clean up temporary directory
                 print("\nCleaning up temporary directory...")
                 shutil.rmtree("/tmp/embeddings")
@@ -93,7 +97,7 @@ class LyricGenerator:
             else:
                 print("Embeddings already exist in persistent storage")
                 print(f"Existing contents: {list(chroma_dir.glob('**/*'))}")
         except Exception as e:
             print(f"\n=== Error in _setup_embeddings_from_hf ===")
             print(f"Error type: {type(e).__name__}")
@@ -101,7 +105,8 @@ class LyricGenerator:
             print(f"Current directory structure:")
             try:
                 print(f"Parent dir exists: {chroma_dir.parent.exists()}")
-                print(f"Parent dir contents: {list(chroma_dir.parent.glob('**/*'))}")
             except Exception as dir_error:
                 print(f"Error checking directories: {str(dir_error)}")
             raise RuntimeError(
@@ -112,7 +117,7 @@ class LyricGenerator:
         """Load existing embeddings based on environment"""
         try:
             print("\n=== Loading Embeddings ===")
             # If in HuggingFace environment, ensure embeddings are set up
             if Settings.is_huggingface():
                 print("HuggingFace environment detected, setting up embeddings...")
@@ -120,67 +125,71 @@ class LyricGenerator:
             else:
                 print("Local environment detected")
                 print(f"Base directory: {Settings.BASE_DIR}")
             print(f"\nLoading vector store from: {self.embeddings_dir}")
             # Check Chroma directory structure
             chroma_dir = self.embeddings_dir / "chroma"
             print(f"Checking Chroma directory: {chroma_dir}")
             print(f"Absolute path: {chroma_dir.absolute()}")
             if not chroma_dir.exists():
-                print(f"Parent directory exists: {self.embeddings_dir.exists()}")
                 if self.embeddings_dir.exists():
-                    print(f"Parent directory contents: {list(self.embeddings_dir.glob('**/*'))}")
-                raise RuntimeError(f"Chroma directory not found at {chroma_dir}")
             sqlite_file = chroma_dir / "chroma.sqlite3"
             print(f"Checking SQLite file: {sqlite_file}")
             if not sqlite_file.exists():
                 print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
-                raise RuntimeError(f"Chroma database not found at {sqlite_file}")
-            print(f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
             # Load vector store using environment-aware settings
             print("Initializing Chroma with settings:")
-            print(f"  persist_directory: {str(chroma_dir)}")
-            print(f"  collection_name: lyrics")
             self.vector_store = Chroma(
-                persist_directory=str(chroma_dir),
                 embedding_function=self.embeddings,
-                collection_name="lyrics"
             )
             # Verify collection has documents
             collection = self.vector_store._collection
             count = collection.count()
             print(f"Collection contains {count} documents")
             if count == 0:
                 print("Collection is empty, checking details...")
                 # Try to peek at the collection data
                 peek = collection.peek()
                 print(f"Collection peek: {peek}")
                 # Additional debugging for empty collection
                 print("\nDebug Information:")
                 print(f"Chroma directory structure:")
                 for item in chroma_dir.glob('**/*'):
                     print(f"  {item}")
                     if item.is_file():
-                        print(f"    Size: {item.stat().st_size / (1024*1024):.2f} MB")
                 raise RuntimeError(
                     "Chroma DB is empty. Please ensure embeddings "
                     "were properly generated and uploaded."
                 )
             else:
                 print("Successfully loaded embeddings")
         except Exception as e:
             print(f"Error loading embeddings: {str(e)}")
             raise RuntimeError(f"Failed to load embeddings: {str(e)}")
         # Setup QA chain
         self._setup_qa_chain()
@@ -317,25 +326,27 @@ class LyricGenerator:
         try:
             print("Starting lyrics generation process...")
             print(f"Using OpenAI model: {Settings.LLM_MODEL}")
             # Get source documents with scores first
             print("Searching for similar documents...")
             try:
                 # Test embeddings function first
                 print("Testing embeddings function...")
                 test_embedding = self.embeddings.embed_query("test")
-                print(f"Embeddings function working (vector size: {len(test_embedding)})")
                 # Now try similarity search
                 docs_and_scores = self.vector_store.similarity_search_with_score(
                     prompt,
                     k=20
                 )
                 print(f"Found {len(docs_and_scores)} similar documents")
                 if not docs_and_scores:
-                    print("Warning: No similar documents found. This may affect generation quality.")
             except Exception as e:
                 print(f"Error during similarity search: {str(e)}")
                 raise RuntimeError(
@@ -353,7 +364,8 @@ class LyricGenerator:
                     'artist': doc.metadata['artist'],
                     'song': doc.metadata['song_title'],
                     'similarity': similarity,
-                    'content': doc.page_content[:200] + "..."  # First 200 chars
                 })
             try:

 from langchain_chroma import Chroma
 from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
+from huggingface_hub import snapshot_download
 from config.settings import Settings
         """Initialize the generator with embeddings"""
         print("Initializing LyricGenerator...")
         print(f"Deployment mode: {Settings.DEPLOYMENT_MODE}")
         # Ensure paths exist (if local)
         Settings.ensure_embedding_paths()
         # Get and log paths
         self.embeddings_dir = Settings.get_embeddings_path()
         self.chroma_dir = Settings.get_chroma_path()
         print(f"Embeddings directory: {self.embeddings_dir}")
         print(f"Chroma directory: {self.chroma_dir}")
         # Initialize OpenAI embeddings
         print("Setting up OpenAI embeddings...")
         self.embeddings = OpenAIEmbeddings(
             openai_api_key=Settings.OPENAI_API_KEY
         )
         self.vector_store = None
         self.qa_chain = None
         # Load embeddings
         self._load_embeddings()
             print(f"Target Chroma directory: {chroma_dir}")
             print(f"Creating parent directory: {chroma_dir.parent}")
             chroma_dir.parent.mkdir(parents=True, exist_ok=True)
             # Check if embeddings already exist in persistent storage
             if not chroma_dir.exists() or not (chroma_dir / "chroma.sqlite3").exists():
                 print("\nDownloading embeddings from HuggingFace dataset...")
                 print(f"Dataset repo: {Settings.HF_DATASET}")
                 print(f"Using temp directory: /tmp/embeddings")
                 # Download the entire chroma directory from the dataset
                 try:
                     temp_dir = snapshot_download(
                 except Exception as e:
                     print(f"Error during snapshot_download: {str(e)}")
                     raise
                 temp_chroma = Path(temp_dir) / "chroma"
                 print(f"Looking for Chroma in temp dir at: {temp_chroma}")
                 if not temp_chroma.exists():
+                    print(
+                        f"Contents of temp_dir: {list(Path(temp_dir).glob('**/*'))}")
                     raise RuntimeError(
                         f"Chroma directory not found in dataset at {temp_chroma}"
                     )
+                print(
+                    f"Found Chroma directory in download. Contents: {list(temp_chroma.glob('**/*'))}")
                 # Copy the downloaded chroma directory to persistent storage
                 print(f"\nCopying embeddings to persistent storage...")
                 if chroma_dir.exists():
                     shutil.rmtree(chroma_dir)
                 print(f"Copying from {temp_chroma} to {chroma_dir}")
                 shutil.copytree(temp_chroma, chroma_dir)
+                print(
+                    f"Embeddings copied to persistent storage at {chroma_dir}")
+                print(
+                    f"Persistent storage contents: {list(chroma_dir.glob('**/*'))}")
                 # Clean up temporary directory
                 print("\nCleaning up temporary directory...")
                 shutil.rmtree("/tmp/embeddings")
             else:
                 print("Embeddings already exist in persistent storage")
                 print(f"Existing contents: {list(chroma_dir.glob('**/*'))}")
         except Exception as e:
             print(f"\n=== Error in _setup_embeddings_from_hf ===")
             print(f"Error type: {type(e).__name__}")
             print(f"Current directory structure:")
             try:
                 print(f"Parent dir exists: {chroma_dir.parent.exists()}")
+                print(
+                    f"Parent dir contents: {list(chroma_dir.parent.glob('**/*'))}")
             except Exception as dir_error:
                 print(f"Error checking directories: {str(dir_error)}")
             raise RuntimeError(
         """Load existing embeddings based on environment"""
         try:
             print("\n=== Loading Embeddings ===")
             # If in HuggingFace environment, ensure embeddings are set up
             if Settings.is_huggingface():
                 print("HuggingFace environment detected, setting up embeddings...")
             else:
                 print("Local environment detected")
                 print(f"Base directory: {Settings.BASE_DIR}")
             print(f"\nLoading vector store from: {self.embeddings_dir}")
             # Check Chroma directory structure
             chroma_dir = self.embeddings_dir / "chroma"
             print(f"Checking Chroma directory: {chroma_dir}")
             print(f"Absolute path: {chroma_dir.absolute()}")
             if not chroma_dir.exists():
+                print(
+                    f"Parent directory exists: {self.embeddings_dir.exists()}")
                 if self.embeddings_dir.exists():
+                    print(
+                        f"Parent directory contents: {list(self.embeddings_dir.glob('**/*'))}")
+                raise RuntimeError(
+                    f"Chroma directory not found at {chroma_dir}")
             sqlite_file = chroma_dir / "chroma.sqlite3"
             print(f"Checking SQLite file: {sqlite_file}")
             if not sqlite_file.exists():
                 print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
+                raise RuntimeError(
+                    f"Chroma database not found at {sqlite_file}")
+            print(
+                f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
             # Load vector store using environment-aware settings
             print("Initializing Chroma with settings:")
+            chroma_settings = Settings.get_chroma_settings()
             self.vector_store = Chroma(
+                persist_directory=chroma_settings["persist_directory"],
                 embedding_function=self.embeddings,
+                collection_name=chroma_settings["collection_name"]
             )
             # Verify collection has documents
             collection = self.vector_store._collection
             count = collection.count()
             print(f"Collection contains {count} documents")
             if count == 0:
                 print("Collection is empty, checking details...")
                 # Try to peek at the collection data
                 peek = collection.peek()
                 print(f"Collection peek: {peek}")
                 # Additional debugging for empty collection
                 print("\nDebug Information:")
                 print(f"Chroma directory structure:")
                 for item in chroma_dir.glob('**/*'):
                     print(f"  {item}")
                     if item.is_file():
+                        print(
+                            f"    Size: {item.stat().st_size / (1024*1024):.2f} MB")
                 raise RuntimeError(
                     "Chroma DB is empty. Please ensure embeddings "
                     "were properly generated and uploaded."
                 )
             else:
                 print("Successfully loaded embeddings")
         except Exception as e:
             print(f"Error loading embeddings: {str(e)}")
             raise RuntimeError(f"Failed to load embeddings: {str(e)}")
         # Setup QA chain
         self._setup_qa_chain()
         try:
             print("Starting lyrics generation process...")
             print(f"Using OpenAI model: {Settings.LLM_MODEL}")
             # Get source documents with scores first
             print("Searching for similar documents...")
             try:
                 # Test embeddings function first
                 print("Testing embeddings function...")
                 test_embedding = self.embeddings.embed_query("test")
+                print(
+                    f"Embeddings function working (vector size: {len(test_embedding)})")
                 # Now try similarity search
                 docs_and_scores = self.vector_store.similarity_search_with_score(
                     prompt,
                     k=20
                 )
                 print(f"Found {len(docs_and_scores)} similar documents")
                 if not docs_and_scores:
+                    print(
+                        "Warning: No similar documents found. This may affect generation quality.")
             except Exception as e:
                 print(f"Error during similarity search: {str(e)}")
                 raise RuntimeError(
                     'artist': doc.metadata['artist'],
                     'song': doc.metadata['song_title'],
                     'similarity': similarity,
+                    # First 200 chars
+                    'content': doc.page_content[:200] + "..."
                 })
             try: