James Edmunds committed on
Commit
0152ed4
·
1 Parent(s): b6d877d

Working locally, trying to fix embeddings on HF

Browse files
config/settings.py CHANGED
@@ -28,6 +28,9 @@ class Settings:
28
  EMBEDDING_MODEL = "text-embedding-ada-002"
29
  LLM_MODEL = "gpt-4"
30
 
 
 
 
31
  @classmethod
32
  def is_huggingface(cls) -> bool:
33
  """Check if running in HuggingFace environment"""
@@ -59,5 +62,6 @@ class Settings:
59
  """Get ChromaDB settings"""
60
  return {
61
  "anonymized_telemetry": False,
62
- "persist_directory": str(cls.get_chroma_path())
 
63
  }
 
28
  EMBEDDING_MODEL = "text-embedding-ada-002"
29
  LLM_MODEL = "gpt-4"
30
 
31
+ # ChromaDB Settings
32
+ CHROMA_COLLECTION_NAME = "langchain"
33
+
34
  @classmethod
35
  def is_huggingface(cls) -> bool:
36
  """Check if running in HuggingFace environment"""
 
62
  """Get ChromaDB settings"""
63
  return {
64
  "anonymized_telemetry": False,
65
+ "persist_directory": str(cls.get_chroma_path()),
66
+ "collection_name": cls.CHROMA_COLLECTION_NAME
67
  }
docs/TROUBLESHOOTING.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Troubleshooting Guide
2
+
3
+ ## Embeddings Issues
4
+
5
+ ### Empty Chroma Collection (0 Documents)
6
+ **Symptom:**
7
+ - ChromaDB shows 0 documents despite large files being present
8
+ - SQLite database shows records (e.g., embeddings: 233998 records)
9
+ - Files exist and have expected sizes:
10
+ - chroma.sqlite3 (~576 MB)
11
+ - data_level0.bin (~1.3 GB)
12
+
13
+ **Cause:**
14
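+ Collection name mismatch between processing and loading. `scripts/process_lyrics.py` called `Chroma.from_documents()` without a `collection_name`, so the embeddings were written to LangChain's default collection, while `src/generator/generator.py` loaded a collection named "lyrics". The store therefore ends up with two collections: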
15
+ - "langchain" (contains the data)
16
+ - "lyrics" (empty)
17
+
18
+ **Solution:**
19
+ Always use "langchain" as the collection name in all operations:
20
+ ```python
21
+ vector_store = Chroma(
22
+ persist_directory=str(chroma_dir),
23
+ embedding_function=embeddings,
24
+ collection_name="langchain" # Must be "langchain"
25
+ )
26
+ ```
27
+
28
+ **Verification:**
29
+ Run the test script to check collections:
30
+
31
+ ```bash
32
+ python scripts/test_embeddings.py
33
+ ```
34
+
35
+ Expected output:
36
+ ```
37
+ Collection names: [Collection(name=langchain), Collection(name=lyrics)]
38
+ Collection count: 233998 # For langchain collection
39
+ ```
40
+
41
+ **Files to Check:**
42
+ 1. config/settings.py: CHROMA_COLLECTION_NAME
43
+ 2. src/generator/generator.py: vector_store initialization
44
+ 3. scripts/process_lyrics.py: Chroma.from_documents() call
scripts/process_lyrics.py CHANGED
@@ -35,6 +35,7 @@ class LyricsProcessor:
35
  self.output_dir = Path(output_dir)
36
  self.batch_size = batch_size
37
  self.embeddings = OpenAIEmbeddings()
 
38
 
39
  # Configure text splitter for lyrics
40
  self.text_splitter = RecursiveCharacterTextSplitter(
@@ -116,7 +117,8 @@ class LyricsProcessor:
116
  vector_store = Chroma.from_documents(
117
  documents=batch,
118
  embedding=self.embeddings,
119
- persist_directory=str(self.output_dir / "chroma")
 
120
  )
121
  else:
122
  # Add subsequent batches
 
35
  self.output_dir = Path(output_dir)
36
  self.batch_size = batch_size
37
  self.embeddings = OpenAIEmbeddings()
38
+ self.collection_name = "langchain"
39
 
40
  # Configure text splitter for lyrics
41
  self.text_splitter = RecursiveCharacterTextSplitter(
 
117
  vector_store = Chroma.from_documents(
118
  documents=batch,
119
  embedding=self.embeddings,
120
+ persist_directory=str(self.output_dir / "chroma"),
121
+ collection_name=self.collection_name
122
  )
123
  else:
124
  # Add subsequent batches
scripts/test_embeddings.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from langchain_openai import OpenAIEmbeddings
3
+ from langchain_chroma import Chroma
4
+ from dotenv import load_dotenv
5
+ import os
6
+ import sqlite3
7
+
8
+ load_dotenv()
9
+
10
+
11
def test_load_embeddings():
    """Print diagnostics for the persisted Chroma embeddings store.

    Two independent checks are run against
    ``<cwd>/data/processed/embeddings/chroma``:

    1. The ``chroma.sqlite3`` file is opened directly and the row count of
       every table is printed.
    2. The store is loaded through ``Chroma`` with the "langchain"
       collection, and the collection list, document count and a peek at
       the stored data are printed.

    Failures in either check are reported on stdout rather than raised, so
    one broken layer does not hide the result of the other. Returns None.
    """
    print("=== Testing Embeddings Load ===")

    collection_name = "langchain"
    base_dir = Path.cwd()
    chroma_dir = base_dir / "data" / "processed" / "embeddings" / "chroma"
    sqlite_file = chroma_dir / "chroma.sqlite3"

    # Test SQLite connection directly
    print("\nTesting SQLite database:")
    conn = None
    try:
        # sqlite3.connect() silently creates a missing database file. An
        # empty chroma.sqlite3 would make later "does the DB exist?" checks
        # pass, so refuse to touch the path unless the file is really there
        # and open it read-only.
        if not sqlite_file.is_file():
            raise FileNotFoundError(f"database file not found: {sqlite_file}")
        conn = sqlite3.connect(f"{sqlite_file.as_uri()}?mode=ro", uri=True)
        cursor = conn.cursor()

        # Check tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        print(f"Found tables: {tables}")

        # Try to count records
        for (table_name,) in tables:
            # Identifiers cannot be bound as parameters; quote them so that
            # names containing spaces or keywords still work.
            quoted = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"SELECT COUNT(*) FROM {quoted};")
            count = cursor.fetchone()[0]
            print(f"Table {table_name}: {count} records")
    except (sqlite3.Error, OSError) as e:
        print(f"SQLite Error: {e}")
    finally:
        if conn is not None:
            conn.close()

    # Now try ChromaDB with langchain collection
    print("\nTesting ChromaDB load:")
    try:
        embeddings = OpenAIEmbeddings(
            openai_api_key=os.getenv("OPENAI_API_KEY")
        )

        db = Chroma(
            persist_directory=str(chroma_dir),
            embedding_function=embeddings,
            collection_name=collection_name,
        )
        print("\nChroma instance created")
        print(f"Collection names: {db._client.list_collections()}")

        # Try to get collection details
        collection = db._client.get_collection(collection_name)
        print(f"\nCollection count: {collection.count()}")
        print(f"Collection peek: {collection.peek()}")

    except Exception as e:
        # Diagnostic boundary: whatever the third-party stack raises is
        # reported instead of aborting the script.
        print(f"\nChroma Error: {e}")
60
+
61
+
62
+ if __name__ == "__main__":
63
+ test_load_embeddings()
src/generator/generator.py CHANGED
@@ -5,7 +5,7 @@ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
5
  from langchain_chroma import Chroma
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.prompts import PromptTemplate
8
- from huggingface_hub import snapshot_download, hf_hub_download
9
  from config.settings import Settings
10
 
11
 
@@ -14,25 +14,25 @@ class LyricGenerator:
14
  """Initialize the generator with embeddings"""
15
  print("Initializing LyricGenerator...")
16
  print(f"Deployment mode: {Settings.DEPLOYMENT_MODE}")
17
-
18
  # Ensure paths exist (if local)
19
  Settings.ensure_embedding_paths()
20
-
21
  # Get and log paths
22
  self.embeddings_dir = Settings.get_embeddings_path()
23
  self.chroma_dir = Settings.get_chroma_path()
24
  print(f"Embeddings directory: {self.embeddings_dir}")
25
  print(f"Chroma directory: {self.chroma_dir}")
26
-
27
  # Initialize OpenAI embeddings
28
  print("Setting up OpenAI embeddings...")
29
  self.embeddings = OpenAIEmbeddings(
30
  openai_api_key=Settings.OPENAI_API_KEY
31
  )
32
-
33
  self.vector_store = None
34
  self.qa_chain = None
35
-
36
  # Load embeddings
37
  self._load_embeddings()
38
 
@@ -45,13 +45,13 @@ class LyricGenerator:
45
  print(f"Target Chroma directory: {chroma_dir}")
46
  print(f"Creating parent directory: {chroma_dir.parent}")
47
  chroma_dir.parent.mkdir(parents=True, exist_ok=True)
48
-
49
  # Check if embeddings already exist in persistent storage
50
  if not chroma_dir.exists() or not (chroma_dir / "chroma.sqlite3").exists():
51
  print("\nDownloading embeddings from HuggingFace dataset...")
52
  print(f"Dataset repo: {Settings.HF_DATASET}")
53
  print(f"Using temp directory: /tmp/embeddings")
54
-
55
  # Download the entire chroma directory from the dataset
56
  try:
57
  temp_dir = snapshot_download(
@@ -64,18 +64,20 @@ class LyricGenerator:
64
  except Exception as e:
65
  print(f"Error during snapshot_download: {str(e)}")
66
  raise
67
-
68
  temp_chroma = Path(temp_dir) / "chroma"
69
  print(f"Looking for Chroma in temp dir at: {temp_chroma}")
70
-
71
  if not temp_chroma.exists():
72
- print(f"Contents of temp_dir: {list(Path(temp_dir).glob('**/*'))}")
 
73
  raise RuntimeError(
74
  f"Chroma directory not found in dataset at {temp_chroma}"
75
  )
76
-
77
- print(f"Found Chroma directory in download. Contents: {list(temp_chroma.glob('**/*'))}")
78
-
 
79
  # Copy the downloaded chroma directory to persistent storage
80
  print(f"\nCopying embeddings to persistent storage...")
81
  if chroma_dir.exists():
@@ -83,9 +85,11 @@ class LyricGenerator:
83
  shutil.rmtree(chroma_dir)
84
  print(f"Copying from {temp_chroma} to {chroma_dir}")
85
  shutil.copytree(temp_chroma, chroma_dir)
86
- print(f"Embeddings copied to persistent storage at {chroma_dir}")
87
- print(f"Persistent storage contents: {list(chroma_dir.glob('**/*'))}")
88
-
 
 
89
  # Clean up temporary directory
90
  print("\nCleaning up temporary directory...")
91
  shutil.rmtree("/tmp/embeddings")
@@ -93,7 +97,7 @@ class LyricGenerator:
93
  else:
94
  print("Embeddings already exist in persistent storage")
95
  print(f"Existing contents: {list(chroma_dir.glob('**/*'))}")
96
-
97
  except Exception as e:
98
  print(f"\n=== Error in _setup_embeddings_from_hf ===")
99
  print(f"Error type: {type(e).__name__}")
@@ -101,7 +105,8 @@ class LyricGenerator:
101
  print(f"Current directory structure:")
102
  try:
103
  print(f"Parent dir exists: {chroma_dir.parent.exists()}")
104
- print(f"Parent dir contents: {list(chroma_dir.parent.glob('**/*'))}")
 
105
  except Exception as dir_error:
106
  print(f"Error checking directories: {str(dir_error)}")
107
  raise RuntimeError(
@@ -112,7 +117,7 @@ class LyricGenerator:
112
  """Load existing embeddings based on environment"""
113
  try:
114
  print("\n=== Loading Embeddings ===")
115
-
116
  # If in HuggingFace environment, ensure embeddings are set up
117
  if Settings.is_huggingface():
118
  print("HuggingFace environment detected, setting up embeddings...")
@@ -120,67 +125,71 @@ class LyricGenerator:
120
  else:
121
  print("Local environment detected")
122
  print(f"Base directory: {Settings.BASE_DIR}")
123
-
124
  print(f"\nLoading vector store from: {self.embeddings_dir}")
125
  # Check Chroma directory structure
126
  chroma_dir = self.embeddings_dir / "chroma"
127
  print(f"Checking Chroma directory: {chroma_dir}")
128
  print(f"Absolute path: {chroma_dir.absolute()}")
129
-
130
  if not chroma_dir.exists():
131
- print(f"Parent directory exists: {self.embeddings_dir.exists()}")
 
132
  if self.embeddings_dir.exists():
133
- print(f"Parent directory contents: {list(self.embeddings_dir.glob('**/*'))}")
134
- raise RuntimeError(f"Chroma directory not found at {chroma_dir}")
135
-
 
 
136
  sqlite_file = chroma_dir / "chroma.sqlite3"
137
  print(f"Checking SQLite file: {sqlite_file}")
138
  if not sqlite_file.exists():
139
  print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
140
- raise RuntimeError(f"Chroma database not found at {sqlite_file}")
141
- print(f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
142
-
 
 
143
  # Load vector store using environment-aware settings
144
  print("Initializing Chroma with settings:")
145
- print(f" persist_directory: {str(chroma_dir)}")
146
- print(f" collection_name: lyrics")
147
-
148
  self.vector_store = Chroma(
149
- persist_directory=str(chroma_dir),
150
  embedding_function=self.embeddings,
151
- collection_name="lyrics"
152
  )
153
-
154
  # Verify collection has documents
155
  collection = self.vector_store._collection
156
  count = collection.count()
157
  print(f"Collection contains {count} documents")
158
-
159
  if count == 0:
160
  print("Collection is empty, checking details...")
161
  # Try to peek at the collection data
162
  peek = collection.peek()
163
  print(f"Collection peek: {peek}")
164
-
165
  # Additional debugging for empty collection
166
  print("\nDebug Information:")
167
  print(f"Chroma directory structure:")
168
  for item in chroma_dir.glob('**/*'):
169
  print(f" {item}")
170
  if item.is_file():
171
- print(f" Size: {item.stat().st_size / (1024*1024):.2f} MB")
172
-
 
173
  raise RuntimeError(
174
  "Chroma DB is empty. Please ensure embeddings "
175
  "were properly generated and uploaded."
176
  )
177
  else:
178
  print("Successfully loaded embeddings")
179
-
180
  except Exception as e:
181
  print(f"Error loading embeddings: {str(e)}")
182
  raise RuntimeError(f"Failed to load embeddings: {str(e)}")
183
-
184
  # Setup QA chain
185
  self._setup_qa_chain()
186
 
@@ -317,25 +326,27 @@ class LyricGenerator:
317
  try:
318
  print("Starting lyrics generation process...")
319
  print(f"Using OpenAI model: {Settings.LLM_MODEL}")
320
-
321
  # Get source documents with scores first
322
  print("Searching for similar documents...")
323
  try:
324
  # Test embeddings function first
325
  print("Testing embeddings function...")
326
  test_embedding = self.embeddings.embed_query("test")
327
- print(f"Embeddings function working (vector size: {len(test_embedding)})")
328
-
 
329
  # Now try similarity search
330
  docs_and_scores = self.vector_store.similarity_search_with_score(
331
  prompt,
332
  k=20
333
  )
334
  print(f"Found {len(docs_and_scores)} similar documents")
335
-
336
  if not docs_and_scores:
337
- print("Warning: No similar documents found. This may affect generation quality.")
338
-
 
339
  except Exception as e:
340
  print(f"Error during similarity search: {str(e)}")
341
  raise RuntimeError(
@@ -353,7 +364,8 @@ class LyricGenerator:
353
  'artist': doc.metadata['artist'],
354
  'song': doc.metadata['song_title'],
355
  'similarity': similarity,
356
- 'content': doc.page_content[:200] + "..." # First 200 chars
 
357
  })
358
 
359
  try:
 
5
  from langchain_chroma import Chroma
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.prompts import PromptTemplate
8
+ from huggingface_hub import snapshot_download
9
  from config.settings import Settings
10
 
11
 
 
14
  """Initialize the generator with embeddings"""
15
  print("Initializing LyricGenerator...")
16
  print(f"Deployment mode: {Settings.DEPLOYMENT_MODE}")
17
+
18
  # Ensure paths exist (if local)
19
  Settings.ensure_embedding_paths()
20
+
21
  # Get and log paths
22
  self.embeddings_dir = Settings.get_embeddings_path()
23
  self.chroma_dir = Settings.get_chroma_path()
24
  print(f"Embeddings directory: {self.embeddings_dir}")
25
  print(f"Chroma directory: {self.chroma_dir}")
26
+
27
  # Initialize OpenAI embeddings
28
  print("Setting up OpenAI embeddings...")
29
  self.embeddings = OpenAIEmbeddings(
30
  openai_api_key=Settings.OPENAI_API_KEY
31
  )
32
+
33
  self.vector_store = None
34
  self.qa_chain = None
35
+
36
  # Load embeddings
37
  self._load_embeddings()
38
 
 
45
  print(f"Target Chroma directory: {chroma_dir}")
46
  print(f"Creating parent directory: {chroma_dir.parent}")
47
  chroma_dir.parent.mkdir(parents=True, exist_ok=True)
48
+
49
  # Check if embeddings already exist in persistent storage
50
  if not chroma_dir.exists() or not (chroma_dir / "chroma.sqlite3").exists():
51
  print("\nDownloading embeddings from HuggingFace dataset...")
52
  print(f"Dataset repo: {Settings.HF_DATASET}")
53
  print(f"Using temp directory: /tmp/embeddings")
54
+
55
  # Download the entire chroma directory from the dataset
56
  try:
57
  temp_dir = snapshot_download(
 
64
  except Exception as e:
65
  print(f"Error during snapshot_download: {str(e)}")
66
  raise
67
+
68
  temp_chroma = Path(temp_dir) / "chroma"
69
  print(f"Looking for Chroma in temp dir at: {temp_chroma}")
70
+
71
  if not temp_chroma.exists():
72
+ print(
73
+ f"Contents of temp_dir: {list(Path(temp_dir).glob('**/*'))}")
74
  raise RuntimeError(
75
  f"Chroma directory not found in dataset at {temp_chroma}"
76
  )
77
+
78
+ print(
79
+ f"Found Chroma directory in download. Contents: {list(temp_chroma.glob('**/*'))}")
80
+
81
  # Copy the downloaded chroma directory to persistent storage
82
  print(f"\nCopying embeddings to persistent storage...")
83
  if chroma_dir.exists():
 
85
  shutil.rmtree(chroma_dir)
86
  print(f"Copying from {temp_chroma} to {chroma_dir}")
87
  shutil.copytree(temp_chroma, chroma_dir)
88
+ print(
89
+ f"Embeddings copied to persistent storage at {chroma_dir}")
90
+ print(
91
+ f"Persistent storage contents: {list(chroma_dir.glob('**/*'))}")
92
+
93
  # Clean up temporary directory
94
  print("\nCleaning up temporary directory...")
95
  shutil.rmtree("/tmp/embeddings")
 
97
  else:
98
  print("Embeddings already exist in persistent storage")
99
  print(f"Existing contents: {list(chroma_dir.glob('**/*'))}")
100
+
101
  except Exception as e:
102
  print(f"\n=== Error in _setup_embeddings_from_hf ===")
103
  print(f"Error type: {type(e).__name__}")
 
105
  print(f"Current directory structure:")
106
  try:
107
  print(f"Parent dir exists: {chroma_dir.parent.exists()}")
108
+ print(
109
+ f"Parent dir contents: {list(chroma_dir.parent.glob('**/*'))}")
110
  except Exception as dir_error:
111
  print(f"Error checking directories: {str(dir_error)}")
112
  raise RuntimeError(
 
117
  """Load existing embeddings based on environment"""
118
  try:
119
  print("\n=== Loading Embeddings ===")
120
+
121
  # If in HuggingFace environment, ensure embeddings are set up
122
  if Settings.is_huggingface():
123
  print("HuggingFace environment detected, setting up embeddings...")
 
125
  else:
126
  print("Local environment detected")
127
  print(f"Base directory: {Settings.BASE_DIR}")
128
+
129
  print(f"\nLoading vector store from: {self.embeddings_dir}")
130
  # Check Chroma directory structure
131
  chroma_dir = self.embeddings_dir / "chroma"
132
  print(f"Checking Chroma directory: {chroma_dir}")
133
  print(f"Absolute path: {chroma_dir.absolute()}")
134
+
135
  if not chroma_dir.exists():
136
+ print(
137
+ f"Parent directory exists: {self.embeddings_dir.exists()}")
138
  if self.embeddings_dir.exists():
139
+ print(
140
+ f"Parent directory contents: {list(self.embeddings_dir.glob('**/*'))}")
141
+ raise RuntimeError(
142
+ f"Chroma directory not found at {chroma_dir}")
143
+
144
  sqlite_file = chroma_dir / "chroma.sqlite3"
145
  print(f"Checking SQLite file: {sqlite_file}")
146
  if not sqlite_file.exists():
147
  print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
148
+ raise RuntimeError(
149
+ f"Chroma database not found at {sqlite_file}")
150
+ print(
151
+ f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
152
+
153
  # Load vector store using environment-aware settings
154
  print("Initializing Chroma with settings:")
155
+ chroma_settings = Settings.get_chroma_settings()
 
 
156
  self.vector_store = Chroma(
157
+ persist_directory=chroma_settings["persist_directory"],
158
  embedding_function=self.embeddings,
159
+ collection_name=chroma_settings["collection_name"]
160
  )
161
+
162
  # Verify collection has documents
163
  collection = self.vector_store._collection
164
  count = collection.count()
165
  print(f"Collection contains {count} documents")
166
+
167
  if count == 0:
168
  print("Collection is empty, checking details...")
169
  # Try to peek at the collection data
170
  peek = collection.peek()
171
  print(f"Collection peek: {peek}")
172
+
173
  # Additional debugging for empty collection
174
  print("\nDebug Information:")
175
  print(f"Chroma directory structure:")
176
  for item in chroma_dir.glob('**/*'):
177
  print(f" {item}")
178
  if item.is_file():
179
+ print(
180
+ f" Size: {item.stat().st_size / (1024*1024):.2f} MB")
181
+
182
  raise RuntimeError(
183
  "Chroma DB is empty. Please ensure embeddings "
184
  "were properly generated and uploaded."
185
  )
186
  else:
187
  print("Successfully loaded embeddings")
188
+
189
  except Exception as e:
190
  print(f"Error loading embeddings: {str(e)}")
191
  raise RuntimeError(f"Failed to load embeddings: {str(e)}")
192
+
193
  # Setup QA chain
194
  self._setup_qa_chain()
195
 
 
326
  try:
327
  print("Starting lyrics generation process...")
328
  print(f"Using OpenAI model: {Settings.LLM_MODEL}")
329
+
330
  # Get source documents with scores first
331
  print("Searching for similar documents...")
332
  try:
333
  # Test embeddings function first
334
  print("Testing embeddings function...")
335
  test_embedding = self.embeddings.embed_query("test")
336
+ print(
337
+ f"Embeddings function working (vector size: {len(test_embedding)})")
338
+
339
  # Now try similarity search
340
  docs_and_scores = self.vector_store.similarity_search_with_score(
341
  prompt,
342
  k=20
343
  )
344
  print(f"Found {len(docs_and_scores)} similar documents")
345
+
346
  if not docs_and_scores:
347
+ print(
348
+ "Warning: No similar documents found. This may affect generation quality.")
349
+
350
  except Exception as e:
351
  print(f"Error during similarity search: {str(e)}")
352
  raise RuntimeError(
 
364
  'artist': doc.metadata['artist'],
365
  'song': doc.metadata['song_title'],
366
  'similarity': similarity,
367
+ # First 200 chars
368
+ 'content': doc.page_content[:200] + "..."
369
  })
370
 
371
  try: