James Edmunds committed on
Commit
576366e
·
1 Parent(s): 5909095

feat: add comprehensive embeddings and vector store debugging

Browse files
Files changed (1) hide show
  1. src/generator/generator.py +50 -12
src/generator/generator.py CHANGED
@@ -11,8 +11,16 @@ from config.settings import Settings
11
  class LyricGenerator:
12
  def __init__(self):
13
  """Initialize the generator with embeddings"""
 
14
  self.embeddings_dir = Settings.get_embeddings_path()
15
- self.embeddings = OpenAIEmbeddings()
 
 
 
 
 
 
 
16
  self.vector_store = None
17
  self.qa_chain = None
18
 
@@ -24,12 +32,11 @@ class LyricGenerator:
24
  if Settings.is_huggingface():
25
  try:
26
  print(f"Loading embeddings from HF dataset: {Settings.HF_DATASET}")
27
- # Download dataset to Space's storage (cached after first download)
28
  local_dir = Path(snapshot_download(
29
  repo_id=Settings.HF_DATASET,
30
  repo_type="dataset",
31
- token=Settings.HF_TOKEN,
32
- local_dir="/tmp/chroma_db" # Specify a fixed location
33
  ))
34
  print(f"Dataset downloaded to: {local_dir}")
35
 
@@ -37,18 +44,25 @@ class LyricGenerator:
37
  chroma_dir = local_dir / "chroma"
38
  if not chroma_dir.exists():
39
  raise RuntimeError(
40
- f"Chroma directory not found at {chroma_dir}. "
41
- "Please ensure the dataset contains a 'chroma' directory."
42
  )
43
 
 
44
  # Initialize vector store from the cached location
45
  self.vector_store = Chroma(
46
  persist_directory=str(chroma_dir),
47
  embedding_function=self.embeddings,
48
  collection_name="lyrics"
49
  )
50
- print("Successfully loaded vector store from Space storage")
 
 
 
 
 
 
51
  except Exception as e:
 
52
  raise RuntimeError(f"Failed to load HF embeddings: {str(e)}")
53
  else:
54
  if not self.embeddings_dir.exists():
@@ -58,13 +72,21 @@ class LyricGenerator:
58
  )
59
 
60
  try:
 
61
  # Load vector store using environment-aware settings
62
  self.vector_store = Chroma(
63
  persist_directory=str(self.embeddings_dir),
64
  embedding_function=self.embeddings,
65
  collection_name="lyrics"
66
  )
 
 
 
 
 
 
67
  except Exception as e:
 
68
  raise RuntimeError(f"Failed to load local embeddings: {str(e)}")
69
 
70
  # Setup QA chain
@@ -206,11 +228,27 @@ class LyricGenerator:
206
 
207
  # Get source documents with scores first
208
  print("Searching for similar documents...")
209
- docs_and_scores = self.vector_store.similarity_search_with_score(
210
- prompt,
211
- k=20
212
- )
213
- print(f"Found {len(docs_and_scores)} similar documents")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  # Sort by similarity (convert distance to similarity)
216
  docs_and_scores.sort(key=lambda x: x[1], reverse=False)
 
11
  class LyricGenerator:
12
  def __init__(self):
13
  """Initialize the generator with embeddings"""
14
+ print("Initializing LyricGenerator...")
15
  self.embeddings_dir = Settings.get_embeddings_path()
16
+ print(f"Embeddings directory: {self.embeddings_dir}")
17
+
18
+ # Initialize OpenAI embeddings
19
+ print("Setting up OpenAI embeddings...")
20
+ self.embeddings = OpenAIEmbeddings(
21
+ openai_api_key=Settings.OPENAI_API_KEY
22
+ )
23
+
24
  self.vector_store = None
25
  self.qa_chain = None
26
 
 
32
  if Settings.is_huggingface():
33
  try:
34
  print(f"Loading embeddings from HF dataset: {Settings.HF_DATASET}")
35
+ # Download dataset to Space's storage
36
  local_dir = Path(snapshot_download(
37
  repo_id=Settings.HF_DATASET,
38
  repo_type="dataset",
39
+ local_dir="/tmp/chroma_db"
 
40
  ))
41
  print(f"Dataset downloaded to: {local_dir}")
42
 
 
44
  chroma_dir = local_dir / "chroma"
45
  if not chroma_dir.exists():
46
  raise RuntimeError(
47
+ f"Chroma directory not found at {chroma_dir}"
 
48
  )
49
 
50
+ print(f"Loading Chroma DB from: {chroma_dir}")
51
  # Initialize vector store from the cached location
52
  self.vector_store = Chroma(
53
  persist_directory=str(chroma_dir),
54
  embedding_function=self.embeddings,
55
  collection_name="lyrics"
56
  )
57
+ print("Successfully loaded vector store")
58
+
59
+ # Verify collection has documents
60
+ collection = self.vector_store._collection
61
+ count = collection.count()
62
+ print(f"Collection contains {count} documents")
63
+
64
  except Exception as e:
65
+ print(f"Error loading HF embeddings: {str(e)}")
66
  raise RuntimeError(f"Failed to load HF embeddings: {str(e)}")
67
  else:
68
  if not self.embeddings_dir.exists():
 
72
  )
73
 
74
  try:
75
+ print(f"Loading local vector store from: {self.embeddings_dir}")
76
  # Load vector store using environment-aware settings
77
  self.vector_store = Chroma(
78
  persist_directory=str(self.embeddings_dir),
79
  embedding_function=self.embeddings,
80
  collection_name="lyrics"
81
  )
82
+
83
+ # Verify collection has documents
84
+ collection = self.vector_store._collection
85
+ count = collection.count()
86
+ print(f"Collection contains {count} documents")
87
+
88
  except Exception as e:
89
+ print(f"Error loading local embeddings: {str(e)}")
90
  raise RuntimeError(f"Failed to load local embeddings: {str(e)}")
91
 
92
  # Setup QA chain
 
228
 
229
  # Get source documents with scores first
230
  print("Searching for similar documents...")
231
+ try:
232
+ # Test embeddings function first
233
+ print("Testing embeddings function...")
234
+ test_embedding = self.embeddings.embed_query("test")
235
+ print(f"Embeddings function working (vector size: {len(test_embedding)})")
236
+
237
+ # Now try similarity search
238
+ docs_and_scores = self.vector_store.similarity_search_with_score(
239
+ prompt,
240
+ k=20
241
+ )
242
+ print(f"Found {len(docs_and_scores)} similar documents")
243
+
244
+ if not docs_and_scores:
245
+ print("Warning: No similar documents found. This may affect generation quality.")
246
+
247
+ except Exception as e:
248
+ print(f"Error during similarity search: {str(e)}")
249
+ raise RuntimeError(
250
+ f"Failed to search vector store: {str(e)}"
251
+ )
252
 
253
  # Sort by similarity (convert distance to similarity)
254
  docs_and_scores.sort(key=lambda x: x[1], reverse=False)