James Edmunds committed on
Commit
33731b6
·
1 Parent(s): fc18f00

local running, more fixes

Browse files
Files changed (1) hide show
  1. src/generator/generator.py +37 -49
src/generator/generator.py CHANGED
@@ -7,9 +7,10 @@ from langchain.chains import ConversationalRetrievalChain
7
  from langchain.prompts import PromptTemplate
8
  from huggingface_hub import snapshot_download, hf_hub_download, HfApi
9
  from config.settings import Settings
10
- from tenacity import retry, stop_after_attempt, wait_exponential
11
  from datasets import load_dataset
12
  import sqlite3
 
13
 
14
 
15
  class LyricGenerator:
@@ -30,21 +31,36 @@ class LyricGenerator:
30
  print(f"Embeddings directory: {self.embeddings_dir}")
31
  print(f"Chroma directory: {self.chroma_dir}")
32
 
33
- # Initialize OpenAI embeddings
34
  print("Setting up OpenAI embeddings...")
35
  if not Settings.OPENAI_API_KEY:
36
  raise RuntimeError(
37
  "OpenAI API key is not set. Please configure it in your environment variables or HuggingFace Secrets.")
38
- self.embeddings = OpenAIEmbeddings(
39
- openai_api_key=Settings.OPENAI_API_KEY
40
- )
41
-
42
  self.vector_store = None
43
  self.qa_chain = None
44
 
45
  # Load embeddings
46
  self._load_embeddings()
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def _setup_embeddings_from_hf(self) -> None:
49
  """Download and setup embeddings from HuggingFace dataset"""
50
  print("\n=== Setting up embeddings from HuggingFace dataset ===")
@@ -293,19 +309,19 @@ class LyricGenerator:
293
  )
294
 
295
  @retry(
296
- stop=stop_after_attempt(3),
297
- wait=wait_exponential(multiplier=1, min=4, max=10)
 
298
  )
299
- def _embed_query(self, text: str):
300
- """Embed query with retry logic"""
301
  try:
302
- return self.embeddings.embed_query(text)
 
 
 
303
  except Exception as e:
304
- print(f"Embedding error: {str(e)}")
305
- if "api_key" in str(e).lower():
306
- raise RuntimeError(
307
- "OpenAI API key validation failed. Please check your API key."
308
- )
309
  raise
310
 
311
  def generate_lyrics(
@@ -330,51 +346,23 @@ class LyricGenerator:
330
  print("Starting lyrics generation process...")
331
  print(f"Using OpenAI model: {Settings.LLM_MODEL}")
332
 
333
- # Get source documents with scores first
334
  print("Searching for similar documents...")
335
  try:
336
- # Test embeddings function with better error handling
337
- print("Testing embeddings function...")
338
- try:
339
- test_embedding = self._embed_query("test")
340
- print(
341
- f"Embeddings function working (vector size: {len(test_embedding)})")
342
- except Exception as e:
343
- print(f"OpenAI API Connection Error: {str(e)}")
344
- print("Checking API key...")
345
- if not Settings.OPENAI_API_KEY:
346
- raise RuntimeError(
347
- "OpenAI API key not found in environment")
348
- raise RuntimeError(
349
- f"OpenAI API connection failed: {str(e)}")
350
-
351
- # Now try similarity search
352
- docs_and_scores = self.vector_store.similarity_search_with_score(
353
- prompt,
354
- k=20
355
- )
356
- print(f"Found {len(docs_and_scores)} similar documents")
357
-
358
- if not docs_and_scores:
359
- print(
360
- "Warning: No similar documents found. This may affect generation quality.")
361
-
362
  except Exception as e:
363
  print(f"Error during similarity search: {str(e)}")
364
  raise RuntimeError(f"Failed to search vector store: {str(e)}")
365
 
366
- # Sort by similarity (convert distance to similarity)
367
- docs_and_scores.sort(key=lambda x: x[1], reverse=False)
368
-
369
  # Create detailed context log
370
  context_details = []
371
  for doc, score in docs_and_scores[:5]: # Log top 5 for brevity
372
- similarity = round((1 - score) * 100, 2)
373
  context_details.append({
374
  'artist': doc.metadata['artist'],
375
  'song': doc.metadata['song_title'],
376
- 'similarity': similarity,
377
- # First 200 chars
378
  'content': doc.page_content[:200] + "..."
379
  })
380
 
 
7
  from langchain.prompts import PromptTemplate
8
  from huggingface_hub import snapshot_download, hf_hub_download, HfApi
9
  from config.settings import Settings
10
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
11
  from datasets import load_dataset
12
  import sqlite3
13
+ from openai import APIConnectionError, RateLimitError
14
 
15
 
16
  class LyricGenerator:
 
31
  print(f"Embeddings directory: {self.embeddings_dir}")
32
  print(f"Chroma directory: {self.chroma_dir}")
33
 
34
+ # Initialize OpenAI embeddings with retry
35
  print("Setting up OpenAI embeddings...")
36
  if not Settings.OPENAI_API_KEY:
37
  raise RuntimeError(
38
  "OpenAI API key is not set. Please configure it in your environment variables or HuggingFace Secrets.")
39
+
40
+ self.embeddings = self._create_embeddings_with_retry()
41
+
 
42
  self.vector_store = None
43
  self.qa_chain = None
44
 
45
  # Load embeddings
46
  self._load_embeddings()
47
 
48
+ @retry(
49
+ retry=retry_if_exception_type((APIConnectionError, RateLimitError)),
50
+ wait=wait_exponential(multiplier=1, min=4, max=10),
51
+ stop=stop_after_attempt(5)
52
+ )
53
+ def _create_embeddings_with_retry(self):
54
+ """Create OpenAI embeddings with retry logic"""
55
+ try:
56
+ return OpenAIEmbeddings(
57
+ openai_api_key=Settings.OPENAI_API_KEY,
58
+ timeout=60 # Increase timeout
59
+ )
60
+ except Exception as e:
61
+ print(f"Error creating embeddings: {type(e).__name__}: {str(e)}")
62
+ raise
63
+
64
  def _setup_embeddings_from_hf(self) -> None:
65
  """Download and setup embeddings from HuggingFace dataset"""
66
  print("\n=== Setting up embeddings from HuggingFace dataset ===")
 
309
  )
310
 
311
  @retry(
312
+ retry=retry_if_exception_type((APIConnectionError, RateLimitError)),
313
+ wait=wait_exponential(multiplier=1, min=4, max=10),
314
+ stop=stop_after_attempt(5)
315
  )
316
+ def _similarity_search_with_retry(self, query: str, k: int = 5):
317
+ """Perform similarity search with retry logic"""
318
  try:
319
+ return self.vector_store.similarity_search_with_score(
320
+ query,
321
+ k=k
322
+ )
323
  except Exception as e:
324
+ print(f"Similarity search error: {type(e).__name__}: {str(e)}")
 
 
 
 
325
  raise
326
 
327
  def generate_lyrics(
 
346
  print("Starting lyrics generation process...")
347
  print(f"Using OpenAI model: {Settings.LLM_MODEL}")
348
 
349
+ # Get source documents
350
  print("Searching for similar documents...")
351
  try:
352
+ results = self._similarity_search_with_retry(prompt)
353
+ # results is a list of (Document, score) tuples
354
+ docs_and_scores = [(doc[0], doc[1]) for doc in results] # Unpack tuples correctly
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  except Exception as e:
356
  print(f"Error during similarity search: {str(e)}")
357
  raise RuntimeError(f"Failed to search vector store: {str(e)}")
358
 
 
 
 
359
  # Create detailed context log
360
  context_details = []
361
  for doc, score in docs_and_scores[:5]: # Log top 5 for brevity
 
362
  context_details.append({
363
  'artist': doc.metadata['artist'],
364
  'song': doc.metadata['song_title'],
365
+ 'similarity': f"{score:.2f}",
 
366
  'content': doc.page_content[:200] + "..."
367
  })
368