"""Test semantic understanding of the lyrics database.""" import numpy as np import time from typing import Dict, List, Tuple from pathlib import Path from langchain_openai import OpenAIEmbeddings from langchain_community.vectorstores import Chroma from chromadb.config import Settings from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_core.documents import Document def initialize_vector_store(lyrics_dir: Path) -> Chroma: """Initialize vector store with lyrics.""" print("Initializing vector store...") embeddings = OpenAIEmbeddings() text_splitter = RecursiveCharacterTextSplitter( chunk_size=500, chunk_overlap=50, separators=["\n\n", "\n", " ", ""] ) # Load lyrics documents = [] for artist_dir in lyrics_dir.iterdir(): if artist_dir.is_dir(): artist_name = artist_dir.name for lyric_file in artist_dir.glob('*.txt'): with open(lyric_file, 'r', encoding='utf-8') as f: text = f.read() metadata = { 'artist': artist_name, 'song_title': lyric_file.stem, 'source': str(lyric_file) } documents.append( Document(page_content=text, metadata=metadata) ) if not documents: raise ValueError("No lyrics found in directory") print(f"Found {len(documents)} lyrics documents") # Create vector store texts = text_splitter.split_documents(documents) vector_store = Chroma.from_documents( documents=texts, embedding=embeddings, persist_directory="./lyrics_db" ) return vector_store def calculate_metrics(similarity_scores: list[float]) -> dict: """Calculate metrics for similarity scores.""" return { "mean": np.mean(similarity_scores), "median": np.median(similarity_scores), "std": np.std(similarity_scores), "min": np.min(similarity_scores), "max": np.max(similarity_scores) } def test_k_parameters( vector_store: Chroma, query: str, k_values: List[int], search_k_values: List[int] ) -> Dict[Tuple[int, int], Dict]: """Test different combinations of k and search_k parameters.""" results = {} for k in k_values: for search_k in search_k_values: if search_k < k: continue start_time = time.time() try: # Get documents with error handling docs = vector_store.similarity_search_with_score( query, k=k ) elapsed_time = time.time() - start_time if not docs: print(f"No results found for k={k}, search_k={search_k}") continue similarities = [1 - score for _, score in docs] avg_sim = ( np.mean(similarities) if similarities else 0 ) max_sim = ( np.max(similarities) if similarities else 0 ) results[(k, search_k)] = { "time": elapsed_time, "avg_similarity": avg_sim, "max_similarity": max_sim, "result_count": len(docs) } except Exception as e: print(f"Error with k={k}, search_k={search_k}: {str(e)}") continue if not results: raise ValueError( "No valid results found for any parameter combination" ) return results def test_semantic_understanding( lyrics_dir: str = "./app/lyrics", lyrics_db_path: str = None, optimize_params: bool = True ): """Test semantic understanding with parameter optimization.""" print("\n=== Testing Semantic Understanding ===\n") # Get absolute path to lyrics_db if lyrics_db_path is None: lyrics_db_path = Path(__file__).parent.parent / "lyrics_db" else: lyrics_db_path = Path(lyrics_db_path) print(f"Looking for vector store at: {lyrics_db_path}") # Detailed vector store check if not lyrics_db_path.exists(): print(f"Error: Vector store not found at {lyrics_db_path}") return # Check vector store contents print("\nChecking vector store contents:") # Calculate directory size total_size = sum( f.stat().st_size for f in lyrics_db_path.rglob('*') if f.is_file() ) dir_size_mb = total_size / 1024 / 1024 print(f"Directory size: {dir_size_mb:.2f} MB") print("Files found:") for file in lyrics_db_path.iterdir(): size_mb = file.stat().st_size / 1024 / 1024 print(f"- {file.name} ({size_mb:.2f} MB)") try: embeddings = OpenAIEmbeddings() vector_store = Chroma( persist_directory=str(lyrics_db_path), embedding_function=embeddings, client_settings=Settings( anonymized_telemetry=False ) ) # Detailed collection check collection = vector_store._collection collection_info = collection.get() print("\nCollection details:") if collection_info is None: print("Error: Collection info is None. Database may be corrupted.") print("Recommendation: Delete the lyrics_db folder and rebuild.") return # Safely get counts with default values ids = collection_info.get('ids', []) or [] embeddings = collection_info.get('embeddings', []) or [] metadatas = collection_info.get('metadatas', []) or [] print(f"IDs: {len(ids)} items") print(f"Embeddings: {len(embeddings)} items") print(f"Metadatas: {len(metadatas)} items") if not ids: print("\nError: Vector store exists but appears to be empty.") print("Collection structure exists but contains no embeddings.") print( "Try removing the lyrics_db folder and recreating embeddings." ) return msg = f"\nFound valid vector store with {len(ids)} documents" print(msg) except Exception as e: print(f"\nError accessing vector store: {str(e)}") print("Detailed error information:") import traceback traceback.print_exc() return # Continue with testing if vector store is found and not empty... if __name__ == "__main__": try: test_semantic_understanding(optimize_params=True) except Exception as e: print(f"Error during semantic testing: {e}")