| | """Test semantic understanding of the lyrics database.""" |
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
from chromadb.config import Settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
| |
|
| |
|
def initialize_vector_store(lyrics_dir: Path) -> Chroma:
    """Build a Chroma vector store from lyric text files.

    Walks ``lyrics_dir`` expecting one sub-directory per artist that holds
    ``*.txt`` lyric files, splits each file into overlapping chunks, embeds
    the chunks with OpenAI embeddings, and persists the resulting store to
    ``./lyrics_db``.

    Args:
        lyrics_dir: Root directory containing per-artist sub-directories.

    Returns:
        The persisted Chroma vector store.

    Raises:
        ValueError: If no lyric files are found under ``lyrics_dir``.
    """
    print("Initializing vector store...")

    embedder = OpenAIEmbeddings()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", " ", ""]
    )

    # One Document per lyric file, tagged with artist / title / source path.
    docs = []
    for artist_dir in lyrics_dir.iterdir():
        if not artist_dir.is_dir():
            continue
        for lyric_file in artist_dir.glob('*.txt'):
            docs.append(
                Document(
                    page_content=lyric_file.read_text(encoding='utf-8'),
                    metadata={
                        'artist': artist_dir.name,
                        'song_title': lyric_file.stem,
                        'source': str(lyric_file)
                    }
                )
            )

    if not docs:
        raise ValueError("No lyrics found in directory")

    print(f"Found {len(docs)} lyrics documents")

    chunks = splitter.split_documents(docs)
    return Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory="./lyrics_db"
    )
| |
|
| |
|
def calculate_metrics(similarity_scores: list[float]) -> dict:
    """Summarize a collection of similarity scores.

    Args:
        similarity_scores: Non-empty sequence of similarity values.

    Returns:
        Dict with ``mean``, ``median``, ``std``, ``min`` and ``max`` keys,
        each a plain Python float (not a numpy scalar).

    Raises:
        ValueError: If ``similarity_scores`` is empty. (Previously an
            empty input surfaced as a cryptic numpy error from ``np.min``
            / ``np.max`` and a NaN-with-warning from ``np.mean``.)
    """
    if not similarity_scores:
        raise ValueError("similarity_scores must not be empty")

    # Convert once so every statistic runs over the same float array.
    scores = np.asarray(similarity_scores, dtype=float)
    return {
        "mean": float(scores.mean()),
        "median": float(np.median(scores)),
        "std": float(scores.std()),
        "min": float(scores.min()),
        "max": float(scores.max())
    }
| |
|
| |
|
def test_k_parameters(
    vector_store: Chroma,
    query: str,
    k_values: List[int],
    search_k_values: List[int]
) -> Dict[Tuple[int, int], Dict]:
    """Benchmark similarity search across (k, search_k) combinations.

    For each pair with ``search_k >= k``, runs a scored similarity search
    and records elapsed time plus similarity statistics.

    NOTE(review): ``search_k`` only filters/labels combinations here — it
    is never passed to the underlying query, so every search_k for a given
    k performs the identical search. Confirm whether it was meant to map
    to a retriever parameter (e.g. ``fetch_k``).

    Args:
        vector_store: Store exposing ``similarity_search_with_score``.
        query: Query text to search for.
        k_values: Numbers of results to request.
        search_k_values: Candidate-pool sizes (currently labels only).

    Returns:
        Mapping of ``(k, search_k)`` to a dict with ``time``,
        ``avg_similarity``, ``max_similarity`` and ``result_count``.

    Raises:
        ValueError: If no parameter combination produced any results.
    """
    results = {}

    for k in k_values:
        for search_k in search_k_values:
            if search_k < k:
                continue

            # perf_counter is monotonic — time.time() can jump with clock
            # adjustments, skewing the measured interval.
            start_time = time.perf_counter()
            try:
                # Keep the try body minimal: only the call that can fail.
                docs = vector_store.similarity_search_with_score(
                    query,
                    k=k
                )
            except Exception as e:
                print(f"Error with k={k}, search_k={search_k}: {str(e)}")
                continue
            elapsed_time = time.perf_counter() - start_time

            if not docs:
                print(f"No results found for k={k}, search_k={search_k}")
                continue

            # Scores are distances; 1 - distance approximates similarity.
            # ``docs`` is non-empty here, so no empty-list guards needed.
            similarities = [1 - score for _, score in docs]

            results[(k, search_k)] = {
                "time": elapsed_time,
                "avg_similarity": np.mean(similarities),
                "max_similarity": np.max(similarities),
                "result_count": len(docs)
            }

    if not results:
        raise ValueError(
            "No valid results found for any parameter combination"
        )

    return results
| |
|
| |
|
def test_semantic_understanding(
    lyrics_dir: str = "./app/lyrics",
    lyrics_db_path: Optional[str] = None,
    optimize_params: bool = True
):
    """Inspect and validate the persisted lyrics vector store.

    Locates the Chroma database (``lyrics_db`` next to this file's parent
    package by default), reports its on-disk footprint, opens it, and
    checks that the collection actually contains embeddings, printing
    diagnostics along the way.

    NOTE(review): ``lyrics_dir`` and ``optimize_params`` are currently
    unused — the docstring promises parameter optimization (presumably via
    ``test_k_parameters``) but no search is ever executed. Confirm intent.

    Args:
        lyrics_dir: Directory of raw lyrics (currently unused).
        lyrics_db_path: Path to the persisted Chroma DB, or None for the
            default location.
        optimize_params: Accepted for API compatibility; currently unused.
    """
    print("\n=== Testing Semantic Understanding ===\n")

    if lyrics_db_path is None:
        db_path = Path(__file__).parent.parent / "lyrics_db"
    else:
        db_path = Path(lyrics_db_path)

    print(f"Looking for vector store at: {db_path}")

    if not db_path.exists():
        print(f"Error: Vector store not found at {db_path}")
        return

    print("\nChecking vector store contents:")
    _report_store_files(db_path)

    try:
        embeddings = OpenAIEmbeddings()
        vector_store = Chroma(
            persist_directory=str(db_path),
            embedding_function=embeddings,
            client_settings=Settings(
                anonymized_telemetry=False
            )
        )

        # Private attribute access — no public API exposes the raw
        # collection contents in this langchain version.
        collection_info = vector_store._collection.get()

        print("\nCollection details:")
        if collection_info is None:
            print("Error: Collection info is None. Database may be corrupted.")
            print("Recommendation: Delete the lyrics_db folder and rebuild.")
            return

        ids = collection_info.get('ids', []) or []
        # Renamed from ``embeddings`` — the original shadowed the
        # OpenAIEmbeddings instance created above.
        stored_embeddings = collection_info.get('embeddings', []) or []
        metadatas = collection_info.get('metadatas', []) or []

        print(f"IDs: {len(ids)} items")
        print(f"Embeddings: {len(stored_embeddings)} items")
        print(f"Metadatas: {len(metadatas)} items")

        if not ids:
            print("\nError: Vector store exists but appears to be empty.")
            print("Collection structure exists but contains no embeddings.")
            print(
                "Try removing the lyrics_db folder and recreating embeddings."
            )
            return

        msg = f"\nFound valid vector store with {len(ids)} documents"
        print(msg)

    except Exception as e:
        print(f"\nError accessing vector store: {str(e)}")
        print("Detailed error information:")
        import traceback
        traceback.print_exc()
        return


def _report_store_files(db_path: Path) -> None:
    """Print the total and per-file on-disk sizes of the store directory."""
    total_size = sum(
        f.stat().st_size
        for f in db_path.rglob('*')
        if f.is_file()
    )
    dir_size_mb = total_size / 1024 / 1024
    print(f"Directory size: {dir_size_mb:.2f} MB")

    print("Files found:")
    for file in db_path.iterdir():
        size_mb = file.stat().st_size / 1024 / 1024
        print(f"- {file.name} ({size_mb:.2f} MB)")
| | |
| | |
| |
|
| |
|
| | if __name__ == "__main__": |
| | try: |
| | test_semantic_understanding(optimize_params=True) |
| | except Exception as e: |
| | print(f"Error during semantic testing: {e}") |