from dataclasses import dataclass

import pandas as pd

from src.methodology import SimpleMethodology
from src.neighbours import EmbeddingClosestNeighbours
from src.scorer import EmbeddingsOriginalityScorer

# Length key whose scaler is handed to SimpleMethodology as its second
# positional argument (presumably the default/fallback scaler -- confirm
# against SimpleMethodology).
_FALLBACK_INDEX = 99


class EmbeddingsAnalysis:
    """
    Single entry point that bundles nearest-neighbour lookup and originality
    scoring over one shared index.

    :param index: FAISS index for similarity search.
    :param all_labels: DataFrame containing 'track_id' and 'length' columns for indexed entries.
    :param lookup: Pandas DataFrame containing metadata for each indexed entry.
    :param scalers: Dictionary keyed by ``(start, stop)`` length ranges (half-open,
        expanded with ``range(start, stop)``) mapping to quantile transformers for
        score normalization. Some range must cover ``_FALLBACK_INDEX``, otherwise
        construction raises ``KeyError``.
    :param radii: Forwarded unchanged to ``EmbeddingsOriginalityScorer``.
    :param close_threshold: Similarity threshold for neighbor search.
    :param score_power: Forwarded to ``SimpleMethodology`` as ``score_power``.
    """

    def __init__(self, index, all_labels, lookup, scalers, radii, close_threshold=0.95, score_power=1.0):
        track_ids = all_labels['track_id'].to_numpy()
        track_lengths = all_labels['length'].to_numpy()
        self._ecn = EmbeddingClosestNeighbours(
            index,
            track_ids,
            track_lengths,
            lookup,
            close_threshold=close_threshold,
        )

        # Flatten the (start, stop) -> scaler mapping into one entry per integer
        # length; if ranges overlap, the range visited last wins.
        per_length_scalers = {}
        for (start, stop), scaler in scalers.items():
            for length in range(start, stop):
                per_length_scalers[length] = scaler

        methodology = SimpleMethodology(
            per_length_scalers,
            per_length_scalers[_FALLBACK_INDEX],
            score_power=score_power,
        )
        self._scorer = EmbeddingsOriginalityScorer(index, track_ids, radii, methodology)

    def get_scores(self, embeddings, lengths):
        """
        Score the given embeddings with the underlying originality scorer.

        :param embeddings: Embeddings to score; passed through as-is.
        :param lengths: Lengths matching the embeddings; wrapped in a ``pd.Series``
            before being handed to the scorer.
        :return: Whatever the scorer's ``score`` method returns.
        """
        return self._scorer.score(embeddings, pd.Series(lengths))

    def get_neighbours(self, embeddings, limit=None):
        """
        Look up the closest indexed neighbours of the given embeddings.

        :param embeddings: Query embeddings; passed through as-is.
        :param limit: Forwarded as the second argument of the neighbour search
            (presumably a cap on the number of results -- confirm against
            ``EmbeddingClosestNeighbours.get``).
        :return: Whatever the neighbour search's ``get`` method returns.
        """
        return self._ecn.get(embeddings, limit)