Spaces:
Configuration error
Configuration error
File size: 2,337 Bytes
"""
Deduplication Module
======================
Exact and semantic (TF-IDF cosine similarity) deduplication.
"""
from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
import numpy as np
@dataclass
class DeduplicationConfig:
    """Switches and tuning values that control which deduplication passes run."""

    # Run the exact-match pass (rows repeating an earlier value are dropped).
    remove_exact: bool = True
    # Run the TF-IDF cosine-similarity pass; off unless explicitly enabled.
    remove_semantic: bool = False
    # Cosine similarity at or above which a row counts as a semantic duplicate.
    semantic_threshold: float = 0.90
def remove_exact_duplicates(
    df: pd.DataFrame,
    col: str,
) -> pd.DataFrame:
    """Drop rows whose value in ``col`` repeats that of an earlier row.

    The first occurrence of each value is kept, and the surviving rows are
    returned with a fresh 0..n-1 index.
    """
    first_occurrences = df.drop_duplicates(subset=[col])
    renumbered = first_occurrences.reset_index(drop=True)
    return renumbered
def remove_semantic_duplicates(
    df: pd.DataFrame,
    col: str,
    threshold: float = 0.90,
) -> pd.DataFrame:
    """
    Remove semantically similar rows using TF-IDF cosine similarity.

    Rows are visited in order. The first row is always kept; each later row
    is dropped when its cosine similarity to any already-kept row is
    >= threshold. Rows are compared only against kept rows, not against rows
    that were themselves dropped. Missing values in ``col`` are treated as
    empty strings.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to compare.
        threshold: Cosine similarity at or above which a row counts as a
            duplicate of a kept row.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When the semantic pass
        cannot run (fewer than two rows, scikit-learn not installed, or the
        vectorizer raising ValueError) every row is returned, re-indexed the
        same way so callers see one index contract on every path.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Nothing to compare against with fewer than two rows.
    if len(df) < 2:
        return df.reset_index(drop=True)

    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity
    except ImportError:
        # Best-effort: keep all rows, but say so instead of silently skipping.
        logger.warning(
            "scikit-learn is not installed; skipping semantic deduplication."
        )
        return df.reset_index(drop=True)

    texts = df[col].fillna('').astype(str).tolist()

    # Build TF-IDF matrix
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError as exc:
        # NOTE(review): presumably raised when no usable terms remain
        # (e.g. an empty vocabulary) -- confirm against scikit-learn docs.
        logger.warning(
            "TF-IDF vectorization failed (%s); skipping semantic deduplication.",
            exc,
        )
        return df.reset_index(drop=True)

    # Greedy scan: row 0 is always kept; every later row is compared only
    # against the rows kept so far.
    keep_indices = [0]
    for row in range(1, len(texts)):
        sim = cosine_similarity(
            tfidf_matrix[row:row + 1],
            tfidf_matrix[keep_indices],
        )
        if sim.max() < threshold:
            keep_indices.append(row)

    return df.iloc[keep_indices].reset_index(drop=True)
def apply_deduplication(
    df: pd.DataFrame,
    col: str,
    config: DeduplicationConfig,
) -> pd.DataFrame:
    """Run each deduplication pass that ``config`` enables and return the result.

    The exact-match pass runs first, then the semantic pass on its output.
    With both passes disabled, ``df`` is returned as-is.
    """
    result = df

    # Exact pass first, so the semantic pass works on its output.
    if config.remove_exact:
        result = remove_exact_duplicates(result, col)

    if config.remove_semantic:
        result = remove_semantic_duplicates(
            result,
            col,
            config.semantic_threshold,
        )

    return result
|