File size: 2,337 Bytes
d4398e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""

Deduplication Module
====================

Exact and semantic (TF-IDF cosine similarity) deduplication.

"""

from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
import numpy as np


@dataclass
class DeduplicationConfig:
    """Switches and tuning values that select which deduplication passes run."""

    # Drop rows whose text column value exactly repeats an earlier row.
    remove_exact: bool = True
    # Drop rows that are near-duplicates by TF-IDF cosine similarity.
    remove_semantic: bool = False
    # Cosine similarity at or above which a row counts as a near-duplicate.
    semantic_threshold: float = 0.90


def remove_exact_duplicates(
    df: pd.DataFrame,
    col: str,
) -> pd.DataFrame:
    """
    Keep only the first row for each distinct value found in ``col``.

    The input frame is left untouched; the returned frame carries a fresh
    0..n-1 index.
    """
    # True for every row whose ``col`` value already appeared in an earlier row.
    is_repeat = df.duplicated(subset=[col])
    first_occurrences = df[~is_repeat]
    return first_occurrences.reset_index(drop=True)


def remove_semantic_duplicates(
    df: pd.DataFrame,
    col: str,
    threshold: float = 0.90,
) -> pd.DataFrame:
    """
    Remove near-duplicate rows using TF-IDF cosine similarity.

    Rows are scanned in order. The first row is always kept; each later row
    is kept only if its cosine similarity to every previously *kept* row is
    strictly below ``threshold``.

    Args:
        df: Input frame. It is never modified.
        col: Name of the text column to compare. Missing values are treated
            as empty strings and all other values are converted with ``str``.
        threshold: Cosine similarity at or above which a row is considered a
            duplicate of an already kept row.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When deduplication cannot
        run (fewer than two rows, scikit-learn is not installed, or the
        TF-IDF vectorizer raises ``ValueError`` for the given texts), every
        row is returned, still as a new, re-indexed frame so that all code
        paths share the same return contract.
    """
    if len(df) < 2:
        # Nothing to compare against; still return a fresh, re-indexed frame
        # instead of handing the caller's own object back.
        return df.reset_index(drop=True)

    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity
    except ImportError:
        # Best-effort: scikit-learn is optional, but make the skip visible
        # rather than silently returning un-deduplicated data.
        import logging

        logging.getLogger(__name__).warning(
            "scikit-learn is not installed; skipping semantic deduplication"
        )
        return df.reset_index(drop=True)

    texts = df[col].fillna('').astype(str).tolist()

    # Build the TF-IDF matrix (one row per input text).
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError:
        # The vectorizer could not build a vocabulary from these texts
        # (presumably no usable terms remain); keep every row.
        return df.reset_index(drop=True)

    # Greedy scan: a row survives only if it is not too similar to any row
    # that has already survived.
    keep_indices = [0]

    for i in range(1, len(texts)):
        sim = cosine_similarity(
            tfidf_matrix[i:i+1],
            tfidf_matrix[keep_indices],
        )
        if sim.max() < threshold:
            keep_indices.append(i)

    return df.iloc[keep_indices].reset_index(drop=True)


def apply_deduplication(
    df: pd.DataFrame,
    col: str,
    config: DeduplicationConfig,
) -> pd.DataFrame:
    """
    Run every deduplication pass that ``config`` enables and return the result.

    The exact pass runs first, followed by the semantic pass, which uses
    ``config.semantic_threshold``. With both passes disabled the input frame
    is returned as-is.
    """
    # Ordered (enabled flag, pass) pairs; each pass maps a frame to a frame.
    stages = (
        (
            config.remove_exact,
            lambda frame: remove_exact_duplicates(frame, col),
        ),
        (
            config.remove_semantic,
            lambda frame: remove_semantic_duplicates(
                frame, col, config.semantic_threshold
            ),
        ),
    )

    for enabled, run_stage in stages:
        if enabled:
            df = run_stage(df)

    return df