File size: 1,812 Bytes
01d5a5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from difflib import SequenceMatcher
from typing import Dict, List, Tuple, Any
import logging


def string_similarity(str1: str, str2: str) -> float:
    """Return the difflib matching ratio of two strings.

    The value is whatever ``difflib.SequenceMatcher.ratio()`` reports for the
    pair, with ``str1`` as the first sequence and ``str2`` as the second and
    no junk function supplied.

    Args:
        str1: First string for comparison.
        str2: Second string for comparison.

    Returns:
        float: Similarity ratio between 0.0 (nothing in common) and 1.0
        (identical sequences).
    """
    matcher = SequenceMatcher(a=str1, b=str2)
    return matcher.ratio()


def remove_similar_dicts(
    dict_list: List[Dict[str, Any]], similarity_threshold: float = 0.6
) -> Tuple[List[Dict[str, Any]], int]:
    """Drop dictionaries whose 'content' is too similar to an earlier kept one.

    Items are visited in input order. Each item's 'content' is compared, via
    ``string_similarity``, against the 'content' of every item kept so far;
    the item is dropped as soon as one comparison yields a ratio strictly
    greater than ``similarity_threshold``. Otherwise it is kept.

    Items whose 'content' is missing or falsy (e.g. ``""`` or ``None``) are
    skipped: they are neither returned nor counted as similar.

    Args:
        dict_list: List of dictionaries containing a 'content' field to check
            for similarity.
        similarity_threshold: Maximum similarity allowed between kept items
            (default: 0.6). Only ratios strictly above it cause removal.

    Returns:
        Tuple containing:
            - List of the kept dictionaries (same objects, input order).
            - Count of items dropped because they were similar to a kept one.
    """
    unique_dicts: List[Dict[str, Any]] = []
    cnt = 0
    for current_dict in dict_list:
        # .get() so a dict without 'content' is skipped like an empty one
        # instead of aborting the whole pass with KeyError.
        content = current_dict.get("content")
        if not content:
            continue

        # Every entry in unique_dicts passed the truthiness check above, so
        # its 'content' is known to be present and non-empty here.
        for kept_dict in unique_dicts:
            kept_content = kept_dict["content"]
            if string_similarity(content, kept_content) > similarity_threshold:
                # Lazy %-style arguments: the message is only formatted when
                # INFO is enabled. Only the trailing 100 characters are shown.
                logging.info(
                    " %s\n is similar to: \n%s\n____________________",
                    content[-100:],
                    kept_content[-100:],
                )
                cnt += 1
                break
        else:
            # No kept item exceeded the threshold: this one is unique.
            unique_dicts.append(current_dict)

    return unique_dicts, cnt