Spaces:
Sleeping
Sleeping
| from difflib import SequenceMatcher | |
| from typing import Dict, List, Tuple, Any | |
| import logging | |
def string_similarity(str1: str, str2: str) -> float:
    """Return the ``difflib.SequenceMatcher`` ratio of two strings.

    Args:
        str1: String passed to the matcher as its first sequence.
        str2: String passed to the matcher as its second sequence.

    Returns:
        float: Ratio in the range 0.0 (nothing in common) to 1.0
            (identical sequences).
    """
    # No junk heuristic function is supplied (isjunk=None).
    matcher = SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()
def remove_similar_dicts(dict_list: List[Dict[str, Any]], similarity_threshold: float = 0.6) -> Tuple[List[Dict[str, Any]], int]:
    """Drop dictionaries whose 'content' is too similar to an earlier kept one.

    Items are processed in input order. Each item is compared against the
    items already kept; the first kept item whose similarity (as computed by
    ``string_similarity``) is strictly greater than the threshold causes the
    current item to be discarded. Items with falsy 'content' (e.g. an empty
    string or ``None``) are skipped entirely: they are neither kept nor
    counted.

    Args:
        dict_list: List of dictionaries containing a 'content' field to
            check for similarity.
        similarity_threshold: Maximum similarity allowed between kept items
            (default: 0.6). An item is removed only when its similarity to
            a kept item exceeds this value.

    Returns:
        Tuple containing:
            - List of kept dictionaries (the same objects, in input order).
            - Count of items discarded because they were similar to a kept
              item.

    Raises:
        KeyError: If a dictionary in ``dict_list`` has no 'content' key.
    """
    unique_dicts: List[Dict[str, Any]] = []
    similar_count = 0

    for candidate in dict_list:
        content = candidate["content"]
        if not content:
            continue

        # Every entry in unique_dicts passed the falsy-content check above
        # before being appended, so no emptiness re-check is needed here.
        for kept in unique_dicts:
            kept_content = kept["content"]
            if string_similarity(content, kept_content) > similarity_threshold:
                # Lazy %-style arguments: the message is only formatted when
                # the record is actually emitted. Only the trailing 100
                # characters of each content are logged.
                logging.info(
                    " %s\n is similar to: \n%s\n____________________",
                    content[-100:],
                    kept_content[-100:],
                )
                similar_count += 1
                break
        else:
            # Loop finished without a break: nothing kept so far is similar.
            unique_dicts.append(candidate)

    return unique_dicts, similar_count