File size: 1,560 Bytes
40a04d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from difflib import SequenceMatcher
import re

"""
Given a list of recent messages (strings), check whether
new_message (str) is an exact match of an existing message
or a very close match in sequence (True means duplicate).

Ex:
>>> recent_messages = ['this is a test']
>>> new_message = 'this is a test ok?'
>>> duplicate_check(new_message, recent_messages)
True
"""

#remove punctuation and extra whitespace
def normalize(text: str) -> str:
    """Return a canonical form of *text* for duplicate comparison.

    The text is lower-cased, punctuation (any character that is neither a
    word character nor whitespace) is removed, and every run of whitespace
    is collapsed to a single space with no leading or trailing whitespace.

    Args:
        text: The raw message.

    Returns:
        The normalized message.
    """
    text = text.lower()
    # Remove punctuation BEFORE collapsing whitespace: deleting a mark that
    # stood between two spaces (or at either end of the text) would
    # otherwise leave a double or dangling space behind, so "a , b" and
    # "a b" would not normalize to the same string.
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

#checks for exact matches
def is_exact_duplicate(new_message, recent_messages):
    """Report whether new_message matches a recent message exactly.

    Both sides are passed through normalize() before being compared.

    Args:
        new_message: The incoming message.
        recent_messages: Iterable of previously seen messages.

    Returns:
        True as soon as one recent message normalizes to the same string
        as new_message, otherwise False.
    """
    target = normalize(new_message)
    for previous in recent_messages:
        if target == normalize(previous):
            return True
    return False

#calculate sequence similarity
#https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.ratio
def similarity(a, b):
    """Return the difflib similarity ratio of two strings.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float in [0, 1]; 1.0 means the sequences are identical.
    """
    # SequenceMatcher.ratio() may depend on argument order, so the strictly
    # shorter string is always passed as the first sequence (for equal
    # lengths, b goes first).
    if len(a) < len(b):
        first, second = a, b
    else:
        first, second = b, a
    # autojunk=False: the default heuristic ignores "popular" characters once
    # the second sequence reaches 200 items, which for ordinary text covers
    # almost every character and can drive the ratio of two nearly identical
    # long messages down to 0.
    return SequenceMatcher(None, first, second, autojunk=False).ratio()

#checks for duplicate messages with minor differences
def is_similar_duplicate(new_message, recent_messages, threshold=0.9):
    """Report whether new_message is a near-duplicate of a recent message.

    Each message is normalized with normalize() and scored against the
    normalized new message with similarity().

    Args:
        new_message: The incoming message.
        recent_messages: Iterable of previously seen messages.
        threshold: Minimum similarity score (inclusive) that counts as a
            duplicate.

    Returns:
        True as soon as one recent message scores at or above threshold,
        otherwise False.
    """
    candidate = normalize(new_message)
    return any(
        similarity(candidate, normalize(previous)) >= threshold
        for previous in recent_messages
    )

#check everything
def duplicate_check(new_message, recent_messages, threshold=0.9):
    """Report whether new_message duplicates any recent message.

    Runs the exact-match check first and only falls back to the similarity
    check when no exact match is found.

    Args:
        new_message: The incoming message.
        recent_messages: Previously seen messages.
        threshold: Similarity score passed on to is_similar_duplicate().

    Returns:
        True if either check reports a duplicate, otherwise False.
    """
    if is_exact_duplicate(new_message, recent_messages):
        return True
    return is_similar_duplicate(new_message, recent_messages, threshold)