Spaces:
Running
Running
File size: 1,560 Bytes
40a04d4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | from difflib import SequenceMatcher
import re
"""
Given a list of string messages (the most recent messages),
return True if (str) new_message is an exact match of an
existing message or very close to one in sequence similarity.
Ex:
>>> recent_messages = ['this is a test']
>>> new_message = 'this is a test ok?'
>>> duplicate_check(new_message, recent_messages)
True
"""
#remove punctuation and extra whitespace
def normalize(text: str) -> str:
    """Return a canonical form of *text* for duplicate comparison.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is stripped.
    """
    text = text.lower()
    # Strip punctuation BEFORE collapsing whitespace: doing it afterwards
    # leaves double or trailing spaces behind (e.g. "hi , there !" would
    # become "hi  there "), so equivalent messages would not compare equal.
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()
#checks for exact matches
def is_exact_duplicate(new_message, recent_messages):
    """Return True if *new_message* equals any recent message once both are normalized."""
    target = normalize(new_message)
    for candidate in recent_messages:
        # Stop at the first hit; later messages are not normalized.
        if target == normalize(candidate):
            return True
    return False
#calculate sequence similarity
#https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.ratio
def similarity(a, b):
    """Return the difflib similarity ratio between two strings.

    The result is ``SequenceMatcher.ratio()``: a float between 0.0 (nothing
    in common) and 1.0 (identical). The strictly shorter string is passed
    as the first sequence; when the lengths are equal, *b* goes first.

    Note: with the junk heuristic disabled the comparison can be quadratic
    in the worst case, which is fine for chat-sized messages.
    """
    if len(a) < len(b):
        first, second = a, b
    else:
        first, second = b, a
    # autojunk=False: by default difflib treats characters that occur in
    # more than ~1% of a 200+ item sequence as "popular" and stops using
    # them as match anchors. In ordinary long text that is almost every
    # character, so two near-identical long messages could score close
    # to 0.0 and slip past the duplicate check.
    return SequenceMatcher(None, first, second, autojunk=False).ratio()
#checks for duplicate messages with minor differences
def is_similar_duplicate(new_message, recent_messages, threshold=0.9):
    """Return True if *new_message* is a near-duplicate of any recent message.

    Both sides are normalized before comparison; a message counts as a
    near-duplicate when its similarity score is at least *threshold*.
    """
    candidate = normalize(new_message)
    return any(
        similarity(candidate, normalize(previous)) >= threshold
        for previous in recent_messages
    )
#check everything
def duplicate_check(new_message, recent_messages, threshold=0.9):
    """Return True if *new_message* duplicates a recent message, exactly or approximately.

    The exact-match test runs first; the similarity scan against
    *threshold* only runs when no exact match is found.
    """
    if is_exact_duplicate(new_message, recent_messages):
        return True
    return is_similar_duplicate(new_message, recent_messages, threshold)
|