Spaces:

ProjectFrozone
/

Site

Running

Added Frozone Stuff

40a04d4 25 days ago

1.56 kB

	from difflib import SequenceMatcher
	import re

	"""
	Given a list of recent messages (strings), check whether
	new_message (str) is an exact match of an existing message
	or very close to one in sequence.

	Ex:
	>>> recent_messages = ['this is a test']
	>>> new_message = 'this is a test ok?'
	>>> duplicate_check(new_message, recent_messages)
	True
	"""

	#remove punctuation and extra whitespace
	def normalize(text: str) -> str:
	    """Return a canonical form of *text* for duplicate comparison.

	    The text is lower-cased, every character that is neither a word
	    character nor whitespace is removed, runs of whitespace are collapsed
	    to a single space, and leading/trailing whitespace is stripped.

	    Args:
	        text: The raw message.

	    Returns:
	        The normalized message.
	    """
	    # Strip punctuation BEFORE collapsing whitespace: deleting a symbol that
	    # sits between two spaces ("hi , there") or at the end ("hi !") would
	    # otherwise leave a double or trailing space behind.
	    without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
	    return re.sub(r"\s+", " ", without_punctuation).strip()

	#checks for exact matches
	def is_exact_duplicate(new_message, recent_messages):
	    """Return True if new_message equals any recent message after normalization.

	    Args:
	        new_message: The incoming message text.
	        recent_messages: Iterable of previously seen message texts.

	    Returns:
	        True when the normalized new message equals the normalized form of
	        at least one entry in recent_messages, otherwise False.
	    """
	    target = normalize(new_message)
	    # The generator is consumed lazily, so the scan stops at the first hit.
	    return target in (normalize(candidate) for candidate in recent_messages)

	#calculate sequence similarity
	#https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.ratio
	def similarity(a, b):
	    """Return the SequenceMatcher similarity ratio of two sequences.

	    Args:
	        a: First sequence (a string in this module's usage).
	        b: Second sequence.

	    Returns:
	        A float in [0, 1]; 1.0 means the sequences are identical.
	    """
	    # ratio() may depend on argument order, so the strictly shorter sequence
	    # always goes first (on equal lengths, b goes first).
	    if len(a) < len(b):
	        first, second = a, b
	    else:
	        first, second = b, a
	    # autojunk=False: the default heuristic treats frequent characters as
	    # junk once the second sequence has 200+ items, which makes long
	    # repetitive messages score 0.0 even when they are identical.
	    return SequenceMatcher(None, first, second, autojunk=False).ratio()

	#checks for duplicate messages with minor differences
	def is_similar_duplicate(new_message, recent_messages, threshold=0.9):
	    """Return True if new_message is nearly identical to any recent message.

	    Args:
	        new_message: The incoming message text.
	        recent_messages: Iterable of previously seen message texts.
	        threshold: Minimum similarity score (inclusive) for two normalized
	            messages to count as duplicates.

	    Returns:
	        True when at least one recent message reaches the threshold,
	        otherwise False.
	    """
	    target = normalize(new_message)
	    # any() stops at the first candidate that reaches the threshold.
	    return any(
	        similarity(target, normalize(candidate)) >= threshold
	        for candidate in recent_messages
	    )

	#check everything
	def duplicate_check(new_message, recent_messages, threshold=0.9):
	    """Return True if new_message duplicates any of the recent messages.

	    The exact-match check runs first; the similarity check runs only when
	    no exact match is found.

	    Args:
	        new_message: The incoming message text.
	        recent_messages: Iterable of previously seen message texts.
	        threshold: Similarity score passed on to is_similar_duplicate.

	    Returns:
	        True for an exact or near duplicate, otherwise False.
	    """
	    if is_exact_duplicate(new_message, recent_messages):
	        return True
	    return is_similar_duplicate(new_message, recent_messages, threshold)