| import re |
| from difflib import SequenceMatcher |
| from collections import defaultdict |
|
|
def extract_special_characters(text):
    """Return the unique special (non-word, non-space) characters of *text*.

    Duplicates are dropped and first-appearance order is preserved, so the
    result can be used directly as a character set (e.g. as the ``keep``
    argument of ``clean_text``).
    """
    characters = re.findall(r'[^\w\s]', text)
    # dict.fromkeys dedupes while keeping insertion order.
    return ''.join(dict.fromkeys(characters))
|
|
def clean_text(text, keep=""):
    """Lowercase *text* and strip special characters.

    Word characters and whitespace always survive; any character listed in
    *keep* is preserved as well.
    """
    lowered = text.lower()
    allowed = re.escape(keep)
    return re.sub(rf'[^\w\s{allowed}]', '', lowered)
|
|
def text_similarity(text, key_text):
    """Return a SequenceMatcher similarity ratio in [0, 1] for the two strings."""
    matcher = SequenceMatcher(None, text, key_text)
    return matcher.ratio()
|
|
def detect_fragments(text, key_texts, threshold=0.7):
    """Check whether *text* contains a fragment similar to any key text.

    The text is normalized (lowercased, special characters stripped except
    those appearing in the current key) and compared window-by-window against
    the key. Returns ``(True, key_text, similarity)`` for the first window
    whose similarity reaches *threshold*, otherwise ``(False, None, 0)``.

    Note: the key is compared in lowercase so that the comparison is
    consistent with the lowercased text — previously a raw (possibly
    upper-case) key could never score 1.0 against its own lowercased copy.
    """
    for key_text in key_texts:
        # Keep the key's own special characters so they can still match.
        protected = extract_special_characters(key_text)
        words = clean_text(text, protected).split()

        # Normalize the key the same way the text was normalized.
        normalized_key = key_text.lower()
        key_words = normalized_key.split()

        if len(words) < len(key_words):
            # Text is shorter than the key: compare the cleaned text whole.
            similarity = text_similarity(" ".join(words), normalized_key)
            if similarity >= threshold:
                return True, key_text, similarity
            continue

        # Slide a window of len(key_words) words across the cleaned text.
        for start in range(len(words) - len(key_words) + 1):
            fragment = " ".join(words[start:start + len(key_words)])
            similarity = text_similarity(fragment, normalized_key)
            if similarity >= threshold:
                return True, key_text, similarity
    return False, None, 0
|
|
def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
    """
    Analyzes the similarity between a list of texts and key texts.

    Three passes are run, each skipping texts already matched by an earlier
    pass:
      1. direct whole-text similarity against each key text;
      2. fragment detection via ``detect_fragments`` (uses
         *fragment_threshold*);
      3. pairwise concatenation of remaining texts against each key text.

    Returns a dict with ``similar_texts``, ``fragments_detected``,
    ``combined`` result lists and a ``statistics`` counter mapping.
    """
    results = {
        "similar_texts": [],
        "fragments_detected": [],
        "combined": [],
        "statistics": defaultdict(int)
    }

    # Indices of texts already accounted for by an earlier pass.
    processed_texts = set()

    # Pass 1: direct whole-text similarity.
    for i, text in enumerate(text_list):
        if not text.strip():
            continue

        for key_text in key_texts:
            if not key_text.strip():
                continue

            similarity = text_similarity(text, key_text)
            if similarity >= similarity_threshold:
                results["similar_texts"].append({
                    "index": i,
                    "text": text,
                    "key_text": key_text,
                    "similarity": similarity
                })
                results["statistics"]["direct_similarity"] += 1
                processed_texts.add(i)

    # Pass 2: fragment detection for texts with no direct match.
    # (Previously this pass was missing: "fragments_detected" stayed empty
    # and fragment_threshold was never used.)
    for i, text in enumerate(text_list):
        if i in processed_texts or not text.strip():
            continue

        found, key_text, similarity = detect_fragments(
            text, key_texts, fragment_threshold)
        if found:
            results["fragments_detected"].append({
                "index": i,
                "text": text,
                "key_text": key_text,
                "similarity": similarity
            })
            results["statistics"]["fragments"] += 1
            processed_texts.add(i)

    # Pass 3: pairwise concatenations of the remaining texts.
    for i in range(len(text_list)):
        if i in processed_texts or not text_list[i].strip():
            continue

        for j in range(i + 1, len(text_list)):
            if j in processed_texts or not text_list[j].strip():
                continue

            combined_text = text_list[i] + " " + text_list[j]
            for key_text in key_texts:
                if not key_text.strip():
                    continue

                similarity = text_similarity(combined_text, key_text)
                if similarity >= similarity_threshold:
                    results["combined"].append({
                        "indices": [i, j],
                        "texts": [text_list[i], text_list[j]],
                        "combined_text": combined_text,
                        "key_text": key_text,
                        "similarity": similarity
                    })
                    results["statistics"]["combined"] += 1
                    processed_texts.add(i)
                    processed_texts.add(j)
                    break

    # Summary counters (empty/whitespace-only texts are not analyzed).
    valid_texts = sum(1 for text in text_list if text.strip())
    results["statistics"]["total_analyzed"] = valid_texts
    results["statistics"]["total_processed"] = len(processed_texts)

    return results
|
|