import difflib import re def get_word_positions(text): positions = [] for m in re.finditer(r'\S+', text): positions.append((m.group(), m.start(), m.end())) return positions class OffsetMapper: def __init__(self, original, modified): self.original = original self.modified = modified self.mapping = [] self._build_mapping() def _build_mapping(self): s = difflib.SequenceMatcher(None, self.original, self.modified) for tag, i1, i2, j1, j2 in s.get_opcodes(): self.mapping.append((j1, j2, i1, i2)) def map_offset(self, mod_offset): for j1, j2, i1, i2 in self.mapping: if j1 <= mod_offset <= j2: if j2 == j1: return i1 ratio = (mod_offset - j1) / (j2 - j1) return int(i1 + ratio * (i2 - i1)) return len(self.original) def get_word_diffs(original, corrected): orig_words = get_word_positions(original) corr_words = get_word_positions(corrected) s = difflib.SequenceMatcher(None, [w[0] for w in orig_words], [w[0] for w in corr_words]) suggestions = [] for tag, i1, i2, j1, j2 in s.get_opcodes(): if tag == 'replace': if i1 < len(orig_words) and i2 - 1 < len(orig_words): start_char = orig_words[i1][1] end_char = orig_words[i2-1][2] suggestions.append({ 'start': start_char, 'end': end_char, 'original': original[start_char:end_char], 'correction': " ".join([w[0] for w in corr_words[j1:j2]]), 'type': 'generic' }) elif tag == 'delete': if i1 < len(orig_words) and i2 - 1 < len(orig_words): start_char = orig_words[i1][1] end_char = orig_words[i2-1][2] suggestions.append({ 'start': start_char, 'end': end_char, 'original': original[start_char:end_char], 'correction': '', 'type': 'generic' }) elif tag == 'insert': pos = orig_words[i1][1] if i1 < len(orig_words) else len(original) suggestions.append({ 'start': pos, 'end': pos, 'original': '', 'correction': " ".join([w[0] for w in corr_words[j1:j2]]), 'type': 'generic' }) return suggestions def test(): original_text = "قال محمد: علي أننا حققنا نجاحا كبيرا في المشروع رغم الصعوباالصعوبات...." spelling_text = "قال محمد علي أننا حققنا نجاحا كبيرا في المشروع رغم الصعوباالصعوبات...." grammar_text = "قال محمد علي أننا حققنا نجاحا كبيرا في المشروع رغم الصعوبات..." punct_text = "قال محمد علي: أننا حققنا نجاحا كبيرا في المشروع رغم الصعوبات...." suggestions = [] mappers = [] # SPELLING suggestions.append({ 'start': 4, 'end': 9, 'original': "محمد:", 'correction': "محمد", 'type': 'spelling' }) mappers.append(OffsetMapper(original_text, spelling_text)) def map_range_to_original(start, end): curr_start, curr_end = start, end for mapper in reversed(mappers): curr_start = mapper.map_offset(curr_start) curr_end = mapper.map_offset(curr_end) return curr_start, curr_end # GRAMMAR diffs = get_word_diffs(spelling_text, grammar_text) for d in diffs: orig_start, orig_end = map_range_to_original(d['start'], d['end']) suggestions.append({ 'start': orig_start, 'end': orig_end, 'original': original_text[orig_start:orig_end], 'correction': d['correction'], 'type': 'grammar' }) mappers.append(OffsetMapper(spelling_text, grammar_text)) # PUNCTUATION diffs = get_word_diffs(grammar_text, punct_text) for d in diffs: orig_start, orig_end = map_range_to_original(d['start'], d['end']) suggestions.append({ 'start': orig_start, 'end': orig_end, 'original': original_text[orig_start:orig_end], 'correction': d['correction'], 'type': 'punctuation' }) print("SUGGESTIONS BEFORE RESOLUTION:") for s in suggestions: print(s) PRIORITY = {'grammar': 3, 'punctuation': 2, 'spelling': 1, 'autocomplete': 0} suggestions.sort(key=lambda s: PRIORITY.get(s['type'], 0), reverse=True) claimed_ranges = [] resolved = [] for s in suggestions: s_start, s_end = s['start'], s['end'] overlaps = False for (c_start, c_end, c_type) in claimed_ranges: if s_start < c_end and s_end > c_start: overlaps = True print(f"Overlap detected! {s['type']} [{s_start}:{s_end}] overlaps with {c_type} [{c_start}:{c_end}]") break if not overlaps: resolved.append(s) claimed_ranges.append((s_start, s_end, s['type'])) else: print(f"[OVERLAP] Dropped {s['type']} [{s_start}:{s_end}] '{s.get('original','')}'") if __name__ == "__main__": test()