File size: 1,484 Bytes
2cb327c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
Text Utilities
Functions for cleaning and processing text
"""

import re

# Patterns for cleaning
# Matches http(s) URLs and bare "www." links up to the next whitespace.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
# Matches one or more characters from the common emoji Unicode blocks.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "]+",
    flags=re.UNICODE
)


def clean_text(raw_text: str) -> str:
    """Clean and normalize *raw_text*.

    Strips URLs and emojis, replaces punctuation outside a small allowed
    set with spaces, collapses runs of whitespace, and finally removes
    duplicate sentences. Returns "" for empty/falsy input.
    """
    if not raw_text:
        return ""

    # Drop URLs first, then emoji sequences.
    stripped = EMOJI_PATTERN.sub("", URL_PATTERN.sub("", raw_text))
    # Keep word characters, whitespace, and basic sentence punctuation only.
    stripped = re.sub(r"[^\w\s.,!?']", " ", stripped, flags=re.UNICODE)
    normalized = re.sub(r"\s+", " ", stripped).strip()
    return remove_duplicate_sentences(normalized)


def remove_duplicate_sentences(text: str) -> str:
    """Remove duplicate sentences from *text*, preserving first-seen order.

    Sentences are split on '. '. Comparison is case-insensitive and
    ignores a trailing period — after split('. ') only the final sentence
    retains its '.', so without this normalization a repeat of the last
    sentence would never match earlier copies (e.g. "Hello world. Hello
    world." previously produced keys "hello world" and "hello world.").
    Sentences shorter than 5 characters are dropped as noise.

    Returns the surviving sentences joined with '. '.
    """
    if not text:
        return ""

    sentences = text.split('. ')
    seen = set()
    cleaned = []

    for s in sentences:
        stripped = s.strip()
        if len(stripped) < 5:
            continue

        # Normalize for comparison only; the appended sentence keeps its
        # original casing and any trailing period.
        key = stripped.rstrip('.').lower()
        if key not in seen:
            seen.add(key)
            cleaned.append(stripped)

    return '. '.join(cleaned)


def should_summarize(text: str, min_length: int = 400) -> bool:
    """Return True when *text* is long enough to warrant summarization.

    The check is by character count; the default of 400 characters
    corresponds to roughly 60-80 words. ``min_length`` is keyword-
    overridable so callers can tune the threshold without a code change
    (default preserves the original behavior).
    """
    return len(text) >= min_length