File size: 2,386 Bytes
2cb327c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""
Text Cleaning Utilities
========================
Functions for cleaning and processing article text before summarization.

Cleaning pipeline:
    1. Remove URLs (http:// and www.)
    2. Strip Unicode emojis
    3. Normalize special characters to spaces
    4. Collapse whitespace
    5. Remove duplicate sentences

Usage:
    from backend.summarization.utils import clean_text, should_summarize

    cleaned = clean_text(raw_article_content)
    if should_summarize(cleaned):
        summary = summarizer.summarize(cleaned)
"""

import re

# Pre-compiled patterns for performance (compiled once at import time,
# reused on every clean_text call).
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")  # http(s) URLs and bare "www." hosts
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FAFF"  # Chess Symbols / Symbols and Pictographs Extended-A
    "]+",
    flags=re.UNICODE
    # NOTE(review): regional-indicator flags (U+1F1E6-1F1FF) and legacy symbol
    # blocks (e.g. U+2600-27BF) are not covered — confirm whether that is intended.
)


def clean_text(raw_text: str) -> str:
    """Normalize an article body for downstream summarization.

    Strips URLs and emojis, blanks out special characters, collapses
    runs of whitespace, and removes repeated sentences.

    Args:
        raw_text: The raw article body text.

    Returns:
        Cleaned string with normalized whitespace and no duplicate sentences.
    """
    if not raw_text:
        return ""

    # Order matters: drop URLs first so the character filter below
    # doesn't shred them into stray word fragments.
    without_urls = URL_PATTERN.sub("", raw_text)
    without_emoji = EMOJI_PATTERN.sub("", without_urls)
    # Keep word characters, whitespace, and basic sentence punctuation;
    # everything else becomes a space.
    spaced = re.sub(r"[^\w\s.,!?']", " ", without_emoji, flags=re.UNICODE)
    collapsed = re.sub(r"\s+", " ", spaced).strip()
    return remove_duplicate_sentences(collapsed)


def remove_duplicate_sentences(text: str) -> str:
    """Remove duplicate sentences while preserving order.

    Splits on '. ' (period-space), deduplicates case-insensitively,
    and reassembles with '. '. Sentences whose normalized form is
    shorter than 5 characters are dropped as noise.

    Bug fix: after split('. '), the final sentence keeps its trailing
    period while earlier ones lose theirs, so e.g. "Hello world." never
    matched an earlier "Hello world" and duplicates at the end of the
    text survived. The dedup key now strips trailing periods so the
    comparison is consistent for every sentence.

    Args:
        text: Whitespace-normalized article text.

    Returns:
        Text with duplicate sentences removed, joined by '. '.
    """
    if not text:
        return ""

    seen = set()
    cleaned = []

    for s in text.split('. '):
        stripped = s.strip()
        # Normalize: lowercase and drop trailing periods so the last
        # sentence (which retains its '.') compares equal to earlier copies.
        key = stripped.lower().rstrip('.')
        if len(key) < 5:
            continue
        if key not in seen:
            seen.add(key)
            cleaned.append(stripped)

    return '. '.join(cleaned)


def should_summarize(text: str) -> bool:
    """Decide whether text is long enough to benefit from AI summarization.

    Texts of at least 400 characters (roughly 60-80 words) qualify;
    anything shorter is kept as-is with no AI processing.
    """
    minimum_chars = 400
    return len(text) >= minimum_chars