File size: 3,688 Bytes
3690599
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""
Utility Functions for Segmento Pulse
Provides common helpers for text processing, HTML cleaning, and data transformation
"""

import re
from html import unescape


def strip_html_if_needed(text: str) -> str:
    """
    Remove HTML markup from text, skipping the work when none is present.

    Input that contains none of the characters '<', '>' or '&' is treated as
    already clean and is only trimmed of surrounding whitespace (internal
    whitespace is left as-is on this fast path). Anything else goes through
    the full cleanup: tags are removed, HTML entities are decoded, and runs
    of whitespace are collapsed to a single space.

    Args:
        text: Input text (may or may not contain HTML)

    Returns:
        Cleaned text without HTML tags or entities; "" for empty/None input

    Examples:
        >>> strip_html_if_needed("Plain text")
        'Plain text'

        >>> strip_html_if_needed("<b>Bold</b> text")
        'Bold text'

        >>> strip_html_if_needed("AT&amp;T announces...")
        'AT&T announces...'
    """
    if not text:
        return ""

    # Cheap substring scan first, so plain text never reaches the regexes.
    looks_like_markup = any(marker in text for marker in ('<', '>', '&'))
    if not looks_like_markup:
        return text.strip()

    # Tags are dropped before entities are decoded, so an escaped "&lt;b&gt;"
    # survives as literal text instead of being removed as a tag.
    without_tags = re.sub(r'<[^>]+>', '', text)
    decoded = unescape(without_tags)

    # Collapse any whitespace run and trim the ends.
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()


def detect_html(text: str) -> bool:
    """
    Cheap heuristic check for HTML markup in text.

    This is a plain substring test, not a parser: any occurrence of '<' or
    '>' counts, so a lone angle bracket (e.g. "a > b") is reported as HTML.
    Entities such as "&amp;" are not considered.

    Args:
        text: Text to check

    Returns:
        True if the text contains '<' or '>', False otherwise (including
        for empty or None input)
    """
    if text:
        return any(bracket in text for bracket in ('<', '>'))

    return False


def truncate_text(text: str, max_length: int = 200, suffix: str = "...") -> str:
    """
    Safely truncate text to maximum length.

    Text that already fits is returned unchanged. Otherwise the text is cut
    so that the kept part plus the suffix is at most ``max_length``
    characters; whitespace left at the edges of the kept part is stripped
    before the suffix is appended, so the result may be slightly shorter
    than ``max_length``.

    If ``max_length`` is smaller than the suffix itself, the suffix cannot
    fit, so the text is hard-cut to ``max_length`` characters with no suffix
    (a non-positive ``max_length`` yields an empty string).

    Args:
        text: Text to truncate
        max_length: Maximum length of the result (default: 200)
        suffix: Suffix to add if truncated (default: "...")

    Returns:
        The original text if it is empty/None or short enough, otherwise
        the truncated text, never longer than ``max_length``
    """
    if not text or len(text) <= max_length:
        return text

    keep = max_length - len(suffix)
    if keep < 0:
        # A negative slice bound would count from the END of the string and
        # return almost the whole text plus the suffix, exceeding the limit.
        # Hard-cut instead, clamping so a negative max_length gives "".
        return text[:max(max_length, 0)]

    return text[:keep].strip() + suffix


def normalize_url(url: str) -> str:
    """
    Produce a canonical form of a URL for deduplication.

    Steps, applied in this order:
    - Strips surrounding whitespace
    - Removes every trailing slash
    - Converts the whole string (scheme, host and path alike) to lowercase

    Args:
        url: URL to normalize

    Returns:
        Normalized URL, or "" when the input is empty or None
    """
    if url:
        trimmed = url.strip()
        without_trailing_slash = trimmed.rstrip('/')
        return without_trailing_slash.lower()

    return ""


def extract_domain(url: str) -> str:
    """
    Extract domain from URL.

    The URL is trimmed and lower-cased, a leading ``http://`` or
    ``https://`` scheme is removed, everything from the first path, query
    or fragment delimiter ('/', '?', '#') onward is dropped, and a single
    leading ``www.`` is stripped. A port, if present, is kept as part of
    the result (e.g. "example.com:8080").

    Args:
        url: Full URL (a bare host such as "example.com/page" also works)

    Returns:
        Domain name in lowercase (e.g., "techcrunch.com"), or "" when the
        input is empty or None
    """
    if not url:
        return ""

    # Lower-case up front so the scheme and "www." checks below are
    # case-insensitive ("HTTPS://WWW.Example.com" is handled correctly).
    host = url.strip().lower()

    # Remove protocol
    host = re.sub(r'^https?://', '', host)

    # Remove path, query string and fragment - a URL may carry "?..." or
    # "#..." directly after the host with no "/" in between.
    host = re.split(r'[/?#]', host, maxsplit=1)[0]

    # Remove only a *leading* "www." so hosts that merely contain the
    # substring (e.g. "awww.example.com") are left intact.
    if host.startswith('www.'):
        host = host[len('www.'):]

    return host


def comma_separated_to_list(text: str) -> list:
    """
    Split a comma-separated string into a list of trimmed entries.

    Each entry has surrounding whitespace removed; entries that are empty
    after trimming (e.g. from ",," or ", ,") are left out.

    Args:
        text: Comma-separated string (e.g., "AI,Tech,Cloud")

    Returns:
        List of strings (e.g., ["AI", "Tech", "Cloud"]); an empty list when
        the input is empty or None
    """
    entries = []

    if text:
        for chunk in text.split(','):
            cleaned = chunk.strip()
            if cleaned:
                entries.append(cleaned)

    return entries


def list_to_comma_separated(items: list) -> str:
    """
    Convert list to comma-separated string.

    Each item is converted with ``str()`` and trimmed of surrounding
    whitespace. Falsy items (None, "", 0, ...) are skipped, and so are items
    that are blank after trimming, so the output never contains empty
    fields such as "a,,b".

    Args:
        items: List of strings

    Returns:
        Comma-separated string with no spaces around the commas, or "" when
        the list is empty or None
    """
    if not items:
        return ""

    trimmed = (str(item).strip() for item in items if item)

    # Filter again AFTER stripping: a whitespace-only item is truthy but
    # strips to "", which would otherwise produce an empty field.
    return ",".join(part for part in trimmed if part)