"""
Utility Functions for Segmento Pulse
Provides common helpers for text processing, HTML cleaning, and data transformation
"""
import re
from html import unescape
def strip_html_if_needed(text: str) -> str:
    """
    Intelligently strip HTML only if HTML markup is detected.

    This optimization avoids unnecessary regex processing when text is
    already clean. RSS feeds can return either plain text or HTML - we
    handle both efficiently.

    Args:
        text: Input text (may or may not contain HTML)

    Returns:
        Cleaned text without HTML tags or entities

    Examples:
        >>> strip_html_if_needed("Plain text")
        'Plain text'
        >>> strip_html_if_needed("<b>Bold</b> text")
        'Bold text'
        >>> strip_html_if_needed("AT&amp;T announces...")
        'AT&T announces...'
    """
    if not text:
        return ""
    # Quick check: does this text contain HTML tags or entities?
    # This avoids expensive regex work on plain text.
    if '<' not in text and '>' not in text and '&' not in text:
        return text.strip()  # Already clean!
    # HTML detected - perform full cleanup.
    # Step 1: Remove HTML tags.
    text = re.sub(r'<[^>]+>', '', text)
    # Step 2: Decode HTML entities (&amp; -> &, &lt; -> <, etc.).
    text = unescape(text)
    # Step 3: Collapse runs of whitespace into single spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def detect_html(text: str) -> bool:
    """
    Quickly detect if text contains HTML markup.

    Matches an actual tag pattern (e.g. ``<b>``, ``</p>``) rather than
    bare ``<`` / ``>`` characters, so plain prose like "5 > 3" is not
    misreported as HTML.

    Args:
        text: Text to check

    Returns:
        True if HTML tags detected, False otherwise
    """
    if not text:
        return False
    return re.search(r'<[^>]+>', text) is not None
def truncate_text(text: str, max_length: int = 200, suffix: str = "...") -> str:
    """
    Safely truncate text to a maximum length.

    The suffix counts toward the length budget, so the result is never
    longer than ``max_length``.

    Args:
        text: Text to truncate
        max_length: Maximum length of the result (default: 200)
        suffix: Suffix to append if truncation occurs (default: "...")

    Returns:
        Truncated text (the original text if it already fits)
    """
    if not text or len(text) <= max_length:
        return text
    if max_length <= len(suffix):
        # No room for the suffix - hard-cut so the result still
        # respects max_length instead of slicing with a negative index.
        return text[:max_length]
    return text[:max_length - len(suffix)].strip() + suffix
def normalize_url(url: str) -> str:
    """
    Normalize a URL for deduplication purposes.

    - Strips surrounding whitespace
    - Removes trailing slashes
    - Lowercases the result

    Args:
        url: URL to normalize

    Returns:
        Normalized URL (empty string for empty input)
    """
    if not url:
        return ""
    cleaned = url.strip()
    cleaned = cleaned.rstrip('/')
    return cleaned.lower()
def extract_domain(url: str) -> str:
    """
    Extract the domain name from a URL.

    Args:
        url: Full URL (e.g., "https://www.techcrunch.com/2024/article")

    Returns:
        Domain name (e.g., "techcrunch.com"); empty string for empty input
    """
    if not url:
        return ""
    # Remove the protocol prefix, if present (re is imported at module level).
    domain = re.sub(r'^https?://', '', url)
    # Keep only the host portion (drop any path/query).
    domain = domain.split('/')[0]
    # Lowercase first so an uppercase "WWW." prefix is handled too.
    domain = domain.lower()
    # Strip only a LEADING "www." - a blanket replace() would corrupt
    # hosts that merely contain "www." elsewhere (e.g. "awww.example.com").
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain
def comma_separated_to_list(text: str) -> list:
    """
    Convert a comma-separated string into a list of trimmed items.

    Blank entries (e.g. from "a,,b" or a trailing comma) are dropped.

    Args:
        text: Comma-separated string (e.g., "AI,Tech,Cloud")

    Returns:
        List of strings (e.g., ["AI", "Tech", "Cloud"])
    """
    if not text:
        return []
    result = []
    for chunk in text.split(','):
        stripped = chunk.strip()
        if stripped:
            result.append(stripped)
    return result
def list_to_comma_separated(items: list) -> str:
    """
    Convert a list into a comma-separated string.

    Falsy items are skipped; each remaining item is stringified and
    trimmed before joining.

    Args:
        items: List of strings (or stringifiable values)

    Returns:
        Comma-separated string (empty string for an empty/falsy input)
    """
    if not items:
        return ""
    parts = [str(entry).strip() for entry in items if entry]
    return ",".join(parts)
|