"""
Helper utility functions
"""
import json
import logging
import re
from collections import Counter
from datetime import datetime
from typing import Any, Dict, List
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def clean_text(text: str) -> str:
    """
    Normalize raw text: collapse whitespace runs, drop unusual symbols,
    and trim the ends.

    Args:
        text: Raw text to clean

    Returns:
        Cleaned text string (may be empty)
    """
    # Collapse any run of whitespace (tabs, newlines, ...) to a single space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Keep word characters, whitespace and common punctuation; drop the rest.
    filtered = re.sub(r'[^\w\s.,!?;:\-\'\"()]', '', collapsed)
    return filtered.strip()
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping chunks for processing.

    Args:
        text: Text to chunk
        chunk_size: Size of each chunk in characters (must be positive)
        overlap: Overlap between consecutive chunks; must satisfy
            0 <= overlap < chunk_size

    Returns:
        List of text chunks; empty list for empty text. The final chunk may
        be shorter than chunk_size.

    Raises:
        ValueError: If chunk_size <= 0 or overlap is not in [0, chunk_size).
            Without this guard the original loop never advanced `start`
            when overlap >= chunk_size and spun forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    # Each chunk begins `chunk_size - overlap` characters after the previous
    # one — exactly the progression the original while-loop produced.
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
def summarize_text(text: str, max_length: int = 500) -> str:
    """
    Build a simple extractive summary from the leading sentences of *text*.

    Sentences are accumulated (each re-terminated with ". ") until the next
    one would push the summary past max_length; accumulation then stops.

    Args:
        text: Text to summarize
        max_length: Maximum length of summary in characters

    Returns:
        Summarized text, or a word-boundary truncation of the input
        (suffixed with "...") when not even the first sentence fits.
    """
    pieces = []
    used = 0
    for raw in re.split(r'[.!?]+', text):
        candidate = raw.strip()
        if not candidate:
            continue
        # +2 accounts for the ". " appended to every kept sentence.
        if used + len(candidate) + 2 > max_length:
            break
        pieces.append(candidate + ". ")
        used += len(candidate) + 2
    summary = "".join(pieces)
    # Fallback: nothing fit, so truncate at the last word boundary instead.
    if not summary and text:
        summary = text[:max_length].rsplit(' ', 1)[0] + "..."
    return summary.strip()
def extract_keywords(text: str, top_n: int = 10) -> List[str]:
    """
    Extract top keywords from text using simple frequency analysis.

    Only ASCII-alphabetic words of 4+ letters are considered, lowercased,
    with a small built-in stop-word list removed.

    Args:
        text: Text to analyze
        top_n: Number of top keywords to return

    Returns:
        List of keywords ordered by descending frequency (ties keep
        first-occurrence order, matching the original stable sort).
    """
    words = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
    # Remove common stop words
    stop_words = {'that', 'this', 'with', 'from', 'have', 'been', 'were',
                  'will', 'would', 'could', 'should', 'about', 'their', 'there'}
    # Counter replaces the hand-rolled dict-increment loop; most_common()
    # is the stdlib equivalent of sorting items by count, descending.
    counts = Counter(w for w in words if w not in stop_words)
    return [word for word, _ in counts.most_common(top_n)]
def validate_url(url: str) -> bool:
    """
    Check whether *url* looks like a well-formed HTTP(S) URL.

    Accepts http/https schemes with a dotted domain, `localhost`, or a
    dotted-quad IP, an optional port, and an optional path/query.

    Args:
        url: URL string to validate

    Returns:
        True if the string matches the URL pattern, False otherwise
    """
    pattern = re.compile(
        r'^https?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain...
        r'localhost|' # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return bool(pattern.match(url))
def format_timestamp() -> str:
    """
    Get the current timestamp as an ISO-8601 string.

    Note: uses a naive local datetime — no timezone offset is included.

    Returns:
        ISO formatted timestamp string, e.g. "2024-01-31T12:34:56.789012"
    """
    now = datetime.now()
    return now.isoformat()
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Divide two numbers, falling back to *default* on zero or bad operands.

    Args:
        numerator: Numerator value
        denominator: Denominator value
        default: Value returned on division by zero or a type error

    Returns:
        numerator / denominator, or *default* when the division is not possible
    """
    try:
        # Guard clause: explicit zero check before dividing.
        if denominator == 0:
            return default
        return numerator / denominator
    except (TypeError, ZeroDivisionError):
        # Non-numeric operands (TypeError) or exotic zero-like values.
        return default
def parse_json_safe(json_str: str) -> Dict[str, Any]:
    """
    Safely parse a JSON string with error handling.

    Args:
        json_str: JSON string to parse

    Returns:
        The parsed value, or an empty dict on error.
        NOTE(review): valid non-object JSON (e.g. "[1, 2]") is returned
        as-is, which does not match the Dict annotation — confirm whether
        callers rely on that pass-through before tightening it.
    """
    try:
        return json.loads(json_str)
    except (TypeError, ValueError) as e:
        # JSONDecodeError is a ValueError subclass; TypeError covers
        # non-string input such as None, which json.loads raises for —
        # previously that escaped this "safe" wrapper.
        logger.error("JSON parse error: %s", e)
        return {}