"""

Helper utility functions

"""
import json
import logging
import re
from datetime import datetime
from typing import Any, Dict, List

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def clean_text(text: str) -> str:
    """

    Clean and normalize text by removing extra whitespace, special characters, etc.

    

    Args:

        text: Raw text to clean

        

    Returns:

        Cleaned text string

    """
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?;:\-\'\"()]', '', text)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text
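
# Illustrative usage of clean_text (hypothetical input, doctest-style sketch):
#   >>> clean_text("  Hello,\n\tworld! ©2024  ")
#   'Hello, world! 2024'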


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """

    Split text into overlapping chunks for processing.

    

    Args:

        text: Text to chunk

        chunk_size: Size of each chunk in characters

        overlap: Overlap between chunks

        

    Returns:

        List of text chunks

    """
    if overlap >= chunk_size:
        # An overlap >= chunk_size would stop `start` from advancing,
        # looping forever.
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            break  # Final chunk reached; avoid a redundant trailing slice.
        start = end - overlap

    return chunks
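
# Illustrative usage of chunk_text (hypothetical input; consecutive chunks
# share `overlap` characters):
#   >>> chunk_text("abcdefghij", chunk_size=4, overlap=1)
#   ['abcd', 'defg', 'ghij']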


def summarize_text(text: str, max_length: int = 500) -> str:
    """

    Create a simple extractive summary by taking the first sentences.

    

    Args:

        text: Text to summarize

        max_length: Maximum length of summary

        

    Returns:

        Summarized text

    """
    sentences = re.split(r'[.!?]+', text)
    summary = ""
    
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(summary) + len(sentence) + 2 <= max_length:  # +2 for ". "
            summary += sentence + ". "
        else:
            break
    
    # If no sentences fit, return truncated text
    if not summary and text:
        summary = text[:max_length].rsplit(' ', 1)[0] + "..."
            
    return summary.strip()
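
# Illustrative usage of summarize_text (hypothetical input): only whole
# sentences that fit within max_length are kept.
#   >>> summarize_text("First point. Second point. Third point.", max_length=25)
#   'First point.'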


def extract_keywords(text: str, top_n: int = 10) -> List[str]:
    """

    Extract top keywords from text using simple frequency analysis.

    

    Args:

        text: Text to analyze

        top_n: Number of top keywords to return

        

    Returns:

        List of keywords

    """
    # Simple word frequency approach
    words = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
    
    # Remove common stop words
    stop_words = {'that', 'this', 'with', 'from', 'have', 'been', 'were', 
                  'will', 'would', 'could', 'should', 'about', 'their', 'there'}
    words = [w for w in words if w not in stop_words]
    
    # Count frequency
    word_freq: Dict[str, int] = {}
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1
    
    # Sort by frequency and return top N
    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    return [word for word, freq in sorted_words[:top_n]]
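
# Illustrative usage of extract_keywords (hypothetical input; ties keep
# first-seen order because sorted() is stable):
#   >>> extract_keywords("search engines rank pages; search engines crawl pages", top_n=2)
#   ['search', 'engines']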


def validate_url(url: str) -> bool:
    """

    Validate if a string is a proper URL.

    

    Args:

        url: URL string to validate

        

    Returns:

        True if valid URL, False otherwise

    """
    url_pattern = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return url_pattern.match(url) is not None
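
# Illustrative checks for validate_url (hypothetical inputs; only http and
# https schemes are accepted):
#   >>> validate_url("https://example.com/path?q=1")
#   True
#   >>> validate_url("ftp://example.com")
#   False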


def format_timestamp() -> str:
    """

    Get current timestamp in ISO format.

    

    Returns:

        ISO formatted timestamp string

    """
    return datetime.now().isoformat()


def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """

    Safely divide two numbers, returning default if denominator is zero.

    

    Args:

        numerator: Numerator value

        denominator: Denominator value

        default: Default value if division by zero

        

    Returns:

        Division result or default

    """
    try:
        return numerator / denominator if denominator != 0 else default
    except (TypeError, ZeroDivisionError):
        return default
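
# Illustrative usage of safe_divide (hypothetical values):
#   >>> safe_divide(10, 4)
#   2.5
#   >>> safe_divide(1, 0)
#   0.0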


def parse_json_safe(json_str: str) -> Dict[str, Any]:
    """

    Safely parse JSON string with error handling.

    

    Args:

        json_str: JSON string to parse

        

    Returns:

        Parsed dictionary or empty dict on error

    """
    import json
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        logger.error(f"JSON parse error: {e}")
        return {}
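

if __name__ == "__main__":
    # Minimal smoke test for the helpers above (illustrative only; the inputs
    # are hypothetical and this block is not part of the module's API).
    sample = "  Hello,\n\tworld!  This is a small   sample text. It has sentences.  "
    cleaned = clean_text(sample)
    print(cleaned)
    print(chunk_text(cleaned, chunk_size=20, overlap=5))
    print(summarize_text(cleaned, max_length=30))
    print(extract_keywords(cleaned, top_n=3))
    print(validate_url("https://example.com/docs"))
    print(format_timestamp())
    print(safe_divide(1, 0))
    print(parse_json_safe('{"status": "ok"}'))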