| | """ |
| | String processing pipeline functions for testing function analysis. |
| | """ |
| |
|
| | import re |
| | from typing import List |
| |
|
| |
|
def normalize_whitespace(text):
    """Collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: Input string; may contain tabs, newlines, and repeated spaces.

    Returns:
        The cleaned string with single spaces and no leading/trailing blanks.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
| |
|
| |
|
def remove_special_characters(text, keep_chars=""):
    """Strip everything except letters, digits, whitespace, and *keep_chars*.

    Args:
        text: Input string.
        keep_chars: Extra characters to preserve (escaped before use, so
            regex metacharacters are safe to pass).

    Returns:
        The filtered string.
    """
    allowed = re.escape(keep_chars)
    cleaner = re.compile(fr"[^a-zA-Z0-9\s{allowed}]")
    return cleaner.sub('', text)
| |
|
| |
|
def convert_to_lowercase(text):
    """Return *text* with every cased character lowered."""
    lowered = text.lower()
    return lowered
| |
|
| |
|
def remove_stopwords(text, stopwords=None):
    """Drop common English stopwords from whitespace-split *text*.

    Matching is case-insensitive (each word is lowered before the lookup),
    but surviving words keep their original casing.

    Args:
        text: Input string.
        stopwords: Optional collection of lowercase words to remove;
            defaults to a built-in set of common English function words.

    Returns:
        The remaining words joined by single spaces.
    """
    if stopwords is None:
        stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must'
        }

    kept = (token for token in text.split() if token.lower() not in stopwords)
    return ' '.join(kept)
| |
|
| |
|
def extract_keywords(text, min_length=3):
    """Extract keywords: words with at least *min_length* characters.

    Note: the comparison is inclusive (``len(word) >= min_length``), so a
    word exactly ``min_length`` characters long is kept.

    Args:
        text: Input string, split on whitespace.
        min_length: Minimum word length to qualify as a keyword.

    Returns:
        List of qualifying words, in their original order and casing.
    """
    words = text.split()
    keywords = [word for word in words if len(word) >= min_length]
    return keywords
| |
|
| |
|
def count_word_frequency(text):
    """Tally how many times each whitespace-separated word occurs.

    Counting is case-sensitive ("Cat" and "cat" are distinct keys).

    Args:
        text: Input string.

    Returns:
        Dict mapping each word to its occurrence count.
    """
    counts = {}
    for token in text.split():
        counts.setdefault(token, 0)
        counts[token] += 1
    return counts
| |
|
| |
|
def capitalize_words(text, exceptions=None):
    """Title-case *text*, leaving small connector words lowercase.

    The first word is always capitalized; later words found in
    *exceptions* (case-insensitive match) are forced to lowercase.

    Args:
        text: Input string, split on whitespace.
        exceptions: Optional set of lowercase words to leave uncapitalized;
            defaults to common English articles/prepositions/conjunctions.

    Returns:
        The re-joined string.
    """
    if exceptions is None:
        exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

    styled = [
        word.capitalize() if idx == 0 or word.lower() not in exceptions
        else word.lower()
        for idx, word in enumerate(text.split())
    ]
    return ' '.join(styled)
| |
|
| |
|
def truncate_text(text, max_length=100, suffix="..."):
    """Truncate *text* to at most *max_length* characters, appending *suffix*.

    If the text already fits, it is returned unchanged. Otherwise it is cut
    so that the result (including the suffix) is *max_length* characters,
    preferring to break at the last space when that space falls late enough
    in the string (past 80% of *max_length*) to avoid chopping off most of
    the content.

    Fix: the slice bound is clamped at 0. Previously, when
    ``max_length <= len(suffix)`` the expression ``max_length - len(suffix)``
    went negative and sliced from the *end* of the string, producing output
    much longer than ``max_length``.

    Args:
        text: Input string.
        max_length: Maximum length of the returned string (suffix included).
        suffix: Marker appended to truncated output.

    Returns:
        The original or truncated string.
    """
    if len(text) <= max_length:
        return text

    # Clamp at 0 so a tiny max_length never yields a negative slice index.
    truncated = text[:max(0, max_length - len(suffix))]

    # Prefer a word boundary, but only when the last space is late enough
    # that we keep most of the allowed length.
    last_space = truncated.rfind(' ')
    if last_space > max_length * 0.8:
        truncated = truncated[:last_space]

    return truncated + suffix
| |
|
| |
|
| | def text_processing_pipeline(text, operations=None): |
| | """Process text through a pipeline of operations.""" |
| | if operations is None: |
| | operations = [ |
| | 'normalize_whitespace', |
| | 'remove_special_characters', |
| | 'convert_to_lowercase', |
| | 'remove_stopwords' |
| | ] |
| | |
| | |
| | operation_map = { |
| | 'normalize_whitespace': normalize_whitespace, |
| | 'remove_special_characters': remove_special_characters, |
| | 'convert_to_lowercase': convert_to_lowercase, |
| | 'remove_stopwords': remove_stopwords, |
| | 'capitalize_words': capitalize_words, |
| | 'truncate_text': truncate_text |
| | } |
| | |
| | result = text |
| | processing_steps = [] |
| | |
| | for operation in operations: |
| | if operation in operation_map: |
| | before = result |
| | result = operation_map[operation](result) |
| | processing_steps.append({ |
| | 'operation': operation, |
| | 'before': before[:50] + "..." if len(before) > 50 else before, |
| | 'after': result[:50] + "..." if len(result) > 50 else result |
| | }) |
| | |
| | return result, processing_steps |
| |
|
| |
|
def analyze_text_statistics(text):
    """Compute simple descriptive statistics for *text*.

    Sentences are counted as runs of terminal punctuation (. ! ?), so
    "Wow!!!" counts as one sentence. Word metrics fall back to 0 / ""
    when the text contains no words.

    Args:
        text: Input string.

    Returns:
        Dict with character_count, word_count, sentence_count,
        average_word_length, longest_word, and shortest_word.
    """
    words = text.split()
    has_words = bool(words)

    return {
        'character_count': len(text),
        'word_count': len(words),
        'sentence_count': len(re.findall(r'[.!?]+', text)),
        'average_word_length': (sum(map(len, words)) / len(words)) if has_words else 0,
        'longest_word': max(words, key=len) if has_words else "",
        'shortest_word': min(words, key=len) if has_words else ""
    }
| |
|
| |
|
if __name__ == "__main__":
    # Demo: run the default pipeline over deliberately messy input and
    # print each intermediate step plus summary statistics.
    sample_text = """
    This is a SAMPLE text with various formatting issues!!!
    It has multiple spaces, special @#$% characters, and
    needs some serious cleaning & processing...
    """

    print("Original text:")
    print(repr(sample_text))

    processed_text, steps = text_processing_pipeline(sample_text)

    print("\nProcessing steps:")
    for step in steps:
        print(f"After {step['operation']}:")
        print(f"  {step['after']}")

    print(f"\nFinal result: {processed_text}")

    stats = analyze_text_statistics(processed_text)
    print(f"\nText statistics: {stats}")
| |
|