|
|
|
|
|
"""Text Processing Plugin""" |
|
|
import re |
|
|
from typing import List |
|
|
|
|
|
class TextProcessor:
    """Clean text and extract e-mail addresses, URLs and word tokens from it.

    None of the methods read or modify instance state.
    """

    def clean_text(self, text: str) -> str:
        """Normalize the whitespace in *text*.

        Every run of whitespace (spaces, tabs, newlines, ...) is collapsed
        into a single space, and leading/trailing whitespace is removed.
        Non-whitespace characters are left untouched.

        Args:
            text: The string to normalize.

        Returns:
            The normalized string; empty if *text* is empty or all whitespace.
        """
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    def extract_emails(self, text: str) -> List[str]:
        """Return every e-mail-like substring found in *text*.

        A match is a local part made of ASCII letters, digits and ``._%+-``,
        an ``@``, a domain made of ASCII letters, digits, dots and hyphens,
        and a final dot followed by at least two ASCII letters.

        Args:
            text: The string to search.

        Returns:
            The non-overlapping matches in order of appearance; an empty
            list when nothing matches.
        """
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        return re.findall(email_pattern, text)

    def extract_urls(self, text: str) -> List[str]:
        """Return every ``http://`` or ``https://`` URL found in *text*.

        Args:
            text: The string to search.

        Returns:
            The non-overlapping matches in order of appearance; an empty
            list when nothing matches.
        """
        # A match runs up to the next whitespace character, so punctuation
        # written directly after a URL (e.g. a closing "." or ")") is kept
        # as part of the returned string.
        url_pattern = r'https?://[^\s]+'
        return re.findall(url_pattern, text)

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into lowercase, whitespace-separated tokens.

        Punctuation is not stripped: it stays attached to the neighbouring
        word.

        Args:
            text: The string to tokenize.

        Returns:
            The lowercased tokens; an empty list if *text* is empty or all
            whitespace.
        """
        lowered = text.lower()
        return lowered.split()
|
|
|