Spaces:
Sleeping
Sleeping
import re
from collections import Counter
from uuid import uuid4

import spacy
| # Load the English model | |
| nlp = spacy.load("en_core_web_md") | |
| REGEX_PATTERNS = { | |
| "email_pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", | |
| "phone_pattern": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", | |
| "link_pattern": r"\b(?:https?://|www\.)\S+\b", | |
| } | |
| def generate_unique_id(): | |
| """ | |
| Generate a unique ID and return it as a string. | |
| Returns: | |
| str: A string with a unique ID. | |
| """ | |
| return str(uuid4()) | |
| class TextCleaner: | |
| """ | |
| A class for cleaning a text by removing specific patterns. | |
| """ | |
| def remove_emails_links(text): | |
| """ | |
| Clean the input text by removing specific patterns. | |
| Args: | |
| text (str): The input text to clean. | |
| Returns: | |
| str: The cleaned text. | |
| """ | |
| for pattern in REGEX_PATTERNS: | |
| text = re.sub(REGEX_PATTERNS[pattern], "", text) | |
| return text | |
| def clean_text(text): | |
| """ | |
| Clean the input text by removing specific patterns. | |
| Args: | |
| text (str): The input text to clean. | |
| Returns: | |
| str: The cleaned text. | |
| """ | |
| text = TextCleaner.remove_emails_links(text) | |
| doc = nlp(text) | |
| for token in doc: | |
| if token.pos_ == "PUNCT": | |
| text = text.replace(token.text, "") | |
| return str(text) | |
| def remove_stopwords(text): | |
| """ | |
| Clean the input text by removing stopwords. | |
| Args: | |
| text (str): The input text to clean. | |
| Returns: | |
| str: The cleaned text. | |
| """ | |
| doc = nlp(text) | |
| for token in doc: | |
| if token.is_stop: | |
| text = text.replace(token.text, "") | |
| return text | |
| class CountFrequency: | |
| def __init__(self, text): | |
| self.text = text | |
| self.doc = nlp(text) | |
| def count_frequency(self): | |
| """ | |
| Count the frequency of words in the input text. | |
| Returns: | |
| dict: A dictionary with the words as keys and the frequency as values. | |
| """ | |
| pos_freq = {} | |
| for token in self.doc: | |
| if token.pos_ in pos_freq: | |
| pos_freq[token.pos_] += 1 | |
| else: | |
| pos_freq[token.pos_] = 1 | |
| return pos_freq | |