Spaces:
Sleeping
Sleeping
File size: 2,312 Bytes
46917c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import re
from collections import Counter
from uuid import uuid4

import spacy
# Load the English model
# The pipeline is loaded once, at import time, and shared by every helper in
# this module that calls nlp(...).
nlp = spacy.load("en_core_web_md")
# Patterns removed by TextCleaner.remove_emails_links, which iterates this
# dict and applies each pattern in insertion order (email, then phone, then
# link).
REGEX_PATTERNS = {
    # local-part@domain.tld, where the final label is at least two letters.
    "email_pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    # Ten digits grouped 3-3-4; the first group may be wrapped in parentheses
    # and each group may be followed by one "-", "." or whitespace character.
    "phone_pattern": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
    # Anything starting with http://, https:// or www. followed by a run of
    # non-whitespace characters.
    "link_pattern": r"\b(?:https?://|www\.)\S+\b",
}
def generate_unique_id():
    """
    Create a fresh random identifier.

    Returns:
        str: The textual form of a newly generated version-4 UUID
            (36 characters, hyphen-separated).
    """
    new_uuid = uuid4()
    return str(new_uuid)
class TextCleaner:
    """
    Stateless helpers for stripping unwanted content from a text.

    Every method is a static method, so it can be called on the class
    (``TextCleaner.clean_text(text)``) as well as on an instance.
    """

    @staticmethod
    def _drop_tokens(doc, should_drop):
        """
        Rebuild the text of a parsed document without the selected tokens.

        Args:
            doc: The spaCy document whose tokens are iterated.
            should_drop (callable): Predicate called with each token; a
                truthy result removes that token from the output.
        Returns:
            str: The document text with the selected tokens removed.
        """
        # Work token by token instead of calling str.replace on the whole
        # text: a global replace of a short token such as "a" or "." would
        # also delete that substring inside other words and numbers.
        # A dropped token still contributes its trailing whitespace so the
        # neighbouring words do not get glued together.
        return "".join(
            token.whitespace_ if should_drop(token) else token.text_with_ws
            for token in doc
        )

    @staticmethod
    def remove_emails_links(text: str) -> str:
        """
        Remove every match of the patterns in ``REGEX_PATTERNS``.

        Despite the name, this also strips phone numbers, because
        ``REGEX_PATTERNS`` contains a phone pattern next to the email and
        link patterns.

        Args:
            text (str): The input text to clean.
        Returns:
            str: The text with all pattern matches deleted.
        """
        # Patterns are applied one after another, in the dict's own order.
        for regex in REGEX_PATTERNS.values():
            text = re.sub(regex, "", text)
        return text

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Remove emails, phone numbers, links and punctuation tokens.

        Args:
            text (str): The input text to clean.
        Returns:
            str: The cleaned text.
        """
        text = TextCleaner.remove_emails_links(text)
        # Only tokens the pipeline tags as punctuation are removed.
        return TextCleaner._drop_tokens(
            nlp(text), lambda token: token.pos_ == "PUNCT"
        )

    @staticmethod
    def remove_stopwords(text: str) -> str:
        """
        Remove the tokens that the spaCy pipeline flags as stopwords.

        Args:
            text (str): The input text to clean.
        Returns:
            str: The text without its stopword tokens. The whitespace that
                followed each removed stopword is kept.
        """
        return TextCleaner._drop_tokens(nlp(text), lambda token: token.is_stop)
class CountFrequency:
    """
    Part-of-speech frequency counter for a single text.

    The text is parsed once, on construction, with the module-level ``nlp``
    pipeline; the parsed document is kept on the instance.
    """

    def __init__(self, text):
        """
        Parse the text and store both the raw text and the parsed document.

        Args:
            text (str): The text to analyse.
        """
        self.text = text
        self.doc = nlp(text)

    def count_frequency(self):
        """
        Count how often each part-of-speech tag occurs in the text.

        Note that this counts ``token.pos_`` values (the tags), not the
        words themselves.

        Returns:
            dict: A dictionary with the part-of-speech tags as keys and the
                number of tokens carrying each tag as values, ordered by
                first appearance in the document.
        """
        # Converted back to a plain dict so callers get the same return type
        # as before.
        return dict(Counter(token.pos_ for token in self.doc))
|