Spaces:
Runtime error
Runtime error
| import re | |
| from nltk.tokenize import RegexpTokenizer | |
| import spacy | |
def remove_patterns(text):
    """
    Remove emails, hashtags and punctuation from the given text.

    Args:
        text: A ``str``, or any object that exposes its string content
            through a ``.text`` attribute (e.g. a spaCy ``Span``, ``Doc``
            or ``Token``).

    Returns:
        The cleaned string. Newlines are replaced by single spaces. Removed
        hashtags and punctuation leave the surrounding whitespace untouched,
        so consecutive spaces may remain in the result.
    """
    # Duck-type on ``.text`` rather than checking for spaCy's Span only, so
    # that Doc/Token inputs are accepted as well. Objects without ``.text``
    # are passed through unchanged and rejected by ``re.sub`` below.
    if not isinstance(text, str):
        text = getattr(text, 'text', text)
    # Replace newline characters with spaces
    text = re.sub(r'\n', ' ', text)
    # Remove emails (must run before punctuation stripping, while '@' is
    # still present); also swallows one trailing whitespace character
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation: anything that is neither a word char nor whitespace
    text = re.sub(r'[^\w\s]', '', text)
    return text
def extract_patterns(text):
    """
    Collect the punctuation, emails and hashtags that occur in the text.

    Args:
        text: String to scan.

    Returns:
        A tuple ``(punctuation, emails, hashtags)`` of lists of matched
        substrings, each in order of appearance. Punctuation is matched
        character by character over the whole text, so the '@' and '#'
        characters inside emails and hashtags are reported there as well.
    """
    email_rx = re.compile(r'\S+@\S+')
    hashtag_rx = re.compile(r'#\w+')
    punct_rx = re.compile(r'[^\w\s]')

    found_emails = email_rx.findall(text)
    found_hashtags = hashtag_rx.findall(text)
    found_punct = punct_rx.findall(text)
    return found_punct, found_emails, found_hashtags
def remove_punct_nltk(text):
    """
    Remove punctuation from the text using NLTK's RegexpTokenizer.

    Args:
        text: String to clean.

    Returns:
        The word tokens of ``text`` (runs of word characters) joined by
        single spaces; punctuation and the original spacing are dropped.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # The tokenizer returns a new list and does not modify ``text``, so the
    # tokens must be joined and returned; previously the result was discarded
    # and the input came back unchanged.
    tokens = tokenizer.tokenize(text)
    return ' '.join(tokens)