Spaces:
Sleeping
Sleeping
| import nltk, json | |
| from nltk.stem import PorterStemmer | |
| from nltk.corpus import stopwords | |
| # from app.main import get_metadata | |
| from config import config | |
class Preprocessor:
    """Lower-case, stop-word-filter and stem raw text documents.

    The English stop-word set and the Porter stemmer are created on first
    use and then reused for every document handled by the same instance.
    """

    # Per-instance caches, filled lazily so that constructing a Preprocessor
    # does not touch NLTK data or the network.
    _stop_words = None
    _stemmer = None

    def _tokenize(self, text):
        """Return the lower-cased tokens of *text*.

        Tokens are separated by any run of whitespace, so repeated spaces,
        tabs and newlines never produce empty tokens or stay attached to a
        word. An empty or all-whitespace string yields an empty list.
        """
        return text.lower().split()

    def _get_stop_words(self):
        """Return the cached set of English stop words, loading it once."""
        if self._stop_words is None:
            try:
                words = stopwords.words('english')
            except LookupError:
                # Corpus not installed locally: fetch it once, then retry.
                nltk.download('stopwords')
                words = stopwords.words('english')
            # A set gives O(1) membership tests while filtering tokens.
            self._stop_words = set(words)
        return self._stop_words

    def _get_stemmer(self):
        """Return the cached PorterStemmer, creating it once."""
        if self._stemmer is None:
            self._stemmer = PorterStemmer()
        return self._stemmer

    def preprocessing_text(self, doc):
        """Return *doc* tokenized, stripped of English stop words and stemmed.

        Args:
            doc: The raw text of one document.

        Returns:
            The stemmed, non-stop-word tokens joined by single spaces.
        """
        stop_words = self._get_stop_words()
        stemmer = self._get_stemmer()
        stemmed_tokens = [
            stemmer.stem(token)
            for token in self._tokenize(doc)
            if token not in stop_words
        ]
        return " ".join(stemmed_tokens)

    def _save(self, docs):
        """Write *docs* as indented JSON to ``../preprocessing_text.json``.

        The path is relative to the current working directory.
        """
        with open("../preprocessing_text.json", "w", encoding="utf-8") as f:
            json.dump(docs, f, indent=4)

    def preprocessing(self, docs):
        """Preprocess every document in *docs* and save the result to disk.

        Args:
            docs: An iterable of raw text documents.

        Returns:
            The list of preprocessed documents, in input order (the same
            list that is written to disk).
        """
        preprocessed_docs = [self.preprocessing_text(doc) for doc in docs]
        self._save(preprocessed_docs)
        return preprocessed_docs
if __name__ == '__main__':
    # get_metadata is not imported at module level (that import is commented
    # out), so it is imported here, where it only runs in script mode.
    # NOTE(review): module path taken from the commented-out import; confirm
    # that app.main is importable from where this script is launched.
    from app.main import get_metadata

    # get_metadata returns a pair; only the first element (the texts) is used.
    texts, _ = get_metadata(config.PATH_METADATA)
    preprocessor = Preprocessor()
    preprocessor.preprocessing(texts)