Spaces:

meruem123
/

CCSS_Alignment

Sleeping

CCSS_Alignment / core /preprocessing_pipeline.py

Upload 15 files

34a1c85 verified 5 months ago

1.39 kB

	import re
	import pandas as pd
	import re
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	# English stop words, materialized once at import time as a set so that the
	# membership test in remove_stopwords is a hash lookup rather than a list scan.
	# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
	# downloaded before this module is imported — confirm for the deployment.
	stop_words = set(stopwords.words('english'))

	class preprocessing_pipeline:
	    """Text-normalization pipeline for a single string.

	    ``preprocess`` runs, in order: whitespace/quote clean-up, lower-casing,
	    punctuation removal, English stop-word removal and WordNet
	    lemmatization. Every step is also exposed as a standalone method that
	    takes a string and returns a string.

	    The stop-word and lemmatization steps rely on the module-level
	    ``stop_words`` set and on NLTK's ``word_tokenize`` and
	    ``WordNetLemmatizer``.
	    """

	    # Single-character substitutions applied by ``clean_text`` in one pass:
	    # newlines and non-breaking spaces become plain spaces, curly double
	    # quotes become straight quotes, and the en dash becomes a hyphen.
	    _CHAR_MAP = str.maketrans({
	        "\n": " ",
	        "\xa0": " ",
	        "“": "\"",
	        "”": "\"",
	        "–": "-",
	    })

	    # Matches any character that is neither a word character nor whitespace.
	    # Note that ``_`` counts as a word character and is therefore kept.
	    _PUNCTUATION = re.compile(r"[^\w\s]")

	    def __init__(self, text):
	        """Store the raw ``text`` that ``preprocess`` will normalize."""
	        self.text = text

	    def preprocess(self):
	        """Run every normalization step on ``self.text`` and return the result.

	        ``self.text`` is replaced by the processed string only once all steps
	        have succeeded, so a failure in a later step (for example a missing
	        NLTK resource) does not leave it half-processed.

	        Returns:
	            The normalized text; ``""`` when the input is empty, whitespace
	            only, ``None`` or NaN.
	        """
	        text = self.clean_text(self.text)
	        if not text:
	            # Nothing to tokenize: skip the NLTK steps, which would only
	            # produce an empty string anyway.
	            self.text = text
	            return text

	        steps = (
	            self.lowercase,
	            self.remove_punctuation,
	            self.remove_stopwords,
	            self.lemmatize_tokens,
	        )
	        for step in steps:
	            text = step(text)

	        self.text = text
	        return text

	    def clean_text(self, text: str) -> str:
	        """Trim ``text`` and normalize newlines, NBSPs, curly quotes and en dashes.

	        Missing values (``None`` or a float NaN, as produced for empty cells
	        by pandas) are treated as empty text instead of raising
	        ``AttributeError``.
	        """
	        # ``text != text`` is true only for NaN.
	        if text is None or (isinstance(text, float) and text != text):
	            return ""
	        return text.strip().translate(self._CHAR_MAP)

	    def lowercase(self, text: str) -> str:
	        """Return ``text`` converted to lower case."""
	        return text.lower()

	    def remove_punctuation(self, text: str) -> str:
	        """Delete every character that is not a word character or whitespace.

	        Characters are removed, not replaced by a space, so ``"a.b"`` becomes
	        ``"ab"``.
	        """
	        return self._PUNCTUATION.sub("", text)

	    def remove_stopwords(self, text: str) -> str:
	        """Tokenize ``text`` and drop tokens found in the ``stop_words`` set.

	        The comparison is case-sensitive, so ``text`` is expected to be
	        lower-cased already. The surviving tokens are joined with single
	        spaces.
	        """
	        tokens = word_tokenize(text)
	        return ' '.join([word for word in tokens if word not in stop_words])

	    def lemmatize_tokens(self, text: str) -> str:
	        """Tokenize ``text`` and lemmatize each token with WordNet.

	        ``WordNetLemmatizer.lemmatize`` is called with its default
	        part-of-speech argument. The lemmas are joined with single spaces.
	        """
	        tokens = word_tokenize(text)
	        lemmatizer = WordNetLemmatizer()
	        return ' '.join([lemmatizer.lemmatize(token) for token in tokens])