Spaces:

Julseb42
/

nlp-project

Sleeping

nlp-project / utils /data_process.py

Test deploy

8e83170 12 months ago

953 Bytes

	import pandas as pd
	from scripts.get_dataset import csv_file
	import re

	# Read the CSV at ``csv_file`` (imported from ``scripts.get_dataset``) into a
	# module-level DataFrame. NOTE: this runs at import time, so importing this
	# module performs the file read.
	df = pd.read_csv(csv_file)


	def tokenize(df):
	    """Add a whitespace-split ``tokens`` column to ``df`` and return it.

	    Every value in the ``reviews.text`` column is converted to ``str`` and
	    split on runs of whitespace. The frame is modified in place and the
	    same object is returned.
	    """
	    review_strings = df["reviews.text"].astype(str)
	    token_lists = review_strings.str.split()
	    df["tokens"] = token_lists
	    return df


	def clean_text(text):
	    """Normalize a raw text string.

	    The text is lowercased, URLs are removed, every character other than
	    ``a``-``z`` and whitespace is dropped, and runs of whitespace are
	    collapsed to a single space with both ends trimmed.

	    Args:
	        text: The raw string to clean.

	    Returns:
	        The cleaned string; it may be empty.
	    """
	    text = text.lower()
	    # The ``|`` must stay unescaped so it acts as regex alternation; an
	    # escaped ``\|`` matches a literal pipe and leaves URLs in the text.
	    # ``http\S+`` already covers ``https`` links.
	    text = re.sub(r"http\S+|www\S+", "", text)
	    text = re.sub(r"[^a-z\s]", "", text)
	    return re.sub(r"\s+", " ", text).strip()


	def filter_data(data, min_words=10):
	    """Keep rows whose ``cleaned_text`` has more than ``min_words`` words.

	    Args:
	        data: DataFrame with a string ``cleaned_text`` column.
	        min_words: Exclusive lower bound on the whitespace-separated word
	            count. Defaults to 10.

	    Returns:
	        A new DataFrame holding only the rows that pass; the original
	        index labels are kept.
	    """
	    # ``.str.len()`` yields a missing value for missing text instead of
	    # raising the way ``len(NaN)`` does; such rows count as zero words.
	    word_counts = data["cleaned_text"].str.split().str.len().fillna(0)
	    return data[word_counts > min_words]


	def remove_empty(data):
	    """Drop rows whose ``cleaned_text`` is missing, blank, or too repetitive.

	    A row is kept only when its ``cleaned_text`` is non-null, is not empty
	    after stripping whitespace, and contains more than two distinct
	    whitespace-separated words.

	    Args:
	        data: DataFrame with a ``cleaned_text`` column.

	    Returns:
	        A new DataFrame with the surviving rows; index labels are kept.
	    """
	    text = data["cleaned_text"]
	    has_content = text.notnull() & (text.str.strip() != "")
	    df_filtered = data[has_content]
	    # Run the distinct-word check on the already-filtered frame so the
	    # null/blank filter is not discarded and ``split`` never sees NaN.
	    distinct_enough = (
	        df_filtered["cleaned_text"]
	        .apply(lambda words: len(set(words.split())) > 2)
	        .astype(bool)  # stays a boolean mask even when no rows remain
	    )
	    return df_filtered[distinct_enough]


	def remove_duplicates(data):
	    """Drop rows with a repeated ``cleaned_text`` and renumber the index.

	    The first occurrence of each ``cleaned_text`` value is kept. The input
	    frame is not modified.

	    Args:
	        data: DataFrame with a ``cleaned_text`` column.

	    Returns:
	        A new DataFrame without duplicate ``cleaned_text`` rows, indexed
	        ``0..n-1``.
	    """
	    # Reset the index of the de-duplicated frame, not of ``data``;
	    # otherwise the duplicates come straight back.
	    deduplicated = data.drop_duplicates(subset=["cleaned_text"])
	    return deduplicated.reset_index(drop=True)