Spaces:
Sleeping
Sleeping
File size: 953 Bytes
3ccf31a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | import pandas as pd
from scripts.get_dataset import csv_file
import re
# Module-level load: runs on import and reads the CSV referenced by
# ``csv_file`` (imported from scripts.get_dataset) into a DataFrame.
df = pd.read_csv(csv_file)
def tokenize(df):
    """Add a ``tokens`` column holding the whitespace-split review text.

    The ``reviews.text`` column is cast to ``str`` and split on runs of
    whitespace. The column is written onto the DataFrame that was passed in
    (the caller's object is modified), and that same object is returned.

    Args:
        df: DataFrame containing a ``reviews.text`` column.

    Returns:
        The same DataFrame object, now carrying a ``tokens`` column of lists.
    """
    review_text = df["reviews.text"].astype(str)
    token_lists = review_text.str.split()
    df["tokens"] = token_lists
    return df
def clean_text(text):
    """Normalize a piece of review text to lowercase ASCII words.

    Steps, in order: lowercase, strip URL-like tokens (anything starting
    with ``http``/``https``/``www`` up to the next whitespace), drop every
    character that is not ``a-z`` or whitespace, then collapse whitespace
    runs to a single space and trim the ends.

    Args:
        text: The raw text. Non-string values (``None``, float ``NaN`` from a
            pandas column with missing cells, etc.) are treated as having no
            text instead of raising ``AttributeError``.

    Returns:
        The cleaned string; ``""`` when there is nothing left or the input
        was not a string.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # No anchors in the pattern, so re.MULTILINE had no effect and is omitted.
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)
    return re.sub(r"\s+", " ", letters_only).strip()
def filter_data(data, min_words=10):
    """Keep only rows whose ``cleaned_text`` has more than ``min_words`` words.

    Words are counted by splitting ``cleaned_text`` on whitespace. Rows with
    a missing ``cleaned_text`` count as zero words and are dropped rather
    than raising ``TypeError`` (which ``len`` on a NaN cell used to do).

    Args:
        data: DataFrame containing a ``cleaned_text`` column.
        min_words: Exclusive lower bound on the word count. Defaults to 10,
            the previously hard-coded threshold.

    Returns:
        A new DataFrame with the qualifying rows; the original index labels
        are preserved.
    """
    # fillna/astype make every cell a string so the .str accessor is always
    # valid; .str.len() then measures the length of each token list.
    word_counts = (
        data["cleaned_text"].fillna("").astype(str).str.split().str.len()
    )
    return data[word_counts > min_words]
def remove_empty(data):
    """Drop rows whose ``cleaned_text`` is missing, blank, or too repetitive.

    A row is kept only when its ``cleaned_text`` is a string containing more
    than two distinct whitespace-separated words. Missing and blank values
    have zero distinct words, so they are rejected by the same rule.

    The previous version applied the distinct-word filter to the unfiltered
    input, which both discarded the null/blank filter and crashed on NaN
    cells (``float`` has no ``split``).

    Args:
        data: DataFrame containing a ``cleaned_text`` column.

    Returns:
        A new DataFrame with the qualifying rows; the original index labels
        are preserved.
    """

    def _is_informative(value):
        # Non-strings (None, NaN, ...) carry no text to evaluate.
        if not isinstance(value, str):
            return False
        return len(set(value.split())) > 2

    keep = [_is_informative(value) for value in data["cleaned_text"]]
    # .loc with a boolean list keeps all columns even when the list is empty.
    return data.loc[keep]
def remove_duplicates(data):
    """Drop rows with a repeated ``cleaned_text`` and renumber the index.

    The first occurrence of each ``cleaned_text`` value is kept. The
    previous version reset the index of the original input instead of the
    de-duplicated frame, so duplicates were never actually removed.

    Args:
        data: DataFrame containing a ``cleaned_text`` column.

    Returns:
        A new DataFrame without duplicate ``cleaned_text`` rows and with a
        fresh 0..n-1 index; the input is left unmodified.
    """
    deduplicated = data.drop_duplicates(subset=["cleaned_text"])
    return deduplicated.reset_index(drop=True)
|