Create process_data.py
Browse files- process_data.py +93 -0
process_data.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.preprocessing import MultiLabelBinarizer
|
| 5 |
+
from load_data import loadFileIntoDict
|
| 6 |
+
import nltk
|
| 7 |
+
nltk.download('stopwords')
|
| 8 |
+
from nltk.corpus import stopwords
|
| 9 |
+
from string import punctuation
|
| 10 |
+
|
| 11 |
+
# Loaded once at import time from ID2Genre.txt via loadFileIntoDict.
# Not referenced by any function visible in this file.
# NOTE(review): presumably an id -> genre lookup used by importers of this module — confirm.
D= loadFileIntoDict("/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/ID2Genre.txt")
|
| 12 |
+
# Absolute path to the model_weights_multi.pth file; not read by any function visible in this file.
PATH = '/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/model_weights_multi.pth'
|
| 13 |
+
|
| 14 |
+
# Rebind the imported string.punctuation (a str) to a list of its individual
# characters, so that `word not in punctuation` in cleanDesc is an exact
# single-character match rather than a substring test.
punctuation = list(punctuation)
|
| 15 |
+
def cleanDesc(df):
    """Drop stopword and punctuation tokens from each description, then lowercase.

    Every entry of ``df['Description']`` is split on whitespace. Tokens that
    appear in the NLTK English stopword list, or that are a single
    punctuation character, are discarded; the remaining tokens are lowercased
    and re-joined with single spaces.

    The column is rewritten with one whole-column assignment. A per-row
    ``df['Description'][i] = ...`` is chained assignment: it triggers
    SettingWithCopyWarning, does nothing when pandas copy-on-write is
    enabled, and only targets the right rows when the index is exactly
    0..n-1.

    Args:
        df: DataFrame with a ``Description`` column of strings (no missing
            values, since ``str.split`` is called on every entry).

    Returns:
        The same DataFrame object, with ``Description`` replaced.
    """
    # Sets give O(1) membership tests instead of scanning a list per token.
    stop_words = set(stopwords.words('english'))
    punct_tokens = set(punctuation)

    # NOTE(review): filtering is done on the token's original casing and
    # lowercasing happens afterwards, so a stopword is only removed when its
    # casing matches the list entry (e.g. a capitalised "The" is kept and
    # emitted as "the"). Preserved deliberately: changing it alters the
    # cleaned text that downstream consumers see.
    df['Description'] = [
        " ".join(
            word.lower()
            for word in desc.split()
            if word not in stop_words and word not in punct_tokens
        )
        for desc in df['Description']
    ]
    return df
|
| 22 |
+
|
| 23 |
+
def cleanGenres(df):
    """Convert the stringified genre lists in ``df['Genres']`` into real lists.

    Each entry is expected to look like ``"['Fantasy', 'Young Adult']"``: the
    outer brackets are removed, the remainder is split on commas, and each
    piece is stripped of surrounding whitespace and of its first and last
    character (the quote marks).

    The replacement column is built on ``df.index``. A bare ``pd.Series(...)``
    would carry a fresh 0..n-1 index, and because column assignment aligns by
    label, any frame whose index is not exactly 0..n-1 would end up with NaN
    in place of its genres.

    Args:
        df: DataFrame with a ``Genres`` column of strings in list-repr form.

    Returns:
        The same DataFrame object, with ``Genres`` holding lists of strings.
    """
    parsed_genres = []
    for raw in df['Genres']:
        # Drop the enclosing "[" and "]" before splitting into quoted names.
        quoted_names = raw[1:-1].strip().split(",")
        # NOTE: a genre name that itself contains a comma would be split here.
        parsed_genres.append([name.strip()[1:-1] for name in quoted_names])

    df['Genres'] = pd.Series(parsed_genres, index=df.index)
    return df
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def getMostFreqGenres(df, min_count=600):
    """Return the genres that occur often enough and are not on the drop list.

    Counts every label across the lists in ``df['Genres']`` and keeps those
    whose count is strictly greater than ``min_count`` and whose lowercased
    name is not in the built-in drop list.

    Args:
        df: Object whose ``'Genres'`` entry iterates over lists of genre
            names (e.g. a DataFrame column of lists).
        min_count: A genre must occur more than this many times to be kept.
            Defaults to 600.

    Returns:
        A dict keys view of the retained genre names, in first-seen order.
    """
    # NOTE(review): 'young Adult' contains an uppercase letter but is compared
    # against key.lower(), so it can never match and "Young Adult" is never
    # dropped. Left untouched because fixing it changes the resulting label
    # set — confirm the intent before correcting the casing.
    keys_to_drop = frozenset([
        'personal development', 'biography', 'dystopia',
        'science fiction fantasy', 'fiction', 'memoir', 'spirituality',
        'classics', 'biography memoir', 'new adult', 'thriller', 'suspense',
        'literary fiction', 'christian', 'british literature', 'paranormal',
        'short stories', 'literature', 'young Adult', 'audiobook', 'novels',
        'history', 'mystery thriller', 'adult', 'chick lit',
        'contemporary romance', 'contemporary', 'adult fiction',
        'urban fantasy', 'middle grade', 'historical', 'american',
    ])

    # Tally how often each genre label appears across all rows.
    counts = {}
    for labels in df['Genres']:
        for label in labels:
            counts[label] = counts.get(label, 0) + 1

    frequent = {
        label: count
        for label, count in counts.items()
        if count > min_count and label.lower() not in keys_to_drop
    }
    return frequent.keys()
|
| 48 |
+
|
| 49 |
+
def store_most_frequent_genres(df, most_frequent):
    """Restrict every row's genre list to the labels in ``most_frequent``.

    Args:
        df: DataFrame whose ``Genres`` column holds lists of genre names.
        most_frequent: Container of genre names to keep; only membership
            (``in``) is used.

    Returns:
        The same DataFrame object, with ``Genres`` replaced by the filtered
        lists (a row may end up with an empty list).
    """
    def keep_frequent(labels):
        # Preserve the original order of the labels that survive the filter.
        return [label for label in labels if label in most_frequent]

    filtered = df['Genres'].apply(keep_frequent)
    df['Genres'] = filtered
    return df
|
| 54 |
+
|
| 55 |
+
def one_hot(df, col_name):
    """Add a ``genre_id`` column holding a multi-hot float vector per row.

    The lists in ``df[col_name]`` are fitted and encoded with a
    ``MultiLabelBinarizer``; each row's 0/1 indicator vector is stored in
    ``genre_id`` as a list of Python floats.

    The binarizer is fitted and applied in a single ``fit_transform`` call and
    the column is written with one assignment. This avoids encoding the data
    twice, allocating an n-by-n placeholder of zeros, and the per-row chained
    assignment ``df["genre_id"][i] = ...``, which does not write through when
    pandas copy-on-write is enabled.

    Args:
        df: DataFrame containing ``col_name``.
        col_name: Name of a column whose entries are lists of labels.

    Returns:
        The same DataFrame object, with the ``genre_id`` column added (or
        overwritten).
    """
    label_lists = df[col_name].tolist()
    encodings = MultiLabelBinarizer().fit_transform(label_lists)

    # Store plain Python floats rather than NumPy integers in each row's list.
    df["genre_id"] = [[float(flag) for flag in row] for row in encodings]
    return df
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def getDF(data_path=None):
    """Load the Goodreads CSV and return the fully preprocessed DataFrame.

    Steps, in order: read the CSV; drop the columns that are not used;
    drop rows with missing values; drop rows whose ``Genres`` is the literal
    string ``'[]'``; renumber the index from 0; then run ``cleanGenres``,
    ``cleanDesc``, ``getMostFreqGenres`` / ``store_most_frequent_genres`` and
    ``one_hot`` on the ``Genres`` column.

    Args:
        data_path: Path of the CSV to read. When omitted, the original
            hard-coded location is used, so existing ``getDF()`` calls behave
            as before.

    Returns:
        The processed DataFrame, including the ``genre_id`` column added by
        ``one_hot``.
    """
    if data_path is None:
        data_path = "/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/Backend/Data/goodreads_data.csv"

    df = pd.read_csv(data_path)
    df = df.drop(columns=['Unnamed: 0', 'Book', 'Author', 'Avg_Rating', 'Num_Ratings', 'URL'])

    # Non-inplace chain: the result is a fresh frame with a 0..n-1 index, so
    # the column assignments in the helpers below do not operate on a
    # filtered slice of another frame.
    df = df.dropna()
    df = df[df['Genres'] != '[]'].reset_index(drop=True)

    df = cleanGenres(df)
    df = cleanDesc(df)
    most_freq_genres = getMostFreqGenres(df)
    df = store_most_frequent_genres(df, most_freq_genres)
    df = one_hot(df, 'Genres')
    return df
|
| 92 |
+
|
| 93 |
+
|