rishikasrinivas committed on
Commit
2566a7e
·
verified ·
1 Parent(s): 5661ef4

Create process_data.py

Browse files
Files changed (1) hide show
  1. process_data.py +93 -0
process_data.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.preprocessing import MultiLabelBinarizer
5
+ from load_data import loadFileIntoDict
6
+ import nltk
7
+ nltk.download('stopwords')
8
+ from nltk.corpus import stopwords
9
+ from string import punctuation
10
+
11
# Module-level configuration, evaluated once at import time.
# D holds whatever loadFileIntoDict returns for the ID2Genre.txt file
# (presumably an id -> genre mapping; confirm against load_data).
# NOTE(review): both paths below are absolute and machine-specific.
+ D= loadFileIntoDict("/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/ID2Genre.txt")
12
# PATH points at a .pth file; it is not referenced by any function visible in this section.
+ PATH = '/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/model_weights_multi.pth'
13
+
14
# Rebind the imported string.punctuation as a list of single characters;
# cleanDesc tests whole tokens for membership in it.
+ punctuation = list(punctuation)
15
def cleanDesc(df):
    """Lower-case each description and drop stopword / punctuation tokens.

    Each entry of ``df['Description']`` is split on whitespace; tokens that
    are English stopwords (compared case-insensitively) or that consist of a
    single punctuation character are removed, the remaining tokens are
    lower-cased and re-joined with single spaces.

    Args:
        df: DataFrame with a string column named ``'Description'``.

    Returns:
        The same DataFrame object, with ``'Description'`` replaced.
    """
    # Sets give O(1) membership tests; the NLTK list is scanned linearly otherwise.
    stop_words = set(stopwords.words('english'))
    punct_tokens = set(punctuation)

    def _clean(text):
        kept = []
        for word in text.split():
            lowered = word.lower()
            # The NLTK stopword list is lower-case, so compare the lowered
            # token; otherwise capitalised stopwords ("The") slip through.
            if lowered in stop_words or word in punct_tokens:
                continue
            kept.append(lowered)
        return " ".join(kept)

    # Assign the whole column at once instead of df['Description'][i] = ...,
    # which is chained assignment and does not reliably write back to df.
    df['Description'] = [_clean(text) for text in df['Description']]
    return df
22
+
23
def cleanGenres(df):
    """Convert the stringified genre lists in ``df['Genres']`` to real lists.

    Each cell is expected to be the text form of a Python list, e.g.
    ``"['Fiction', 'Classics']"``.

    Args:
        df: DataFrame with a ``'Genres'`` column of such strings.

    Returns:
        The same DataFrame object, with ``'Genres'`` holding lists of str.
    """
    import ast  # local import so the block does not depend on file-level imports

    def _parse(raw):
        # Prefer the real literal parser: it copes with commas or quotes
        # inside a genre name, which naive splitting does not.
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(genre) for genre in parsed]
        # Fallback for cells that are not valid literals: strip the outer
        # brackets, split on commas and drop one quote char from each side.
        return [entry.strip()[1:-1] for entry in raw[1:-1].strip().split(",")]

    # Reuse df's own index so rows stay aligned even when the frame has not
    # been reset to a 0..n-1 index (a bare pd.Series would misalign -> NaN).
    df['Genres'] = pd.Series(
        [_parse(raw) for raw in df['Genres']],
        index=df.index,
        dtype=object,
    )
    return df
31
+
32
+
33
def getMostFreqGenres(df, min_count=600):
    """Return the genres that are frequent enough and not black-listed.

    Counts every genre label across ``df['Genres']`` and keeps those whose
    count is strictly greater than ``min_count`` and whose lower-cased name
    is not in the hard-coded drop list.

    Args:
        df: DataFrame whose ``'Genres'`` column holds iterables of labels.
        min_count: A genre must occur more than this many times to be kept.

    Returns:
        A dict keys view of the retained genre labels, in first-seen order.
    """
    from collections import Counter  # local import keeps the block self-contained

    # All entries must be lower-case because they are compared against
    # key.lower(); the former 'young Adult' entry could never match.
    keys_to_drop = {
        'personal development', 'biography', 'dystopia',
        'science fiction fantasy', 'fiction', 'memoir', 'spirituality',
        'classics', 'biography memoir', 'new adult', 'thriller', 'suspense',
        'literary fiction', 'christian', 'british literature', 'paranormal',
        'short stories', 'literature', 'young adult', 'audiobook', 'novels',
        'history', 'mystery thriller', 'adult', 'chick lit',
        'contemporary romance', 'contemporary', 'adult fiction',
        'urban fantasy', 'middle grade', 'historical', 'american',
    }

    freq = Counter(label for genres in df['Genres'] for label in genres)

    newfreq = {
        label: count
        for label, count in freq.items()
        if count > min_count and label.lower() not in keys_to_drop
    }
    return newfreq.keys()
48
+
49
def store_most_frequent_genres(df, most_frequent):
    """Keep only the labels in ``df['Genres']`` that appear in ``most_frequent``.

    Args:
        df: DataFrame whose ``'Genres'`` column holds lists of labels.
        most_frequent: Iterable of labels to retain (list, set, dict keys
            view, generator, ...).

    Returns:
        The same DataFrame object with each ``'Genres'`` list filtered;
        the order of the surviving labels within a row is preserved.
    """
    # Materialise once: O(1) lookups for any container type, and a one-shot
    # iterator is not exhausted by the first membership test.
    allowed = set(most_frequent)
    df['Genres'] = df['Genres'].apply(
        lambda genres: [label for label in genres if label in allowed]
    )
    return df
54
+
55
def one_hot(df, col_name):
    """Add a ``'genre_id'`` column with the multi-hot encoding of ``col_name``.

    A ``MultiLabelBinarizer`` is fitted on the label lists in
    ``df[col_name]``; each row's encoding is stored as a list of floats.

    Args:
        df: DataFrame containing the label column.
        col_name: Name of the column holding an iterable of labels per row.

    Returns:
        The same DataFrame object with the new ``'genre_id'`` column.
    """
    labels = df[col_name].tolist()
    mlb = MultiLabelBinarizer()
    # fit_transform already returns the encodings; no second transform pass.
    encodings = mlb.fit_transform(labels)

    # Build the column in one step. The previous version first allocated an
    # n x n list of zeros and then filled rows via df["genre_id"][i] = ...,
    # which is quadratic in memory and relies on chained assignment.
    df["genre_id"] = pd.Series(
        [[float(flag) for flag in row] for row in encodings],
        index=df.index,  # positional alignment regardless of df's index labels
        dtype=object,
    )
    return df
71
+
72
+
73
def getDF(data_path="/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/Backend/Data/goodreads_data.csv"):
    """Load the Goodreads CSV and return the fully pre-processed DataFrame.

    Steps: read the CSV, drop the unused columns, drop rows with missing
    values or an empty genre list, then clean genres and descriptions, keep
    only the most frequent genres and add the multi-hot ``'genre_id'`` column.

    Args:
        data_path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing ``getDF()`` calls still work.

    Returns:
        The processed DataFrame.
    """
    df = pd.read_csv(data_path)

    df = df.drop(columns=['Unnamed: 0', 'Book', 'Author', 'Avg_Rating', 'Num_Ratings', 'URL'])
    df = df.dropna()
    # Rows whose genre cell is the literal text '[]' carry no labels.
    df = df[df['Genres'] != '[]']
    # Rebind rather than mutate in place: the filtered frame is a derived
    # object, and a fresh 0..n-1 index keeps later steps aligned.
    df = df.reset_index(drop=True)

    df = cleanGenres(df)
    df = cleanDesc(df)
    most_freq_genres = getMostFreqGenres(df)
    df = store_most_frequent_genres(df, most_freq_genres)
    df = one_hot(df, 'Genres')
    return df
92
+
93
+