File size: 4,997 Bytes
226e11e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import pandas as pd
import numpy as np
from transformers import pipeline
from tqdm import tqdm
import os

books = pd.read_csv("books_cleaned.csv")

# Show every raw category that appears on more than 50 books.
category_counts = books["categories"].value_counts().reset_index()
print(category_counts.query("count > 50"))

# Inspect one raw category up close.
juvenile_fiction = books[books["categories"] == "Juvenile Fiction"]
print(juvenile_fiction)

# Collapse the noisy raw categories into four broad buckets; any raw
# category not listed here maps to NaN and is filled in later by the
# zero-shot classifier.
category_mapping = {
    "Fiction": "Fiction",
    "Comics & Graphic Novels": "Fiction",
    "Drama": "Fiction",
    "Poetry": "Fiction",
    "Biography & Autobiography": "Nonfiction",
    "History": "Nonfiction",
    "Literary Criticism": "Nonfiction",
    "Philosophy": "Nonfiction",
    "Religion": "Nonfiction",
    "Science": "Nonfiction",
    "Juvenile Fiction": "Children's Fiction",
    "Juvenile Nonfiction": "Children's Nonfiction",
}

# Add the simplified-category column to the dataset.
books["simple_categories"] = books["categories"].map(category_mapping)
print(books.head())

# Applying ZERO-SHOT CLASSIFICATION with a Hugging Face model.
fiction_categories = ["Fiction", "Nonfiction"]
# NOTE(review): device="mps" is Apple-Silicon only — use device=-1 (CPU)
# or "cuda" on other machines.
pipe = pipeline("zero-shot-classification",
				model="facebook/bart-large-mnli",
				device="mps"
)

# First known fiction description in the dataframe; fetch the series once
# and reuse it (the original filtered the dataframe twice for the same row).
sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[0]
print(sequence)

# Run the classifier ONCE and reuse the output: the original invoked the
# same inference three times (once to print, twice for post-processing).
# "scores" holds the per-label probabilities, aligned with "labels".
result = pipe(sequence, fiction_categories)
print(result)
print("\n")

# Post-process: the predicted label is the one with the highest score.
max_index = np.argmax(result["scores"])
max_label = result["labels"][max_index]
print("Predicted Label using ZERO-SHOT: " + max_label + "\n")

# Helper used for batch-labelling below.
def generate_predictions(sequence, categories):
	"""Run the zero-shot pipeline on *sequence* and return the top-scoring label."""
	output = pipe(sequence, categories)
	best = np.argmax(output["scores"])
	return output["labels"][best]

# Benchmark the classifier: predict 300 known Fiction and 300 known
# Nonfiction descriptions and compare against the already-known labels.
# The cached-CSV check avoids re-running inference on every execution.
if os.path.exists("predictions_results.csv"):
    predictions_df = pd.read_csv("predictions_results.csv")
else:
    actual_cats = []
    predicted_cats = []

    # Hoist the filtered description series out of the loop: the original
    # re-filtered the entire dataframe on EVERY iteration (600 full scans).
    for label in ("Fiction", "Nonfiction"):
        descriptions = books.loc[
            books["simple_categories"] == label, "description"
        ].reset_index(drop=True)
        # Guard against a class with fewer than 300 rows (the original
        # would raise KeyError on the missing index).
        sample_size = min(300, len(descriptions))
        for i in tqdm(range(sample_size)):
            predicted_cats.append(generate_predictions(descriptions[i], fiction_categories))
            actual_cats.append(label)

    predictions_df = pd.DataFrame({
        "actual_categories": actual_cats,
        "predicted_categories": predicted_cats,
    })

    predictions_df.to_csv("predictions_results.csv", index=False)

# 1 where the predicted label matches the known label, 0 otherwise.
predictions_df["correct_prediction"] = (
	np.where(predictions_df["actual_categories"] == predictions_df["predicted_categories"], 1, 0)
)
print(predictions_df)

# Overall labelling accuracy of the zero-shot classifier.
accuracy = predictions_df["correct_prediction"].sum() / len(predictions_df)
print("Labelling accuracy: ", accuracy * 100, "%\n")

# Use the classifier to fill in rows whose simple category is still
# unknown; isbn13 is kept alongside each prediction so the results can
# be merged back onto the main dataframe.
missing_cats = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)

isbns = []
predicted_cats = []
for row in tqdm(missing_cats.itertuples(index=False), total=len(missing_cats)):
	predicted_cats.append(generate_predictions(row.description, fiction_categories))
	isbns.append(row.isbn13)

missing_predicted_df = pd.DataFrame({"isbn13": isbns, "predicted_categories": predicted_cats})
print("The missing predicted categories: \n")
print(missing_predicted_df)

# Left-merge keeps every original row; NaN simple categories are then
# backfilled with the model's predictions and the helper column dropped.
books = pd.merge(books, missing_predicted_df, on="isbn13", how="left")
books["simple_categories"] = np.where(
	books["simple_categories"].isna(),
	books["predicted_categories"],
	books["simple_categories"],
)
books = books.drop(columns=["predicted_categories"])

# Peek at books whose raw category is one of these specific fiction genres.
# BUG FIX: the original list had no comma between "crime" and "historical",
# so Python's implicit string concatenation fused them into the single
# keyword "crimehistorical" and neither genre could ever match.
fiction_keywords = [
	"romance",
	"science fiction",
	"scifi",
	"fantasy",
	"horror",
	"mystery",
	"thriller",
	"comedy",
	"crime",
	"historical",
]
print(books[books["categories"].str.lower().isin(fiction_keywords)])

# Save the enriched dataframe to a new file.
books.to_csv("books_with_categories.csv", index=False)