File size: 4,997 Bytes
226e11e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import pandas as pd
import numpy as np
from transformers import pipeline
from tqdm import tqdm
import os
# Load the cleaned dataset produced by the previous preprocessing step.
books = pd.read_csv('books_cleaned.csv')

# Show every raw category that appears on more than 50 books.
print(books["categories"].value_counts().reset_index().query("count > 50"))

# Inspect a single category in detail.
print(books[books["categories"] == "Juvenile Fiction"])

# Collapse the raw categories into a handful of broad labels.  Anything not
# listed below maps to NaN and is filled in by the classifier further down.
broad_label_groups = {
    "Fiction": ["Fiction", "Comics & Graphic Novels", "Drama", "Poetry"],
    "Children's Fiction": ["Juvenile Fiction"],
    "Children's Nonfiction": ["Juvenile Nonfiction"],
    "Nonfiction": [
        "Biography & Autobiography",
        "History",
        "Literary Criticism",
        "Philosophy",
        "Religion",
        "Science",
    ],
}
# Invert the grouping into the raw-category -> broad-label lookup that
# Series.map expects.
category_mapping = {
    raw: broad
    for broad, raws in broad_label_groups.items()
    for raw in raws
}

# Attach the simplified label column to the dataframe.
books["simple_categories"] = books["categories"].map(category_mapping)
print(books.head())
# Zero-shot classification: an NLI model scores each candidate label against
# the free-text description, with no task-specific fine-tuning.
fiction_categories = ["Fiction", "Nonfiction"]
pipe = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device="mps",  # Apple-Silicon GPU; use "cuda" or -1 (CPU) on other machines
)

# Look at the first known-Fiction description in the dataframe.
sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[0]
print(sequence)

# Run the classifier ONCE and reuse the result: the original code invoked the
# pipeline three times on the same input, tripling the inference cost.
# The output holds parallel "labels"/"scores" lists sorted by probability.
result = pipe(sequence, fiction_categories)
print(result)
print("\n")

# Post-processing: the predicted label is the one with the highest score.
max_index = np.argmax(result["scores"])
max_label = result["labels"][max_index]
print("Predicted Label using ZERO-SHOT: " + max_label + "\n")
# define a function for generating predictions
def generate_predictions(sequence, categories):
    """Classify *sequence* against *categories* and return the top-scoring label.

    Relies on the module-level zero-shot ``pipe``; the pipeline returns
    parallel ``labels``/``scores`` lists, so the label at the argmax of the
    scores is the prediction.
    """
    result = pipe(sequence, categories)
    best = int(np.argmax(result["scores"]))
    return result["labels"][best]
# Evaluate the zero-shot labeller on a sample of books whose broad category
# is already known, so predictions can be compared with ground truth.
# The cached-CSV guard avoids re-running the (slow) model on every execution.
if os.path.exists("predictions_results.csv"):
    predictions_df = pd.read_csv("predictions_results.csv")
else:
    actual_cats = []
    predicted_cats = []
    # One parameterised loop per known label replaces two copy-pasted loops,
    # and the filtered description Series is built once per label instead of
    # being recomputed (filter + reset_index) on every single iteration.
    for known_label in ("Fiction", "Nonfiction"):
        descriptions = books.loc[
            books["simple_categories"] == known_label, "description"
        ].reset_index(drop=True)
        for i in tqdm(range(300)):
            predicted_cats.append(generate_predictions(descriptions[i], fiction_categories))
            actual_cats.append(known_label)
    predictions_df = pd.DataFrame({
        "actual_categories": actual_cats,
        "predicted_categories": predicted_cats,
    })
    predictions_df.to_csv("predictions_results.csv", index=False)

# 1 where the prediction matched the known label, 0 otherwise.
predictions_df["correct_prediction"] = (
    predictions_df["actual_categories"] == predictions_df["predicted_categories"]
).astype(int)
print(predictions_df)

# Accuracy = fraction of correct predictions.
accuracy = predictions_df["correct_prediction"].mean()
print("Labelling accuracy: ", accuracy * 100, "%\n")
# use this model to identify the missing categories (kinda create a subset of the dataset and take ones only with missing categories)
isbns = []
predicted_cats = []
# consider the isbn13 so that we can merge back to the original dataframe
missing_cats = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)
for i in tqdm(range(0, len(missing_cats))):
sequence = missing_cats["description"][i]
predicted_cats += [generate_predictions(sequence, fiction_categories)]
isbns += [missing_cats["isbn13"][i]]
missing_predicted_df = pd.DataFrame({"isbn13": isbns, "predicted_categories": predicted_cats})
print("The missing predicted categories: \n")
print(missing_predicted_df)
# merge it back to the main dataframe
books = pd.merge(books, missing_predicted_df, on="isbn13", how="left")
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted_categories"], books["simple_categories"])
books = books.drop(columns = ["predicted_categories"])
# Show books whose raw category is exactly one of these fiction genres.
# BUG FIX: the original list had no comma between "crime" and "historical",
# so Python's implicit string concatenation fused them into the single,
# never-matching literal "crimehistorical" — neither genre was ever matched.
print(books[books["categories"].str.lower().isin([
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical",
])])

# Persist the enriched dataframe for the next stage of the pipeline.
books.to_csv("books_with_categories.csv", index=False)
|