nirmanpatel committed on
Commit
226e11e
·
verified ·
1 Parent(s): f632dba

Upload 4 files

Browse files
data_exploration.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------ data cleaning --------------------------- #
# Exploratory cleaning of the raw books dataset: visualize missingness,
# drop incomplete rows and ultra-short descriptions, then save a cleaned CSV.

books = pd.read_csv('books.csv')
print("Initial look at the dataset:\n", books.head())

# Plot showing the missing values from the dataset.
ax = plt.axes()
sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)
plt.xlabel("Columns")
plt.ylabel("Missing values")
plt.show()

# Flag whether the description is present: 1 where it's missing, else 0.
books["missing_description"] = np.where(books["description"].isna(), 1, 0)
# NOTE(review): reference year is hard-coded; update it (or derive it from
# the current date) when the dataset is refreshed.
books["age_of_book"] = 2025 - books["published_year"]

columns_of_interest = ["num_pages", "age_of_book", "missing_description", "average_rating"]

# Spearman is rank-based, so it tolerates skewed, non-normal distributions.
correlation_matrix = books[columns_of_interest].corr(method='spearman')

sns.set_theme(style="white")
plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm",
                      cbar_kws={"label": "Spearman correlation"})
heatmap.set_title("Correlation heatmap")
plt.show()

# Show all books where any of the key columns is missing.
required_columns = ["description", "num_pages", "average_rating", "published_year"]
print(books[books[required_columns].isna().any(axis=1)])

# Remove all such books.  dropna(subset=...) replaces the previous four-way
# negated-isna mask, and the immediate .copy() yields an independent
# DataFrame so later column assignments cannot trigger
# SettingWithCopyWarning (no deferred .copy() workaround needed).
books_updated = books.dropna(subset=required_columns).copy()
print("Removing the above observations..\nUpdated dataset:\n", books_updated)

# Just for visualization - first 100 entries.
visualize_top = books_updated[:100]

# Distribution of the categories.  value_counts() already sorts by count in
# descending order, so the previous explicit sort_values was redundant.
print("Categorical Distribution:\n", books_updated["categories"].value_counts().reset_index())
sns.histplot(data=visualize_top, x='categories', kde=True)
plt.xticks(rotation=90)
plt.show()

# Taking a closer look at the first few descriptions.
print(books["description"].head())

# IMP Note: books with one-word descriptions won't enable a smooth
# recommendation process, so very short descriptions are filtered out below.

# New variable: number of words in each description.
books_updated["words_in_description"] = books_updated["description"].str.split().str.len()
print(books_updated["words_in_description"].head())
sns.histplot(data=books_updated, x='words_in_description', kde=True)
plt.xticks(rotation=45)
plt.show()

# (Exploratory prints of 1-4 / 5-14 / 15-24 / 25-34 word descriptions were
# removed; they motivated the 25-word cutoff used below.)

# Use 25 words in the description as a cutoff; .copy() for the same reason
# as above.
books_updated_25_words = books_updated[books_updated["words_in_description"] >= 25].copy()

# Merge title and subtitle as "title: subtitle"; fall back to the title
# alone when the subtitle is missing.
books_updated_25_words["title_and_subtitle"] = (
    np.where(books_updated_25_words["subtitle"].isna(), books_updated_25_words["title"],
             books_updated_25_words[["title", "subtitle"]].astype(str).agg(": ".join, axis=1))
)

# Tag each description with its isbn13.  Using the identifier as a prefix is
# cleaner and faster than direct string matching when mapping vector-search
# results back to books; the isbn can be stripped off later.
books_updated_25_words["tagged_description"] = (
    books_updated_25_words[["isbn13", "description"]].astype(str).agg(" ".join, axis=1)
)

# Drop the helper columns and save the cleaned dataframe as a new CSV.
(
    books_updated_25_words
    .drop(["subtitle", "missing_description", "age_of_book", "words_in_description"], axis=1)
    .to_csv("books_cleaned.csv", index=False)
)
sentiment_analysis.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from transformers import pipeline
4
+ from tqdm import tqdm
5
+
6
# Fine-grained emotion scoring of book descriptions using a pretrained
# DistilRoBERTa emotion classifier.
books = pd.read_csv("books_with_categories.csv")

# Emotion classifier; top_k=None returns a score for every emotion label
# rather than only the best one.
# NOTE(review): device='mps' is Apple-Silicon specific — confirm before
# running on other hardware.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device='mps',
)

# Earlier experiments, kept for reference:
#   result = classifier("I love this!")        # list with one list of label dicts
#   for item in result[0]:
#       print(item)                            # or f"{item['label']:<10}: {item['score']}"
#
#   # Split a description into sentences and score each one individually:
#   sentences = books["description"][0].split(".")
#   predictions = classifier(sentences)
#   i = len(predictions)
#   while i > 0:
#       for sentence in predictions[i-1]:
#           print(sentence)
#       print("--------------------")
#       i -= 1
32
# Emotion labels emitted by the classifier; accumulators for one row per book.
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    """Return {label: max score across sentences} for one book.

    ``predictions`` is the classifier output for one description: a list
    (one entry per sentence) of lists of ``{"label": ..., "score": ...}``
    dicts covering every label in ``emotion_labels``.

    Bug fix: the original sorted each sentence's dicts with
    ``key=lambda x: ["label"]`` — a constant list, so no sort happened —
    and then aligned scores positionally against ``emotion_labels``,
    mis-assigning scores to labels.  Mapping each predicted label to its
    score directly avoids both problems.
    """
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        score_by_label = {item["label"]: item["score"] for item in prediction}
        for label in emotion_labels:
            per_emotion_scores[label].append(score_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}
45
# Score every book in the dataset: split the description into sentences,
# classify each sentence, keep the per-emotion maximum across sentences.
for book_isbn, description in tqdm(zip(books["isbn13"], books["description"]),
                                   total=len(books)):
    isbn.append(book_isbn)
    sentence_predictions = classifier(description.split("."))
    per_book_max = calculate_max_emotion_scores(sentence_predictions)
    for label in emotion_labels:
        emotion_scores[label].append(per_book_max[label])

# Assemble the results and attach the identifier column.
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn
print(emotions_df)

# Merge the emotion scores back onto the books and persist.
books = pd.merge(books, emotions_df, on="isbn13")
books.to_csv("books_with_emotions.csv", index=False)
text_classification.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from transformers import pipeline
4
+ from tqdm import tqdm
5
+ import os
6
+
7
# Collapse the raw Google Books categories into a small set of simple ones.
books = pd.read_csv('books_cleaned.csv')

# Categories represented by more than 50 books.
print(books["categories"].value_counts().reset_index().query("count > 50"))

# A look at one specific category.
print(books[books["categories"] == "Juvenile Fiction"])

# Mapping for the frequent categories; everything unmapped stays NaN.
category_mapping = {
    'Fiction': "Fiction",
    'Juvenile Fiction': "Children's Fiction",
    'Biography & Autobiography': "Nonfiction",
    'History': "Nonfiction",
    'Literary Criticism': "Nonfiction",
    'Philosophy': "Nonfiction",
    'Religion': "Nonfiction",
    'Comics & Graphic Novels': "Fiction",
    'Drama': "Fiction",
    'Juvenile Nonfiction': "Children's Nonfiction",
    'Science': "Nonfiction",
    'Poetry': "Fiction",
}

# New column holding the simplified category.
books["simple_categories"] = books["categories"].map(category_mapping)
print(books.head())
33
# Applying ZERO-SHOT CLASSIFICATION with a pretrained NLI model from HF.
fiction_categories = ["Fiction", "Nonfiction"]
# NOTE(review): device="mps" is Apple-Silicon specific — confirm before
# running on other hardware.
pipe = pipeline("zero-shot-classification",
                model="facebook/bart-large-mnli",
                device="mps"
                )

# First known fiction entry in the dataframe, used as a sample sequence.
sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[0]
print(sequence)

# Run the classifier ONCE and reuse the result.  The original code called
# pipe(sequence, fiction_categories) three separate times (print, argmax,
# label lookup), tripling an expensive model invocation for the same input.
# The returned "scores" are the per-label probabilities.
prediction = pipe(sequence, fiction_categories)
print(prediction)
print("\n")

# Post-processing: pick the label with the highest score.
max_index = np.argmax(prediction["scores"])
max_label = prediction["labels"][max_index]
print("Predicted Label using ZERO-SHOT: " + max_label + "\n")
54
# Helper used for bulk labelling below.
def generate_predictions(sequence, categories):
    """Zero-shot classify *sequence* against *categories*; return the
    highest-scoring label."""
    prediction = pipe(sequence, categories)
    best = np.argmax(prediction["scores"])
    return prediction["labels"][best]
61
# Take a sizeable sample of fiction and nonfiction, predict the label with
# the zero-shot classifier, and compare against the already-known label.
# The if/else avoids re-generating predictions on every run.
if os.path.exists("predictions_results.csv"):
    predictions_df = pd.read_csv("predictions_results.csv")
else:
    # Run the loops, then save.
    actual_cats = []
    predicted_cats = []

    # Hoist the filtered description series out of the loops: the original
    # re-ran the full .loc filter + reset_index on EVERY iteration (O(n)
    # work per step for an unchanged result).
    fiction_descriptions = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)
    nonfiction_descriptions = books.loc[books["simple_categories"] == "Nonfiction", "description"].reset_index(drop=True)

    # NOTE(review): assumes at least 300 books of each class — confirm.
    for i in tqdm(range(0, 300)):
        predicted_cats.append(generate_predictions(fiction_descriptions[i], fiction_categories))
        actual_cats.append("Fiction")

    for i in tqdm(range(0, 300)):
        predicted_cats.append(generate_predictions(nonfiction_descriptions[i], fiction_categories))
        actual_cats.append("Nonfiction")

    predictions_df = pd.DataFrame({
        "actual_categories": actual_cats,
        "predicted_categories": predicted_cats
    })

    predictions_df.to_csv("predictions_results.csv", index=False)

# 1 for a correct prediction, 0 for an incorrect one.
predictions_df["correct_prediction"] = (
    np.where(predictions_df["actual_categories"] == predictions_df["predicted_categories"], 1, 0)
)
print(predictions_df)

# Overall accuracy of the zero-shot labelling.
accuracy = predictions_df["correct_prediction"].sum() / len(predictions_df)
print("Labelling accuracy: ", accuracy * 100, "%\n")
98
# Use the classifier to fill in the books whose category is still missing
# (a subset of the dataset restricted to rows without a simple category).
isbns = []
predicted_cats = []

# Keep isbn13 alongside the description so results can be merged back into
# the original dataframe.
missing_cats = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)

for book_isbn, description in tqdm(zip(missing_cats["isbn13"], missing_cats["description"]),
                                   total=len(missing_cats)):
    predicted_cats.append(generate_predictions(description, fiction_categories))
    isbns.append(book_isbn)

missing_predicted_df = pd.DataFrame({"isbn13": isbns, "predicted_categories": predicted_cats})
print("The missing predicted categories: \n")
print(missing_predicted_df)

# Merge back into the main dataframe, fill the gaps, drop the helper column.
books = pd.merge(books, missing_predicted_df, on="isbn13", how="left")
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted_categories"], books["simple_categories"])
books = books.drop(columns=["predicted_categories"])
119
# Specific fiction categories matched exclusively by these keywords.
# Bug fix: the original list was missing a comma between "crime" and
# "historical", so Python silently concatenated the adjacent string
# literals into the single keyword "crimehistorical" — neither genre could
# ever match.
print(books[books["categories"].str.lower().isin([
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical"
])])

# Save the changes in a new file.
books.to_csv("books_with_categories.csv", index=False)
vector_search.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_chroma import Chroma
2
+ from langchain_openai import OpenAIEmbeddings
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_community.document_loaders import TextLoader
5
+ from langchain_text_splitters import CharacterTextSplitter
6
+ from dotenv import load_dotenv
7
+ from tabulate import tabulate
8
+ import pandas as pd
9
+
10
# Build input for the vector store: one tagged description per line.

# Load environment variables (e.g. the OpenAI API key) from .env.
load_dotenv()

books = pd.read_csv("books_cleaned.csv")

# No index/header, newline separator: each line starts with the isbn13 tag.
books["tagged_description"].to_csv("tagged_description.txt",
                                   sep="\n",
                                   index=False,
                                   header=False)

# The existing chroma_db_books DB was created with OpenAIEmbeddings
# (1536-dimensional vectors); HuggingFaceEmbeddings produces
# 384-dimensional vectors, so the two stores are NOT interchangeable.

# OpenAI approach via its API, kept for reference.  chunk_size=0 makes the
# splitter prioritize the separator over the chunk size, so warnings are
# expected:
#   raw_documents = TextLoader("tagged_description.txt").load()
#   text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
#   documents = text_splitter.split_documents(raw_documents)
#   print(documents[0])          # sanity-check the first description
#   db_books = Chroma.from_documents(
#       documents,
#       embedding=OpenAIEmbeddings(),
#       persist_directory="chroma_db_books"
#   )
#   print("Vector database stored to local disk:)")
43
# HuggingFace approach — avoids OpenAI API costs.  A flag controls whether
# the vector database is rebuilt or reloaded from disk.
query = "A book to teach children about nature"

REBUILD_VECTOR_DB = False  # flip to True to re-embed everything from scratch
PERSIST_DIR = "chroma_db_books_hf"
MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim (keep consistent!)

# 1. Embedding model — must be identical for build and query.
embedding = HuggingFaceEmbeddings(model_name=MODEL)

if REBUILD_VECTOR_DB:
    # 2. Load and split the tagged descriptions (one per line).
    raw_documents = TextLoader("tagged_description.txt").load()
    text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
    documents = text_splitter.split_documents(raw_documents)

    # 3. Create and persist the vector DB.
    db_books = Chroma.from_documents(
        documents,
        embedding=embedding,
        persist_directory=PERSIST_DIR
    )

    print("First split chunk:")
    print(documents[0].page_content)
else:
    # 4. Reuse the existing DB — no re-embedding needed.
    db_books = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=embedding
    )

# 5. Run a sample query.
results = db_books.similarity_search(query, k=1)
print("Top semantic match:\n" + results[0].page_content + "\n")

docs = db_books.similarity_search(query, k=10)
print("First 10 results: \n", docs, "\n")

# The isbn13 is the first whitespace-separated token of the matched chunk;
# use it to look the book up in the dataframe.
print("First result of all:\n")
print(books[books["isbn13"] == int(docs[0].page_content.split()[0].strip())])
print("\n")
89
def retrieve_semantic_recommendations(
        query: str,
        top_k: int = 10,
) -> pd.DataFrame:
    """Return up to *top_k* books semantically similar to *query*.

    Searches the Chroma store for the 50 nearest tagged descriptions, pulls
    the isbn13 prefix out of each chunk, and selects those books from the
    dataframe.
    """
    recs = db_books.similarity_search(query, k=50)
    # Each chunk starts with the isbn13; strip a possible CSV-quoting '"'.
    matched_isbns = [int(rec.page_content.strip('"').split()[0]) for rec in recs]
    # NOTE(review): isin + head keeps dataframe order, not similarity order
    # — confirm whether ranked output is wanted here.
    return books[books["isbn13"].isin(matched_isbns)].head(top_k)
101
+
102
# Fetch and pretty-print the recommendations for the sample query.
recommendations = retrieve_semantic_recommendations(query)
print("Recommendations:\n")
print(tabulate(recommendations, headers='keys', tablefmt='grid', showindex=False))