Spaces:
Build error
Build error
YuITC commited on
Commit ·
2d13434
1
Parent(s): 7f9296c
Final
Browse files- LICENSE +21 -0
- README.md +55 -14
- app.py +0 -11
- cover-not-found.jpg +0 -0
- data/books.csv +0 -0
- data/books_cleaned.csv +0 -0
- data/books_with_categories.csv +0 -0
- data/books_with_emotions.csv +0 -0
- data/full_desc.txt +0 -0
- requirements.txt +17 -0
- step_01_EDA.ipynb +0 -0
- step_02_Vector_Search.ipynb +0 -0
- step_03_Zero_Shot_Classification.ipynb +872 -0
- step_04_Sentiment_Analysis.ipynb +671 -0
- step_05_Gradio_Dashboard.py +122 -0
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Nguyen Phu Tai
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,14 +1,55 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Semantic Book Recommender
|
| 2 |
+
|
| 3 |
+
A semantic book recommendation system that uses modern NLP techniques to provide context-aware suggestions.
|
| 4 |
+
|
| 5 |
+

|
| 6 |
+
|
| 7 |
+
## 🧠 Project Overview
|
| 8 |
+
|
| 9 |
+
This project explores the application of Natural Language Processing (NLP) and Large Language Models (LLMs) in building a semantic book recommender system. The system goes beyond traditional keyword-based recommendations by understanding the contextual meaning of book descriptions and user preferences.
|
| 10 |
+
|
| 11 |
+
## 📁 Project Structure
|
| 12 |
+
|
| 13 |
+
- **`data/`**: Contains the book datasets used in the project, including the cleaned data and the versions enriched with categories and emotions.
|
| 14 |
+
- **`step_01_EDA.ipynb`**: Performs Exploratory Data Analysis to understand data distribution and key features.
|
| 15 |
+
- **`step_02_Vector_Search.ipynb`**: Implements vector-based search using sentence embeddings to find semantically similar books.
|
| 16 |
+
- **`step_03_Zero_Shot_Classification.ipynb`**: Applies zero-shot classification with a pre-trained model (`facebook/bart-large-mnli`) to label books as Fiction or Nonfiction without any task-specific labeled training data.
|
| 17 |
+
- **`step_04_Sentiment_Analysis.ipynb`**: Conducts sentiment analysis on book descriptions to capture the emotional tone of each book.
|
| 18 |
+
- **`step_05_Gradio_Dashboard.py`**: Develops an interactive dashboard using Gradio for users to input preferences and receive recommendations.
|
| 19 |
+
- **`requirements.txt`**: Lists all Python dependencies required to run the project.
|
| 20 |
+
|
| 21 |
+
## 🔍 Key Features
|
| 22 |
+
|
| 23 |
+
- **Semantic Search**: Utilizes sentence embeddings to capture the semantic meaning of book descriptions, enabling more accurate recommendations.
|
| 24 |
+
- **Zero-Shot Classification**: Employs pre-trained LLMs to classify books into genres or categories without the need for labeled training data.
|
| 25 |
+
- **Sentiment Analysis**: Analyzes book descriptions to gauge the emotional tone of each book, aiding in recommendation decisions.
|
| 26 |
+
- **Interactive Dashboard**: Provides a user-friendly interface for users to input their preferences and receive tailored book suggestions.
|
| 27 |
+
|
| 28 |
+
## 🚀 Getting Started
|
| 29 |
+
|
| 30 |
+
1. **Clone the repository**:
|
| 31 |
+
```bash
|
| 32 |
+
git clone https://github.com/YuITC/Semantic-Book-Recommender.git
|
| 33 |
+
cd Semantic-Book-Recommender
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
2. **Install dependencies**:
|
| 37 |
+
```bash
|
| 38 |
+
pip install -r requirements.txt
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
3. **Run the Gradio dashboard**:
|
| 42 |
+
```bash
|
| 43 |
+
python step_05_Gradio_Dashboard.py
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## 📜 License
|
| 47 |
+
This project is licensed under the MIT License – feel free to modify and distribute it as needed.
|
| 48 |
+
|
| 49 |
+
## 🤝 Acknowledgments
|
| 50 |
+
If you find this project useful, consider ⭐️ starring the repository or contributing to further improvements!
|
| 51 |
+
|
| 52 |
+
## 📬 Contact
|
| 53 |
+
For any questions or collaboration opportunities, feel free to reach out:
|
| 54 |
+
|
| 55 |
+
📧 Email: tainguyenphu2502@gmail.com
|
app.py
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
def greet(name):
|
| 5 |
-
return "Hello " + name + "!"
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
demo = gr.Interface(fn=greet, inputs="textbox", outputs="textbox")
|
| 9 |
-
|
| 10 |
-
if __name__ == "__main__":
|
| 11 |
-
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cover-not-found.jpg
ADDED
|
data/books.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/books_cleaned.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/books_with_categories.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/books_with_emotions.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/full_desc.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
CHANGED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# data
|
| 2 |
+
kagglehub
|
| 3 |
+
pandas
|
| 4 |
+
matplotlib
|
| 5 |
+
seaborn
|
| 6 |
+
|
| 7 |
+
# environment
|
| 8 |
+
python-dotenv
|
| 9 |
+
|
| 10 |
+
# langchain
|
| 11 |
+
langchain-community==0.3.12
|
| 12 |
+
langchain-huggingface
|
| 13 |
+
langchain-chroma==0.1.4
|
| 14 |
+
|
| 15 |
+
# huggingface
|
| 16 |
+
transformers==4.47.1
|
| 17 |
+
gradio
|
step_01_EDA.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
step_02_Vector_Search.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
step_03_Zero_Shot_Classification.ipynb
ADDED
|
@@ -0,0 +1,872 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"data": {
|
| 10 |
+
"text/html": [
|
| 11 |
+
"<div>\n",
|
| 12 |
+
"<style scoped>\n",
|
| 13 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 14 |
+
" vertical-align: middle;\n",
|
| 15 |
+
" }\n",
|
| 16 |
+
"\n",
|
| 17 |
+
" .dataframe tbody tr th {\n",
|
| 18 |
+
" vertical-align: top;\n",
|
| 19 |
+
" }\n",
|
| 20 |
+
"\n",
|
| 21 |
+
" .dataframe thead th {\n",
|
| 22 |
+
" text-align: right;\n",
|
| 23 |
+
" }\n",
|
| 24 |
+
"</style>\n",
|
| 25 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 26 |
+
" <thead>\n",
|
| 27 |
+
" <tr style=\"text-align: right;\">\n",
|
| 28 |
+
" <th></th>\n",
|
| 29 |
+
" <th>isbn13</th>\n",
|
| 30 |
+
" <th>full_title</th>\n",
|
| 31 |
+
" <th>authors</th>\n",
|
| 32 |
+
" <th>categories</th>\n",
|
| 33 |
+
" <th>description</th>\n",
|
| 34 |
+
" <th>full_desc</th>\n",
|
| 35 |
+
" <th>published_year</th>\n",
|
| 36 |
+
" <th>num_pages</th>\n",
|
| 37 |
+
" <th>average_rating</th>\n",
|
| 38 |
+
" <th>ratings_count</th>\n",
|
| 39 |
+
" <th>thumbnail</th>\n",
|
| 40 |
+
" </tr>\n",
|
| 41 |
+
" </thead>\n",
|
| 42 |
+
" <tbody>\n",
|
| 43 |
+
" <tr>\n",
|
| 44 |
+
" <th>0</th>\n",
|
| 45 |
+
" <td>9780002005883</td>\n",
|
| 46 |
+
" <td>Gilead</td>\n",
|
| 47 |
+
" <td>Marilynne Robinson</td>\n",
|
| 48 |
+
" <td>Fiction</td>\n",
|
| 49 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 50 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 51 |
+
" <td>2004.0</td>\n",
|
| 52 |
+
" <td>247.0</td>\n",
|
| 53 |
+
" <td>3.85</td>\n",
|
| 54 |
+
" <td>361.0</td>\n",
|
| 55 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 56 |
+
" </tr>\n",
|
| 57 |
+
" <tr>\n",
|
| 58 |
+
" <th>1</th>\n",
|
| 59 |
+
" <td>9780002261982</td>\n",
|
| 60 |
+
" <td>Spider's Web: A Novel</td>\n",
|
| 61 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 62 |
+
" <td>Detective and mystery stories</td>\n",
|
| 63 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 64 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 65 |
+
" <td>2000.0</td>\n",
|
| 66 |
+
" <td>241.0</td>\n",
|
| 67 |
+
" <td>3.83</td>\n",
|
| 68 |
+
" <td>5164.0</td>\n",
|
| 69 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 70 |
+
" </tr>\n",
|
| 71 |
+
" <tr>\n",
|
| 72 |
+
" <th>2</th>\n",
|
| 73 |
+
" <td>9780006178736</td>\n",
|
| 74 |
+
" <td>Rage of angels</td>\n",
|
| 75 |
+
" <td>Sidney Sheldon</td>\n",
|
| 76 |
+
" <td>Fiction</td>\n",
|
| 77 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 78 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 79 |
+
" <td>1993.0</td>\n",
|
| 80 |
+
" <td>512.0</td>\n",
|
| 81 |
+
" <td>3.93</td>\n",
|
| 82 |
+
" <td>29532.0</td>\n",
|
| 83 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 84 |
+
" </tr>\n",
|
| 85 |
+
" <tr>\n",
|
| 86 |
+
" <th>3</th>\n",
|
| 87 |
+
" <td>9780006280897</td>\n",
|
| 88 |
+
" <td>The Four Loves</td>\n",
|
| 89 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 90 |
+
" <td>Christian life</td>\n",
|
| 91 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 92 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 93 |
+
" <td>2002.0</td>\n",
|
| 94 |
+
" <td>170.0</td>\n",
|
| 95 |
+
" <td>4.15</td>\n",
|
| 96 |
+
" <td>33684.0</td>\n",
|
| 97 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 98 |
+
" </tr>\n",
|
| 99 |
+
" <tr>\n",
|
| 100 |
+
" <th>4</th>\n",
|
| 101 |
+
" <td>9780006280934</td>\n",
|
| 102 |
+
" <td>The Problem of Pain</td>\n",
|
| 103 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 104 |
+
" <td>Christian life</td>\n",
|
| 105 |
+
" <td>\"In The Problem of Pain, C.S. Lewis, one of th...</td>\n",
|
| 106 |
+
" <td>9780006280934 \"In The Problem of Pain, C.S. Le...</td>\n",
|
| 107 |
+
" <td>2002.0</td>\n",
|
| 108 |
+
" <td>176.0</td>\n",
|
| 109 |
+
" <td>4.09</td>\n",
|
| 110 |
+
" <td>37569.0</td>\n",
|
| 111 |
+
" <td>http://books.google.com/books/content?id=Kk-uV...</td>\n",
|
| 112 |
+
" </tr>\n",
|
| 113 |
+
" </tbody>\n",
|
| 114 |
+
"</table>\n",
|
| 115 |
+
"</div>"
|
| 116 |
+
],
|
| 117 |
+
"text/plain": [
|
| 118 |
+
" isbn13 full_title authors \\\n",
|
| 119 |
+
"0 9780002005883 Gilead Marilynne Robinson \n",
|
| 120 |
+
"1 9780002261982 Spider's Web: A Novel Charles Osborne;Agatha Christie \n",
|
| 121 |
+
"2 9780006178736 Rage of angels Sidney Sheldon \n",
|
| 122 |
+
"3 9780006280897 The Four Loves Clive Staples Lewis \n",
|
| 123 |
+
"4 9780006280934 The Problem of Pain Clive Staples Lewis \n",
|
| 124 |
+
"\n",
|
| 125 |
+
" categories \\\n",
|
| 126 |
+
"0 Fiction \n",
|
| 127 |
+
"1 Detective and mystery stories \n",
|
| 128 |
+
"2 Fiction \n",
|
| 129 |
+
"3 Christian life \n",
|
| 130 |
+
"4 Christian life \n",
|
| 131 |
+
"\n",
|
| 132 |
+
" description \\\n",
|
| 133 |
+
"0 A NOVEL THAT READERS and critics have been eag... \n",
|
| 134 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... \n",
|
| 135 |
+
"2 A memorable, mesmerizing heroine Jennifer -- b... \n",
|
| 136 |
+
"3 Lewis' work on the nature of love divides love... \n",
|
| 137 |
+
"4 \"In The Problem of Pain, C.S. Lewis, one of th... \n",
|
| 138 |
+
"\n",
|
| 139 |
+
" full_desc published_year \\\n",
|
| 140 |
+
"0 9780002005883 A NOVEL THAT READERS and critics... 2004.0 \n",
|
| 141 |
+
"1 9780002261982 A new 'Christie for Christmas' -... 2000.0 \n",
|
| 142 |
+
"2 9780006178736 A memorable, mesmerizing heroine... 1993.0 \n",
|
| 143 |
+
"3 9780006280897 Lewis' work on the nature of lov... 2002.0 \n",
|
| 144 |
+
"4 9780006280934 \"In The Problem of Pain, C.S. Le... 2002.0 \n",
|
| 145 |
+
"\n",
|
| 146 |
+
" num_pages average_rating ratings_count \\\n",
|
| 147 |
+
"0 247.0 3.85 361.0 \n",
|
| 148 |
+
"1 241.0 3.83 5164.0 \n",
|
| 149 |
+
"2 512.0 3.93 29532.0 \n",
|
| 150 |
+
"3 170.0 4.15 33684.0 \n",
|
| 151 |
+
"4 176.0 4.09 37569.0 \n",
|
| 152 |
+
"\n",
|
| 153 |
+
" thumbnail \n",
|
| 154 |
+
"0 http://books.google.com/books/content?id=KQZCP... \n",
|
| 155 |
+
"1 http://books.google.com/books/content?id=gA5GP... \n",
|
| 156 |
+
"2 http://books.google.com/books/content?id=FKo2T... \n",
|
| 157 |
+
"3 http://books.google.com/books/content?id=XhQ5X... \n",
|
| 158 |
+
"4 http://books.google.com/books/content?id=Kk-uV... "
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
"execution_count": 1,
|
| 162 |
+
"metadata": {},
|
| 163 |
+
"output_type": "execute_result"
|
| 164 |
+
}
|
| 165 |
+
],
|
| 166 |
+
"source": [
|
| 167 |
+
"import pandas as pd\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"books = pd.read_csv('data/books_cleaned.csv')\n",
|
| 170 |
+
"books.head()"
|
| 171 |
+
]
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"cell_type": "code",
|
| 175 |
+
"execution_count": 3,
|
| 176 |
+
"metadata": {},
|
| 177 |
+
"outputs": [
|
| 178 |
+
{
|
| 179 |
+
"data": {
|
| 180 |
+
"text/html": [
|
| 181 |
+
"<div>\n",
|
| 182 |
+
"<style scoped>\n",
|
| 183 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 184 |
+
" vertical-align: middle;\n",
|
| 185 |
+
" }\n",
|
| 186 |
+
"\n",
|
| 187 |
+
" .dataframe tbody tr th {\n",
|
| 188 |
+
" vertical-align: top;\n",
|
| 189 |
+
" }\n",
|
| 190 |
+
"\n",
|
| 191 |
+
" .dataframe thead th {\n",
|
| 192 |
+
" text-align: right;\n",
|
| 193 |
+
" }\n",
|
| 194 |
+
"</style>\n",
|
| 195 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 196 |
+
" <thead>\n",
|
| 197 |
+
" <tr style=\"text-align: right;\">\n",
|
| 198 |
+
" <th></th>\n",
|
| 199 |
+
" <th>0</th>\n",
|
| 200 |
+
" <th>1</th>\n",
|
| 201 |
+
" <th>2</th>\n",
|
| 202 |
+
" <th>3</th>\n",
|
| 203 |
+
" <th>4</th>\n",
|
| 204 |
+
" <th>5</th>\n",
|
| 205 |
+
" <th>6</th>\n",
|
| 206 |
+
" <th>7</th>\n",
|
| 207 |
+
" <th>8</th>\n",
|
| 208 |
+
" <th>9</th>\n",
|
| 209 |
+
" <th>10</th>\n",
|
| 210 |
+
" <th>11</th>\n",
|
| 211 |
+
" <th>12</th>\n",
|
| 212 |
+
" </tr>\n",
|
| 213 |
+
" </thead>\n",
|
| 214 |
+
" <tbody>\n",
|
| 215 |
+
" <tr>\n",
|
| 216 |
+
" <th>categories</th>\n",
|
| 217 |
+
" <td>Fiction</td>\n",
|
| 218 |
+
" <td>Juvenile Fiction</td>\n",
|
| 219 |
+
" <td>Biography & Autobiography</td>\n",
|
| 220 |
+
" <td>History</td>\n",
|
| 221 |
+
" <td>Literary Criticism</td>\n",
|
| 222 |
+
" <td>Religion</td>\n",
|
| 223 |
+
" <td>Philosophy</td>\n",
|
| 224 |
+
" <td>Comics & Graphic Novels</td>\n",
|
| 225 |
+
" <td>Drama</td>\n",
|
| 226 |
+
" <td>Juvenile Nonfiction</td>\n",
|
| 227 |
+
" <td>Science</td>\n",
|
| 228 |
+
" <td>Poetry</td>\n",
|
| 229 |
+
" <td>Literary Collections</td>\n",
|
| 230 |
+
" </tr>\n",
|
| 231 |
+
" <tr>\n",
|
| 232 |
+
" <th>count</th>\n",
|
| 233 |
+
" <td>2111</td>\n",
|
| 234 |
+
" <td>390</td>\n",
|
| 235 |
+
" <td>311</td>\n",
|
| 236 |
+
" <td>207</td>\n",
|
| 237 |
+
" <td>124</td>\n",
|
| 238 |
+
" <td>117</td>\n",
|
| 239 |
+
" <td>117</td>\n",
|
| 240 |
+
" <td>116</td>\n",
|
| 241 |
+
" <td>86</td>\n",
|
| 242 |
+
" <td>57</td>\n",
|
| 243 |
+
" <td>56</td>\n",
|
| 244 |
+
" <td>51</td>\n",
|
| 245 |
+
" <td>50</td>\n",
|
| 246 |
+
" </tr>\n",
|
| 247 |
+
" </tbody>\n",
|
| 248 |
+
"</table>\n",
|
| 249 |
+
"</div>"
|
| 250 |
+
],
|
| 251 |
+
"text/plain": [
|
| 252 |
+
" 0 1 2 3 \\\n",
|
| 253 |
+
"categories Fiction Juvenile Fiction Biography & Autobiography History \n",
|
| 254 |
+
"count 2111 390 311 207 \n",
|
| 255 |
+
"\n",
|
| 256 |
+
" 4 5 6 7 \\\n",
|
| 257 |
+
"categories Literary Criticism Religion Philosophy Comics & Graphic Novels \n",
|
| 258 |
+
"count 124 117 117 116 \n",
|
| 259 |
+
"\n",
|
| 260 |
+
" 8 9 10 11 12 \n",
|
| 261 |
+
"categories Drama Juvenile Nonfiction Science Poetry Literary Collections \n",
|
| 262 |
+
"count 86 57 56 51 50 "
|
| 263 |
+
]
|
| 264 |
+
},
|
| 265 |
+
"execution_count": 3,
|
| 266 |
+
"metadata": {},
|
| 267 |
+
"output_type": "execute_result"
|
| 268 |
+
}
|
| 269 |
+
],
|
| 270 |
+
"source": [
|
| 271 |
+
"books['categories'].value_counts().reset_index().query('count >= 50').T"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": 4,
|
| 277 |
+
"metadata": {},
|
| 278 |
+
"outputs": [
|
| 279 |
+
{
|
| 280 |
+
"name": "stdout",
|
| 281 |
+
"output_type": "stream",
|
| 282 |
+
"text": [
|
| 283 |
+
"Total books: 5197\n",
|
| 284 |
+
"Books with simple categories: 3743\n"
|
| 285 |
+
]
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"data": {
|
| 289 |
+
"text/html": [
|
| 290 |
+
"<div>\n",
|
| 291 |
+
"<style scoped>\n",
|
| 292 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 293 |
+
" vertical-align: middle;\n",
|
| 294 |
+
" }\n",
|
| 295 |
+
"\n",
|
| 296 |
+
" .dataframe tbody tr th {\n",
|
| 297 |
+
" vertical-align: top;\n",
|
| 298 |
+
" }\n",
|
| 299 |
+
"\n",
|
| 300 |
+
" .dataframe thead th {\n",
|
| 301 |
+
" text-align: right;\n",
|
| 302 |
+
" }\n",
|
| 303 |
+
"</style>\n",
|
| 304 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 305 |
+
" <thead>\n",
|
| 306 |
+
" <tr style=\"text-align: right;\">\n",
|
| 307 |
+
" <th></th>\n",
|
| 308 |
+
" <th>simple_categories</th>\n",
|
| 309 |
+
" <th>count</th>\n",
|
| 310 |
+
" </tr>\n",
|
| 311 |
+
" </thead>\n",
|
| 312 |
+
" <tbody>\n",
|
| 313 |
+
" <tr>\n",
|
| 314 |
+
" <th>0</th>\n",
|
| 315 |
+
" <td>Fiction</td>\n",
|
| 316 |
+
" <td>2364</td>\n",
|
| 317 |
+
" </tr>\n",
|
| 318 |
+
" <tr>\n",
|
| 319 |
+
" <th>1</th>\n",
|
| 320 |
+
" <td>Nonfiction</td>\n",
|
| 321 |
+
" <td>932</td>\n",
|
| 322 |
+
" </tr>\n",
|
| 323 |
+
" <tr>\n",
|
| 324 |
+
" <th>2</th>\n",
|
| 325 |
+
" <td>Children's Fiction</td>\n",
|
| 326 |
+
" <td>390</td>\n",
|
| 327 |
+
" </tr>\n",
|
| 328 |
+
" <tr>\n",
|
| 329 |
+
" <th>3</th>\n",
|
| 330 |
+
" <td>Children's Nonfiction</td>\n",
|
| 331 |
+
" <td>57</td>\n",
|
| 332 |
+
" </tr>\n",
|
| 333 |
+
" </tbody>\n",
|
| 334 |
+
"</table>\n",
|
| 335 |
+
"</div>"
|
| 336 |
+
],
|
| 337 |
+
"text/plain": [
|
| 338 |
+
" simple_categories count\n",
|
| 339 |
+
"0 Fiction 2364\n",
|
| 340 |
+
"1 Nonfiction 932\n",
|
| 341 |
+
"2 Children's Fiction 390\n",
|
| 342 |
+
"3 Children's Nonfiction 57"
|
| 343 |
+
]
|
| 344 |
+
},
|
| 345 |
+
"execution_count": 4,
|
| 346 |
+
"metadata": {},
|
| 347 |
+
"output_type": "execute_result"
|
| 348 |
+
}
|
| 349 |
+
],
|
| 350 |
+
"source": [
|
| 351 |
+
"category_mapping = {\n",
|
| 352 |
+
" 'Fiction' : \"Fiction\",\n",
|
| 353 |
+
" 'Juvenile Fiction' : \"Children's Fiction\",\n",
|
| 354 |
+
" 'Biography & Autobiography': \"Nonfiction\",\n",
|
| 355 |
+
" 'History' : \"Nonfiction\",\n",
|
| 356 |
+
" 'Literary Criticism' : \"Nonfiction\",\n",
|
| 357 |
+
" 'Philosophy' : \"Nonfiction\",\n",
|
| 358 |
+
" 'Religion' : \"Nonfiction\",\n",
|
| 359 |
+
" 'Comics & Graphic Novels' : \"Fiction\",\n",
|
| 360 |
+
" 'Drama' : \"Fiction\",\n",
|
| 361 |
+
" 'Juvenile Nonfiction' : \"Children's Nonfiction\",\n",
|
| 362 |
+
" 'Science' : \"Nonfiction\",\n",
|
| 363 |
+
" 'Poetry' : \"Fiction\"\n",
|
| 364 |
+
"}\n",
|
| 365 |
+
"\n",
|
| 366 |
+
"books['simple_categories'] = books['categories'].map(category_mapping)\n",
|
| 367 |
+
"\n",
|
| 368 |
+
"print(f\"Total books: {len(books)}\")\n",
|
| 369 |
+
"print(f\"Books with simple categories: {len(books[~(books['simple_categories'].isna())])}\")\n",
|
| 370 |
+
"books['simple_categories'].value_counts().reset_index()"
|
| 371 |
+
]
|
| 372 |
+
},
|
| 373 |
+
{
|
| 374 |
+
"cell_type": "code",
|
| 375 |
+
"execution_count": 5,
|
| 376 |
+
"metadata": {},
|
| 377 |
+
"outputs": [
|
| 378 |
+
{
|
| 379 |
+
"name": "stderr",
|
| 380 |
+
"output_type": "stream",
|
| 381 |
+
"text": [
|
| 382 |
+
"Device set to use cuda:0\n"
|
| 383 |
+
]
|
| 384 |
+
}
|
| 385 |
+
],
|
| 386 |
+
"source": [
|
| 387 |
+
"from transformers import pipeline\n",
|
| 388 |
+
"\n",
|
| 389 |
+
"fiction_categories = ['Fiction', 'Nonfiction']\n",
|
| 390 |
+
"pipe = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', device=0)"
|
| 391 |
+
]
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"cell_type": "code",
|
| 395 |
+
"execution_count": 6,
|
| 396 |
+
"metadata": {},
|
| 397 |
+
"outputs": [],
|
| 398 |
+
"source": [
|
| 399 |
+
"import numpy as np\n",
|
| 400 |
+
"\n",
|
| 401 |
+
"def generate_predictions(sequence, categories):\n",
|
| 402 |
+
" predictions = pipe(sequence, categories)\n",
|
| 403 |
+
" max_index = np.argmax(predictions['scores'])\n",
|
| 404 |
+
" max_label = predictions['labels'][max_index]\n",
|
| 405 |
+
" return max_label"
|
| 406 |
+
]
|
| 407 |
+
},
|
| 408 |
+
{
|
| 409 |
+
"cell_type": "code",
|
| 410 |
+
"execution_count": 8,
|
| 411 |
+
"metadata": {},
|
| 412 |
+
"outputs": [
|
| 413 |
+
{
|
| 414 |
+
"name": "stderr",
|
| 415 |
+
"output_type": "stream",
|
| 416 |
+
"text": [
|
| 417 |
+
"Calculating accuracy for 500 Fiction books: 100%|██████████| 500/500 [00:42<00:00, 11.89it/s]\n",
|
| 418 |
+
"Calculating accuracy for 500 Nonfiction books: 100%|██████████| 500/500 [00:42<00:00, 11.70it/s]\n"
|
| 419 |
+
]
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"data": {
|
| 423 |
+
"text/plain": [
|
| 424 |
+
"{'Fiction': 0.674, 'Nonfiction': 0.866, 'total': 0.77}"
|
| 425 |
+
]
|
| 426 |
+
},
|
| 427 |
+
"execution_count": 8,
|
| 428 |
+
"metadata": {},
|
| 429 |
+
"output_type": "execute_result"
|
| 430 |
+
}
|
| 431 |
+
],
|
| 432 |
+
"source": [
|
| 433 |
+
"from tqdm import tqdm\n",
|
| 434 |
+
"\n",
|
| 435 |
+
"# Calculate accuracy for 500 books for each category\n",
|
| 436 |
+
"demo_500_accuracy = {}\n",
|
| 437 |
+
"for label in ['Fiction', 'Nonfiction']:\n",
|
| 438 |
+
" correct = 0\n",
|
| 439 |
+
" descs = books.loc[books['simple_categories'] == label, 'description'].reset_index(drop=True)[:500]\n",
|
| 440 |
+
"\n",
|
| 441 |
+
" for desc in tqdm(descs, desc=f'Calculating accuracy for 500 {label} books'):\n",
|
| 442 |
+
" predicted_label = generate_predictions(desc, fiction_categories)\n",
|
| 443 |
+
" if predicted_label == label:\n",
|
| 444 |
+
" correct += 1\n",
|
| 445 |
+
"\n",
|
| 446 |
+
" accuracy = correct / len(descs)\n",
|
| 447 |
+
" demo_500_accuracy[label] = accuracy\n",
|
| 448 |
+
" \n",
|
| 449 |
+
"# Calculate macro average accuracy\n",
|
| 450 |
+
"demo_500_accuracy['total'] = sum(demo_500_accuracy.values()) / len(demo_500_accuracy)\n",
|
| 451 |
+
"demo_500_accuracy"
|
| 452 |
+
]
|
| 453 |
+
},
|
| 454 |
+
{
|
| 455 |
+
"cell_type": "code",
|
| 456 |
+
"execution_count": 9,
|
| 457 |
+
"metadata": {},
|
| 458 |
+
"outputs": [
|
| 459 |
+
{
|
| 460 |
+
"name": "stderr",
|
| 461 |
+
"output_type": "stream",
|
| 462 |
+
"text": [
|
| 463 |
+
"Predicting for books without simple categories: 100%|██████████| 1454/1454 [02:13<00:00, 10.93it/s]\n"
|
| 464 |
+
]
|
| 465 |
+
}
|
| 466 |
+
],
|
| 467 |
+
"source": [
|
| 468 |
+
"# Predict categories for books without simple categories\n",
|
| 469 |
+
"\n",
|
| 470 |
+
"isbns, preds = [], []\n",
|
| 471 |
+
"non_cat_books = books.loc[books['simple_categories'].isna(), ['isbn13', 'description']].reset_index(drop=True)\n",
|
| 472 |
+
"\n",
|
| 473 |
+
"for i in tqdm(range(len(non_cat_books)), desc=\"Predicting for books without simple categories\"):\n",
|
| 474 |
+
" sequence = non_cat_books.loc[i, 'description']\n",
|
| 475 |
+
" \n",
|
| 476 |
+
" isbns.append(non_cat_books.loc[i, 'isbn13'])\n",
|
| 477 |
+
" preds.append(generate_predictions(sequence, fiction_categories))"
|
| 478 |
+
]
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"cell_type": "code",
|
| 482 |
+
"execution_count": 10,
|
| 483 |
+
"metadata": {},
|
| 484 |
+
"outputs": [
|
| 485 |
+
{
|
| 486 |
+
"data": {
|
| 487 |
+
"text/html": [
|
| 488 |
+
"<div>\n",
|
| 489 |
+
"<style scoped>\n",
|
| 490 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 491 |
+
" vertical-align: middle;\n",
|
| 492 |
+
" }\n",
|
| 493 |
+
"\n",
|
| 494 |
+
" .dataframe tbody tr th {\n",
|
| 495 |
+
" vertical-align: top;\n",
|
| 496 |
+
" }\n",
|
| 497 |
+
"\n",
|
| 498 |
+
" .dataframe thead th {\n",
|
| 499 |
+
" text-align: right;\n",
|
| 500 |
+
" }\n",
|
| 501 |
+
"</style>\n",
|
| 502 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 503 |
+
" <thead>\n",
|
| 504 |
+
" <tr style=\"text-align: right;\">\n",
|
| 505 |
+
" <th></th>\n",
|
| 506 |
+
" <th>isbn13</th>\n",
|
| 507 |
+
" <th>full_title</th>\n",
|
| 508 |
+
" <th>authors</th>\n",
|
| 509 |
+
" <th>categories</th>\n",
|
| 510 |
+
" <th>description</th>\n",
|
| 511 |
+
" <th>full_desc</th>\n",
|
| 512 |
+
" <th>published_year</th>\n",
|
| 513 |
+
" <th>num_pages</th>\n",
|
| 514 |
+
" <th>average_rating</th>\n",
|
| 515 |
+
" <th>ratings_count</th>\n",
|
| 516 |
+
" <th>thumbnail</th>\n",
|
| 517 |
+
" <th>simple_categories</th>\n",
|
| 518 |
+
" <th>predicted_categories</th>\n",
|
| 519 |
+
" </tr>\n",
|
| 520 |
+
" </thead>\n",
|
| 521 |
+
" <tbody>\n",
|
| 522 |
+
" <tr>\n",
|
| 523 |
+
" <th>0</th>\n",
|
| 524 |
+
" <td>9780002005883</td>\n",
|
| 525 |
+
" <td>Gilead</td>\n",
|
| 526 |
+
" <td>Marilynne Robinson</td>\n",
|
| 527 |
+
" <td>Fiction</td>\n",
|
| 528 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 529 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 530 |
+
" <td>2004.0</td>\n",
|
| 531 |
+
" <td>247.0</td>\n",
|
| 532 |
+
" <td>3.85</td>\n",
|
| 533 |
+
" <td>361.0</td>\n",
|
| 534 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 535 |
+
" <td>Fiction</td>\n",
|
| 536 |
+
" <td>NaN</td>\n",
|
| 537 |
+
" </tr>\n",
|
| 538 |
+
" <tr>\n",
|
| 539 |
+
" <th>1</th>\n",
|
| 540 |
+
" <td>9780002261982</td>\n",
|
| 541 |
+
" <td>Spider's Web: A Novel</td>\n",
|
| 542 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 543 |
+
" <td>Detective and mystery stories</td>\n",
|
| 544 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 545 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 546 |
+
" <td>2000.0</td>\n",
|
| 547 |
+
" <td>241.0</td>\n",
|
| 548 |
+
" <td>3.83</td>\n",
|
| 549 |
+
" <td>5164.0</td>\n",
|
| 550 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 551 |
+
" <td>NaN</td>\n",
|
| 552 |
+
" <td>Fiction</td>\n",
|
| 553 |
+
" </tr>\n",
|
| 554 |
+
" <tr>\n",
|
| 555 |
+
" <th>2</th>\n",
|
| 556 |
+
" <td>9780006178736</td>\n",
|
| 557 |
+
" <td>Rage of angels</td>\n",
|
| 558 |
+
" <td>Sidney Sheldon</td>\n",
|
| 559 |
+
" <td>Fiction</td>\n",
|
| 560 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 561 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 562 |
+
" <td>1993.0</td>\n",
|
| 563 |
+
" <td>512.0</td>\n",
|
| 564 |
+
" <td>3.93</td>\n",
|
| 565 |
+
" <td>29532.0</td>\n",
|
| 566 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 567 |
+
" <td>Fiction</td>\n",
|
| 568 |
+
" <td>NaN</td>\n",
|
| 569 |
+
" </tr>\n",
|
| 570 |
+
" <tr>\n",
|
| 571 |
+
" <th>3</th>\n",
|
| 572 |
+
" <td>9780006280897</td>\n",
|
| 573 |
+
" <td>The Four Loves</td>\n",
|
| 574 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 575 |
+
" <td>Christian life</td>\n",
|
| 576 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 577 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 578 |
+
" <td>2002.0</td>\n",
|
| 579 |
+
" <td>170.0</td>\n",
|
| 580 |
+
" <td>4.15</td>\n",
|
| 581 |
+
" <td>33684.0</td>\n",
|
| 582 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 583 |
+
" <td>NaN</td>\n",
|
| 584 |
+
" <td>Nonfiction</td>\n",
|
| 585 |
+
" </tr>\n",
|
| 586 |
+
" <tr>\n",
|
| 587 |
+
" <th>4</th>\n",
|
| 588 |
+
" <td>9780006280934</td>\n",
|
| 589 |
+
" <td>The Problem of Pain</td>\n",
|
| 590 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 591 |
+
" <td>Christian life</td>\n",
|
| 592 |
+
" <td>\"In The Problem of Pain, C.S. Lewis, one of th...</td>\n",
|
| 593 |
+
" <td>9780006280934 \"In The Problem of Pain, C.S. Le...</td>\n",
|
| 594 |
+
" <td>2002.0</td>\n",
|
| 595 |
+
" <td>176.0</td>\n",
|
| 596 |
+
" <td>4.09</td>\n",
|
| 597 |
+
" <td>37569.0</td>\n",
|
| 598 |
+
" <td>http://books.google.com/books/content?id=Kk-uV...</td>\n",
|
| 599 |
+
" <td>NaN</td>\n",
|
| 600 |
+
" <td>Nonfiction</td>\n",
|
| 601 |
+
" </tr>\n",
|
| 602 |
+
" </tbody>\n",
|
| 603 |
+
"</table>\n",
|
| 604 |
+
"</div>"
|
| 605 |
+
],
|
| 606 |
+
"text/plain": [
|
| 607 |
+
" isbn13 full_title authors \\\n",
|
| 608 |
+
"0 9780002005883 Gilead Marilynne Robinson \n",
|
| 609 |
+
"1 9780002261982 Spider's Web: A Novel Charles Osborne;Agatha Christie \n",
|
| 610 |
+
"2 9780006178736 Rage of angels Sidney Sheldon \n",
|
| 611 |
+
"3 9780006280897 The Four Loves Clive Staples Lewis \n",
|
| 612 |
+
"4 9780006280934 The Problem of Pain Clive Staples Lewis \n",
|
| 613 |
+
"\n",
|
| 614 |
+
" categories \\\n",
|
| 615 |
+
"0 Fiction \n",
|
| 616 |
+
"1 Detective and mystery stories \n",
|
| 617 |
+
"2 Fiction \n",
|
| 618 |
+
"3 Christian life \n",
|
| 619 |
+
"4 Christian life \n",
|
| 620 |
+
"\n",
|
| 621 |
+
" description \\\n",
|
| 622 |
+
"0 A NOVEL THAT READERS and critics have been eag... \n",
|
| 623 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... \n",
|
| 624 |
+
"2 A memorable, mesmerizing heroine Jennifer -- b... \n",
|
| 625 |
+
"3 Lewis' work on the nature of love divides love... \n",
|
| 626 |
+
"4 \"In The Problem of Pain, C.S. Lewis, one of th... \n",
|
| 627 |
+
"\n",
|
| 628 |
+
" full_desc published_year \\\n",
|
| 629 |
+
"0 9780002005883 A NOVEL THAT READERS and critics... 2004.0 \n",
|
| 630 |
+
"1 9780002261982 A new 'Christie for Christmas' -... 2000.0 \n",
|
| 631 |
+
"2 9780006178736 A memorable, mesmerizing heroine... 1993.0 \n",
|
| 632 |
+
"3 9780006280897 Lewis' work on the nature of lov... 2002.0 \n",
|
| 633 |
+
"4 9780006280934 \"In The Problem of Pain, C.S. Le... 2002.0 \n",
|
| 634 |
+
"\n",
|
| 635 |
+
" num_pages average_rating ratings_count \\\n",
|
| 636 |
+
"0 247.0 3.85 361.0 \n",
|
| 637 |
+
"1 241.0 3.83 5164.0 \n",
|
| 638 |
+
"2 512.0 3.93 29532.0 \n",
|
| 639 |
+
"3 170.0 4.15 33684.0 \n",
|
| 640 |
+
"4 176.0 4.09 37569.0 \n",
|
| 641 |
+
"\n",
|
| 642 |
+
" thumbnail simple_categories \\\n",
|
| 643 |
+
"0 http://books.google.com/books/content?id=KQZCP... Fiction \n",
|
| 644 |
+
"1 http://books.google.com/books/content?id=gA5GP... NaN \n",
|
| 645 |
+
"2 http://books.google.com/books/content?id=FKo2T... Fiction \n",
|
| 646 |
+
"3 http://books.google.com/books/content?id=XhQ5X... NaN \n",
|
| 647 |
+
"4 http://books.google.com/books/content?id=Kk-uV... NaN \n",
|
| 648 |
+
"\n",
|
| 649 |
+
" predicted_categories \n",
|
| 650 |
+
"0 NaN \n",
|
| 651 |
+
"1 Fiction \n",
|
| 652 |
+
"2 NaN \n",
|
| 653 |
+
"3 Nonfiction \n",
|
| 654 |
+
"4 Nonfiction "
|
| 655 |
+
]
|
| 656 |
+
},
|
| 657 |
+
"execution_count": 10,
|
| 658 |
+
"metadata": {},
|
| 659 |
+
"output_type": "execute_result"
|
| 660 |
+
}
|
| 661 |
+
],
|
| 662 |
+
"source": [
|
| 663 |
+
"# Create predicted books dataframe\n",
|
| 664 |
+
"preds_df = pd.DataFrame({'isbn13': isbns, 'predicted_categories': preds})\n",
|
| 665 |
+
"books_with_cat = pd.merge(books, preds_df, on='isbn13', how='left')\n",
|
| 666 |
+
"books_with_cat.head()"
|
| 667 |
+
]
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"cell_type": "code",
|
| 671 |
+
"execution_count": 11,
|
| 672 |
+
"metadata": {},
|
| 673 |
+
"outputs": [
|
| 674 |
+
{
|
| 675 |
+
"data": {
|
| 676 |
+
"text/html": [
|
| 677 |
+
"<div>\n",
|
| 678 |
+
"<style scoped>\n",
|
| 679 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 680 |
+
" vertical-align: middle;\n",
|
| 681 |
+
" }\n",
|
| 682 |
+
"\n",
|
| 683 |
+
" .dataframe tbody tr th {\n",
|
| 684 |
+
" vertical-align: top;\n",
|
| 685 |
+
" }\n",
|
| 686 |
+
"\n",
|
| 687 |
+
" .dataframe thead th {\n",
|
| 688 |
+
" text-align: right;\n",
|
| 689 |
+
" }\n",
|
| 690 |
+
"</style>\n",
|
| 691 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 692 |
+
" <thead>\n",
|
| 693 |
+
" <tr style=\"text-align: right;\">\n",
|
| 694 |
+
" <th></th>\n",
|
| 695 |
+
" <th>isbn13</th>\n",
|
| 696 |
+
" <th>full_title</th>\n",
|
| 697 |
+
" <th>authors</th>\n",
|
| 698 |
+
" <th>categories</th>\n",
|
| 699 |
+
" <th>description</th>\n",
|
| 700 |
+
" <th>full_desc</th>\n",
|
| 701 |
+
" <th>published_year</th>\n",
|
| 702 |
+
" <th>num_pages</th>\n",
|
| 703 |
+
" <th>average_rating</th>\n",
|
| 704 |
+
" <th>ratings_count</th>\n",
|
| 705 |
+
" <th>thumbnail</th>\n",
|
| 706 |
+
" <th>final_categories</th>\n",
|
| 707 |
+
" </tr>\n",
|
| 708 |
+
" </thead>\n",
|
| 709 |
+
" <tbody>\n",
|
| 710 |
+
" <tr>\n",
|
| 711 |
+
" <th>0</th>\n",
|
| 712 |
+
" <td>9780002005883</td>\n",
|
| 713 |
+
" <td>Gilead</td>\n",
|
| 714 |
+
" <td>Marilynne Robinson</td>\n",
|
| 715 |
+
" <td>Fiction</td>\n",
|
| 716 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 717 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 718 |
+
" <td>2004.0</td>\n",
|
| 719 |
+
" <td>247.0</td>\n",
|
| 720 |
+
" <td>3.85</td>\n",
|
| 721 |
+
" <td>361.0</td>\n",
|
| 722 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 723 |
+
" <td>Fiction</td>\n",
|
| 724 |
+
" </tr>\n",
|
| 725 |
+
" <tr>\n",
|
| 726 |
+
" <th>1</th>\n",
|
| 727 |
+
" <td>9780002261982</td>\n",
|
| 728 |
+
" <td>Spider's Web: A Novel</td>\n",
|
| 729 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 730 |
+
" <td>Detective and mystery stories</td>\n",
|
| 731 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 732 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 733 |
+
" <td>2000.0</td>\n",
|
| 734 |
+
" <td>241.0</td>\n",
|
| 735 |
+
" <td>3.83</td>\n",
|
| 736 |
+
" <td>5164.0</td>\n",
|
| 737 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 738 |
+
" <td>Fiction</td>\n",
|
| 739 |
+
" </tr>\n",
|
| 740 |
+
" <tr>\n",
|
| 741 |
+
" <th>2</th>\n",
|
| 742 |
+
" <td>9780006178736</td>\n",
|
| 743 |
+
" <td>Rage of angels</td>\n",
|
| 744 |
+
" <td>Sidney Sheldon</td>\n",
|
| 745 |
+
" <td>Fiction</td>\n",
|
| 746 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 747 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 748 |
+
" <td>1993.0</td>\n",
|
| 749 |
+
" <td>512.0</td>\n",
|
| 750 |
+
" <td>3.93</td>\n",
|
| 751 |
+
" <td>29532.0</td>\n",
|
| 752 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 753 |
+
" <td>Fiction</td>\n",
|
| 754 |
+
" </tr>\n",
|
| 755 |
+
" <tr>\n",
|
| 756 |
+
" <th>3</th>\n",
|
| 757 |
+
" <td>9780006280897</td>\n",
|
| 758 |
+
" <td>The Four Loves</td>\n",
|
| 759 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 760 |
+
" <td>Christian life</td>\n",
|
| 761 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 762 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 763 |
+
" <td>2002.0</td>\n",
|
| 764 |
+
" <td>170.0</td>\n",
|
| 765 |
+
" <td>4.15</td>\n",
|
| 766 |
+
" <td>33684.0</td>\n",
|
| 767 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 768 |
+
" <td>Nonfiction</td>\n",
|
| 769 |
+
" </tr>\n",
|
| 770 |
+
" <tr>\n",
|
| 771 |
+
" <th>4</th>\n",
|
| 772 |
+
" <td>9780006280934</td>\n",
|
| 773 |
+
" <td>The Problem of Pain</td>\n",
|
| 774 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 775 |
+
" <td>Christian life</td>\n",
|
| 776 |
+
" <td>\"In The Problem of Pain, C.S. Lewis, one of th...</td>\n",
|
| 777 |
+
" <td>9780006280934 \"In The Problem of Pain, C.S. Le...</td>\n",
|
| 778 |
+
" <td>2002.0</td>\n",
|
| 779 |
+
" <td>176.0</td>\n",
|
| 780 |
+
" <td>4.09</td>\n",
|
| 781 |
+
" <td>37569.0</td>\n",
|
| 782 |
+
" <td>http://books.google.com/books/content?id=Kk-uV...</td>\n",
|
| 783 |
+
" <td>Nonfiction</td>\n",
|
| 784 |
+
" </tr>\n",
|
| 785 |
+
" </tbody>\n",
|
| 786 |
+
"</table>\n",
|
| 787 |
+
"</div>"
|
| 788 |
+
],
|
| 789 |
+
"text/plain": [
|
| 790 |
+
" isbn13 full_title authors \\\n",
|
| 791 |
+
"0 9780002005883 Gilead Marilynne Robinson \n",
|
| 792 |
+
"1 9780002261982 Spider's Web: A Novel Charles Osborne;Agatha Christie \n",
|
| 793 |
+
"2 9780006178736 Rage of angels Sidney Sheldon \n",
|
| 794 |
+
"3 9780006280897 The Four Loves Clive Staples Lewis \n",
|
| 795 |
+
"4 9780006280934 The Problem of Pain Clive Staples Lewis \n",
|
| 796 |
+
"\n",
|
| 797 |
+
" categories \\\n",
|
| 798 |
+
"0 Fiction \n",
|
| 799 |
+
"1 Detective and mystery stories \n",
|
| 800 |
+
"2 Fiction \n",
|
| 801 |
+
"3 Christian life \n",
|
| 802 |
+
"4 Christian life \n",
|
| 803 |
+
"\n",
|
| 804 |
+
" description \\\n",
|
| 805 |
+
"0 A NOVEL THAT READERS and critics have been eag... \n",
|
| 806 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... \n",
|
| 807 |
+
"2 A memorable, mesmerizing heroine Jennifer -- b... \n",
|
| 808 |
+
"3 Lewis' work on the nature of love divides love... \n",
|
| 809 |
+
"4 \"In The Problem of Pain, C.S. Lewis, one of th... \n",
|
| 810 |
+
"\n",
|
| 811 |
+
" full_desc published_year \\\n",
|
| 812 |
+
"0 9780002005883 A NOVEL THAT READERS and critics... 2004.0 \n",
|
| 813 |
+
"1 9780002261982 A new 'Christie for Christmas' -... 2000.0 \n",
|
| 814 |
+
"2 9780006178736 A memorable, mesmerizing heroine... 1993.0 \n",
|
| 815 |
+
"3 9780006280897 Lewis' work on the nature of lov... 2002.0 \n",
|
| 816 |
+
"4 9780006280934 \"In The Problem of Pain, C.S. Le... 2002.0 \n",
|
| 817 |
+
"\n",
|
| 818 |
+
" num_pages average_rating ratings_count \\\n",
|
| 819 |
+
"0 247.0 3.85 361.0 \n",
|
| 820 |
+
"1 241.0 3.83 5164.0 \n",
|
| 821 |
+
"2 512.0 3.93 29532.0 \n",
|
| 822 |
+
"3 170.0 4.15 33684.0 \n",
|
| 823 |
+
"4 176.0 4.09 37569.0 \n",
|
| 824 |
+
"\n",
|
| 825 |
+
" thumbnail final_categories \n",
|
| 826 |
+
"0 http://books.google.com/books/content?id=KQZCP... Fiction \n",
|
| 827 |
+
"1 http://books.google.com/books/content?id=gA5GP... Fiction \n",
|
| 828 |
+
"2 http://books.google.com/books/content?id=FKo2T... Fiction \n",
|
| 829 |
+
"3 http://books.google.com/books/content?id=XhQ5X... Nonfiction \n",
|
| 830 |
+
"4 http://books.google.com/books/content?id=Kk-uV... Nonfiction "
|
| 831 |
+
]
|
| 832 |
+
},
|
| 833 |
+
"execution_count": 11,
|
| 834 |
+
"metadata": {},
|
| 835 |
+
"output_type": "execute_result"
|
| 836 |
+
}
|
| 837 |
+
],
|
| 838 |
+
"source": [
|
| 839 |
+
"books_with_cat['final_categories'] = np.where(\n",
|
| 840 |
+
" books_with_cat['predicted_categories'].isna(),\n",
|
| 841 |
+
" books_with_cat['simple_categories'],\n",
|
| 842 |
+
" books_with_cat['predicted_categories']\n",
|
| 843 |
+
")\n",
|
| 844 |
+
"books_with_cat = books_with_cat.drop(columns=['simple_categories', 'predicted_categories'])\n",
|
| 845 |
+
"\n",
|
| 846 |
+
"books_with_cat.to_csv('data/books_with_categories.csv', index=False)\n",
|
| 847 |
+
"books_with_cat.head()"
|
| 848 |
+
]
|
| 849 |
+
}
|
| 850 |
+
],
|
| 851 |
+
"metadata": {
|
| 852 |
+
"kernelspec": {
|
| 853 |
+
"display_name": "book_rcm",
|
| 854 |
+
"language": "python",
|
| 855 |
+
"name": "python3"
|
| 856 |
+
},
|
| 857 |
+
"language_info": {
|
| 858 |
+
"codemirror_mode": {
|
| 859 |
+
"name": "ipython",
|
| 860 |
+
"version": 3
|
| 861 |
+
},
|
| 862 |
+
"file_extension": ".py",
|
| 863 |
+
"mimetype": "text/x-python",
|
| 864 |
+
"name": "python",
|
| 865 |
+
"nbconvert_exporter": "python",
|
| 866 |
+
"pygments_lexer": "ipython3",
|
| 867 |
+
"version": "3.10.16"
|
| 868 |
+
}
|
| 869 |
+
},
|
| 870 |
+
"nbformat": 4,
|
| 871 |
+
"nbformat_minor": 2
|
| 872 |
+
}
|
step_04_Sentiment_Analysis.ipynb
ADDED
|
@@ -0,0 +1,671 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"data": {
|
| 10 |
+
"text/html": [
|
| 11 |
+
"<div>\n",
|
| 12 |
+
"<style scoped>\n",
|
| 13 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 14 |
+
" vertical-align: middle;\n",
|
| 15 |
+
" }\n",
|
| 16 |
+
"\n",
|
| 17 |
+
" .dataframe tbody tr th {\n",
|
| 18 |
+
" vertical-align: top;\n",
|
| 19 |
+
" }\n",
|
| 20 |
+
"\n",
|
| 21 |
+
" .dataframe thead th {\n",
|
| 22 |
+
" text-align: right;\n",
|
| 23 |
+
" }\n",
|
| 24 |
+
"</style>\n",
|
| 25 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 26 |
+
" <thead>\n",
|
| 27 |
+
" <tr style=\"text-align: right;\">\n",
|
| 28 |
+
" <th></th>\n",
|
| 29 |
+
" <th>isbn13</th>\n",
|
| 30 |
+
" <th>full_title</th>\n",
|
| 31 |
+
" <th>authors</th>\n",
|
| 32 |
+
" <th>categories</th>\n",
|
| 33 |
+
" <th>description</th>\n",
|
| 34 |
+
" <th>full_desc</th>\n",
|
| 35 |
+
" <th>published_year</th>\n",
|
| 36 |
+
" <th>num_pages</th>\n",
|
| 37 |
+
" <th>average_rating</th>\n",
|
| 38 |
+
" <th>ratings_count</th>\n",
|
| 39 |
+
" <th>thumbnail</th>\n",
|
| 40 |
+
" <th>final_categories</th>\n",
|
| 41 |
+
" </tr>\n",
|
| 42 |
+
" </thead>\n",
|
| 43 |
+
" <tbody>\n",
|
| 44 |
+
" <tr>\n",
|
| 45 |
+
" <th>0</th>\n",
|
| 46 |
+
" <td>9780002005883</td>\n",
|
| 47 |
+
" <td>Gilead</td>\n",
|
| 48 |
+
" <td>Marilynne Robinson</td>\n",
|
| 49 |
+
" <td>Fiction</td>\n",
|
| 50 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 51 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 52 |
+
" <td>2004.0</td>\n",
|
| 53 |
+
" <td>247.0</td>\n",
|
| 54 |
+
" <td>3.85</td>\n",
|
| 55 |
+
" <td>361.0</td>\n",
|
| 56 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 57 |
+
" <td>Fiction</td>\n",
|
| 58 |
+
" </tr>\n",
|
| 59 |
+
" <tr>\n",
|
| 60 |
+
" <th>1</th>\n",
|
| 61 |
+
" <td>9780002261982</td>\n",
|
| 62 |
+
" <td>Spider's Web: A Novel</td>\n",
|
| 63 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 64 |
+
" <td>Detective and mystery stories</td>\n",
|
| 65 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 66 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 67 |
+
" <td>2000.0</td>\n",
|
| 68 |
+
" <td>241.0</td>\n",
|
| 69 |
+
" <td>3.83</td>\n",
|
| 70 |
+
" <td>5164.0</td>\n",
|
| 71 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 72 |
+
" <td>Fiction</td>\n",
|
| 73 |
+
" </tr>\n",
|
| 74 |
+
" <tr>\n",
|
| 75 |
+
" <th>2</th>\n",
|
| 76 |
+
" <td>9780006178736</td>\n",
|
| 77 |
+
" <td>Rage of angels</td>\n",
|
| 78 |
+
" <td>Sidney Sheldon</td>\n",
|
| 79 |
+
" <td>Fiction</td>\n",
|
| 80 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 81 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 82 |
+
" <td>1993.0</td>\n",
|
| 83 |
+
" <td>512.0</td>\n",
|
| 84 |
+
" <td>3.93</td>\n",
|
| 85 |
+
" <td>29532.0</td>\n",
|
| 86 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 87 |
+
" <td>Fiction</td>\n",
|
| 88 |
+
" </tr>\n",
|
| 89 |
+
" <tr>\n",
|
| 90 |
+
" <th>3</th>\n",
|
| 91 |
+
" <td>9780006280897</td>\n",
|
| 92 |
+
" <td>The Four Loves</td>\n",
|
| 93 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 94 |
+
" <td>Christian life</td>\n",
|
| 95 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 96 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 97 |
+
" <td>2002.0</td>\n",
|
| 98 |
+
" <td>170.0</td>\n",
|
| 99 |
+
" <td>4.15</td>\n",
|
| 100 |
+
" <td>33684.0</td>\n",
|
| 101 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 102 |
+
" <td>Nonfiction</td>\n",
|
| 103 |
+
" </tr>\n",
|
| 104 |
+
" <tr>\n",
|
| 105 |
+
" <th>4</th>\n",
|
| 106 |
+
" <td>9780006280934</td>\n",
|
| 107 |
+
" <td>The Problem of Pain</td>\n",
|
| 108 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 109 |
+
" <td>Christian life</td>\n",
|
| 110 |
+
" <td>\"In The Problem of Pain, C.S. Lewis, one of th...</td>\n",
|
| 111 |
+
" <td>9780006280934 \"In The Problem of Pain, C.S. Le...</td>\n",
|
| 112 |
+
" <td>2002.0</td>\n",
|
| 113 |
+
" <td>176.0</td>\n",
|
| 114 |
+
" <td>4.09</td>\n",
|
| 115 |
+
" <td>37569.0</td>\n",
|
| 116 |
+
" <td>http://books.google.com/books/content?id=Kk-uV...</td>\n",
|
| 117 |
+
" <td>Nonfiction</td>\n",
|
| 118 |
+
" </tr>\n",
|
| 119 |
+
" </tbody>\n",
|
| 120 |
+
"</table>\n",
|
| 121 |
+
"</div>"
|
| 122 |
+
],
|
| 123 |
+
"text/plain": [
|
| 124 |
+
" isbn13 full_title authors \\\n",
|
| 125 |
+
"0 9780002005883 Gilead Marilynne Robinson \n",
|
| 126 |
+
"1 9780002261982 Spider's Web: A Novel Charles Osborne;Agatha Christie \n",
|
| 127 |
+
"2 9780006178736 Rage of angels Sidney Sheldon \n",
|
| 128 |
+
"3 9780006280897 The Four Loves Clive Staples Lewis \n",
|
| 129 |
+
"4 9780006280934 The Problem of Pain Clive Staples Lewis \n",
|
| 130 |
+
"\n",
|
| 131 |
+
" categories \\\n",
|
| 132 |
+
"0 Fiction \n",
|
| 133 |
+
"1 Detective and mystery stories \n",
|
| 134 |
+
"2 Fiction \n",
|
| 135 |
+
"3 Christian life \n",
|
| 136 |
+
"4 Christian life \n",
|
| 137 |
+
"\n",
|
| 138 |
+
" description \\\n",
|
| 139 |
+
"0 A NOVEL THAT READERS and critics have been eag... \n",
|
| 140 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... \n",
|
| 141 |
+
"2 A memorable, mesmerizing heroine Jennifer -- b... \n",
|
| 142 |
+
"3 Lewis' work on the nature of love divides love... \n",
|
| 143 |
+
"4 \"In The Problem of Pain, C.S. Lewis, one of th... \n",
|
| 144 |
+
"\n",
|
| 145 |
+
" full_desc published_year \\\n",
|
| 146 |
+
"0 9780002005883 A NOVEL THAT READERS and critics... 2004.0 \n",
|
| 147 |
+
"1 9780002261982 A new 'Christie for Christmas' -... 2000.0 \n",
|
| 148 |
+
"2 9780006178736 A memorable, mesmerizing heroine... 1993.0 \n",
|
| 149 |
+
"3 9780006280897 Lewis' work on the nature of lov... 2002.0 \n",
|
| 150 |
+
"4 9780006280934 \"In The Problem of Pain, C.S. Le... 2002.0 \n",
|
| 151 |
+
"\n",
|
| 152 |
+
" num_pages average_rating ratings_count \\\n",
|
| 153 |
+
"0 247.0 3.85 361.0 \n",
|
| 154 |
+
"1 241.0 3.83 5164.0 \n",
|
| 155 |
+
"2 512.0 3.93 29532.0 \n",
|
| 156 |
+
"3 170.0 4.15 33684.0 \n",
|
| 157 |
+
"4 176.0 4.09 37569.0 \n",
|
| 158 |
+
"\n",
|
| 159 |
+
" thumbnail final_categories \n",
|
| 160 |
+
"0 http://books.google.com/books/content?id=KQZCP... Fiction \n",
|
| 161 |
+
"1 http://books.google.com/books/content?id=gA5GP... Fiction \n",
|
| 162 |
+
"2 http://books.google.com/books/content?id=FKo2T... Fiction \n",
|
| 163 |
+
"3 http://books.google.com/books/content?id=XhQ5X... Nonfiction \n",
|
| 164 |
+
"4 http://books.google.com/books/content?id=Kk-uV... Nonfiction "
|
| 165 |
+
]
|
| 166 |
+
},
|
| 167 |
+
"execution_count": 1,
|
| 168 |
+
"metadata": {},
|
| 169 |
+
"output_type": "execute_result"
|
| 170 |
+
}
|
| 171 |
+
],
|
| 172 |
+
"source": [
|
| 173 |
+
"import pandas as pd\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"books = pd.read_csv('data/books_with_categories.csv')\n",
|
| 176 |
+
"books.head()"
|
| 177 |
+
]
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"cell_type": "code",
|
| 181 |
+
"execution_count": 2,
|
| 182 |
+
"metadata": {},
|
| 183 |
+
"outputs": [
|
| 184 |
+
{
|
| 185 |
+
"name": "stderr",
|
| 186 |
+
"output_type": "stream",
|
| 187 |
+
"text": [
|
| 188 |
+
"Device set to use cuda:0\n"
|
| 189 |
+
]
|
| 190 |
+
}
|
| 191 |
+
],
|
| 192 |
+
"source": [
|
| 193 |
+
"import torch\n",
|
| 194 |
+
"from transformers import pipeline\n",
|
| 195 |
+
"\n",
|
| 196 |
+
"classifier = pipeline('text-classification', model='j-hartmann/emotion-english-distilroberta-base',\n",
|
| 197 |
+
" top_k=None,\n",
|
| 198 |
+
" device=0 if torch.cuda.is_available() else -1)"
|
| 199 |
+
]
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"cell_type": "code",
|
| 203 |
+
"execution_count": 3,
|
| 204 |
+
"metadata": {},
|
| 205 |
+
"outputs": [
|
| 206 |
+
{
|
| 207 |
+
"name": "stdout",
|
| 208 |
+
"output_type": "stream",
|
| 209 |
+
"text": [
|
| 210 |
+
"A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives\n",
|
| 211 |
+
"{'label': 'surprise', 'score': 0.7296027541160583}\n",
|
| 212 |
+
"--------------------------------\n",
|
| 213 |
+
"\n",
|
| 214 |
+
"John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers\n",
|
| 215 |
+
"{'label': 'neutral', 'score': 0.4662497639656067}\n",
|
| 216 |
+
"--------------------------------\n",
|
| 217 |
+
"\n",
|
| 218 |
+
"It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up\n",
|
| 219 |
+
"{'label': 'neutral', 'score': 0.6978469491004944}\n",
|
| 220 |
+
"--------------------------------\n",
|
| 221 |
+
"\n",
|
| 222 |
+
"Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist\n",
|
| 223 |
+
"{'label': 'fear', 'score': 0.9839729070663452}\n",
|
| 224 |
+
"--------------------------------\n",
|
| 225 |
+
"\n",
|
| 226 |
+
"He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption\n",
|
| 227 |
+
"{'label': 'sadness', 'score': 0.9560651183128357}\n",
|
| 228 |
+
"--------------------------------\n",
|
| 229 |
+
"\n",
|
| 230 |
+
"Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world has to offer\n",
|
| 231 |
+
"{'label': 'joy', 'score': 0.9490270018577576}\n",
|
| 232 |
+
"--------------------------------\n",
|
| 233 |
+
"\n",
|
| 234 |
+
"At its heart is a tale of the sacred bonds between fathers and sons, pitch-perfect in style and story, set to dazzle critics and readers alike\n",
|
| 235 |
+
"{'label': 'joy', 'score': 0.6701961159706116}\n",
|
| 236 |
+
"--------------------------------\n",
|
| 237 |
+
"\n"
|
| 238 |
+
]
|
| 239 |
+
}
|
| 240 |
+
],
|
| 241 |
+
"source": [
|
| 242 |
+
"sentences = [s.strip() for s in books['description'][0].split('.') if s.strip()]\n",
|
| 243 |
+
"emotions = classifier(sentences)\n",
|
| 244 |
+
"\n",
|
| 245 |
+
"for sentence, emotion in zip(sentences, emotions):\n",
|
| 246 |
+
" print(sentence)\n",
|
| 247 |
+
" print(emotion[0])\n",
|
| 248 |
+
" print('--------------------------------\\n')"
|
| 249 |
+
]
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"cell_type": "code",
|
| 253 |
+
"execution_count": 4,
|
| 254 |
+
"metadata": {},
|
| 255 |
+
"outputs": [],
|
| 256 |
+
"source": [
|
| 257 |
+
"import numpy as np\n",
|
| 258 |
+
"\n",
|
| 259 |
+
"emotion_labels = sorted(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'])\n",
|
| 260 |
+
"\n",
|
| 261 |
+
"def calculate_book_emotion_scores(emotions):\n",
|
| 262 |
+
" sentence_emotion_scores = {label: [] for label in emotion_labels}\n",
|
| 263 |
+
" \n",
|
| 264 |
+
" for emotion in emotions:\n",
|
| 265 |
+
" emotion = sorted(emotion, key=lambda x: x['label'])\n",
|
| 266 |
+
" \n",
|
| 267 |
+
" for idx, label in enumerate(emotion_labels):\n",
|
| 268 |
+
" sentence_emotion_scores[label].append(emotion[idx]['score'])\n",
|
| 269 |
+
" \n",
|
| 270 |
+
" return {label: np.max(scores) for label, scores in sentence_emotion_scores.items()} # Note: Try to use both max and mean later"
|
| 271 |
+
]
|
| 272 |
+
},
|
| 273 |
+
{
|
| 274 |
+
"cell_type": "code",
|
| 275 |
+
"execution_count": 6,
|
| 276 |
+
"metadata": {},
|
| 277 |
+
"outputs": [
|
| 278 |
+
{
|
| 279 |
+
"name": "stderr",
|
| 280 |
+
"output_type": "stream",
|
| 281 |
+
"text": [
|
| 282 |
+
"100%|██████████| 5197/5197 [02:51<00:00, 30.27it/s]\n"
|
| 283 |
+
]
|
| 284 |
+
}
|
| 285 |
+
],
|
| 286 |
+
"source": [
|
| 287 |
+
"from tqdm import tqdm\n",
|
| 288 |
+
"\n",
|
| 289 |
+
"isbns = []\n",
|
| 290 |
+
"emotion_scores = {label: [] for label in emotion_labels}\n",
|
| 291 |
+
"\n",
|
| 292 |
+
"for i in tqdm(range(len(books))):\n",
|
| 293 |
+
" isbns.append(books['isbn13'][i])\n",
|
| 294 |
+
" sentences = books['description'][i].split('.')\n",
|
| 295 |
+
" emotions = classifier(sentences)\n",
|
| 296 |
+
" max_emotion_scores = calculate_book_emotion_scores(emotions)\n",
|
| 297 |
+
" for label in emotion_labels:\n",
|
| 298 |
+
" emotion_scores[label].append(max_emotion_scores[label])"
|
| 299 |
+
]
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"cell_type": "code",
|
| 303 |
+
"execution_count": 7,
|
| 304 |
+
"metadata": {},
|
| 305 |
+
"outputs": [
|
| 306 |
+
{
|
| 307 |
+
"data": {
|
| 308 |
+
"text/html": [
|
| 309 |
+
"<div>\n",
|
| 310 |
+
"<style scoped>\n",
|
| 311 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 312 |
+
" vertical-align: middle;\n",
|
| 313 |
+
" }\n",
|
| 314 |
+
"\n",
|
| 315 |
+
" .dataframe tbody tr th {\n",
|
| 316 |
+
" vertical-align: top;\n",
|
| 317 |
+
" }\n",
|
| 318 |
+
"\n",
|
| 319 |
+
" .dataframe thead th {\n",
|
| 320 |
+
" text-align: right;\n",
|
| 321 |
+
" }\n",
|
| 322 |
+
"</style>\n",
|
| 323 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 324 |
+
" <thead>\n",
|
| 325 |
+
" <tr style=\"text-align: right;\">\n",
|
| 326 |
+
" <th></th>\n",
|
| 327 |
+
" <th>anger</th>\n",
|
| 328 |
+
" <th>disgust</th>\n",
|
| 329 |
+
" <th>fear</th>\n",
|
| 330 |
+
" <th>joy</th>\n",
|
| 331 |
+
" <th>neutral</th>\n",
|
| 332 |
+
" <th>sadness</th>\n",
|
| 333 |
+
" <th>surprise</th>\n",
|
| 334 |
+
" <th>isbn13</th>\n",
|
| 335 |
+
" </tr>\n",
|
| 336 |
+
" </thead>\n",
|
| 337 |
+
" <tbody>\n",
|
| 338 |
+
" <tr>\n",
|
| 339 |
+
" <th>0</th>\n",
|
| 340 |
+
" <td>0.064134</td>\n",
|
| 341 |
+
" <td>0.273591</td>\n",
|
| 342 |
+
" <td>0.928168</td>\n",
|
| 343 |
+
" <td>0.932797</td>\n",
|
| 344 |
+
" <td>0.646217</td>\n",
|
| 345 |
+
" <td>0.967158</td>\n",
|
| 346 |
+
" <td>0.729603</td>\n",
|
| 347 |
+
" <td>9780002005883</td>\n",
|
| 348 |
+
" </tr>\n",
|
| 349 |
+
" <tr>\n",
|
| 350 |
+
" <th>1</th>\n",
|
| 351 |
+
" <td>0.612619</td>\n",
|
| 352 |
+
" <td>0.348284</td>\n",
|
| 353 |
+
" <td>0.942528</td>\n",
|
| 354 |
+
" <td>0.704421</td>\n",
|
| 355 |
+
" <td>0.887940</td>\n",
|
| 356 |
+
" <td>0.111690</td>\n",
|
| 357 |
+
" <td>0.252545</td>\n",
|
| 358 |
+
" <td>9780002261982</td>\n",
|
| 359 |
+
" </tr>\n",
|
| 360 |
+
" <tr>\n",
|
| 361 |
+
" <th>2</th>\n",
|
| 362 |
+
" <td>0.064134</td>\n",
|
| 363 |
+
" <td>0.104007</td>\n",
|
| 364 |
+
" <td>0.972321</td>\n",
|
| 365 |
+
" <td>0.767237</td>\n",
|
| 366 |
+
" <td>0.549477</td>\n",
|
| 367 |
+
" <td>0.111690</td>\n",
|
| 368 |
+
" <td>0.078766</td>\n",
|
| 369 |
+
" <td>9780006178736</td>\n",
|
| 370 |
+
" </tr>\n",
|
| 371 |
+
" <tr>\n",
|
| 372 |
+
" <th>3</th>\n",
|
| 373 |
+
" <td>0.351483</td>\n",
|
| 374 |
+
" <td>0.150722</td>\n",
|
| 375 |
+
" <td>0.360707</td>\n",
|
| 376 |
+
" <td>0.251881</td>\n",
|
| 377 |
+
" <td>0.732686</td>\n",
|
| 378 |
+
" <td>0.111690</td>\n",
|
| 379 |
+
" <td>0.078766</td>\n",
|
| 380 |
+
" <td>9780006280897</td>\n",
|
| 381 |
+
" </tr>\n",
|
| 382 |
+
" <tr>\n",
|
| 383 |
+
" <th>4</th>\n",
|
| 384 |
+
" <td>0.081412</td>\n",
|
| 385 |
+
" <td>0.184495</td>\n",
|
| 386 |
+
" <td>0.095043</td>\n",
|
| 387 |
+
" <td>0.040564</td>\n",
|
| 388 |
+
" <td>0.884390</td>\n",
|
| 389 |
+
" <td>0.475881</td>\n",
|
| 390 |
+
" <td>0.078766</td>\n",
|
| 391 |
+
" <td>9780006280934</td>\n",
|
| 392 |
+
" </tr>\n",
|
| 393 |
+
" </tbody>\n",
|
| 394 |
+
"</table>\n",
|
| 395 |
+
"</div>"
|
| 396 |
+
],
|
| 397 |
+
"text/plain": [
|
| 398 |
+
" anger disgust fear joy neutral sadness surprise \\\n",
|
| 399 |
+
"0 0.064134 0.273591 0.928168 0.932797 0.646217 0.967158 0.729603 \n",
|
| 400 |
+
"1 0.612619 0.348284 0.942528 0.704421 0.887940 0.111690 0.252545 \n",
|
| 401 |
+
"2 0.064134 0.104007 0.972321 0.767237 0.549477 0.111690 0.078766 \n",
|
| 402 |
+
"3 0.351483 0.150722 0.360707 0.251881 0.732686 0.111690 0.078766 \n",
|
| 403 |
+
"4 0.081412 0.184495 0.095043 0.040564 0.884390 0.475881 0.078766 \n",
|
| 404 |
+
"\n",
|
| 405 |
+
" isbn13 \n",
|
| 406 |
+
"0 9780002005883 \n",
|
| 407 |
+
"1 9780002261982 \n",
|
| 408 |
+
"2 9780006178736 \n",
|
| 409 |
+
"3 9780006280897 \n",
|
| 410 |
+
"4 9780006280934 "
|
| 411 |
+
]
|
| 412 |
+
},
|
| 413 |
+
"execution_count": 7,
|
| 414 |
+
"metadata": {},
|
| 415 |
+
"output_type": "execute_result"
|
| 416 |
+
}
|
| 417 |
+
],
|
| 418 |
+
"source": [
|
| 419 |
+
"emotions_df = pd.DataFrame(emotion_scores)\n",
|
| 420 |
+
"emotions_df['isbn13'] = isbns\n",
|
| 421 |
+
"emotions_df.head()"
|
| 422 |
+
]
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"cell_type": "code",
|
| 426 |
+
"execution_count": 8,
|
| 427 |
+
"metadata": {},
|
| 428 |
+
"outputs": [
|
| 429 |
+
{
|
| 430 |
+
"data": {
|
| 431 |
+
"text/html": [
|
| 432 |
+
"<div>\n",
|
| 433 |
+
"<style scoped>\n",
|
| 434 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 435 |
+
" vertical-align: middle;\n",
|
| 436 |
+
" }\n",
|
| 437 |
+
"\n",
|
| 438 |
+
" .dataframe tbody tr th {\n",
|
| 439 |
+
" vertical-align: top;\n",
|
| 440 |
+
" }\n",
|
| 441 |
+
"\n",
|
| 442 |
+
" .dataframe thead th {\n",
|
| 443 |
+
" text-align: right;\n",
|
| 444 |
+
" }\n",
|
| 445 |
+
"</style>\n",
|
| 446 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 447 |
+
" <thead>\n",
|
| 448 |
+
" <tr style=\"text-align: right;\">\n",
|
| 449 |
+
" <th></th>\n",
|
| 450 |
+
" <th>isbn13</th>\n",
|
| 451 |
+
" <th>full_title</th>\n",
|
| 452 |
+
" <th>authors</th>\n",
|
| 453 |
+
" <th>categories</th>\n",
|
| 454 |
+
" <th>description</th>\n",
|
| 455 |
+
" <th>full_desc</th>\n",
|
| 456 |
+
" <th>published_year</th>\n",
|
| 457 |
+
" <th>num_pages</th>\n",
|
| 458 |
+
" <th>average_rating</th>\n",
|
| 459 |
+
" <th>ratings_count</th>\n",
|
| 460 |
+
" <th>thumbnail</th>\n",
|
| 461 |
+
" <th>final_categories</th>\n",
|
| 462 |
+
" <th>anger</th>\n",
|
| 463 |
+
" <th>disgust</th>\n",
|
| 464 |
+
" <th>fear</th>\n",
|
| 465 |
+
" <th>joy</th>\n",
|
| 466 |
+
" <th>neutral</th>\n",
|
| 467 |
+
" <th>sadness</th>\n",
|
| 468 |
+
" <th>surprise</th>\n",
|
| 469 |
+
" </tr>\n",
|
| 470 |
+
" </thead>\n",
|
| 471 |
+
" <tbody>\n",
|
| 472 |
+
" <tr>\n",
|
| 473 |
+
" <th>0</th>\n",
|
| 474 |
+
" <td>9780002005883</td>\n",
|
| 475 |
+
" <td>Gilead</td>\n",
|
| 476 |
+
" <td>Marilynne Robinson</td>\n",
|
| 477 |
+
" <td>Fiction</td>\n",
|
| 478 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 479 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 480 |
+
" <td>2004.0</td>\n",
|
| 481 |
+
" <td>247.0</td>\n",
|
| 482 |
+
" <td>3.85</td>\n",
|
| 483 |
+
" <td>361.0</td>\n",
|
| 484 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 485 |
+
" <td>Fiction</td>\n",
|
| 486 |
+
" <td>0.064134</td>\n",
|
| 487 |
+
" <td>0.273591</td>\n",
|
| 488 |
+
" <td>0.928168</td>\n",
|
| 489 |
+
" <td>0.932797</td>\n",
|
| 490 |
+
" <td>0.646217</td>\n",
|
| 491 |
+
" <td>0.967158</td>\n",
|
| 492 |
+
" <td>0.729603</td>\n",
|
| 493 |
+
" </tr>\n",
|
| 494 |
+
" <tr>\n",
|
| 495 |
+
" <th>1</th>\n",
|
| 496 |
+
" <td>9780002261982</td>\n",
|
| 497 |
+
" <td>Spider's Web: A Novel</td>\n",
|
| 498 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 499 |
+
" <td>Detective and mystery stories</td>\n",
|
| 500 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 501 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 502 |
+
" <td>2000.0</td>\n",
|
| 503 |
+
" <td>241.0</td>\n",
|
| 504 |
+
" <td>3.83</td>\n",
|
| 505 |
+
" <td>5164.0</td>\n",
|
| 506 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 507 |
+
" <td>Fiction</td>\n",
|
| 508 |
+
" <td>0.612619</td>\n",
|
| 509 |
+
" <td>0.348284</td>\n",
|
| 510 |
+
" <td>0.942528</td>\n",
|
| 511 |
+
" <td>0.704421</td>\n",
|
| 512 |
+
" <td>0.887940</td>\n",
|
| 513 |
+
" <td>0.111690</td>\n",
|
| 514 |
+
" <td>0.252545</td>\n",
|
| 515 |
+
" </tr>\n",
|
| 516 |
+
" <tr>\n",
|
| 517 |
+
" <th>2</th>\n",
|
| 518 |
+
" <td>9780006178736</td>\n",
|
| 519 |
+
" <td>Rage of angels</td>\n",
|
| 520 |
+
" <td>Sidney Sheldon</td>\n",
|
| 521 |
+
" <td>Fiction</td>\n",
|
| 522 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 523 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 524 |
+
" <td>1993.0</td>\n",
|
| 525 |
+
" <td>512.0</td>\n",
|
| 526 |
+
" <td>3.93</td>\n",
|
| 527 |
+
" <td>29532.0</td>\n",
|
| 528 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 529 |
+
" <td>Fiction</td>\n",
|
| 530 |
+
" <td>0.064134</td>\n",
|
| 531 |
+
" <td>0.104007</td>\n",
|
| 532 |
+
" <td>0.972321</td>\n",
|
| 533 |
+
" <td>0.767237</td>\n",
|
| 534 |
+
" <td>0.549477</td>\n",
|
| 535 |
+
" <td>0.111690</td>\n",
|
| 536 |
+
" <td>0.078766</td>\n",
|
| 537 |
+
" </tr>\n",
|
| 538 |
+
" <tr>\n",
|
| 539 |
+
" <th>3</th>\n",
|
| 540 |
+
" <td>9780006280897</td>\n",
|
| 541 |
+
" <td>The Four Loves</td>\n",
|
| 542 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 543 |
+
" <td>Christian life</td>\n",
|
| 544 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 545 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 546 |
+
" <td>2002.0</td>\n",
|
| 547 |
+
" <td>170.0</td>\n",
|
| 548 |
+
" <td>4.15</td>\n",
|
| 549 |
+
" <td>33684.0</td>\n",
|
| 550 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 551 |
+
" <td>Nonfiction</td>\n",
|
| 552 |
+
" <td>0.351483</td>\n",
|
| 553 |
+
" <td>0.150722</td>\n",
|
| 554 |
+
" <td>0.360707</td>\n",
|
| 555 |
+
" <td>0.251881</td>\n",
|
| 556 |
+
" <td>0.732686</td>\n",
|
| 557 |
+
" <td>0.111690</td>\n",
|
| 558 |
+
" <td>0.078766</td>\n",
|
| 559 |
+
" </tr>\n",
|
| 560 |
+
" <tr>\n",
|
| 561 |
+
" <th>4</th>\n",
|
| 562 |
+
" <td>9780006280934</td>\n",
|
| 563 |
+
" <td>The Problem of Pain</td>\n",
|
| 564 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 565 |
+
" <td>Christian life</td>\n",
|
| 566 |
+
" <td>\"In The Problem of Pain, C.S. Lewis, one of th...</td>\n",
|
| 567 |
+
" <td>9780006280934 \"In The Problem of Pain, C.S. Le...</td>\n",
|
| 568 |
+
" <td>2002.0</td>\n",
|
| 569 |
+
" <td>176.0</td>\n",
|
| 570 |
+
" <td>4.09</td>\n",
|
| 571 |
+
" <td>37569.0</td>\n",
|
| 572 |
+
" <td>http://books.google.com/books/content?id=Kk-uV...</td>\n",
|
| 573 |
+
" <td>Nonfiction</td>\n",
|
| 574 |
+
" <td>0.081412</td>\n",
|
| 575 |
+
" <td>0.184495</td>\n",
|
| 576 |
+
" <td>0.095043</td>\n",
|
| 577 |
+
" <td>0.040564</td>\n",
|
| 578 |
+
" <td>0.884390</td>\n",
|
| 579 |
+
" <td>0.475881</td>\n",
|
| 580 |
+
" <td>0.078766</td>\n",
|
| 581 |
+
" </tr>\n",
|
| 582 |
+
" </tbody>\n",
|
| 583 |
+
"</table>\n",
|
| 584 |
+
"</div>"
|
| 585 |
+
],
|
| 586 |
+
"text/plain": [
|
| 587 |
+
" isbn13 full_title authors \\\n",
|
| 588 |
+
"0 9780002005883 Gilead Marilynne Robinson \n",
|
| 589 |
+
"1 9780002261982 Spider's Web: A Novel Charles Osborne;Agatha Christie \n",
|
| 590 |
+
"2 9780006178736 Rage of angels Sidney Sheldon \n",
|
| 591 |
+
"3 9780006280897 The Four Loves Clive Staples Lewis \n",
|
| 592 |
+
"4 9780006280934 The Problem of Pain Clive Staples Lewis \n",
|
| 593 |
+
"\n",
|
| 594 |
+
" categories \\\n",
|
| 595 |
+
"0 Fiction \n",
|
| 596 |
+
"1 Detective and mystery stories \n",
|
| 597 |
+
"2 Fiction \n",
|
| 598 |
+
"3 Christian life \n",
|
| 599 |
+
"4 Christian life \n",
|
| 600 |
+
"\n",
|
| 601 |
+
" description \\\n",
|
| 602 |
+
"0 A NOVEL THAT READERS and critics have been eag... \n",
|
| 603 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... \n",
|
| 604 |
+
"2 A memorable, mesmerizing heroine Jennifer -- b... \n",
|
| 605 |
+
"3 Lewis' work on the nature of love divides love... \n",
|
| 606 |
+
"4 \"In The Problem of Pain, C.S. Lewis, one of th... \n",
|
| 607 |
+
"\n",
|
| 608 |
+
" full_desc published_year \\\n",
|
| 609 |
+
"0 9780002005883 A NOVEL THAT READERS and critics... 2004.0 \n",
|
| 610 |
+
"1 9780002261982 A new 'Christie for Christmas' -... 2000.0 \n",
|
| 611 |
+
"2 9780006178736 A memorable, mesmerizing heroine... 1993.0 \n",
|
| 612 |
+
"3 9780006280897 Lewis' work on the nature of lov... 2002.0 \n",
|
| 613 |
+
"4 9780006280934 \"In The Problem of Pain, C.S. Le... 2002.0 \n",
|
| 614 |
+
"\n",
|
| 615 |
+
" num_pages average_rating ratings_count \\\n",
|
| 616 |
+
"0 247.0 3.85 361.0 \n",
|
| 617 |
+
"1 241.0 3.83 5164.0 \n",
|
| 618 |
+
"2 512.0 3.93 29532.0 \n",
|
| 619 |
+
"3 170.0 4.15 33684.0 \n",
|
| 620 |
+
"4 176.0 4.09 37569.0 \n",
|
| 621 |
+
"\n",
|
| 622 |
+
" thumbnail final_categories \\\n",
|
| 623 |
+
"0 http://books.google.com/books/content?id=KQZCP... Fiction \n",
|
| 624 |
+
"1 http://books.google.com/books/content?id=gA5GP... Fiction \n",
|
| 625 |
+
"2 http://books.google.com/books/content?id=FKo2T... Fiction \n",
|
| 626 |
+
"3 http://books.google.com/books/content?id=XhQ5X... Nonfiction \n",
|
| 627 |
+
"4 http://books.google.com/books/content?id=Kk-uV... Nonfiction \n",
|
| 628 |
+
"\n",
|
| 629 |
+
" anger disgust fear joy neutral sadness surprise \n",
|
| 630 |
+
"0 0.064134 0.273591 0.928168 0.932797 0.646217 0.967158 0.729603 \n",
|
| 631 |
+
"1 0.612619 0.348284 0.942528 0.704421 0.887940 0.111690 0.252545 \n",
|
| 632 |
+
"2 0.064134 0.104007 0.972321 0.767237 0.549477 0.111690 0.078766 \n",
|
| 633 |
+
"3 0.351483 0.150722 0.360707 0.251881 0.732686 0.111690 0.078766 \n",
|
| 634 |
+
"4 0.081412 0.184495 0.095043 0.040564 0.884390 0.475881 0.078766 "
|
| 635 |
+
]
|
| 636 |
+
},
|
| 637 |
+
"execution_count": 8,
|
| 638 |
+
"metadata": {},
|
| 639 |
+
"output_type": "execute_result"
|
| 640 |
+
}
|
| 641 |
+
],
|
| 642 |
+
"source": [
|
| 643 |
+
"books_with_emotions = pd.merge(books, emotions_df, on='isbn13')\n",
|
| 644 |
+
"\n",
|
| 645 |
+
"books_with_emotions.to_csv('data/books_with_emotions.csv', index=False)\n",
|
| 646 |
+
"books_with_emotions.head()"
|
| 647 |
+
]
|
| 648 |
+
}
|
| 649 |
+
],
|
| 650 |
+
"metadata": {
|
| 651 |
+
"kernelspec": {
|
| 652 |
+
"display_name": "book_rcm",
|
| 653 |
+
"language": "python",
|
| 654 |
+
"name": "python3"
|
| 655 |
+
},
|
| 656 |
+
"language_info": {
|
| 657 |
+
"codemirror_mode": {
|
| 658 |
+
"name": "ipython",
|
| 659 |
+
"version": 3
|
| 660 |
+
},
|
| 661 |
+
"file_extension": ".py",
|
| 662 |
+
"mimetype": "text/x-python",
|
| 663 |
+
"name": "python",
|
| 664 |
+
"nbconvert_exporter": "python",
|
| 665 |
+
"pygments_lexer": "ipython3",
|
| 666 |
+
"version": "3.10.16"
|
| 667 |
+
}
|
| 668 |
+
},
|
| 669 |
+
"nbformat": 4,
|
| 670 |
+
"nbformat_minor": 2
|
| 671 |
+
}
|
step_05_Gradio_Dashboard.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
from langchain_community.document_loaders import TextLoader
|
| 7 |
+
from langchain_text_splitters import CharacterTextSplitter
|
| 8 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 9 |
+
from langchain_chroma import Chroma
|
| 10 |
+
|
| 11 |
+
books = pd.read_csv('data/books_with_emotions.csv')
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Book Thumbnail
# Append the '&fife=w800' query parameter to every cover URL (presumably this
# requests a wider rendition -- confirm against the image host). Rows that have
# no thumbnail stay NaN after the concatenation and are pointed at the local
# placeholder image instead.
resized_covers = books['thumbnail'] + '&fife=w800'
books['large_thumbnail'] = resized_covers.where(
    resized_covers.notna(),
    'cover-not-found.jpg',
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Create Vector Database
# The description file is split on newlines so that every line becomes its own
# document (retrieval() later parses the leading token of each chunk as the
# book's ISBN-13).
raw_docs = TextLoader('./data/full_desc.txt', encoding='utf-8').load()

# chunk_size=1 forces one chunk per line exactly like the previous chunk_size=0
# did (every line is longer than one character, so nothing is ever merged), but
# it stays valid on langchain-text-splitters releases that reject a
# non-positive chunk size -- NOTE(review): confirm against the pinned version.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator='\n')
docs = text_splitter.split_documents(raw_docs)

# Embed on the GPU when one is available, otherwise fall back to the CPU.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': embedding_device},
)
database = Chroma.from_documents(docs, embeddings)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Retrieval
|
| 36 |
+
# Dashboard tone label -> emotion-score column used to rank the results.
_TONE_TO_EMOTION = {
    'Happy': 'joy',
    'Surprising': 'surprise',
    'Angry': 'anger',
    'Suspenseful': 'fear',
    'Sad': 'sadness',
}


def retrieval(query: str, category: str = None, tone: str = None, init_top_k: int = 80, final_top_k: int = 16) -> pd.DataFrame:
    """Return the books that best match *query*, optionally filtered and re-ranked.

    Args:
        query: Free-text description passed to the vector store.
        category: Value of ``final_categories`` to keep. ``None`` or ``'All'``
            disables the filter.
        tone: One of the keys of ``_TONE_TO_EMOTION``. Any other value
            (including ``None`` and ``'All'``) keeps the similarity order.
        init_top_k: Number of candidates requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        A new DataFrame with at most ``final_top_k`` rows of ``books``. Rows are
        ordered by similarity to the query, or by the selected emotion score
        (descending) when a known tone is given.
    """
    recs = database.similarity_search(query, k=init_top_k)

    # The leading token of every chunk is parsed as the book's ISBN-13. Record
    # the position of its first occurrence so the similarity ranking survives
    # the lookup in `books`.
    rank_by_isbn = {}
    for rec in recs:
        isbn = int(rec.page_content.strip('"').split()[0])
        rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    matches = books[books['isbn13'].isin(list(rank_by_isbn))]

    # `isin` yields rows in the order of `books`; restore the similarity order
    # so that `head()` below keeps the closest matches, not the first CSV rows.
    similarity_rank = matches['isbn13'].map(rank_by_isbn).to_numpy()
    matches = matches.iloc[similarity_rank.argsort(kind='stable')]

    # The default `category=None` must behave like 'All' rather than filter
    # every row out.
    if category is not None and category != 'All':
        matches = matches[matches['final_categories'] == category]
    matches = matches.head(final_top_k)

    emotion_column = _TONE_TO_EMOTION.get(tone)
    if emotion_column is not None:
        # Non-inplace sort: `matches` is derived from `books`, so sorting it in
        # place would trigger pandas' SettingWithCopy warning.
        matches = matches.sort_values(by=emotion_column, ascending=False)

    return matches
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Recommendation
|
| 61 |
+
def recommend(query: str, category: str, tone: str):
    """Build the gallery entries for the books recommended for *query*.

    Args:
        query: Free-text description of the desired book.
        category: Category filter forwarded to ``retrieval``.
        tone: Emotional tone forwarded to ``retrieval``.

    Returns:
        A list of ``[thumbnail, caption]`` pairs, one per recommended book, in
        the order returned by ``retrieval``.
    """
    recs = retrieval(query, category, tone)
    results = []

    for _, row in recs.iterrows():
        # Preview: the first 30 whitespace-separated words of the description.
        description = ' '.join(row['description'].split()[:30]) + '...'

        # An empty `authors` cell is read back from the CSV as a float NaN,
        # which has no `.split`; treat anything that is not a string as
        # "no author recorded" instead of crashing the whole request.
        raw_authors = row['authors']
        authors = raw_authors.split(';') if isinstance(raw_authors, str) else []

        if len(authors) >= 2:
            # "A and B" for two authors, "A, B and C" for three or more.
            authors_str = ', '.join(authors[:-1]) + ' and ' + authors[-1]
        elif authors:
            authors_str = authors[0]
        else:
            authors_str = 'Unknown author'

        caption = f"{row['full_title']} by {authors_str}: {description}"

        results.append([
            row['large_thumbnail'],
            caption,
        ])
    return results
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Dashboard
# Dropdown choices: 'All' (no filter) followed by the available options.
categories = ['All', *sorted(books['final_categories'].unique())]
tones = ['All', 'Happy', 'Surprising', 'Angry', 'Suspenseful', 'Sad']

with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown('# Semantics Book Recommendation System')

    # Input row: query text, the two filters and the submit button.
    with gr.Row():
        user_query = gr.Textbox(
            label='Please enter the description of the book you want to read',
            placeholder='e.g. A story about a boy who ...',
        )
        category = gr.Dropdown(choices=categories, label='Select a category', value='All')
        tone = gr.Dropdown(choices=tones, label='Select an emotional tone', value='All')
        btn = gr.Button('Find books')

    gr.Markdown('## Recommendations')
    output = gr.Gallery(label='Recommended Books', columns=8, rows=2)

    # Clicking the button runs recommend() and shows its result in the gallery.
    btn.click(fn=recommend, inputs=[user_query, category, tone], outputs=output)


if __name__ == '__main__':
    dashboard.launch(share=True)
|