Spaces:
Sleeping
Sleeping
github-actions[bot] committed on
Commit ·
267ad08
1
Parent(s): 49bc1b1
Auto-deploy from GitHub Actions
Browse files- app.py +1 -1
- space_repo/requirements.txt +2 -2
- space_repo/space_repo/space_repo/.gitignore +207 -0
- space_repo/space_repo/space_repo/EDA/EDA.ipynb +0 -0
- space_repo/space_repo/space_repo/LICENSE +21 -0
- space_repo/space_repo/space_repo/README.md +27 -0
- space_repo/space_repo/space_repo/app.py +125 -0
- space_repo/space_repo/space_repo/data/book_cleaned.csv +3 -0
- space_repo/space_repo/space_repo/data/book_with_categories.csv +3 -0
- space_repo/space_repo/space_repo/data/books.csv +3 -0
- space_repo/space_repo/space_repo/data/books_with_emotions.csv +3 -0
- space_repo/space_repo/space_repo/data/recommendations.csv +3 -0
- space_repo/space_repo/space_repo/data/tagged_description.txt +3 -0
- space_repo/space_repo/space_repo/requirements.txt +3 -0
- space_repo/space_repo/space_repo/sentiment_analysis.py +49 -0
- space_repo/space_repo/space_repo/text_classification.py +124 -0
- space_repo/space_repo/space_repo/vector_search.py +56 -0
app.py
CHANGED
|
@@ -14,7 +14,7 @@ load_dotenv()
|
|
| 14 |
# =======================
|
| 15 |
# Load and preprocess books
|
| 16 |
# =======================
|
| 17 |
-
books = pd.read_csv("books_with_emotions.csv")
|
| 18 |
|
| 19 |
books["large_thumbnail"] = np.where(
|
| 20 |
books["thumbnail"].notna(),
|
|
|
|
| 14 |
# =======================
|
| 15 |
# Load and preprocess books
|
| 16 |
# =======================
|
| 17 |
+
books = pd.read_csv("data/books_with_emotions.csv")
|
| 18 |
|
| 19 |
books["large_thumbnail"] = np.where(
|
| 20 |
books["thumbnail"].notna(),
|
space_repo/requirements.txt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8e7ea40c3f1142afd5bd94236b82b270c2f4cf4f2b600209c43416e461d89e7
|
| 3 |
+
size 372
|
space_repo/space_repo/space_repo/.gitignore
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
space_repo/space_repo/space_repo/EDA/EDA.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
space_repo/space_repo/space_repo/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Thien Phuc Nguyen
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
space_repo/space_repo/space_repo/README.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Sentiment Analysis Dashboard
|
| 3 |
+
emoji: 📚
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "4.39.0"
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: true
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Sentiment Analysis Dashboard 📊
|
| 13 |
+
|
| 14 |
+
This project is an interactive Gradio web app for analyzing **book emotions** based on their descriptions.
|
| 15 |
+
It combines NLP and emotion classification models to visualize the emotional distribution of books.
|
| 16 |
+
|
| 17 |
+
## 🚀 Features
|
| 18 |
+
- Upload or search books with emotion detection
|
| 19 |
+
- Supports emotions: joy, sadness, anger, fear, disgust, surprise, and neutral
|
| 20 |
+
- Visual charts and comparison between predicted vs. labeled emotions
|
| 21 |
+
- Built with `transformers`, `pandas`, and `gradio`
|
| 22 |
+
|
| 23 |
+
## 🧠 Model
|
| 24 |
+
Uses a pre-trained Hugging Face emotion classifier fine-tuned on social media text.
|
| 25 |
+
|
| 26 |
+
## ⚙️ Deployment
|
| 27 |
+
CI/CD is automated via **GitHub Actions**, deploying directly to Hugging Face Spaces.
|
space_repo/space_repo/space_repo/app.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
from langchain_community.document_loaders import TextLoader
|
| 7 |
+
from langchain_openai import OpenAIEmbeddings
|
| 8 |
+
from langchain_text_splitters import CharacterTextSplitter
|
| 9 |
+
from langchain_chroma import Chroma
|
| 10 |
+
|
| 11 |
+
# Load environment variables
|
| 12 |
+
load_dotenv()
|
| 13 |
+
|
| 14 |
+
# =======================
|
| 15 |
+
# Load and preprocess books
|
| 16 |
+
# =======================
|
| 17 |
+
books = pd.read_csv("books_with_emotions.csv")
|
| 18 |
+
|
| 19 |
+
books["large_thumbnail"] = np.where(
|
| 20 |
+
books["thumbnail"].notna(),
|
| 21 |
+
books["thumbnail"] + "&fife=w800",
|
| 22 |
+
"cover-not-found.jpg"
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# =======================
|
| 26 |
+
# Prepare Chroma vector DB
|
| 27 |
+
# =======================
|
| 28 |
+
raw_documents = TextLoader("tagged_description.txt").load()
|
| 29 |
+
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
|
| 30 |
+
documents = text_splitter.split_documents(raw_documents)
|
| 31 |
+
|
| 32 |
+
db_books = Chroma.from_documents(documents, OpenAIEmbeddings())
|
| 33 |
+
|
| 34 |
+
# =======================
|
| 35 |
+
# Semantic retrieval logic
|
| 36 |
+
# =======================
|
| 37 |
+
def retrieve_semantic_recommendations(query: str,
                                      category: str = "All",
                                      tone: str = "All",
                                      initial_top_k: int = 50,
                                      final_top_k: int = 16) -> pd.DataFrame:
    """Retrieve recommendations based on semantics, category and emotional tone.

    Args:
        query: Free-text description of the book the user is looking for.
        category: Value of the ``simple_categories`` column to keep;
            ``"All"`` disables the category filter.
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful" or "Sad".
            Any other value (e.g. "All") keeps the similarity ordering.
        initial_top_k: Number of candidates requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        Rows of the module-level ``books`` DataFrame for the matched ISBNs,
        at most ``final_top_k`` of them.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)
    # The first whitespace-separated token of each chunk (after stripping
    # surrounding quotes) is parsed as the book's ISBN-13.
    books_list = [int(rec.page_content.strip('"').split()[0]) for rec in recs]

    # Remember the best (lowest) similarity rank of every ISBN. A plain
    # ``isin`` filter returns rows in CSV order, which would discard the
    # ranking produced by the vector store.
    similarity_rank = {}
    for position, isbn in enumerate(books_list):
        similarity_rank.setdefault(isbn, position)

    # Filter books by ISBN and restore the similarity order.
    book_recs = books[books["isbn13"].isin(books_list)]
    book_recs = book_recs.sort_values(
        by="isbn13",
        key=lambda column: column.map(similarity_rank),
        kind="stable",
    ).head(initial_top_k)

    # Filter by category.
    if category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category]

    # Sort by emotional tone: the UI label maps to the emotion score column.
    tone_sort_map = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness"
    }
    if tone in tone_sort_map:
        book_recs = book_recs.sort_values(by=tone_sort_map[tone], ascending=False)

    return book_recs.head(final_top_k)
|
| 66 |
+
|
| 67 |
+
# =======================
|
| 68 |
+
# Recommendation formatting
|
| 69 |
+
# =======================
|
| 70 |
+
def recommend_books(query: str, category: str, tone: str):
    """Build the gallery entries for the books recommended for a query.

    Args:
        query: Free-text description entered by the user.
        category: Selected category, forwarded to the retrieval step.
        tone: Selected emotional tone, forwarded to the retrieval step.

    Returns:
        A list of ``(thumbnail, caption)`` tuples, one per recommended book.
        The caption has the form ``"<title> by <authors>: <first 30 words>..."``.
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        # pandas represents missing CSV cells as NaN (a float), which has no
        # ``split``; treat any non-string value as an empty description.
        description = row["description"]
        words = description.split() if isinstance(description, str) else []
        truncated_description = " ".join(words[:30]) + "..."

        # Authors are stored as a single ";"-separated string.
        raw_authors = row["authors"]
        authors = raw_authors.split(";") if isinstance(raw_authors, str) else []
        if not authors:
            authors_str = "Unknown author"
        elif len(authors) == 1:
            authors_str = authors[0]
        elif len(authors) == 2:
            authors_str = f"{authors[0]} and {authors[1]}"
        else:
            authors_str = f"{', '.join(authors[:-1])}, and {authors[-1]}"

        caption = f"{row['title']} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))

    return results
|
| 90 |
+
|
| 91 |
+
# =======================
|
| 92 |
+
# Build Gradio dashboard
|
| 93 |
+
# =======================
|
| 94 |
+
categories = ["All"] + sorted(books["simple_categories"].unique())
|
| 95 |
+
tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
|
| 96 |
+
|
| 97 |
+
with gr.Blocks() as dashboard:
|
| 98 |
+
gr.Markdown("# 📚 Semantic Book Recommender")
|
| 99 |
+
|
| 100 |
+
with gr.Row():
|
| 101 |
+
user_query = gr.Textbox(
|
| 102 |
+
label="Please enter a description of a book:",
|
| 103 |
+
placeholder="e.g., A story about forgiveness"
|
| 104 |
+
)
|
| 105 |
+
category_dropdown = gr.Dropdown(
|
| 106 |
+
choices=categories,
|
| 107 |
+
label="Select a category:",
|
| 108 |
+
value="All"
|
| 109 |
+
)
|
| 110 |
+
tone_dropdown = gr.Dropdown(
|
| 111 |
+
choices=tones,
|
| 112 |
+
label="Select an emotional tone:",
|
| 113 |
+
value="All"
|
| 114 |
+
)
|
| 115 |
+
submit_button = gr.Button("Find recommendations")
|
| 116 |
+
|
| 117 |
+
gr.Markdown("## 🧠 Recommendations")
|
| 118 |
+
output = gr.Gallery(label="Recommended books", columns=8, rows=2)
|
| 119 |
+
|
| 120 |
+
submit_button.click(fn=recommend_books,
|
| 121 |
+
inputs=[user_query, category_dropdown, tone_dropdown],
|
| 122 |
+
outputs=output)
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
|
| 125 |
+
dashboard.launch()
|
space_repo/space_repo/space_repo/data/book_cleaned.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d34b72aed224acefc8fd234b63f242e3ddcbdea95c04082f4980d4909ea00b1
|
| 3 |
+
size 6421116
|
space_repo/space_repo/space_repo/data/book_with_categories.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efd1c17d252abcb2d30f5eea07faedc8180c7ce0e23fccb8613eb6d42c9c88d5
|
| 3 |
+
size 6468698
|
space_repo/space_repo/space_repo/data/books.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64a19afd3df7170d153bb5b7e1afdfed4a05e2fe3ac163a168867596f2515e43
|
| 3 |
+
size 4142211
|
space_repo/space_repo/space_repo/data/books_with_emotions.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e94e95dc2caec7a9bf8044dda0611d0837428cce1873eae15c7043786ecdfb81
|
| 3 |
+
size 7185778
|
space_repo/space_repo/space_repo/data/recommendations.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f340096a7be4a17caa068d89af79b6e69f0360512cfb3664dcee510a234b88a
|
| 3 |
+
size 13787
|
space_repo/space_repo/space_repo/data/tagged_description.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cacf3c12e1418e194fa09f06006de8c202fa40c97109ef183ae7f4b23b41712
|
| 3 |
+
size 2607714
|
space_repo/space_repo/space_repo/requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:472496b2ad0b094965beb86014b030ece208be13538da848bf1c85c6d1ea2678
|
| 3 |
+
size 313
|
space_repo/space_repo/space_repo/sentiment_analysis.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
|
| 6 |
+
# Initialize the emotion classifier
|
| 7 |
+
classifier = pipeline(
|
| 8 |
+
"text-classification",
|
| 9 |
+
model="j-hartmann/emotion-english-distilroberta-base",
|
| 10 |
+
top_k=None,
|
| 11 |
+
device="mps"
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
books = pd.read_csv("data/book_with_categories.csv")
|
| 15 |
+
|
| 16 |
+
# Define emotion labels
|
| 17 |
+
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']


def calculate_max_emotion_score(predictions):
    """Calculate the maximum score of each emotion across all predictions.

    Args:
        predictions: Iterable with one entry per classified sentence; each
            entry is a list of ``{'label': ..., 'score': ...}`` dicts.

    Returns:
        dict mapping every label in ``emotion_labels`` to the highest score
        that label received in any entry.

    Raises:
        KeyError: If an entry has no score for one of ``emotion_labels``.
        ValueError: If ``predictions`` is empty (there is no maximum).
    """
    per_emotion_scores = {label: [] for label in emotion_labels}
    for pred in predictions:
        # Look scores up by label instead of by position in a sorted list, so
        # the result does not depend on ``emotion_labels`` being alphabetical
        # or on the classifier emitting exactly these labels.
        score_by_label = {item['label']: item['score'] for item in pred}
        for label in emotion_labels:
            per_emotion_scores[label].append(score_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}
|
| 27 |
+
|
| 28 |
+
# Process sentiment analysis for all books
|
| 29 |
+
emotion_scores = {label: [] for label in emotion_labels}
|
| 30 |
+
isbn = []
|
| 31 |
+
|
| 32 |
+
for i in tqdm(range(len(books)), desc="Processing books"):
|
| 33 |
+
isbn.append(books.iloc[i]['isbn13'])
|
| 34 |
+
sentences = books.iloc[i]['description'].split('.')
|
| 35 |
+
predictions = classifier(sentences)
|
| 36 |
+
max_scores = calculate_max_emotion_score(predictions)
|
| 37 |
+
for label in emotion_labels:
|
| 38 |
+
emotion_scores[label].append(max_scores[label])
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
emotions_df = pd.DataFrame(emotion_scores)
|
| 42 |
+
emotions_df["isbn13"] = isbn
|
| 43 |
+
|
| 44 |
+
# Merge with original books DataFrame
|
| 45 |
+
books = pd.merge(books, emotions_df, on="isbn13")
|
| 46 |
+
|
| 47 |
+
books.to_csv('data/books_with_emotions.csv', index=False)
|
| 48 |
+
|
| 49 |
+
print("Sentiment analysis completed and saved to 'data/books_with_emotions.csv'")
|
space_repo/space_repo/space_repo/text_classification.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
|
| 6 |
+
# Load data
|
| 7 |
+
books = pd.read_csv('data/book_cleaned.csv')
|
| 8 |
+
|
| 9 |
+
# Define category mapping for simplification
|
| 10 |
+
categories_mapping = {
|
| 11 |
+
'Fiction': 'Fiction',
|
| 12 |
+
'Juvenile Fiction': 'Fiction',
|
| 13 |
+
'Biography & Autobiography': 'Nonfiction',
|
| 14 |
+
'History': 'Nonfiction',
|
| 15 |
+
'Literary Criticism': 'Nonfiction',
|
| 16 |
+
'Philosophy': 'Nonfiction',
|
| 17 |
+
'Religion': 'Nonfiction',
|
| 18 |
+
'Comics & Graphic Novels': 'Fiction',
|
| 19 |
+
'Drama': 'Fiction',
|
| 20 |
+
'Juvenile Nonfiction': 'Nonfiction',
|
| 21 |
+
'Science': 'Nonfiction',
|
| 22 |
+
'Poetry': 'Fiction'
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
# Apply category mapping
|
| 26 |
+
books['simple_categories'] = books['categories'].map(categories_mapping)
|
| 27 |
+
|
| 28 |
+
# Initialize zero-shot classifier
|
| 29 |
+
fiction_categories = ['Fiction', 'Nonfiction']
|
| 30 |
+
classifier = pipeline(
|
| 31 |
+
"zero-shot-classification",
|
| 32 |
+
model="facebook/bart-large-mnli",
|
| 33 |
+
device="mps"
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
def generate_prediction(sequence: str, categories: list, classifier) -> str:
    """
    Generate predicted category for a given sequence using a zero-shot classifier.

    Args:
        sequence (str): Input text to classify.
        categories (list): List of possible categories.
        classifier: Callable zero-shot classification pipeline. It is called
            as ``classifier(text, candidate_labels=..., multi_label=False)``
            and must return a dict with parallel 'labels' and 'scores' lists.

    Returns:
        str: Predicted category label (the label with the highest score).

    Raises:
        ValueError: If sequence or categories is empty or of the wrong type.
        RuntimeError: If the classifier call fails (the original exception is
            chained as ``__cause__``) or its output is invalid.
    """
    if not sequence or not isinstance(sequence, str):
        raise ValueError("Sequence must be a non-empty string")
    if not categories or not isinstance(categories, list):
        raise ValueError("Categories must be a non-empty list")

    # Only the first 512 characters are classified; slicing a shorter string
    # is a no-op, so no explicit length check is needed.
    sequence = sequence[:512]

    # Keep the try body minimal so that only classifier failures are wrapped.
    try:
        result = classifier(sequence, candidate_labels=categories, multi_label=False)
    except Exception as e:
        raise RuntimeError(f"Error in prediction: {str(e)}") from e

    if not isinstance(result, dict) or 'labels' not in result or 'scores' not in result:
        raise RuntimeError(f"Unexpected classifier output: {result}")

    labels = result['labels']
    scores = result['scores']
    # argmax needs at least one score, and labels/scores must line up.
    if len(labels) == 0 or len(labels) != len(scores):
        raise RuntimeError(f"Unexpected classifier output: {result}")

    max_idx = int(np.argmax(scores))
    return labels[max_idx]
|
| 66 |
+
|
| 67 |
+
# Evaluate classifier on a sample of known categories
|
| 68 |
+
actual_cats = []
|
| 69 |
+
pred_cats = []
|
| 70 |
+
|
| 71 |
+
# Process Fiction samples
|
| 72 |
+
fiction_descriptions = books.loc[books['simple_categories'] == "Fiction", "description"].reset_index(drop=True)
|
| 73 |
+
for i in tqdm(range(min(200, len(fiction_descriptions))), desc="Processing Fiction"):
|
| 74 |
+
try:
|
| 75 |
+
actual_cats.append("Fiction")
|
| 76 |
+
pred_cats.append(generate_prediction(fiction_descriptions[i], fiction_categories, classifier))
|
| 77 |
+
except Exception as e:
|
| 78 |
+
print(f"Error processing Fiction sample {i}: {str(e)}")
|
| 79 |
+
pred_cats.append("Unknown") # Fallback category
|
| 80 |
+
|
| 81 |
+
# Process Nonfiction samples
|
| 82 |
+
nonfiction_descriptions = books.loc[books['simple_categories'] == "Nonfiction", "description"].reset_index(drop=True)
|
| 83 |
+
for i in tqdm(range(min(200, len(nonfiction_descriptions))), desc="Processing Nonfiction"):
|
| 84 |
+
try:
|
| 85 |
+
actual_cats.append("Nonfiction")
|
| 86 |
+
pred_cats.append(generate_prediction(nonfiction_descriptions[i], fiction_categories, classifier))
|
| 87 |
+
except Exception as e:
|
| 88 |
+
print(f"Error processing Nonfiction sample {i}: {str(e)}")
|
| 89 |
+
pred_cats.append("Unknown") # Fallback category
|
| 90 |
+
|
| 91 |
+
# Create predictions DataFrame and calculate accuracy
|
| 92 |
+
preds_df = pd.DataFrame({"actual_cats": actual_cats, "pred_cats": pred_cats})
|
| 93 |
+
preds_df["correct_pred"] = (preds_df["actual_cats"] == preds_df["pred_cats"]).astype(int)
|
| 94 |
+
accuracy = preds_df["correct_pred"].mean()
|
| 95 |
+
print(f"Classification accuracy: {accuracy:.4f}")
|
| 96 |
+
|
| 97 |
+
# Predict categories for missing values
|
| 98 |
+
missing_cat = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)
|
| 99 |
+
isbn = []
|
| 100 |
+
preds = []
|
| 101 |
+
|
| 102 |
+
for i in tqdm(range(len(missing_cat)), desc="Predicting missing categories"):
|
| 103 |
+
try:
|
| 104 |
+
isbn.append(missing_cat['isbn13'][i])
|
| 105 |
+
preds.append(generate_prediction(missing_cat['description'][i], fiction_categories, classifier))
|
| 106 |
+
except Exception as e:
|
| 107 |
+
print(f"Error predicting for ISBN {missing_cat['isbn13'][i]}: {str(e)}")
|
| 108 |
+
preds.append("Unknown") # Fallback category
|
| 109 |
+
isbn.append(missing_cat['isbn13'][i])
|
| 110 |
+
|
| 111 |
+
# Create DataFrame for predicted categories
|
| 112 |
+
missing_preds_df = pd.DataFrame({"isbn13": isbn, "predicted_categories": preds})
|
| 113 |
+
|
| 114 |
+
# Merge predictions and fill missing categories
|
| 115 |
+
books = pd.merge(books, missing_preds_df, on="isbn13", how="left")
|
| 116 |
+
books["simple_categories"] = books["simple_categories"].fillna(books["predicted_categories"])
|
| 117 |
+
books = books.drop(columns=["predicted_categories"])
|
| 118 |
+
|
| 119 |
+
# Save updated DataFrame
|
| 120 |
+
books.to_csv('data/book_with_categories.csv', index=False)
|
| 121 |
+
|
| 122 |
+
print("Category classification completed and saved to 'data/book_with_categories.csv'")
|
| 123 |
+
print("Category distribution:")
|
| 124 |
+
print(books['simple_categories'].value_counts())
|
space_repo/space_repo/space_repo/vector_search.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from langchain_community.document_loaders import TextLoader
|
| 5 |
+
from langchain_text_splitters import CharacterTextSplitter
|
| 6 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_openai import OpenAIEmbeddings
|
| 8 |
+
from langchain_chroma import Chroma
|
| 9 |
+
from langchain.embeddings import SentenceTransformerEmbeddings
|
| 10 |
+
|
| 11 |
+
# Load book metadata (db_books, the vector store passed to the function below, is built with Chroma in __main__)
|
| 12 |
+
books = pd.read_csv('data/book_with_categories.csv')
|
| 13 |
+
|
| 14 |
+
def retrieve_semantic_recommendations(query: str, top_k: int = 10, db_books=None) -> pd.DataFrame:
    """
    Retrieve top-k book recommendations based on semantic similarity to the query.

    Args:
        query (str): The search query.
        top_k (int): Number of recommendations to return.
        db_books: Vector store exposing ``similarity_search(query, k=...)``
            whose documents start with the book's ISBN-13 (for example the
            Chroma store built in this module's ``__main__`` block).

    Returns:
        pd.DataFrame: Rows of the module-level ``books`` frame for the best
        matches, most similar first, at most ``top_k`` of them.

    Raises:
        ValueError: If ``db_books`` is not provided.
    """
    if db_books is None:
        raise ValueError("db_books vector store is required")

    # Fetch at least 50 candidates, and more when the caller asks for more,
    # so ``top_k`` is never silently capped at 50.
    recs = db_books.similarity_search(query, k=max(top_k, 50))

    # The first whitespace-separated token of each chunk (after stripping
    # surrounding quotes) is parsed as the ISBN-13.
    books_list = [int(rec.page_content.strip('"').split()[0]) for rec in recs]

    # Best (lowest) similarity rank per ISBN. A plain ``isin`` filter returns
    # rows in CSV order, which would discard the vector store's ranking.
    similarity_rank = {}
    for position, isbn in enumerate(books_list):
        similarity_rank.setdefault(isbn, position)

    matches = books[books["isbn13"].isin(books_list)]
    ranked = matches.sort_values(
        by="isbn13",
        key=lambda column: column.map(similarity_rank),
        kind="stable",
    )
    return ranked.head(top_k)
|
| 37 |
+
|
| 38 |
+
if __name__ == "__main__":
|
| 39 |
+
books = pd.read_csv('data/book_cleaned.csv')
|
| 40 |
+
books['tagged_description'].to_csv('data/tagged_description.txt',
|
| 41 |
+
sep='\n',
|
| 42 |
+
index=False,
|
| 43 |
+
header=False)
|
| 44 |
+
raw_docs = TextLoader('data/tagged_description.txt', encoding='utf-8').load()
|
| 45 |
+
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
|
| 46 |
+
documents = text_splitter.split_documents(raw_docs)
|
| 47 |
+
|
| 48 |
+
embedding = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
|
| 49 |
+
db_books = Chroma.from_documents(
|
| 50 |
+
documents,
|
| 51 |
+
embedding=embedding)
|
| 52 |
+
|
| 53 |
+
query = "A book to teach children about nature"
|
| 54 |
+
|
| 55 |
+
recommendations = retrieve_semantic_recommendations(query, top_k=10, db_books=db_books)
|
| 56 |
+
print(recommendations)
|