github-actions[bot] committed on
Commit
267ad08
·
1 Parent(s): 49bc1b1

Auto-deploy from GitHub Actions

Browse files
app.py CHANGED
@@ -14,7 +14,7 @@ load_dotenv()
14
  # =======================
15
  # Load and preprocess books
16
  # =======================
17
- books = pd.read_csv("books_with_emotions.csv")
18
 
19
  books["large_thumbnail"] = np.where(
20
  books["thumbnail"].notna(),
 
14
  # =======================
15
  # Load and preprocess books
16
  # =======================
17
+ books = pd.read_csv("data/books_with_emotions.csv")
18
 
19
  books["large_thumbnail"] = np.where(
20
  books["thumbnail"].notna(),
space_repo/requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:472496b2ad0b094965beb86014b030ece208be13538da848bf1c85c6d1ea2678
3
- size 313
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8e7ea40c3f1142afd5bd94236b82b270c2f4cf4f2b600209c43416e461d89e7
3
+ size 372
space_repo/space_repo/space_repo/.gitignore ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
space_repo/space_repo/space_repo/EDA/EDA.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space_repo/space_repo/space_repo/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Thien Phuc Nguyen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
space_repo/space_repo/space_repo/README.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sentiment Analysis Dashboard
3
+ emoji: 📚
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: "4.39.0"
8
+ app_file: app.py
9
+ pinned: true
10
+ ---
11
+
12
+ # Sentiment Analysis Dashboard 📊
13
+
14
+ This project is an interactive Gradio web app for analyzing **book emotions** based on their descriptions.
15
+ It combines NLP and emotion classification models to visualize the emotional distribution of books.
16
+
17
+ ## 🚀 Features
18
+ - Upload or search books with emotion detection
19
+ - Supports emotions: joy, sadness, anger, fear, disgust, surprise, and neutral
20
+ - Visual charts and comparison between predicted vs. labeled emotions
21
+ - Built with `transformers`, `pandas`, and `gradio`
22
+
23
+ ## 🧠 Model
24
+ Uses a pre-trained Hugging Face emotion classifier fine-tuned on social media text.
25
+
26
+ ## ⚙️ Deployment
27
+ CI/CD is automated via **GitHub Actions**, deploying directly to Hugging Face Spaces.
space_repo/space_repo/space_repo/app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import gradio as gr

from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

# Load environment variables (e.g. the OpenAI API key used by the embeddings).
load_dotenv()

# =======================
# Load and preprocess books
# =======================
# FIX: the dataset is stored under data/ in this repo (data/books_with_emotions.csv);
# the bare filename only worked when the CSV sat next to app.py. This matches the
# path fix already applied to the outer app.py in this same commit.
books = pd.read_csv("data/books_with_emotions.csv")

# Build a large-thumbnail column: request a wider cover image when a thumbnail
# URL exists, otherwise fall back to a local placeholder image.
books["large_thumbnail"] = np.where(
    books["thumbnail"].notna(),
    books["thumbnail"] + "&fife=w800",
    "cover-not-found.jpg"
)
# =======================
# Prepare Chroma vector DB
# =======================
# FIX: tagged_description.txt is stored under data/ in this repo
# (see data/tagged_description.txt); load it from the repo-relative path.
raw_documents = TextLoader("data/tagged_description.txt").load()
# chunk_size=1 with a newline separator yields one document per line;
# each line is "<isbn13> <description>".
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

# Embed every tagged description into an in-memory Chroma store.
db_books = Chroma.from_documents(documents, OpenAIEmbeddings())
# =======================
# Semantic retrieval logic
# =======================
def retrieve_semantic_recommendations(query: str,
                                      category: str = "All",
                                      tone: str = "All",
                                      initial_top_k: int = 50,
                                      final_top_k: int = 16) -> pd.DataFrame:
    """Retrieve book recommendations ranked by semantic similarity,
    optionally filtered by category and sorted by emotional tone."""
    # Each stored document starts with the ISBN-13 the description was tagged with.
    matches = db_books.similarity_search(query, k=initial_top_k)
    matched_isbns = [int(doc.page_content.strip('"').split()[0]) for doc in matches]

    # Keep only the books whose ISBN was retrieved.
    candidates = books[books["isbn13"].isin(matched_isbns)].head(initial_top_k)

    # Optional category filter.
    if category != "All":
        candidates = candidates[candidates["simple_categories"] == category]

    # Map UI tone names onto the emotion-score columns used for ranking.
    emotion_column_for_tone = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    emotion_column = emotion_column_for_tone.get(tone)
    if emotion_column is not None:
        candidates = candidates.sort_values(by=emotion_column, ascending=False)

    return candidates.head(final_top_k)
# =======================
# Recommendation formatting
# =======================
def recommend_books(query: str, category: str, tone: str):
    """Return (thumbnail, caption) pairs for the gallery for the given search."""
    gallery_items = []

    for _, book in retrieve_semantic_recommendations(query, category, tone).iterrows():
        # Truncate the description to its first 30 words.
        words = book["description"].split()
        short_description = " ".join(words[:30]) + "..."

        # Join the semicolon-separated author list in natural English.
        author_list = book["authors"].split(";")
        if len(author_list) == 1:
            author_text = author_list[0]
        elif len(author_list) == 2:
            author_text = f"{author_list[0]} and {author_list[1]}"
        else:
            author_text = f"{', '.join(author_list[:-1])}, and {author_list[-1]}"

        caption = f"{book['title']} by {author_text}: {short_description}"
        gallery_items.append((book["large_thumbnail"], caption))

    return gallery_items
# =======================
# Build Gradio dashboard
# =======================
categories = ["All"] + sorted(books["simple_categories"].unique())
tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

with gr.Blocks() as dashboard:
    gr.Markdown("# 📚 Semantic Book Recommender")

    with gr.Row():
        query_box = gr.Textbox(
            label="Please enter a description of a book:",
            placeholder="e.g., A story about forgiveness",
        )
        category_selector = gr.Dropdown(
            choices=categories,
            label="Select a category:",
            value="All",
        )
        tone_selector = gr.Dropdown(
            choices=tones,
            label="Select an emotional tone:",
            value="All",
        )
        search_button = gr.Button("Find recommendations")

    gr.Markdown("## 🧠 Recommendations")
    gallery = gr.Gallery(label="Recommended books", columns=8, rows=2)

    # Wire the search button to the recommender; results feed the gallery.
    search_button.click(fn=recommend_books,
                        inputs=[query_box, category_selector, tone_selector],
                        outputs=gallery)

if __name__ == "__main__":
    dashboard.launch()
space_repo/space_repo/space_repo/data/book_cleaned.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d34b72aed224acefc8fd234b63f242e3ddcbdea95c04082f4980d4909ea00b1
3
+ size 6421116
space_repo/space_repo/space_repo/data/book_with_categories.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efd1c17d252abcb2d30f5eea07faedc8180c7ce0e23fccb8613eb6d42c9c88d5
3
+ size 6468698
space_repo/space_repo/space_repo/data/books.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a19afd3df7170d153bb5b7e1afdfed4a05e2fe3ac163a168867596f2515e43
3
+ size 4142211
space_repo/space_repo/space_repo/data/books_with_emotions.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e94e95dc2caec7a9bf8044dda0611d0837428cce1873eae15c7043786ecdfb81
3
+ size 7185778
space_repo/space_repo/space_repo/data/recommendations.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f340096a7be4a17caa068d89af79b6e69f0360512cfb3664dcee510a234b88a
3
+ size 13787
space_repo/space_repo/space_repo/data/tagged_description.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cacf3c12e1418e194fa09f06006de8c202fa40c97109ef183ae7f4b23b41712
3
+ size 2607714
space_repo/space_repo/space_repo/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:472496b2ad0b094965beb86014b030ece208be13538da848bf1c85c6d1ea2678
3
+ size 313
space_repo/space_repo/space_repo/sentiment_analysis.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import pipeline

# Emotion classifier: DistilRoBERTa fine-tuned for 7-way emotion
# classification. top_k=None returns a score for every label rather
# than only the best one.
# NOTE(review): device="mps" assumes an Apple-Silicon GPU — confirm
# before running on other hardware.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device="mps",
)

# Book records with categories already assigned (see text_classification.py).
books = pd.read_csv("data/book_with_categories.csv")
# The seven emotion labels emitted by the classifier (alphabetical order).
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

def calculate_max_emotion_score(predictions):
    """Return the per-emotion maximum score across a list of predictions.

    Args:
        predictions: One entry per sentence; each entry is a list of
            {'label': str, 'score': float} dicts covering all emotion labels.

    Returns:
        dict mapping each emotion label to the highest score it received
        in any sentence.
    """
    per_emotion_scores = {label: [] for label in emotion_labels}
    for pred in predictions:
        # Index scores by label name rather than relying on the sorted
        # prediction list lining up positionally with emotion_labels —
        # this stays correct even if the classifier ever returns labels
        # in a different order or naming scheme changes the sort.
        score_by_label = {entry['label']: entry['score'] for entry in pred}
        for label in emotion_labels:
            per_emotion_scores[label].append(score_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}
# Run the classifier over every book description, sentence by sentence,
# and record each emotion's maximum score for that book.
emotion_scores = {label: [] for label in emotion_labels}
isbn = []

for i in tqdm(range(len(books)), desc="Processing books"):
    row = books.iloc[i]
    isbn.append(row['isbn13'])
    # Split into rough sentences so a short emotional passage is not
    # averaged away by the rest of the description.
    predictions = classifier(row['description'].split('.'))
    max_scores = calculate_max_emotion_score(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn

# Attach the emotion columns back onto the book records.
books = pd.merge(books, emotions_df, on="isbn13")

books.to_csv('data/books_with_emotions.csv', index=False)

print("Sentiment analysis completed and saved to 'data/books_with_emotions.csv'")
space_repo/space_repo/space_repo/text_classification.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import pipeline

# Load the cleaned book data.
books = pd.read_csv('data/book_cleaned.csv')

# Collapse the raw catalogue categories into a binary Fiction/Nonfiction scheme.
categories_mapping = {
    'Fiction': 'Fiction',
    'Juvenile Fiction': 'Fiction',
    'Biography & Autobiography': 'Nonfiction',
    'History': 'Nonfiction',
    'Literary Criticism': 'Nonfiction',
    'Philosophy': 'Nonfiction',
    'Religion': 'Nonfiction',
    'Comics & Graphic Novels': 'Fiction',
    'Drama': 'Fiction',
    'Juvenile Nonfiction': 'Nonfiction',
    'Science': 'Nonfiction',
    'Poetry': 'Fiction'
}

# Categories not covered by the mapping become NaN and are back-filled
# further down by the zero-shot classifier.
books['simple_categories'] = books['categories'].map(categories_mapping)

# Zero-shot Fiction/Nonfiction classifier.
# NOTE(review): device="mps" assumes Apple-Silicon hardware — confirm
# before running elsewhere.
fiction_categories = ['Fiction', 'Nonfiction']
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device="mps"
)
def generate_prediction(sequence: str, categories: list, classifier) -> str:
    """
    Generate the predicted category for a text using a zero-shot classifier.

    Args:
        sequence (str): Input text to classify.
        categories (list): Candidate category labels.
        classifier: Hugging Face zero-shot classification pipeline (or any
            callable with the same signature and dict-shaped result).

    Returns:
        str: The highest-scoring category label.

    Raises:
        ValueError: If sequence or categories is empty or of the wrong type.
        RuntimeError: If classification fails or returns an unexpected shape.
    """
    if not sequence or not isinstance(sequence, str):
        raise ValueError("Sequence must be a non-empty string")
    if not categories or not isinstance(categories, list):
        raise ValueError("Categories must be a non-empty list")

    try:
        # The model has a bounded input length; truncate defensively.
        # Slicing a shorter string is a no-op, so no length check is needed.
        sequence = sequence[:512]
        result = classifier(sequence, candidate_labels=categories, multi_label=False)
        if not isinstance(result, dict) or 'labels' not in result or 'scores' not in result:
            raise RuntimeError(f"Unexpected classifier output: {result}")
        max_idx = np.argmax(result['scores'])
        return result['labels'][max_idx]
    except Exception as e:
        # Chain the original exception so the root cause is preserved.
        raise RuntimeError(f"Error in prediction: {str(e)}") from e
# =======================
# Evaluate the classifier on books whose category is already known
# =======================
actual_cats = []
pred_cats = []

def _evaluate_samples(category: str, max_samples: int = 200) -> None:
    """Classify up to max_samples descriptions of a known category, recording
    (actual, predicted) pairs into actual_cats / pred_cats. Factors out the
    previously duplicated Fiction/Nonfiction loops."""
    descriptions = books.loc[books['simple_categories'] == category, "description"].reset_index(drop=True)
    for i in tqdm(range(min(max_samples, len(descriptions))), desc=f"Processing {category}"):
        actual_cats.append(category)
        try:
            pred_cats.append(generate_prediction(descriptions[i], fiction_categories, classifier))
        except Exception as e:
            print(f"Error processing {category} sample {i}: {str(e)}")
            pred_cats.append("Unknown")  # Fallback keeps the two lists aligned

_evaluate_samples("Fiction")
_evaluate_samples("Nonfiction")

# Compare predictions against the known labels.
preds_df = pd.DataFrame({"actual_cats": actual_cats, "pred_cats": pred_cats})
preds_df["correct_pred"] = (preds_df["actual_cats"] == preds_df["pred_cats"]).astype(int)
accuracy = preds_df["correct_pred"].mean()
print(f"Classification accuracy: {accuracy:.4f}")
# =======================
# Predict categories for books with no mapped category
# =======================
missing_cat = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)
isbn = []
preds = []

for i in tqdm(range(len(missing_cat)), desc="Predicting missing categories"):
    # BUG FIX: append the ISBN exactly once, outside the try block.
    # Previously it was appended inside the try AND again in the except
    # handler, so any prediction failure left isbn one element longer
    # than preds and pd.DataFrame below raised on unequal lengths.
    isbn.append(missing_cat['isbn13'][i])
    try:
        preds.append(generate_prediction(missing_cat['description'][i], fiction_categories, classifier))
    except Exception as e:
        print(f"Error predicting for ISBN {missing_cat['isbn13'][i]}: {str(e)}")
        preds.append("Unknown")  # Fallback category keeps the lists aligned

# Collect the predicted labels keyed by ISBN.
missing_preds_df = pd.DataFrame({"isbn13": isbn, "predicted_categories": preds})

# Merge predictions back and fill only the rows that were missing a category.
books = pd.merge(books, missing_preds_df, on="isbn13", how="left")
books["simple_categories"] = books["simple_categories"].fillna(books["predicted_categories"])
books = books.drop(columns=["predicted_categories"])

# Save updated DataFrame.
books.to_csv('data/book_with_categories.csv', index=False)

print("Category classification completed and saved to 'data/book_with_categories.csv'")
print("Category distribution:")
print(books['simple_categories'].value_counts())
space_repo/space_repo/space_repo/vector_search.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pandas as pd
import numpy as np

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings

# Book metadata used to resolve search hits back to full records.
books = pd.read_csv('data/book_with_categories.csv')

def retrieve_semantic_recommendations(query: str, top_k: int = 10, db_books=None) -> pd.DataFrame:
    """
    Retrieve top-k book recommendations based on semantic similarity to the query.

    Args:
        query (str): The search query.
        top_k (int): Number of recommendations to return.
        db_books: Vector store holding the tagged-description embeddings.

    Returns:
        pd.DataFrame: Rows from the books table matching the retrieved ISBNs.

    Raises:
        ValueError: If no vector store is supplied.
    """
    if db_books is None:
        raise ValueError("db_books vector store is required")

    # Over-fetch, then narrow to top_k after mapping hits back to books.
    hits = db_books.similarity_search(query, k=50)

    # The ISBN-13 is the first token of each stored document.
    hit_isbns = [int(hit.page_content.strip('"').split()[0]) for hit in hits]

    return books[books["isbn13"].isin(hit_isbns)].head(top_k)
if __name__ == "__main__":
    # Build the vector store from scratch and run a demo query.
    books = pd.read_csv('data/book_cleaned.csv')

    # Dump one tagged description per line. The single-column output means
    # sep never actually separates anything, but the call matches the
    # newline-delimited layout TextLoader expects below.
    books['tagged_description'].to_csv('data/tagged_description.txt',
                                       sep='\n',
                                       index=False,
                                       header=False)

    raw_docs = TextLoader('data/tagged_description.txt', encoding='utf-8').load()
    # chunk_size=1 with a newline separator yields one document per line.
    splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
    documents = splitter.split_documents(raw_docs)

    # Local sentence-transformer embeddings — no API key required.
    embedding = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
    db_books = Chroma.from_documents(
        documents,
        embedding=embedding)

    query = "A book to teach children about nature"

    recommendations = retrieve_semantic_recommendations(query, top_k=10, db_books=db_books)
    print(recommendations)
+ print(recommendations)