badminton001 committed on
Commit
3ef5bd7
·
verified ·
1 Parent(s): 9381930

Update preprocessing/annotate_books_50000.py

Browse files
Files changed (1) hide show
  1. preprocessing/annotate_books_50000.py +232 -235
preprocessing/annotate_books_50000.py CHANGED
@@ -1,236 +1,233 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- Annotation module for Open Library books.
6
-
7
- Annotates 50,000-book dataset with additional features:
8
- - Genres (based on subjects)
9
- - Mood (based on subjects)
10
- - Target Audience (based on subjects and reading levels)
11
- - Era (based on publish year)
12
- - Decade (based on publish year)
13
- - Language (currently defaults to unknown due to data limitations)
14
-
15
- Input:
16
- - data/book/raw/openlibrary_books_50000.json
17
-
18
- Output:
19
- - data/book/annotated/books_annotated_50000.json
20
- """
21
-
22
- import os
23
- import json
24
- import re
25
- from pathlib import Path
26
- from datetime import datetime
27
- from typing import List, Dict, Any, Optional
28
-
29
- # --- Paths Configuration ---
30
- # Path to the raw Open Library books JSON file (updated for 50,000 records)
31
- RAW_PATH = Path(__file__).parent.parent / "data" / "book" / "raw" / "openlibrary_books_50000.json"
32
- # Path to the output annotated books JSON file (updated for 50,000 records)
33
- OUT_PATH = Path(__file__).parent.parent / "data" / "book" / "annotated" / "books_annotated_50000.json"
34
- # Ensure the output directory exists
35
- OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
36
-
37
- # --- Genre Keywords Mapping ---
38
- GENRE_KEYWORDS = {
39
- "fiction": "Fiction",
40
- "romance": "Romance",
41
- "mystery": "Mystery",
42
- "fantasy": "Fantasy",
43
- "science fiction": "Science Fiction",
44
- "history": "History",
45
- "poetry": "Poetry",
46
- "biography": "Biography",
47
- "self-help": "Self-Help",
48
- "horror": "Horror",
49
- "children": "Children",
50
- "young adult": "Young Adult",
51
- "adventure": "Adventure",
52
- "classic": "Classics",
53
- "education": "Education",
54
- "philosophy": "Philosophy",
55
- "thriller": "Thriller",
56
- "drama": "Drama",
57
- "crime": "Crime",
58
- "western": "Western"
59
- }
60
-
61
- # --- Mood Keywords Mapping ---
62
- # Expanded mood keywords to capture more nuances
63
- MOOD_KEYWORDS = {
64
- "love": "Romantic",
65
- "friend": "Heartwarming",
66
- "family": "Heartwarming",
67
- "ghost": "Supernatural",
68
- "magic": "Magical",
69
- "dark": "Dark",
70
- "funny": "Humorous",
71
- "sad": "Melancholic",
72
- "mystery": "Suspenseful",
73
- "hope": "Inspiring",
74
- "thrill": "Thrilling",
75
- "adventure": "Adventurous",
76
- "peace": "Calming",
77
- "joy": "Uplifting",
78
- "fear": "Horrifying",
79
- "anger": "Intense"
80
- }
81
-
82
-
83
- # --- Genre Assignment ---
84
- def assign_genres(subjects: List[str]) -> List[str]:
85
- """
86
- Assigns genre tags to a book based on keywords found in its subjects.
87
- Returns a list of matching genres or ['General'] if none found.
88
- """
89
- tags = set()
90
- for subj in subjects:
91
- s = subj.lower()
92
- for kw, label in GENRE_KEYWORDS.items():
93
- if kw in s:
94
- tags.add(label)
95
- return sorted(list(tags)) if tags else ["General"]
96
-
97
-
98
- # --- Target Audience Assignment ---
99
- def assign_target_audience(subjects: List[str]) -> str:
100
- """
101
- Determines the target audience for books by checking explicit subjects and reading levels.
102
- Prioritizes specific age-related keywords and grade levels.
103
- """
104
- joined_subjects = " ".join(subjects).lower()
105
-
106
- # 1) Reading level grade (most specific)
107
- match = re.search(r"grade\s*(\d+)", joined_subjects)
108
- if match:
109
- grade = int(match.group(1))
110
- if grade <= 6:
111
- return "children"
112
- elif grade <= 12:
113
- return "young_adult"
114
- else:
115
- return "adult"
116
-
117
- # 2) Explicit keywords for Young Adult
118
- if "young adult" in joined_subjects or "teen" in joined_subjects or "adolescent" in joined_subjects:
119
- return "young_adult"
120
-
121
- # 3) Explicit keywords for Children
122
- if "juvenile" in joined_subjects or "children's" in joined_subjects or "kids" in joined_subjects:
123
- return "children"
124
-
125
- # 4) Fallback to adult if no specific audience is detected
126
- return "adult"
127
-
128
-
129
- # --- Era Assignment ---
130
- def assign_era(year: Optional[int]) -> str:
131
- """Categorizes books into eras based on their first publish year."""
132
- try:
133
- y = int(year) if year is not None else 0
134
- except ValueError:
135
- return "unknown"
136
-
137
- current_year = datetime.now().year
138
- if y < 1900:
139
- return "classic"
140
- elif y < 2000:
141
- return "modern"
142
- elif y < current_year - 5:
143
- return "contemporary"
144
- return "recent"
145
-
146
-
147
- # --- Decade Assignment ---
148
- def assign_decade(year: Optional[int]) -> str:
149
- """Extracts the decade from the first publish year."""
150
- try:
151
- y = int(year) if year is not None else 0
152
- if y == 0: return "unknown"
153
- decade = (y // 10) * 10
154
- return f"{(decade)}s"
155
- except ValueError:
156
- return "unknown"
157
-
158
-
159
- # --- Mood Assignment ---
160
- def assign_mood(subjects: List[str]) -> List[str]:
161
- """
162
- Assigns mood tags to a book based on keywords found in its subjects.
163
- Returns a list of matching moods or ['Neutral'] if none found.
164
- """
165
- joined_subjects = " ".join(subjects).lower()
166
- mood_tags = set()
167
- for kw, label in MOOD_KEYWORDS.items():
168
- if kw in joined_subjects:
169
- mood_tags.add(label)
170
- return sorted(list(mood_tags)) if mood_tags else ["Neutral"]
171
-
172
-
173
- # --- Language Assignment (Placeholder) ---
174
- def assign_language(original_language_code: Optional[str] = None) -> str:
175
- """
176
- Assigns a language. Placeholder: Open Library data often lacks reliable
177
- explicit language information for books at this raw stage.
178
- Future improvement: integrate a language detection library or use source-specific language info.
179
- """
180
- if original_language_code:
181
- return original_language_code.lower()
182
- return "unknown"
183
-
184
-
185
- # --- Main Annotation Function ---
186
- def main():
187
- """
188
- Main function to load raw book data, apply annotations,
189
- and save the results to a JSON file.
190
- """
191
- if not RAW_PATH.exists():
192
- raise FileNotFoundError(f"Input file not found: {RAW_PATH}. Please ensure the crawler has run.")
193
-
194
- print(f"Loading raw book data from: {RAW_PATH}")
195
- try:
196
- with open(RAW_PATH, "r", encoding="utf-8") as f:
197
- books_raw = json.load(f)
198
- except json.JSONDecodeError as e:
199
- print(f"Error decoding JSON from {RAW_PATH}: {e}")
200
- return
201
-
202
- annotated_books = []
203
- print(f"Annotating {len(books_raw)} book records...")
204
- for b in books_raw:
205
- subjects = b.get("subjects", [])
206
- first_publish_year = b.get("first_publish_year")
207
- try:
208
- year_int = int(first_publish_year) if first_publish_year else 0
209
- except ValueError:
210
- year_int = 0
211
-
212
- annotated_books.append({
213
- "title": b.get("title", ""),
214
- "authors": b.get("authors", []),
215
- "first_publish_year": year_int,
216
- "subjects": subjects,
217
- "cover_url": b.get("cover_url", ""),
218
- "source_key": b.get("source_key", ""),
219
- "genres": assign_genres(subjects),
220
- "target_audience": assign_target_audience(subjects),
221
- "era": assign_era(year_int),
222
- "decade": assign_decade(year_int),
223
- "mood": assign_mood(subjects),
224
- "language": assign_language(b.get("language_code"))
225
- })
226
-
227
- print(f"Saving annotated data to: {OUT_PATH}")
228
- with open(OUT_PATH, "w", encoding="utf-8") as f:
229
- json.dump(annotated_books, f, ensure_ascii=False, indent=2)
230
-
231
- print(f"✅ Book annotation complete. Annotated {len(annotated_books)} records.")
232
- print(f"Output saved to: {OUT_PATH}")
233
-
234
-
235
- if __name__ == "__main__":
236
  main()
 
1
+ """
2
+ Annotation module for Open Library books.
3
+
4
+ Annotates 50,000-book dataset with additional features:
5
+ - Genres (based on subjects)
6
+ - Mood (based on subjects)
7
+ - Target Audience (based on subjects and reading levels)
8
+ - Era (based on publish year)
9
+ - Decade (based on publish year)
10
+ - Language (currently defaults to unknown due to data limitations)
11
+
12
+ Input:
13
+ - data/book/raw/openlibrary_books_50000.json
14
+
15
+ Output:
16
+ - data/book/annotated/books_annotated_50000.json
17
+ """
18
+
19
+ import os
20
+ import json
21
+ import re
22
+ from pathlib import Path
23
+ from datetime import datetime
24
+ from typing import List, Dict, Any, Optional
25
+
26
# --- Paths Configuration ---
# Project root, derived from this file's *absolute* location. Without
# resolve(), a relative __file__ (script started via a relative path on older
# Python versions) makes ".parent.parent" collapse to "." and the data paths
# end up relative to the current working directory instead of the project.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Path to the raw Open Library books JSON file (50,000 records)
RAW_PATH = _PROJECT_ROOT / "data" / "book" / "raw" / "openlibrary_books_50000.json"
# Path to the output annotated books JSON file (50,000 records)
OUT_PATH = _PROJECT_ROOT / "data" / "book" / "annotated" / "books_annotated_50000.json"
# Ensure the output directory exists
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
33
+
34
# --- Genre Keywords Mapping ---
# Lower-case keyword looked up inside a subject string -> genre label that is
# emitted when the keyword is found.
GENRE_KEYWORDS: Dict[str, str] = {
    "fiction": "Fiction",
    "romance": "Romance",
    "mystery": "Mystery",
    "fantasy": "Fantasy",
    "science fiction": "Science Fiction",
    "history": "History",
    "poetry": "Poetry",
    "biography": "Biography",
    "self-help": "Self-Help",
    "horror": "Horror",
    "children": "Children",
    "young adult": "Young Adult",
    "adventure": "Adventure",
    "classic": "Classics",
    "education": "Education",
    "philosophy": "Philosophy",
    "thriller": "Thriller",
    "drama": "Drama",
    "crime": "Crime",
    "western": "Western",
}
57
+
58
# --- Mood Keywords Mapping ---
# Lower-case keyword looked up in the book's subjects -> mood label that is
# emitted when the keyword is found. Several keywords may share one label.
MOOD_KEYWORDS: Dict[str, str] = {
    "love": "Romantic",
    "friend": "Heartwarming",
    "family": "Heartwarming",
    "ghost": "Supernatural",
    "magic": "Magical",
    "dark": "Dark",
    "funny": "Humorous",
    "sad": "Melancholic",
    "mystery": "Suspenseful",
    "hope": "Inspiring",
    "thrill": "Thrilling",
    "adventure": "Adventurous",
    "peace": "Calming",
    "joy": "Uplifting",
    "fear": "Horrifying",
    "anger": "Intense",
}
78
+
79
+
80
+ # --- Genre Assignment ---
81
def assign_genres(
    subjects: Optional[List[str]],
    keywords: Optional[Dict[str, str]] = None,
) -> List[str]:
    """
    Assign genre tags to a book based on keywords found in its subjects.

    Args:
        subjects: Subject strings of the book. ``None``, an empty list and
            non-string entries are tolerated (they contribute no tags). A
            single string is treated as a one-element list.
        keywords: Mapping of lower-case keyword -> genre label. Defaults to
            the module-level ``GENRE_KEYWORDS``.

    Returns:
        The alphabetically sorted list of matching genre labels, or
        ``['General']`` if none matched.
    """
    if keywords is None:
        keywords = GENRE_KEYWORDS
    if isinstance(subjects, str):
        # Iterating a bare string would test each character separately.
        subjects = [subjects]

    tags = set()
    for subj in subjects or ():
        if not isinstance(subj, str):
            continue
        # Blank out "nonfiction" / "non-fiction" / "non fiction" first;
        # otherwise the plain substring test for "fiction" would tag
        # non-fiction books as Fiction.
        s = re.sub(r"non[\s-]?fiction", " ", subj.lower())
        tags.update(label for kw, label in keywords.items() if kw in s)
    return sorted(tags) if tags else ["General"]
93
+
94
+
95
+ # --- Target Audience Assignment ---
96
def assign_target_audience(subjects: Optional[List[str]]) -> str:
    """
    Determine the target audience of a book from its subjects.

    Checks, in priority order: an explicit reading-level grade, young-adult
    keywords, children's keywords, and finally falls back to ``"adult"``.

    Args:
        subjects: Subject strings of the book. ``None``, an empty list and
            non-string entries are tolerated.

    Returns:
        One of ``"children"``, ``"young_adult"`` or ``"adult"``.
    """
    if isinstance(subjects, str):
        subjects = [subjects]
    joined_subjects = " ".join(
        s for s in subjects or () if isinstance(s, str)
    ).lower()

    # 1) Reading level grade (most specific), e.g. "Reading Level-Grade 7"
    #    or "grades 4-6" (the first number is used).
    match = re.search(r"\bgrades?\s*(\d+)", joined_subjects)
    if match:
        grade = int(match.group(1))
        if grade <= 6:
            return "children"
        if grade <= 12:
            return "young_adult"
        return "adult"

    # 2) Explicit keywords for Young Adult. Whole-word matching is required:
    #    a bare substring test for "teen" also fires on "nineteenth",
    #    "eighteenth", "thirteen", ... and would mislabel e.g. history books.
    if re.search(
        r"\b(?:young adults?|teen(?:s|age|aged|ager|agers)?|adolescents?)\b",
        joined_subjects,
    ):
        return "young_adult"

    # 3) Explicit keywords for Children (straight or typographic apostrophe).
    if re.search(r"\b(?:juvenile|children['\u2019]s|kids)", joined_subjects):
        return "children"

    # 4) Fallback to adult if no specific audience is detected
    return "adult"
124
+
125
+
126
+ # --- Era Assignment ---
127
def assign_era(year: Optional[int]) -> str:
    """
    Categorize a book into an era based on its first publish year.

    Args:
        year: First publish year. ``None``, 0, negative values and anything
            that cannot be converted with ``int()`` count as "no year".

    Returns:
        ``"classic"`` (before 1900), ``"modern"`` (1900-1999),
        ``"contemporary"`` (2000 up to six years ago), ``"recent"`` (the last
        five years and later), or ``"unknown"`` when no usable year is given.
    """
    try:
        y = int(year) if year is not None else 0
    except (TypeError, ValueError):
        # TypeError covers non-numeric types such as lists or dicts.
        return "unknown"

    # 0 is the "missing year" sentinel used by the caller; without this guard
    # every book lacking a year would be labelled "classic". This mirrors the
    # handling of missing years in assign_decade.
    if y <= 0:
        return "unknown"

    current_year = datetime.now().year
    if y < 1900:
        return "classic"
    if y < 2000:
        return "modern"
    if y < current_year - 5:
        return "contemporary"
    return "recent"
142
+
143
+
144
+ # --- Decade Assignment ---
145
def assign_decade(year: Optional[int]) -> str:
    """
    Extract the decade label from the first publish year.

    Args:
        year: First publish year. ``None``, 0, negative values and anything
            that cannot be converted with ``int()`` count as "no year".

    Returns:
        A label such as ``"1980s"``, or ``"unknown"`` when no usable year is
        given.
    """
    try:
        y = int(year) if year is not None else 0
    except (TypeError, ValueError):
        # TypeError covers non-numeric types such as lists or dicts.
        return "unknown"

    # 0 is the "missing year" sentinel; negative values would otherwise
    # produce meaningless labels such as "-10s".
    if y <= 0:
        return "unknown"

    decade = (y // 10) * 10
    return f"{decade}s"
154
+
155
+
156
+ # --- Mood Assignment ---
157
def assign_mood(
    subjects: Optional[List[str]],
    keywords: Optional[Dict[str, str]] = None,
) -> List[str]:
    """
    Assign mood tags to a book based on keywords found in its subjects.

    A keyword matches when it occurs at the *start of a word* ("love" matches
    "love stories" and "lovers", "thrill" matches "thrillers"). Matching
    anywhere inside a word is deliberately not done, because it produces
    false positives such as "anger" in "strangers"/"danger", "sad" in
    "crusades" or "joy" in "enjoy".

    Args:
        subjects: Subject strings of the book. ``None``, an empty list and
            non-string entries are tolerated. A single string is treated as a
            one-element list.
        keywords: Mapping of lower-case keyword -> mood label. Defaults to
            the module-level ``MOOD_KEYWORDS``.

    Returns:
        The alphabetically sorted list of matching mood labels, or
        ``['Neutral']`` if none matched.
    """
    if keywords is None:
        keywords = MOOD_KEYWORDS
    if isinstance(subjects, str):
        subjects = [subjects]

    joined_subjects = " ".join(
        s for s in subjects or () if isinstance(s, str)
    ).lower()

    mood_tags = {
        label
        for kw, label in keywords.items()
        # \b anchors the keyword to a word start; escape() keeps keywords
        # containing regex metacharacters literal.
        if re.search(r"\b" + re.escape(kw), joined_subjects)
    }
    return sorted(mood_tags) if mood_tags else ["Neutral"]
168
+
169
+
170
+ # --- Language Assignment (Placeholder) ---
171
def assign_language(original_language_code: Optional[str] = None) -> str:
    """
    Normalize a book's language code.

    Placeholder: Open Library data often lacks reliable explicit language
    information for books at this raw stage.
    Future improvement: integrate a language detection library or use
    source-specific language info.

    Args:
        original_language_code: Language code taken from the raw record, if
            any.

    Returns:
        The code stripped of surrounding whitespace and lower-cased, or
        ``"unknown"`` when the code is missing, blank or not a string.
    """
    # Non-string values (e.g. a list of codes) have no .lower(); report them
    # as unknown instead of raising AttributeError.
    if not isinstance(original_language_code, str):
        return "unknown"

    code = original_language_code.strip().lower()
    return code if code else "unknown"
180
+
181
+
182
+ # --- Main Annotation Function ---
183
def main():
    """
    Load the raw book data, annotate every record and save the result.

    Reads the JSON list at ``RAW_PATH``, derives genres, target audience,
    era, decade, mood and language for each record, and writes the annotated
    records to ``OUT_PATH`` as UTF-8 JSON.

    Raises:
        FileNotFoundError: If ``RAW_PATH`` does not exist.
    """
    if not RAW_PATH.exists():
        raise FileNotFoundError(f"Input file not found: {RAW_PATH}. Please ensure the crawler has run.")

    print(f"Loading raw book data from: {RAW_PATH}")
    try:
        with open(RAW_PATH, "r", encoding="utf-8") as f:
            books_raw = json.load(f)
    except json.JSONDecodeError as e:
        # Malformed input is reported and the run is aborted without output.
        print(f"Error decoding JSON from {RAW_PATH}: {e}")
        return

    annotated_books = []
    print(f"Annotating {len(books_raw)} book records...")
    for b in books_raw:
        if not isinstance(b, dict):
            # Skip malformed entries instead of failing on .get().
            continue

        # "subjects" may be present but null; normalize to a list so the
        # annotators can always iterate/join it.
        subjects = b.get("subjects") or []
        first_publish_year = b.get("first_publish_year")
        try:
            year_int = int(first_publish_year) if first_publish_year else 0
        except (TypeError, ValueError):
            # 0 marks "no usable year"; TypeError covers non-numeric types.
            year_int = 0

        annotated_books.append({
            "title": b.get("title", ""),
            "authors": b.get("authors", []),
            "first_publish_year": year_int,
            "subjects": subjects,
            "cover_url": b.get("cover_url", ""),
            "source_key": b.get("source_key", ""),
            "genres": assign_genres(subjects),
            "target_audience": assign_target_audience(subjects),
            "era": assign_era(year_int),
            "decade": assign_decade(year_int),
            "mood": assign_mood(subjects),
            "language": assign_language(b.get("language_code")),
        })

    print(f"Saving annotated data to: {OUT_PATH}")
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(annotated_books, f, ensure_ascii=False, indent=2)

    print(f" Book annotation complete. Annotated {len(annotated_books)} records.")
    print(f"Output saved to: {OUT_PATH}")


if __name__ == "__main__":
    main()