Spaces:

badminton001
/

Recommender

Sleeping

App Files Community

badminton001 committed on Jun 29, 2025

Commit

3ef5bd7

verified ·

1 Parent(s): 9381930

Update preprocessing/annotate_books_50000.py

Browse files

Files changed (1) hide show

preprocessing/annotate_books_50000.py +232 -235

preprocessing/annotate_books_50000.py CHANGED Viewed

@@ -1,236 +1,233 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Annotation module for Open Library books.
-Annotates 50,000-book dataset with additional features:
-- Genres (based on subjects)
-- Mood (based on subjects)
-- Target Audience (based on subjects and reading levels)
-- Era (based on publish year)
-- Decade (based on publish year)
-- Language (currently defaults to unknown due to data limitations)
-Input:
- - data/book/raw/openlibrary_books_50000.json
-Output:
- - data/book/annotated/books_annotated_50000.json
-"""
-import os
-import json
-import re
-from pathlib import Path
-from datetime import datetime
-from typing import List, Dict, Any, Optional
-# --- Paths Configuration ---
-# Path to the raw Open Library books JSON file (updated for 50,000 records)
-RAW_PATH = Path(__file__).parent.parent / "data" / "book" / "raw" / "openlibrary_books_50000.json"
-# Path to the output annotated books JSON file (updated for 50,000 records)
-OUT_PATH = Path(__file__).parent.parent / "data" / "book" / "annotated" / "books_annotated_50000.json"
-# Ensure the output directory exists
-OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
-# --- Genre Keywords Mapping ---
-GENRE_KEYWORDS = {
-    "fiction": "Fiction",
-    "romance": "Romance",
-    "mystery": "Mystery",
-    "fantasy": "Fantasy",
-    "science fiction": "Science Fiction",
-    "history": "History",
-    "poetry": "Poetry",
-    "biography": "Biography",
-    "self-help": "Self-Help",
-    "horror": "Horror",
-    "children": "Children",
-    "young adult": "Young Adult",
-    "adventure": "Adventure",
-    "classic": "Classics",
-    "education": "Education",
-    "philosophy": "Philosophy",
-    "thriller": "Thriller",
-    "drama": "Drama",
-    "crime": "Crime",
-    "western": "Western"
-}
-# --- Mood Keywords Mapping ---
-# Expanded mood keywords to capture more nuances
-MOOD_KEYWORDS = {
-    "love": "Romantic",
-    "friend": "Heartwarming",
-    "family": "Heartwarming",
-    "ghost": "Supernatural",
-    "magic": "Magical",
-    "dark": "Dark",
-    "funny": "Humorous",
-    "sad": "Melancholic",
-    "mystery": "Suspenseful",
-    "hope": "Inspiring",
-    "thrill": "Thrilling",
-    "adventure": "Adventurous",
-    "peace": "Calming",
-    "joy": "Uplifting",
-    "fear": "Horrifying",
-    "anger": "Intense"
-}
-# --- Genre Assignment ---
-def assign_genres(subjects: List[str]) -> List[str]:
-    """
-    Assigns genre tags to a book based on keywords found in its subjects.
-    Returns a list of matching genres or ['General'] if none found.
-    """
-    tags = set()
-    for subj in subjects:
-        s = subj.lower()
-        for kw, label in GENRE_KEYWORDS.items():
-            if kw in s:
-                tags.add(label)
-    return sorted(list(tags)) if tags else ["General"]
-# --- Target Audience Assignment ---
-def assign_target_audience(subjects: List[str]) -> str:
-    """
-    Determines the target audience for books by checking explicit subjects and reading levels.
-    Prioritizes specific age-related keywords and grade levels.
-    """
-    joined_subjects = " ".join(subjects).lower()
-    # 1) Reading level grade (most specific)
-    match = re.search(r"grade\s*(\d+)", joined_subjects)
-    if match:
-        grade = int(match.group(1))
-        if grade <= 6:
-            return "children"
-        elif grade <= 12:
-            return "young_adult"
-        else:
-            return "adult"
-    # 2) Explicit keywords for Young Adult
-    if "young adult" in joined_subjects or "teen" in joined_subjects or "adolescent" in joined_subjects:
-        return "young_adult"
-    # 3) Explicit keywords for Children
-    if "juvenile" in joined_subjects or "children's" in joined_subjects or "kids" in joined_subjects:
-        return "children"
-    # 4) Fallback to adult if no specific audience is detected
-    return "adult"
-# --- Era Assignment ---
-def assign_era(year: Optional[int]) -> str:
-    """Categorizes books into eras based on their first publish year."""
-    try:
-        y = int(year) if year is not None else 0
-    except ValueError:
-        return "unknown"
-    current_year = datetime.now().year
-    if y < 1900:
-        return "classic"
-    elif y < 2000:
-        return "modern"
-    elif y < current_year - 5:
-        return "contemporary"
-    return "recent"
-# --- Decade Assignment ---
-def assign_decade(year: Optional[int]) -> str:
-    """Extracts the decade from the first publish year."""
-    try:
-        y = int(year) if year is not None else 0
-        if y == 0: return "unknown"
-        decade = (y // 10) * 10
-        return f"{(decade)}s"
-    except ValueError:
-        return "unknown"
-# --- Mood Assignment ---
-def assign_mood(subjects: List[str]) -> List[str]:
-    """
-    Assigns mood tags to a book based on keywords found in its subjects.
-    Returns a list of matching moods or ['Neutral'] if none found.
-    """
-    joined_subjects = " ".join(subjects).lower()
-    mood_tags = set()
-    for kw, label in MOOD_KEYWORDS.items():
-        if kw in joined_subjects:
-            mood_tags.add(label)
-    return sorted(list(mood_tags)) if mood_tags else ["Neutral"]
-# --- Language Assignment (Placeholder) ---
-def assign_language(original_language_code: Optional[str] = None) -> str:
-    """
-    Assigns a language. Placeholder: Open Library data often lacks reliable
-    explicit language information for books at this raw stage.
-    Future improvement: integrate a language detection library or use source-specific language info.
-    """
-    if original_language_code:
-        return original_language_code.lower()
-    return "unknown"
-# --- Main Annotation Function ---
-def main():
-    """
-    Main function to load raw book data, apply annotations,
-    and save the results to a JSON file.
-    """
-    if not RAW_PATH.exists():
-        raise FileNotFoundError(f"Input file not found: {RAW_PATH}. Please ensure the crawler has run.")
-    print(f"Loading raw book data from: {RAW_PATH}")
-    try:
-        with open(RAW_PATH, "r", encoding="utf-8") as f:
-            books_raw = json.load(f)
-    except json.JSONDecodeError as e:
-        print(f"Error decoding JSON from {RAW_PATH}: {e}")
-        return
-    annotated_books = []
-    print(f"Annotating {len(books_raw)} book records...")
-    for b in books_raw:
-        subjects = b.get("subjects", [])
-        first_publish_year = b.get("first_publish_year")
-        try:
-            year_int = int(first_publish_year) if first_publish_year else 0
-        except ValueError:
-            year_int = 0
-        annotated_books.append({
-            "title": b.get("title", ""),
-            "authors": b.get("authors", []),
-            "first_publish_year": year_int,
-            "subjects": subjects,
-            "cover_url": b.get("cover_url", ""),
-            "source_key": b.get("source_key", ""),
-            "genres": assign_genres(subjects),
-            "target_audience": assign_target_audience(subjects),
-            "era": assign_era(year_int),
-            "decade": assign_decade(year_int),
-            "mood": assign_mood(subjects),
-            "language": assign_language(b.get("language_code"))
-        })
-    print(f"Saving annotated data to: {OUT_PATH}")
-    with open(OUT_PATH, "w", encoding="utf-8") as f:
-        json.dump(annotated_books, f, ensure_ascii=False, indent=2)
-    print(f"✅ Book annotation complete. Annotated {len(annotated_books)} records.")
-    print(f"Output saved to: {OUT_PATH}")
-if __name__ == "__main__":
     main()

+"""
+Annotation module for Open Library books.
+Annotates 50,000-book dataset with additional features:
+- Genres (based on subjects)
+- Mood (based on subjects)
+- Target Audience (based on subjects and reading levels)
+- Era (based on publish year)
+- Decade (based on publish year)
+- Language (currently defaults to unknown due to data limitations)
+Input:
+ - data/book/raw/openlibrary_books_50000.json
+Output:
+ - data/book/annotated/books_annotated_50000.json
+"""
+import os
+import json
+import re
+from pathlib import Path
+from datetime import datetime
+from typing import List, Dict, Any, Optional
+# --- Paths Configuration ---
+# Both paths are anchored at the parent of this file's directory, under data/book/.
+# Path to the raw Open Library books JSON file (updated for 50,000 records)
+RAW_PATH = Path(__file__).parent.parent / "data" / "book" / "raw" / "openlibrary_books_50000.json"
+# Path to the output annotated books JSON file (updated for 50,000 records)
+OUT_PATH = Path(__file__).parent.parent / "data" / "book" / "annotated" / "books_annotated_50000.json"
+# Ensure the output directory exists
+# NOTE: this statement runs at import time, so importing the module already creates the directory.
+OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+# --- Genre Keywords Mapping ---
+# Maps a lowercase keyword to the genre label emitted when the keyword is found.
+# assign_genres tests `kw in subject.lower()`, i.e. plain substring containment,
+# so "biography" also matches "autobiography" and "fiction" also matches "nonfiction".
+GENRE_KEYWORDS = {
+    "fiction": "Fiction",
+    "romance": "Romance",
+    "mystery": "Mystery",
+    "fantasy": "Fantasy",
+    "science fiction": "Science Fiction",
+    "history": "History",
+    "poetry": "Poetry",
+    "biography": "Biography",
+    "self-help": "Self-Help",
+    "horror": "Horror",
+    "children": "Children",
+    "young adult": "Young Adult",
+    "adventure": "Adventure",
+    "classic": "Classics",
+    "education": "Education",
+    "philosophy": "Philosophy",
+    "thriller": "Thriller",
+    "drama": "Drama",
+    "crime": "Crime",
+    "western": "Western"
+}
+# --- Mood Keywords Mapping ---
+# Expanded mood keywords to capture more nuances
+# Maps a lowercase keyword to a mood label. Several keywords may share one
+# label (e.g. "friend" and "family" both map to "Heartwarming").
+MOOD_KEYWORDS = {
+    "love": "Romantic",
+    "friend": "Heartwarming",
+    "family": "Heartwarming",
+    "ghost": "Supernatural",
+    "magic": "Magical",
+    "dark": "Dark",
+    "funny": "Humorous",
+    "sad": "Melancholic",
+    "mystery": "Suspenseful",
+    "hope": "Inspiring",
+    "thrill": "Thrilling",
+    "adventure": "Adventurous",
+    "peace": "Calming",
+    "joy": "Uplifting",
+    "fear": "Horrifying",
+    "anger": "Intense"
+}
+# --- Genre Assignment ---
+def assign_genres(subjects: List[str]) -> List[str]:
+    """
+    Collect the genre labels whose keyword occurs in any subject string.
+
+    Matching is case-insensitive substring containment, so compound words
+    count too (e.g. "autobiography" yields "Biography").
+    Returns the labels sorted alphabetically, or ['General'] when nothing matches.
+    """
+    lowered = [subject.lower() for subject in subjects]
+    matched = {
+        label
+        for keyword, label in GENRE_KEYWORDS.items()
+        if any(keyword in text for text in lowered)
+    }
+    if not matched:
+        return ["General"]
+    return sorted(matched)
+# --- Target Audience Assignment ---
+def assign_target_audience(subjects: List[str]) -> str:
+    """
+    Determine the target audience from reading-level grades and age keywords.
+
+    Checks, in priority order:
+      1. an explicit "grade N" reading level
+         (N <= 6 -> "children", N <= 12 -> "young_adult", otherwise "adult");
+      2. young-adult keywords ("young adult", "teen...", "adolescent");
+      3. children keywords ("juvenile", "children's", "kids").
+    Falls back to "adult" when nothing matches.
+
+    "grade" and "teen" are matched on word boundaries, so subjects such as
+    "Nineteenth century" or "upgrade 2" are not mistaken for an age signal.
+
+    Returns one of "children", "young_adult" or "adult".
+    """
+    joined_subjects = " ".join(subjects).lower()
+    # 1) Reading level grade (most specific)
+    match = re.search(r"\bgrade\s*(\d+)", joined_subjects)
+    if match:
+        grade = int(match.group(1))
+        if grade <= 6:
+            return "children"
+        if grade <= 12:
+            return "young_adult"
+        return "adult"
+    # 2) Explicit keywords for Young Adult
+    if (
+        "young adult" in joined_subjects
+        or "adolescent" in joined_subjects
+        # Accepts teen, teens, teenage(d), teenager(s) and preteen,
+        # but not words that merely contain "teen" (thirteen, nineteenth).
+        or re.search(r"\b(?:pre)?teen(?:s|aged?|agers?)?\b", joined_subjects)
+    ):
+        return "young_adult"
+    # 3) Explicit keywords for Children
+    if "juvenile" in joined_subjects or "children's" in joined_subjects or "kids" in joined_subjects:
+        return "children"
+    # 4) Fallback to adult if no specific audience is detected
+    return "adult"
+# --- Era Assignment ---
+def assign_era(year: Optional[int]) -> str:
+    """
+    Categorize a book into an era based on its first publish year.
+
+    Returns "classic" (before 1900), "modern" (1900-1999), "recent" (within
+    the last five years of the current year) or "contemporary" (everything
+    in between). A missing, non-numeric or non-positive year yields "unknown".
+    """
+    try:
+        y = int(year) if year is not None else 0
+    except (TypeError, ValueError):
+        return "unknown"
+    # 0 stands for a missing year (assign_decade treats it the same way);
+    # without this guard it would fall through to "classic".
+    if y <= 0:
+        return "unknown"
+    current_year = datetime.now().year
+    if y < 1900:
+        return "classic"
+    if y < 2000:
+        return "modern"
+    if y < current_year - 5:
+        return "contemporary"
+    return "recent"
+# --- Decade Assignment ---
+def assign_decade(year: Optional[int]) -> str:
+    """
+    Return the decade label (e.g. "1980s") for a first publish year.
+
+    A missing, non-numeric or non-positive year yields "unknown".
+    """
+    try:
+        y = int(year) if year is not None else 0
+    except (TypeError, ValueError):
+        return "unknown"
+    # 0 is the "no year" placeholder; negative values would otherwise
+    # produce labels such as "-10s".
+    if y <= 0:
+        return "unknown"
+    return f"{y // 10 * 10}s"
+# --- Mood Assignment ---
+def assign_mood(subjects: List[str]) -> List[str]:
+    """
+    Assigns mood tags to a book based on keywords found in its subjects.
+
+    A keyword only counts when it starts a word, so "friend" still matches
+    "friendship" while "anger" no longer fires on "danger", nor "sad" on
+    "crusades".
+    Returns the matching moods sorted alphabetically, or ['Neutral'] if none found.
+    """
+    joined_subjects = " ".join(subjects).lower()
+    mood_tags = {
+        label
+        for kw, label in MOOD_KEYWORDS.items()
+        # \b anchors the keyword to the start of a word; the end is left open
+        # so inflected forms ("thriller", "loved") keep matching.
+        if re.search(r"\b" + re.escape(kw), joined_subjects)
+    }
+    return sorted(mood_tags) if mood_tags else ["Neutral"]
+# --- Language Assignment (Placeholder) ---
+def assign_language(original_language_code: Optional[str] = None) -> str:
+    """
+    Return the lower-cased language code, or "unknown" when none is given.
+
+    Placeholder: Open Library data often lacks reliable explicit language
+    information for books at this raw stage.
+    Future improvement: integrate a language detection library or use
+    source-specific language info.
+    """
+    return original_language_code.lower() if original_language_code else "unknown"
+# --- Main Annotation Function ---
+def main():
+    """
+    Load the raw book records, annotate each one and write the result as JSON.
+
+    Reads RAW_PATH, derives genres, target audience, era, decade, mood and
+    language for every record, and writes the annotated list to OUT_PATH.
+    If the input is not valid JSON, the error is printed and the function
+    returns without writing any output.
+
+    Raises:
+        FileNotFoundError: if RAW_PATH does not exist.
+    """
+    if not RAW_PATH.exists():
+        raise FileNotFoundError(f"Input file not found: {RAW_PATH}. Please ensure the crawler has run.")
+    print(f"Loading raw book data from: {RAW_PATH}")
+    try:
+        with open(RAW_PATH, "r", encoding="utf-8") as f:
+            books_raw = json.load(f)
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON from {RAW_PATH}: {e}")
+        return
+    annotated_books = []
+    print(f"Annotating {len(books_raw)} book records...")
+    for b in books_raw:
+        # dict.get only applies its default when the key is absent; an explicit
+        # JSON null would come back as None and break the assign_* helpers.
+        subjects = b.get("subjects") or []
+        first_publish_year = b.get("first_publish_year")
+        try:
+            year_int = int(first_publish_year) if first_publish_year else 0
+        except (TypeError, ValueError):
+            # Non-numeric or non-scalar year: fall back to the 0 placeholder.
+            year_int = 0
+        annotated_books.append({
+            "title": b.get("title", ""),
+            "authors": b.get("authors", []),
+            "first_publish_year": year_int,
+            "subjects": subjects,
+            "cover_url": b.get("cover_url", ""),
+            "source_key": b.get("source_key", ""),
+            "genres": assign_genres(subjects),
+            "target_audience": assign_target_audience(subjects),
+            "era": assign_era(year_int),
+            "decade": assign_decade(year_int),
+            "mood": assign_mood(subjects),
+            "language": assign_language(b.get("language_code"))
+        })
+    print(f"Saving annotated data to: {OUT_PATH}")
+    with open(OUT_PATH, "w", encoding="utf-8") as f:
+        json.dump(annotated_books, f, ensure_ascii=False, indent=2)
+    print(f"✅ Book annotation complete. Annotated {len(annotated_books)} records.")
+    print(f"Output saved to: {OUT_PATH}")
+if __name__ == "__main__":
     main()