badminton001 committed on
Commit
5bf6c44
·
verified ·
1 Parent(s): 3ef5bd7

Update preprocessing/preprocess_50000_movies.py

Browse files
Files changed (1) hide show
  1. preprocessing/preprocess_50000_movies.py +167 -170
preprocessing/preprocess_50000_movies.py CHANGED
@@ -1,171 +1,168 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- preprocessing/preprocess_50000_movies.py
6
-
7
- Preprocesses annotated TMDb movie data using NLTK:
8
- - Tokenizes titles and overviews (word_tokenize)
9
- - Converts text to lowercase
10
- - Removes English stopwords
11
- - Keeps only alphabetic tokens
12
- - Normalizes various tag fields (genres, mood, target_audience, era, decade, language)
13
- - Flattens director and cast lists for CSV output
14
-
15
- Input:
16
- - data/movie/annotated/movies_annotated_50000.json
17
-
18
- Outputs:
19
- - data/movie/preprocessed/movies_preprocessed_50000.json
20
- - data/movie/preprocessed/movies_preprocessed_50000.csv
21
- """
22
-
23
- import json
24
- import csv
25
- import os
26
- import re
27
- import nltk
28
- from nltk.tokenize import word_tokenize
29
- from nltk.corpus import stopwords
30
- from pathlib import Path
31
- from typing import List, Dict, Any, Optional
32
-
33
- # --- NLTK Downloads (Uncomment if not already downloaded) ---
34
- # try:
35
- # nltk.data.find('tokenizers/punkt')
36
- # except nltk.downloader.DownloadError:
37
- # nltk.download('punkt')
38
- # try:
39
- # nltk.data.find('corpora/stopwords')
40
- # except nltk.downloader.DownloadError:
41
- # nltk.download('stopwords')
42
-
43
- # --- File Paths Configuration ---
44
- # Input annotated movies JSON file (updated for 50,000 records)
45
- INPUT_JSON = "data/movie/annotated/movies_annotated_50000.json"
46
- # Output preprocessed movies JSON file (updated for 50,000 records)
47
- OUTPUT_JSON = "data/movie/preprocessed/movies_preprocessed_50000.json"
48
- # Output preprocessed movies CSV file (for inspection, updated for 50,000 records)
49
- OUTPUT_CSV = "data/movie/preprocessed/movies_preprocessed_50000.csv"
50
-
51
- # Ensure output directory exists
52
- Path(OUTPUT_JSON).parent.mkdir(parents=True, exist_ok=True)
53
-
54
- # --- Global Stopword Set ---
55
- # Using a set for efficient lookup
56
- STOPWORDS = set(stopwords.words('english'))
57
-
58
-
59
- # Optional: Add custom stopwords relevant to movie data if needed (e.g., "film", "movie", "story")
60
- # CUSTOM_STOPWORDS = {"film", "movie", "story"}
61
- # STOPWORDS.update(CUSTOM_STOPWORDS)
62
-
63
- # --- Text Cleaning Function ---
64
- def clean_text(text: Optional[str]) -> List[str]:
65
- """
66
- Cleans and tokenizes text:
67
- - Handles None input
68
- - Converts to lowercase
69
- - Removes URLs and HTML tags
70
- - Tokenizes using NLTK's word_tokenize
71
- - Filters out non-alphabetic tokens and stopwords
72
- """
73
- text = text or ""
74
- text = text.lower()
75
- text = re.sub(r'http\S+', ' ', text)
76
- text = re.sub(r'<[^>]+>', ' ', text)
77
- text = re.sub(r'[^a-z\s]', ' ', text)
78
-
79
- tokens = word_tokenize(text)
80
- return [t for t in tokens if t.isalpha() and t not in STOPWORDS]
81
-
82
-
83
- # --- List Normalization Function ---
84
- def normalize_list_tags(tags: Optional[List[str]]) -> List[str]:
85
- """
86
- Normalizes a list of string tags:
87
- - Handles None input
88
- - Strips whitespace
89
- - Converts to lowercase
90
- - Filters out empty strings
91
- - Ensures elements are strings before processing
92
- """
93
- if tags is None:
94
- return []
95
- return [str(t).strip().lower() for t in tags if t is not None and isinstance(t, str) and str(t).strip()]
96
-
97
-
98
- # --- Main Preprocessing Function ---
99
- def main():
100
- """
101
- Main function to load annotated movie data, preprocess it,
102
- and save the results to JSON and CSV files.
103
- """
104
- if not os.path.exists(INPUT_JSON):
105
- raise FileNotFoundError(f"Cannot find input file: {INPUT_JSON}. Please ensure annotation is complete.")
106
-
107
- print(f"Loading annotated movie data from: {INPUT_JSON}")
108
- try:
109
- with open(INPUT_JSON, 'r', encoding='utf-8') as f:
110
- movies_annotated = json.load(f)
111
- except json.JSONDecodeError as e:
112
- print(f"Error decoding JSON from {INPUT_JSON}: {e}")
113
- return
114
-
115
- processed_movies = []
116
- print(f"Preprocessing {len(movies_annotated)} movie records...")
117
- for rec in movies_annotated:
118
- processed_rec = dict(rec)
119
-
120
- processed_rec['title_tokens'] = clean_text(rec.get('title'))
121
- processed_rec['overview_tokens'] = clean_text(rec.get('overview'))
122
-
123
- processed_rec['genres'] = normalize_list_tags(rec.get('genres'))
124
- processed_rec['mood'] = normalize_list_tags(rec.get('mood'))
125
- processed_rec['cast'] = normalize_list_tags(rec.get('cast'))
126
-
127
- processed_rec['target_audience'] = rec.get('target_audience', '').strip().lower().replace(' ', '_')
128
- processed_rec['era'] = rec.get('era', '').strip().lower().replace(' ', '_')
129
- processed_rec['decade'] = rec.get('decade', '').strip().lower()
130
- processed_rec['language'] = rec.get('language', '').strip().lower()
131
-
132
- director_value = rec.get('director')
133
- if director_value is not None:
134
- processed_rec['director'] = str(director_value).strip().lower()
135
- else:
136
- processed_rec['director'] = ''
137
-
138
- processed_movies.append(processed_rec)
139
-
140
- print(f"Saving preprocessed data to: {OUTPUT_JSON}")
141
- with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
142
- json.dump(processed_movies, f, ensure_ascii=False, indent=2)
143
-
144
- fieldnames = [
145
- 'tmdb_id', 'title',
146
- 'title_tokens', 'overview_tokens',
147
- 'genres', 'mood', 'target_audience',
148
- 'era', 'decade', 'language',
149
- 'director', 'cast'
150
- ]
151
-
152
- print(f"Saving preprocessed data to: {OUTPUT_CSV}")
153
- with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
154
- writer = csv.DictWriter(f, fieldnames=fieldnames)
155
- writer.writeheader()
156
- for rec in processed_movies:
157
- row = {k: rec.get(k, "") for k in fieldnames}
158
- row['title_tokens'] = ' '.join(row['title_tokens'])
159
- row['overview_tokens'] = ' '.join(row['overview_tokens'])
160
- row['genres'] = ';'.join(row['genres'])
161
- row['mood'] = ';'.join(row['mood'])
162
- row['cast'] = ';'.join(row['cast'])
163
-
164
- writer.writerow(row)
165
-
166
- print(f"✅ Movie preprocessing complete. Processed {len(processed_movies)} records.")
167
- print(f"Outputs:\n - {OUTPUT_JSON}\n - {OUTPUT_CSV}")
168
-
169
-
170
- if __name__ == '__main__':
171
  main()
 
1
+ """
2
+ preprocessing/preprocess_50000_movies.py
3
+
4
+ Preprocesses annotated TMDb movie data using NLTK:
5
+ - Tokenizes titles and overviews (word_tokenize)
6
+ - Converts text to lowercase
7
+ - Removes English stopwords
8
+ - Keeps only alphabetic tokens
9
+ - Normalizes various tag fields (genres, mood, target_audience, era, decade, language)
10
+ - Flattens director and cast lists for CSV output
11
+
12
+ Input:
13
+ - data/movie/annotated/movies_annotated_50000.json
14
+
15
+ Outputs:
16
+ - data/movie/preprocessed/movies_preprocessed_50000.json
17
+ - data/movie/preprocessed/movies_preprocessed_50000.csv
18
+ """
19
+
20
+ import json
21
+ import csv
22
+ import os
23
+ import re
24
+ import nltk
25
+ from nltk.tokenize import word_tokenize
26
+ from nltk.corpus import stopwords
27
+ from pathlib import Path
28
+ from typing import List, Dict, Any, Optional
29
+
30
+ # --- NLTK Downloads (Uncomment if not already downloaded) ---
+ # try:
+ #     nltk.data.find('tokenizers/punkt')
+ # except LookupError:  # nltk.data.find raises LookupError (not nltk.downloader.DownloadError)
+ #     nltk.download('punkt')
+ # try:
+ #     nltk.data.find('corpora/stopwords')
+ # except LookupError:
+ #     nltk.download('stopwords')
39
+
40
+ # --- File Paths Configuration ---
41
+ # Input annotated movies JSON file (updated for 50,000 records)
42
+ INPUT_JSON = "data/movie/annotated/movies_annotated_50000.json"
43
+ # Output preprocessed movies JSON file (updated for 50,000 records)
44
+ OUTPUT_JSON = "data/movie/preprocessed/movies_preprocessed_50000.json"
45
+ # Output preprocessed movies CSV file (for inspection, updated for 50,000 records)
46
+ OUTPUT_CSV = "data/movie/preprocessed/movies_preprocessed_50000.csv"
47
+
48
+ # Ensure output directory exists
49
+ Path(OUTPUT_JSON).parent.mkdir(parents=True, exist_ok=True)
50
+
51
+ # --- Global Stopword Set ---
52
+ # Using a set for efficient lookup
53
+ STOPWORDS = set(stopwords.words('english'))
54
+
55
+
56
+ # Optional: Add custom stopwords relevant to movie data if needed (e.g., "film", "movie", "story")
57
+ # CUSTOM_STOPWORDS = {"film", "movie", "story"}
58
+ # STOPWORDS.update(CUSTOM_STOPWORDS)
59
+
60
# --- Text Cleaning Function ---
def clean_text(text: Optional[str]) -> List[str]:
    """
    Clean and tokenize a piece of free text.

    Processing steps:
    - Treats None as an empty string
    - Lowercases the input
    - Blanks out URLs, HTML tags, and any character outside a-z/whitespace
    - Splits into tokens with NLTK's word_tokenize
    - Keeps only alphabetic tokens that are not English stopwords

    Returns:
        The surviving tokens as a list of lowercase strings.
    """
    lowered = (text or "").lower()
    # Each pattern is replaced with a space so adjacent words stay separated.
    for pattern in (r'http\S+', r'<[^>]+>', r'[^a-z\s]'):
        lowered = re.sub(pattern, ' ', lowered)
    return [
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in STOPWORDS
    ]
78
+
79
+
80
# --- List Normalization Function ---
def normalize_list_tags(tags: Optional[List[str]]) -> List[str]:
    """
    Normalize a list of string tags.

    - Returns [] for None input
    - Keeps only string elements (non-string elements, including None, are dropped)
    - Strips surrounding whitespace and lowercases each kept tag
    - Filters out tags that are empty after stripping

    Args:
        tags: The raw tag list, or None.

    Returns:
        A list of cleaned, lowercase, non-empty tag strings.
    """
    if tags is None:
        return []
    # isinstance(t, str) already excludes None and non-strings, so the
    # previous `t is not None` guard and str(t) conversions were redundant.
    return [t.strip().lower() for t in tags if isinstance(t, str) and t.strip()]
93
+
94
+
95
# --- Main Preprocessing Function ---
def _norm_scalar(value: Any) -> str:
    """Coerce a possibly-missing scalar field to a stripped, lowercase string ('' for None)."""
    return str(value).strip().lower() if value is not None else ''


def main():
    """
    Load annotated movie data, preprocess it, and save the results
    to JSON and CSV files.

    Raises:
        FileNotFoundError: if the annotated input JSON does not exist.
    """
    if not os.path.exists(INPUT_JSON):
        raise FileNotFoundError(f"Cannot find input file: {INPUT_JSON}. Please ensure annotation is complete.")

    print(f"Loading annotated movie data from: {INPUT_JSON}")
    try:
        with open(INPUT_JSON, 'r', encoding='utf-8') as f:
            movies_annotated = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {INPUT_JSON}: {e}")
        return

    processed_movies = []
    print(f"Preprocessing {len(movies_annotated)} movie records...")
    for rec in movies_annotated:
        processed_rec = dict(rec)

        # Tokenized free-text fields.
        processed_rec['title_tokens'] = clean_text(rec.get('title'))
        processed_rec['overview_tokens'] = clean_text(rec.get('overview'))

        # List-valued tag fields.
        processed_rec['genres'] = normalize_list_tags(rec.get('genres'))
        processed_rec['mood'] = normalize_list_tags(rec.get('mood'))
        processed_rec['cast'] = normalize_list_tags(rec.get('cast'))

        # Scalar tag fields. BUG FIX: rec.get(key, '') returns None when the
        # key exists with a JSON null value (the default only covers missing
        # keys), which crashed on .strip(); _norm_scalar handles both missing
        # keys and explicit nulls, and tolerates non-string scalars.
        processed_rec['target_audience'] = _norm_scalar(rec.get('target_audience')).replace(' ', '_')
        processed_rec['era'] = _norm_scalar(rec.get('era')).replace(' ', '_')
        processed_rec['decade'] = _norm_scalar(rec.get('decade'))
        processed_rec['language'] = _norm_scalar(rec.get('language'))
        processed_rec['director'] = _norm_scalar(rec.get('director'))

        processed_movies.append(processed_rec)

    print(f"Saving preprocessed data to: {OUTPUT_JSON}")
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(processed_movies, f, ensure_ascii=False, indent=2)

    fieldnames = [
        'tmdb_id', 'title',
        'title_tokens', 'overview_tokens',
        'genres', 'mood', 'target_audience',
        'era', 'decade', 'language',
        'director', 'cast'
    ]

    print(f"Saving preprocessed data to: {OUTPUT_CSV}")
    with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for rec in processed_movies:
            row = {k: rec.get(k, "") for k in fieldnames}
            # Flatten token/tag lists so each CSV cell is a plain string.
            row['title_tokens'] = ' '.join(row['title_tokens'])
            row['overview_tokens'] = ' '.join(row['overview_tokens'])
            row['genres'] = ';'.join(row['genres'])
            row['mood'] = ';'.join(row['mood'])
            row['cast'] = ';'.join(row['cast'])

            writer.writerow(row)

    print(f"✅ Movie preprocessing complete. Processed {len(processed_movies)} records.")
    print(f"Outputs:\n - {OUTPUT_JSON}\n - {OUTPUT_CSV}")
if __name__ == "__main__":
    main()