badminton001 committed on
Commit
5bf6c44
·
verified ·
1 Parent(s): 3ef5bd7

Update preprocessing/preprocess_50000_movies.py

Browse files
Files changed (1) hide show
  1. preprocessing/preprocess_50000_movies.py +167 -170
preprocessing/preprocess_50000_movies.py CHANGED
@@ -1,171 +1,168 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- preprocessing/preprocess_50000_movies.py
6
-
7
- Preprocesses annotated TMDb movie data using NLTK:
8
- - Tokenizes titles and overviews (word_tokenize)
9
- - Converts text to lowercase
10
- - Removes English stopwords
11
- - Keeps only alphabetic tokens
12
- - Normalizes various tag fields (genres, mood, target_audience, era, decade, language)
13
- - Flattens director and cast lists for CSV output
14
-
15
- Input:
16
- - data/movie/annotated/movies_annotated_50000.json
17
-
18
- Outputs:
19
- - data/movie/preprocessed/movies_preprocessed_50000.json
20
- - data/movie/preprocessed/movies_preprocessed_50000.csv
21
- """
22
-
23
- import json
24
- import csv
25
- import os
26
- import re
27
- import nltk
28
- from nltk.tokenize import word_tokenize
29
- from nltk.corpus import stopwords
30
- from pathlib import Path
31
- from typing import List, Dict, Any, Optional
32
-
33
- # --- NLTK Downloads (Uncomment if not already downloaded) ---
34
- # try:
35
- # nltk.data.find('tokenizers/punkt')
36
- # except nltk.downloader.DownloadError:
37
- # nltk.download('punkt')
38
- # try:
39
- # nltk.data.find('corpora/stopwords')
40
- # except nltk.downloader.DownloadError:
41
- # nltk.download('stopwords')
42
-
43
- # --- File Paths Configuration ---
44
- # Input annotated movies JSON file (updated for 50,000 records)
45
- INPUT_JSON = "data/movie/annotated/movies_annotated_50000.json"
46
- # Output preprocessed movies JSON file (updated for 50,000 records)
47
- OUTPUT_JSON = "data/movie/preprocessed/movies_preprocessed_50000.json"
48
- # Output preprocessed movies CSV file (for inspection, updated for 50,000 records)
49
- OUTPUT_CSV = "data/movie/preprocessed/movies_preprocessed_50000.csv"
50
-
51
- # Ensure output directory exists
52
- Path(OUTPUT_JSON).parent.mkdir(parents=True, exist_ok=True)
53
-
54
- # --- Global Stopword Set ---
55
- # Using a set for efficient lookup
56
- STOPWORDS = set(stopwords.words('english'))
57
-
58
-
59
- # Optional: Add custom stopwords relevant to movie data if needed (e.g., "film", "movie", "story")
60
- # CUSTOM_STOPWORDS = {"film", "movie", "story"}
61
- # STOPWORDS.update(CUSTOM_STOPWORDS)
62
-
63
- # --- Text Cleaning Function ---
64
- def clean_text(text: Optional[str]) -> List[str]:
65
- """
66
- Cleans and tokenizes text:
67
- - Handles None input
68
- - Converts to lowercase
69
- - Removes URLs and HTML tags
70
- - Tokenizes using NLTK's word_tokenize
71
- - Filters out non-alphabetic tokens and stopwords
72
- """
73
- text = text or ""
74
- text = text.lower()
75
- text = re.sub(r'http\S+', ' ', text)
76
- text = re.sub(r'<[^>]+>', ' ', text)
77
- text = re.sub(r'[^a-z\s]', ' ', text)
78
-
79
- tokens = word_tokenize(text)
80
- return [t for t in tokens if t.isalpha() and t not in STOPWORDS]
81
-
82
-
83
- # --- List Normalization Function ---
84
- def normalize_list_tags(tags: Optional[List[str]]) -> List[str]:
85
- """
86
- Normalizes a list of string tags:
87
- - Handles None input
88
- - Strips whitespace
89
- - Converts to lowercase
90
- - Filters out empty strings
91
- - Ensures elements are strings before processing
92
- """
93
- if tags is None:
94
- return []
95
- return [str(t).strip().lower() for t in tags if t is not None and isinstance(t, str) and str(t).strip()]
96
-
97
-
98
- # --- Main Preprocessing Function ---
99
- def main():
100
- """
101
- Main function to load annotated movie data, preprocess it,
102
- and save the results to JSON and CSV files.
103
- """
104
- if not os.path.exists(INPUT_JSON):
105
- raise FileNotFoundError(f"Cannot find input file: {INPUT_JSON}. Please ensure annotation is complete.")
106
-
107
- print(f"Loading annotated movie data from: {INPUT_JSON}")
108
- try:
109
- with open(INPUT_JSON, 'r', encoding='utf-8') as f:
110
- movies_annotated = json.load(f)
111
- except json.JSONDecodeError as e:
112
- print(f"Error decoding JSON from {INPUT_JSON}: {e}")
113
- return
114
-
115
- processed_movies = []
116
- print(f"Preprocessing {len(movies_annotated)} movie records...")
117
- for rec in movies_annotated:
118
- processed_rec = dict(rec)
119
-
120
- processed_rec['title_tokens'] = clean_text(rec.get('title'))
121
- processed_rec['overview_tokens'] = clean_text(rec.get('overview'))
122
-
123
- processed_rec['genres'] = normalize_list_tags(rec.get('genres'))
124
- processed_rec['mood'] = normalize_list_tags(rec.get('mood'))
125
- processed_rec['cast'] = normalize_list_tags(rec.get('cast'))
126
-
127
- processed_rec['target_audience'] = rec.get('target_audience', '').strip().lower().replace(' ', '_')
128
- processed_rec['era'] = rec.get('era', '').strip().lower().replace(' ', '_')
129
- processed_rec['decade'] = rec.get('decade', '').strip().lower()
130
- processed_rec['language'] = rec.get('language', '').strip().lower()
131
-
132
- director_value = rec.get('director')
133
- if director_value is not None:
134
- processed_rec['director'] = str(director_value).strip().lower()
135
- else:
136
- processed_rec['director'] = ''
137
-
138
- processed_movies.append(processed_rec)
139
-
140
- print(f"Saving preprocessed data to: {OUTPUT_JSON}")
141
- with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
142
- json.dump(processed_movies, f, ensure_ascii=False, indent=2)
143
-
144
- fieldnames = [
145
- 'tmdb_id', 'title',
146
- 'title_tokens', 'overview_tokens',
147
- 'genres', 'mood', 'target_audience',
148
- 'era', 'decade', 'language',
149
- 'director', 'cast'
150
- ]
151
-
152
- print(f"Saving preprocessed data to: {OUTPUT_CSV}")
153
- with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
154
- writer = csv.DictWriter(f, fieldnames=fieldnames)
155
- writer.writeheader()
156
- for rec in processed_movies:
157
- row = {k: rec.get(k, "") for k in fieldnames}
158
- row['title_tokens'] = ' '.join(row['title_tokens'])
159
- row['overview_tokens'] = ' '.join(row['overview_tokens'])
160
- row['genres'] = ';'.join(row['genres'])
161
- row['mood'] = ';'.join(row['mood'])
162
- row['cast'] = ';'.join(row['cast'])
163
-
164
- writer.writerow(row)
165
-
166
- print(f"✅ Movie preprocessing complete. Processed {len(processed_movies)} records.")
167
- print(f"Outputs:\n - {OUTPUT_JSON}\n - {OUTPUT_CSV}")
168
-
169
-
170
- if __name__ == '__main__':
171
  main()
 
1
+ """
2
+ preprocessing/preprocess_50000_movies.py
3
+
4
+ Preprocesses annotated TMDb movie data using NLTK:
5
+ - Tokenizes titles and overviews (word_tokenize)
6
+ - Converts text to lowercase
7
+ - Removes English stopwords
8
+ - Keeps only alphabetic tokens
9
+ - Normalizes various tag fields (genres, mood, target_audience, era, decade, language)
10
+ - Flattens director and cast lists for CSV output
11
+
12
+ Input:
13
+ - data/movie/annotated/movies_annotated_50000.json
14
+
15
+ Outputs:
16
+ - data/movie/preprocessed/movies_preprocessed_50000.json
17
+ - data/movie/preprocessed/movies_preprocessed_50000.csv
18
+ """
19
+
20
+ import json
21
+ import csv
22
+ import os
23
+ import re
24
+ import nltk
25
+ from nltk.tokenize import word_tokenize
26
+ from nltk.corpus import stopwords
27
+ from pathlib import Path
28
+ from typing import List, Dict, Any, Optional
29
+
30
+ # --- NLTK Downloads (Uncomment if not already downloaded) ---
+ # try:
+ #     nltk.data.find('tokenizers/punkt')
+ # except LookupError:  # nltk.data.find raises LookupError (not nltk.downloader.DownloadError)
+ #     nltk.download('punkt')
+ # try:
+ #     nltk.data.find('corpora/stopwords')
+ # except LookupError:
+ #     nltk.download('stopwords')
39
+
40
+ # --- File Paths Configuration ---
41
+ # Input annotated movies JSON file (updated for 50,000 records)
42
+ INPUT_JSON = "data/movie/annotated/movies_annotated_50000.json"
43
+ # Output preprocessed movies JSON file (updated for 50,000 records)
44
+ OUTPUT_JSON = "data/movie/preprocessed/movies_preprocessed_50000.json"
45
+ # Output preprocessed movies CSV file (for inspection, updated for 50,000 records)
46
+ OUTPUT_CSV = "data/movie/preprocessed/movies_preprocessed_50000.csv"
47
+
48
+ # Ensure output directory exists
49
+ Path(OUTPUT_JSON).parent.mkdir(parents=True, exist_ok=True)
50
+
51
+ # --- Global Stopword Set ---
52
+ # Using a set for efficient lookup
53
+ STOPWORDS = set(stopwords.words('english'))
54
+
55
+
56
+ # Optional: Add custom stopwords relevant to movie data if needed (e.g., "film", "movie", "story")
57
+ # CUSTOM_STOPWORDS = {"film", "movie", "story"}
58
+ # STOPWORDS.update(CUSTOM_STOPWORDS)
59
+
60
# --- Text Cleaning Function ---
def clean_text(text: Optional[str]) -> List[str]:
    """
    Clean and tokenize a piece of free text.

    Processing steps:
    - Treats None as an empty string
    - Lowercases the input
    - Blanks out URLs, HTML tags, and any character outside a-z/whitespace
    - Splits into tokens with NLTK's word_tokenize
    - Keeps only alphabetic tokens that are not English stopwords

    Returns:
        The surviving tokens as a list of lowercase strings.
    """
    lowered = (text or "").lower()
    # Each pattern is replaced with a space so adjacent words stay separated.
    for pattern in (r'http\S+', r'<[^>]+>', r'[^a-z\s]'):
        lowered = re.sub(pattern, ' ', lowered)
    return [
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in STOPWORDS
    ]
78
+
79
+
80
# --- List Normalization Function ---
def normalize_list_tags(tags: Optional[List[str]]) -> List[str]:
    """
    Normalize a list of string tags.

    - Returns [] for None input
    - Keeps only string elements (non-string elements, including None, are dropped)
    - Strips surrounding whitespace and lowercases each kept tag
    - Filters out tags that are empty after stripping

    Args:
        tags: The raw tag list, or None.

    Returns:
        A list of cleaned, lowercase, non-empty tag strings.
    """
    if tags is None:
        return []
    # isinstance(t, str) already excludes None and non-strings, so the
    # previous `t is not None` guard and str(t) conversions were redundant.
    return [t.strip().lower() for t in tags if isinstance(t, str) and t.strip()]
93
+
94
+
95
# --- Main Preprocessing Function ---
def _norm_scalar(value: Any) -> str:
    """Coerce a possibly-missing scalar field to a stripped, lowercase string ('' for None)."""
    return str(value).strip().lower() if value is not None else ''


def main():
    """
    Load annotated movie data, preprocess it, and save the results
    to JSON and CSV files.

    Raises:
        FileNotFoundError: if the annotated input JSON does not exist.
    """
    if not os.path.exists(INPUT_JSON):
        raise FileNotFoundError(f"Cannot find input file: {INPUT_JSON}. Please ensure annotation is complete.")

    print(f"Loading annotated movie data from: {INPUT_JSON}")
    try:
        with open(INPUT_JSON, 'r', encoding='utf-8') as f:
            movies_annotated = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {INPUT_JSON}: {e}")
        return

    processed_movies = []
    print(f"Preprocessing {len(movies_annotated)} movie records...")
    for rec in movies_annotated:
        processed_rec = dict(rec)

        # Tokenized free-text fields.
        processed_rec['title_tokens'] = clean_text(rec.get('title'))
        processed_rec['overview_tokens'] = clean_text(rec.get('overview'))

        # List-valued tag fields.
        processed_rec['genres'] = normalize_list_tags(rec.get('genres'))
        processed_rec['mood'] = normalize_list_tags(rec.get('mood'))
        processed_rec['cast'] = normalize_list_tags(rec.get('cast'))

        # Scalar tag fields. BUG FIX: rec.get(key, '') returns None when the
        # key exists with a JSON null value (the default only covers missing
        # keys), which crashed on .strip(); _norm_scalar handles both missing
        # keys and explicit nulls, and tolerates non-string scalars.
        processed_rec['target_audience'] = _norm_scalar(rec.get('target_audience')).replace(' ', '_')
        processed_rec['era'] = _norm_scalar(rec.get('era')).replace(' ', '_')
        processed_rec['decade'] = _norm_scalar(rec.get('decade'))
        processed_rec['language'] = _norm_scalar(rec.get('language'))
        processed_rec['director'] = _norm_scalar(rec.get('director'))

        processed_movies.append(processed_rec)

    print(f"Saving preprocessed data to: {OUTPUT_JSON}")
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(processed_movies, f, ensure_ascii=False, indent=2)

    fieldnames = [
        'tmdb_id', 'title',
        'title_tokens', 'overview_tokens',
        'genres', 'mood', 'target_audience',
        'era', 'decade', 'language',
        'director', 'cast'
    ]

    print(f"Saving preprocessed data to: {OUTPUT_CSV}")
    with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for rec in processed_movies:
            row = {k: rec.get(k, "") for k in fieldnames}
            # Flatten token/tag lists so each CSV cell is a plain string.
            row['title_tokens'] = ' '.join(row['title_tokens'])
            row['overview_tokens'] = ' '.join(row['overview_tokens'])
            row['genres'] = ';'.join(row['genres'])
            row['mood'] = ';'.join(row['mood'])
            row['cast'] = ';'.join(row['cast'])

            writer.writerow(row)

    print(f"✅ Movie preprocessing complete. Processed {len(processed_movies)} records.")
    print(f"Outputs:\n - {OUTPUT_JSON}\n - {OUTPUT_CSV}")
if __name__ == "__main__":
    main()