Simon Clematide committed on
Commit
4d23e7a
·
1 Parent(s): f9c9b95

Add CLI script for processing JSON and JSONL files with text field extraction and exclusion handling

Browse files
Files changed (1) hide show
  1. sdg_predict/cli_zora2text.py +251 -0
sdg_predict/cli_zora2text.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ import os
4
+ import logging
5
+ from typing import Any
6
+ import re
7
+
8
# Configure logging: timestamped INFO-level messages (default stderr handler).
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
12
+
13
+
14
def process_jsonl(input_file: str, output_file: str) -> None:
    """
    Process a JSONL file to add a 'text' field combining 'title' and 'description'.

    Entries whose resulting 'text' is empty (after stripping) are not written
    to the output file.

    Args:
        input_file (str): Path to the input JSONL file.
        output_file (str): Path to the output JSONL file.
    """
    logging.info(f"Processing JSONL file: {input_file}")
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            entry: dict[str, Any] = json.loads(line)
            title: str = entry.get("title", "")
            description: str = entry.get("description", "")
            if not title and not description:
                logging.warning(
                    f"File '{input_file}' contains an entry with no title and no"
                    " description."
                )
            # Combine whichever parts are present. Previously a title-only
            # entry fell back to the (empty) description, producing an empty
            # 'text' and silently dropping the entry.
            if title and description:
                entry["text"] = f"{title}: {description}"
            else:
                entry["text"] = title or description
            if entry["text"].strip():  # Only write entries with non-empty 'text'
                outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing JSONL file: {input_file}")
    logging.info(f"Output file saved at: {output_file}")
40
+
41
+
42
def load_exclusion_ids(exclusion_file: str) -> set:
    """
    Load exclusion IDs from a JSONL file.

    Args:
        exclusion_file (str): Path to the JSONL file containing exclusion IDs.

    Returns:
        set: A set of IDs to exclude.
    """
    # Collect the 'id' value of every record that has one; records without
    # an 'id' key are ignored.
    with open(exclusion_file, "r") as file:
        return {record["id"] for record in map(json.loads, file) if "id" in record}
59
+
60
+
61
def process_directory(
    input_dir: str, output_file: str, exclusion_file: str = None
) -> None:
    """
    Process all JSON files in a directory to add a 'text' field and write to a single JSONL file.

    Args:
        input_dir (str): Path to the input directory containing JSON files.
        output_file (str): Path to the output JSONL file to save processed entries.
        exclusion_file (str, optional): Path to a JSONL file containing IDs to exclude.

    Note:
        All processed entries from the JSON files in the directory will be combined
        and written into a single JSONL file specified by `output_file`.
        Files without a usable ID, files whose ID is excluded, and files with an
        empty title and description are skipped and counted as excluded.
    """

    def _as_list(value):
        # Dublin Core fields may hold a single scalar or a list; normalize so
        # `", ".join` never iterates over the characters of a bare string.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def _extract_id(identifiers):
        # Accept both ".../id/eprint/<id>/..." and the short ".../<id>" URL forms.
        for identifier in identifiers:
            if "https://www.zora.uzh.ch/id/eprint/" in identifier:
                return identifier.split("/id/eprint/")[-1].split("/")[0]
            # e.g. "https://www.zora.uzh.ch/140521" (dots escaped in the pattern)
            match = re.match(r"https://www\.zora\.uzh\.ch/(\d+)", identifier)
            if match:
                return match.group(1)
        return None

    logging.info(f"Processing directory: {input_dir}")
    total_converted = 0
    total_excluded = 0

    exclusion_ids = load_exclusion_ids(exclusion_file) if exclusion_file else set()

    with open(output_file, "w") as outfile:
        for filename in os.listdir(input_dir):
            if not filename.endswith(".json"):
                continue
            input_file: str = os.path.join(input_dir, filename)
            logging.info(f"Processing file: {input_file}")
            with open(input_file, "r") as infile:
                data = json.load(infile)  # Each file holds a single JSON object
            if not isinstance(data, dict):
                logging.warning(
                    f"File '{input_file}' does not contain a valid JSON object."
                    " Skipping."
                )
                total_excluded += 1
                continue

            # Metadata is nested under the 'ns0:dc' (Dublin Core) element.
            dc_data = data.get("ns0:dc", {})

            title = dc_data.get("dc:title", "")
            description = dc_data.get("dc:description", "")
            creator = ", ".join(_as_list(dc_data.get("dc:creator", [])))
            subject = ", ".join(_as_list(dc_data.get("dc:subject", [])))
            publisher = dc_data.get("dc:publisher", "")
            date = dc_data.get("dc:date", "")

            # Extract the numeric ZORA identifier and construct the OAI 'id'.
            id_value = _extract_id(_as_list(dc_data.get("dc:identifier", [])))
            if not id_value:
                logging.warning(
                    f"File '{input_file}' does not contain a valid ID."
                    " Skipping."
                )
                total_excluded += 1
                continue
            id_field = f"oai:www.zora.uzh.ch:{id_value}"

            # Check if the ID is in the exclusion list
            if id_field in exclusion_ids:
                logging.info(f"Excluding file with ID: {id_field}")
                total_excluded += 1
                continue

            # Build 'text' only from the parts that exist. The previous
            # f"{title}: {description}".strip() produced a spurious truthy
            # ":" for documents with empty title and description, and a
            # leading ": " when only a description was present.
            if title and description:
                text = f"{title}: {description}"
            else:
                text = (title or description).strip()

            if text:
                entry = {
                    "id": id_field,
                    "title": title,
                    "description": description,
                    "text": text,
                    "creator": creator,
                    "subject": subject,
                    "publisher": publisher,
                    "date": date,
                }
                outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
                total_converted += 1
            else:
                total_excluded += 1

    logging.info(f"Finished processing directory: {input_dir}")
    logging.info(f"Total converted files: {total_converted}")
    logging.info(f"Total excluded files: {total_excluded}")
    logging.info(f"Output file saved at: {output_file}")
164
+
165
+
166
def process_single_json_file(input_file: str, output_file: str) -> None:
    """
    Process a single JSON file to extract relevant fields and add a 'text' field and 'id'.

    If the combined title/description text is empty, nothing is written.

    Args:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSONL file.

    Raises:
        ValueError: If the root of the JSON document is not an object.
    """

    def _as_list(value):
        # Dublin Core fields may hold a single scalar or a list; normalize so
        # `", ".join` never iterates over the characters of a bare string.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    logging.info(f"Processing single JSON file: {input_file}")
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        data = json.load(infile)
        if not isinstance(data, dict):
            raise ValueError("Expected a JSON object at the root.")

        # Metadata is nested under the 'ns0:dc' (Dublin Core) element.
        dc_data = data.get("ns0:dc", {})

        title = dc_data.get("dc:title", "")
        description = dc_data.get("dc:description", "")
        creator = ", ".join(_as_list(dc_data.get("dc:creator", [])))
        subject = ", ".join(_as_list(dc_data.get("dc:subject", [])))
        publisher = dc_data.get("dc:publisher", "")
        date = dc_data.get("dc:date", "")

        # Extract identifier and construct 'id'; accept both the
        # ".../id/eprint/<id>/..." and short ".../<id>" URL forms, matching
        # the handling in process_directory (the short form was previously
        # not recognized here).
        identifiers = _as_list(dc_data.get("dc:identifier", []))
        id_value = None
        for identifier in identifiers:
            if "https://www.zora.uzh.ch/id/eprint/" in identifier:
                id_value = identifier.split("/id/eprint/")[-1].split("/")[0]
                break
            match = re.match(r"https://www\.zora\.uzh\.ch/(\d+)", identifier)
            if match:
                id_value = match.group(1)
                break

        id_field = f"oai:www.zora.uzh.ch:{id_value}" if id_value else None

        # Build 'text' only from the parts that exist; the previous
        # f"{title}: {description}".strip() produced a spurious truthy ":"
        # when both fields were empty.
        if title and description:
            text = f"{title}: {description}"
        else:
            text = (title or description).strip()

        if text:
            entry = {
                "text": text,
                "id": id_field,
                "title": title,
                "description": description,
                "creator": creator,
                "subject": subject,
                "publisher": publisher,
                "date": date,
            }
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing single JSON file: {input_file}")
218
+
219
+
220
def main() -> None:
    """
    Main function to parse arguments and process files or directories.

    If the input is a directory, all JSON files in the directory will be processed,
    and their entries will be combined into a single JSONL file specified by the output path.
    If the input is a single file ending in '.json', it is processed as one JSON
    object; any other single file is treated as JSONL.
    """
    logging.info("Starting the processing script.")
    parser = argparse.ArgumentParser(
        description=(
            "Process JSON or JSONL files to add a 'text' field consisting of {title}:"
            " {description}."
            " If the result is empty, the document is not added to the output file."
        )
    )
    parser.add_argument("input", help="Path to the input JSONL file or directory")
    parser.add_argument("output", help="Path to the output JSONL file or directory")
    parser.add_argument(
        "--exclude", help="Path to a JSONL file containing IDs to exclude", default=None
    )
    args = parser.parse_args()

    if os.path.isdir(args.input):
        process_directory(args.input, args.output, args.exclude)
    else:
        if args.exclude:
            # Exclusion handling is only implemented for directory input.
            logging.warning("--exclude is ignored for single-file input.")
        if args.input.endswith(".json"):
            # A single .json file holds one JSON object, not JSONL lines.
            # Previously process_single_json_file was defined but never
            # dispatched, so .json inputs were wrongly parsed line by line.
            process_single_json_file(args.input, args.output)
        else:
            process_jsonl(args.input, args.output)
    logging.info("Processing script completed.")


if __name__ == "__main__":
    main()