Simon Clematide
Add CLI script for processing JSON and JSONL files with text field extraction and exclusion handling
4d23e7a
| import json | |
| import argparse | |
| import os | |
| import logging | |
| from typing import Any | |
| import re | |
# Configure the root logger once at import time: INFO level,
# timestamped "<time> - <level> - <message>" format for all script output.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
def process_jsonl(input_file: str, output_file: str) -> None:
    """
    Process a JSONL file to add a 'text' field combining 'title' and 'description'.

    The 'text' field is "{title}: {description}" when both are present,
    otherwise whichever single field is non-empty. Entries whose resulting
    'text' is empty (after stripping whitespace) are not written to the output.

    Args:
        input_file (str): Path to the input JSONL file (one JSON object per line).
        output_file (str): Path to the output JSONL file.
    """
    logging.info(f"Processing JSONL file: {input_file}")
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            entry: dict[str, Any] = json.loads(line)
            title: str = entry.get("title", "")
            description: str = entry.get("description", "")
            if not title and not description:
                logging.warning(
                    f"File '{input_file}' contains an entry with no title and no"
                    " description."
                )
            # Bug fix: previously a title-only entry produced an empty 'text'
            # and was silently dropped; now the title alone is kept, matching
            # the behavior of the directory-processing path.
            if title and description:
                entry["text"] = f"{title}: {description}"
            else:
                entry["text"] = title or description
            if entry["text"].strip():  # Only write entries with non-empty 'text'
                outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing JSONL file: {input_file}")
    logging.info(f"Output file saved at: {output_file}")
def load_exclusion_ids(exclusion_file: str) -> set:
    """
    Load exclusion IDs from a JSONL file.

    Lines that parse to objects without an 'id' key are ignored.

    Args:
        exclusion_file (str): Path to the JSONL file containing exclusion IDs.

    Returns:
        set: A set of IDs to exclude.
    """
    with open(exclusion_file, "r") as handle:
        records = (json.loads(raw) for raw in handle)
        return {record["id"] for record in records if "id" in record}
# Pre-compiled pattern for plain article URLs, e.g. "https://www.zora.uzh.ch/140521".
# Bug fix: the original pattern left the dots unescaped, so '.' matched any character.
_ZORA_URL_RE = re.compile(r"https://www\.zora\.uzh\.ch/(\d+)")


def _as_list(value: Any) -> list:
    """Return *value* unchanged if it is a list; wrap scalars; map falsy to []."""
    if isinstance(value, list):
        return value
    return [value] if value else []


def _extract_zora_id(identifiers: Any) -> "str | None":
    """
    Extract the numeric ZORA eprint ID from 'dc:identifier' value(s).

    Accepts a list or a single value. Recognizes both
    'https://www.zora.uzh.ch/id/eprint/<id>/...' and
    'https://www.zora.uzh.ch/<id>' forms. Returns None if nothing matches.
    """
    for identifier in _as_list(identifiers):
        if not isinstance(identifier, str):
            continue  # robustness: skip non-string identifiers instead of raising
        if "https://www.zora.uzh.ch/id/eprint/" in identifier:
            return identifier.split("/id/eprint/")[-1].split("/")[0]
        match = _ZORA_URL_RE.match(identifier)
        if match:
            return match.group(1)
    return None


def process_directory(
    input_dir: str, output_file: str, exclusion_file: str = None
) -> None:
    """
    Process all JSON files in a directory to add a 'text' field and write to a single JSONL file.

    Args:
        input_dir (str): Path to the input directory containing JSON files.
        output_file (str): Path to the output JSONL file to save processed entries.
        exclusion_file (str, optional): Path to a JSONL file containing IDs to exclude.

    Note:
        All processed entries from the JSON files in the directory will be combined
        and written into a single JSONL file specified by `output_file`. Files
        without a valid ID, with an excluded ID, or with an empty 'text' count
        as excluded.
    """
    logging.info(f"Processing directory: {input_dir}")
    total_converted = 0
    total_excluded = 0
    exclusion_ids = load_exclusion_ids(exclusion_file) if exclusion_file else set()
    with open(output_file, "w", encoding="utf-8") as outfile:
        for filename in os.listdir(input_dir):
            if not filename.endswith(".json"):
                continue
            input_file: str = os.path.join(input_dir, filename)
            logging.info(f"Processing file: {input_file}")
            with open(input_file, "r", encoding="utf-8") as infile:
                data = json.load(infile)  # each file holds a single JSON object
            if not isinstance(data, dict):
                logging.warning(
                    f"File '{input_file}' does not contain a valid JSON object."
                    " Skipping."
                )
                total_excluded += 1
                continue
            # Metadata is nested under the 'ns0:dc' key (Dublin Core fields).
            dc_data = data.get("ns0:dc", {})
            title = dc_data.get("dc:title", "")
            description = dc_data.get("dc:description", "")
            # Bug fix: a scalar creator/subject string used to be joined
            # character by character; _as_list normalizes before joining.
            creator = ", ".join(_as_list(dc_data.get("dc:creator", [])))
            subject = ", ".join(_as_list(dc_data.get("dc:subject", [])))
            publisher = dc_data.get("dc:publisher", "")
            date = dc_data.get("dc:date", "")
            # Extract identifier and construct 'id'
            id_value = _extract_zora_id(dc_data.get("dc:identifier", []))
            if id_value:
                id_field = f"oai:www.zora.uzh.ch:{id_value}"
            else:
                logging.warning(
                    "No valid ID found in identifiers: %s",
                    dc_data.get("dc:identifier", []),
                )
                logging.warning(
                    f"File '{input_file}' does not contain a valid ID. Skipping."
                )
                total_excluded += 1
                continue
            # Check if the ID is in the exclusion list
            if id_field in exclusion_ids:
                logging.info(f"Excluding file with ID: {id_field}")
                total_excluded += 1
                continue
            text = f"{title}: {description}".strip()
            if not text:
                total_excluded += 1
                continue
            entry = {
                "id": id_field,
                "title": title,
                "description": description,
                "text": text,
                "creator": creator,
                "subject": subject,
                "publisher": publisher,
                "date": date,
            }
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
            total_converted += 1
    logging.info(f"Finished processing directory: {input_dir}")
    logging.info(f"Total converted files: {total_converted}")
    logging.info(f"Total excluded files: {total_excluded}")
    logging.info(f"Output file saved at: {output_file}")
def process_single_json_file(input_file: str, output_file: str) -> None:
    """
    Process a single JSON file to extract relevant fields and add a 'text' field and 'id'.

    Mirrors the per-file logic of `process_directory`: metadata is read from the
    nested 'ns0:dc' object, the ZORA eprint ID is derived from 'dc:identifier',
    and the entry is written only when 'text' ("{title}: {description}") is
    non-empty after stripping.

    Args:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSONL file.

    Raises:
        ValueError: If the file's root is not a JSON object.
    """
    logging.info(f"Processing single JSON file: {input_file}")
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        data = json.load(infile)
        if not isinstance(data, dict):
            raise ValueError("Expected a JSON object at the root.")
        # Handle nested structure under 'ns0:dc'
        dc_data = data.get("ns0:dc", {})
        title = dc_data.get("dc:title", "")
        description = dc_data.get("dc:description", "")
        creator = ", ".join(dc_data.get("dc:creator", []))
        subject = ", ".join(dc_data.get("dc:subject", []))
        publisher = dc_data.get("dc:publisher", "")
        date = dc_data.get("dc:date", "")
        # Extract identifier and construct 'id'
        identifiers = dc_data.get("dc:identifier", [])
        # Bug fix: a single identifier string used to be iterated character by
        # character, so the ID was never found; normalize to a list as the
        # directory-processing path does.
        if not isinstance(identifiers, list):
            identifiers = [identifiers]
        id_value = None
        for identifier in identifiers:
            if "https://www.zora.uzh.ch/id/eprint/" in identifier:
                id_value = identifier.split("/id/eprint/")[-1].split("/")[0]
                break
            # Consistency fix: also accept plain article URLs such as
            # "https://www.zora.uzh.ch/140521", like process_directory does.
            match = re.match(r"https://www\.zora\.uzh\.ch/(\d+)", identifier)
            if match:
                id_value = match.group(1)
                break
        if id_value:
            id_field = f"oai:www.zora.uzh.ch:{id_value}"
        else:
            id_field = None
            logging.warning("No valid ID found in identifiers: %s", identifiers)
        text = f"{title}: {description}".strip()
        if text:
            entry = {
                "text": text,
                "id": id_field,
                "title": title,
                "description": description,
                "creator": creator,
                "subject": subject,
                "publisher": publisher,
                "date": date,
            }
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing single JSON file: {input_file}")
def main() -> None:
    """
    Parse command-line arguments and process files or directories.

    If the input is a directory, all JSON files in the directory will be processed,
    and their entries will be combined into a single JSONL file specified by the
    output path. If the input is a single JSONL file, it will be processed and
    written to the output file (the --exclude option applies only to directories).
    """
    logging.info("Starting the processing script.")
    parser = argparse.ArgumentParser(
        description=(
            "Process JSON or JSONL files to add a 'text' field consisting of {title}:"
            " {description}."
            " If the result is empty, the document is not added to the output file."
        )
    )
    parser.add_argument("input", help="Path to the input JSONL file or directory")
    # Fix: the output is always a single JSONL file, never a directory.
    parser.add_argument("output", help="Path to the output JSONL file")
    parser.add_argument(
        "--exclude", help="Path to a JSONL file containing IDs to exclude", default=None
    )
    args = parser.parse_args()
    if os.path.isdir(args.input):
        process_directory(args.input, args.output, args.exclude)
    else:
        if args.exclude:
            # Fix: previously the flag was silently ignored on this path.
            logging.warning("--exclude is ignored for single JSONL file input.")
        process_jsonl(args.input, args.output)
    logging.info("Processing script completed.")
# Standard script entry-point guard: run main() only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()