import argparse
import json
import logging
import os
import re
from typing import Any, Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Short-form ZORA URL, e.g. "https://www.zora.uzh.ch/140521".
# Compiled once (it is used inside per-identifier loops) and with the dots
# escaped so "." matches a literal dot instead of any character.
_ZORA_SHORT_URL_RE = re.compile(r"https://www\.zora\.uzh\.ch/(\d+)")


def _extract_id_field(identifiers: Any) -> Optional[str]:
    """
    Derive the OAI id ("oai:www.zora.uzh.ch:<eprint-id>") from dc:identifier.

    Args:
        identifiers: A single identifier string or a list of identifier
            values as found under "dc:identifier".

    Returns:
        Optional[str]: The constructed OAI id, or None when no identifier
        matches either the long eprint form (".../id/eprint/<id>/...") or
        the short numeric form ("https://www.zora.uzh.ch/<id>").
    """
    # Normalize: a lone identifier may appear as a bare string.
    if not isinstance(identifiers, list):
        identifiers = [identifiers]
    for identifier in identifiers:
        # Skip non-string values instead of raising TypeError on `in`.
        if not isinstance(identifier, str):
            continue
        if "https://www.zora.uzh.ch/id/eprint/" in identifier:
            eprint_id = identifier.split("/id/eprint/")[-1].split("/")[0]
            return f"oai:www.zora.uzh.ch:{eprint_id}"
        match = _ZORA_SHORT_URL_RE.match(identifier)
        if match:
            return f"oai:www.zora.uzh.ch:{match.group(1)}"
    return None


def _extract_dc_fields(dc_data: dict) -> dict:
    """
    Flatten the Dublin-Core fields shared by the directory and single-file paths.

    Args:
        dc_data (dict): The mapping found under the "ns0:dc" key.

    Returns:
        dict: title, description, creator, subject, publisher and date as
        plain strings (list-valued fields are comma-joined).
    """
    return {
        "title": dc_data.get("dc:title", ""),
        "description": dc_data.get("dc:description", ""),
        # NOTE(review): assumes dc:creator / dc:subject are lists of strings;
        # a bare string here would be joined character-by-character — confirm
        # against the harvested data.
        "creator": ", ".join(dc_data.get("dc:creator", [])),
        "subject": ", ".join(dc_data.get("dc:subject", [])),
        "publisher": dc_data.get("dc:publisher", ""),
        "date": dc_data.get("dc:date", ""),
    }


def process_jsonl(input_file: str, output_file: str) -> None:
    """
    Process a JSONL file to add a 'text' field combining 'title' and 'description'.

    Entries whose resulting 'text' is empty are not written to the output.

    Args:
        input_file (str): Path to the input JSONL file.
        output_file (str): Path to the output JSONL file.
    """
    logging.info(f"Processing JSONL file: {input_file}")
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            entry: dict[str, Any] = json.loads(line)
            title: str = entry.get("title", "")
            description: str = entry.get("description", "")
            if not title and not description:
                logging.warning(
                    f"File '{input_file}' contains an entry with no title and no"
                    " description."
                )
            # Bug fix: fall back to whichever field is present. The previous
            # version used `description` alone as the fallback, which silently
            # dropped entries that had a title but no description.
            if title and description:
                entry["text"] = f"{title}: {description}"
            else:
                entry["text"] = title or description
            if entry["text"].strip():  # Only write entries with non-empty 'text'
                outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing JSONL file: {input_file}")
    logging.info(f"Output file saved at: {output_file}")


def load_exclusion_ids(exclusion_file: str) -> set:
    """
    Load exclusion IDs from a JSONL file.

    Args:
        exclusion_file (str): Path to the JSONL file containing exclusion IDs.

    Returns:
        set: A set of IDs to exclude (lines without an "id" key are ignored).
    """
    exclusion_ids = set()
    with open(exclusion_file, "r", encoding="utf-8") as file:
        for line in file:
            obj = json.loads(line)
            if "id" in obj:
                exclusion_ids.add(obj["id"])
    return exclusion_ids


def process_directory(
    input_dir: str, output_file: str, exclusion_file: Optional[str] = None
) -> None:
    """
    Process all JSON files in a directory to add a 'text' field and write to a
    single JSONL file.

    Args:
        input_dir (str): Path to the input directory containing JSON files.
        output_file (str): Path to the output JSONL file to save processed entries.
        exclusion_file (str, optional): Path to a JSONL file containing IDs to
            exclude.

    Note: All processed entries from the JSON files in the directory will be
    combined and written into a single JSONL file specified by `output_file`.
    """
    logging.info(f"Processing directory: {input_dir}")
    total_converted = 0
    total_excluded = 0
    exclusion_ids = load_exclusion_ids(exclusion_file) if exclusion_file else set()
    with open(output_file, "w", encoding="utf-8") as outfile:
        for filename in os.listdir(input_dir):
            if not filename.endswith(".json"):
                continue
            input_file: str = os.path.join(input_dir, filename)
            logging.info(f"Processing file: {input_file}")
            with open(input_file, "r", encoding="utf-8") as infile:
                data = json.load(infile)  # Load single JSON object
            if not isinstance(data, dict):
                logging.warning(
                    f"File '{input_file}' does not contain a valid JSON object."
                    " Skipping."
                )
                total_excluded += 1
                continue
            # Handle nested structure under 'ns0:dc'
            dc_data = data.get("ns0:dc", {})
            fields = _extract_dc_fields(dc_data)
            # Extract identifier and construct 'id'
            identifiers = dc_data.get("dc:identifier", [])
            id_field = _extract_id_field(identifiers)
            if not id_field:
                logging.warning(
                    "No valid ID found in identifiers: %s", identifiers
                )
                logging.warning(
                    f"File '{input_file}' does not contain a valid ID."
                    " Skipping."
                )
                total_excluded += 1
                continue
            # Check if the ID is in the exclusion list
            if id_field in exclusion_ids:
                logging.info(f"Excluding file with ID: {id_field}")
                total_excluded += 1
                continue
            text = f"{fields['title']}: {fields['description']}".strip()
            if text:
                entry = {
                    "id": id_field,
                    "title": fields["title"],
                    "description": fields["description"],
                    "text": text,
                    "creator": fields["creator"],
                    "subject": fields["subject"],
                    "publisher": fields["publisher"],
                    "date": fields["date"],
                }
                outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
                total_converted += 1
            else:
                total_excluded += 1
    logging.info(f"Finished processing directory: {input_dir}")
    logging.info(f"Total converted files: {total_converted}")
    logging.info(f"Total excluded files: {total_excluded}")
    logging.info(f"Output file saved at: {output_file}")


def process_single_json_file(input_file: str, output_file: str) -> None:
    """
    Process a single JSON file to extract relevant fields and add a 'text'
    field and 'id'.

    Unlike `process_directory`, an entry is written even when no valid id can
    be derived (its "id" is then null).

    Args:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSONL file.

    Raises:
        ValueError: If the file's root is not a JSON object.
    """
    logging.info(f"Processing single JSON file: {input_file}")
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        data = json.load(infile)
        if not isinstance(data, dict):
            raise ValueError("Expected a JSON object at the root.")
        # Handle nested structure under 'ns0:dc'
        dc_data = data.get("ns0:dc", {})
        fields = _extract_dc_fields(dc_data)
        # Extract identifier and construct 'id'. Consistency fix: this path
        # now also normalizes non-list identifiers and recognizes the short
        # "https://www.zora.uzh.ch/<id>" form, like process_directory does.
        id_field = _extract_id_field(dc_data.get("dc:identifier", []))
        text = f"{fields['title']}: {fields['description']}".strip()
        if text:
            entry = {
                "text": text,
                "id": id_field,
                "title": fields["title"],
                "description": fields["description"],
                "creator": fields["creator"],
                "subject": fields["subject"],
                "publisher": fields["publisher"],
                "date": fields["date"],
            }
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing single JSON file: {input_file}")


def main() -> None:
    """
    Main function to parse arguments and process files or directories.

    If the input is a directory, all JSON files in the directory will be
    processed, and their entries will be combined into a single JSONL file
    specified by the output path. If the input is a single JSONL file, it will
    be processed and written to the output file.
    """
    logging.info("Starting the processing script.")
    parser = argparse.ArgumentParser(
        description=(
            "Process JSON or JSONL files to add a 'text' field consisting of {title}:"
            " {description}."
            " If the result is empty, the document is not added to the output file."
        )
    )
    parser.add_argument("input", help="Path to the input JSONL file or directory")
    # Help-text fix: the output is always a single JSONL file, never a directory.
    parser.add_argument("output", help="Path to the output JSONL file")
    parser.add_argument(
        "--exclude", help="Path to a JSONL file containing IDs to exclude", default=None
    )
    args = parser.parse_args()
    if os.path.isdir(args.input):
        process_directory(args.input, args.output, args.exclude)
    else:
        process_jsonl(args.input, args.output)
    logging.info("Processing script completed.")


if __name__ == "__main__":
    main()