# File size: 9,868 Bytes
#
# 4d23e7a

import json
import argparse
import os
import logging
from typing import Any
import re

# Root logger setup: emit INFO and above, each record prefixed with a
# timestamp and its severity level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


def process_jsonl(input_file: str, output_file: str) -> None:
    """
    Process a JSONL file to add a 'text' field combining 'title' and 'description'.

    Every non-blank input line is parsed as one JSON object. Its 'text' field is
    set to "{title}: {description}" when both values are present, and to the
    description alone otherwise. Entries whose 'text' is empty or whitespace-only
    are not written to the output.

    Args:
        input_file (str): Path to the input JSONL file.
        output_file (str): Path to the output JSONL file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    logging.info(f"Processing JSONL file: {input_file}")
    # Explicit UTF-8: the output is written with ensure_ascii=False, so relying on
    # the platform default encoding could fail or corrupt non-ASCII text.
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            # Blank lines (e.g. a trailing newline at end of file) are not records.
            if not line.strip():
                continue
            entry: dict[str, Any] = json.loads(line)
            # `or ""` also maps an explicit JSON null to the empty string, so the
            # emptiness checks below never operate on None.
            title: str = entry.get("title") or ""
            description: str = entry.get("description") or ""
            if not title and not description:
                logging.warning(
                    f"File '{input_file}' contains an entry with no title and no"
                    " description."
                )
            entry["text"] = (
                f"{title}: {description}" if title and description else description
            )
            if entry["text"].strip():  # Only write entries with non-empty 'text'
                outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing JSONL file: {input_file}")
    logging.info(f"Output file saved at: {output_file}")


def load_exclusion_ids(exclusion_file: str) -> set:
    """
    Load exclusion IDs from a JSONL file.

    Each non-blank line is parsed as JSON. When the parsed value is an object
    with an 'id' key, that value is added to the result. Lines that parse to
    anything else (lists, strings, numbers) or that lack an 'id' are ignored.

    Args:
        exclusion_file (str): Path to the JSONL file containing exclusion IDs.

    Returns:
        set: A set of IDs to exclude.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    exclusion_ids = set()
    with open(exclusion_file, "r", encoding="utf-8") as file:
        for line in file:
            # Tolerate blank lines such as a trailing newline at end of file.
            if not line.strip():
                continue
            obj = json.loads(line)
            # Only JSON objects can carry an 'id' field; guarding on dict avoids
            # a TypeError on lines that hold a bare string, number or list.
            if isinstance(obj, dict) and "id" in obj:
                exclusion_ids.add(obj["id"])
    return exclusion_ids


_ZORA_EPRINT_MARKER = "https://www.zora.uzh.ch/id/eprint/"
# Short URL form, e.g. "https://www.zora.uzh.ch/140521".
_ZORA_SHORT_URL = re.compile(r"https://www\.zora\.uzh\.ch/(\d+)")


def _as_list(value: Any) -> list:
    """Return lists unchanged, wrap any other value in a list, map None to []."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _join_values(value: Any) -> str:
    """Join a scalar or list field into one comma-separated string."""
    return ", ".join(str(item) for item in _as_list(value) if item is not None)


def _extract_zora_id(identifiers: Any):
    """Return the first numeric ZORA ID found in the identifiers, or None."""
    for identifier in _as_list(identifiers):
        if not isinstance(identifier, str):
            continue
        if _ZORA_EPRINT_MARKER in identifier:
            candidate = identifier.split("/id/eprint/")[-1].split("/")[0]
        else:
            match = _ZORA_SHORT_URL.match(identifier)
            candidate = match.group(1) if match else ""
        # An eprint URL with nothing after the marker yields "", so keep looking.
        if candidate:
            return candidate
    return None


def process_directory(
    input_dir: str, output_file: str, exclusion_file: str = None
) -> None:
    """
    Process all JSON files in a directory to add a 'text' field and write to a single JSONL file.

    Files are visited in sorted filename order so the output is reproducible.
    A file is skipped (and counted as excluded) when its root is not a JSON
    object, when no ZORA ID can be derived from 'dc:identifier', when its ID is
    listed in the exclusion file, or when both title and description are empty.

    Args:
        input_dir (str): Path to the input directory containing JSON files.
        output_file (str): Path to the output JSONL file to save processed entries.
        exclusion_file (str, optional): Path to a JSONL file containing IDs to exclude.

    Raises:
        json.JSONDecodeError: If a '.json' file in the directory is not valid JSON.

    Note:
        All processed entries from the JSON files in the directory will be combined
        and written into a single JSONL file specified by `output_file`.
    """
    logging.info(f"Processing directory: {input_dir}")
    total_converted = 0
    total_excluded = 0

    exclusion_ids = load_exclusion_ids(exclusion_file) if exclusion_file else set()

    # Explicit UTF-8: entries are dumped with ensure_ascii=False.
    with open(output_file, "w", encoding="utf-8") as outfile:
        # os.listdir returns entries in arbitrary order; sort for stable output.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith(".json"):
                continue
            input_file: str = os.path.join(input_dir, filename)
            logging.info(f"Processing file: {input_file}")
            with open(input_file, "r", encoding="utf-8") as infile:
                data = json.load(infile)  # Load single JSON object
            if not isinstance(data, dict):
                logging.warning(
                    f"File '{input_file}' does not contain a valid JSON object."
                    " Skipping."
                )
                total_excluded += 1
                continue

            # Handle nested structure under 'ns0:dc'
            dc_data = data.get("ns0:dc", {})
            if not isinstance(dc_data, dict):
                # Without a metadata object there is no ID; the record is
                # skipped by the ID check below.
                dc_data = {}

            title = dc_data.get("dc:title", "")
            description = dc_data.get("dc:description", "")
            # These fields may hold a single string instead of a list; joining
            # a bare string would split it into characters.
            creator = _join_values(dc_data.get("dc:creator", []))
            subject = _join_values(dc_data.get("dc:subject", []))
            publisher = dc_data.get("dc:publisher", "")
            date = dc_data.get("dc:date", "")

            # Extract identifier and construct 'id'
            identifiers = _as_list(dc_data.get("dc:identifier", []))
            id_value = _extract_zora_id(identifiers)
            if not id_value:
                logging.warning("No valid ID found in identifiers: %s", identifiers)
                logging.warning(
                    f"File '{input_file}' does not contain a valid ID."
                    " Skipping."
                )
                total_excluded += 1
                continue
            id_field = f"oai:www.zora.uzh.ch:{id_value}"

            # Check if the ID is in the exclusion list
            if id_field in exclusion_ids:
                logging.info(f"Excluding file with ID: {id_field}")
                total_excluded += 1
                continue

            # The formatted text always contains ": ", so emptiness has to be
            # judged on the parts rather than on the combined string.
            has_content = any(
                str(part).strip() for part in (title, description) if part
            )
            if not has_content:
                total_excluded += 1
                continue

            entry = {
                "id": id_field,
                "title": title,
                "description": description,
                "text": f"{title}: {description}".strip(),
                "creator": creator,
                "subject": subject,
                "publisher": publisher,
                "date": date,
            }
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
            total_converted += 1

    logging.info(f"Finished processing directory: {input_dir}")
    logging.info(f"Total converted files: {total_converted}")
    logging.info(f"Total excluded files: {total_excluded}")
    logging.info(f"Output file saved at: {output_file}")


def process_single_json_file(input_file: str, output_file: str) -> None:
    """
    Process a single JSON file to extract relevant fields and add a 'text' field and 'id'.

    The metadata is read from the 'ns0:dc' object of the input. The output file
    is always (re)created; it receives one JSONL entry unless both title and
    description are empty, in which case it is left empty. When no ZORA ID can
    be derived from 'dc:identifier', the entry is still written with a null 'id'.

    Args:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSONL file.

    Raises:
        ValueError: If the root of the input document is not a JSON object.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    logging.info(f"Processing single JSON file: {input_file}")

    def to_list(value: Any) -> list:
        # Fields may hold one scalar instead of a list of values.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def joined(value: Any) -> str:
        # Joining a bare string directly would split it into characters.
        return ", ".join(str(item) for item in to_list(value) if item is not None)

    # Read and validate the input first so that an invalid input does not
    # truncate an existing output file.
    with open(input_file, "r", encoding="utf-8") as infile:
        data = json.load(infile)
    if not isinstance(data, dict):
        raise ValueError("Expected a JSON object at the root.")

    # Handle nested structure under 'ns0:dc'
    dc_data = data.get("ns0:dc", {})
    if not isinstance(dc_data, dict):
        dc_data = {}

    title = dc_data.get("dc:title", "")
    description = dc_data.get("dc:description", "")
    creator = joined(dc_data.get("dc:creator", []))
    subject = joined(dc_data.get("dc:subject", []))
    publisher = dc_data.get("dc:publisher", "")
    date = dc_data.get("dc:date", "")

    # Extract identifier and construct 'id'
    id_value = None
    for identifier in to_list(dc_data.get("dc:identifier", [])):
        if not isinstance(identifier, str):
            continue
        if "https://www.zora.uzh.ch/id/eprint/" in identifier:
            candidate = identifier.split("/id/eprint/")[-1].split("/")[0]
        else:
            # Short URL form, e.g. "https://www.zora.uzh.ch/140521"
            match = re.match(r"https://www\.zora\.uzh\.ch/(\d+)", identifier)
            candidate = match.group(1) if match else ""
        if candidate:
            id_value = candidate
            break

    id_field = f"oai:www.zora.uzh.ch:{id_value}" if id_value else None

    # The formatted text always contains ": ", so emptiness has to be judged
    # on the parts rather than on the combined string.
    has_content = any(str(part).strip() for part in (title, description) if part)

    with open(output_file, "w", encoding="utf-8") as outfile:
        if has_content:
            entry = {
                "text": f"{title}: {description}".strip(),
                "id": id_field,
                "title": title,
                "description": description,
                "creator": creator,
                "subject": subject,
                "publisher": publisher,
                "date": date,
            }
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing single JSON file: {input_file}")


def main() -> None:
    """
    Main function to parse arguments and process files or directories.

    If the input is a directory, all JSON files in the directory will be processed,
    and their entries will be combined into a single JSONL file specified by the output path.
    Otherwise the input is treated as a single JSONL file, which is processed and
    written to the output file. The --exclude option only applies to directory
    input; a warning is logged when it is given together with a file input.
    """
    logging.info("Starting the processing script.")
    parser = argparse.ArgumentParser(
        description=(
            "Process JSON or JSONL files to add a 'text' field consisting of {title}:"
            " {description}."
            " If the result is empty, the document is not added to the output file."
        )
    )
    parser.add_argument("input", help="Path to the input JSONL file or directory")
    # The output is always one JSONL file, for file and directory input alike.
    parser.add_argument("output", help="Path to the output JSONL file")
    parser.add_argument(
        "--exclude",
        help=(
            "Path to a JSONL file containing IDs to exclude"
            " (only used when the input is a directory)"
        ),
        default=None,
    )
    args = parser.parse_args()

    if os.path.isdir(args.input):
        process_directory(args.input, args.output, args.exclude)
    else:
        if args.exclude:
            # JSONL processing has no exclusion support; say so instead of
            # silently dropping the option.
            logging.warning(
                "--exclude is only applied to directory input; ignoring it for"
                " file '%s'.",
                args.input,
            )
        process_jsonl(args.input, args.output)
    logging.info("Processing script completed.")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()