Simon Clematide committed on
Commit
4d23e7a
·
1 Parent(s): f9c9b95

Add CLI script for processing JSON and JSONL files with text field extraction and exclusion handling

Browse files
Files changed (1) hide show
  1. sdg_predict/cli_zora2text.py +251 -0
sdg_predict/cli_zora2text.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ import os
4
+ import logging
5
+ from typing import Any
6
+ import re
7
+
8
# Configure logging: timestamped INFO-level messages (default stderr handler).
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
12
+
13
+
14
def process_jsonl(input_file: str, output_file: str) -> None:
    """
    Process a JSONL file to add a 'text' field combining 'title' and 'description'.

    Entries whose resulting 'text' is empty (after stripping) are not written
    to the output file.

    Args:
        input_file (str): Path to the input JSONL file.
        output_file (str): Path to the output JSONL file.
    """
    logging.info(f"Processing JSONL file: {input_file}")
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            entry: dict[str, Any] = json.loads(line)
            title: str = entry.get("title", "")
            description: str = entry.get("description", "")
            if not title and not description:
                logging.warning(
                    f"File '{input_file}' contains an entry with no title and no"
                    " description."
                )
            # Combine whichever parts are present. Previously a title-only
            # entry fell back to the (empty) description, producing an empty
            # 'text' and silently dropping the entry.
            if title and description:
                entry["text"] = f"{title}: {description}"
            else:
                entry["text"] = title or description
            if entry["text"].strip():  # Only write entries with non-empty 'text'
                outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing JSONL file: {input_file}")
    logging.info(f"Output file saved at: {output_file}")
40
+
41
+
42
def load_exclusion_ids(exclusion_file: str) -> set:
    """
    Load exclusion IDs from a JSONL file.

    Args:
        exclusion_file (str): Path to the JSONL file containing exclusion IDs.

    Returns:
        set: A set of IDs to exclude.
    """
    # Collect the 'id' value of every record that has one; records without
    # an 'id' key are ignored.
    with open(exclusion_file, "r") as file:
        return {record["id"] for record in map(json.loads, file) if "id" in record}
59
+
60
+
61
def process_directory(
    input_dir: str, output_file: str, exclusion_file: str = None
) -> None:
    """
    Process all JSON files in a directory to add a 'text' field and write to a single JSONL file.

    Args:
        input_dir (str): Path to the input directory containing JSON files.
        output_file (str): Path to the output JSONL file to save processed entries.
        exclusion_file (str, optional): Path to a JSONL file containing IDs to exclude.

    Note:
        All processed entries from the JSON files in the directory will be combined
        and written into a single JSONL file specified by `output_file`.
        Files without a usable ID, files whose ID is excluded, and files with an
        empty title and description are skipped and counted as excluded.
    """

    def _as_list(value):
        # Dublin Core fields may hold a single scalar or a list; normalize so
        # `", ".join` never iterates over the characters of a bare string.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def _extract_id(identifiers):
        # Accept both ".../id/eprint/<id>/..." and the short ".../<id>" URL forms.
        for identifier in identifiers:
            if "https://www.zora.uzh.ch/id/eprint/" in identifier:
                return identifier.split("/id/eprint/")[-1].split("/")[0]
            # e.g. "https://www.zora.uzh.ch/140521" (dots escaped in the pattern)
            match = re.match(r"https://www\.zora\.uzh\.ch/(\d+)", identifier)
            if match:
                return match.group(1)
        return None

    logging.info(f"Processing directory: {input_dir}")
    total_converted = 0
    total_excluded = 0

    exclusion_ids = load_exclusion_ids(exclusion_file) if exclusion_file else set()

    with open(output_file, "w") as outfile:
        for filename in os.listdir(input_dir):
            if not filename.endswith(".json"):
                continue
            input_file: str = os.path.join(input_dir, filename)
            logging.info(f"Processing file: {input_file}")
            with open(input_file, "r") as infile:
                data = json.load(infile)  # Each file holds a single JSON object
            if not isinstance(data, dict):
                logging.warning(
                    f"File '{input_file}' does not contain a valid JSON object."
                    " Skipping."
                )
                total_excluded += 1
                continue

            # Metadata is nested under the 'ns0:dc' (Dublin Core) element.
            dc_data = data.get("ns0:dc", {})

            title = dc_data.get("dc:title", "")
            description = dc_data.get("dc:description", "")
            creator = ", ".join(_as_list(dc_data.get("dc:creator", [])))
            subject = ", ".join(_as_list(dc_data.get("dc:subject", [])))
            publisher = dc_data.get("dc:publisher", "")
            date = dc_data.get("dc:date", "")

            # Extract the numeric ZORA identifier and construct the OAI 'id'.
            id_value = _extract_id(_as_list(dc_data.get("dc:identifier", [])))
            if not id_value:
                logging.warning(
                    f"File '{input_file}' does not contain a valid ID."
                    " Skipping."
                )
                total_excluded += 1
                continue
            id_field = f"oai:www.zora.uzh.ch:{id_value}"

            # Check if the ID is in the exclusion list
            if id_field in exclusion_ids:
                logging.info(f"Excluding file with ID: {id_field}")
                total_excluded += 1
                continue

            # Build 'text' only from the parts that exist. The previous
            # f"{title}: {description}".strip() produced a spurious truthy
            # ":" for documents with empty title and description, and a
            # leading ": " when only a description was present.
            if title and description:
                text = f"{title}: {description}"
            else:
                text = (title or description).strip()

            if text:
                entry = {
                    "id": id_field,
                    "title": title,
                    "description": description,
                    "text": text,
                    "creator": creator,
                    "subject": subject,
                    "publisher": publisher,
                    "date": date,
                }
                outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
                total_converted += 1
            else:
                total_excluded += 1

    logging.info(f"Finished processing directory: {input_dir}")
    logging.info(f"Total converted files: {total_converted}")
    logging.info(f"Total excluded files: {total_excluded}")
    logging.info(f"Output file saved at: {output_file}")
164
+
165
+
166
def process_single_json_file(input_file: str, output_file: str) -> None:
    """
    Process a single JSON file to extract relevant fields and add a 'text' field and 'id'.

    If the combined title/description text is empty, nothing is written.

    Args:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSONL file.

    Raises:
        ValueError: If the root of the JSON document is not an object.
    """

    def _as_list(value):
        # Dublin Core fields may hold a single scalar or a list; normalize so
        # `", ".join` never iterates over the characters of a bare string.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    logging.info(f"Processing single JSON file: {input_file}")
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        data = json.load(infile)
        if not isinstance(data, dict):
            raise ValueError("Expected a JSON object at the root.")

        # Metadata is nested under the 'ns0:dc' (Dublin Core) element.
        dc_data = data.get("ns0:dc", {})

        title = dc_data.get("dc:title", "")
        description = dc_data.get("dc:description", "")
        creator = ", ".join(_as_list(dc_data.get("dc:creator", [])))
        subject = ", ".join(_as_list(dc_data.get("dc:subject", [])))
        publisher = dc_data.get("dc:publisher", "")
        date = dc_data.get("dc:date", "")

        # Extract identifier and construct 'id'; accept both the
        # ".../id/eprint/<id>/..." and short ".../<id>" URL forms, matching
        # the handling in process_directory (the short form was previously
        # not recognized here).
        identifiers = _as_list(dc_data.get("dc:identifier", []))
        id_value = None
        for identifier in identifiers:
            if "https://www.zora.uzh.ch/id/eprint/" in identifier:
                id_value = identifier.split("/id/eprint/")[-1].split("/")[0]
                break
            match = re.match(r"https://www\.zora\.uzh\.ch/(\d+)", identifier)
            if match:
                id_value = match.group(1)
                break

        id_field = f"oai:www.zora.uzh.ch:{id_value}" if id_value else None

        # Build 'text' only from the parts that exist; the previous
        # f"{title}: {description}".strip() produced a spurious truthy ":"
        # when both fields were empty.
        if title and description:
            text = f"{title}: {description}"
        else:
            text = (title or description).strip()

        if text:
            entry = {
                "text": text,
                "id": id_field,
                "title": title,
                "description": description,
                "creator": creator,
                "subject": subject,
                "publisher": publisher,
                "date": date,
            }
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
    logging.info(f"Finished processing single JSON file: {input_file}")
218
+
219
+
220
def main() -> None:
    """
    Main function to parse arguments and process files or directories.

    If the input is a directory, all JSON files in the directory will be processed,
    and their entries will be combined into a single JSONL file specified by the output path.
    If the input is a single file ending in '.json', it is processed as one JSON
    object; any other single file is treated as JSONL.
    """
    logging.info("Starting the processing script.")
    parser = argparse.ArgumentParser(
        description=(
            "Process JSON or JSONL files to add a 'text' field consisting of {title}:"
            " {description}."
            " If the result is empty, the document is not added to the output file."
        )
    )
    parser.add_argument("input", help="Path to the input JSONL file or directory")
    parser.add_argument("output", help="Path to the output JSONL file or directory")
    parser.add_argument(
        "--exclude", help="Path to a JSONL file containing IDs to exclude", default=None
    )
    args = parser.parse_args()

    if os.path.isdir(args.input):
        process_directory(args.input, args.output, args.exclude)
    else:
        if args.exclude:
            # Exclusion handling is only implemented for directory input.
            logging.warning("--exclude is ignored for single-file input.")
        if args.input.endswith(".json"):
            # A single .json file holds one JSON object, not JSONL lines.
            # Previously process_single_json_file was defined but never
            # dispatched, so .json inputs were wrongly parsed line by line.
            process_single_json_file(args.input, args.output)
        else:
            process_jsonl(args.input, args.output)
    logging.info("Processing script completed.")


if __name__ == "__main__":
    main()