# sdg-scibert-zo_up / sdg_predict / cli_zora2text.py
# Author: Simon Clematide
# CLI script for processing JSON and JSONL files with text field extraction and
# exclusion handling (commit 4d23e7a).
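#
# Example invocations (illustrative; the file and directory names below are
# placeholders, not paths from the repository):
#   python cli_zora2text.py zora_oai_records/ zora_text.jsonl --exclude excluded_ids.jsonl
#   python cli_zora2text.py zora_records.jsonl zora_text.jsonl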
import json
import argparse
import os
import logging
from typing import Any, Optional
import re
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
def process_jsonl(input_file: str, output_file: str) -> None:
"""
Process a JSONL file to add a 'text' field combining 'title' and 'description'.
Args:
input_file (str): Path to the input JSONL file.
output_file (str): Path to the output JSONL file.
"""
logging.info(f"Processing JSONL file: {input_file}")
with open(input_file, "r") as infile, open(output_file, "w") as outfile:
for line in infile:
entry: dict[str, Any] = json.loads(line)
title: str = entry.get("title", "")
description: str = entry.get("description", "")
if not title and not description:
logging.warning(
f"File '{input_file}' contains an entry with no title and no"
" description."
)
entry["text"] = (
f"{title}: {description}" if title and description else description
)
if entry["text"].strip(): # Only write entries with non-empty 'text'
outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
logging.info(f"Finished processing JSONL file: {input_file}")
logging.info(f"Output file saved at: {output_file}")
def load_exclusion_ids(exclusion_file: str) -> set:
"""
Load exclusion IDs from a JSONL file.
Args:
exclusion_file (str): Path to the JSONL file containing exclusion IDs.
Returns:
set: A set of IDs to exclude.
"""
exclusion_ids = set()
with open(exclusion_file, "r") as file:
for line in file:
obj = json.loads(line)
if "id" in obj:
exclusion_ids.add(obj["id"])
return exclusion_ids
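

# The exclusion file is expected to hold one JSON object per line with an "id"
# field matching the IDs constructed below, e.g. (illustrative value):
#   {"id": "oai:www.zora.uzh.ch:140521"}
# All other fields in the exclusion records are ignored.
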
def process_directory(
    input_dir: str, output_file: str, exclusion_file: Optional[str] = None
) -> None:
"""
Process all JSON files in a directory to add a 'text' field and write to a single JSONL file.
Args:
input_dir (str): Path to the input directory containing JSON files.
output_file (str): Path to the output JSONL file to save processed entries.
exclusion_file (str, optional): Path to a JSONL file containing IDs to exclude.
Note:
All processed entries from the JSON files in the directory will be combined
and written into a single JSONL file specified by `output_file`.
"""
logging.info(f"Processing directory: {input_dir}")
total_converted = 0
total_excluded = 0
exclusion_ids = load_exclusion_ids(exclusion_file) if exclusion_file else set()
with open(output_file, "w") as outfile:
for filename in os.listdir(input_dir):
if filename.endswith(".json"):
input_file: str = os.path.join(input_dir, filename)
logging.info(f"Processing file: {input_file}")
with open(input_file, "r") as infile:
data = json.load(infile) # Load single JSON object
if not isinstance(data, dict):
logging.warning(
f"File '{input_file}' does not contain a valid JSON object."
" Skipping."
)
total_excluded += 1
continue
# Handle nested structure under 'ns0:dc'
dc_data = data.get("ns0:dc", {})
title = dc_data.get("dc:title", "")
description = dc_data.get("dc:description", "")
creator = ", ".join(dc_data.get("dc:creator", []))
subject = ", ".join(dc_data.get("dc:subject", []))
publisher = dc_data.get("dc:publisher", "")
date = dc_data.get("dc:date", "")
# Extract identifier and construct 'id'
identifiers = dc_data.get("dc:identifier", [])
                # dc:identifier may be a single string or a list of strings.
                if not isinstance(identifiers, list):
                    identifiers = [identifiers]
id_value = None
for identifier in identifiers:
if "https://www.zora.uzh.ch/id/eprint/" in identifier:
id_value = identifier.split("/id/eprint/")[-1].split("/")[0]
break
# "https://www.zora.uzh.ch/140521"
match = re.match(r"https://www.zora.uzh.ch/(\d+).*", identifier)
if match:
id_value = match.group(1)
break
                # Build the OAI-style record ID; skip the file if no numeric
                # eprint ID could be extracted from the identifiers.
                if id_value:
                    id_field = f"oai:www.zora.uzh.ch:{id_value}"
                else:
                    logging.warning(
                        f"File '{input_file}' does not contain a valid ID"
                        f" (identifiers: {identifiers}). Skipping."
                    )
                    total_excluded += 1
                    continue
# Check if the ID is in the exclusion list
if id_field in exclusion_ids:
logging.info(f"Excluding file with ID: {id_field}")
total_excluded += 1
continue
text = f"{title}: {description}".strip()
if text:
entry = {
"id": id_field,
"title": title,
"description": description,
"text": text,
"creator": creator,
"subject": subject,
"publisher": publisher,
"date": date,
}
outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
total_converted += 1
else:
total_excluded += 1
logging.info(f"Finished processing directory: {input_dir}")
logging.info(f"Total converted files: {total_converted}")
logging.info(f"Total excluded files: {total_excluded}")
logging.info(f"Output file saved at: {outfile.name}")
def process_single_json_file(input_file: str, output_file: str) -> None:
"""
Process a single JSON file to extract relevant fields and add a 'text' field and 'id'.
Args:
input_file (str): Path to the input JSON file.
output_file (str): Path to the output JSONL file.
"""
logging.info(f"Processing single JSON file: {input_file}")
with open(input_file, "r") as infile, open(output_file, "w") as outfile:
data = json.load(infile)
if not isinstance(data, dict):
raise ValueError("Expected a JSON object at the root.")
# Handle nested structure under 'ns0:dc'
dc_data = data.get("ns0:dc", {})
title = dc_data.get("dc:title", "")
description = dc_data.get("dc:description", "")
creator = ", ".join(dc_data.get("dc:creator", []))
subject = ", ".join(dc_data.get("dc:subject", []))
publisher = dc_data.get("dc:publisher", "")
date = dc_data.get("dc:date", "")
# Extract identifier and construct 'id'
        identifiers = dc_data.get("dc:identifier", [])
        # dc:identifier may be a single string or a list of strings.
        if not isinstance(identifiers, list):
            identifiers = [identifiers]
        id_value = None
for identifier in identifiers:
if "https://www.zora.uzh.ch/id/eprint/" in identifier:
id_value = identifier.split("/id/eprint/")[-1].split("/")[0]
break
if id_value:
id_field = f"oai:www.zora.uzh.ch:{id_value}"
else:
id_field = None
text = f"{title}: {description}".strip()
if text:
entry = {
"text": text,
"id": id_field,
"title": title,
"description": description,
"creator": creator,
"subject": subject,
"publisher": publisher,
"date": date,
}
outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
logging.info(f"Finished processing single JSON file: {input_file}")
def main() -> None:
"""
Main function to parse arguments and process files or directories.
If the input is a directory, all JSON files in the directory will be processed,
and their entries will be combined into a single JSONL file specified by the output path.
    If the input is a single .json file, it is processed as one JSON object; any
    other input file is treated as JSONL and processed line by line.
"""
logging.info("Starting the processing script.")
parser = argparse.ArgumentParser(
description=(
"Process JSON or JSONL files to add a 'text' field consisting of {title}:"
" {description}."
" If the result is empty, the document is not added to the output file."
)
)
parser.add_argument("input", help="Path to the input JSONL file or directory")
parser.add_argument("output", help="Path to the output JSONL file or directory")
parser.add_argument(
"--exclude", help="Path to a JSONL file containing IDs to exclude", default=None
)
args = parser.parse_args()
    if os.path.isdir(args.input):
        process_directory(args.input, args.output, args.exclude)
    elif args.input.endswith(".json"):
        # Route single .json files to the JSON handler; everything else is treated as JSONL.
        process_single_json_file(args.input, args.output)
    else:
        process_jsonl(args.input, args.output)
logging.info("Processing script completed.")
if __name__ == "__main__":
main()