Spaces:

AUXteam
/

tiny_factory

Paused

tiny_factory / extraction /artifact_exporter.py

root

Import from HF Space harvesthealth/tiny_factory

6a42990 3 months ago

7.58 kB

	import os
	import json
	import pandas as pd
	import pypandoc
	import markdown
	from typing import Union, List

	from tinytroupe.extraction import logger
	from tinytroupe.utils import JsonSerializableRegistry

	import tinytroupe.utils as utils

	class ArtifactExporter(JsonSerializableRegistry):
	    """
	    An artifact exporter is responsible for exporting artifacts from TinyTroupe elements, for example
	    in order to create synthetic data files from simulations.

	    Files are written to ``<base_output_folder>/<content_type>/<artifact_name>.<extension>``.
	    """

	    def __init__(self, base_output_folder:str) -> None:
	        """
	        Args:
	            base_output_folder (str): Root folder under which all exported artifacts are written.
	        """
	        self.base_output_folder = base_output_folder

	    def export(self, artifact_name:str, artifact_data:Union[dict, str], content_type:str, content_format:str=None, target_format:str="txt", verbose:bool=False):
	        """
	        Exports the specified artifact data to a file.

	        Args:
	            artifact_name (str): The name of the artifact. Characters that are not valid in a file name
	                are replaced with hyphens (a warning is logged for each replaced character).
	            artifact_data (Union[dict, str]): The data to export. A dict must hold the textual content
	                under the key ``content``. For the "json" target the whole dict is serialized; for the
	                text and DOCX targets only the content is written. A string is treated as the content itself.
	            content_type (str): The type of the content within the artifact. Used as the name of the
	                subfolder of the base output folder; ``None`` means no subfolder.
	            content_format (str, optional): The format of the content within the artifact (e.g., md, csv, etc).
	                Only consulted for the "docx" target. Defaults to None.
	            target_format (str): The format to export the artifact to. Supported values are "json",
	                "txt", "text", "md", "markdown" and "docx". It is also used verbatim as the file extension.
	            verbose (bool, optional): Whether to print debug messages. Defaults to False.

	        Raises:
	            ValueError: If `artifact_data` is neither a string nor a dict, or if `target_format` is not supported.
	            KeyError: If `artifact_data` is a dict without a ``content`` key.
	        """

	        # dedent inputs, just in case
	        if isinstance(artifact_data, str):
	            artifact_data = utils.dedent(artifact_data)
	        elif isinstance(artifact_data, dict):
	            # work on a shallow copy so that the caller's dictionary is not modified as a side effect
	            artifact_data = dict(artifact_data)
	            artifact_data['content'] = utils.dedent(artifact_data['content'])
	        else:
	            raise ValueError("The artifact data must be either a string or a dictionary.")

	        artifact_name = self._sanitize_artifact_name(artifact_name)

	        artifact_file_path = self._compose_filepath(artifact_data, artifact_name, content_type, target_format, verbose)

	        if target_format == "json":
	            self._export_as_json(artifact_file_path, artifact_data, content_type, verbose)
	        elif target_format in ("txt", "text", "md", "markdown"):
	            self._export_as_txt(artifact_file_path, artifact_data, content_type, verbose)
	        elif target_format == "docx":
	            self._export_as_docx(artifact_file_path, artifact_data, content_format, verbose)
	        else:
	            raise ValueError(f"Unsupported target format: {target_format}.")

	    @staticmethod
	    def _sanitize_artifact_name(artifact_name:str) -> str:
	        """
	        Returns the artifact name with every invalid file name character replaced by a hyphen.
	        """
	        # NOTE: the pipe must be the plain one-character string '|'. Writing it with a backslash in
	        # front is an invalid escape sequence that yields a two-character string, which would never
	        # match a lone pipe in the name.
	        invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '\n', '\t', '\r', ';']
	        for char in invalid_chars:
	            if char in artifact_name:
	                # replace the character with a hyphen
	                artifact_name = artifact_name.replace(char, "-")
	                logger.warning(f"Replaced invalid character {char} with hyphen in artifact name '{artifact_name}'.")

	        return artifact_name

	    @staticmethod
	    def _extract_content(artifact_data:Union[dict, str]):
	        """
	        Returns the content to export: the value under the key ``content`` if `artifact_data` is a dict,
	        otherwise `artifact_data` itself.
	        """
	        if isinstance(artifact_data, dict):
	            return artifact_data['content']

	        return artifact_data

	    def _export_as_txt(self, artifact_file_path:str, artifact_data:Union[dict, str], content_type:str, verbose:bool=False):
	        """
	        Exports the specified artifact data to a text file.

	        Only the content is written: for a dict, the value under the key ``content``; for a string,
	        the string itself.
	        """

	        # resolve the content before opening the file, so that a failure here does not
	        # leave an empty (or truncated) file behind
	        content = self._extract_content(artifact_data)

	        with open(artifact_file_path, 'w', encoding="utf-8", errors="replace") as f:
	            f.write(content)

	    def _export_as_json(self, artifact_file_path:str, artifact_data:Union[dict, str], content_type:str, verbose:bool=False):
	        """
	        Exports the specified artifact data to a JSON file.

	        Raises:
	            ValueError: If `artifact_data` is not a dict.
	        """

	        # validate before opening the file: opening in 'w' mode creates/truncates it, so checking
	        # afterwards would leave an empty file behind (or wipe an existing one) on invalid input
	        if not isinstance(artifact_data, dict):
	            raise ValueError("The artifact data must be a dictionary to export to JSON.")

	        with open(artifact_file_path, 'w', encoding="utf-8", errors="replace") as f:
	            json.dump(artifact_data, f, indent=4)

	    def _export_as_docx(self, artifact_file_path:str, artifact_data:Union[dict, str], content_original_format:str, verbose:bool=False):
	        """
	        Exports the specified artifact data to a DOCX file.

	        Args:
	            artifact_file_path (str): The path of the DOCX file to create.
	            artifact_data (Union[dict, str]): The data to export. If it is a dict, the content is taken
	                from the key ``content``; if it is a string, the content is the string itself.
	            content_original_format (str): The format of the content. Must be one of 'text', 'txt',
	                'markdown' or 'md'.
	            verbose (bool, optional): Whether to print debug messages. Defaults to False.

	        Raises:
	            ValueError: If `content_original_format` is not one of the accepted formats.
	        """

	        # original format must be 'text' or 'markdown'. All accepted formats go through the same
	        # conversion path below, so no further normalization of the value is needed.
	        if content_original_format not in ['text', 'txt', 'markdown', 'md']:
	            raise ValueError(f"The original format cannot be {content_original_format} to export to DOCX.")

	        content = self._extract_content(artifact_data)

	        # first, convert to HTML. This is necessary because pypandoc does not support a GOOD direct conversion from markdown to DOCX.
	        html_content = markdown.markdown(content)

	        # then, convert to DOCX
	        pypandoc.convert_text(html_content, 'docx', format='html', outputfile=artifact_file_path)

	    ###########################################################
	    # IO
	    ###########################################################

	    def _compose_filepath(self, artifact_data:Union[dict, str], artifact_name:str, content_type:str, target_format:str=None, verbose:bool=False):
	        """
	        Composes the file path for the artifact to export, creating intermediate directories if necessary.

	        Args:
	            artifact_data (Union[dict, str]): The data to export.
	            artifact_name (str): The name of the artifact.
	            content_type (str): The type of the content within the artifact. Used as the subfolder name;
	                ``None`` means no subfolder.
	            target_format (str, optional): The format to export the artifact to. If given, it is used
	                verbatim as the file extension. Defaults to None.
	            verbose (bool, optional): Whether to print debug messages. Defaults to False.

	        Returns:
	            str: The path of the file to write, under the base output folder.
	        """

	        # Extension definition:
	        #
	        # - If the target format is specified, we use it as the extension.
	        # - Otherwise, if artifact_data is a dict, we use json.
	        # - Otherwise, we use txt.
	        if target_format is not None:
	            extension = f"{target_format}"
	        elif isinstance(artifact_data, dict):
	            extension = "json"
	        else:
	            extension = "txt"

	        # content type definition
	        subfolder = "" if content_type is None else content_type

	        # save to the specified file name or path, considering the base output folder.
	        artifact_file_path = os.path.join(self.base_output_folder, subfolder, f"{artifact_name}.{extension}")

	        # create intermediate directories if necessary
	        os.makedirs(os.path.dirname(artifact_file_path), exist_ok=True)

	        return artifact_file_path