"""Gradio app: convert alignment spreadsheets (.xlsx) into a zip of JSON files."""

import json
import os
import threading
import time
import uuid
import zipfile
from pathlib import Path
from tempfile import TemporaryDirectory

import gradio as gr
import pandas as pd

# Kept from the original module: silences pandas' chained-assignment warning
# process-wide. The code below works on explicit copies and no longer needs it.
pd.options.mode.chained_assignment = None

OUTPUT_DIR = "./alignments"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _remove_if_exists(path):
    """Delete `path`, ignoring the case where it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def delete_output_dir(delay=120, paths=None):
    """Clean up generated files after `delay` seconds.

    Args:
        delay: Seconds to sleep first, giving Gradio time to serve the download.
        paths: Optional iterable of file paths. When given, only these files
            are removed and OUTPUT_DIR is left in place, so archives belonging
            to other in-flight requests survive. When omitted, every file in
            OUTPUT_DIR is removed and then the directory itself (the original
            behaviour).
    """
    time.sleep(delay)

    if paths is not None:
        for path in paths:
            _remove_if_exists(path)
        return

    try:
        entries = os.listdir(OUTPUT_DIR)
    except FileNotFoundError:
        # An earlier cleanup already removed the directory.
        return
    for entry in entries:
        _remove_if_exists(os.path.join(OUTPUT_DIR, entry))
    try:
        os.rmdir(OUTPUT_DIR)
    except OSError:
        # Either the directory is already gone or a new file was written
        # while we were cleaning; in both cases there is nothing left to do.
        pass


def split_comma_if_needed(value):
    """Split a string containing ', ' into a list of stripped items.

    Any other value (non-strings, or strings without ', ') is returned as is.
    """
    if isinstance(value, str) and ', ' in value:
        return [item.strip() for item in value.split(', ')]
    return value


def _read_header(file):
    """Return the header record of the first sheet as a dict.

    Reads at most three rows below the column names and round-trips them
    through JSON; `json.loads` only succeeds when that yields exactly one
    record.
    """
    header = pd.read_excel(file, sheet_name=0, nrows=3, comment='*')
    return json.loads(header.to_json(orient="records", lines=True))


def _write_side_json(body, side, text_id, out_dir):
    """Write the `side` ('source' or 'target') columns of `body` to a JSON file.

    Args:
        body: DataFrame holding the alignment rows.
        side: Substring selecting the columns to export.
        text_id: Identifier used as the output file's base name.
        out_dir: Directory in which the JSON file is created.

    Returns:
        The path of the JSON file that was written.

    Raises:
        ValueError: If `text_id` is empty in the spreadsheet header.
    """
    if text_id is None:
        raise ValueError(f"Missing {side}_text_id in the spreadsheet header")

    columns = [col for col in body.columns if isinstance(col, str) and side in col]
    data = body[columns].copy()
    data['id'] = ["{0:03}".format(index + 1) for index in data.index]
    data = data.rename(columns={
        f'{side}_text': 'full_text',
        f'{side}_start': 'start',
        f'{side}_duration': 'duration',
    })
    # NOTE(review): empty cells are serialised by json as NaN, which is not
    # strict JSON - confirm the consumer accepts it.
    result = {'sentences': data.map(split_comma_if_needed).to_dict(orient='records')}

    # The id comes from an uploaded spreadsheet: keep only the final path
    # component so it cannot point outside `out_dir`.
    file_name = os.path.basename(f"{text_id}.json")
    file_path = os.path.join(out_dir, file_name)
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(result, json_file, ensure_ascii=False, indent=4)
    return file_path


def convert_alignment(files):
    """Convert uploaded alignment spreadsheets into a zip of JSON files.

    For each spreadsheet, one JSON file is written for the 'source' columns
    and one for the 'target' columns, named after the `source_text_id` and
    `target_text_id` header fields.

    Args:
        files: The uploaded spreadsheets as provided by `gr.Files`.

    Returns:
        Path of the zip archive created in OUTPUT_DIR. The archive is deleted
        by a background thread 60 seconds after this function returns.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if not files:
        raise gr.Error("Please upload at least one .xlsx spreadsheet.")

    # A previous cleanup may have removed the directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    zip_filename = f"alignments_{uuid.uuid4()}.zip"
    zip_path = os.path.join(OUTPUT_DIR, zip_filename)

    # Intermediate JSON files live in a private temporary directory so that
    # concurrent requests cannot overwrite or delete each other's files.
    with TemporaryDirectory() as tmpdir:
        json_file_paths = []
        for file in files:
            header = _read_header(file)
            body = pd.read_excel(file, sheet_name=0, skiprows=8, comment='*')
            for side in ('source', 'target'):
                json_file_paths.append(
                    _write_side_json(body, side, header[f'{side}_text_id'], tmpdir)
                )

        # Create ZIP file; drop repeated paths so that spreadsheets sharing a
        # text id do not produce duplicate archive members.
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for json_file in dict.fromkeys(json_file_paths):
                zipf.write(json_file, arcname=os.path.basename(json_file))

    # Schedule removal of this archive only, after a delay.
    threading.Thread(target=delete_output_dir, args=(60, [zip_path])).start()  # Adjust delay as needed

    return zip_path


demo = gr.Interface(
    fn=convert_alignment,
    inputs=gr.Files(file_count="multiple", file_types=[".xlsx"]),
    outputs="file",
    fill_width=True,
    theme="Nymbo/Nymbo_Theme",
    title="Convert your alignment spreadsheets in .xlsx to a zip of JSON files for the UNIC platform",
    description="Upload a list of uncompressed spreadsheets, and download by clicking on the down arrow in the output window."
)

if __name__ == "__main__":
    demo.launch()