Spaces:
Sleeping
Sleeping
| import zipfile | |
| import gradio as gr | |
| import os | |
| import json | |
| import pandas as pd | |
| from pathlib import Path | |
| from tempfile import TemporaryDirectory | |
| import uuid | |
| import threading | |
| import time | |
# Turn off pandas' chained-assignment warning for the whole application.
pd.set_option("mode.chained_assignment", None)

# Folder the converter writes its output files into; created at import time.
OUTPUT_DIR = "./alignments"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
def delete_output_dir(delay=120):
    """Remove every file in OUTPUT_DIR after a delay.

    The directory itself is kept so that later conversions can still write
    their output into it.

    Args:
        delay: Seconds to wait before cleaning up, so that Gradio has time
            to complete the download.
    """
    time.sleep(delay)  # Give time for Gradio to complete the download
    try:
        leftovers = os.listdir(OUTPUT_DIR)
    except FileNotFoundError:
        # The directory is already gone: nothing to clean up.
        return
    for name in leftovers:
        try:
            os.remove(os.path.join(OUTPUT_DIR, name))
        except FileNotFoundError:
            # A cleanup run started by another request may have removed the
            # file between the listing and this call.
            pass
def split_comma_if_needed(value):
    """Turn a ', '-separated string into a list of stripped items.

    Anything that is not a string, or a string without the ', ' separator,
    is handed back unchanged.
    """
    if not isinstance(value, str) or ', ' not in value:
        return value
    return list(map(str.strip, value.split(', ')))
def _write_sentences_json(body, side, text_id, out_dir):
    """Write the ``side`` ('source' or 'target') columns of ``body`` as a sentences JSON file; return its path."""
    # Work on an explicit copy so adding the 'id' column never writes into
    # a view of ``body``.
    data = body[[col for col in body.columns if side in col]].copy()
    data['id'] = ["{0:03}".format(index + 1) for index in data.index]
    data = data.rename(
        {
            f'{side}_text': 'full_text',
            f'{side}_start': 'start',
            f'{side}_duration': 'duration',
        },
        axis=1,
    )
    transformed = data.map(split_comma_if_needed)
    result = {'sentences': transformed.to_dict(orient='records')}
    json_data = json.dumps(result, ensure_ascii=False, indent=4)
    # The text id comes from the uploaded spreadsheet: keep only the final
    # path component so it cannot point outside ``out_dir``.
    json_path = os.path.join(out_dir, os.path.basename(text_id + '.json'))
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned instead of relying on the platform default.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json_file.write(json_data)
    return json_path


def convert_alignment(files):
    """Convert alignment spreadsheets into a zip archive of JSON files.

    For each spreadsheet, the header rows supply ``source_text_id`` and
    ``target_text_id``, and the body is split into its 'source' and
    'target' columns. Each side is written as ``<text_id>.json`` holding a
    ``sentences`` list, and all JSON files are bundled into one archive.

    Args:
        files: Iterable of spreadsheets, each in a form accepted by
            ``pandas.read_excel``.

    Returns:
        Path of the zip archive created inside OUTPUT_DIR.
    """
    zip_filename = f"alignments_{uuid.uuid4()}.zip"
    zip_path = os.path.join(OUTPUT_DIR, zip_filename)
    # Make sure the output folder is present before writing the archive,
    # in case it has been removed since start-up.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Intermediate JSON files live in a private temporary directory, so
    # overlapping requests and the delayed cleanup cannot touch them.
    with TemporaryDirectory() as tmpdir:
        json_file_paths = []
        for file in files:
            # read the header
            header = pd.read_excel(file, sheet_name=0, nrows=3, comment='*')
            # NOTE(review): lines=True emits one JSON object per row and
            # json.loads accepts a single document, so this assumes the
            # header read yields exactly one record - confirm against the
            # spreadsheet template.
            header = json.loads(header.to_json(orient="records", lines=True))
            # read the body
            body = pd.read_excel(file, sheet_name=0, skiprows=8, comment='*')
            for side in ('source', 'target'):
                json_file_paths.append(
                    _write_sentences_json(
                        body, side, header[f'{side}_text_id'], tmpdir
                    )
                )

        # Create ZIP file while the temporary JSON files still exist
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for json_file in json_file_paths:
                zipf.write(json_file, arcname=os.path.basename(json_file))

    # Schedule directory cleanup after a delay
    threading.Thread(target=delete_output_dir, args=(60,)).start()  # Adjust delay as needed
    return zip_path
# Options of the Gradio front end: several .xlsx uploads in, one
# downloadable file out.
_INTERFACE_OPTIONS = {
    "fn": convert_alignment,
    "inputs": gr.Files(file_count="multiple", file_types=[".xlsx"]),
    "outputs": "file",
    "fill_width": True,
    "theme": "Nymbo/Nymbo_Theme",
    "title": "Convert your alignment spreadsheets in .xlsx to a zip of JSON files for the UNIC platform",
    "description": "Upload a list of uncompressed spreadsheets, and download by clicking on the down arrow in the output window.",
}

demo = gr.Interface(**_INTERFACE_OPTIONS)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()