Spaces:
Sleeping
Sleeping
File size: 4,284 Bytes
cb420eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import zipfile
import gradio as gr
import os
import json
import pandas as pd
from pathlib import Path
from tempfile import TemporaryDirectory
import uuid
import threading
import time
# Silence pandas' chained-assignment warning: convert_alignment() adds an
# 'id' column to column subsets of the sheet it reads.
pd.options.mode.chained_assignment = None
# Directory in which convert_alignment() creates the zip archive it returns.
OUTPUT_DIR = "./alignments"
# Make sure the directory exists as soon as the module is loaded.
os.makedirs(OUTPUT_DIR, exist_ok=True)
def delete_output_dir(delay=120):
    """Delete OUTPUT_DIR and the files in it after a delay.

    Meant to be run in a background thread so that Gradio has time to
    serve the generated archive before it disappears.

    Args:
        delay: Number of seconds to sleep before deleting anything.

    The cleanup is best-effort. Several cleanup threads may be pending at
    the same time, so individual files (or the whole directory) can already
    be gone, and a newer conversion may have added files in the meantime.
    These situations are tolerated instead of raising inside the thread.
    """
    time.sleep(delay)  # Give time for Gradio to complete the download
    try:
        entries = os.listdir(OUTPUT_DIR)
    except FileNotFoundError:
        # Another cleanup already removed the directory.
        return
    for entry in entries:
        try:
            os.remove(os.path.join(OUTPUT_DIR, entry))
        except FileNotFoundError:
            # Removed concurrently by another cleanup; nothing left to do.
            pass
    try:
        os.rmdir(OUTPUT_DIR)
    except OSError:
        # The directory vanished, or a file was written into it after the
        # listing above; leave it in place rather than fail.
        pass
def split_comma_if_needed(value):
    """Turn a ", "-separated string into a list of stripped parts.

    Values that are not strings, and strings that do not contain the
    ", " separator, are handed back unchanged.
    """
    if not isinstance(value, str) or ', ' not in value:
        return value
    return [part.strip() for part in value.split(', ')]
def _write_sentences_json(body, text_id, side, out_dir):
    """Write the *side* ('source' or 'target') columns of *body* to ``<text_id>.json`` in *out_dir* and return its path."""
    # Copy the column subset so that adding 'id' never touches `body`.
    data = body[[col for col in body.columns if side in col]].copy()
    data['id'] = ["{0:03}".format(index + 1) for index in data.index]
    data = data.rename(
        {
            f'{side}_text': 'full_text',
            f'{side}_start': 'start',
            f'{side}_duration': 'duration',
        },
        axis=1,
    )
    transformed = data.map(split_comma_if_needed)
    result = {'sentences': transformed.to_dict(orient='records')}
    json_data = json.dumps(result, ensure_ascii=False, indent=4)

    # The id comes from the uploaded spreadsheet: keep only its final path
    # component so it cannot point outside out_dir, and accept non-string
    # ids (e.g. numeric cells).
    file_name = f"{os.path.basename(str(text_id))}.json"
    file_path = os.path.join(out_dir, file_name)
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding
    # must not depend on the platform default.
    with open(file_path, "w", encoding="utf-8") as json_file:
        json_file.write(json_data)
    return file_path


def convert_alignment(files):
    """Convert alignment spreadsheets into a zip archive of JSON files.

    For each spreadsheet, the first sheet is read twice: once for a small
    header table (which must provide ``source_text_id`` and
    ``target_text_id``) and once, skipping the first 8 rows, for the body.
    Text after a ``*`` is treated as a comment by pandas. Columns whose
    name contains ``source`` / ``target`` are written to
    ``<source_text_id>.json`` / ``<target_text_id>.json`` as
    ``{"sentences": [...]}``, each record getting a zero-padded ``id``.

    Args:
        files: Iterable of spreadsheet files (anything ``pd.read_excel``
            accepts).

    Returns:
        Path of the zip archive created inside OUTPUT_DIR.

    Side effects:
        Starts a thread running ``delete_output_dir`` to clean OUTPUT_DIR
        up 60 seconds later.
    """
    zip_filename = f"alignments_{uuid.uuid4()}.zip"
    zip_path = os.path.join(OUTPUT_DIR, zip_filename)

    # Intermediate JSON files live in a private temporary directory, so
    # concurrent requests cannot overwrite each other's files and nothing
    # has to be removed by hand afterwards.
    with TemporaryDirectory() as tmpdir:
        json_file_paths = []
        for file in files:
            # read the header
            header = pd.read_excel(file, sheet_name=0, nrows=3, comment='*')
            header = json.loads(header.to_json(orient="records", lines=True))
            # read the body
            body = pd.read_excel(file, sheet_name=0, skiprows=8, comment='*')

            for side in ('source', 'target'):
                json_file_paths.append(
                    _write_sentences_json(body, header[f'{side}_text_id'], side, tmpdir)
                )

        # delete_output_dir() removes OUTPUT_DIR itself, so it has to be
        # recreated before every archive is written.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        # Create ZIP file (while the temporary JSON files still exist)
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for json_file in json_file_paths:
                zipf.write(json_file, arcname=os.path.basename(json_file))

    # Schedule directory cleanup after a delay
    threading.Thread(target=delete_output_dir, args=(60,)).start()  # Adjust delay as needed
    return zip_path
# Gradio front end: the uploaded .xlsx files are passed to convert_alignment()
# and the path it returns is offered as a downloadable file.
demo = gr.Interface(
    fn=convert_alignment,
    inputs=gr.Files(file_types=[".xlsx"], file_count="multiple"),
    outputs="file",
    title="Convert your alignment spreadsheets in .xlsx to a zip of JSON files for the UNIC platform",
    description="Upload a list of uncompressed spreadsheets, and download by clicking on the down arrow in the output window.",
    theme="Nymbo/Nymbo_Theme",
    fill_width=True,
)
if __name__ == "__main__":
    # Start the Gradio server only when the file is executed as a script,
    # not when it is imported.
    demo.launch()