# Page-header residue kept for reference (not code):
# author: nannanliu; commit message: "Create app.py"; commit cb420eb (verified)
import zipfile
import gradio as gr
import os
import json
import pandas as pd
from pathlib import Path
from tempfile import TemporaryDirectory
import uuid
import threading
import time
# Turn off pandas' chained-assignment warning for the whole process.
pd.set_option("mode.chained_assignment", None)

# Folder that receives the generated ZIP archives; created at import time.
OUTPUT_DIR = "./alignments"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
def delete_output_dir(delay=120, directory=None):
    """Delete the output directory and the files in it after a delay.

    Meant to run in a background thread once a download has been handed to
    Gradio. Cleanup is best-effort: several cleanup threads may be pending at
    the same time, so files or the directory itself may already be gone, and
    a concurrent request may have written a new file in the meantime.

    Args:
        delay: Seconds to wait before deleting anything.
        directory: Directory to remove. Defaults to the module-level
            ``OUTPUT_DIR``.
    """
    if directory is None:
        directory = OUTPUT_DIR
    time.sleep(delay)  # Give time for Gradio to complete the download

    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        # An earlier cleanup thread already removed the directory.
        return

    for entry in entries:
        try:
            os.remove(os.path.join(directory, entry))
        except FileNotFoundError:
            # Another cleanup thread got to this file first.
            pass

    try:
        os.rmdir(directory)
    except OSError:
        # Already removed, or a new file appeared after the listing above;
        # a later cleanup run will take care of it.
        pass
def split_comma_if_needed(value):
    """Turn a ``', '``-separated string into a list of stripped items.

    Anything that is not a string, or a string without the ``', '``
    separator, is handed back unchanged.
    """
    if not isinstance(value, str) or ', ' not in value:
        return value
    return [piece.strip() for piece in value.split(', ')]
def _write_sentences_json(body, side, text_id, out_dir):
    """Export one side of an alignment sheet as a ``{"sentences": [...]}`` JSON file.

    Args:
        body: DataFrame holding the alignment rows of one spreadsheet.
        side: Column-name fragment selecting the side (``'source'`` or
            ``'target'``); every column whose name contains it is exported.
        text_id: Header value used as the base name of the JSON file.
        out_dir: Directory in which the JSON file is created.

    Returns:
        Path of the JSON file that was written.
    """
    columns = [col for col in body.columns if isinstance(col, str) and side in col]
    # Explicit copy: adding the 'id' column must never write through to ``body``.
    data = body[columns].copy()
    data['id'] = [f"{index + 1:03}" for index in data.index]
    data = data.rename(
        {
            f'{side}_text': 'full_text',
            f'{side}_start': 'start',
            f'{side}_duration': 'duration',
        },
        axis=1,
    )
    sentences = data.map(split_comma_if_needed).to_dict(orient='records')
    payload = json.dumps({'sentences': sentences}, ensure_ascii=False, indent=4)

    # The id comes from an uploaded spreadsheet: basename() keeps the file
    # inside out_dir (the archive member name was already a basename).
    json_path = os.path.join(out_dir, os.path.basename(text_id + '.json'))
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must not
    # depend on the platform default.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json_file.write(payload)
    return json_path


def convert_alignment(files):
    """Convert alignment spreadsheets into a ZIP archive of JSON files.

    For every spreadsheet two JSON files are produced, one from the columns
    whose name contains ``source`` and one from those containing ``target``.
    The files are named after the ``source_text_id`` / ``target_text_id``
    header values. Intermediate JSON files live in a private temporary
    directory so concurrent requests cannot overwrite or delete each other's
    files; only the final archive is placed in ``OUTPUT_DIR``.

    Args:
        files: Iterable of spreadsheets (anything ``pandas.read_excel``
            accepts).

    Returns:
        Path of the ZIP archive inside ``OUTPUT_DIR``.

    Raises:
        KeyError: If a header lacks ``source_text_id`` or ``target_text_id``.
    """
    # The cleanup thread of an earlier request may have removed the directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    zip_path = os.path.join(OUTPUT_DIR, f"alignments_{uuid.uuid4()}.zip")

    with TemporaryDirectory() as tmpdir:
        json_file_paths = []
        for file in files:
            # Read the header area at the top of the first sheet.
            # NOTE(review): json.loads parses a single object, so this
            # presumably relies on the header yielding exactly one data row
            # once '*' comments are dropped - confirm against the template.
            header = pd.read_excel(file, sheet_name=0, nrows=3, comment='*')
            header = json.loads(header.to_json(orient="records", lines=True))

            # Read the alignment rows that follow the header area.
            body = pd.read_excel(file, sheet_name=0, skiprows=8, comment='*')

            for side in ('source', 'target'):
                json_file_paths.append(
                    _write_sentences_json(
                        body, side, header[f'{side}_text_id'], tmpdir
                    )
                )

        # Build the archive while the temporary JSON files still exist.
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for json_file in json_file_paths:
                zipf.write(json_file, arcname=os.path.basename(json_file))

    # Schedule directory cleanup after a delay (adjust as needed).
    threading.Thread(target=delete_output_dir, args=(60,)).start()
    return zip_path
# Gradio front end: upload .xlsx spreadsheets, get back the file returned by
# convert_alignment.
demo = gr.Interface(
    title="Convert your alignment spreadsheets in .xlsx to a zip of JSON files for the UNIC platform",
    description="Upload a list of uncompressed spreadsheets, and download by clicking on the down arrow in the output window.",
    theme="Nymbo/Nymbo_Theme",
    fill_width=True,
    inputs=gr.Files(file_types=[".xlsx"], file_count="multiple"),
    outputs="file",
    fn=convert_alignment,
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()