# Hugging Face Space: nannanliu/UNIC_alignment_conversion (status: Sleeping)
# File size: 4,284 bytes; revision cb420eb

import zipfile
import gradio as gr
import os
import json
import pandas as pd
from pathlib import Path
from tempfile import TemporaryDirectory
import uuid
import threading
import time
pd.options.mode.chained_assignment = None

# Directory, relative to the working directory, that receives generated output.
OUTPUT_DIR = "./alignments"
# Create it at import time; an already existing directory is not an error.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

def delete_output_dir(delay=120, directory=None):
    """Empty the output directory after waiting ``delay`` seconds.

    The directory itself is kept: it is only created once, at import time,
    so removing it would make every later write into it fail.

    Args:
        delay: Seconds to sleep before cleaning up.
        directory: Directory to empty. Defaults to ``OUTPUT_DIR``.

    Entries that disappear while the cleanup is running (for example because
    another cleanup thread removed them first) are skipped silently; a
    missing directory means there is nothing to do.
    """
    target_dir = OUTPUT_DIR if directory is None else directory
    time.sleep(delay)  # Give time for Gradio to complete the download

    try:
        entries = os.listdir(target_dir)
    except FileNotFoundError:
        return  # Directory already gone: nothing left to clean.

    for entry in entries:
        try:
            os.remove(os.path.join(target_dir, entry))
        except FileNotFoundError:
            # Removed concurrently between listdir() and remove().
            continue

def split_comma_if_needed(value):
    """Split a ", "-separated string into a list of stripped parts.

    Anything that is not a string, and any string without the ", "
    separator, is returned unchanged.
    """
    separator = ', '
    if not isinstance(value, str) or separator not in value:
        return value
    return list(map(str.strip, value.split(separator)))

def _write_side_json(body, side, text_id, directory):
    """Write one side ("source" or "target") of an alignment table as JSON.

    Args:
        body: DataFrame holding the alignment rows of one spreadsheet.
        side: Column-name marker, "source" or "target"; every column whose
            name contains it is exported.
        text_id: Identifier taken from the spreadsheet header; it becomes
            the base name of the JSON file.
        directory: Directory the JSON file is written into.

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If ``text_id`` is not a string.
    """
    if not isinstance(text_id, str):
        raise ValueError(
            f"Spreadsheet header has no usable '{side}_text_id' value: {text_id!r}"
        )

    columns = [col for col in body.columns if isinstance(col, str) and side in col]
    # Explicit copy so that adding the "id" column can never touch ``body``.
    side_data = body[columns].copy()
    # Sentence ids are the row index plus one, zero-padded to three digits.
    side_data["id"] = [f"{index + 1:03}" for index in side_data.index]
    side_data = side_data.rename(
        columns={
            f"{side}_text": "full_text",
            f"{side}_start": "start",
            f"{side}_duration": "duration",
        }
    )
    records = side_data.map(split_comma_if_needed).to_dict(orient="records")
    # NOTE(review): empty cells reach json.dumps as float NaN, which it writes
    # as the non-standard token ``NaN`` - confirm the consumer accepts that.
    payload = json.dumps({"sentences": records}, ensure_ascii=False, indent=4)

    # basename() keeps an id containing path separators inside ``directory``.
    json_path = os.path.join(directory, os.path.basename(text_id + ".json"))
    # ensure_ascii=False leaves non-ASCII characters in the payload, so the
    # encoding is pinned instead of being left to the platform default.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json_file.write(payload)
    return json_path


def convert_alignment(files):
    """Convert alignment spreadsheets into a ZIP archive of JSON files.

    For every spreadsheet two JSON files are produced, one built from the
    columns whose name contains "source" and one from those containing
    "target". Each file is named after the matching ``*_text_id`` value in
    the spreadsheet header and holds a ``{"sentences": [...]}`` object.

    Args:
        files: Iterable of spreadsheets in any form ``pandas.read_excel``
            accepts (the Gradio upload component supplies them).

    Returns:
        Path of the ZIP archive, located inside ``OUTPUT_DIR``.

    Raises:
        ValueError: If no files were provided, or a header id is not a string.
        KeyError: If the header lacks ``source_text_id`` or ``target_text_id``.
    """
    if not files:
        raise ValueError("No spreadsheet files were provided.")

    # The delayed cleanup may have removed the directory since import time.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    zip_path = os.path.join(OUTPUT_DIR, f"alignments_{uuid.uuid4()}.zip")

    # Intermediate JSON files live in a private temporary directory, so
    # concurrent requests and the delayed cleanup cannot interfere with them;
    # the directory is removed automatically when the block exits.
    with TemporaryDirectory() as tmpdir:
        json_file_paths = []

        for file in files:
            # Read the header. json.loads accepts a single JSON document, so
            # this relies on the header area producing exactly one record.
            header = pd.read_excel(file, sheet_name=0, nrows=3, comment='*')
            header = json.loads(header.to_json(orient="records", lines=True))

            # Read the body: the alignment table starts after 8 skipped rows.
            body = pd.read_excel(file, sheet_name=0, skiprows=8, comment='*')

            for side in ("source", "target"):
                json_file_paths.append(
                    _write_side_json(body, side, header[f"{side}_text_id"], tmpdir)
                )

        # Create ZIP file
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for json_file in json_file_paths:
                zipf.write(json_file, arcname=os.path.basename(json_file))

    # Schedule cleanup of the output directory after a delay, leaving time
    # for the archive to be downloaded first.
    threading.Thread(target=delete_output_dir, args=(60,)).start()  # Adjust delay as needed

    return zip_path

# Upload widget: takes several .xlsx spreadsheets in one submission.
spreadsheet_input = gr.Files(file_count="multiple", file_types=[".xlsx"])

# Single-function web UI: the uploaded spreadsheets are passed to
# convert_alignment and the path it returns is offered as a file download.
demo = gr.Interface(
    fn=convert_alignment,
    inputs=spreadsheet_input,
    outputs="file",
    fill_width=True,
    theme="Nymbo/Nymbo_Theme",
    title="Convert your alignment spreadsheets in .xlsx to a zip of JSON files for the UNIC platform",
    description="Upload a list of uncompressed spreadsheets, and download by clicking on the down arrow in the output window.",
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()