File size: 4,284 Bytes
cb420eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import zipfile
import gradio as gr
import os
import json
import pandas as pd
from pathlib import Path
from tempfile import TemporaryDirectory
import uuid
import threading
import time
# Suppress pandas' SettingWithCopyWarning triggered by the .loc assignments
# on column-filtered slices inside convert_alignment.
pd.options.mode.chained_assignment = None

# Directory where the generated zip archives are written and served from.
# Created eagerly at import time; a background thread cleans it later.
OUTPUT_DIR = "./alignments"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def delete_output_dir(delay=120):
    """Remove generated files from OUTPUT_DIR after *delay* seconds.

    Runs in a background thread to give Gradio time to finish serving the
    download before the zip file is deleted.

    The directory itself is intentionally kept: it is created only once at
    import time, so removing it (as the previous ``os.rmdir`` call did)
    would make every conversion request after the first cleanup fail when
    it tries to write a new zip into a non-existent directory.

    Args:
        delay: Seconds to sleep before cleaning up (default 120).
    """
    time.sleep(delay)  # let Gradio complete the download first
    for name in os.listdir(OUTPUT_DIR):
        path = os.path.join(OUTPUT_DIR, name)
        # Only files are ever written here; guard anyway so os.remove
        # cannot raise on an unexpected subdirectory.
        if os.path.isfile(path):
            os.remove(path)

def split_comma_if_needed(value):
    """Split a comma-separated string into a list of stripped items.

    A string containing the separator ``", "`` is turned into a list of
    its stripped pieces; any other value (non-strings, or strings without
    the separator) is returned unchanged.
    """
    if not (isinstance(value, str) and ', ' in value):
        return value
    return [part.strip() for part in value.split(', ')]

def _side_to_json(body, prefix):
    """Serialize one side ('source' or 'target') of the alignment body.

    Selects the columns of *body* whose names contain *prefix*, renames
    them to the generic UNIC field names, applies comma-splitting to each
    cell, and returns the resulting ``{"sentences": [...]}`` JSON string.
    """
    # .copy() makes this an independent frame, so the column assignment
    # below is a plain write instead of a chained assignment on a view.
    side = body[[col for col in body.columns if prefix in col]].copy()
    # 1-based, zero-padded sentence ids: "001", "002", ...
    side['id'] = ["{0:03}".format(index + 1) for index in side.index]
    side = side.rename(
        {
            f'{prefix}_text': 'full_text',
            f'{prefix}_start': 'start',
            f'{prefix}_duration': 'duration',
        },
        axis=1,
    )
    transformed = side.map(split_comma_if_needed)
    return json.dumps(
        {'sentences': transformed.to_dict(orient='records')},
        ensure_ascii=False,
        indent=4,
    )


def convert_alignment(files):
    """Convert uploaded .xlsx alignment spreadsheets into a zip of JSONs.

    For each spreadsheet, reads a 3-row metadata header and the alignment
    body (starting after 8 header/comment rows, '*' marks comment cells),
    writes one JSON file per side named after the header's
    ``source_text_id`` / ``target_text_id``, bundles all JSON files into a
    uniquely named zip under OUTPUT_DIR, and returns the zip path for
    Gradio to serve. A background thread deletes the zip later.

    Args:
        files: Iterable of uploaded .xlsx file paths/objects.

    Returns:
        Path to the generated zip archive.
    """
    zip_path = os.path.join(OUTPUT_DIR, f"alignments_{uuid.uuid4()}.zip")

    with TemporaryDirectory() as tmpdir:
        json_file_paths = []

        for file in files:
            # Header: first 3 rows hold metadata such as the text ids.
            # NOTE(review): the to_json(lines=True) + json.loads round-trip
            # only works if exactly one header record survives the comment
            # filter; more than one row would make json.loads raise.
            header = pd.read_excel(file, sheet_name=0, nrows=3, comment='*')
            header = json.loads(header.to_json(orient="records", lines=True))

            # Body: alignment rows start after 8 header/comment rows.
            body = pd.read_excel(file, sheet_name=0, skiprows=8, comment='*')

            for prefix in ('source', 'target'):
                # Write intermediates into tmpdir so TemporaryDirectory
                # cleans them up automatically (no manual os.remove pass).
                json_path = os.path.join(
                    tmpdir, header[f'{prefix}_text_id'] + '.json')
                with open(json_path, "w", encoding="utf-8") as json_file:
                    json_file.write(_side_to_json(body, prefix))
                json_file_paths.append(json_path)

        # Bundle every generated JSON file into the output zip.
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for json_path in json_file_paths:
                zipf.write(json_path, arcname=os.path.basename(json_path))

        # Schedule cleanup of OUTPUT_DIR (removes the zip) after the
        # download window; daemon=True so it never blocks interpreter exit.
        threading.Thread(
            target=delete_output_dir, args=(60,), daemon=True
        ).start()

        return zip_path

# Gradio UI: users upload one or more .xlsx spreadsheets and receive a
# single zip of JSON files back via the output widget's download arrow.
demo = gr.Interface(
    fn=convert_alignment,
    inputs=gr.Files(file_count="multiple", file_types=[".xlsx"]),
    outputs="file",
    fill_width = True, 
    theme="Nymbo/Nymbo_Theme", 
    title="Convert your alignment spreadsheets in .xlsx to a zip of JSON files for the UNIC platform", 
    description="Upload a list of uncompressed spreadsheets, and download by clicking on the down arrow in the output window."
)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()