nannanliu committed on
Commit
cb420eb
·
verified ·
1 Parent(s): a430a96

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -0
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zipfile
2
+ import gradio as gr
3
+ import os
4
+ import json
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ from tempfile import TemporaryDirectory
8
+ import uuid
9
+ import threading
10
+ import time
11
# Disable pandas' chained-assignment warning for the whole process.
pd.options.mode.chained_assignment = None

# Directory that receives the generated ZIP archives; created at import time.
OUTPUT_DIR = "./alignments"
os.makedirs(OUTPUT_DIR, exist_ok=True)
15
+
16
def delete_output_dir(delay=120, output_dir=None):
    """Empty the output directory after waiting ``delay`` seconds.

    The directory itself is kept: it is created only once, at import time,
    and every later request writes its archive into it, so removing it would
    make the next request fail.

    Args:
        delay: Seconds to sleep before cleaning up, so that a pending
            download can finish first.
        output_dir: Directory to empty. Defaults to the module-level
            ``OUTPUT_DIR`` when ``None``.

    NOTE(review): every regular file in the directory is removed, including
    archives produced by other requests in the meantime.
    """
    time.sleep(delay)  # Give time for Gradio to complete the download
    target_dir = OUTPUT_DIR if output_dir is None else output_dir

    try:
        with os.scandir(target_dir) as scanner:
            entries = list(scanner)
    except FileNotFoundError:
        # Nothing to clean up: the directory is already gone.
        return

    for entry in entries:
        if not entry.is_file():
            continue  # os.remove() cannot delete directories
        try:
            os.remove(entry.path)
        except FileNotFoundError:
            # Another cleanup thread removed this file first; that is fine.
            pass
22
+
23
def split_comma_if_needed(value):
    """Turn a ``", "``-separated string into a list of stripped items.

    Args:
        value: Any cell value.

    Returns:
        A list of the stripped parts when ``value`` is a string containing
        ``", "`` (comma followed by a space); otherwise ``value`` unchanged.
        A comma without a following space does not trigger a split.
    """
    if not isinstance(value, str):
        return value
    if ', ' not in value:
        return value
    return [part.strip() for part in value.split(', ')]
29
+
30
def _read_header_record(file):
    """Return the first metadata row of the workbook's first sheet as a dict."""
    header_frame = pd.read_excel(file, sheet_name=0, nrows=3, comment='*')
    # Round-trip through JSON so the values become plain Python types.
    # A JSON array (rather than JSON lines) still parses when the header
    # area holds more than one row.
    records = json.loads(header_frame.to_json(orient="records"))
    if not records:
        raise ValueError(f"No header row found in {file!r}")
    return records[0]


def _export_side(body, side, text_id, out_dir):
    """Write the ``side`` ("source" or "target") columns of ``body`` as JSON.

    Returns the path of the JSON file created inside ``out_dir``.
    """
    side_columns = [col for col in body.columns if side in col]
    # Work on an explicit copy so adding the "id" column never touches `body`.
    side_frame = body[side_columns].copy()
    side_frame['id'] = ["{0:03}".format(index + 1) for index in side_frame.index]
    side_frame = side_frame.rename(
        {
            f'{side}_text': 'full_text',
            f'{side}_start': 'start',
            f'{side}_duration': 'duration',
        },
        axis=1,
    )
    # Every cell goes through split_comma_if_needed (", "-separated -> list).
    transformed = side_frame.map(split_comma_if_needed)
    payload = {'sentences': transformed.to_dict(orient='records')}
    json_text = json.dumps(payload, ensure_ascii=False, indent=4)

    # The id comes from the uploaded spreadsheet: keep only its final path
    # component so it cannot point outside `out_dir`.
    file_name = f"{Path(str(text_id)).name}.json"
    json_path = os.path.join(out_dir, file_name)
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # not depend on the platform default.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json_file.write(json_text)
    return json_path


def convert_alignment(files):
    """Convert alignment spreadsheets into a ZIP archive of JSON files.

    For each spreadsheet, the first sheet is read twice: the first rows give
    the header record (``source_text_id`` / ``target_text_id``), and the rows
    after the first eight give the alignment body. Two JSON files are
    produced per spreadsheet, ``<source_text_id>.json`` and
    ``<target_text_id>.json``, each of the form ``{"sentences": [...]}``.

    Args:
        files: Iterable of spreadsheets accepted by ``pandas.read_excel``.
            ``None`` is treated as an empty list.

    Returns:
        Path of the ZIP archive written into ``OUTPUT_DIR``.

    Raises:
        KeyError: If a header lacks ``source_text_id`` or ``target_text_id``.
        ValueError: If a spreadsheet has no header row.
    """
    # An earlier cleanup may have removed the directory; recreate it.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    zip_path = os.path.join(OUTPUT_DIR, f"alignments_{uuid.uuid4()}.zip")

    # Intermediate JSON files live in a per-request temporary directory, so
    # concurrent requests cannot overwrite each other's files and nothing
    # needs to be deleted by hand afterwards.
    with TemporaryDirectory() as tmpdir:
        json_file_paths = []

        for file in files or []:
            header = _read_header_record(file)
            body = pd.read_excel(file, sheet_name=0, skiprows=8, comment='*')
            for side in ('source', 'target'):
                json_file_paths.append(
                    _export_side(body, side, header[f'{side}_text_id'], tmpdir)
                )

        # Create ZIP file; dict.fromkeys drops repeated paths but keeps order.
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for json_path in dict.fromkeys(json_file_paths):
                zipf.write(json_path, arcname=os.path.basename(json_path))

    # Schedule cleanup after a delay; daemon=True so a pending cleanup never
    # keeps the process alive at shutdown.
    threading.Thread(target=delete_output_dir, args=(60,), daemon=True).start()

    return zip_path
87
+
88
# Gradio UI: accepts several .xlsx uploads, hands them to convert_alignment,
# and offers the returned file for download.
demo = gr.Interface(
    fn=convert_alignment,
    inputs=gr.Files(file_count="multiple", file_types=[".xlsx"]),
    outputs="file",
    fill_width=True,
    theme="Nymbo/Nymbo_Theme",
    title="Convert your alignment spreadsheets in .xlsx to a zip of JSON files for the UNIC platform",
    description="Upload a list of uncompressed spreadsheets, and download by clicking on the down arrow in the output window.",
)

# Start the app only when the file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()