Spaces:

nannanliu
/

UNIC_alignment_conversion

Sleeping

App Files Files Community

nannanliu committed on Nov 7, 2024

Commit

cb420eb

verified ·

1 Parent(s): a430a96

Create app.py

Browse files

Files changed (1) hide show

app.py +99 -0

app.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import zipfile
+import gradio as gr
+import os
+import json
+import pandas as pd
+from pathlib import Path
+from tempfile import TemporaryDirectory
+import uuid
+import threading
+import time
+pd.options.mode.chained_assignment = None
+OUTPUT_DIR = "./alignments"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+def delete_output_dir(delay=120):
+    """Function to delete OUTPUT_DIR after a delay."""
+    time.sleep(delay)  # Give time for Gradio to complete the download
+    for file in os.listdir(OUTPUT_DIR):
+        os.remove(os.path.join(OUTPUT_DIR, file))
+    os.rmdir(OUTPUT_DIR)
+def split_comma_if_needed(value):
+        if isinstance(value, str) and ', ' in value:
+            updated_value = [item.strip() for item in value.split(', ')]
+        else:
+            updated_value = value
+        return updated_value
+def convert_alignment(files):
+    zip_filename = f"alignments_{uuid.uuid4()}.zip"
+    zip_path = os.path.join(OUTPUT_DIR, zip_filename)
+    with TemporaryDirectory() as tmpdir:
+        json_file_paths = []
+        for file in files:
+            # read the header
+            header = pd.read_excel(file, sheet_name=0, nrows=3, comment = '*')
+            header = header.to_json(orient="records", lines=True)
+            header = json.loads(header)
+            # read the body
+            body = pd.read_excel(file, sheet_name=0, skiprows = 8, comment = '*')
+            #json_data = df.to_json(orient="records")  # Convert to JSON
+            source_data = body[[col for col in body.columns if 'source' in col]]
+            source_data.loc[:, 'id'] = ["{0:03}".format(index+1) for index in list(source_data.index)]
+            source_data = source_data.rename({'source_text': 'full_text', 'source_start': 'start', 'source_duration': 'duration'}, axis = 1)
+            source_transformed = source_data.map(split_comma_if_needed)
+            source_result = {}
+            source_result['sentences'] = source_transformed.to_dict(orient='records')
+            source_json_data = json.dumps(source_result, ensure_ascii=False, indent=4)
+            source_json_file_name = header['source_text_id'] + '.json'
+            source_json_file_path = os.path.join(OUTPUT_DIR, source_json_file_name)
+            with open(source_json_file_path, "w") as json_file:
+                json_file.write(source_json_data)
+            target_data = body[[col for col in body.columns if 'target' in col]]
+            target_data.loc[:, 'id'] = ["{0:03}".format(index+1) for index in list(target_data.index)]
+            target_data = target_data.rename({'target_text': 'full_text', 'target_start': 'start', 'target_duration': 'duration'}, axis = 1)
+            target_transformed = target_data.map(split_comma_if_needed)
+            target_result = {}
+            target_result['sentences'] = target_transformed.to_dict(orient='records')
+            target_json_data = json.dumps(target_result, ensure_ascii=False, indent=4)
+            target_json_file_name = header['target_text_id'] + '.json'
+            target_json_file_path = os.path.join(OUTPUT_DIR, target_json_file_name)
+            with open(target_json_file_path, "w") as json_file:
+                json_file.write(target_json_data)
+            json_file_paths.append(source_json_file_path)
+            json_file_paths.append(target_json_file_path)
+        # Create ZIP file
+        with zipfile.ZipFile(zip_path, "w") as zipf:
+            for json_file in json_file_paths:
+                zipf.write(json_file, arcname=os.path.basename(json_file))
+        for json_file in json_file_paths:
+            os.remove(json_file)
+        # Schedule directory cleanup after a delay
+        threading.Thread(target=delete_output_dir, args=(60,)).start()  # Adjust delay as needed
+        return zip_path
+demo = gr.Interface(
+    fn=convert_alignment,
+    inputs=gr.Files(file_count="multiple", file_types=[".xlsx"]),
+    outputs="file",
+    fill_width = True,
+    theme="Nymbo/Nymbo_Theme",
+    title="Convert your alignment spreadsheets in .xlsx to a zip of JSON files for the UNIC platform",
+    description="Upload a list of uncompressed spreadsheets, and download by clicking on the down arrow in the output window."
+)
+if __name__ == "__main__":
+    demo.launch()