Spaces:

zyshan-ds
/

CSV2Praat_Auto_Tool

Sleeping

App Files Files Community

Added validation of data format

by ylmmhf - opened Apr 6, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+34

-0

Files changed (1) hide show

app.py +34 -0

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ import zipfile
 from pathlib import Path
 import gradio as gr
 import io
 # ==== Core Functions ====
 def create_interval_data_dict(xmin, xmax, sentence):
@@ -37,7 +39,35 @@ def write_textgrid_file(intervals, output_file_path, total_xmax):
             f.write(f'            xmin = {intervals[-1]["xmax"]}\n')
             f.write(f'            xmax = {intervals[-1]["xmax"]}\n')
             f.write(f'            text = ""\n')
 # ==== Gradio Interface Function ====
 def csv_to_textgrid(file):
     try:
@@ -77,6 +107,9 @@ def csv_to_textgrid(file):
             with open(csv_path, 'r', encoding='utf-8') as csvfile:
                 reader = csv.reader(csvfile)
                 header = next(reader)  # Skip header
                 print(f"Header: {header}")
                 iu_xmin = 0
@@ -87,6 +120,7 @@ def csv_to_textgrid(file):
                 current_file_processed = False
                 for row in reader:
                     if len(row) < 6:
                         print(f"Skipping invalid row: {row}")
                         continue

 from pathlib import Path
 import gradio as gr
 import io
+import re
 # ==== Core Functions ====
 def create_interval_data_dict(xmin, xmax, sentence):
             f.write(f'            xmin = {intervals[-1]["xmax"]}\n')
             f.write(f'            xmax = {intervals[-1]["xmax"]}\n')
             f.write(f'            text = ""\n')
def validate_csv_format(header):
    """Return True when *header* matches the expected CSV column layout.

    The expected layout is six columns: an empty first column name
    (presumably an unnamed index column -- confirm against the exporter),
    followed by ``file_name``, ``xmin``, ``xmax``, ``text`` and
    ``is_unit_start_pred``.

    Surrounding whitespace on every cell and a UTF-8 byte-order mark on the
    first cell are ignored, so a file saved "with BOM" is not rejected for a
    header that is otherwise correct.

    Args:
        header: Sequence of column names, e.g. the first row produced by
            ``csv.reader``.

    Returns:
        bool: True if the normalized header equals the expected columns,
        False otherwise (including for ``None`` or non-iterable input).
    """
    expected_headers = ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']
    if header is None:
        return False
    try:
        cells = [str(cell).strip() for cell in header]
    except TypeError:
        # Not iterable: cannot be a header row.
        return False
    if cells:
        # A BOM, if present, is glued to the very first cell of the file.
        cells[0] = cells[0].lstrip("\ufeff").strip()
    return cells == expected_headers
def validate_row(row):
    """Validate a single CSV data row.

    Columns are read by position: ``row[2]`` is xmin, ``row[3]`` is xmax,
    ``row[4]`` is the text and ``row[5]`` is the ``is_unit_start_pred`` flag.

    Checks, in order:
      1. The row has at least six columns.
      2. xmin and xmax parse as floats and are finite.
      3. xmin is strictly less than xmax.
      4. The stripped text contains only ASCII letters, digits, spaces and
         the punctuation ``, . ! ?``.
      5. The flag is ``true`` or ``false`` (case-insensitive).

    Args:
        row: Sequence of string cells, e.g. one row from ``csv.reader``.

    Returns:
        tuple[bool, str]: ``(True, "")`` when the row is valid, otherwise
        ``(False, reason)`` with a human-readable explanation.
    """
    import math  # local import: keeps this block self-contained

    if len(row) < 6:
        return False, "Row does not have enough columns."
    try:
        xmin = float(row[2])
        xmax = float(row[3])
    except ValueError as e:
        return False, f"Value error: {e}"

    # float() accepts "nan" and "inf"; neither is a usable timestamp, and NaN
    # would slip through an ordering comparison because it compares False.
    if not (math.isfinite(xmin) and math.isfinite(xmax)):
        return False, "xmin and xmax must be finite numbers."

    # Check time consistency
    if xmin >= xmax:
        return False, "xmin must be less than xmax."

    # Check text content: letters, numbers, spaces, and some punctuation
    text = row[4].strip()
    if not re.fullmatch(r"[a-zA-Z0-9 ,.!?]*", text):
        return False, "Text contains invalid characters."

    # The flag was previously computed but never enforced.
    if row[5].strip().lower() not in ("true", "false"):
        return False, "is_unit_start_pred must be 'true' or 'false'."

    return True, ""
 # ==== Gradio Interface Function ====
 def csv_to_textgrid(file):
     try:
             with open(csv_path, 'r', encoding='utf-8') as csvfile:
                 reader = csv.reader(csvfile)
                 header = next(reader)  # Skip header
+                if not validate_csv_format(header):
+                    return None, "Invalid CSV format. Expected headers: , file_name, xmin, xmax, text, is_unit_start_pred"
                 print(f"Header: {header}")
                 iu_xmin = 0
                 current_file_processed = False
                 for row in reader:
+                    valid, message = validate_row(row)
                     if len(row) < 6:
                         print(f"Skipping invalid row: {row}")
                         continue