Added validation of data format

#1
by ylmmhf - opened
Files changed (1) hide show
  1. app.py +34 -0
app.py CHANGED
@@ -5,6 +5,8 @@ import zipfile
5
  from pathlib import Path
6
  import gradio as gr
7
  import io
 
 
8
 
9
  # ==== Core Functions ====
10
  def create_interval_data_dict(xmin, xmax, sentence):
@@ -37,7 +39,35 @@ def write_textgrid_file(intervals, output_file_path, total_xmax):
37
  f.write(f' xmin = {intervals[-1]["xmax"]}\n')
38
  f.write(f' xmax = {intervals[-1]["xmax"]}\n')
39
  f.write(f' text = ""\n')
 
 
 
 
 
 
 
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  # ==== Gradio Interface Function ====
42
  def csv_to_textgrid(file):
43
  try:
@@ -77,6 +107,9 @@ def csv_to_textgrid(file):
77
  with open(csv_path, 'r', encoding='utf-8') as csvfile:
78
  reader = csv.reader(csvfile)
79
  header = next(reader) # Skip header
 
 
 
80
  print(f"Header: {header}")
81
 
82
  iu_xmin = 0
@@ -87,6 +120,7 @@ def csv_to_textgrid(file):
87
  current_file_processed = False
88
 
89
  for row in reader:
 
90
  if len(row) < 6:
91
  print(f"Skipping invalid row: {row}")
92
  continue
 
5
  from pathlib import Path
6
  import gradio as gr
7
  import io
8
+ import re
9
+
10
 
11
  # ==== Core Functions ====
12
  def create_interval_data_dict(xmin, xmax, sentence):
 
39
  f.write(f' xmin = {intervals[-1]["xmax"]}\n')
40
  f.write(f' xmax = {intervals[-1]["xmax"]}\n')
41
  f.write(f' text = ""\n')
42
+
43
def validate_csv_format(header):
    """Return True when *header* matches the expected CSV column layout.

    The expected columns are, in order: an unnamed index column,
    ``file_name``, ``xmin``, ``xmax``, ``text`` and ``is_unit_start_pred``.

    Cells are compared after trimming surrounding whitespace, and a leading
    UTF-8 byte-order mark on the first cell is ignored.  The caller opens the
    file with ``encoding='utf-8'``, which does not strip a BOM, so a file
    exported with a BOM would otherwise be rejected even though its columns
    are correct.

    Args:
        header: The first row produced by ``csv.reader`` (a sequence of
            strings).  An empty or missing header is treated as invalid.

    Returns:
        bool: True if the normalised header equals the expected layout.
    """
    expected_headers = ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']
    if not header:
        return False

    normalized = [str(cell).strip() for cell in header]
    # U+FEFF is not whitespace for str.strip(), so remove it explicitly.
    normalized[0] = normalized[0].lstrip('\ufeff').strip()
    return normalized == expected_headers
46
+
47
def validate_row(row):
    """Validate one CSV data row and report the first problem found.

    Expected column layout (by index): 1 = file name, 2 = xmin, 3 = xmax,
    4 = text, 5 = is_unit_start_pred.  Column 0 is not inspected.

    Checks, in order:
      * the row has at least six columns;
      * ``xmin`` and ``xmax`` parse as finite floats;
      * ``xmin`` is strictly less than ``xmax``;
      * the text contains only ASCII letters, digits, spaces and ``, . ! ?``;
      * ``is_unit_start_pred`` is ``true`` or ``false`` (case-insensitive).

    Args:
        row: A sequence of strings as produced by ``csv.reader``.

    Returns:
        tuple: ``(True, "")`` when the row is valid, otherwise
        ``(False, message)`` where *message* describes the failure.
    """
    import math  # local import: keeps this block independent of file-level imports

    if len(row) < 6:
        return False, "Row does not have enough columns."

    try:
        xmin = float(row[2])
        xmax = float(row[3])
    except ValueError as e:
        return False, f"Value error: {e}"

    # float() accepts "nan"/"inf"; NaN compares False against everything, so
    # it would slip through the ordering check below without this guard.
    if not (math.isfinite(xmin) and math.isfinite(xmax)):
        return False, "xmin and xmax must be finite numbers."

    # Check time consistency
    if xmin >= xmax:
        return False, "xmin must be less than xmax."

    # Check text content: letters, numbers, spaces, and some punctuation
    text = row[4].strip()
    if not re.fullmatch(r"[a-zA-Z0-9 ,.!?]*", text):
        return False, "Text contains invalid characters."

    # The flag must be a recognisable boolean literal; previously this test
    # was evaluated but its result was discarded.
    if row[5].strip().lower() not in ("true", "false"):
        return False, "is_unit_start_pred must be 'true' or 'false'."

    return True, ""
70
+
71
  # ==== Gradio Interface Function ====
72
  def csv_to_textgrid(file):
73
  try:
 
107
  with open(csv_path, 'r', encoding='utf-8') as csvfile:
108
  reader = csv.reader(csvfile)
109
  header = next(reader) # Skip header
110
+ if not validate_csv_format(header):
111
+ return None, "Invalid CSV format. Expected headers: , file_name, xmin, xmax, text, is_unit_start_pred"
112
+
113
  print(f"Header: {header}")
114
 
115
  iu_xmin = 0
 
120
  current_file_processed = False
121
 
122
  for row in reader:
123
+ valid, message = validate_row(row)
124
  if len(row) < 6:
125
  print(f"Skipping invalid row: {row}")
126
  continue