re-debug last interval issue

#6
by ylmmhf - opened
Files changed (1) hide show
  1. app.py +79 -105
app.py CHANGED
@@ -7,6 +7,7 @@ import gradio as gr
7
  import io
8
  import re
9
 
 
10
  # ==== Core Functions ====
11
  def create_interval_data_dict(xmin, xmax, sentence):
12
  return {'xmin': float(xmin), 'xmax': float(xmax), 'text': sentence}
@@ -32,90 +33,76 @@ def write_textgrid_file(intervals, output_file_path, total_xmax, tier_name):
32
  f.write(f' xmin = {interval["xmin"]}\n')
33
  f.write(f' xmax = {interval["xmax"]}\n')
34
  f.write(f' text = "{interval["text"]}"\n')
35
-
36
-
37
  def validate_csv_format(header):
38
- valid_headers = [
39
- ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred'],
40
- ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']
41
- ]
42
- return header in valid_headers
43
 
44
- def validate_row(row, header):
45
- if len(row) < 5:
46
  return False, "Row does not have enough columns."
47
 
48
- if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
49
- try:
50
- filename = row[1].strip()
51
- xmin = float(row[2])
52
- xmax = float(row[3])
53
- text = row[4].strip()
54
- is_unit_start_pred = row[5].strip().lower() in ["true", "false"]
55
-
56
- if xmin >= xmax:
57
- return False, "xmin must be less than xmax."
58
-
59
- if not re.match(r'^[\p{L}\p{N}\p{P}\p{Zs}]*$', text, re.UNICODE):
60
- return False, "Text contains invalid characters."
61
-
62
- return True, ""
63
- except ValueError:
64
- return False, "Data format error (possibly number conversion failed)."
65
-
66
- elif header == ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
67
- try:
68
- filename = row[0].strip()
69
- xmin = float(row[1])
70
- xmax = float(row[2])
71
- text = row[3].strip()
72
- is_unit_start_pred = row[4].strip().lower() in ["true", "false"]
73
-
74
- if xmin >= xmax:
75
- return False, "xmin must be less than xmax."
76
-
77
- if not re.match(r'^[\p{L}\p{N}\p{P}\p{Zs}]*$', text, re.UNICODE):
78
- return False, "Text contains invalid characters."
79
-
80
- return True, ""
81
- except ValueError:
82
- return False, "Data format error (possibly number conversion failed)."
83
-
84
- return False, "Invalid header format."
85
-
86
  # ==== Gradio Interface Function ====
87
- def csv_to_textgrid(file, tier_name=""):
88
  try:
 
89
  temp_dir = tempfile.mkdtemp()
90
  csv_path = os.path.join(temp_dir, "input.csv")
91
 
92
- if isinstance(file, str):
93
- with open(file, 'r', encoding='utf-8') as f:
94
- file_content = f.read()
95
- else:
96
- try:
97
- if hasattr(file, 'read'):
98
- file_content = file.read().decode('utf-8', errors='replace')
99
- else:
 
 
 
 
 
100
  file_content = str(file)
101
- except Exception as e:
102
- print(f"Error reading file: {e}")
103
- return None, f"Error reading file: {e}"
 
 
 
104
 
105
- with open(csv_path, 'w', encoding='utf-8') as f:
106
- f.write(file_content)
107
  print(f"CSV file written to {csv_path}")
108
 
109
  output_directory = os.path.join(temp_dir, "textgrids")
110
  os.makedirs(output_directory, exist_ok=True)
111
 
 
112
  processed_files = []
113
  try:
114
  with open(csv_path, 'r', encoding='utf-8') as csvfile:
115
  reader = csv.reader(csvfile)
116
- header = next(reader)
117
  if not validate_csv_format(header):
118
- return None, "Invalid CSV format. Expected headers: file_name, xmin, xmax, text, is_unit_start_pred"
119
 
120
  print(f"Header: {header}")
121
 
@@ -127,38 +114,25 @@ def csv_to_textgrid(file, tier_name=""):
127
  current_file_processed = False
128
 
129
  for row in reader:
130
- valid, message = validate_row(row, header)
131
- if not valid:
132
- print(f"Skipping invalid row: {row} - {message}")
133
  continue
134
-
135
  try:
136
- if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
137
- filename_idx = 1
138
- xmin_idx = 2
139
- xmax_idx = 3
140
- text_idx = 4
141
- is_unit_start_idx = 5
142
- else:
143
- filename_idx = 0
144
- xmin_idx = 1
145
- xmax_idx = 2
146
- text_idx = 3
147
- is_unit_start_idx = 4
148
-
149
- filename = row[filename_idx].strip() if len(row) > filename_idx and row[filename_idx].strip() else None
150
  if not filename:
151
  print(f"Skipping row with no filename: {row}")
152
  continue
153
-
154
- xmin = float(row[xmin_idx]) if row[xmin_idx].strip() else 0
155
- xmax = float(row[xmax_idx]) if row[xmax_idx].strip() else 0
156
- text = row[text_idx].strip() if len(row) > text_idx else ""
157
- is_unit_start_pred_str = row[is_unit_start_idx].strip().lower() if len(row) > is_unit_start_idx else "false"
158
  is_unit_start_pred = is_unit_start_pred_str == "true"
159
-
160
  print(f"Processing: {filename}, {xmin}, {xmax}, {text}, {is_unit_start_pred}")
161
-
162
  except (ValueError, IndexError) as e:
163
  print(f"Error processing row {row}: {e}")
164
  continue
@@ -224,6 +198,7 @@ def csv_to_textgrid(file, tier_name=""):
224
  print(f"Error processing CSV: {e}")
225
  return None, f"Error processing CSV: {e}"
226
 
 
227
  if processed_files:
228
  zip_path = os.path.join(temp_dir, "textgrids.zip")
229
  with zipfile.ZipFile(zip_path, 'w') as zipf:
@@ -243,30 +218,29 @@ def csv_to_textgrid(file, tier_name=""):
243
  return None, f"Error: {str(e)}"
244
 
245
  # ==== Gradio Interface Setup ====
246
- csv_format_instruction = """
247
- **Expected CSV format:**\n
248
  The first row is the header. Each subsequent row should contain:\n
249
- With index: `, file_name, xmin, xmax, text, is_unit_start_pred`\n
250
- Without index: `file_name, xmin, xmax, text, is_unit_start_pred`\n\n
251
- - `file_name`: Identifier for the audio file (used to group intervals).\n
252
- - `xmin`: Start time of the segment (in seconds).\n
253
- - `xmax`: End time of the segment (in seconds).\n
254
- - `text`: The actual spoken word or phrase (supports multiple languages).\n
255
- - `is_unit_start_pred`: Marks the beginning of a new unit (TRUE/FALSE).\n
256
- **Please enter your preferred tier name in the space below.**\n
257
- Example (with index, works the same without index):\n
258
- | | file_name | xmin | xmax | text | is_unit_start_pred |
259
- |-|-----------|--------|--------|-------|---------------------|
260
- |0| example1 | 20.42 | 20.74 | Hello | TRUE |
261
- |1| example1 | 20.74 | 20.81 | World | TRUE |
262
- |2| example1 | 20.81 | 20.92 | ! | FALSE |
263
  """
264
 
265
  iface = gr.Interface(
266
  fn=csv_to_textgrid,
267
  inputs=[
268
  gr.File(label="📁 Upload CSV File", file_types=[".csv"]),
269
- gr.Textbox(label="📝 Enter Tier Name", placeholder="Enter the tier name")
270
  ],
271
  outputs=[
272
  gr.File(label="📦 Download TextGrid ZIP"),
 
7
  import io
8
  import re
9
 
10
+
11
  # ==== Core Functions ====
12
  def create_interval_data_dict(xmin, xmax, sentence):
13
  return {'xmin': float(xmin), 'xmax': float(xmax), 'text': sentence}
 
33
  f.write(f' xmin = {interval["xmin"]}\n')
34
  f.write(f' xmax = {interval["xmax"]}\n')
35
  f.write(f' text = "{interval["text"]}"\n')
36
+
 
37
  def validate_csv_format(header):
38
+ expected_headers = ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']
39
+ return header == expected_headers
 
 
 
40
 
41
+ def validate_row(row):
42
+ if len(row) < 6:
43
  return False, "Row does not have enough columns."
44
 
45
+ try:
46
+ # Validate data types
47
+ filename = row[1].strip()
48
+ xmin = float(row[2])
49
+ xmax = float(row[3])
50
+ text = row[4].strip()
51
+ is_unit_start_pred = row[5].strip().lower() in ["true", "false"]
52
+
53
+ # Check time consistency
54
+ if xmin >= xmax:
55
+ return False, "xmin must be less than xmax."
56
+
57
+ # Check text content
58
+ if not re.match("^[a-zA-Z0-9 ,.!?]*$", text): # Allow letters, numbers, spaces, and some punctuation
59
+ return False, "Text contains invalid characters."
60
+
61
+ return True, ""
62
+ except ValueError as e:
63
+ return False, f"Value error: {e}"
64
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  # ==== Gradio Interface Function ====
66
+ def csv_to_textgrid(file, tier_name="generated_tier"):
67
  try:
68
+ # Create temporary directory
69
  temp_dir = tempfile.mkdtemp()
70
  csv_path = os.path.join(temp_dir, "input.csv")
71
 
72
+ # Handle different file object types
73
+ if hasattr(file, 'name'):
74
+ if isinstance(file, str):
75
+ with open(file, 'r') as f:
76
+ file_content = f.read()
77
+ else:
78
+ try:
79
+ if hasattr(file, 'read'):
80
+ file_content = file.read()
81
+ else:
82
+ file_content = file.decode('utf-8') if isinstance(file, bytes) else str(file)
83
+ except Exception as e:
84
+ print(f"Error reading file: {e}")
85
  file_content = str(file)
86
+
87
+ with open(csv_path, 'w', encoding='utf-8') as f:
88
+ f.write(file_content)
89
+ else:
90
+ with open(csv_path, 'w', encoding='utf-8') as f:
91
+ f.write(str(file))
92
 
 
 
93
  print(f"CSV file written to {csv_path}")
94
 
95
  output_directory = os.path.join(temp_dir, "textgrids")
96
  os.makedirs(output_directory, exist_ok=True)
97
 
98
+ # Process the CSV file
99
  processed_files = []
100
  try:
101
  with open(csv_path, 'r', encoding='utf-8') as csvfile:
102
  reader = csv.reader(csvfile)
103
+ header = next(reader) # Skip header
104
  if not validate_csv_format(header):
105
+ return None, "Invalid CSV format. Expected headers: , file_name, xmin, xmax, text, is_unit_start_pred"
106
 
107
  print(f"Header: {header}")
108
 
 
114
  current_file_processed = False
115
 
116
  for row in reader:
117
+ valid, message = validate_row(row)
118
+ if len(row) < 6:
119
+ print(f"Skipping invalid row: {row}")
120
  continue
121
+
122
  try:
123
+ filename = row[1].strip() if len(row) > 1 and row[1].strip() else None
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  if not filename:
125
  print(f"Skipping row with no filename: {row}")
126
  continue
127
+
128
+ xmin = float(row[2]) if row[2].strip() else 0
129
+ xmax = float(row[3]) if row[3].strip() else 0
130
+ text = row[4].strip() if len(row) > 4 else ""
131
+ is_unit_start_pred_str = row[5].strip().lower() if len(row) > 5 else "false"
132
  is_unit_start_pred = is_unit_start_pred_str == "true"
133
+
134
  print(f"Processing: {filename}, {xmin}, {xmax}, {text}, {is_unit_start_pred}")
135
+
136
  except (ValueError, IndexError) as e:
137
  print(f"Error processing row {row}: {e}")
138
  continue
 
198
  print(f"Error processing CSV: {e}")
199
  return None, f"Error processing CSV: {e}"
200
 
201
+ # Create zip file
202
  if processed_files:
203
  zip_path = os.path.join(temp_dir, "textgrids.zip")
204
  with zipfile.ZipFile(zip_path, 'w') as zipf:
 
218
  return None, f"Error: {str(e)}"
219
 
220
  # ==== Gradio Interface Setup ====
221
+ csv_format_instruction = """**Expected CSV format:**
 
222
  The first row is the header. Each subsequent row should contain:\n
223
+ `file_name, xmin, xmax, text, is_unit_start_pred`
224
+ Each row represents a word or segment in an audio file.
225
+ - `file_name`: Identifier for the audio file (used to group intervals).
226
+ - `xmin`: Start time of the segment (in seconds).
227
+ - `xmax`: End time of the segment (in seconds).
228
+ - `text`: The actual spoken word or phrase.
229
+ - `is_unit_start_pred`: Marks the beginning of a new unit (TRUE/FALSE).
230
+ **Please enter the tier name according to your preference or as deemed appropriate for the data.**
231
+ Example:\n
232
+ | file_name | xmin | xmax | text | is_unit_start_pred |
233
+ |-----------|--------|--------|-------|---------------------|
234
+ | example1 | 20.42 | 20.74 | mhmm | TRUE |
235
+ | example1 | 20.74 | 20.81 | hello | TRUE |
236
+ | example1 | 20.81 | 20.92 | world | FALSE |
237
  """
238
 
239
  iface = gr.Interface(
240
  fn=csv_to_textgrid,
241
  inputs=[
242
  gr.File(label="📁 Upload CSV File", file_types=[".csv"]),
243
+ gr.Textbox(label="📝 Enter Tier Name", placeholder="Enter the name of the tier") # New input for tier name
244
  ],
245
  outputs=[
246
  gr.File(label="📦 Download TextGrid ZIP"),