zyshan-ds committed on
Commit
1306c6f
·
verified ·
1 Parent(s): f68127a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -51
app.py CHANGED
@@ -12,7 +12,7 @@ def create_interval_data_dict(xmin, xmax, sentence):
12
  return {'xmin': float(xmin), 'xmax': float(xmax), 'text': sentence}
13
 
14
  def write_textgrid_file(intervals, output_file_path, total_xmax, tier_name):
15
- with open(output_file_path, 'w') as f:
16
  f.write('File type = "ooTextFile"\n')
17
  f.write('Object class = "TextGrid"\n\n')
18
  f.write('xmin = 0\n')
@@ -46,65 +46,61 @@ def validate_csv_format(header):
46
  ]
47
  return header in valid_headers
48
 
49
-
50
  def validate_row(row, header):
51
  if len(row) < 5:
52
  return False, "Row does not have enough columns."
53
 
54
  if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
55
- # Validate data types
56
- filename = row[1].strip()
57
- xmin = float(row[2])
58
- xmax = float(row[3])
59
- text = row[4].strip()
60
- is_unit_start_pred = row[5].strip().lower() in ["true", "false"]
61
 
62
- # Check time consistency
63
- if xmin >= xmax:
64
- return False, "xmin must be less than xmax."
65
 
66
- # Check text content
67
- if not re.match("^[a-zA-Z0-9 ,.!?]*$", text): # Allow letters, numbers, spaces, and some punctuation
68
- return False, "Text contains invalid characters."
69
 
70
- return True, ""
 
 
71
 
72
  elif header == ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
73
- # Validate data types
74
- filename = row[0].strip()
75
- xmin = float(row[1])
76
- xmax = float(row[2])
77
- text = row[3].strip()
78
- is_unit_start_pred = row[4].strip().lower() in ["true", "false"]
79
 
80
- # Check time consistency
81
- if xmin >= xmax:
82
- return False, "xmin must be less than xmax."
83
 
84
- # Check text content
85
- if not re.match("^[a-zA-Z0-9 ,.!?]*$", text): # Allow letters, numbers, spaces, and some punctuation
86
- return False, "Text contains invalid characters."
87
 
88
- return True, ""
 
 
89
 
90
- else:
91
- return False, "Invalid header format"
92
 
93
  # ==== Gradio Interface Function ====
94
  def csv_to_textgrid(file, tier_name=""):
95
  try:
96
- # Create temporary directory
97
  temp_dir = tempfile.mkdtemp()
98
  csv_path = os.path.join(temp_dir, "input.csv")
99
 
100
- # Handle different file object types
101
  if isinstance(file, str):
102
- with open(file, 'r') as f:
103
  file_content = f.read()
104
  else:
105
  try:
106
  if hasattr(file, 'read'):
107
- file_content = file.read().decode('utf-8') # Decode if bytes
108
  else:
109
  file_content = str(file)
110
  except Exception as e:
@@ -118,12 +114,11 @@ def csv_to_textgrid(file, tier_name=""):
118
  output_directory = os.path.join(temp_dir, "textgrids")
119
  os.makedirs(output_directory, exist_ok=True)
120
 
121
- # Process the CSV file
122
  processed_files = []
123
  try:
124
  with open(csv_path, 'r', encoding='utf-8') as csvfile:
125
  reader = csv.reader(csvfile)
126
- header = next(reader) # Skip header
127
  if not validate_csv_format(header):
128
  return None, "Invalid CSV format. Expected headers: file_name, xmin, xmax, text, is_unit_start_pred"
129
 
@@ -143,14 +138,13 @@ def csv_to_textgrid(file, tier_name=""):
143
  continue
144
 
145
  try:
146
- # Determine index positions based on header format
147
  if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
148
  filename_idx = 1
149
  xmin_idx = 2
150
  xmax_idx = 3
151
  text_idx = 4
152
  is_unit_start_idx = 5
153
- else: # ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']
154
  filename_idx = 0
155
  xmin_idx = 1
156
  xmax_idx = 2
@@ -174,7 +168,6 @@ def csv_to_textgrid(file, tier_name=""):
174
  print(f"Error processing row {row}: {e}")
175
  continue
176
 
177
- # Handle file transition
178
  if prev_filename is not None and prev_filename != filename:
179
  if words:
180
  intervals.append(create_interval_data_dict(iu_xmin, iu_xmax, ' '.join(words)))
@@ -206,7 +199,6 @@ def csv_to_textgrid(file, tier_name=""):
206
  words.append(text)
207
  iu_xmax = xmax
208
 
209
- # Process the last file
210
  if not current_file_processed and prev_filename:
211
  if words:
212
  intervals.append(create_interval_data_dict(iu_xmin, iu_xmax, ' '.join(words)))
@@ -221,7 +213,6 @@ def csv_to_textgrid(file, tier_name=""):
221
  print(f"Error processing CSV: {e}")
222
  return None, f"Error processing CSV: {e}"
223
 
224
- # Create zip file
225
  if processed_files:
226
  zip_path = os.path.join(temp_dir, "textgrids.zip")
227
  with zipfile.ZipFile(zip_path, 'w') as zipf:
@@ -246,25 +237,25 @@ csv_format_instruction = """
246
  The first row is the header. Each subsequent row should contain:\n
247
  With index: `, file_name, xmin, xmax, text, is_unit_start_pred`\n
248
  Without index: `file_name, xmin, xmax, text, is_unit_start_pred`\n\n
249
- - `file_name`: Identifier for the audio file (used to group intervals).
250
- - `xmin`: Start time of the segment (in seconds).
251
- - `xmax`: End time of the segment (in seconds).
252
- - `text`: The actual spoken word or phrase.
253
  - `is_unit_start_pred`: Marks the beginning of a new unit (TRUE/FALSE).\n
254
  **Please enter your preferred tier name in the space below.**\n
255
- Example (with index, works the same without index）:\n
256
  | | file_name | xmin | xmax | text | is_unit_start_pred |
257
  |-|-----------|--------|--------|-------|---------------------|
258
- |0| example1 | 20.42 | 20.74 | mhmm | TRUE |
259
- |1| example1 | 20.74 | 20.81 | hello | TRUE |
260
- |2| example1 | 20.81 | 20.92 | world | FALSE |
261
  """
262
 
263
  iface = gr.Interface(
264
  fn=csv_to_textgrid,
265
  inputs=[
266
  gr.File(label="📁 Upload CSV File", file_types=[".csv"]),
267
- gr.Textbox(label="📝 Enter Tier Name", placeholder="Enter the name of the tier") # New input for tier name
268
  ],
269
  outputs=[
270
  gr.File(label="📦 Download TextGrid ZIP"),
@@ -275,4 +266,4 @@ iface = gr.Interface(
275
  )
276
 
277
  if __name__ == "__main__":
278
- iface.launch()
 
12
  return {'xmin': float(xmin), 'xmax': float(xmax), 'text': sentence}
13
 
14
  def write_textgrid_file(intervals, output_file_path, total_xmax, tier_name):
15
+ with open(output_file_path, 'w', encoding='utf-8') as f:
16
  f.write('File type = "ooTextFile"\n')
17
  f.write('Object class = "TextGrid"\n\n')
18
  f.write('xmin = 0\n')
 
46
  ]
47
  return header in valid_headers
48
 
 
def _text_uses_allowed_characters(text):
    """Return True if every character of ``text`` is a letter, number,
    punctuation mark or space separator.

    These are the Unicode general categories L*, N*, P* and Zs. The check
    is done with ``unicodedata`` because the standard-library ``re`` module
    has no property-class escapes and raises ``re.error`` when a pattern
    contains them. An empty string is accepted.
    """
    # Local import so this block does not depend on the file's import header.
    import unicodedata

    for char in text:
        category = unicodedata.category(char)
        if category[0] not in "LNP" and category != "Zs":
            return False
    return True


def validate_row(row, header):
    """Validate one CSV data row against the detected header layout.

    Args:
        row: List of string cells for one CSV record.
        header: The CSV header row; either the five expected column names,
            or the same names preceded by an empty index column.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is ``""`` when the row
        is valid, otherwise a short description of the first problem found.
    """
    if len(row) < 5:
        return False, "Row does not have enough columns."

    # The indexed layout shifts every data column one position to the right.
    if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
        offset = 1
    elif header == ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
        offset = 0
    else:
        return False, "Invalid header format."

    # A 5-cell row under the indexed layout would otherwise hit IndexError
    # when the last column is read.
    if len(row) < offset + 5:
        return False, "Row does not have enough columns."

    try:
        xmin = float(row[offset + 1])
        xmax = float(row[offset + 2])
    except ValueError:
        return False, "Data format error (possibly number conversion failed)."

    if xmin >= xmax:
        return False, "xmin must be less than xmax."

    text = row[offset + 3].strip()
    if not _text_uses_allowed_characters(text):
        return False, "Text contains invalid characters."

    # NOTE: the is_unit_start_pred cell is not validated here; the previous
    # code computed a true/false membership flag for it but never used it.
    return True, ""
 
90
 
91
  # ==== Gradio Interface Function ====
92
  def csv_to_textgrid(file, tier_name=""):
93
  try:
 
94
  temp_dir = tempfile.mkdtemp()
95
  csv_path = os.path.join(temp_dir, "input.csv")
96
 
 
97
  if isinstance(file, str):
98
+ with open(file, 'r', encoding='utf-8') as f:
99
  file_content = f.read()
100
  else:
101
  try:
102
  if hasattr(file, 'read'):
103
+ file_content = file.read().decode('utf-8', errors='replace')
104
  else:
105
  file_content = str(file)
106
  except Exception as e:
 
114
  output_directory = os.path.join(temp_dir, "textgrids")
115
  os.makedirs(output_directory, exist_ok=True)
116
 
 
117
  processed_files = []
118
  try:
119
  with open(csv_path, 'r', encoding='utf-8') as csvfile:
120
  reader = csv.reader(csvfile)
121
+ header = next(reader)
122
  if not validate_csv_format(header):
123
  return None, "Invalid CSV format. Expected headers: file_name, xmin, xmax, text, is_unit_start_pred"
124
 
 
138
  continue
139
 
140
  try:
 
141
  if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
142
  filename_idx = 1
143
  xmin_idx = 2
144
  xmax_idx = 3
145
  text_idx = 4
146
  is_unit_start_idx = 5
147
+ else:
148
  filename_idx = 0
149
  xmin_idx = 1
150
  xmax_idx = 2
 
168
  print(f"Error processing row {row}: {e}")
169
  continue
170
 
 
171
  if prev_filename is not None and prev_filename != filename:
172
  if words:
173
  intervals.append(create_interval_data_dict(iu_xmin, iu_xmax, ' '.join(words)))
 
199
  words.append(text)
200
  iu_xmax = xmax
201
 
 
202
  if not current_file_processed and prev_filename:
203
  if words:
204
  intervals.append(create_interval_data_dict(iu_xmin, iu_xmax, ' '.join(words)))
 
213
  print(f"Error processing CSV: {e}")
214
  return None, f"Error processing CSV: {e}"
215
 
 
216
  if processed_files:
217
  zip_path = os.path.join(temp_dir, "textgrids.zip")
218
  with zipfile.ZipFile(zip_path, 'w') as zipf:
 
237
  The first row is the header. Each subsequent row should contain:\n
238
  With index: `, file_name, xmin, xmax, text, is_unit_start_pred`\n
239
  Without index: `file_name, xmin, xmax, text, is_unit_start_pred`\n\n
240
+ - `file_name`: Identifier for the audio file (used to group intervals).\n
241
+ - `xmin`: Start time of the segment (in seconds).\n
242
+ - `xmax`: End time of the segment (in seconds).\n
243
+ - `text`: The actual spoken word or phrase (supports multiple languages).\n
244
  - `is_unit_start_pred`: Marks the beginning of a new unit (TRUE/FALSE).\n
245
  **Please enter your preferred tier name in the space below.**\n
246
+ Example (with index, works the same without index):\n
247
  | | file_name | xmin | xmax | text | is_unit_start_pred |
248
  |-|-----------|--------|--------|-------|---------------------|
249
+ |0| example1 | 20.42 | 20.74 | Hello | TRUE |
250
+ |1| example1 | 20.74 | 20.81 | World | TRUE |
251
+ |2| example1 | 20.81 | 20.92 | ! | FALSE |
252
  """
253
 
254
  iface = gr.Interface(
255
  fn=csv_to_textgrid,
256
  inputs=[
257
  gr.File(label="📁 Upload CSV File", file_types=[".csv"]),
258
+ gr.Textbox(label="📝 Enter Tier Name", placeholder="Enter the tier name")
259
  ],
260
  outputs=[
261
  gr.File(label="📦 Download TextGrid ZIP"),
 
266
  )
267
 
268
  if __name__ == "__main__":
269
+ iface.launch()