Spaces:

zyshan-ds
/

CSV2Praat_Auto_Tool

Sleeping

App Files Files Community

zyshan-ds committed on Apr 8, 2025

Commit

1306c6f

verified ·

1 Parent(s): f68127a

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -51

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ def create_interval_data_dict(xmin, xmax, sentence):
     return {'xmin': float(xmin), 'xmax': float(xmax), 'text': sentence}
 def write_textgrid_file(intervals, output_file_path, total_xmax, tier_name):
-    with open(output_file_path, 'w') as f:
         f.write('File type = "ooTextFile"\n')
         f.write('Object class = "TextGrid"\n\n')
         f.write('xmin = 0\n')
@@ -46,65 +46,61 @@ def validate_csv_format(header):
     ]
     return header in valid_headers
 def validate_row(row, header):
     if len(row) < 5:
         return False, "Row does not have enough columns."
     if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
-        # Validate data types
-        filename = row[1].strip()
-        xmin = float(row[2])
-        xmax = float(row[3])
-        text = row[4].strip()
-        is_unit_start_pred = row[5].strip().lower() in ["true", "false"]
-        # Check time consistency
-        if xmin >= xmax:
-            return False, "xmin must be less than xmax."
-        # Check text content
-        if not re.match("^[a-zA-Z0-9 ,.!?]*$", text):  # Allow letters, numbers, spaces, and some punctuation
-            return False, "Text contains invalid characters."
-        return True, ""
     elif header == ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
-        # Validate data types
-        filename = row[0].strip()
-        xmin = float(row[1])
-        xmax = float(row[2])
-        text = row[3].strip()
-        is_unit_start_pred = row[4].strip().lower() in ["true", "false"]
-        # Check time consistency
-        if xmin >= xmax:
-            return False, "xmin must be less than xmax."
-        # Check text content
-        if not re.match("^[a-zA-Z0-9 ,.!?]*$", text):  # Allow letters, numbers, spaces, and some punctuation
-            return False, "Text contains invalid characters."
-        return True, ""
-    else:
-        return False, "Invalid header format"
 # ==== Gradio Interface Function ====
 def csv_to_textgrid(file, tier_name=""):
     try:
-        # Create temporary directory
         temp_dir = tempfile.mkdtemp()
         csv_path = os.path.join(temp_dir, "input.csv")
-        # Handle different file object types
         if isinstance(file, str):
-            with open(file, 'r') as f:
                 file_content = f.read()
         else:
             try:
                 if hasattr(file, 'read'):
-                    file_content = file.read().decode('utf-8')  # Decode if bytes
                 else:
                     file_content = str(file)
             except Exception as e:
@@ -118,12 +114,11 @@ def csv_to_textgrid(file, tier_name=""):
         output_directory = os.path.join(temp_dir, "textgrids")
         os.makedirs(output_directory, exist_ok=True)
-        # Process the CSV file
         processed_files = []
         try:
             with open(csv_path, 'r', encoding='utf-8') as csvfile:
                 reader = csv.reader(csvfile)
-                header = next(reader)  # Skip header
                 if not validate_csv_format(header):
                     return None, "Invalid CSV format. Expected headers: file_name, xmin, xmax, text, is_unit_start_pred"
@@ -143,14 +138,13 @@ def csv_to_textgrid(file, tier_name=""):
                         continue
                     try:
-                        # Determine index positions based on header format
                         if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
                             filename_idx = 1
                             xmin_idx = 2
                             xmax_idx = 3
                             text_idx = 4
                             is_unit_start_idx = 5
-                        else:  # ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']
                             filename_idx = 0
                             xmin_idx = 1
                             xmax_idx = 2
@@ -174,7 +168,6 @@ def csv_to_textgrid(file, tier_name=""):
                         print(f"Error processing row {row}: {e}")
                         continue
-                    # Handle file transition
                     if prev_filename is not None and prev_filename != filename:
                         if words:
                             intervals.append(create_interval_data_dict(iu_xmin, iu_xmax, ' '.join(words)))
@@ -206,7 +199,6 @@ def csv_to_textgrid(file, tier_name=""):
                     words.append(text)
                     iu_xmax = xmax
-                # Process the last file
                 if not current_file_processed and prev_filename:
                     if words:
                         intervals.append(create_interval_data_dict(iu_xmin, iu_xmax, ' '.join(words)))
@@ -221,7 +213,6 @@ def csv_to_textgrid(file, tier_name=""):
             print(f"Error processing CSV: {e}")
             return None, f"Error processing CSV: {e}"
-        # Create zip file
         if processed_files:
             zip_path = os.path.join(temp_dir, "textgrids.zip")
             with zipfile.ZipFile(zip_path, 'w') as zipf:
@@ -246,25 +237,25 @@ csv_format_instruction = """
 The first row is the header. Each subsequent row should contain:\n
 With index: `, file_name, xmin, xmax, text, is_unit_start_pred`\n
 Without index: `file_name, xmin, xmax, text, is_unit_start_pred`\n\n
-- `file_name`: Identifier for the audio file (used to group intervals).
-- `xmin`: Start time of the segment (in seconds).
-- `xmax`: End time of the segment (in seconds).
-- `text`: The actual spoken word or phrase.
 - `is_unit_start_pred`: Marks the beginning of a new unit (TRUE/FALSE).\n
 **Please enter your preferred tier name in the space below.**\n
-Example （with index, works the same without index）:\n
 | | file_name | xmin   | xmax   | text  | is_unit_start_pred |
 |-|-----------|--------|--------|-------|---------------------|
-|0| example1  | 20.42  | 20.74  | mhmm  | TRUE                |
-|1| example1  | 20.74  | 20.81  | hello | TRUE                |
-|2| example1  | 20.81  | 20.92  | world | FALSE               |
 """
 iface = gr.Interface(
     fn=csv_to_textgrid,
     inputs=[
         gr.File(label="📁 Upload CSV File", file_types=[".csv"]),
-        gr.Textbox(label="📝 Enter Tier Name", placeholder="Enter the name of the tier")  # New input for tier name
     ],
     outputs=[
         gr.File(label="📦 Download TextGrid ZIP"),
@@ -275,4 +266,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch()

     return {'xmin': float(xmin), 'xmax': float(xmax), 'text': sentence}
 def write_textgrid_file(intervals, output_file_path, total_xmax, tier_name):
+    with open(output_file_path, 'w', encoding='utf-8') as f:
         f.write('File type = "ooTextFile"\n')
         f.write('Object class = "TextGrid"\n\n')
         f.write('xmin = 0\n')
     ]
     return header in valid_headers
 def validate_row(row, header):
     if len(row) < 5:
         return False, "Row does not have enough columns."
     if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
+        try:
+            filename = row[1].strip()
+            xmin = float(row[2])
+            xmax = float(row[3])
+            text = row[4].strip()
+            is_unit_start_pred = row[5].strip().lower() in ["true", "false"]
+            if xmin >= xmax:
+                return False, "xmin must be less than xmax."
+            if not re.match(r'^[\p{L}\p{N}\p{P}\p{Zs}]*$', text, re.UNICODE):
+                return False, "Text contains invalid characters."
+            return True, ""
+        except ValueError:
+            return False, "Data format error (possibly number conversion failed)."
     elif header == ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
+        try:
+            filename = row[0].strip()
+            xmin = float(row[1])
+            xmax = float(row[2])
+            text = row[3].strip()
+            is_unit_start_pred = row[4].strip().lower() in ["true", "false"]
+            if xmin >= xmax:
+                return False, "xmin must be less than xmax."
+            if not re.match(r'^[\p{L}\p{N}\p{P}\p{Zs}]*$', text, re.UNICODE):
+                return False, "Text contains invalid characters."
+            return True, ""
+        except ValueError:
+            return False, "Data format error (possibly number conversion failed)."
+    return False, "Invalid header format."
 # ==== Gradio Interface Function ====
 def csv_to_textgrid(file, tier_name=""):
     try:
         temp_dir = tempfile.mkdtemp()
         csv_path = os.path.join(temp_dir, "input.csv")
         if isinstance(file, str):
+            with open(file, 'r', encoding='utf-8') as f:
                 file_content = f.read()
         else:
             try:
                 if hasattr(file, 'read'):
+                    file_content = file.read().decode('utf-8', errors='replace')
                 else:
                     file_content = str(file)
             except Exception as e:
         output_directory = os.path.join(temp_dir, "textgrids")
         os.makedirs(output_directory, exist_ok=True)
         processed_files = []
         try:
             with open(csv_path, 'r', encoding='utf-8') as csvfile:
                 reader = csv.reader(csvfile)
+                header = next(reader)
                 if not validate_csv_format(header):
                     return None, "Invalid CSV format. Expected headers: file_name, xmin, xmax, text, is_unit_start_pred"
                         continue
                     try:
                         if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
                             filename_idx = 1
                             xmin_idx = 2
                             xmax_idx = 3
                             text_idx = 4
                             is_unit_start_idx = 5
+                        else:
                             filename_idx = 0
                             xmin_idx = 1
                             xmax_idx = 2
                         print(f"Error processing row {row}: {e}")
                         continue
                     if prev_filename is not None and prev_filename != filename:
                         if words:
                             intervals.append(create_interval_data_dict(iu_xmin, iu_xmax, ' '.join(words)))
                     words.append(text)
                     iu_xmax = xmax
                 if not current_file_processed and prev_filename:
                     if words:
                         intervals.append(create_interval_data_dict(iu_xmin, iu_xmax, ' '.join(words)))
             print(f"Error processing CSV: {e}")
             return None, f"Error processing CSV: {e}"
         if processed_files:
             zip_path = os.path.join(temp_dir, "textgrids.zip")
             with zipfile.ZipFile(zip_path, 'w') as zipf:
 The first row is the header. Each subsequent row should contain:\n
 With index: `, file_name, xmin, xmax, text, is_unit_start_pred`\n
 Without index: `file_name, xmin, xmax, text, is_unit_start_pred`\n\n
+- `file_name`: Identifier for the audio file (used to group intervals).\n
+- `xmin`: Start time of the segment (in seconds).\n
+- `xmax`: End time of the segment (in seconds).\n
+- `text`: The actual spoken word or phrase (supports multiple languages).\n
 - `is_unit_start_pred`: Marks the beginning of a new unit (TRUE/FALSE).\n
 **Please enter your preferred tier name in the space below.**\n
+Example (with index, works the same without index):\n
 | | file_name | xmin   | xmax   | text  | is_unit_start_pred |
 |-|-----------|--------|--------|-------|---------------------|
+|0| example1  | 20.42  | 20.74  | Hello | TRUE                |
+|1| example1  | 20.74  | 20.81  | World | TRUE                |
+|2| example1  | 20.81  | 20.92  | !     | FALSE               |
 """
 iface = gr.Interface(
     fn=csv_to_textgrid,
     inputs=[
         gr.File(label="📁 Upload CSV File", file_types=[".csv"]),
+        gr.Textbox(label="📝 Enter Tier Name", placeholder="Enter the tier name")
     ],
     outputs=[
         gr.File(label="📦 Download TextGrid ZIP"),
 )
 if __name__ == "__main__":
+    iface.launch()