Spaces:

zyshan-ds
/

CSV2Praat_Auto_Tool

Sleeping

App Files Community

re-debug last interval issue

by ylmmhf - opened Apr 8, 2025

base: refs/heads/main

←

from: refs/pr/6

Discussion Files changed

+79

-105

Files changed (1) [hide / show]

app.py +79 -105

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import gradio as gr
 import io
 import re
 # ==== Core Functions ====
 def create_interval_data_dict(xmin, xmax, sentence):
     return {'xmin': float(xmin), 'xmax': float(xmax), 'text': sentence}
@@ -32,90 +33,76 @@ def write_textgrid_file(intervals, output_file_path, total_xmax, tier_name):
             f.write(f'            xmin = {interval["xmin"]}\n')
             f.write(f'            xmax = {interval["xmax"]}\n')
             f.write(f'            text = "{interval["text"]}"\n')
 def validate_csv_format(header):
-    valid_headers = [
-        ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred'],
-        ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']
-    ]
-    return header in valid_headers
-def validate_row(row, header):
-    if len(row) < 5:
         return False, "Row does not have enough columns."
-    if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
-        try:
-            filename = row[1].strip()
-            xmin = float(row[2])
-            xmax = float(row[3])
-            text = row[4].strip()
-            is_unit_start_pred = row[5].strip().lower() in ["true", "false"]
-            if xmin >= xmax:
-                return False, "xmin must be less than xmax."
-            if not re.match(r'^[\p{L}\p{N}\p{P}\p{Zs}]*$', text, re.UNICODE):
-                return False, "Text contains invalid characters."
-            return True, ""
-        except ValueError:
-            return False, "Data format error (possibly number conversion failed)."
-    elif header == ['file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
-        try:
-            filename = row[0].strip()
-            xmin = float(row[1])
-            xmax = float(row[2])
-            text = row[3].strip()
-            is_unit_start_pred = row[4].strip().lower() in ["true", "false"]
-            if xmin >= xmax:
-                return False, "xmin must be less than xmax."
-            if not re.match(r'^[\p{L}\p{N}\p{P}\p{Zs}]*$', text, re.UNICODE):
-                return False, "Text contains invalid characters."
-            return True, ""
-        except ValueError:
-            return False, "Data format error (possibly number conversion failed)."
-    return False, "Invalid header format."
 # ==== Gradio Interface Function ====
-def csv_to_textgrid(file, tier_name=""):
     try:
         temp_dir = tempfile.mkdtemp()
         csv_path = os.path.join(temp_dir, "input.csv")
-        if isinstance(file, str):
-            with open(file, 'r', encoding='utf-8') as f:
-                file_content = f.read()
-        else:
-            try:
-                if hasattr(file, 'read'):
-                    file_content = file.read().decode('utf-8', errors='replace')
-                else:
                     file_content = str(file)
-            except Exception as e:
-                print(f"Error reading file: {e}")
-                return None, f"Error reading file: {e}"
-        with open(csv_path, 'w', encoding='utf-8') as f:
-            f.write(file_content)
         print(f"CSV file written to {csv_path}")
         output_directory = os.path.join(temp_dir, "textgrids")
         os.makedirs(output_directory, exist_ok=True)
         processed_files = []
         try:
             with open(csv_path, 'r', encoding='utf-8') as csvfile:
                 reader = csv.reader(csvfile)
-                header = next(reader)
                 if not validate_csv_format(header):
-                    return None, "Invalid CSV format. Expected headers: file_name, xmin, xmax, text, is_unit_start_pred"
                 print(f"Header: {header}")
@@ -127,38 +114,25 @@ def csv_to_textgrid(file, tier_name=""):
                 current_file_processed = False
                 for row in reader:
-                    valid, message = validate_row(row, header)
-                    if not valid:
-                        print(f"Skipping invalid row: {row} - {message}")
                         continue
                     try:
-                        if header == ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']:
-                            filename_idx = 1
-                            xmin_idx = 2
-                            xmax_idx = 3
-                            text_idx = 4
-                            is_unit_start_idx = 5
-                        else:
-                            filename_idx = 0
-                            xmin_idx = 1
-                            xmax_idx = 2
-                            text_idx = 3
-                            is_unit_start_idx = 4
-                        filename = row[filename_idx].strip() if len(row) > filename_idx and row[filename_idx].strip() else None
                         if not filename:
                             print(f"Skipping row with no filename: {row}")
                             continue
-                        xmin = float(row[xmin_idx]) if row[xmin_idx].strip() else 0
-                        xmax = float(row[xmax_idx]) if row[xmax_idx].strip() else 0
-                        text = row[text_idx].strip() if len(row) > text_idx else ""
-                        is_unit_start_pred_str = row[is_unit_start_idx].strip().lower() if len(row) > is_unit_start_idx else "false"
                         is_unit_start_pred = is_unit_start_pred_str == "true"
                         print(f"Processing: {filename}, {xmin}, {xmax}, {text}, {is_unit_start_pred}")
                     except (ValueError, IndexError) as e:
                         print(f"Error processing row {row}: {e}")
                         continue
@@ -224,6 +198,7 @@ def csv_to_textgrid(file, tier_name=""):
             print(f"Error processing CSV: {e}")
             return None, f"Error processing CSV: {e}"
         if processed_files:
             zip_path = os.path.join(temp_dir, "textgrids.zip")
             with zipfile.ZipFile(zip_path, 'w') as zipf:
@@ -243,30 +218,29 @@ def csv_to_textgrid(file, tier_name=""):
         return None, f"Error: {str(e)}"
 # ==== Gradio Interface Setup ====
-csv_format_instruction = """
-**Expected CSV format:**\n
 The first row is the header. Each subsequent row should contain:\n
-With index: `, file_name, xmin, xmax, text, is_unit_start_pred`\n
-Without index: `file_name, xmin, xmax, text, is_unit_start_pred`\n\n
-- `file_name`: Identifier for the audio file (used to group intervals).\n
-- `xmin`: Start time of the segment (in seconds).\n
-- `xmax`: End time of the segment (in seconds).\n
-- `text`: The actual spoken word or phrase (supports multiple languages).\n
-- `is_unit_start_pred`: Marks the beginning of a new unit (TRUE/FALSE).\n
-**Please enter your preferred tier name in the space below.**\n
-Example (with index, works the same without index):\n
-| | file_name | xmin   | xmax   | text  | is_unit_start_pred |
-|-|-----------|--------|--------|-------|---------------------|
-|0| example1  | 20.42  | 20.74  | Hello | TRUE                |
-|1| example1  | 20.74  | 20.81  | World | TRUE                |
-|2| example1  | 20.81  | 20.92  | !     | FALSE               |
 """
 iface = gr.Interface(
     fn=csv_to_textgrid,
     inputs=[
         gr.File(label="📁 Upload CSV File", file_types=[".csv"]),
-        gr.Textbox(label="📝 Enter Tier Name", placeholder="Enter the tier name")
     ],
     outputs=[
         gr.File(label="📦 Download TextGrid ZIP"),

 import io
 import re
 # ==== Core Functions ====
 def create_interval_data_dict(xmin, xmax, sentence):
     return {'xmin': float(xmin), 'xmax': float(xmax), 'text': sentence}
             f.write(f'            xmin = {interval["xmin"]}\n')
             f.write(f'            xmax = {interval["xmax"]}\n')
             f.write(f'            text = "{interval["text"]}"\n')
 def validate_csv_format(header):
+    expected_headers = ['', 'file_name', 'xmin', 'xmax', 'text', 'is_unit_start_pred']
+    return header == expected_headers
+def validate_row(row):
+    if len(row) < 6:
         return False, "Row does not have enough columns."
+    try:
+        # Validate data types
+        filename = row[1].strip()
+        xmin = float(row[2])
+        xmax = float(row[3])
+        text = row[4].strip()
+        is_unit_start_pred = row[5].strip().lower() in ["true", "false"]
+        # Check time consistency
+        if xmin >= xmax:
+            return False, "xmin must be less than xmax."
+        # Check text content
+        if not re.match("^[a-zA-Z0-9 ,.!?]*$", text):  # Allow letters, numbers, spaces, and some punctuation
+            return False, "Text contains invalid characters."
+        return True, ""
+    except ValueError as e:
+        return False, f"Value error: {e}"
 # ==== Gradio Interface Function ====
+def csv_to_textgrid(file, tier_name="generated_tier"):
     try:
+        # Create temporary directory
         temp_dir = tempfile.mkdtemp()
         csv_path = os.path.join(temp_dir, "input.csv")
+        # Handle different file object types
+        if hasattr(file, 'name'):
+            if isinstance(file, str):
+                with open(file, 'r') as f:
+                    file_content = f.read()
+            else:
+                try:
+                    if hasattr(file, 'read'):
+                        file_content = file.read()
+                    else:
+                        file_content = file.decode('utf-8') if isinstance(file, bytes) else str(file)
+                except Exception as e:
+                    print(f"Error reading file: {e}")
                     file_content = str(file)
+            with open(csv_path, 'w', encoding='utf-8') as f:
+                f.write(file_content)
+        else:
+            with open(csv_path, 'w', encoding='utf-8') as f:
+                f.write(str(file))
         print(f"CSV file written to {csv_path}")
         output_directory = os.path.join(temp_dir, "textgrids")
         os.makedirs(output_directory, exist_ok=True)
+        # Process the CSV file
         processed_files = []
         try:
             with open(csv_path, 'r', encoding='utf-8') as csvfile:
                 reader = csv.reader(csvfile)
+                header = next(reader)  # Skip header
                 if not validate_csv_format(header):
+                    return None, "Invalid CSV format. Expected headers: , file_name, xmin, xmax, text, is_unit_start_pred"
                 print(f"Header: {header}")
                 current_file_processed = False
                 for row in reader:
+                    valid, message = validate_row(row)
+                    if len(row) < 6:
+                        print(f"Skipping invalid row: {row}")
                         continue
                     try:
+                        filename = row[1].strip() if len(row) > 1 and row[1].strip() else None
                         if not filename:
                             print(f"Skipping row with no filename: {row}")
                             continue
+                        xmin = float(row[2]) if row[2].strip() else 0
+                        xmax = float(row[3]) if row[3].strip() else 0
+                        text = row[4].strip() if len(row) > 4 else ""
+                        is_unit_start_pred_str = row[5].strip().lower() if len(row) > 5 else "false"
                         is_unit_start_pred = is_unit_start_pred_str == "true"
                         print(f"Processing: {filename}, {xmin}, {xmax}, {text}, {is_unit_start_pred}")
                     except (ValueError, IndexError) as e:
                         print(f"Error processing row {row}: {e}")
                         continue
             print(f"Error processing CSV: {e}")
             return None, f"Error processing CSV: {e}"
+        # Create zip file
         if processed_files:
             zip_path = os.path.join(temp_dir, "textgrids.zip")
             with zipfile.ZipFile(zip_path, 'w') as zipf:
         return None, f"Error: {str(e)}"
 # ==== Gradio Interface Setup ====
+csv_format_instruction = """**Expected CSV format:**
 The first row is the header. Each subsequent row should contain:\n
+`file_name, xmin, xmax, text, is_unit_start_pred`
+Each row represents a word or segment in an audio file.
+- `file_name`: Identifier for the audio file (used to group intervals).
+- `xmin`: Start time of the segment (in seconds).
+- `xmax`: End time of the segment (in seconds).
+- `text`: The actual spoken word or phrase.
+- `is_unit_start_pred`: Marks the beginning of a new unit (TRUE/FALSE).
+**Please enter the tier name according to your preference or as deemed appropriate for the data.**
+Example:\n
+| file_name | xmin   | xmax   | text  | is_unit_start_pred |
+|-----------|--------|--------|-------|---------------------|
+| example1  | 20.42  | 20.74  | mhmm  | TRUE                |
+| example1  | 20.74  | 20.81  | hello | TRUE                |
+| example1  | 20.81  | 20.92  | world | FALSE               |
 """
 iface = gr.Interface(
     fn=csv_to_textgrid,
     inputs=[
         gr.File(label="📁 Upload CSV File", file_types=[".csv"]),
+        gr.Textbox(label="📝 Enter Tier Name", placeholder="Enter the name of the tier")  # New input for tier name
     ],
     outputs=[
         gr.File(label="📦 Download TextGrid ZIP"),