Spaces:

bigbossmonster
/

testing

Sleeping

App Files Files Community

bigbossmonster committed on Jan 21

Commit

5c201dc

verified ·

1 Parent(s): e74e277

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -14

app.py CHANGED Viewed

@@ -56,21 +56,23 @@ def parse_filename_to_ms(filename):
 def parse_srt(content: str):
     """
-    Robust Regex Parser for SRT.
-    Handles blank subtitles and inconsistent newlines by searching for patterns
-    rather than splitting by newlines.
     """
-    # Normalize line endings
-    content = content.replace('\r\n', '\n').replace('\r', '\n')
-    # Pattern explanation:
-    # (\d+)                 -> Group 1: ID
     # \n                    -> Newline
-    # (\d{2}:\d{2}:.*)      -> Group 2: Timestamp line
     # \n                    -> Newline
-    # (.*?)                 -> Group 3: Subtitle text (non-greedy)
-    # (?=\n\d+\n|\Z)        -> Lookahead: Stop when we see the next ID or end of file
-    pattern = re.compile(r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})\n(.*?)(?=\n\d+\n|\Z)', re.DOTALL)
     matches = pattern.findall(content)
@@ -78,10 +80,10 @@ def parse_srt(content: str):
     for m in matches:
         srt_id = m[0].strip()
         time_range = m[1].strip()
-        text = m[2].strip() # This will correctly be "" if the line is empty
         try:
-            # Extract start time for sorting
             start_time_str = time_range.split('-->')[0].strip()
             start_ms = parse_srt_time_to_ms(start_time_str)
@@ -89,7 +91,7 @@ def parse_srt(content: str):
                 "id": srt_id,
                 "time": time_range,
                 "startTimeMs": start_ms,
-                "text": text
             })
         except Exception as e:
             logger.warning(f"Error parsing block {srt_id}: {e}")

 def parse_srt(content: str):
     """
+    Robust Regex Parser.
+    Matches ID, Timestamp, and Text even if Text is empty.
     """
+    # 1. Normalize line endings and remove byte order marks if present
+    content = content.replace('\r\n', '\n').replace('\r', '\n').strip()
+    # 2. Regex Breakdown:
+    # (\d+)                 -> Group 1: The ID
     # \n                    -> Newline
+    # (\d{2}:\d{2}:.*)      -> Group 2: The Timestamp line
     # \n                    -> Newline
+    # (.*?)                 -> Group 3: Subtitle text (Non-greedy)
+    # (?=\n\n\d+\n|\n\d+\n|$) -> Lookahead: Stop at double newline ID, single newline ID, or end of string
+    pattern = re.compile(
+        r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})\n(.*?)(?=\n\d+\n|\n\n\d+\n|$)',
+        re.DOTALL
+    )
     matches = pattern.findall(content)
     for m in matches:
         srt_id = m[0].strip()
         time_range = m[1].strip()
+        # strip() here handles cases with just spaces/newlines in the text area
+        text = m[2].strip()
         try:
             start_time_str = time_range.split('-->')[0].strip()
             start_ms = parse_srt_time_to_ms(start_time_str)
                 "id": srt_id,
                 "time": time_range,
                 "startTimeMs": start_ms,
+                "text": text # Will be "" if empty, keeping pairing in sync
             })
         except Exception as e:
             logger.warning(f"Error parsing block {srt_id}: {e}")