Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -54,48 +54,53 @@ def parse_filename_to_ms(filename):
|
|
| 54 |
h, m, s, ms = map(int, match.groups())
|
| 55 |
return (h * 3600000) + (m * 60000) + (s * 1000) + ms
|
| 56 |
|
|
|
|
|
|
|
| 57 |
def parse_srt(content: str):
|
| 58 |
"""
|
| 59 |
-
Robust
|
| 60 |
-
|
| 61 |
"""
|
| 62 |
-
# 1. Normalize line endings
|
| 63 |
-
content = content.replace('\r\n', '\n').replace('\r', '\n')
|
| 64 |
-
|
| 65 |
-
# 2. Regex Breakdown:
|
| 66 |
-
# (\d+) -> Group 1: The ID
|
| 67 |
-
# \n -> Newline
|
| 68 |
-
# (\d{2}:\d{2}:.*) -> Group 2: The Timestamp line
|
| 69 |
-
# \n -> Newline
|
| 70 |
-
# (.*?) -> Group 3: Subtitle text (Non-greedy)
|
| 71 |
-
# (?=\n\n\d+\n|\n\d+\n|$) -> Lookahead: Stop at double newline ID, single newline ID, or end of string
|
| 72 |
-
pattern = re.compile(
|
| 73 |
-
r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})\n(.*?)(?=\n\d+\n|\n\n\d+\n|$)',
|
| 74 |
-
re.DOTALL
|
| 75 |
-
)
|
| 76 |
|
| 77 |
-
|
|
|
|
|
|
|
| 78 |
|
|
|
|
| 79 |
parsed = []
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
try:
|
| 87 |
start_time_str = time_range.split('-->')[0].strip()
|
| 88 |
start_ms = parse_srt_time_to_ms(start_time_str)
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
return parsed
|
| 100 |
|
| 101 |
def compress_image(image_bytes, max_width=800, quality=80):
|
|
|
|
| 54 |
h, m, s, ms = map(int, match.groups())
|
| 55 |
return (h * 3600000) + (m * 60000) + (s * 1000) + ms
|
| 56 |
|
| 57 |
+
# In app.py
|
| 58 |
+
|
| 59 |
def parse_srt(content: str):
    """
    Parse SRT subtitle content into a list of cue dictionaries.

    Robust "anchor" strategy: locate every header (numeric ID line followed
    by a timestamp line) first, then slice the raw text between consecutive
    headers. This guarantees one output item per header, even when a cue
    has no text at all.

    Args:
        content: Raw SRT file content (any line-ending convention).

    Returns:
        List of dicts with keys:
          - "id": str, the cue's numeric ID as written in the file
          - "time": str, the full "HH:MM:SS,mmm --> HH:MM:SS,mmm" range
          - "startTimeMs": int, start time in ms (0 if it cannot be parsed)
          - "text": str, possibly "" — empty cues are kept, not dropped
    """
    # 1. Normalize line endings so the header regex only has to handle '\n'.
    content = content.replace('\r\n', '\n').replace('\r', '\n')

    # 2. Find all headers (ID + timestamp). We deliberately do NOT try to
    #    match the subtitle text here — headers are the only reliable
    #    anchors. Both ',' and '.' millisecond separators are accepted.
    #    (The original re.MULTILINE flag was a no-op: the pattern contains
    #    no '^'/'$' anchors, so it has been dropped.)
    header_pattern = re.compile(
        r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})'
    )

    matches = list(header_pattern.finditer(content))
    parsed = []

    for i, match in enumerate(matches):
        srt_id = match.group(1)
        time_range = match.group(2)

        # Text runs from just after this header to the start of the NEXT
        # header (or to end-of-input for the final cue).
        start_index = match.end()
        if i + 1 < len(matches):
            end_index = matches[i + 1].start()
        else:
            end_index = len(content)

        text = content[start_index:end_index].strip()

        try:
            start_time_str = time_range.split('-->')[0].strip()
            start_ms = parse_srt_time_to_ms(start_time_str)
        # Was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrowed to Exception while keeping the
        # best-effort fallback of 0 for unparseable start times.
        except Exception:
            start_ms = 0

        parsed.append({
            "id": srt_id,
            "time": time_range,
            "startTimeMs": start_ms,
            # "" (empty string) when the cue has no text, but the item remains.
            "text": text,
        })

    return parsed
|
| 105 |
|
| 106 |
def compress_image(image_bytes, max_width=800, quality=80):
|