Spaces:

bigbossmonster
/

testing

Sleeping

App Files Files Community

bigbossmonster committed 23 days ago

Commit

8b676a5

verified ·

1 Parent(s): 38bd308

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -12

app.py CHANGED Viewed

@@ -49,21 +49,106 @@ def parse_filename_to_ms(filename):
     return (h * 3600000) + (m * 60000) + (s * 1000) + ms
 def parse_srt(content: str):
-    blocks = content.replace('\r\n', '\n').replace('\r', '\n').strip().split('\n\n')
     parsed = []
-    for block in blocks:
-        lines = block.split('\n')
-        if len(lines) >= 2:
-            time_line = lines[1]
-            if '-->' in time_line:
-                start_str = time_line.split('-->')[0].strip()
-                text = " ".join(lines[2:]) if len(lines) > 2 else "[BLANK]"
                 parsed.append({
-                    "id": lines[0],
-                    "time": time_line,
-                    "startTimeMs": parse_srt_time_to_ms(start_str),
                     "text": text
                 })
     return parsed
 def compress_image(image_bytes, quality=70, max_width=800):
@@ -114,7 +199,27 @@ def process_batch_gemini(api_key, items, model_name):
         )
         text = response.text.replace("```json", "").replace("```", "").strip()
-        return json.loads(text)
     except Exception as e:
         logger.error(f"Gemini API Error with key ...{api_key[-4:]}: {e}")
         return None

     return (h * 3600000) + (m * 60000) + (s * 1000) + ms
 def parse_srt(content: str):
     """Parse SRT subtitle text into a list of cue dictionaries.

     The input is walked line by line with a three-state machine
     (``SEARCH_ID`` -> ``SEARCH_TIME`` -> ``READ_TEXT``) instead of being
     split on blank lines, so cues that have no text at all, or that are
     not separated from the next cue by a blank line, are still recognised.
     The original author's note says this is meant to mirror the frontend
     parser (not visible from this file).

     Args:
         content: Raw SRT text. ``\\r\\n`` and ``\\r`` line endings are
             normalised to ``\\n``; a leading byte-order mark is dropped.

     Returns:
         A list of dicts, one per cue, in file order, with the keys
         ``"id"`` (the numeric ID line as a string), ``"time"`` (the full
         ``start --> end`` line), ``"startTimeMs"`` (the start timestamp
         converted by ``parse_srt_time_to_ms``) and ``"text"`` (the text
         lines joined with single spaces, or ``"[BLANK SUBTITLE]"`` when
         the cue has no text).
     """
     content = content.replace('\r\n', '\n').replace('\r', '\n')
     # U+FEFF is not whitespace, so strip() would leave it glued to the first
     # ID line, isdigit() would fail and the first cue would be lost.
     content = content.lstrip('\ufeff')
     lines = [raw.strip() for raw in content.split('\n')]
     total = len(lines)

     parsed = []
     current_item = {"id": None, "time": None, "text_lines": []}
     state = 'SEARCH_ID'  # States: SEARCH_ID, SEARCH_TIME, READ_TEXT

     for i, line in enumerate(lines):
         next_line = lines[i + 1] if i + 1 < total else None
         line_after_next = lines[i + 2] if i + 2 < total else None
         # True when the two lines after this one look like "<id>" followed
         # by "<start> --> <end>", i.e. a new cue begins right after `line`.
         next_starts_block = bool(
             next_line and next_line.isdigit()
             and line_after_next and '-->' in line_after_next
         )

         if state == 'SEARCH_ID':
             if line.isdigit():
                 current_item["id"] = line
                 state = 'SEARCH_TIME'

         elif state == 'SEARCH_TIME':
             if '-->' in line:
                 current_item["time"] = line
                 state = 'READ_TEXT'
                 # Immediate blank subtitle: the timestamp is directly
                 # followed by the next cue's ID and timestamp.
                 if next_starts_block:
                     parsed.append(_build_srt_entry(current_item))
                     current_item = {"id": None, "time": None, "text_lines": []}
                     state = 'SEARCH_ID'
             elif line.isdigit():
                 # Recover from a missing timestamp: keep the newest ID.
                 current_item["id"] = line

         elif state == 'READ_TEXT':
             # A digit-only line counts as a new ID only when a timestamp
             # follows it; otherwise it is ordinary subtitle text.
             is_new_block_start = bool(
                 line.isdigit() and next_line and '-->' in next_line
             )
             if is_new_block_start:
                 # Missing blank separator: close the cue and reuse this
                 # line as the next cue's ID.
                 parsed.append(_build_srt_entry(current_item))
                 current_item = {"id": line, "time": None, "text_lines": []}
                 state = 'SEARCH_TIME'
             elif line == '' and next_starts_block:
                 # Standard closure on the blank separator line.
                 parsed.append(_build_srt_entry(current_item))
                 current_item = {"id": None, "time": None, "text_lines": []}
                 state = 'SEARCH_ID'
             elif line:
                 current_item["text_lines"].append(line)

     # Push the last cue, which has no following block to trigger closure.
     if current_item["id"] and current_item["time"]:
         parsed.append(_build_srt_entry(current_item))
     return parsed


 def _build_srt_entry(item):
     """Turn an accumulated cue (id, time, text_lines) into an output dict."""
     text = " ".join(item["text_lines"]).strip() or "[BLANK SUBTITLE]"
     start_ms = parse_srt_time_to_ms(item["time"].split('-->')[0].strip())
     return {
         "id": item["id"],
         "time": item["time"],
         "startTimeMs": start_ms,
         "text": text
     }
 def compress_image(image_bytes, quality=70, max_width=800):
         )
         text = response.text.replace("```json", "").replace("```", "").strip()
+        try:
+            return json.loads(text)
+        except json.JSONDecodeError as e:
+            # Handle Truncated JSON (Output Token Limit Exceeded)
+            # This happens if the batch size is too large for the model's output window
+            logger.warning(f"JSON Parse Error (likely truncated response): {e}. Attempting repair...")
+            # Repair Strategy: Find the last closing brace '}', discard everything after, and close the array ']'
+            last_object_idx = text.rfind("}")
+            if last_object_idx != -1:
+                repaired_text = text[:last_object_idx+1] + "]"
+                try:
+                    repaired_data = json.loads(repaired_text)
+                    logger.info(f"Successfully repaired JSON. Recovered {len(repaired_data)}/{len(items)} items.")
+                    return repaired_data
+                except json.JSONDecodeError:
+                    logger.error("JSON repair failed.")
+            return None # Fail gracefully if repair is impossible
     except Exception as e:
         logger.error(f"Gemini API Error with key ...{api_key[-4:]}: {e}")
         return None