bigbossmonster committed on
Commit
9c5c9a9
·
verified ·
1 Parent(s): 5c201dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -32
app.py CHANGED
@@ -54,48 +54,53 @@ def parse_filename_to_ms(filename):
54
  h, m, s, ms = map(int, match.groups())
55
  return (h * 3600000) + (m * 60000) + (s * 1000) + ms
56
 
 
 
57
# One SRT timestamp, e.g. "00:01:02,345" (a "." separator is tolerated too).
_SRT_TS_PATTERN = r'\d{2}:\d{2}:\d{2}[,.]\d{3}'

# A whole cue: ID line, timing line, then (optionally) the text.
# The text is captured lazily and ends only where a *real* next header
# begins (ID line followed by a timing line) or at end of input. Requiring
# the timing line in the lookahead keeps digits-only subtitle lines such as
# "42" from being mistaken for the next cue's ID.
_SRT_BLOCK_RE = re.compile(
    r'(\d+)\n'
    r'(' + _SRT_TS_PATTERN + r'\s*-->\s*' + _SRT_TS_PATTERN + r')'
    r'(?:\n(.*?))?'
    r'(?=\n\s*\d+\n' + _SRT_TS_PATTERN + r'\s*-->|\Z)',
    re.DOTALL,
)


def parse_srt(content: str):
    """
    Parse SRT subtitle text into a list of cue dictionaries.

    Each cue is matched as ID, timing line and text; the text may be empty,
    including for the very last cue in the file.

    Args:
        content: Raw SRT file contents. Any mix of "\\r\\n", "\\r" and "\\n"
            line endings is accepted, as is a leading byte order mark.

    Returns:
        A list of dicts with keys "id" (str), "time" (the timing line),
        "startTimeMs" (value returned by parse_srt_time_to_ms for the start
        time) and "text" (stripped cue text, "" when the cue has none).
        Cues whose start time cannot be converted are logged and skipped.
    """
    # Normalize line endings, then drop surrounding whitespace and a leading
    # BOM (str.strip() does not treat U+FEFF as whitespace).
    content = content.replace('\r\n', '\n').replace('\r', '\n')
    content = content.strip().lstrip('\ufeff')

    parsed = []
    for raw_id, raw_time, raw_text in _SRT_BLOCK_RE.findall(content):
        srt_id = raw_id.strip()
        time_range = raw_time.strip()
        # raw_text is "" when the cue has no text; strip() also removes
        # whitespace-only bodies so they come out as "".
        text = raw_text.strip()

        try:
            start_time_str = time_range.split('-->')[0].strip()
            start_ms = parse_srt_time_to_ms(start_time_str)

            parsed.append({
                "id": srt_id,
                "time": time_range,
                "startTimeMs": start_ms,
                "text": text,  # Will be "" if empty, keeping pairing in sync
            })
        except Exception as e:
            logger.warning(f"Error parsing block {srt_id}: {e}")

    return parsed
100
 
101
  def compress_image(image_bytes, max_width=800, quality=80):
 
54
  h, m, s, ms = map(int, match.groups())
55
  return (h * 3600000) + (m * 60000) + (s * 1000) + ms
56
 
57
+ # In app.py
58
+
59
# One SRT timestamp, e.g. "00:01:02,345" (a "." separator is tolerated too).
_SRT_TIMESTAMP = r'\d{2}:\d{2}:\d{2}[,.]\d{3}'

# A cue header: an ID on its own line followed by the timing line.
# Anchored to line starts (re.MULTILINE + "^") so digits in the middle of a
# text line can never be taken for an ID; horizontal whitespace around the
# ID and before the timing line is tolerated.
_SRT_HEADER_RE = re.compile(
    r'^[ \t]*(\d+)[ \t]*\n'
    r'[ \t]*(' + _SRT_TIMESTAMP + r'\s*-->\s*' + _SRT_TIMESTAMP + r')',
    re.MULTILINE,
)


def parse_srt(content: str):
    """
    Parse SRT subtitle text by locating cue headers and slicing between them.

    Headers (ID line + timing line) are found first; each cue's text is then
    whatever lies between the end of its header and the start of the next
    header (or end of input). Every header therefore yields exactly one
    item, even when its text is empty.

    Args:
        content: Raw SRT file contents. Any mix of "\\r\\n", "\\r" and "\\n"
            line endings is accepted, as is a leading byte order mark.

    Returns:
        A list of dicts with keys "id" (str), "time" (the timing line as
        matched), "startTimeMs" (value returned by parse_srt_time_to_ms for
        the start time, or 0 if that conversion raised) and "text" (stripped
        cue text, "" when the cue has none).
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # 1. Normalize line endings and drop a leading BOM so the first ID sits
    #    at the start of its line.
    content = content.replace('\r\n', '\n').replace('\r', '\n')
    content = content.lstrip('\ufeff')

    # 2. Find all headers. Text is deliberately NOT matched here; the
    #    headers only serve as anchors for slicing.
    headers = list(_SRT_HEADER_RE.finditer(content))

    parsed = []
    for i, header in enumerate(headers):
        srt_id = header.group(1)
        time_range = header.group(2)

        # Text runs from the end of this header to the start of the next
        # header, or to end of input for the last cue.
        if i + 1 < len(headers):
            end_index = headers[i + 1].start()
        else:
            end_index = len(content)
        text = content[header.end():end_index].strip()

        try:
            start_time_str = time_range.split('-->')[0].strip()
            start_ms = parse_srt_time_to_ms(start_time_str)
        except Exception as exc:
            # Best effort: keep the cue so the item count still matches the
            # header count, but leave a trace instead of failing silently.
            logging.getLogger(__name__).warning(
                "Could not parse start time of block %s: %s", srt_id, exc
            )
            start_ms = 0

        parsed.append({
            "id": srt_id,
            "time": time_range,
            "startTimeMs": start_ms,
            "text": text,  # "" when the cue has no text; the item remains
        })

    return parsed
105
 
106
  def compress_image(image_bytes, max_width=800, quality=80):