bigbossmonster committed on
Commit
5c201dc
·
verified ·
1 Parent(s): e74e277

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -14
app.py CHANGED
@@ -56,21 +56,23 @@ def parse_filename_to_ms(filename):
56
 
57
  def parse_srt(content: str):
58
  """
59
- Robust Regex Parser for SRT.
60
- Handles blank subtitles and inconsistent newlines by searching for patterns
61
- rather than splitting by newlines.
62
  """
63
- # Normalize line endings
64
- content = content.replace('\r\n', '\n').replace('\r', '\n')
65
 
66
- # Pattern explanation:
67
- # (\d+) -> Group 1: ID
68
  # \n -> Newline
69
- # (\d{2}:\d{2}:.*) -> Group 2: Timestamp line
70
  # \n -> Newline
71
- # (.*?) -> Group 3: Subtitle text (non-greedy)
72
- # (?=\n\d+\n|\Z) -> Lookahead: Stop when we see the next ID or end of file
73
- pattern = re.compile(r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})\n(.*?)(?=\n\d+\n|\Z)', re.DOTALL)
 
 
 
74
 
75
  matches = pattern.findall(content)
76
 
@@ -78,10 +80,10 @@ def parse_srt(content: str):
78
  for m in matches:
79
  srt_id = m[0].strip()
80
  time_range = m[1].strip()
81
- text = m[2].strip() # This will correctly be "" if the line is empty
 
82
 
83
  try:
84
- # Extract start time for sorting
85
  start_time_str = time_range.split('-->')[0].strip()
86
  start_ms = parse_srt_time_to_ms(start_time_str)
87
 
@@ -89,7 +91,7 @@ def parse_srt(content: str):
89
  "id": srt_id,
90
  "time": time_range,
91
  "startTimeMs": start_ms,
92
- "text": text
93
  })
94
  except Exception as e:
95
  logger.warning(f"Error parsing block {srt_id}: {e}")
 
56
 
57
  def parse_srt(content: str):
58
  """
59
+ Robust Regex Parser.
60
+ Matches ID, Timestamp, and Text even if Text is empty.
 
61
  """
62
+ # 1. Normalize line endings and trim leading/trailing whitespace (note: strip() does not remove a BOM)
63
+ content = content.replace('\r\n', '\n').replace('\r', '\n').strip()
64
 
65
+ # 2. Regex Breakdown:
66
+ # (\d+) -> Group 1: The ID
67
  # \n -> Newline
68
+ # (\d{2}:\d{2}:.*) -> Group 2: The Timestamp line
69
  # \n -> Newline
70
+ # (.*?) -> Group 3: Subtitle text (Non-greedy)
71
+ # (?=\n\d+\n|\n\n\d+\n|$) -> Lookahead: Stop at single newline ID, double newline ID, or end of string
72
+ pattern = re.compile(
73
+ r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})\n(.*?)(?=\n\d+\n|\n\n\d+\n|$)',
74
+ re.DOTALL
75
+ )
76
 
77
  matches = pattern.findall(content)
78
 
 
80
  for m in matches:
81
  srt_id = m[0].strip()
82
  time_range = m[1].strip()
83
+ # strip() here handles cases with just spaces/newlines in the text area
84
+ text = m[2].strip()
85
 
86
  try:
 
87
  start_time_str = time_range.split('-->')[0].strip()
88
  start_ms = parse_srt_time_to_ms(start_time_str)
89
 
 
91
  "id": srt_id,
92
  "time": time_range,
93
  "startTimeMs": start_ms,
94
+ "text": text # Will be "" if empty, keeping pairing in sync
95
  })
96
  except Exception as e:
97
  logger.warning(f"Error parsing block {srt_id}: {e}")