Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -56,21 +56,23 @@ def parse_filename_to_ms(filename):
|
|
| 56 |
|
| 57 |
def parse_srt(content: str):
|
| 58 |
"""
|
| 59 |
-
Robust Regex Parser
|
| 60 |
-
|
| 61 |
-
rather than splitting by newlines.
|
| 62 |
"""
|
| 63 |
-
# Normalize line endings
|
| 64 |
-
content = content.replace('\r\n', '\n').replace('\r', '\n')
|
| 65 |
|
| 66 |
-
#
|
| 67 |
-
# (\d+) -> Group 1: ID
|
| 68 |
# \n -> Newline
|
| 69 |
-
# (\d{2}:\d{2}:.*) -> Group 2: Timestamp line
|
| 70 |
# \n -> Newline
|
| 71 |
-
# (.*?) -> Group 3: Subtitle text (
|
| 72 |
-
# (?=\n\d+\n|\
|
| 73 |
-
pattern = re.compile(
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
matches = pattern.findall(content)
|
| 76 |
|
|
@@ -78,10 +80,10 @@ def parse_srt(content: str):
|
|
| 78 |
for m in matches:
|
| 79 |
srt_id = m[0].strip()
|
| 80 |
time_range = m[1].strip()
|
| 81 |
-
|
|
|
|
| 82 |
|
| 83 |
try:
|
| 84 |
-
# Extract start time for sorting
|
| 85 |
start_time_str = time_range.split('-->')[0].strip()
|
| 86 |
start_ms = parse_srt_time_to_ms(start_time_str)
|
| 87 |
|
|
@@ -89,7 +91,7 @@ def parse_srt(content: str):
|
|
| 89 |
"id": srt_id,
|
| 90 |
"time": time_range,
|
| 91 |
"startTimeMs": start_ms,
|
| 92 |
-
"text": text
|
| 93 |
})
|
| 94 |
except Exception as e:
|
| 95 |
logger.warning(f"Error parsing block {srt_id}: {e}")
|
|
|
|
| 56 |
|
| 57 |
def parse_srt(content: str):
|
| 58 |
"""
|
| 59 |
+
Robust Regex Parser.
|
| 60 |
+
Matches ID, Timestamp, and Text even if Text is empty.
|
|
|
|
| 61 |
"""
|
| 62 |
+
# 1. Normalize line endings and trim surrounding whitespace (note: str.strip() does not remove a leading U+FEFF byte order mark)
|
| 63 |
+
content = content.replace('\r\n', '\n').replace('\r', '\n').strip()
|
| 64 |
|
| 65 |
+
# 2. Regex Breakdown:
|
| 66 |
+
# (\d+) -> Group 1: The ID
|
| 67 |
# \n -> Newline
|
| 68 |
+
# (\d{2}:\d{2}:.*) -> Group 2: The Timestamp line
|
| 69 |
# \n -> Newline
|
| 70 |
+
# (.*?) -> Group 3: Subtitle text (Non-greedy)
|
| 71 |
+
# (?=\n\d+\n|\n\n\d+\n|$) -> Lookahead: stop before the next block's ID line (preceded by one or two newlines) or at end of string
|
| 72 |
+
pattern = re.compile(
|
| 73 |
+
r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})\n(.*?)(?=\n\d+\n|\n\n\d+\n|$)',
|
| 74 |
+
re.DOTALL
|
| 75 |
+
)
|
| 76 |
|
| 77 |
matches = pattern.findall(content)
|
| 78 |
|
|
|
|
| 80 |
for m in matches:
|
| 81 |
srt_id = m[0].strip()
|
| 82 |
time_range = m[1].strip()
|
| 83 |
+
# strip() here handles cases with just spaces/newlines in the text area
|
| 84 |
+
text = m[2].strip()
|
| 85 |
|
| 86 |
try:
|
|
|
|
| 87 |
start_time_str = time_range.split('-->')[0].strip()
|
| 88 |
start_ms = parse_srt_time_to_ms(start_time_str)
|
| 89 |
|
|
|
|
| 91 |
"id": srt_id,
|
| 92 |
"time": time_range,
|
| 93 |
"startTimeMs": start_ms,
|
| 94 |
+
"text": text # Will be "" if empty, keeping pairing in sync
|
| 95 |
})
|
| 96 |
except Exception as e:
|
| 97 |
logger.warning(f"Error parsing block {srt_id}: {e}")
|