Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -54,48 +54,53 @@ def parse_filename_to_ms(filename):
|
|
| 54 |
h, m, s, ms = map(int, match.groups())
|
| 55 |
return (h * 3600000) + (m * 60000) + (s * 1000) + ms
|
| 56 |
|
|
|
|
|
|
|
| 57 |
def parse_srt(content: str):
|
| 58 |
"""
|
| 59 |
-
Robust
|
| 60 |
-
|
| 61 |
"""
|
| 62 |
-
# 1. Normalize line endings
|
| 63 |
-
content = content.replace('\r\n', '\n').replace('\r', '\n')
|
| 64 |
-
|
| 65 |
-
# 2. Regex Breakdown:
|
| 66 |
-
# (\d+) -> Group 1: The ID
|
| 67 |
-
# \n -> Newline
|
| 68 |
-
# (\d{2}:\d{2}:.*) -> Group 2: The Timestamp line
|
| 69 |
-
# \n -> Newline
|
| 70 |
-
# (.*?) -> Group 3: Subtitle text (Non-greedy)
|
| 71 |
-
# (?=\n\n\d+\n|\n\d+\n|$) -> Lookahead: Stop at double newline ID, single newline ID, or end of string
|
| 72 |
-
pattern = re.compile(
|
| 73 |
-
r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})\n(.*?)(?=\n\d+\n|\n\n\d+\n|$)',
|
| 74 |
-
re.DOTALL
|
| 75 |
-
)
|
| 76 |
|
| 77 |
-
|
|
|
|
|
|
|
| 78 |
|
|
|
|
| 79 |
parsed = []
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
try:
|
| 87 |
start_time_str = time_range.split('-->')[0].strip()
|
| 88 |
start_ms = parse_srt_time_to_ms(start_time_str)
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
return parsed
|
| 100 |
|
| 101 |
def compress_image(image_bytes, max_width=800, quality=80):
|
|
|
|
| 54 |
h, m, s, ms = map(int, match.groups())
|
| 55 |
return (h * 3600000) + (m * 60000) + (s * 1000) + ms
|
| 56 |
|
| 57 |
+
# In app.py
|
| 58 |
+
|
| 59 |
def parse_srt(content: str):
    """
    Parse SRT subtitle content into a list of cue dictionaries.

    Robust "anchor" strategy: locate every header (numeric ID line followed
    by a timestamp line) first, then slice the raw text between consecutive
    headers. This guarantees one output item per header, even when a cue
    has no text at all.

    Args:
        content: Raw SRT file content (any line-ending convention).

    Returns:
        List of dicts with keys:
          - "id": str, the cue's numeric ID as written in the file
          - "time": str, the full "HH:MM:SS,mmm --> HH:MM:SS,mmm" range
          - "startTimeMs": int, start time in ms (0 if it cannot be parsed)
          - "text": str, possibly "" — empty cues are kept, not dropped
    """
    # 1. Normalize line endings so the header regex only has to handle '\n'.
    content = content.replace('\r\n', '\n').replace('\r', '\n')

    # 2. Find all headers (ID + timestamp). We deliberately do NOT try to
    #    match the subtitle text here — headers are the only reliable
    #    anchors. Both ',' and '.' millisecond separators are accepted.
    #    (The original re.MULTILINE flag was a no-op: the pattern contains
    #    no '^'/'$' anchors, so it has been dropped.)
    header_pattern = re.compile(
        r'(\d+)\n(\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3})'
    )

    matches = list(header_pattern.finditer(content))
    parsed = []

    for i, match in enumerate(matches):
        srt_id = match.group(1)
        time_range = match.group(2)

        # Text runs from just after this header to the start of the NEXT
        # header (or to end-of-input for the final cue).
        start_index = match.end()
        if i + 1 < len(matches):
            end_index = matches[i + 1].start()
        else:
            end_index = len(content)

        text = content[start_index:end_index].strip()

        try:
            start_time_str = time_range.split('-->')[0].strip()
            start_ms = parse_srt_time_to_ms(start_time_str)
        # Was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrowed to Exception while keeping the
        # best-effort fallback of 0 for unparseable start times.
        except Exception:
            start_ms = 0

        parsed.append({
            "id": srt_id,
            "time": time_range,
            "startTimeMs": start_ms,
            # "" (empty string) when the cue has no text, but the item remains.
            "text": text,
        })

    return parsed
|
| 105 |
|
| 106 |
def compress_image(image_bytes, max_width=800, quality=80):
|