bigbossmonster committed on
Commit
8b676a5
·
verified ·
1 Parent(s): 38bd308

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -12
app.py CHANGED
@@ -49,21 +49,106 @@ def parse_filename_to_ms(filename):
49
  return (h * 3600000) + (m * 60000) + (s * 1000) + ms
50
 
51
  def parse_srt(content: str):
52
- blocks = content.replace('\r\n', '\n').replace('\r', '\n').strip().split('\n\n')
 
 
 
 
 
 
53
  parsed = []
54
- for block in blocks:
55
- lines = block.split('\n')
56
- if len(lines) >= 2:
57
- time_line = lines[1]
58
- if '-->' in time_line:
59
- start_str = time_line.split('-->')[0].strip()
60
- text = " ".join(lines[2:]) if len(lines) > 2 else "[BLANK]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  parsed.append({
62
- "id": lines[0],
63
- "time": time_line,
64
- "startTimeMs": parse_srt_time_to_ms(start_str),
65
  "text": text
66
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  return parsed
68
 
69
  def compress_image(image_bytes, quality=70, max_width=800):
@@ -114,7 +199,27 @@ def process_batch_gemini(api_key, items, model_name):
114
  )
115
 
116
  text = response.text.replace("```json", "").replace("```", "").strip()
117
- return json.loads(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  except Exception as e:
119
  logger.error(f"Gemini API Error with key ...{api_key[-4:]}: {e}")
120
  return None
 
49
  return (h * 3600000) + (m * 60000) + (s * 1000) + ms
50
 
51
def _close_srt_block(item):
    """Build the output entry for one finished subtitle block.

    ``item`` is the in-progress dict used by ``parse_srt`` (keys ``id``,
    ``time`` and ``text_lines``). Text lines are joined with single spaces;
    a block with no text yields the placeholder "[BLANK SUBTITLE]".
    """
    text = " ".join(item["text_lines"]).strip() or "[BLANK SUBTITLE]"
    # Only the part before '-->' (the start timestamp) is converted.
    start_str = item["time"].split('-->')[0].strip()
    return {
        "id": item["id"],
        "time": item["time"],
        # parse_srt_time_to_ms is defined elsewhere in this file.
        "startTimeMs": parse_srt_time_to_ms(start_str),
        "text": text,
    }


def parse_srt(content: str):
    """Parse SRT subtitle text into a list of entry dicts.

    Line-oriented state machine with three states:

    * ``SEARCH_ID``   - skip lines until an all-digit line (the block id).
    * ``SEARCH_TIME`` - skip lines until one containing ``-->``; another
      all-digit line replaces the pending id (recovers from a missing
      timestamp line).
    * ``READ_TEXT``   - collect non-empty lines as subtitle text until the
      next block is detected.

    A block is closed when (a) an all-digit line is directly followed by a
    ``-->`` line (missing blank separator), (b) an empty line is followed by
    an id line and then a ``-->`` line (standard separator), or (c) the input
    ends. A timestamp line directly followed by "id, timestamp" is emitted
    immediately as a blank subtitle.

    Args:
        content: Raw SRT file text. ``\\r\\n`` and ``\\r`` line endings are
            normalised, and a leading UTF-8 byte-order mark is discarded.

    Returns:
        A list of dicts with keys ``id`` (str), ``time`` (the full timestamp
        line), ``startTimeMs`` (result of ``parse_srt_time_to_ms`` on the
        start timestamp) and ``text`` (joined text or "[BLANK SUBTITLE]").
    """
    # str.strip() does not remove U+FEFF, so without this the first id line
    # of a BOM-prefixed file ("\ufeff1") fails isdigit() and the whole first
    # subtitle is silently dropped.
    content = content.lstrip('\ufeff')
    content = content.replace('\r\n', '\n').replace('\r', '\n')
    lines = [raw.strip() for raw in content.split('\n')]
    total = len(lines)

    def peek(index):
        """Return the stripped line at ``index``, or None past the end."""
        return lines[index] if index < total else None

    def new_item(block_id=None):
        """Return a fresh in-progress block, optionally pre-seeded with an id."""
        return {"id": block_id, "time": None, "text_lines": []}

    parsed = []
    current = new_item()
    state = 'SEARCH_ID'  # States: SEARCH_ID, SEARCH_TIME, READ_TEXT

    for i, line in enumerate(lines):
        next_line = peek(i + 1)
        # True when the line two ahead looks like a timestamp line.
        time_two_ahead = '-->' in (peek(i + 2) or '')

        if state == 'SEARCH_ID':
            if line.isdigit():
                current["id"] = line
                state = 'SEARCH_TIME'

        elif state == 'SEARCH_TIME':
            if '-->' in line:
                current["time"] = line
                state = 'READ_TEXT'
                # Immediate blank subtitle: the timestamp is directly
                # followed by the next block's "id, timestamp" pair.
                if next_line and next_line.isdigit() and time_two_ahead:
                    parsed.append(_close_srt_block(current))
                    current = new_item()
                    state = 'SEARCH_ID'
            elif line.isdigit():
                # Recover from a missing timestamp: adopt the newer id.
                current["id"] = line

        else:  # READ_TEXT
            starts_new_block = (
                line.isdigit() and bool(next_line) and '-->' in next_line
            )
            is_blank_separator = (
                line == ''
                and bool(next_line)
                and next_line.isdigit()
                and time_two_ahead
            )

            if starts_new_block:
                # Blank separator is missing; the current line is already
                # the next block's id, so carry it over.
                parsed.append(_close_srt_block(current))
                current = new_item(line)
                state = 'SEARCH_TIME'
            elif is_blank_separator:
                parsed.append(_close_srt_block(current))
                current = new_item()
                state = 'SEARCH_ID'
            elif line:
                # Empty lines that are not separators are ignored.
                current["text_lines"].append(line)

    # Flush the final block, which has no separator after it.
    if current["id"] and current["time"]:
        parsed.append(_close_srt_block(current))

    return parsed
153
 
154
  def compress_image(image_bytes, quality=70, max_width=800):
 
199
  )
200
 
201
  text = response.text.replace("```json", "").replace("```", "").strip()
202
+
203
+ try:
204
+ return json.loads(text)
205
+ except json.JSONDecodeError as e:
206
+ # Handle Truncated JSON (Output Token Limit Exceeded)
207
+ # This happens if the batch size is too large for the model's output window
208
+ logger.warning(f"JSON Parse Error (likely truncated response): {e}. Attempting repair...")
209
+
210
+ # Repair Strategy: Find the last closing brace '}', discard everything after, and close the array ']'
211
+ last_object_idx = text.rfind("}")
212
+ if last_object_idx != -1:
213
+ repaired_text = text[:last_object_idx+1] + "]"
214
+ try:
215
+ repaired_data = json.loads(repaired_text)
216
+ logger.info(f"Successfully repaired JSON. Recovered {len(repaired_data)}/{len(items)} items.")
217
+ return repaired_data
218
+ except json.JSONDecodeError:
219
+ logger.error("JSON repair failed.")
220
+
221
+ return None # Fail gracefully if repair is impossible
222
+
223
  except Exception as e:
224
  logger.error(f"Gemini API Error with key ...{api_key[-4:]}: {e}")
225
  return None