lulavc committed on
Commit
60db046
·
verified ·
1 Parent(s): 0f5a8ce

Improve OCR prompt + add debug logging

Browse files
Files changed (1) hide show
  1. app.py +27 -17
app.py CHANGED
@@ -274,25 +274,27 @@ def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str,
274
  ratio = 2048 / max(original_size)
275
  processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
276
 
277
- prompt = f"""Analyze this manga/comic page. For each speech bubble or text region:
278
- 1. Detect the bounding box coordinates [x1, y1, x2, y2] (pixel coordinates)
279
- 2. Extract the original {source_lang} text
280
- 3. Translate to {target_lang}
281
 
282
- Return ONLY a valid JSON array with this exact format:
 
 
 
 
 
 
 
 
 
 
 
 
283
  [
284
- {{"bbox": [x1, y1, x2, y2], "original": "original text", "translated": "translated text"}},
285
- ...
286
  ]
287
 
288
- Important:
289
- - bbox coordinates should be integers representing pixel positions
290
- - x1,y1 = top-left corner, x2,y2 = bottom-right corner
291
- - Include ALL text regions (speech bubbles, sound effects, narration boxes)
292
- - Keep translations natural and contextually appropriate for manga
293
- - Preserve the meaning and emotion of the original text
294
- - If no text is found, return an empty array: []
295
- """
296
 
297
  try:
298
  response = client.chat.completions.create(
@@ -316,14 +318,22 @@ Important:
316
 
317
  result_text = ""
318
  msg = response.choices[0].message
 
 
319
  if hasattr(msg, 'content') and msg.content:
320
  result_text = msg.content
321
- elif hasattr(msg, 'reasoning_content') and msg.reasoning_content:
322
- result_text = msg.reasoning_content
 
 
 
 
323
 
324
  # Parse JSON from response with robust error handling
325
  detections = safe_parse_json(result_text)
326
 
 
 
327
  if detections:
328
  # Scale bboxes back to original size if needed
329
  if original_size != processed_size:
 
274
  ratio = 2048 / max(original_size)
275
  processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
276
 
277
+ prompt = f"""You are a manga OCR and translation assistant.
 
 
 
278
 
279
+ Look at this manga image carefully. Find ALL {source_lang} text including:
280
+ - Character names
281
+ - Speech bubbles
282
+ - Narration boxes
283
+ - Sound effects
284
+ - Labels and titles
285
+
286
+ For each text you find, provide:
287
+ 1. The bounding box [x1, y1, x2, y2] in pixels (top-left to bottom-right)
288
+ 2. The original {source_lang} text
289
+ 3. The {target_lang} translation
290
+
291
+ Output ONLY a JSON array like this:
292
  [
293
+ {{"bbox": [100, 50, 200, 80], "original": "日本語テキスト", "translated": "English text"}},
294
+ {{"bbox": [300, 100, 400, 130], "original": "もっとテキスト", "translated": "More text"}}
295
  ]
296
 
297
+ Find as many text regions as possible. Do not skip any visible text."""
 
 
 
 
 
 
 
298
 
299
  try:
300
  response = client.chat.completions.create(
 
318
 
319
  result_text = ""
320
  msg = response.choices[0].message
321
+
322
+ # Try multiple response fields
323
  if hasattr(msg, 'content') and msg.content:
324
  result_text = msg.content
325
+ if hasattr(msg, 'reasoning_content') and msg.reasoning_content:
326
+ # Append reasoning content if present
327
+ result_text = result_text + "\n" + msg.reasoning_content if result_text else msg.reasoning_content
328
+
329
+ print(f"📝 GLM Response length: {len(result_text)} chars")
330
+ print(f"📝 GLM Response preview: {result_text[:500] if result_text else 'EMPTY'}...")
331
 
332
  # Parse JSON from response with robust error handling
333
  detections = safe_parse_json(result_text)
334
 
335
+ print(f"📝 Parsed detections: {len(detections)} items")
336
+
337
  if detections:
338
  # Scale bboxes back to original size if needed
339
  if original_size != processed_size: