Spaces:

lulavc
/

BubbleScribe

Runtime error

App Files Community

lulavc committed on Dec 10, 2025

Commit

60db046

verified ·

1 Parent(s): 0f5a8ce

Improve OCR prompt + add debug logging

Browse files

Files changed (1) hide show

app.py +27 -17

app.py CHANGED Viewed

@@ -274,25 +274,27 @@ def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str,
         ratio = 2048 / max(original_size)
         processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
-    prompt = f"""Analyze this manga/comic page. For each speech bubble or text region:
-1. Detect the bounding box coordinates [x1, y1, x2, y2] (pixel coordinates)
-2. Extract the original {source_lang} text
-3. Translate to {target_lang}
-Return ONLY a valid JSON array with this exact format:
 [
-  {{"bbox": [x1, y1, x2, y2], "original": "original text", "translated": "translated text"}},
-  ...
 ]
-Important:
-- bbox coordinates should be integers representing pixel positions
-- x1,y1 = top-left corner, x2,y2 = bottom-right corner
-- Include ALL text regions (speech bubbles, sound effects, narration boxes)
-- Keep translations natural and contextually appropriate for manga
-- Preserve the meaning and emotion of the original text
-- If no text is found, return an empty array: []
-"""
     try:
         response = client.chat.completions.create(
@@ -316,14 +318,22 @@ Important:
         result_text = ""
         msg = response.choices[0].message
         if hasattr(msg, 'content') and msg.content:
             result_text = msg.content
-        elif hasattr(msg, 'reasoning_content') and msg.reasoning_content:
-            result_text = msg.reasoning_content
         # Parse JSON from response with robust error handling
         detections = safe_parse_json(result_text)
         if detections:
             # Scale bboxes back to original size if needed
             if original_size != processed_size:

         ratio = 2048 / max(original_size)
         processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
+    prompt = f"""You are a manga OCR and translation assistant.
+Look at this manga image carefully. Find ALL {source_lang} text including:
+- Character names
+- Speech bubbles
+- Narration boxes
+- Sound effects
+- Labels and titles
+For each text you find, provide:
+1. The bounding box [x1, y1, x2, y2] in pixels (top-left to bottom-right)
+2. The original {source_lang} text
+3. The {target_lang} translation
+Output ONLY a JSON array like this:
 [
+  {{"bbox": [100, 50, 200, 80], "original": "日本語テキスト", "translated": "English text"}},
+  {{"bbox": [300, 100, 400, 130], "original": "もっとテキスト", "translated": "More text"}}
 ]
+Find as many text regions as possible. Do not skip any visible text."""
     try:
         response = client.chat.completions.create(
         result_text = ""
         msg = response.choices[0].message
+        # Try multiple response fields
         if hasattr(msg, 'content') and msg.content:
             result_text = msg.content
+        if hasattr(msg, 'reasoning_content') and msg.reasoning_content:
+            # Append reasoning content if present
+            result_text = result_text + "\n" + msg.reasoning_content if result_text else msg.reasoning_content
+        print(f"📝 GLM Response length: {len(result_text)} chars")
+        print(f"📝 GLM Response preview: {result_text[:500] if result_text else 'EMPTY'}...")
         # Parse JSON from response with robust error handling
         detections = safe_parse_json(result_text)
+        print(f"📝 Parsed detections: {len(detections)} items")
         if detections:
             # Scale bboxes back to original size if needed
             if original_size != processed_size: