Spaces:

lulavc
/

BubbleScribe

Runtime error

lulavc committed on Dec 10, 2025

Commit

13efc9b

verified ·

1 Parent(s): 9e8e6a4

Improve detection: more aggressive prompt, scan entire image, find 20-50 regions

Files changed (1) hide show

app.py CHANGED Viewed

@@ -238,27 +238,33 @@ def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str,
         ratio = 2048 / max(original_size)
         processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
-    prompt = f"""You are a manga OCR and translation assistant.
-Look at this manga image carefully. Find ALL {source_lang} text including:
-- Character names
-- Speech bubbles
-- Narration boxes
-- Sound effects
-- Labels and titles
-For each text you find, provide:
-1. The bounding box [x1, y1, x2, y2] in pixels (top-left to bottom-right)
-2. The original {source_lang} text
-3. The {target_lang} translation
-Output ONLY a JSON array like this:
 [
-  {{"bbox": [100, 50, 200, 80], "original": "日本語テキスト", "translated": "English text"}},
-  {{"bbox": [300, 100, 400, 130], "original": "もっとテキスト", "translated": "More text"}}
 ]
-Find as many text regions as possible. Do not skip any visible text."""
     try:
         response = client.chat.completions.create(
@@ -275,7 +281,7 @@ Find as many text regions as possible. Do not skip any visible text."""
                     ]
                 }
             ],
-            max_tokens=4096
         )
         progress(0.4, desc="Processing response...")

         ratio = 2048 / max(original_size)
         processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
+    prompt = f"""You are a professional manga translator. Your task is to find and translate EVERY piece of {source_lang} text in this image.
+IMPORTANT: Scan the ENTIRE image from top to bottom, left to right. Do NOT miss any text!
+Find ALL of these text types:
+- Main titles and headers
+- Character names (above/below portraits)
+- Speech bubbles and dialogue
+- Narration boxes
+- Sound effects (onomatopoeia)
+- Labels, captions, descriptions
+- Small text and annotations
+- Relationship indicators (arrows, connections)
+- ANY other visible {source_lang} text
+For EACH text region found:
+1. bbox: [x1, y1, x2, y2] pixel coordinates
+2. original: the exact {source_lang} text
+3. translated: natural {target_lang} translation
+Return a JSON array. Example:
 [
+  {{"bbox": [100, 50, 200, 80], "original": "キャラクター名", "translated": "Character Name"}},
+  {{"bbox": [300, 100, 400, 130], "original": "説明文", "translated": "Description"}}
 ]
+CRITICAL: Find at least 20-50 text regions. This image has many text elements. Scan every corner carefully. Include ALL small labels and character descriptions."""
     try:
         response = client.chat.completions.create(
                     ]
                 }
             ],
+            max_tokens=8192
         )
         progress(0.4, desc="Processing response...")