lulavc committed on
Commit
13efc9b
·
verified ·
1 Parent(s): 9e8e6a4

Improve detection: more aggressive prompt, scan entire image, find 20-50 regions

Browse files
Files changed (1) hide show
  1. app.py +23 -17
app.py CHANGED
@@ -238,27 +238,33 @@ def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str,
238
  ratio = 2048 / max(original_size)
239
  processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
240
 
241
- prompt = f"""You are a manga OCR and translation assistant.
242
 
243
- Look at this manga image carefully. Find ALL {source_lang} text including:
244
- - Character names
245
- - Speech bubbles
246
- - Narration boxes
247
- - Sound effects
248
- - Labels and titles
249
-
250
- For each text you find, provide:
251
- 1. The bounding box [x1, y1, x2, y2] in pixels (top-left to bottom-right)
252
- 2. The original {source_lang} text
253
- 3. The {target_lang} translation
254
 
255
- Output ONLY a JSON array like this:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  [
257
- {{"bbox": [100, 50, 200, 80], "original": "日本語テスト", "translated": "English text"}},
258
- {{"bbox": [300, 100, 400, 130], "original": "もっとテキスト", "translated": "More text"}}
259
  ]
260
 
261
- Find as many text regions as possible. Do not skip any visible text."""
262
 
263
  try:
264
  response = client.chat.completions.create(
@@ -275,7 +281,7 @@ Find as many text regions as possible. Do not skip any visible text."""
275
  ]
276
  }
277
  ],
278
- max_tokens=4096
279
  )
280
 
281
  progress(0.4, desc="Processing response...")
 
238
  ratio = 2048 / max(original_size)
239
  processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
240
 
241
+ prompt = f"""You are a professional manga translator. Your task is to find and translate EVERY piece of {source_lang} text in this image.
242
 
243
+ IMPORTANT: Scan the ENTIRE image from top to bottom, left to right. Do NOT miss any text!
 
 
 
 
 
 
 
 
 
 
244
 
245
+ Find ALL of these text types:
246
+ - Main titles and headers
247
+ - Character names (above/below portraits)
248
+ - Speech bubbles and dialogue
249
+ - Narration boxes
250
+ - Sound effects (onomatopoeia)
251
+ - Labels, captions, descriptions
252
+ - Small text and annotations
253
+ - Relationship indicators (arrows, connections)
254
+ - ANY other visible {source_lang} text
255
+
256
+ For EACH text region found:
257
+ 1. bbox: [x1, y1, x2, y2] pixel coordinates
258
+ 2. original: the exact {source_lang} text
259
+ 3. translated: natural {target_lang} translation
260
+
261
+ Return a JSON array. Example:
262
  [
263
+ {{"bbox": [100, 50, 200, 80], "original": "キャラクター名", "translated": "Character Name"}},
264
+ {{"bbox": [300, 100, 400, 130], "original": "説明文", "translated": "Description"}}
265
  ]
266
 
267
+ CRITICAL: Find at least 20-50 text regions. This image has many text elements. Scan every corner carefully. Include ALL small labels and character descriptions."""
268
 
269
  try:
270
  response = client.chat.completions.create(
 
281
  ]
282
  }
283
  ],
284
+ max_tokens=8192
285
  )
286
 
287
  progress(0.4, desc="Processing response...")