Spaces:
Runtime error
Improve OCR prompt + add debug logging
Browse files
app.py
CHANGED
|
@@ -274,25 +274,27 @@ def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str,
|
|
| 274 |
ratio = 2048 / max(original_size)
|
| 275 |
processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
|
| 276 |
|
| 277 |
-
prompt = f"""
|
| 278 |
-
1. Detect the bounding box coordinates [x1, y1, x2, y2] (pixel coordinates)
|
| 279 |
-
2. Extract the original {source_lang} text
|
| 280 |
-
3. Translate to {target_lang}
|
| 281 |
|
| 282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
[
|
| 284 |
-
{{"bbox": [
|
| 285 |
-
|
| 286 |
]
|
| 287 |
|
| 288 |
-
|
| 289 |
-
- bbox coordinates should be integers representing pixel positions
|
| 290 |
-
- x1,y1 = top-left corner, x2,y2 = bottom-right corner
|
| 291 |
-
- Include ALL text regions (speech bubbles, sound effects, narration boxes)
|
| 292 |
-
- Keep translations natural and contextually appropriate for manga
|
| 293 |
-
- Preserve the meaning and emotion of the original text
|
| 294 |
-
- If no text is found, return an empty array: []
|
| 295 |
-
"""
|
| 296 |
|
| 297 |
try:
|
| 298 |
response = client.chat.completions.create(
|
|
@@ -316,14 +318,22 @@ Important:
|
|
| 316 |
|
| 317 |
result_text = ""
|
| 318 |
msg = response.choices[0].message
|
|
|
|
|
|
|
| 319 |
if hasattr(msg, 'content') and msg.content:
|
| 320 |
result_text = msg.content
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
# Parse JSON from response with robust error handling
|
| 325 |
detections = safe_parse_json(result_text)
|
| 326 |
|
|
|
|
|
|
|
| 327 |
if detections:
|
| 328 |
# Scale bboxes back to original size if needed
|
| 329 |
if original_size != processed_size:
|
|
|
|
| 274 |
ratio = 2048 / max(original_size)
|
| 275 |
processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
|
| 276 |
|
| 277 |
+
prompt = f"""You are a manga OCR and translation assistant.
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
+
Look at this manga image carefully. Find ALL {source_lang} text including:
|
| 280 |
+
- Character names
|
| 281 |
+
- Speech bubbles
|
| 282 |
+
- Narration boxes
|
| 283 |
+
- Sound effects
|
| 284 |
+
- Labels and titles
|
| 285 |
+
|
| 286 |
+
For each text you find, provide:
|
| 287 |
+
1. The bounding box [x1, y1, x2, y2] in pixels (top-left to bottom-right)
|
| 288 |
+
2. The original {source_lang} text
|
| 289 |
+
3. The {target_lang} translation
|
| 290 |
+
|
| 291 |
+
Output ONLY a JSON array like this:
|
| 292 |
[
|
| 293 |
+
{{"bbox": [100, 50, 200, 80], "original": "日本語テキスト", "translated": "English text"}},
|
| 294 |
+
{{"bbox": [300, 100, 400, 130], "original": "もっとテキスト", "translated": "More text"}}
|
| 295 |
]
|
| 296 |
|
| 297 |
+
Find as many text regions as possible. Do not skip any visible text."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
|
| 299 |
try:
|
| 300 |
response = client.chat.completions.create(
|
|
|
|
| 318 |
|
| 319 |
result_text = ""
|
| 320 |
msg = response.choices[0].message
|
| 321 |
+
|
| 322 |
+
# Try multiple response fields
|
| 323 |
if hasattr(msg, 'content') and msg.content:
|
| 324 |
result_text = msg.content
|
| 325 |
+
if hasattr(msg, 'reasoning_content') and msg.reasoning_content:
|
| 326 |
+
# Append reasoning content if present
|
| 327 |
+
result_text = result_text + "\n" + msg.reasoning_content if result_text else msg.reasoning_content
|
| 328 |
+
|
| 329 |
+
print(f"📝 GLM Response length: {len(result_text)} chars")
|
| 330 |
+
print(f"📝 GLM Response preview: {result_text[:500] if result_text else 'EMPTY'}...")
|
| 331 |
|
| 332 |
# Parse JSON from response with robust error handling
|
| 333 |
detections = safe_parse_json(result_text)
|
| 334 |
|
| 335 |
+
print(f"📝 Parsed detections: {len(detections)} items")
|
| 336 |
+
|
| 337 |
if detections:
|
| 338 |
# Scale bboxes back to original size if needed
|
| 339 |
if original_size != processed_size:
|