Spaces:
Running
Running
Merge branch 'main' of https://github.com/TonyLiu2004/Multimodal-Manga-Translator
Browse files- __pycache__/helpers.cpython-310.pyc +0 -0
- main.py +90 -25
- services/bubble_detector_kitsumed_service.py +1 -2
__pycache__/helpers.cpython-310.pyc
DELETED
|
Binary file (1.46 kB)
|
|
|
main.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from services.OCR_glm_service import OCR_Glm_Service
|
| 2 |
from services.translate_tencentHY_service import Translate_Tencent_Service
|
| 3 |
from services.bubble_detector_kitsumed_service import Bubble_Detector_Kitsumed_Service
|
|
|
|
| 4 |
from services.bubble_detector_kiuyha_service import Bubble_Detector_Kiuyha_Service
|
| 5 |
from services.OCR_japanese_service import OCR_Japanese_Service
|
| 6 |
from services.translate_qwen_service import Translate_Qwen_Service
|
|
@@ -12,6 +13,7 @@ import torch
|
|
| 12 |
from pathlib import Path
|
| 13 |
from helpers import get_project_root, setup_fonts
|
| 14 |
from fastapi import FastAPI
|
|
|
|
| 15 |
|
| 16 |
###
|
| 17 |
###
|
|
@@ -84,18 +86,18 @@ def show_boxes(image_path):
|
|
| 84 |
# Get coordinates as a list of floats
|
| 85 |
coords = box.xyxy[0].tolist() # [x1, y1, x2, y2]
|
| 86 |
draw.rectangle(coords, outline="red", width=1)
|
| 87 |
-
|
| 88 |
# label
|
| 89 |
conf = box.conf[0].item()
|
| 90 |
box_cropped = img.crop(coords)
|
| 91 |
# box_cropped = upscale_for_ocr(box_cropped, scale=3)
|
| 92 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
|
| 93 |
-
box_cropped.save(f.name)
|
| 94 |
temp_path = f.name
|
| 95 |
draw.text(
|
| 96 |
-
(coords[0], coords[1] - 10),
|
| 97 |
-
"b",
|
| 98 |
-
fill="red",
|
| 99 |
font=font
|
| 100 |
)
|
| 101 |
img.show()
|
|
@@ -114,34 +116,34 @@ def get_wrapped_text(text, font, max_width):
|
|
| 114 |
else:
|
| 115 |
lines.append(' '.join(current_line))
|
| 116 |
current_line = [word]
|
| 117 |
-
|
| 118 |
lines.append(' '.join(current_line))
|
| 119 |
return lines
|
| 120 |
|
| 121 |
def fit_text_to_box(draw, text, box_coords, font_path, padding=5, initial_size=40):
    """Find the largest font size at which the wrapped text fits the box.

    Starting from ``initial_size`` the font is shrunk in steps of 2 until the
    wrapped text fits the padded box height or the minimum size of 8 is hit.

    Args:
        draw: Unused here; kept so existing callers keep working.
        text: Text to lay out.
        box_coords: ``(x1, y1, x2, y2)`` of the target box.
        font_path: Path handed to ``ImageFont.truetype``.
        padding: Margin subtracted from every side of the box.
        initial_size: First (largest) font size to try.

    Returns:
        ``(lines, font, current_size, line_height)``. All four values always
        describe the same font size, including when the minimum size is
        reached or ``initial_size`` is already at or below the minimum.
    """
    x1, y1, x2, y2 = box_coords
    target_width = (x2 - x1) - (padding * 2)
    target_height = (y2 - y1) - (padding * 2)

    min_size = 8
    current_size = initial_size

    while True:
        font = ImageFont.truetype(font_path, size=current_size)
        lines = get_wrapped_text(text, font, target_width)

        # font size * constant gives steadier leading than per-line getbbox
        line_height = int(current_size * 1.2)
        total_height = line_height * len(lines)

        # Stop on a fit, or at the floor: the layout is always measured at the
        # size that is returned, so font/lines/line_height never go stale.
        if total_height <= target_height or current_size <= min_size:
            break
        current_size = max(current_size - 2, min_size)  # step down by 2 for speed

    return lines, font, current_size, line_height
|
| 146 |
|
| 147 |
def upscale_for_ocr(img, scale=2):
|
|
@@ -152,7 +154,7 @@ def process_image(image_path, language):
|
|
| 152 |
bubble_results = bubble_detector_model.predict(image_path)
|
| 153 |
img = Image.open(image_path)
|
| 154 |
draw = ImageDraw.Draw(img)
|
| 155 |
-
|
| 156 |
texts = []
|
| 157 |
coordinates={}
|
| 158 |
i=0
|
|
@@ -164,7 +166,7 @@ def process_image(image_path, language):
|
|
| 164 |
# box_cropped.show()
|
| 165 |
|
| 166 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
|
| 167 |
-
box_cropped.save(f.name)
|
| 168 |
temp_path = f.name
|
| 169 |
|
| 170 |
text = ""
|
|
@@ -184,13 +186,26 @@ def process_image(image_path, language):
|
|
| 184 |
print("translating...")
|
| 185 |
translated = translate_model.translate(texts)
|
| 186 |
print(translated)
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
print(translated_text)
|
| 192 |
print("==================================")
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
#wipe the space
|
| 195 |
draw.rectangle(coords, fill="white", outline="white")
|
| 196 |
|
|
@@ -207,23 +222,23 @@ def process_image(image_path, language):
|
|
| 207 |
for line in lines:
|
| 208 |
line = line.strip()
|
| 209 |
if not line: continue
|
| 210 |
-
|
| 211 |
# Horizontal Centering
|
| 212 |
line_w = draw.textlength(line, font=best_font)
|
| 213 |
start_x = coords[0] + ((coords[2] - coords[0]) - line_w) / 2
|
| 214 |
-
|
| 215 |
draw.text((start_x, start_y), line, font=best_font, fill="black")
|
| 216 |
start_y += line_h
|
| 217 |
|
| 218 |
-
return img
|
| 219 |
|
| 220 |
def translate_text(text, language):
    """Translate ``text`` using the module-level ``translate_model``.

    ``language`` is accepted but not consulted by the current implementation.
    """
    # NOTE: a per-language branch (e.g. for "japanese") was sketched here
    # but never implemented.
    return translate_model.translate(text)
|
| 228 |
|
| 229 |
def runOCRTests():
|
|
@@ -238,10 +253,60 @@ def runOCRTests():
|
|
| 238 |
print(f"failed on {i}")
|
| 239 |
break
|
| 240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
def main():
    """Run the pipeline on the bundled sample page and display the result."""
    sample_page = ROOT / "test_images" / "test_2.png"
    rendered = process_image(sample_page, "japanese")
    rendered.show()
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
@app.get("/")
|
| 247 |
def home():
|
|
|
|
| 1 |
from services.OCR_glm_service import OCR_Glm_Service
|
| 2 |
from services.translate_tencentHY_service import Translate_Tencent_Service
|
| 3 |
from services.bubble_detector_kitsumed_service import Bubble_Detector_Kitsumed_Service
|
| 4 |
+
|
| 5 |
from services.bubble_detector_kiuyha_service import Bubble_Detector_Kiuyha_Service
|
| 6 |
from services.OCR_japanese_service import OCR_Japanese_Service
|
| 7 |
from services.translate_qwen_service import Translate_Qwen_Service
|
|
|
|
| 13 |
from pathlib import Path
|
| 14 |
from helpers import get_project_root, setup_fonts
|
| 15 |
from fastapi import FastAPI
|
| 16 |
+
import db as manga_db
|
| 17 |
|
| 18 |
###
|
| 19 |
###
|
|
|
|
| 86 |
# Get coordinates as a list of floats
|
| 87 |
coords = box.xyxy[0].tolist() # [x1, y1, x2, y2]
|
| 88 |
draw.rectangle(coords, outline="red", width=1)
|
| 89 |
+
|
| 90 |
# label
|
| 91 |
conf = box.conf[0].item()
|
| 92 |
box_cropped = img.crop(coords)
|
| 93 |
# box_cropped = upscale_for_ocr(box_cropped, scale=3)
|
| 94 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
|
| 95 |
+
box_cropped.save(f.name)
|
| 96 |
temp_path = f.name
|
| 97 |
draw.text(
|
| 98 |
+
(coords[0], coords[1] - 10),
|
| 99 |
+
"b",
|
| 100 |
+
fill="red",
|
| 101 |
font=font
|
| 102 |
)
|
| 103 |
img.show()
|
|
|
|
| 116 |
else:
|
| 117 |
lines.append(' '.join(current_line))
|
| 118 |
current_line = [word]
|
| 119 |
+
|
| 120 |
lines.append(' '.join(current_line))
|
| 121 |
return lines
|
| 122 |
|
| 123 |
def fit_text_to_box(draw, text, box_coords, font_path, padding=5, initial_size=40):
    """Find the largest font size at which the wrapped text fits the box.

    Starting from ``initial_size`` the font is shrunk in steps of 2 until the
    wrapped text fits the padded box height or the minimum size of 8 is hit.

    Args:
        draw: Unused here; kept so existing callers keep working.
        text: Text to lay out.
        box_coords: ``(x1, y1, x2, y2)`` of the target box.
        font_path: Path handed to ``ImageFont.truetype``.
        padding: Margin subtracted from every side of the box.
        initial_size: First (largest) font size to try.

    Returns:
        ``(lines, font, current_size, line_height)``. All four values always
        describe the same font size, including when the minimum size is
        reached or ``initial_size`` is already at or below the minimum.
    """
    x1, y1, x2, y2 = box_coords
    target_width = (x2 - x1) - (padding * 2)
    target_height = (y2 - y1) - (padding * 2)

    min_size = 8
    current_size = initial_size

    while True:
        font = ImageFont.truetype(font_path, size=current_size)
        lines = get_wrapped_text(text, font, target_width)

        # font size * constant gives steadier leading than per-line getbbox
        line_height = int(current_size * 1.2)
        total_height = line_height * len(lines)

        # Stop on a fit, or at the floor: the layout is always measured at the
        # size that is returned, so font/lines/line_height never go stale.
        if total_height <= target_height or current_size <= min_size:
            break
        current_size = max(current_size - 2, min_size)  # step down by 2 for speed

    return lines, font, current_size, line_height
|
| 148 |
|
| 149 |
def upscale_for_ocr(img, scale=2):
|
|
|
|
| 154 |
bubble_results = bubble_detector_model.predict(image_path)
|
| 155 |
img = Image.open(image_path)
|
| 156 |
draw = ImageDraw.Draw(img)
|
| 157 |
+
|
| 158 |
texts = []
|
| 159 |
coordinates={}
|
| 160 |
i=0
|
|
|
|
| 166 |
# box_cropped.show()
|
| 167 |
|
| 168 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
|
| 169 |
+
box_cropped.save(f.name)
|
| 170 |
temp_path = f.name
|
| 171 |
|
| 172 |
text = ""
|
|
|
|
| 186 |
print("translating...")
|
| 187 |
translated = translate_model.translate(texts)
|
| 188 |
print(translated)
|
| 189 |
+
|
| 190 |
+
bubble_data = []
|
| 191 |
+
for i in range(len(texts)):
|
| 192 |
+
coords = coordinates[i]
|
| 193 |
+
x1, y1, x2, y2 = coords
|
| 194 |
+
original_text = texts[i]["text"]
|
| 195 |
+
translated_text = translated.get(str(i), translated.get(i, ""))
|
| 196 |
+
if not isinstance(translated_text, str):
|
| 197 |
+
translated_text = str(translated_text)
|
| 198 |
+
print(f"{i}: {original_text}")
|
| 199 |
print(translated_text)
|
| 200 |
print("==================================")
|
| 201 |
|
| 202 |
+
bubble_data.append({
|
| 203 |
+
"bubble_index": i,
|
| 204 |
+
"x1": float(x1), "y1": float(y1), "x2": float(x2), "y2": float(y2),
|
| 205 |
+
"original_text": original_text,
|
| 206 |
+
"translated_text": translated_text,
|
| 207 |
+
})
|
| 208 |
+
|
| 209 |
#wipe the space
|
| 210 |
draw.rectangle(coords, fill="white", outline="white")
|
| 211 |
|
|
|
|
| 222 |
for line in lines:
|
| 223 |
line = line.strip()
|
| 224 |
if not line: continue
|
| 225 |
+
|
| 226 |
# Horizontal Centering
|
| 227 |
line_w = draw.textlength(line, font=best_font)
|
| 228 |
start_x = coords[0] + ((coords[2] - coords[0]) - line_w) / 2
|
| 229 |
+
|
| 230 |
draw.text((start_x, start_y), line, font=best_font, fill="black")
|
| 231 |
start_y += line_h
|
| 232 |
|
| 233 |
+
return img, bubble_data
|
| 234 |
|
| 235 |
def translate_text(text, language):
    """Translate ``text`` using the module-level ``translate_model``.

    ``language`` is accepted but not consulted by the current implementation.
    """
    # NOTE: a per-language branch (e.g. for "japanese") was sketched here
    # but never implemented.
    return translate_model.translate(text)
|
| 243 |
|
| 244 |
def runOCRTests():
|
|
|
|
| 253 |
print(f"failed on {i}")
|
| 254 |
break
|
| 255 |
|
| 256 |
+
def _language_to_code(language: str) -> str:
|
| 257 |
+
"""Map language name to ISO 639-1 style code for DB."""
|
| 258 |
+
m = {"japanese": "ja", "english": "en", "korean": "ko", "chinese": "zh"}
|
| 259 |
+
return m.get(language.lower(), language[:2] if len(language) >= 2 else "ja")
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def process_chapter(
    manga_title: str,
    chapter_number: float,
    page_paths: list,
    language: str = "japanese",
    provider_id: str = "local",
    db_url: str = None,
):
    """
    Process each page of a chapter, draw translated text on the images, and
    save the per-bubble text through ``manga_db`` (provider_id, manga_title,
    chapter/page, segment coordinates, original/translated text, language
    code). The images themselves are returned, not passed to the DB layer.

    page_paths: iterable of paths to page images in order. It is materialized
        once, so generators work too. Page numbers start at 1 and follow the
        position in this sequence; missing files are skipped but still
        consume their page number.
    provider_id: source/provider identifier (e.g. 'mangadex', 'local').
    db_url: database URL forwarded to ``manga_db``; when None the DB layer
        presumably falls back to DATABASE_URL (confirm in db.py).
    Returns a list of (img, bubble_data), one entry per processed page.
    """
    manga_db.init_db(db_url)
    language_code = _language_to_code(language)

    # Materialize once: len() below would fail on a generator, and a one-shot
    # iterable must not be consumed before the loop.
    page_paths = list(page_paths)
    total_pages = len(page_paths)

    results = []
    for page_number, image_path in enumerate(page_paths, start=1):
        path = Path(image_path)
        if not path.exists():
            print(f"Skip missing page {page_number}: {path}")
            continue
        print(f"Processing chapter {chapter_number} page {page_number}/{total_pages}: {path.name}")
        img, bubble_data = process_image(str(path), language)
        manga_db.save_page_translation(
            provider_id=provider_id,
            manga_title=manga_title,
            chapter_number=chapter_number,
            page_number=page_number,
            bubbles=bubble_data,
            language_code=language_code,
            db_url=db_url,
        )
        results.append((img, bubble_data))
    print(f"Chapter '{manga_title}' ch.{chapter_number} saved to DB ({len(results)} pages).")
    return results
|
| 301 |
+
|
| 302 |
+
|
| 303 |
def main():
    """Run the pipeline on the bundled sample page and display the result."""
    sample_page = ROOT / "test_images" / "test_2.png"
    rendered, bubble_data = process_image(sample_page, "japanese")
    rendered.show()
    # NOTE: persisting is intentionally left disabled here; to store the page:
    # manga_db.save_page_translation(provider_id="local", manga_title="Test", chapter_number=0,
    #                                page_number=1, bubbles=bubble_data, language_code="ja")
|
| 309 |
+
|
| 310 |
|
| 311 |
@app.get("/")
|
| 312 |
def home():
|
services/bubble_detector_kitsumed_service.py
CHANGED
|
@@ -18,5 +18,4 @@ class Bubble_Detector_Kitsumed_Service:
|
|
| 18 |
show_conf=show_conf,
|
| 19 |
imgsz=imgsz,
|
| 20 |
)
|
| 21 |
-
return results[0]
|
| 22 |
-
|
|
|
|
| 18 |
show_conf=show_conf,
|
| 19 |
imgsz=imgsz,
|
| 20 |
)
|
| 21 |
+
return results[0]
|
|
|