File size: 11,356 Bytes
47dc635
 
 
 
 
 
 
 
 
 
fc4abe8
47dc635
 
 
 
 
 
 
fc4abe8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47dc635
 
 
 
fc4abe8
 
47dc635
 
 
 
 
 
fc4abe8
47dc635
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc4abe8
 
47dc635
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
from services.bubble_detector_kiuyha_service import Bubble_Detector_Kiuyha_Service
from services.translate_qwen_service import Translate_Qwen_Service
from PIL import Image, ImageDraw, ImageFont
import tempfile
import os
import re
import torch
from pathlib import Path
from helpers import get_project_root, setup_fonts
from manga_ocr import MangaOcr
import httpx

class ImageProcessor:
    """Detect speech bubbles on a manga page, OCR them and translate the text.

    The collaborators are injected by the caller:

    * ``bubble_detector`` must expose ``predict(image_path)`` and return an
      iterable of dicts, each carrying a ``'coords'`` entry that unpacks to
      ``(x1, y1, x2, y2)``.
    * ``ocr_model`` is called with a cropped PIL image and returns the text.
    * ``translate_model`` must expose ``translate_cloud(texts)`` and
      ``translate(texts)``. The result is read with ``.get`` using the bubble
      index (as ``str`` first, then as ``int``) as the key.
    """

    # Line separators that OCR output may contain; collapsed to one space.
    _LINE_BREAKS = re.compile(r'[\n\r\u2028\u2029]+')

    def __init__(self, bubble_detector, ocr_model, translate_model):
        self.bubble_detector_model = bubble_detector
        self.ocr_model = ocr_model
        self.translate_model = translate_model

    async def download_and_process(self, image_url: str, language: str):
        """Download *image_url* to a temporary file and run ``process_image``.

        Args:
            image_url: URL of the page image to fetch.
            language: Passed through to ``process_image``.

        Returns:
            The list produced by ``process_image``.

        Raises:
            Any error raised by the HTTP request (including the one from
            ``raise_for_status`` on an error response) or by
            ``process_image``. The temporary file is removed in every case.
        """
        # Download first so that a failed request never creates a temp file.
        async with httpx.AsyncClient() as client:
            response = await client.get(image_url)
            response.raise_for_status()
            payload = response.content

        temp_path = None
        try:
            # delete=False: the file must outlive this ``with`` block because
            # process_image reopens it by path after it has been closed.
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
                temp_path = tmp.name
                tmp.write(payload)

            # NOTE(review): process_image is synchronous and runs on the event
            # loop thread here; consider offloading it if that blocks other
            # requests.
            return self.process_image(temp_path, language)
        finally:
            if temp_path is not None and os.path.exists(temp_path):
                os.remove(temp_path)
                print(f"Cleaned up temp file: {temp_path}")

    def process_image(self, image_path, language: str) -> list:
        """Run bubble detection, OCR and translation on one image.

        Args:
            image_path: Path of the image file to process.
            language: Currently unused by this method; kept for interface
                compatibility.

        Returns:
            A list with one dict per detected bubble containing
            ``bubble_index``, the page ``width``/``height``, the box corners
            ``x1``/``y1``/``x2``/``y2`` as floats, ``original_text`` and
            ``translated_text``.
        """
        bubble_results = self.bubble_detector_model.predict(image_path)
        print(f"bubble results: {bubble_results}")

        # The context manager closes the underlying file once OCR is done.
        with Image.open(image_path) as img:
            width, height = img.size
            texts, coordinates = self._ocr_bubbles(img, bubble_results)
        print(f'OCR Complete, total {len(texts)} bubbles.')

        translated = self._translate(texts)
        print(translated)

        # NOTE: drawing the translated text directly onto the panel is
        # disabled; only the per-bubble data is returned. A commented-out
        # reference implementation of the drawing step is kept at the bottom
        # of this file.
        return self._build_bubble_data(texts, coordinates, translated, width, height)

    def _ocr_bubbles(self, img, bubble_results):
        """Crop every detected bubble out of *img* and OCR it.

        Returns a ``(texts, coordinates)`` pair: ``texts`` is a list of
        ``{"id": index, "text": str}`` dicts and ``coordinates`` maps the same
        index to the bubble's ``coords`` value.
        """
        texts = []
        coordinates = {}
        for index, box_data in enumerate(bubble_results):
            coords = box_data['coords']
            # OCR consumes the in-memory crop directly, so nothing is written
            # to disk per bubble.
            box_cropped = img.crop(coords)

            text = ""
            try:
                text = self.ocr_model(box_cropped)
            except Exception as e:
                # Best effort: a failed bubble keeps an empty text so the
                # remaining bubbles are still processed.
                print(f"text OCR failed for {index}: {e}")

            text = self._LINE_BREAKS.sub(' ', text)  # remove new lines
            texts.append({"id": index, "text": text})
            coordinates[index] = coords
        return texts, coordinates

    def _translate(self, texts):
        """Translate *texts* with the cloud model, falling back to the local one."""
        try:
            print("Translating with cloud Qwen model...")
            return self.translate_model.translate_cloud(texts)
        except Exception as e:
            print(f"API translation failed with Qwen, falling back to local model... ({e})")
            return self.translate_model.translate(texts)

    @staticmethod
    def _build_bubble_data(texts, coordinates, translated, width, height):
        """Combine OCR text, box coordinates and translations into result dicts."""
        bubble_data = []
        for index, entry in enumerate(texts):
            x1, y1, x2, y2 = coordinates[index]
            original_text = entry["text"]
            # Translation keys may be the index as a string or as an int.
            translated_text = translated.get(str(index), translated.get(index, ""))
            if not isinstance(translated_text, str):
                translated_text = str(translated_text)
            print(f"{index}: {original_text}")
            print(translated_text)
            print("==================================")

            bubble_data.append({
                "bubble_index": index,
                "width": width,
                "height": height,
                "x1": float(x1), "y1": float(y1), "x2": float(x2), "y2": float(y2),
                "original_text": original_text,
                "translated_text": translated_text,
            })
        return bubble_data
    

######## Legacy test code, kept here for reference only. TODO: remove later. ########
# def show_boxes(image_path):
#     result = bubble_detector_model.predict(image_path)
#     img = Image.open(image_path).convert("RGB")
#     draw = ImageDraw.Draw(img)
#     for box in result.boxes:
#         # Get coordinates as a list of floats
#         coords = box.xyxy[0].tolist() # [x1, y1, x2, y2]
#         draw.rectangle(coords, outline="red", width=1)

#         # label
#         conf = box.conf[0].item()
#         box_cropped = img.crop(coords)
#         # box_cropped = upscale_for_ocr(box_cropped, scale=3)
#         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
#             box_cropped.save(f.name)
#             temp_path = f.name
#         draw.text(
#             (coords[0], coords[1] - 10),
#             "b",
#             fill="red",
#             font=font
#         )
#     img.show()

# def get_wrapped_text(text, font, max_width):
#     lines = []
#     words = text.split(' ') # Split by words for English
#     current_line = []

#     for word in words:
#         # Check if adding the next word exceeds the width
#         test_line = ' '.join(current_line + [word])
#         # getlength() is more accurate than getbbox for text width
#         if font.getlength(test_line) <= max_width:
#             current_line.append(word)
#         else:
#             lines.append(' '.join(current_line))
#             current_line = [word]

#     lines.append(' '.join(current_line))
#     return lines

# def fit_text_to_box(draw, text, box_coords, font_path, padding=5, initial_size=40):
#     x1, y1, x2, y2 = box_coords

#     padding = padding
#     target_width = (x2 - x1) - (padding * 2)
#     target_height = (y2 - y1) - (padding * 2)

#     current_size = initial_size
#     lines = []

#     while current_size > 8:
#         # index=0 for Japanese, 1 for Korean in NotoSansCJK
#         font = ImageFont.truetype(font_path, size=current_size)
#         lines = get_wrapped_text(text, font, target_width)

#         # Use a more reliable line height measurement
#         # getbbox can be inconsistent; use font.size * constant for better leading
#         line_height = int(current_size * 1.2)
#         total_height = line_height * len(lines)

#         if total_height <= target_height:
#             break
#         current_size -= 2 # Step down by 2 for speed

#     return lines, font, current_size, line_height

# def upscale_for_ocr(img, scale=2):
#     w, h = img.size
#     return img.resize((w*scale, h*scale), Image.BICUBIC)

# def process_image(image_path, language):
#     bubble_results = bubble_detector_model.predict(image_path)
#     print(f"bubble results: {bubble_results}")
#     img = Image.open(image_path)
#     draw = ImageDraw.Draw(img)

#     texts = []
#     coordinates={}
#     i=0
#     for box_data in bubble_results:
#         coords = box_data['coords']
#         draw.rectangle(coords, outline="red", width=1)
#         box_cropped = img.crop(coords)
#         # box_cropped = upscale_for_ocr(box_cropped, scale=3)
#         # box_cropped.show()

#         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
#             box_cropped.save(f.name)
#             temp_path = f.name

#         text = ""
#         # if language == "japanese":
#         #     # text = ocr_japanese_model.runOCR(temp_path)
#         #     text = ocr_model(temp_path)
#         # else:
#         #     text = ocr_model.runOCR(temp_path)

#         text = ocr_model(box_cropped) 

#         text = re.sub(r'[\n\r\u2028\u2029]+', ' ', text) #remove new lines
#         texts.append({"id": i, "text": text})
#         coordinates[i] = coords
#         i+=1
#     print(f'OCR Complete, total {len(texts)} bubbles.')

#     #add translated text to manga image
#     try:
#         print("Translating with cloud Qwen model...")
#         translated = translate_model.translate_cloud(texts)
#     except Exception as e:
#         print("API translation failed with Qwen, falling back to local model...")
#         translated = translate_model.translate(texts)

#     print(translated)

#     bubble_data = []
#     for i in range(len(texts)):
#         coords = coordinates[i]
#         x1, y1, x2, y2 = coords
#         original_text = texts[i]["text"]
#         translated_text = translated.get(str(i), translated.get(i, ""))
#         if not isinstance(translated_text, str):
#             translated_text = str(translated_text)
#         print(f"{i}: {original_text}")
#         print(translated_text)
#         print("==================================")

#         bubble_data.append({
#             "bubble_index": i,
#             "x1": float(x1), "y1": float(y1), "x2": float(x2), "y2": float(y2),
#             "original_text": original_text,
#             "translated_text": translated_text,
#         })

#         #wipe the space
#         draw.rectangle(coords, fill="white", outline="white")

#         # 1. Calculate the best fit
#         lines, best_font, final_size, line_h = fit_text_to_box(draw, translated_text, coords, FONT_PATH)

#         # Calculate total height of the block
#         total_h = line_h * len(lines)

#         # Start_y adjusted for the block height relative to the box center
#         start_y = coords[1] + ((coords[3] - coords[1]) - total_h) / 2

#         # 3. Draw each line centered horizontally
#         for line in lines:
#             line = line.strip()
#             if not line: continue

#             # Horizontal Centering
#             line_w = draw.textlength(line, font=best_font)
#             start_x = coords[0] + ((coords[2] - coords[0]) - line_w) / 2

#             draw.text((start_x, start_y), line, font=best_font, fill="black")
#             start_y += line_h

#     return img, bubble_data

# def translate_text(text, language):
#     # translated_text = ""
#     # if language == "japanese":
#     #     translated_text =

#     translated_text = translate_model.translate(text)

#     return translated_text

# def _language_to_code(language: str) -> str:
#     """Map language name to ISO 639-1 style code for DB."""
#     m = {"japanese": "ja", "english": "en", "korean": "ko", "chinese": "zh"}
#     return m.get(language.lower(), language[:2] if len(language) >= 2 else "ja")