lulavc commited on
Commit
37ca53e
·
verified ·
1 Parent(s): c194a7f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +391 -0
app.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BubbleScribe - AI Manga & Comic Translator
3
+ Translate manga/comics using GLM-4.6V-Flash for OCR + Translation and LaMa for inpainting.
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ import json
9
+ import base64
10
+ import re
11
+ import numpy as np
12
+ from PIL import Image, ImageDraw, ImageFont
13
+ from io import BytesIO
14
+ from openai import OpenAI
15
+
16
def get_glm_client():
    """Build an OpenAI-compatible client for the Z.ai GLM endpoint.

    Reads the key from the GLM_API_KEY environment variable and returns
    None when it is unset or empty (the caller reports this to the user).
    """
    key = os.environ.get("GLM_API_KEY")
    if key:
        return OpenAI(api_key=key, base_url="https://api.z.ai/api/paas/v4")
    return None
22
+
23
def encode_image_base64(image: Image.Image) -> str:
    """Serialize *image* to PNG bytes and return them base64-encoded."""
    with BytesIO() as buf:
        image.save(buf, format="PNG")
        raw = buf.getvalue()
    return base64.b64encode(raw).decode("utf-8")
28
+
29
def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str, progress=gr.Progress()):
    """Use GLM-4.6V to detect text regions in a page and translate them.

    Args:
        image: Manga/comic page to analyze.
        source_lang: Language of the text on the page.
        target_lang: Language to translate into.
        progress: Gradio progress reporter.

    Returns:
        (detections, status): detections is a list of dicts with keys
        "bbox", "original", "translated"; [] when nothing was found or
        the reply could not be parsed; None on a hard error (missing API
        key or request failure). status is a human-readable message.
    """
    client = get_glm_client()
    if not client:
        return None, "Error: GLM_API_KEY not set in Space secrets"

    progress(0.1, desc="Analyzing image with GLM-4.6V...")

    # Convert image to base64 for the data-URL payload.
    img_base64 = encode_image_base64(image)

    # Ask for a strict JSON array so the reply can be machine-parsed.
    prompt = f"""Analyze this manga/comic page. For each speech bubble or text region:
1. Detect the bounding box coordinates [x1, y1, x2, y2] (pixel coordinates)
2. Extract the original {source_lang} text
3. Translate to {target_lang}

Return ONLY a valid JSON array with this exact format:
[
{{"bbox": [x1, y1, x2, y2], "original": "original text", "translated": "translated text"}},
...
]

Important:
- bbox coordinates should be integers representing pixel positions
- x1,y1 = top-left corner, x2,y2 = bottom-right corner
- Include ALL text regions (speech bubbles, sound effects, narration boxes)
- Keep translations natural and contextually appropriate for manga
- If no text is found, return an empty array: []
"""

    try:
        response = client.chat.completions.create(
            model="glm-4.6v-flash",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{img_base64}"}
                        },
                        {"type": "text", "text": prompt}
                    ]
                }
            ],
            max_tokens=4096
        )

        progress(0.4, desc="Processing response...")

        # Prefer the normal content field; some replies only populate
        # reasoning_content, so fall back to it.
        msg = response.choices[0].message
        result_text = getattr(msg, 'content', None) or getattr(msg, 'reasoning_content', None) or ""

        # Grab the outermost [...] span -- the model may wrap the JSON
        # in prose or code fences.
        json_match = re.search(r'\[[\s\S]*\]', result_text)
        if not json_match:
            return [], "No text regions detected"

        try:
            detections = json.loads(json_match.group())
        except json.JSONDecodeError:
            # Fix: a malformed reply previously bubbled up as a generic
            # "Error: ..."; treat it as "nothing usable" instead.
            return [], "Could not parse model response as JSON"
        if not isinstance(detections, list):
            # Model returned a JSON value that is not an array.
            return [], "No text regions detected"
        return detections, f"Found {len(detections)} text regions"

    except Exception as e:
        # Network/API failures surface as a status string for the UI.
        return None, f"Error: {str(e)}"
98
+
99
def create_text_mask(image: Image.Image, detections: list) -> Image.Image:
    """Build a white-on-black 'L'-mode mask covering every detected bbox.

    Boxes come straight from model output, so they are treated as
    untrusted: non-dict entries, non-numeric coordinates, and degenerate
    or inverted boxes are skipped instead of crashing draw.rectangle.
    Each valid box is padded slightly and clamped to the image bounds.
    """
    mask = Image.new('L', image.size, 0)
    draw = ImageDraw.Draw(mask)

    padding = 5  # expand regions a little so inpainting covers glyph edges
    for det in detections:
        if not isinstance(det, dict):
            continue  # malformed entry from the model
        bbox = det.get('bbox', [])
        if len(bbox) != 4:
            continue
        try:
            x1, y1, x2, y2 = (int(v) for v in bbox)
        except (TypeError, ValueError):
            continue  # non-numeric coordinates
        # Pad and clamp to the image bounds.
        x1 = max(0, x1 - padding)
        y1 = max(0, y1 - padding)
        x2 = min(image.width, x2 + padding)
        y2 = min(image.height, y2 + padding)
        if x2 <= x1 or y2 <= y1:
            continue  # degenerate/inverted box would make draw.rectangle raise
        draw.rectangle([x1, y1, x2, y2], fill=255)

    return mask
117
+
118
def simple_inpaint(image: Image.Image, mask: Image.Image) -> Image.Image:
    """Fill masked regions with OpenCV's TELEA inpainting.

    Best-effort fallback: if cv2 is missing or inpainting fails for any
    reason, the original image is returned unchanged.
    """
    try:
        import cv2
        rgb = np.array(image.convert('RGB'))
        region = np.array(mask)
        filled = cv2.inpaint(rgb, region, inpaintRadius=7, flags=cv2.INPAINT_TELEA)
        return Image.fromarray(filled)
    except Exception as e:
        print(f"OpenCV inpaint failed: {e}")
        return image
131
+
132
def lama_inpaint(image: Image.Image, mask: Image.Image) -> Image.Image:
    """Erase masked text with the LaMa model, falling back to OpenCV.

    Any failure (package missing, model load error) degrades gracefully
    to simple_inpaint.
    """
    try:
        from simple_lama_inpainting import SimpleLama
        return SimpleLama()(image, mask)
    except Exception as e:
        print(f"LaMa inpaint failed: {e}, falling back to OpenCV")
        return simple_inpaint(image, mask)
142
+
143
def get_font(size: int):
    """Return a bold TrueType font at *size* for text overlay.

    Tries common Linux and Windows font paths in order and falls back to
    PIL's built-in bitmap font when none can be loaded.
    """
    font_paths = [
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
        "C:/Windows/Fonts/arial.ttf",
        "C:/Windows/Fonts/arialbd.ttf",
    ]

    for path in font_paths:
        if os.path.exists(path):
            try:
                return ImageFont.truetype(path, size)
            except OSError:
                # Fix: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; truetype raises OSError
                # for unreadable/corrupt font files.
                continue

    return ImageFont.load_default()
160
+
161
def add_translated_text(image: Image.Image, detections: list, font_size: int = 14) -> Image.Image:
    """Overlay translated text, centered in each detected bbox, onto *image*.

    Improvement over the original: text is word-wrapped to roughly fit
    the box width (previously one long line could overflow a narrow
    bubble). Text is drawn white with a 1px black outline for
    readability on any background.
    """
    import textwrap

    result = image.copy()
    draw = ImageDraw.Draw(result)

    for det in detections:
        bbox = det.get('bbox', [])
        translated = det.get('translated', '')
        if len(bbox) != 4 or not translated:
            continue

        x1, y1, x2, y2 = bbox
        box_width = x2 - x1
        box_height = y2 - y1

        # Heuristic font size from box dimensions, clamped to [10, 32].
        estimated_size = min(box_height // 2, box_width // max(len(translated), 1) * 2)
        estimated_size = max(10, min(estimated_size, 32))
        font = get_font(estimated_size)

        # Wrap so each line roughly fits the box width
        # (average glyph width assumed ~0.6em -- TODO confirm per font).
        chars_per_line = max(1, int(box_width / (estimated_size * 0.6)))
        wrapped = "\n".join(textwrap.wrap(translated, width=chars_per_line)) or translated

        # Measure the wrapped block and center it in the bbox.
        text_bbox = draw.multiline_textbbox((0, 0), wrapped, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        text_x = x1 + (box_width - text_width) // 2
        text_y = y1 + (box_height - text_height) // 2

        # 1px outline in all 8 directions, then the white fill on top.
        for dx in (-1, 0, 1):
            for dy in (-1, 0, 1):
                if dx or dy:
                    draw.multiline_text((text_x + dx, text_y + dy), wrapped, font=font, fill="black")
        draw.multiline_text((text_x, text_y), wrapped, font=font, fill="white")

    return result
203
+
204
def draw_detections(image: Image.Image, detections: list) -> Image.Image:
    """Return a copy of *image* with one colored box + label per detection.

    Labels show the detection index plus the first 20 characters of the
    original and translated text.
    """
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    font = get_font(12)

    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7", "#DDA0DD", "#98D8C8"]

    for idx, det in enumerate(detections):
        bbox = det.get('bbox', [])
        if len(bbox) != 4:
            continue
        original = det.get('original', '')[:20]
        translated = det.get('translated', '')[:20]
        x1, y1, x2, y2 = bbox
        color = colors[idx % len(colors)]

        draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
        # Label sits just above the box's top-left corner.
        draw.text((x1, y1 - 15), f"{idx+1}: {original} → {translated}", font=font, fill=color)

    return annotated
229
+
230
def translate_manga(image, source_lang, target_lang, show_boxes, apply_inpaint, progress=gr.Progress()):
    """Full pipeline: detect + translate, visualize, inpaint, typeset.

    Returns a (left_image, result_image, text) triple for the three UI
    outputs; on failure the text slot carries the error message.
    """
    if image is None:
        return None, None, "Please upload an image"

    # Gradio may hand us a numpy array; normalize to PIL.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # Step 1: detection + translation via GLM-4.6V.
    progress(0.1, desc="Detecting text with GLM-4.6V...")
    detections, status = detect_and_translate(image, source_lang, target_lang, progress)

    if detections is None:
        # Hard failure (missing API key / request error) -- surface status.
        return None, None, status
    if not detections:
        return image, image, "No text detected in the image"

    # Step 2: visualization overlay with bounding boxes.
    progress(0.5, desc="Creating visualization...")
    viz_image = draw_detections(image, detections)

    # Step 3: optionally erase the original text before typesetting.
    if apply_inpaint:
        progress(0.6, desc="Creating mask...")
        mask = create_text_mask(image, detections)

        progress(0.7, desc="Inpainting (removing original text)...")
        cleaned = lama_inpaint(image, mask)

        progress(0.9, desc="Adding translated text...")
        result = add_translated_text(cleaned, detections)
    else:
        result = add_translated_text(image, detections)

    # Pretty-printed JSON of the detections for the textbox output.
    det_text = json.dumps(detections, indent=2, ensure_ascii=False)

    progress(1.0, desc="Done!")

    return (viz_image if show_boxes else image), result, det_text
275
+
276
# Language options
# Offered in both the source and target dropdowns; the selected names are
# interpolated verbatim into the GLM prompt, so keep them as plain English
# language names.
LANGUAGES = [
    "Japanese",
    "Korean",
    "Chinese (Simplified)",
    "Chinese (Traditional)",
    "English",
    "Spanish",
    "Portuguese",
    "French",
    "German",
    "Italian",
    "Russian",
    "Thai",
    "Vietnamese",
    "Indonesian",
    "Arabic"
]
294
+
295
# CSS
# Custom styles passed to gr.Blocks: cap the app width and render the
# gradient page-header banner used by the gr.HTML header below.
css = """
.gradio-container {
max-width: 1200px !important;
}
.header {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 10px;
margin-bottom: 20px;
}
.header h1 {
color: white;
margin: 0;
}
.header p {
color: rgba(255,255,255,0.9);
margin: 5px 0 0 0;
}
"""
316
+
317
# Build UI
# Two-column layout: inputs (image + language/processing options) on the
# left, outputs (detection overlay, final page, raw JSON) on the right.
with gr.Blocks(title="BubbleScribe", css=css) as demo:
    # Header banner (styled by the .header CSS rules above).
    gr.HTML("""
<div class="header">
<h1>✍️ BubbleScribe</h1>
<p>AI-powered manga & comic translator using GLM-4.6V</p>
</div>
""")

    with gr.Row():
        with gr.Column(scale=1):
            # Input side: uploaded page plus translation settings.
            input_image = gr.Image(label="📤 Upload Manga Page", type="pil")

            with gr.Row():
                source_lang = gr.Dropdown(
                    choices=LANGUAGES,
                    value="Japanese",
                    label="Source Language"
                )
                target_lang = gr.Dropdown(
                    choices=LANGUAGES,
                    value="English",
                    label="Target Language"
                )

            with gr.Row():
                # Toggles map to translate_manga's show_boxes/apply_inpaint.
                show_boxes = gr.Checkbox(label="Show detection boxes", value=True)
                apply_inpaint = gr.Checkbox(label="Apply inpainting", value=True)

            translate_btn = gr.Button("🔄 Translate", variant="primary", size="lg")

        with gr.Column(scale=2):
            # Output side: detection overlay, translated page, and the
            # raw detections JSON for inspection.
            with gr.Row():
                detection_output = gr.Image(label="🔍 Detected Text Regions")
                result_output = gr.Image(label="✨ Translated Result")

            detections_json = gr.Textbox(
                label="📋 Detected Text (JSON)",
                lines=10,
                max_lines=20
            )

    # Usage notes rendered below the main layout.
    gr.Markdown("""
### 💡 How to Use
1. Upload a manga or comic page
2. Select source and target languages
3. Click "Translate" to process
4. View detected regions and translated result

### ⚠️ Notes
- Works best with clear, high-contrast text
- Speech bubbles are detected more reliably than sound effects
- First run may take longer (model loading)

### 🔧 Powered By
- **GLM-4.6V-Flash** - Text detection & translation (Z.ai API)
- **LaMa** - Text removal inpainting
""")

    # Footer with model and author links.
    gr.HTML("""
<div style="text-align: center; margin-top: 20px; padding: 10px; background: rgba(0,0,0,0.05); border-radius: 8px;">
<strong>Model:</strong> <a href="https://huggingface.co/zai-org/GLM-4.6V" target="_blank">zai-org/GLM-4.6V</a> •
<strong>Created by:</strong> <a href="https://huggingface.co/lulavc" target="_blank">@lulavc</a>
</div>
""")

    # Event handler
    # Wire the button to the main pipeline; outputs fill the three
    # components declared above.
    translate_btn.click(
        fn=translate_manga,
        inputs=[input_image, source_lang, target_lang, show_boxes, apply_inpaint],
        outputs=[detection_output, result_output, detections_json]
    )
389
+
390
if __name__ == "__main__":
    # Launch the Gradio server (Spaces runs this module as a script).
    demo.launch()