lulavc commited on
Commit
37ca53e
·
verified ·
1 Parent(s): c194a7f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +391 -0
app.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BubbleScribe - AI Manga & Comic Translator
3
+ Translate manga/comics using GLM-4.6V-Flash for OCR + Translation and LaMa for inpainting.
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ import json
9
+ import base64
10
+ import re
11
+ import numpy as np
12
+ from PIL import Image, ImageDraw, ImageFont
13
+ from io import BytesIO
14
+ from openai import OpenAI
15
+
16
def get_glm_client():
    """Build an OpenAI-compatible client for the Z.ai GLM endpoint.

    Reads the key from the GLM_API_KEY environment variable and returns
    None when it is unset or empty (the caller reports this to the user).
    """
    key = os.environ.get("GLM_API_KEY")
    if key:
        return OpenAI(api_key=key, base_url="https://api.z.ai/api/paas/v4")
    return None
22
+
23
def encode_image_base64(image: Image.Image) -> str:
    """Serialize *image* to PNG bytes and return them base64-encoded."""
    with BytesIO() as buf:
        image.save(buf, format="PNG")
        raw = buf.getvalue()
    return base64.b64encode(raw).decode("utf-8")
28
+
29
def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str, progress=gr.Progress()):
    """Use GLM-4.6V to detect text regions in a page and translate them.

    Args:
        image: Manga/comic page to analyze.
        source_lang: Language of the text on the page.
        target_lang: Language to translate into.
        progress: Gradio progress reporter.

    Returns:
        (detections, status): detections is a list of dicts with keys
        "bbox", "original", "translated"; [] when nothing was found or
        the reply could not be parsed; None on a hard error (missing API
        key or request failure). status is a human-readable message.
    """
    client = get_glm_client()
    if not client:
        return None, "Error: GLM_API_KEY not set in Space secrets"

    progress(0.1, desc="Analyzing image with GLM-4.6V...")

    # Convert image to base64 for the data-URL payload.
    img_base64 = encode_image_base64(image)

    # Ask for a strict JSON array so the reply can be machine-parsed.
    prompt = f"""Analyze this manga/comic page. For each speech bubble or text region:
1. Detect the bounding box coordinates [x1, y1, x2, y2] (pixel coordinates)
2. Extract the original {source_lang} text
3. Translate to {target_lang}

Return ONLY a valid JSON array with this exact format:
[
{{"bbox": [x1, y1, x2, y2], "original": "original text", "translated": "translated text"}},
...
]

Important:
- bbox coordinates should be integers representing pixel positions
- x1,y1 = top-left corner, x2,y2 = bottom-right corner
- Include ALL text regions (speech bubbles, sound effects, narration boxes)
- Keep translations natural and contextually appropriate for manga
- If no text is found, return an empty array: []
"""

    try:
        response = client.chat.completions.create(
            model="glm-4.6v-flash",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{img_base64}"}
                        },
                        {"type": "text", "text": prompt}
                    ]
                }
            ],
            max_tokens=4096
        )

        progress(0.4, desc="Processing response...")

        # Prefer the normal content field; some replies only populate
        # reasoning_content, so fall back to it.
        msg = response.choices[0].message
        result_text = getattr(msg, 'content', None) or getattr(msg, 'reasoning_content', None) or ""

        # Grab the outermost [...] span -- the model may wrap the JSON
        # in prose or code fences.
        json_match = re.search(r'\[[\s\S]*\]', result_text)
        if not json_match:
            return [], "No text regions detected"

        try:
            detections = json.loads(json_match.group())
        except json.JSONDecodeError:
            # Fix: a malformed reply previously bubbled up as a generic
            # "Error: ..."; treat it as "nothing usable" instead.
            return [], "Could not parse model response as JSON"
        if not isinstance(detections, list):
            # Model returned a JSON value that is not an array.
            return [], "No text regions detected"
        return detections, f"Found {len(detections)} text regions"

    except Exception as e:
        # Network/API failures surface as a status string for the UI.
        return None, f"Error: {str(e)}"
98
+
99
def create_text_mask(image: Image.Image, detections: list) -> Image.Image:
    """Build a white-on-black 'L'-mode mask covering every detected bbox.

    Boxes come straight from model output, so they are treated as
    untrusted: non-dict entries, non-numeric coordinates, and degenerate
    or inverted boxes are skipped instead of crashing draw.rectangle.
    Each valid box is padded slightly and clamped to the image bounds.
    """
    mask = Image.new('L', image.size, 0)
    draw = ImageDraw.Draw(mask)

    padding = 5  # expand regions a little so inpainting covers glyph edges
    for det in detections:
        if not isinstance(det, dict):
            continue  # malformed entry from the model
        bbox = det.get('bbox', [])
        if len(bbox) != 4:
            continue
        try:
            x1, y1, x2, y2 = (int(v) for v in bbox)
        except (TypeError, ValueError):
            continue  # non-numeric coordinates
        # Pad and clamp to the image bounds.
        x1 = max(0, x1 - padding)
        y1 = max(0, y1 - padding)
        x2 = min(image.width, x2 + padding)
        y2 = min(image.height, y2 + padding)
        if x2 <= x1 or y2 <= y1:
            continue  # degenerate/inverted box would make draw.rectangle raise
        draw.rectangle([x1, y1, x2, y2], fill=255)

    return mask
117
+
118
def simple_inpaint(image: Image.Image, mask: Image.Image) -> Image.Image:
    """Fill masked regions with OpenCV's TELEA inpainting.

    Best-effort fallback: if cv2 is missing or inpainting fails for any
    reason, the original image is returned unchanged.
    """
    try:
        import cv2
        rgb = np.array(image.convert('RGB'))
        region = np.array(mask)
        filled = cv2.inpaint(rgb, region, inpaintRadius=7, flags=cv2.INPAINT_TELEA)
        return Image.fromarray(filled)
    except Exception as e:
        print(f"OpenCV inpaint failed: {e}")
        return image
131
+
132
def lama_inpaint(image: Image.Image, mask: Image.Image) -> Image.Image:
    """Erase masked text with the LaMa model, falling back to OpenCV.

    Any failure (package missing, model load error) degrades gracefully
    to simple_inpaint.
    """
    try:
        from simple_lama_inpainting import SimpleLama
        return SimpleLama()(image, mask)
    except Exception as e:
        print(f"LaMa inpaint failed: {e}, falling back to OpenCV")
        return simple_inpaint(image, mask)
142
+
143
def get_font(size: int):
    """Return a bold TrueType font at *size* for text overlay.

    Tries common Linux and Windows font paths in order and falls back to
    PIL's built-in bitmap font when none can be loaded.
    """
    font_paths = [
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
        "C:/Windows/Fonts/arial.ttf",
        "C:/Windows/Fonts/arialbd.ttf",
    ]

    for path in font_paths:
        if os.path.exists(path):
            try:
                return ImageFont.truetype(path, size)
            except OSError:
                # Fix: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; truetype raises OSError
                # for unreadable/corrupt font files.
                continue

    return ImageFont.load_default()
160
+
161
def add_translated_text(image: Image.Image, detections: list, font_size: int = 14) -> Image.Image:
    """Overlay translated text, centered in each detected bbox, onto *image*.

    Improvement over the original: text is word-wrapped to roughly fit
    the box width (previously one long line could overflow a narrow
    bubble). Text is drawn white with a 1px black outline for
    readability on any background.
    """
    import textwrap

    result = image.copy()
    draw = ImageDraw.Draw(result)

    for det in detections:
        bbox = det.get('bbox', [])
        translated = det.get('translated', '')
        if len(bbox) != 4 or not translated:
            continue

        x1, y1, x2, y2 = bbox
        box_width = x2 - x1
        box_height = y2 - y1

        # Heuristic font size from box dimensions, clamped to [10, 32].
        estimated_size = min(box_height // 2, box_width // max(len(translated), 1) * 2)
        estimated_size = max(10, min(estimated_size, 32))
        font = get_font(estimated_size)

        # Wrap so each line roughly fits the box width
        # (average glyph width assumed ~0.6em -- TODO confirm per font).
        chars_per_line = max(1, int(box_width / (estimated_size * 0.6)))
        wrapped = "\n".join(textwrap.wrap(translated, width=chars_per_line)) or translated

        # Measure the wrapped block and center it in the bbox.
        text_bbox = draw.multiline_textbbox((0, 0), wrapped, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        text_x = x1 + (box_width - text_width) // 2
        text_y = y1 + (box_height - text_height) // 2

        # 1px outline in all 8 directions, then the white fill on top.
        for dx in (-1, 0, 1):
            for dy in (-1, 0, 1):
                if dx or dy:
                    draw.multiline_text((text_x + dx, text_y + dy), wrapped, font=font, fill="black")
        draw.multiline_text((text_x, text_y), wrapped, font=font, fill="white")

    return result
203
+
204
def draw_detections(image: Image.Image, detections: list) -> Image.Image:
    """Return a copy of *image* with one colored box + label per detection.

    Labels show the detection index plus the first 20 characters of the
    original and translated text.
    """
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    font = get_font(12)

    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7", "#DDA0DD", "#98D8C8"]

    for idx, det in enumerate(detections):
        bbox = det.get('bbox', [])
        if len(bbox) != 4:
            continue
        original = det.get('original', '')[:20]
        translated = det.get('translated', '')[:20]
        x1, y1, x2, y2 = bbox
        color = colors[idx % len(colors)]

        draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
        # Label sits just above the box's top-left corner.
        draw.text((x1, y1 - 15), f"{idx+1}: {original} → {translated}", font=font, fill=color)

    return annotated
229
+
230
def translate_manga(image, source_lang, target_lang, show_boxes, apply_inpaint, progress=gr.Progress()):
    """Full pipeline: detect + translate, visualize, inpaint, typeset.

    Returns a (left_image, result_image, text) triple for the three UI
    outputs; on failure the text slot carries the error message.
    """
    if image is None:
        return None, None, "Please upload an image"

    # Gradio may hand us a numpy array; normalize to PIL.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # Step 1: detection + translation via GLM-4.6V.
    progress(0.1, desc="Detecting text with GLM-4.6V...")
    detections, status = detect_and_translate(image, source_lang, target_lang, progress)

    if detections is None:
        # Hard failure (missing API key / request error) -- surface status.
        return None, None, status
    if not detections:
        return image, image, "No text detected in the image"

    # Step 2: visualization overlay with bounding boxes.
    progress(0.5, desc="Creating visualization...")
    viz_image = draw_detections(image, detections)

    # Step 3: optionally erase the original text before typesetting.
    if apply_inpaint:
        progress(0.6, desc="Creating mask...")
        mask = create_text_mask(image, detections)

        progress(0.7, desc="Inpainting (removing original text)...")
        cleaned = lama_inpaint(image, mask)

        progress(0.9, desc="Adding translated text...")
        result = add_translated_text(cleaned, detections)
    else:
        result = add_translated_text(image, detections)

    # Pretty-printed JSON of the detections for the textbox output.
    det_text = json.dumps(detections, indent=2, ensure_ascii=False)

    progress(1.0, desc="Done!")

    return (viz_image if show_boxes else image), result, det_text
275
+
276
# Language options
# Offered in both the source and target dropdowns; the selected names are
# interpolated verbatim into the GLM prompt, so keep them as plain English
# language names.
LANGUAGES = [
    "Japanese",
    "Korean",
    "Chinese (Simplified)",
    "Chinese (Traditional)",
    "English",
    "Spanish",
    "Portuguese",
    "French",
    "German",
    "Italian",
    "Russian",
    "Thai",
    "Vietnamese",
    "Indonesian",
    "Arabic"
]
294
+
295
# CSS
# Custom styles passed to gr.Blocks: cap the app width and render the
# gradient page-header banner used by the gr.HTML header below.
css = """
.gradio-container {
max-width: 1200px !important;
}
.header {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 10px;
margin-bottom: 20px;
}
.header h1 {
color: white;
margin: 0;
}
.header p {
color: rgba(255,255,255,0.9);
margin: 5px 0 0 0;
}
"""
316
+
317
# Build UI
# Two-column layout: inputs (image + language/processing options) on the
# left, outputs (detection overlay, final page, raw JSON) on the right.
with gr.Blocks(title="BubbleScribe", css=css) as demo:
    # Header banner (styled by the .header CSS rules above).
    gr.HTML("""
<div class="header">
<h1>✍️ BubbleScribe</h1>
<p>AI-powered manga & comic translator using GLM-4.6V</p>
</div>
""")

    with gr.Row():
        with gr.Column(scale=1):
            # Input side: uploaded page plus translation settings.
            input_image = gr.Image(label="📤 Upload Manga Page", type="pil")

            with gr.Row():
                source_lang = gr.Dropdown(
                    choices=LANGUAGES,
                    value="Japanese",
                    label="Source Language"
                )
                target_lang = gr.Dropdown(
                    choices=LANGUAGES,
                    value="English",
                    label="Target Language"
                )

            with gr.Row():
                # Toggles map to translate_manga's show_boxes/apply_inpaint.
                show_boxes = gr.Checkbox(label="Show detection boxes", value=True)
                apply_inpaint = gr.Checkbox(label="Apply inpainting", value=True)

            translate_btn = gr.Button("🔄 Translate", variant="primary", size="lg")

        with gr.Column(scale=2):
            # Output side: detection overlay, translated page, and the
            # raw detections JSON for inspection.
            with gr.Row():
                detection_output = gr.Image(label="🔍 Detected Text Regions")
                result_output = gr.Image(label="✨ Translated Result")

            detections_json = gr.Textbox(
                label="📋 Detected Text (JSON)",
                lines=10,
                max_lines=20
            )

    # Usage notes rendered below the main layout.
    gr.Markdown("""
### 💡 How to Use
1. Upload a manga or comic page
2. Select source and target languages
3. Click "Translate" to process
4. View detected regions and translated result

### ⚠️ Notes
- Works best with clear, high-contrast text
- Speech bubbles are detected more reliably than sound effects
- First run may take longer (model loading)

### 🔧 Powered By
- **GLM-4.6V-Flash** - Text detection & translation (Z.ai API)
- **LaMa** - Text removal inpainting
""")

    # Footer with model and author links.
    gr.HTML("""
<div style="text-align: center; margin-top: 20px; padding: 10px; background: rgba(0,0,0,0.05); border-radius: 8px;">
<strong>Model:</strong> <a href="https://huggingface.co/zai-org/GLM-4.6V" target="_blank">zai-org/GLM-4.6V</a> •
<strong>Created by:</strong> <a href="https://huggingface.co/lulavc" target="_blank">@lulavc</a>
</div>
""")

    # Event handler
    # Wire the button to the main pipeline; outputs fill the three
    # components declared above.
    translate_btn.click(
        fn=translate_manga,
        inputs=[input_image, source_lang, target_lang, show_boxes, apply_inpaint],
        outputs=[detection_output, result_output, detections_json]
    )
389
+
390
if __name__ == "__main__":
    # Launch the Gradio server (Spaces runs this module as a script).
    demo.launch()