iammraat committed on
Commit
ac5cce0
Β·
verified Β·
1 Parent(s): 8d71bda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +290 -274
app.py CHANGED
@@ -1,317 +1,333 @@
1
  import gradio as gr
2
- from transformers import AutoModel, AutoTokenizer
3
  import torch
4
- import spaces
5
- import os
6
- import sys
7
- import tempfile
8
- import shutil
9
- from PIL import Image, ImageDraw, ImageFont, ImageOps
10
- import fitz
11
- import re
12
  import numpy as np
13
- import base64
14
- from io import StringIO, BytesIO
 
 
 
15
 
16
# Hugging Face checkpoint used for OCR inference.
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'

# Load tokenizer + model once at import time; inference runs on GPU in bf16
# with flash-attention 2 enabled.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
model = model.eval().cuda()

# Inference resolution: 1024 base canvas, 768 patch size, dynamic cropping on.
BASE_SIZE = 1024
IMAGE_SIZE = 768
CROP_MODE = True

# Prompt template per UI task; `has_grounding` marks prompts whose output
# contains <|ref|>/<|det|> bounding-box markup.
TASK_PROMPTS = {
    "πŸ“‹ Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
    "πŸ“ Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
    "πŸ“ Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
    "πŸ” Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
    "✏️ Custom": {"prompt": "", "has_grounding": False}
}
 
 
 
 
 
 
 
33
 
34
def extract_grounding_references(text):
    """Return all grounding tuples (full_match, label, coords_literal) found in text."""
    ref_pattern = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        re.DOTALL,
    )
    return ref_pattern.findall(text)
37
 
38
def draw_bounding_boxes(image, refs, extract_images=False):
    """
    Draw the model's grounding boxes onto a copy of `image`.

    refs: tuples (full_match, label, coords_literal) as produced by
    extract_grounding_references(); coordinates are normalized to a 0-999 grid.
    Returns (annotated image, crops) where crops holds the sub-images of
    regions labeled 'image' when extract_images is True.
    """
    import ast

    img_w, img_h = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 15)
    crops = []

    # Deterministic per-label colors (seeded RNG).
    color_map = {}
    np.random.seed(42)

    for ref in refs:
        label = ref[1]
        if label not in color_map:
            color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))

        color = color_map[label]
        # SECURITY FIX: the coordinate literal comes straight from model
        # output — parse it with ast.literal_eval instead of eval() so no
        # arbitrary code can execute.
        coords = ast.literal_eval(ref[2])
        color_a = color + (60,)

        for box in coords:
            # Scale 0-999 normalized coordinates to pixel space.
            x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)

            if extract_images and label == 'image':
                crops.append(image.crop((x1, y1, x2, y2)))

            width = 5 if label == 'title' else 3
            draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            draw2.rectangle([x1, y1, x2, y2], fill=color_a)  # translucent fill

            # Label tag above the box, clamped so it never leaves the canvas.
            text_bbox = draw.textbbox((0, 0), label, font=font)
            tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            ty = max(0, y1 - 20)
            draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))

    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw, crops
 
77
 
78
def clean_output(text, include_images=False):
    """
    Strip grounding markup from the model's raw output.

    Image references become "**[Figure N]**" placeholders when
    include_images is True (and are silently dropped otherwise); every other
    grounded reference removes its entire line. Also normalizes a couple of
    LaTeX aliases.
    """
    if not text:
        return ""

    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    figure_count = 0

    for match in re.findall(pattern, text, re.DOTALL):
        full_ref = match[0]
        if '<|ref|>image<|/ref|>' in full_ref:
            if include_images:
                figure_count += 1
                text = text.replace(full_ref, f'\n\n**[Figure {figure_count}]**\n\n', 1)
            else:
                text = text.replace(full_ref, '', 1)
        else:
            # Drop the whole line that contained this grounded reference.
            text = re.sub(rf'(?m)^[^\n]*{re.escape(full_ref)}[^\n]*\n?', '', text)

    text = text.replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')
    return text.strip()
98
-
99
def embed_images(markdown, crops):
    """Replace "**[Figure N]**" placeholders with base64-inlined PNG images."""
    if not crops:
        return markdown
    for i, img in enumerate(crops):
        buf = BytesIO()
        img.save(buf, format="PNG")
        encoded = base64.b64encode(buf.getvalue()).decode()
        placeholder = f'**[Figure {i + 1}]**'
        replacement = f'\n\n![Figure {i + 1}](data:image/png;base64,{encoded})\n\n'
        markdown = markdown.replace(placeholder, replacement, 1)
    return markdown
108
-
109
@spaces.GPU(duration=90)
def process_image(image, task, custom_prompt):
    """
    Run DeepSeek-OCR on a single PIL image.

    Returns a 5-tuple: (cleaned text, markdown with embedded figures,
    raw model output, annotated image or None, list of figure crops).
    """
    if image is None:
        return "Error: Upload an image", "", "", None, []
    if task in ["✏️ Custom", "πŸ“ Locate"] and not custom_prompt.strip():
        return "Please enter a prompt", "", "", None, []

    # Normalize to RGB and honor EXIF orientation before inference.
    if image.mode in ('RGBA', 'LA', 'P'):
        image = image.convert('RGB')
    image = ImageOps.exif_transpose(image)

    # Build the prompt for the selected task.
    if task == "✏️ Custom":
        prompt = f"<image>\n{custom_prompt.strip()}"
        has_grounding = '<|grounding|>' in custom_prompt
    elif task == "πŸ“ Locate":
        prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]

    # model.infer() reads from a file path, so stage the image on disk.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    image.save(tmp.name, 'JPEG', quality=95)
    tmp.close()
    out_dir = tempfile.mkdtemp()

    # The model prints its result to stdout; capture it.
    # BUG FIX: wrap in try/finally so stdout is always restored and the temp
    # file/dir are always cleaned up, even if model.infer() raises — before,
    # an exception left sys.stdout hijacked for the rest of the process.
    stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        model.infer(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=tmp.name,
            output_path=out_dir,
            base_size=BASE_SIZE,
            image_size=IMAGE_SIZE,
            crop_mode=CROP_MODE,
            save_results=False
        )

        # Strip the model's debug/progress chatter, keep only output lines.
        debug_filters = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
        result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
                            if l.strip() and not any(s in l for s in debug_filters)]).strip()
    finally:
        sys.stdout = stdout
        os.unlink(tmp.name)
        shutil.rmtree(out_dir, ignore_errors=True)

    if not result:
        return "No text detected", "", "", None, []

    cleaned = clean_output(result, False)
    markdown = clean_output(result, True)

    img_out = None
    crops = []

    # Annotate boxes / collect figure crops only when grounding was requested
    # and the output actually contains references.
    if has_grounding and '<|ref|>' in result:
        refs = extract_grounding_references(result)
        if refs:
            img_out, crops = draw_bounding_boxes(image, refs, True)

    markdown = embed_images(markdown, crops)

    return cleaned, markdown, result, img_out, crops
 
 
 
 
 
174
 
175
@spaces.GPU(duration=90)
def process_pdf(path, task, custom_prompt, page_num):
    """Render one page of a PDF at 300 DPI and OCR it like a normal image."""
    doc = fitz.open(path)
    total_pages = len(doc)
    if not (1 <= page_num <= total_pages):
        doc.close()
        return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
    # 300/72 scales the default 72-DPI page up to 300 DPI.
    pix = doc.load_page(page_num - 1).get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
    png_bytes = pix.tobytes("png")
    doc.close()
    return process_image(Image.open(BytesIO(png_bytes)), task, custom_prompt)
188
 
189
def process_file(path, task, custom_prompt, page_num):
    """Dispatch an uploaded file to the PDF or plain-image pipeline."""
    if not path:
        return "Error: Upload a file", "", "", None, []
    is_pdf = path.lower().endswith('.pdf')
    if is_pdf:
        return process_pdf(path, task, custom_prompt, page_num)
    return process_image(Image.open(path), task, custom_prompt)
196
 
197
def toggle_prompt(task):
    """Show (and relabel) the prompt textbox only for tasks that need input."""
    prompt_configs = {
        "✏️ Custom": dict(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes"),
        "πŸ“ Locate": dict(visible=True, label="Text to Locate", placeholder="Enter text to locate"),
    }
    cfg = prompt_configs.get(task)
    if cfg is None:
        return gr.update(visible=False)
    return gr.update(**cfg)
 
 
203
 
204
def select_boxes(task):
    """Jump to the Boxes tab for Locate tasks; otherwise leave the tabs alone."""
    return gr.update(selected="tab_boxes") if task == "πŸ“ Locate" else gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
def get_pdf_page_count(file_path):
    """Number of pages in a PDF; 1 for anything that is not a PDF path."""
    is_pdf = bool(file_path) and file_path.lower().endswith('.pdf')
    if not is_pdf:
        return 1
    doc = fitz.open(file_path)
    page_count = len(doc)
    doc.close()
    return page_count
 
 
 
 
 
 
 
 
 
216
 
217
def load_image(file_path, page_num=1):
    """Load an image from disk; for PDFs, render the requested page at 300 DPI."""
    if not file_path:
        return None
    if not file_path.lower().endswith('.pdf'):
        return Image.open(file_path)
    doc = fitz.open(file_path)
    # Clamp the 1-based page number into the document's valid range.
    page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
    pix = doc.load_page(page_idx).get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
    png_bytes = pix.tobytes("png")
    doc.close()
    return Image.open(BytesIO(png_bytes))
230
 
231
def update_page_selector(file_path):
    """Show the page-number input only when the uploaded file is a PDF."""
    if file_path and file_path.lower().endswith('.pdf'):
        page_count = get_pdf_page_count(file_path)
        return gr.update(visible=True, maximum=page_count, value=1, minimum=1,
                         label=f"Select Page (1-{page_count})")
    return gr.update(visible=False)
239
 
240
# BUG FIX: the theme was previously passed to demo.launch(theme=...), but
# launch() has no `theme` parameter — it belongs on gr.Blocks().
with gr.Blocks(title="DeepSeek-OCR-2", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸš€ DeepSeek-OCR-2 Demo
    **Convert documents to markdown, extract text, parse figures, and locate specific content with bounding boxes.**
    **It's powered by DeepEncoder v2. It achieves 91.09% on OmniDocBench (+3.73% over v1).**

    **Hope this tool was helpful! If so, a quick like ❀️ would mean a lot :)**
    """)

    with gr.Row():
        # Left column: inputs and task selection.
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
            input_img = gr.Image(label="Input Image", type="pil", height=300)
            page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="πŸ“‹ Markdown", label="Task")
            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
            btn = gr.Button("Extract", variant="primary", size="lg")

        # Right column: result tabs.
        with gr.Column(scale=2):
            with gr.Tabs() as tabs:
                with gr.Tab("Text", id="tab_text"):
                    text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
                with gr.Tab("Markdown Preview", id="tab_markdown"):
                    md_out = gr.Markdown("")
                with gr.Tab("Boxes", id="tab_boxes"):
                    img_out = gr.Image(type="pil", height=500, show_label=False)
                with gr.Tab("Cropped Images", id="tab_crops"):
                    gallery = gr.Gallery(show_label=False, columns=3, height=400)
                with gr.Tab("Raw Text", id="tab_raw"):
                    raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)

    gr.Examples(
        examples=[
            ["examples/ocr.jpg", "πŸ“‹ Markdown", ""],
            ["examples/reachy-mini.jpg", "πŸ“ Locate", "Robot"]
        ],
        inputs=[input_img, task, prompt],
        cache_examples=False
    )

    with gr.Accordion("ℹ️ Info", open=False):
        gr.Markdown("""
        ### Configuration
        1024 base + 768 patches with dynamic cropping (2-6 patches). 144 tokens per patch + 256 base tokens.

        ### Tasks
        - **Markdown**: Convert document to structured markdown with layout detection (grounding βœ…)
        - **Free OCR**: Simple text extraction without layout
        - **Locate**: Find and highlight specific text/elements in image (grounding βœ…)
        - **Describe**: General image description
        - **Custom**: Your own prompt

        ### Special Tokens
        - `<image>` - Placeholder where visual tokens (256-1120 size) are inserted
        - `<|grounding|>` - Enables layout detection with bounding boxes
        - `<|ref|>text<|/ref|>` - Reference text to locate in the image

        """)

    # Event wiring: keep the preview image and page selector in sync with
    # the uploaded file, and swap the prompt box per task.
    file_in.change(load_image, [file_in, page_selector], [input_img])
    file_in.change(update_page_selector, [file_in], [page_selector])
    page_selector.change(load_image, [file_in, page_selector], [input_img])
    task.change(toggle_prompt, [task], [prompt])
    task.change(select_boxes, [task], [tabs])

    def run(image, file_path, task, custom_prompt, page_num):
        """Prefer the uploaded file path; fall back to the raw image widget."""
        if file_path:
            return process_file(file_path, task, custom_prompt, int(page_num))
        if image is not None:
            return process_image(image, task, custom_prompt)
        return "Error: Upload a file or image", "", "", None, []

    submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
                             [text_out, md_out, raw_out, img_out, gallery])
    submit_event.then(select_boxes, [task], [tabs])

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
 
1
  import gradio as gr
 
2
  import torch
 
 
 
 
 
 
 
 
3
  import numpy as np
4
+ import cv2
5
+ from PIL import Image, ImageOps
6
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
7
+ from paddleocr import PaddleOCR
8
+ from scipy.signal import find_peaks
9
 
10
# ==========================================
# βš™οΈ CONFIGURATION & MODEL LOADING
# ==========================================
print("--- SYSTEM STARTUP ---")

# This Space runs on CPU-only hardware, so pin everything to the CPU.
DEVICE = "cpu"
print(f"-> Hardware Device: {DEVICE}")

# Recognition model: TrOCR handwritten checkpoint, loaded once at startup.
print("-> Loading TrOCR Model...")
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(DEVICE).eval()

# Detection model: PaddleOCR, tuned for recall — catch everything now and
# filter geometrically later in the pipeline.
print("-> Loading PaddleOCR Detector...")
detector = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    show_log=False,
    use_gpu=False,
    det_limit_side_len=2500,   # high resolution limit so small text survives
    det_db_thresh=0.1,         # low pixel threshold to catch faint ink
    det_db_box_thresh=0.3,
    det_db_unclip_ratio=1.6
)
print("--- SYSTEMS READY ---")
39
 
40
+ # ==========================================
41
+ # 🧠 CORE LOGIC: GEOMETRY UTILS
42
+ # ==========================================
43
 
44
def calculate_iou_containment(box1, box2):
    """Fraction of box1's area that lies inside box2 (0.0 when disjoint)."""
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])

    # No overlap at all.
    if ix2 < ix1 or iy2 < iy1:
        return 0.0

    intersection = (ix2 - ix1) * (iy2 - iy1)
    # Epsilon guards against division by zero for degenerate boxes.
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) + 1e-6
    return intersection / area1
58
 
59
def get_vertical_overlap_ratio(box1, box2):
    """
    Vertical overlap between two boxes, as a fraction of the shorter box's
    height. Used to decide whether two words sit on the same text line.
    """
    top = max(box1[1], box2[1])
    bottom = min(box1[3], box2[3])

    # Vertically disjoint boxes share no line.
    if bottom < top:
        return 0.0

    shorter_height = min(box1[3] - box1[1], box2[3] - box2[1]) + 1e-6
    return (bottom - top) / shorter_height
77
+
78
def filter_nested_boxes(boxes, containment_thresh=0.9):
    """
    Drop small noise boxes that are almost entirely contained inside a
    larger kept box. Returns the survivors, largest first.
    """
    if not boxes:
        return []

    # Largest-area first, so big "real" boxes are kept before their noise.
    by_area = sorted(
        (list(b) for b in boxes),
        key=lambda b: (b[2] - b[0]) * (b[3] - b[1]),
        reverse=True,
    )

    kept = []
    for candidate in by_area:
        nested = any(
            calculate_iou_containment(candidate, existing) > containment_thresh
            for existing in kept
        )
        if not nested:
            kept.append(candidate)

    return kept
107
+
108
+ # ==========================================
109
+ # πŸ”¬ SCIENTIFIC LOGIC: PROJECTION PROFILES
110
+ # ==========================================
111
+
112
def split_double_lines(crop_img, logs):
    """
    Detect whether a line crop accidentally contains TWO stacked text lines
    and, if so, split it at the emptiest row between them.

    Works on a horizontal projection profile (row-wise ink density) of the
    binarized crop. Returns a list of crops: either [crop_img] unchanged or
    [top_half, bottom_half].

    FIX: removed the dead nested `if len(peaks) >= 2` check — it could
    never be False after the `len(peaks) < 2` early return above it.
    """
    # 1. Binarize with Otsu so the threshold adapts to the crop's contrast.
    gray = cv2.cvtColor(crop_img, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # 2. Row-wise ink profile, normalized to [0, 1].
    h_proj = np.sum(thresh, axis=1)
    max_val = np.max(h_proj)
    if max_val == 0:
        return [crop_img]  # blank crop: nothing to split
    h_proj = h_proj / max_val

    # 3. Each text line shows up as a peak in the profile.
    peaks, _ = find_peaks(h_proj, height=0.2, distance=15)
    if len(peaks) < 2:
        return [crop_img]  # likely just one line

    # 4. Inspect the valley between the first two peaks. Split only if it is
    # nearly empty: a valley with > 30% of the peak's ink density may just be
    # descenders from a messy 'y' or 'g'.
    p1, p2 = peaks[0], peaks[1]
    valley_region = h_proj[p1:p2]
    if len(valley_region) == 0:
        return [crop_img]

    min_val = np.min(valley_region)
    if min_val < 0.3:
        min_idx = np.argmin(valley_region) + p1
        logs.append(f" -> βœ‚οΈ Refinement: Split double line at Y={min_idx}")
        top_crop = crop_img[0:min_idx, :]
        bot_crop = crop_img[min_idx:, :]
        return [top_crop, bot_crop]

    return [crop_img]
160
+
161
+ # ==========================================
162
+ # ⛓️ PIPELINE STEP: MERGING & ORDERING
163
+ # ==========================================
164
+
165
def smart_line_merger(raw_boxes, logs):
    """
    Group word-level detection boxes into full text lines.

    raw_boxes: iterable of 4-point polygons (PaddleOCR detection output —
    typically a numpy ndarray of shape (N, 4, 2)).
    Returns a top-to-bottom sorted list of merged [x1, y1, x2, y2] line boxes.

    BUG FIX: `if not raw_boxes` raised "truth value of an array ... is
    ambiguous" whenever raw_boxes was a non-empty numpy ndarray, which is
    exactly what the Paddle detector returns. Use an explicit length check.
    """
    if len(raw_boxes) == 0:
        return []

    # 1. Convert each 4-point polygon to an axis-aligned [x1, y1, x2, y2] rect.
    rects = []
    for box in raw_boxes:
        box = np.array(box).astype(np.float32)
        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
        rects.append([x1, y1, x2, y2])

    rects = filter_nested_boxes(rects)
    logs.append(f"Valid Word Boxes: {len(rects)}")

    # 2. Sort by vertical center so line seeds are consumed top-down.
    rects.sort(key=lambda r: (r[1] + r[3]) / 2)

    lines = []
    while rects:
        # Seed a new line with the highest remaining box; every other box
        # that shares > 40% of its vertical span with the seed joins it.
        curr_line = [rects.pop(0)]
        remaining = []
        for r in rects:
            if get_vertical_overlap_ratio(curr_line[0], r) > 0.4:
                curr_line.append(r)
            else:
                remaining.append(r)
        rects = remaining

        # Order the words left-to-right, then merge into one bounding rect.
        curr_line.sort(key=lambda r: r[0])
        lx1 = min(r[0] for r in curr_line)
        ly1 = min(r[1] for r in curr_line)
        lx2 = max(r[2] for r in curr_line)
        ly2 = max(r[3] for r in curr_line)
        lines.append([lx1, ly1, lx2, ly2])

    # Final reading order: top to bottom.
    lines.sort(key=lambda r: r[1])
    return lines
221
+
222
+ # ==========================================
223
+ # πŸš€ MAIN EXECUTION
224
+ # ==========================================
225
+
226
def process_handwriting(image):
    """
    Full OCR pipeline for one handwritten page.

    Steps: PaddleOCR word detection -> geometric line merging -> projection
    profile refinement (double-line splitting) -> TrOCR recognition per line.

    Returns (annotated image, list of line crops, transcribed text, log text).
    """
    logs = ["--- STARTING PIPELINE ---"]

    if image is None:
        return None, [], "Please upload an image.", "Error"

    # Work on an RGB numpy copy throughout the pipeline.
    orig_np = np.array(image.convert("RGB"))

    # Word-level detection (detector only — recognition is handled by TrOCR).
    try:
        dt_boxes, _ = detector.text_detector(orig_np)
        if dt_boxes is None:
            dt_boxes = []
    except Exception as e:
        return image, [], f"Detector Failed: {e}", "\n".join(logs)

    if len(dt_boxes) == 0:
        return image, [], "No text detected.", "Logs end."

    # Merge detected words into full text lines.
    line_boxes = smart_line_merger(dt_boxes, logs)
    logs.append(f"Merged into {len(line_boxes)} lines.")

    annotated_img = orig_np.copy()
    final_text_lines = []
    gallery_crops = []

    PAD = 8  # a little context around each crop helps TrOCR
    h_img, w_img, _ = orig_np.shape

    for i, box in enumerate(line_boxes):
        x1, y1, x2, y2 = map(int, box)

        # Pad the crop, clamped to the image bounds.
        x1, y1 = max(0, x1 - PAD), max(0, y1 - PAD)
        x2, y2 = min(w_img, x2 + PAD), min(h_img, y2 + PAD)
        line_crop = orig_np[y1:y2, x1:x2]

        # Refinement: split crops that accidentally span two text lines.
        for sub_crop in split_double_lines(line_crop, logs):
            if sub_crop.shape[0] < 10 or sub_crop.shape[1] < 10:
                continue  # too small to hold readable text

            pil_crop = Image.fromarray(sub_crop)
            gallery_crops.append(pil_crop)

            # TrOCR recognition on the single line.
            with torch.no_grad():
                pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(DEVICE)
                generated_ids = model.generate(pixel_values)
                text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            if text.strip():
                final_text_lines.append(text)

        # Visualization: draw the padded merged line box and its index.
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 200, 0), 2)
        cv2.putText(annotated_img, str(i + 1), (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 200, 0), 1)

    full_text = "\n".join(final_text_lines)
    logs.append("--- PROCESSING COMPLETE ---")

    return Image.fromarray(annotated_img), gallery_crops, full_text, "\n".join(logs)
 
 
 
 
 
 
 
 
 
296
 
 
 
 
 
 
 
 
 
297
 
298
# ==========================================
# πŸ–₯️ GRADIO INTERFACE
# ==========================================
css = """
#gallery { height: 300px; overflow-y: scroll; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("## πŸ“ Scientific Handwriting OCR (Line-Level Refinement)")
    gr.Markdown("Uses PaddleOCR for detection, Geometry for merging, Projection Profiles for refinement, and TrOCR for reading.")

    with gr.Row():
        # Left column: upload + trigger.
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Input Document")
            run_btn = gr.Button("Analyze & Transcribe", variant="primary")

        # Right column: results grouped into tabs.
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Transcribed Text"):
                    output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
                with gr.Tab("Segmentation Map"):
                    output_img = gr.Image(label="Line Detection Map")
                with gr.Tab("System Logs"):
                    log_output = gr.Textbox(label="Process Logs", lines=15)

    gr.Markdown("### Line Segments (Input for TrOCR)")
    gallery = gr.Gallery(label="Refined Crops", columns=4, elem_id="gallery")

    # Wire the button to the full OCR pipeline.
    run_btn.click(
        process_handwriting,
        input_img,
        [output_img, gallery, output_txt, log_output]
    )

if __name__ == "__main__":
    demo.launch()