iammraat committed on
Commit
ac5cce0
Β·
verified Β·
1 Parent(s): 8d71bda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +290 -274
app.py CHANGED
@@ -1,317 +1,333 @@
1
  import gradio as gr
2
- from transformers import AutoModel, AutoTokenizer
3
  import torch
4
- import spaces
5
- import os
6
- import sys
7
- import tempfile
8
- import shutil
9
- from PIL import Image, ImageDraw, ImageFont, ImageOps
10
- import fitz
11
- import re
12
  import numpy as np
13
- import base64
14
- from io import StringIO, BytesIO
 
 
 
15
 
16
# Hugging Face checkpoint used for OCR inference.
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'

# Load tokenizer + model once at import time; inference runs on GPU in bf16
# with flash-attention 2 enabled.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
model = model.eval().cuda()

# Inference resolution: 1024 base canvas, 768 patch size, dynamic cropping on.
BASE_SIZE = 1024
IMAGE_SIZE = 768
CROP_MODE = True

# Prompt template per UI task; `has_grounding` marks prompts whose output
# contains <|ref|>/<|det|> bounding-box markup.
TASK_PROMPTS = {
    "πŸ“‹ Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
    "πŸ“ Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
    "πŸ“ Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
    "πŸ” Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
    "✏️ Custom": {"prompt": "", "has_grounding": False}
}
 
 
 
 
 
 
 
33
 
34
def extract_grounding_references(text):
    """Return all grounding tuples (full_match, label, coords_literal) found in text."""
    ref_pattern = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        re.DOTALL,
    )
    return ref_pattern.findall(text)
37
 
38
def draw_bounding_boxes(image, refs, extract_images=False):
    """
    Draw the model's grounding boxes onto a copy of `image`.

    refs: tuples (full_match, label, coords_literal) as produced by
    extract_grounding_references(); coordinates are normalized to a 0-999 grid.
    Returns (annotated image, crops) where crops holds the sub-images of
    regions labeled 'image' when extract_images is True.
    """
    import ast

    img_w, img_h = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 15)
    crops = []

    # Deterministic per-label colors (seeded RNG).
    color_map = {}
    np.random.seed(42)

    for ref in refs:
        label = ref[1]
        if label not in color_map:
            color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))

        color = color_map[label]
        # SECURITY FIX: the coordinate literal comes straight from model
        # output — parse it with ast.literal_eval instead of eval() so no
        # arbitrary code can execute.
        coords = ast.literal_eval(ref[2])
        color_a = color + (60,)

        for box in coords:
            # Scale 0-999 normalized coordinates to pixel space.
            x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)

            if extract_images and label == 'image':
                crops.append(image.crop((x1, y1, x2, y2)))

            width = 5 if label == 'title' else 3
            draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            draw2.rectangle([x1, y1, x2, y2], fill=color_a)  # translucent fill

            # Label tag above the box, clamped so it never leaves the canvas.
            text_bbox = draw.textbbox((0, 0), label, font=font)
            tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            ty = max(0, y1 - 20)
            draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))

    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw, crops
 
77
 
78
def clean_output(text, include_images=False):
    """
    Strip grounding markup from the model's raw output.

    Image references become "**[Figure N]**" placeholders when
    include_images is True (and are silently dropped otherwise); every other
    grounded reference removes its entire line. Also normalizes a couple of
    LaTeX aliases.
    """
    if not text:
        return ""

    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    figure_count = 0

    for match in re.findall(pattern, text, re.DOTALL):
        full_ref = match[0]
        if '<|ref|>image<|/ref|>' in full_ref:
            if include_images:
                figure_count += 1
                text = text.replace(full_ref, f'\n\n**[Figure {figure_count}]**\n\n', 1)
            else:
                text = text.replace(full_ref, '', 1)
        else:
            # Drop the whole line that contained this grounded reference.
            text = re.sub(rf'(?m)^[^\n]*{re.escape(full_ref)}[^\n]*\n?', '', text)

    text = text.replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')
    return text.strip()
98
-
99
def embed_images(markdown, crops):
    """Replace "**[Figure N]**" placeholders with base64-inlined PNG images."""
    if not crops:
        return markdown
    for i, img in enumerate(crops):
        buf = BytesIO()
        img.save(buf, format="PNG")
        encoded = base64.b64encode(buf.getvalue()).decode()
        placeholder = f'**[Figure {i + 1}]**'
        replacement = f'\n\n![Figure {i + 1}](data:image/png;base64,{encoded})\n\n'
        markdown = markdown.replace(placeholder, replacement, 1)
    return markdown
108
-
109
@spaces.GPU(duration=90)
def process_image(image, task, custom_prompt):
    """
    Run DeepSeek-OCR on a single PIL image.

    Returns a 5-tuple: (cleaned text, markdown with embedded figures,
    raw model output, annotated image or None, list of figure crops).
    """
    if image is None:
        return "Error: Upload an image", "", "", None, []
    if task in ["✏️ Custom", "πŸ“ Locate"] and not custom_prompt.strip():
        return "Please enter a prompt", "", "", None, []

    # Normalize to RGB and honor EXIF orientation before inference.
    if image.mode in ('RGBA', 'LA', 'P'):
        image = image.convert('RGB')
    image = ImageOps.exif_transpose(image)

    # Build the prompt for the selected task.
    if task == "✏️ Custom":
        prompt = f"<image>\n{custom_prompt.strip()}"
        has_grounding = '<|grounding|>' in custom_prompt
    elif task == "πŸ“ Locate":
        prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]

    # model.infer() reads from a file path, so stage the image on disk.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    image.save(tmp.name, 'JPEG', quality=95)
    tmp.close()
    out_dir = tempfile.mkdtemp()

    # The model prints its result to stdout; capture it.
    # BUG FIX: wrap in try/finally so stdout is always restored and the temp
    # file/dir are always cleaned up, even if model.infer() raises — before,
    # an exception left sys.stdout hijacked for the rest of the process.
    stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        model.infer(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=tmp.name,
            output_path=out_dir,
            base_size=BASE_SIZE,
            image_size=IMAGE_SIZE,
            crop_mode=CROP_MODE,
            save_results=False
        )

        # Strip the model's debug/progress chatter, keep only output lines.
        debug_filters = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
        result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
                            if l.strip() and not any(s in l for s in debug_filters)]).strip()
    finally:
        sys.stdout = stdout
        os.unlink(tmp.name)
        shutil.rmtree(out_dir, ignore_errors=True)

    if not result:
        return "No text detected", "", "", None, []

    cleaned = clean_output(result, False)
    markdown = clean_output(result, True)

    img_out = None
    crops = []

    # Annotate boxes / collect figure crops only when grounding was requested
    # and the output actually contains references.
    if has_grounding and '<|ref|>' in result:
        refs = extract_grounding_references(result)
        if refs:
            img_out, crops = draw_bounding_boxes(image, refs, True)

    markdown = embed_images(markdown, crops)

    return cleaned, markdown, result, img_out, crops
 
 
 
 
 
174
 
175
@spaces.GPU(duration=90)
def process_pdf(path, task, custom_prompt, page_num):
    """Render one page of a PDF at 300 DPI and OCR it like a normal image."""
    doc = fitz.open(path)
    total_pages = len(doc)
    if not (1 <= page_num <= total_pages):
        doc.close()
        return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
    # 300/72 scales the default 72-DPI page up to 300 DPI.
    pix = doc.load_page(page_num - 1).get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
    png_bytes = pix.tobytes("png")
    doc.close()
    return process_image(Image.open(BytesIO(png_bytes)), task, custom_prompt)
188
 
189
def process_file(path, task, custom_prompt, page_num):
    """Dispatch an uploaded file to the PDF or plain-image pipeline."""
    if not path:
        return "Error: Upload a file", "", "", None, []
    is_pdf = path.lower().endswith('.pdf')
    if is_pdf:
        return process_pdf(path, task, custom_prompt, page_num)
    return process_image(Image.open(path), task, custom_prompt)
196
 
197
def toggle_prompt(task):
    """Show (and relabel) the prompt textbox only for tasks that need input."""
    prompt_configs = {
        "✏️ Custom": dict(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes"),
        "πŸ“ Locate": dict(visible=True, label="Text to Locate", placeholder="Enter text to locate"),
    }
    cfg = prompt_configs.get(task)
    if cfg is None:
        return gr.update(visible=False)
    return gr.update(**cfg)
 
 
203
 
204
def select_boxes(task):
    """Jump to the Boxes tab for Locate tasks; otherwise leave the tabs alone."""
    return gr.update(selected="tab_boxes") if task == "πŸ“ Locate" else gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
def get_pdf_page_count(file_path):
    """Number of pages in a PDF; 1 for anything that is not a PDF path."""
    is_pdf = bool(file_path) and file_path.lower().endswith('.pdf')
    if not is_pdf:
        return 1
    doc = fitz.open(file_path)
    page_count = len(doc)
    doc.close()
    return page_count
 
 
 
 
 
 
 
 
 
216
 
217
def load_image(file_path, page_num=1):
    """Load an image from disk; for PDFs, render the requested page at 300 DPI."""
    if not file_path:
        return None
    if not file_path.lower().endswith('.pdf'):
        return Image.open(file_path)
    doc = fitz.open(file_path)
    # Clamp the 1-based page number into the document's valid range.
    page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
    pix = doc.load_page(page_idx).get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
    png_bytes = pix.tobytes("png")
    doc.close()
    return Image.open(BytesIO(png_bytes))
230
 
231
def update_page_selector(file_path):
    """Show the page-number input only when the uploaded file is a PDF."""
    if file_path and file_path.lower().endswith('.pdf'):
        page_count = get_pdf_page_count(file_path)
        return gr.update(visible=True, maximum=page_count, value=1, minimum=1,
                         label=f"Select Page (1-{page_count})")
    return gr.update(visible=False)
239
 
240
# BUG FIX: the theme was previously passed to demo.launch(theme=...), but
# launch() has no `theme` parameter — it belongs on gr.Blocks().
with gr.Blocks(title="DeepSeek-OCR-2", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸš€ DeepSeek-OCR-2 Demo
    **Convert documents to markdown, extract text, parse figures, and locate specific content with bounding boxes.**
    **It's powered by DeepEncoder v2. It achieves 91.09% on OmniDocBench (+3.73% over v1).**

    **Hope this tool was helpful! If so, a quick like ❀️ would mean a lot :)**
    """)

    with gr.Row():
        # Left column: inputs and task selection.
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
            input_img = gr.Image(label="Input Image", type="pil", height=300)
            page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="πŸ“‹ Markdown", label="Task")
            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
            btn = gr.Button("Extract", variant="primary", size="lg")

        # Right column: result tabs.
        with gr.Column(scale=2):
            with gr.Tabs() as tabs:
                with gr.Tab("Text", id="tab_text"):
                    text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
                with gr.Tab("Markdown Preview", id="tab_markdown"):
                    md_out = gr.Markdown("")
                with gr.Tab("Boxes", id="tab_boxes"):
                    img_out = gr.Image(type="pil", height=500, show_label=False)
                with gr.Tab("Cropped Images", id="tab_crops"):
                    gallery = gr.Gallery(show_label=False, columns=3, height=400)
                with gr.Tab("Raw Text", id="tab_raw"):
                    raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)

    gr.Examples(
        examples=[
            ["examples/ocr.jpg", "πŸ“‹ Markdown", ""],
            ["examples/reachy-mini.jpg", "πŸ“ Locate", "Robot"]
        ],
        inputs=[input_img, task, prompt],
        cache_examples=False
    )

    with gr.Accordion("ℹ️ Info", open=False):
        gr.Markdown("""
        ### Configuration
        1024 base + 768 patches with dynamic cropping (2-6 patches). 144 tokens per patch + 256 base tokens.

        ### Tasks
        - **Markdown**: Convert document to structured markdown with layout detection (grounding βœ…)
        - **Free OCR**: Simple text extraction without layout
        - **Locate**: Find and highlight specific text/elements in image (grounding βœ…)
        - **Describe**: General image description
        - **Custom**: Your own prompt

        ### Special Tokens
        - `<image>` - Placeholder where visual tokens (256-1120 size) are inserted
        - `<|grounding|>` - Enables layout detection with bounding boxes
        - `<|ref|>text<|/ref|>` - Reference text to locate in the image

        """)

    # Event wiring: keep the preview image and page selector in sync with
    # the uploaded file, and swap the prompt box per task.
    file_in.change(load_image, [file_in, page_selector], [input_img])
    file_in.change(update_page_selector, [file_in], [page_selector])
    page_selector.change(load_image, [file_in, page_selector], [input_img])
    task.change(toggle_prompt, [task], [prompt])
    task.change(select_boxes, [task], [tabs])

    def run(image, file_path, task, custom_prompt, page_num):
        """Prefer the uploaded file path; fall back to the raw image widget."""
        if file_path:
            return process_file(file_path, task, custom_prompt, int(page_num))
        if image is not None:
            return process_image(image, task, custom_prompt)
        return "Error: Upload a file or image", "", "", None, []

    submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
                             [text_out, md_out, raw_out, img_out, gallery])
    submit_event.then(select_boxes, [task], [tabs])

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
 
1
  import gradio as gr
 
2
  import torch
 
 
 
 
 
 
 
 
3
  import numpy as np
4
+ import cv2
5
+ from PIL import Image, ImageOps
6
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
7
+ from paddleocr import PaddleOCR
8
+ from scipy.signal import find_peaks
9
 
10
# ==========================================
# βš™οΈ CONFIGURATION & MODEL LOADING
# ==========================================
print("--- SYSTEM STARTUP ---")

# This Space runs on CPU-only hardware, so pin everything to the CPU.
DEVICE = "cpu"
print(f"-> Hardware Device: {DEVICE}")

# Recognition model: TrOCR handwritten checkpoint, loaded once at startup.
print("-> Loading TrOCR Model...")
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(DEVICE).eval()

# Detection model: PaddleOCR, tuned for recall — catch everything now and
# filter geometrically later in the pipeline.
print("-> Loading PaddleOCR Detector...")
detector = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    show_log=False,
    use_gpu=False,
    det_limit_side_len=2500,   # high resolution limit so small text survives
    det_db_thresh=0.1,         # low pixel threshold to catch faint ink
    det_db_box_thresh=0.3,
    det_db_unclip_ratio=1.6
)
print("--- SYSTEMS READY ---")
39
 
40
+ # ==========================================
41
+ # 🧠 CORE LOGIC: GEOMETRY UTILS
42
+ # ==========================================
43
 
44
def calculate_iou_containment(box1, box2):
    """Fraction of box1's area that lies inside box2 (0.0 when disjoint)."""
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])

    # No overlap at all.
    if ix2 < ix1 or iy2 < iy1:
        return 0.0

    intersection = (ix2 - ix1) * (iy2 - iy1)
    # Epsilon guards against division by zero for degenerate boxes.
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) + 1e-6
    return intersection / area1
58
 
59
def get_vertical_overlap_ratio(box1, box2):
    """
    Vertical overlap between two boxes, as a fraction of the shorter box's
    height. Used to decide whether two words sit on the same text line.
    """
    top = max(box1[1], box2[1])
    bottom = min(box1[3], box2[3])

    # Vertically disjoint boxes share no line.
    if bottom < top:
        return 0.0

    shorter_height = min(box1[3] - box1[1], box2[3] - box2[1]) + 1e-6
    return (bottom - top) / shorter_height
77
+
78
def filter_nested_boxes(boxes, containment_thresh=0.9):
    """
    Drop small noise boxes that are almost entirely contained inside a
    larger kept box. Returns the survivors, largest first.
    """
    if not boxes:
        return []

    # Largest-area first, so big "real" boxes are kept before their noise.
    by_area = sorted(
        (list(b) for b in boxes),
        key=lambda b: (b[2] - b[0]) * (b[3] - b[1]),
        reverse=True,
    )

    kept = []
    for candidate in by_area:
        nested = any(
            calculate_iou_containment(candidate, existing) > containment_thresh
            for existing in kept
        )
        if not nested:
            kept.append(candidate)

    return kept
107
+
108
+ # ==========================================
109
+ # πŸ”¬ SCIENTIFIC LOGIC: PROJECTION PROFILES
110
+ # ==========================================
111
+
112
def split_double_lines(crop_img, logs):
    """
    Detect whether a line crop accidentally contains TWO stacked text lines
    and, if so, split it at the emptiest row between them.

    Works on a horizontal projection profile (row-wise ink density) of the
    binarized crop. Returns a list of crops: either [crop_img] unchanged or
    [top_half, bottom_half].

    FIX: removed the dead nested `if len(peaks) >= 2` check — it could
    never be False after the `len(peaks) < 2` early return above it.
    """
    # 1. Binarize with Otsu so the threshold adapts to the crop's contrast.
    gray = cv2.cvtColor(crop_img, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # 2. Row-wise ink profile, normalized to [0, 1].
    h_proj = np.sum(thresh, axis=1)
    max_val = np.max(h_proj)
    if max_val == 0:
        return [crop_img]  # blank crop: nothing to split
    h_proj = h_proj / max_val

    # 3. Each text line shows up as a peak in the profile.
    peaks, _ = find_peaks(h_proj, height=0.2, distance=15)
    if len(peaks) < 2:
        return [crop_img]  # likely just one line

    # 4. Inspect the valley between the first two peaks. Split only if it is
    # nearly empty: a valley with > 30% of the peak's ink density may just be
    # descenders from a messy 'y' or 'g'.
    p1, p2 = peaks[0], peaks[1]
    valley_region = h_proj[p1:p2]
    if len(valley_region) == 0:
        return [crop_img]

    min_val = np.min(valley_region)
    if min_val < 0.3:
        min_idx = np.argmin(valley_region) + p1
        logs.append(f" -> βœ‚οΈ Refinement: Split double line at Y={min_idx}")
        top_crop = crop_img[0:min_idx, :]
        bot_crop = crop_img[min_idx:, :]
        return [top_crop, bot_crop]

    return [crop_img]
160
+
161
+ # ==========================================
162
+ # ⛓️ PIPELINE STEP: MERGING & ORDERING
163
+ # ==========================================
164
+
165
def smart_line_merger(raw_boxes, logs):
    """
    Group word-level detection boxes into full text lines.

    raw_boxes: iterable of 4-point polygons (PaddleOCR detection output —
    typically a numpy ndarray of shape (N, 4, 2)).
    Returns a top-to-bottom sorted list of merged [x1, y1, x2, y2] line boxes.

    BUG FIX: `if not raw_boxes` raised "truth value of an array ... is
    ambiguous" whenever raw_boxes was a non-empty numpy ndarray, which is
    exactly what the Paddle detector returns. Use an explicit length check.
    """
    if len(raw_boxes) == 0:
        return []

    # 1. Convert each 4-point polygon to an axis-aligned [x1, y1, x2, y2] rect.
    rects = []
    for box in raw_boxes:
        box = np.array(box).astype(np.float32)
        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
        rects.append([x1, y1, x2, y2])

    rects = filter_nested_boxes(rects)
    logs.append(f"Valid Word Boxes: {len(rects)}")

    # 2. Sort by vertical center so line seeds are consumed top-down.
    rects.sort(key=lambda r: (r[1] + r[3]) / 2)

    lines = []
    while rects:
        # Seed a new line with the highest remaining box; every other box
        # that shares > 40% of its vertical span with the seed joins it.
        curr_line = [rects.pop(0)]
        remaining = []
        for r in rects:
            if get_vertical_overlap_ratio(curr_line[0], r) > 0.4:
                curr_line.append(r)
            else:
                remaining.append(r)
        rects = remaining

        # Order the words left-to-right, then merge into one bounding rect.
        curr_line.sort(key=lambda r: r[0])
        lx1 = min(r[0] for r in curr_line)
        ly1 = min(r[1] for r in curr_line)
        lx2 = max(r[2] for r in curr_line)
        ly2 = max(r[3] for r in curr_line)
        lines.append([lx1, ly1, lx2, ly2])

    # Final reading order: top to bottom.
    lines.sort(key=lambda r: r[1])
    return lines
221
+
222
+ # ==========================================
223
+ # πŸš€ MAIN EXECUTION
224
+ # ==========================================
225
+
226
def process_handwriting(image):
    """
    Full OCR pipeline for one handwritten page.

    Steps: PaddleOCR word detection -> geometric line merging -> projection
    profile refinement (double-line splitting) -> TrOCR recognition per line.

    Returns (annotated image, list of line crops, transcribed text, log text).
    """
    logs = ["--- STARTING PIPELINE ---"]

    if image is None:
        return None, [], "Please upload an image.", "Error"

    # Work on an RGB numpy copy throughout the pipeline.
    orig_np = np.array(image.convert("RGB"))

    # Word-level detection (detector only — recognition is handled by TrOCR).
    try:
        dt_boxes, _ = detector.text_detector(orig_np)
        if dt_boxes is None:
            dt_boxes = []
    except Exception as e:
        return image, [], f"Detector Failed: {e}", "\n".join(logs)

    if len(dt_boxes) == 0:
        return image, [], "No text detected.", "Logs end."

    # Merge detected words into full text lines.
    line_boxes = smart_line_merger(dt_boxes, logs)
    logs.append(f"Merged into {len(line_boxes)} lines.")

    annotated_img = orig_np.copy()
    final_text_lines = []
    gallery_crops = []

    PAD = 8  # a little context around each crop helps TrOCR
    h_img, w_img, _ = orig_np.shape

    for i, box in enumerate(line_boxes):
        x1, y1, x2, y2 = map(int, box)

        # Pad the crop, clamped to the image bounds.
        x1, y1 = max(0, x1 - PAD), max(0, y1 - PAD)
        x2, y2 = min(w_img, x2 + PAD), min(h_img, y2 + PAD)
        line_crop = orig_np[y1:y2, x1:x2]

        # Refinement: split crops that accidentally span two text lines.
        for sub_crop in split_double_lines(line_crop, logs):
            if sub_crop.shape[0] < 10 or sub_crop.shape[1] < 10:
                continue  # too small to hold readable text

            pil_crop = Image.fromarray(sub_crop)
            gallery_crops.append(pil_crop)

            # TrOCR recognition on the single line.
            with torch.no_grad():
                pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(DEVICE)
                generated_ids = model.generate(pixel_values)
                text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            if text.strip():
                final_text_lines.append(text)

        # Visualization: draw the padded merged line box and its index.
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 200, 0), 2)
        cv2.putText(annotated_img, str(i + 1), (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 200, 0), 1)

    full_text = "\n".join(final_text_lines)
    logs.append("--- PROCESSING COMPLETE ---")

    return Image.fromarray(annotated_img), gallery_crops, full_text, "\n".join(logs)
 
 
 
 
 
 
 
 
 
296
 
 
 
 
 
 
 
 
 
297
 
298
# ==========================================
# πŸ–₯️ GRADIO INTERFACE
# ==========================================
css = """
#gallery { height: 300px; overflow-y: scroll; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("## πŸ“ Scientific Handwriting OCR (Line-Level Refinement)")
    gr.Markdown("Uses PaddleOCR for detection, Geometry for merging, Projection Profiles for refinement, and TrOCR for reading.")

    with gr.Row():
        # Left column: upload + trigger.
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Input Document")
            run_btn = gr.Button("Analyze & Transcribe", variant="primary")

        # Right column: results grouped into tabs.
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Transcribed Text"):
                    output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
                with gr.Tab("Segmentation Map"):
                    output_img = gr.Image(label="Line Detection Map")
                with gr.Tab("System Logs"):
                    log_output = gr.Textbox(label="Process Logs", lines=15)

    gr.Markdown("### Line Segments (Input for TrOCR)")
    gallery = gr.Gallery(label="Refined Crops", columns=4, elem_id="gallery")

    # Wire the button to the full OCR pipeline.
    run_btn.click(
        process_handwriting,
        input_img,
        [output_img, gallery, output_txt, log_output]
    )

if __name__ == "__main__":
    demo.launch()