Luis J Camargo committed on
Commit
58fd993
·
1 Parent(s): dcf1d67

first commit

Browse files
Files changed (2) hide show
  1. app.py +365 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import io
import json
import base64
import re
import logging
import sys
import yaml
from typing import Dict, List, Tuple, Any, Optional
import time

import gradio as gr
from PIL import Image
import requests
from urllib.parse import urlparse

# Paddle imports — optional: the UI still loads without them, inference is disabled.
try:
    from paddleocr import PaddleOCRVL
    import paddlex
    PADDLE_AVAILABLE = True
except ImportError:
    PADDLE_AVAILABLE = False
    print("Warning: paddleocr or paddlex not found. Inference will be disabled.")

# --- Configuration ---
LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging.StreamHandler(sys.stderr)])
logger = logging.getLogger("TachiwinDocOCR")

# Hugging Face repo id (or local path) of the fine-tuned VL recognition weights.
CUSTOM_MODEL_PATH = "tachiwin/Tachiwin-OCR-1.5"
# Pipeline config exported on first run, then patched to point at CUSTOM_MODEL_PATH.
CONFIG_FILE = "custom_pipeline_config.yaml"
# Per-request inference artifacts (markdown/json/visualizations) land under here.
OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# LATEX Configuration for Gradio markdown rendering.
LATEX_DELIMS = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\[", "right": "\\]", "display": True},
]

# --- Model Initialization ---
# Populated by setup_pipeline(); stays None when Paddle is unavailable or init fails.
pipeline = None
47
def setup_pipeline():
    """Initialize the global PaddleOCR-VL pipeline with the custom fine-tuned model.

    Exports the stock ``PaddleOCR-VL`` pipeline config on first run, patches the
    ``VLRecognition`` sub-module's ``model_dir`` to ``CUSTOM_MODEL_PATH``, then
    builds the pipeline from the patched config.  On any failure the module-level
    ``pipeline`` stays ``None`` so the UI can degrade gracefully.
    """
    global pipeline
    if not PADDLE_AVAILABLE:
        return

    try:
        # 1. Generate default config if it doesn't exist.
        #    Equivalent to: paddlex --get_pipeline_config PaddleOCR-VL
        if not os.path.exists(CONFIG_FILE):
            logger.info("Generating default configuration file: %s", CONFIG_FILE)
            from paddlex import create_pipeline
            temp_pipeline = create_pipeline("PaddleOCR-VL")
            temp_pipeline.export_pipeline_config(save_path=CONFIG_FILE)
            logger.info("Default configuration exported.")

        # 2. Point the VLRecognition sub-module at the custom model weights.
        logger.info("Modifying configuration to use custom model: %s", CUSTOM_MODEL_PATH)
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            config_data = yaml.safe_load(f)

        if 'SubModules' in config_data and 'VLRecognition' in config_data['SubModules']:
            config_data['SubModules']['VLRecognition']['model_dir'] = CUSTOM_MODEL_PATH
            logger.info("Updated VLRecognition model_dir to %s", CUSTOM_MODEL_PATH)
        else:
            logger.warning("Could not find VLRecognition in config_data['SubModules']. Attempting fallback.")

            def update_model_dir(node) -> bool:
                # Depth-first search for a 'VLRecognition' mapping.  Also descends
                # lists, since PaddleX configs may nest dicts inside sequences.
                if isinstance(node, dict):
                    for key, value in node.items():
                        if key == 'VLRecognition' and isinstance(value, dict):
                            value['model_dir'] = CUSTOM_MODEL_PATH
                            return True
                        if update_model_dir(value):
                            return True
                elif isinstance(node, list):
                    for item in node:
                        if update_model_dir(item):
                            return True
                return False

            if not update_model_dir(config_data):
                # Surface the miss instead of silently running the base weights.
                logger.warning("VLRecognition entry not found anywhere in %s; default weights will be used.", CONFIG_FILE)

        with open(CONFIG_FILE, 'w', encoding='utf-8') as f:
            # sort_keys=False preserves the key order of the exported config
            # (the default alphabetical sort would rewrite the whole file layout).
            yaml.dump(config_data, f, sort_keys=False)

        # 3. Initialize pipeline with the modified config.
        logger.info("Initializing PaddleOCRVL with config: %s", CONFIG_FILE)
        pipeline = PaddleOCRVL(pipeline_config=CONFIG_FILE)
        logger.info("PaddleOCRVL initialized successfully.")

    except Exception:
        # Boundary handler: keep the app importable even when model init fails.
        # logger.exception records the full traceback (logger.error dropped it).
        logger.exception("Failed to initialize PaddleOCRVL")

if PADDLE_AVAILABLE:
    setup_pipeline()
99
+
100
+ # --- Helper Functions ---
101
+
102
def image_to_base64_data_url(filepath: str) -> str:
    """Read an image file and return it as an inline ``data:`` URL.

    The MIME type is chosen from the file extension (defaulting to JPEG).
    Returns an empty string when the file cannot be read or encoded.
    """
    extension_to_mime = {
        ".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png",
        ".gif": "image/gif", ".webp": "image/webp", ".bmp": "image/bmp"
    }
    try:
        extension = os.path.splitext(filepath)[1].lower()
        mime_type = extension_to_mime.get(extension, "image/jpeg")
        with open(filepath, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
        return f"data:{mime_type};base64,{encoded_string}"
    except Exception as e:
        logger.error(f"Error encoding image to Base64: {e}")
        return ""
116
+
117
+ def _escape_inequalities_in_math(md: str) -> str:
118
+ _MATH_PATTERNS = [
119
+ re.compile(r"\$\$([\s\S]+?)\$\$"),
120
+ re.compile(r"\$([^\$]+?)\$"),
121
+ re.compile(r"\\\[([\s\S]+?)\\\]"),
122
+ re.compile(r"\\\(([\s\S]+?)\\\)"),
123
+ ]
124
+ def fix(s: str) -> str:
125
+ s = s.replace("<=", r" \le ").replace(">=", r" \ge ")
126
+ s = s.replace("≤", r" \le ").replace("≥", r" \ge ")
127
+ s = s.replace("<", r" \lt ").replace(">", r" \gt ")
128
+ return s
129
+ for pat in _MATH_PATTERNS:
130
+ md = pat.sub(lambda m: m.group(0).replace(m.group(1), fix(m.group(1))), md)
131
+ return md
132
+
133
def update_preview_visibility(path_or_url: Optional[str]) -> Dict:
    """Build (or hide) the HTML image preview for an uploaded file or URL.

    Remote http(s) URLs are embedded directly; local file paths are inlined as
    base64 data URLs so the browser never touches the server filesystem.
    """
    if not path_or_url:
        # Nothing selected: clear and hide the preview panel.
        return gr.update(value="", visible=False)

    if isinstance(path_or_url, str) and path_or_url.startswith(("http://", "https://")):
        src = path_or_url
    else:
        src = image_to_base64_data_url(path_or_url)

    html_content = f"""
    <div class="uploaded-image">
        <img src="{src}" alt="Preview image" style="width:100%;height:100%;object-fit:contain;" loading="lazy"/>
    </div>
    """
    return gr.update(value=html_content, visible=True)
149
+
150
+ # --- Inference Logic ---
151
+
152
def run_inference(img_path, task_type="ocr"):
    """Run the PaddleOCR-VL pipeline on a single image.

    Args:
        img_path: Filesystem path of the image to process.
        task_type: Informational tag used only for logging; the pipeline
            currently runs the same ``predict`` call for every task.

    Returns:
        A 4-tuple ``(markdown_preview, raw_markdown, visualization_html,
        raw_json)``.  On error the first element carries the message and the
        remaining elements are empty strings.
    """
    if not PADDLE_AVAILABLE or pipeline is None:
        return "PaddleOCRVL is not available or failed to load. Ensure paddlex and paddleocr are installed.", "", "", ""

    if not img_path:
        return "Please upload an image.", "", "", ""

    try:
        logger.info(f"Running inference for {img_path} with task {task_type}")

        # PaddleOCRVL predict as per documentation.
        output = pipeline.predict(img_path)

        # Unique per-request directory.  Nanosecond resolution avoids the
        # collisions that second-resolution int(time.time()) ids allowed for
        # requests arriving within the same second.
        run_output_dir = os.path.join(OUTPUT_DIR, str(time.time_ns()))
        os.makedirs(run_output_dir, exist_ok=True)

        # Persist every result first...
        for res in output:
            res.save_to_json(save_path=run_output_dir)
            res.save_to_markdown(save_path=run_output_dir)
            res.print()  # echo to the server logs

        # ...then harvest the generated artifacts in a single pass.  (The
        # previous version walked the directory inside the result loop, which
        # re-read earlier results once per result and duplicated their
        # markdown/JSON in the output.)
        md_content = ""
        json_content = ""
        vis_html = ""
        vis_count = 0
        for root, _dirs, files in os.walk(run_output_dir):
            for file in files:
                file_full_path = os.path.join(root, file)
                if file.endswith(".md"):
                    with open(file_full_path, 'r', encoding='utf-8') as f:
                        md_content += f.read() + "\n\n"
                elif file.endswith(".json"):
                    with open(file_full_path, 'r', encoding='utf-8') as f:
                        json_content += f.read() + "\n\n"
                elif file.endswith((".png", ".jpg", ".jpeg")) and "res" in file:
                    # Found a visualization image; inline it as a data URL.
                    vis_count += 1
                    vis_src = image_to_base64_data_url(file_full_path)
                    vis_html += '<div style="margin-bottom:20px;">'
                    vis_html += f'<p style="font-weight:bold;">Visualization {vis_count}:</p>'
                    vis_html += f'<img src="{vis_src}" alt="Visualization {vis_count}" style="width:100%; border-radius: 8px; border: 1px solid #ddd;">'
                    vis_html += '</div>'

        if not md_content:
            md_content = "No text recognized."

        md_preview = _escape_inequalities_in_math(md_content)

        return md_preview, md_content, vis_html, json_content

    except Exception as e:
        # logger.exception records the full traceback for the server logs.
        logger.exception("Inference failed")
        return f"Error: {str(e)}", "", "", ""
210
# --- UI Components ---

# Custom CSS: gradient branding header, notice/quick-link cards, fixed-height
# scrollable image preview panes, and roomier markdown output areas.
css = """
body, .gradio-container { font-family: 'Inter', -apple-system, system-ui, sans-serif; }
.app-header {
    text-align: center;
    padding: 30px;
    background: linear-gradient(120deg, rgb(2, 132, 199) 0%, rgb(16, 185, 129) 60%, rgb(5, 150, 105) 100%);
    color: white;
    border-radius: 15px;
    margin-bottom: 25px;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
}
.app-header h1 { color: white !important; margin: 0; font-size: 2.5em; }
.app-header p { font-size: 1.2em; opacity: 0.9; margin-top: 10px; }
.notice { margin: 8px auto 0; max-width: 900px; padding: 10px 12px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f8fafc; font-size: 14px; line-height: 1.6; }
.quick-links { text-align: center; padding: 8px 0; border: 1px solid #e5e7eb; border-radius: 8px; margin: 8px auto; max-width: 900px; }
.quick-links a { margin: 0 12px; font-size: 14px; font-weight: 600; color: #3b82f6; text-decoration: none; }
.quick-links a:hover { text-decoration: underline; }
#image_preview_doc, #image_preview_vl, #image_preview_spot { height: 400px !important; overflow: auto; border: 1px solid #ddd; border-radius: 8px; background: #eee; }
#image_preview_doc img, #image_preview_vl img, #image_preview_spot img { width: 100% !important; height: auto !important; object-fit: contain !important; display: block; }
.output_markdown { min-height: 30rem !important; font-size: 1.1rem !important; line-height: 1.6 !important; }
.prose pre { background: #f1f5f9 !important; border-radius: 8px !important; padding: 10px !important; }
"""
235
+
236
with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
    # Header branding
    gr.HTML(
        """
<div class="app-header">
    <h1>🌎 Tachiwin Document Parsing OCR 🦡</h1>
    <p>Advancing Linguistic Rights for the 68 Indigenous Languages of Mexico</p>
</div>
        """
    )

    gr.HTML(f"""
<div class="notice">
    <strong>Powered by PaddleOCRVL 1.5:</strong> Optimized for in-the-wild document parsing and fine-tuned for indigenous languages.
    Initializing with custom weights: <code>{CUSTOM_MODEL_PATH}</code>
</div>
    """)

    gr.HTML("""<div class="quick-links"><a href="https://github.com/ljcamargo/tachiwin_paddleocrvl_finetuning" target="_blank">GitHub</a> | <a href="https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5" target="_blank">Base Model</a> | <a href="https://www.paddleocr.com" target="_blank">Documentation</a></div>""")

    with gr.Tabs():
        # --- Tab 1: Document Parsing ---
        with gr.Tab("📄 Document Parsing"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_doc = gr.File(label="Upload Document Image", file_count="single", type="filepath", file_types=["image"])
                    preview_doc_html = gr.HTML(value="", elem_id="image_preview_doc", visible=False)
                    with gr.Row(variant="panel"):
                        with gr.Column(scale=2):
                            btn_parse = gr.Button("🚀 Parse Document", variant="primary")
                        with gr.Column(scale=3):
                            with gr.Row():
                                # NOTE(review): these switches are collected but not yet
                                # forwarded to run_inference — wire them up when the
                                # pipeline exposes the corresponding options.
                                chart_switch = gr.Checkbox(label="Chart parsing", value=True)
                                unwarp_switch = gr.Checkbox(label="Doc unwarping", value=False)

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("📝 Markdown Preview"):
                            md_preview_doc = gr.Markdown(latex_delimiters=LATEX_DELIMS, elem_classes="output_markdown")
                        with gr.Tab("🖼️ Visualization"):
                            vis_image_doc = gr.HTML("<p style='text-align:center; color:#888; padding: 20px;'>Parsing results will be visualized here.</p>")
                        with gr.Tab("📜 Markdown Source"):
                            md_raw_doc = gr.Code(language="markdown")

            file_doc.change(update_preview_visibility, file_doc, preview_doc_html)

            def parse_doc_wrapper(fp, ch, uw):
                """Parse a document; returns (markdown preview, visualization HTML, raw markdown)."""
                # BUGFIX: the empty-input branch previously returned 4 values for
                # this handler's 3 outputs, which raised at runtime in Gradio.
                if not fp:
                    return "Please upload an image.", "", ""
                res_preview, res_raw, res_vis, res_json = run_inference(fp, task_type="document")
                return res_preview, res_vis, res_raw

            btn_parse.click(parse_doc_wrapper, [file_doc, chart_switch, unwarp_switch], [md_preview_doc, vis_image_doc, md_raw_doc])

        # --- Tab 2: Element Recognition ---
        with gr.Tab("🧩 Element Recognition"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_vl = gr.File(label="Upload Element Image", file_count="single", type="filepath", file_types=["image"])
                    preview_vl_html = gr.HTML(value="", elem_id="image_preview_vl", visible=False)
                    with gr.Row():
                        btn_ocr = gr.Button("Text Recognition", variant="secondary")
                        btn_formula = gr.Button("Formula Recognition", variant="secondary")
                    with gr.Row():
                        btn_table = gr.Button("Table Recognition", variant="secondary")
                        btn_chart = gr.Button("Chart Recognition", variant="secondary")

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("📊 Result"):
                            md_preview_vl = gr.Markdown(latex_delimiters=LATEX_DELIMS, elem_classes="output_markdown")
                        with gr.Tab("📜 Raw Output"):
                            md_raw_vl = gr.Code(language="markdown")

            file_vl.change(update_preview_visibility, file_vl, preview_vl_html)

            def run_vl_wrapper(fp, prompt):
                """Run element recognition; returns (markdown preview, raw markdown)."""
                # BUGFIX: the empty-input branch previously returned 3 values for
                # this handler's 2 outputs.
                if not fp:
                    return "Please upload an image.", ""
                res_preview, res_raw, res_vis, res_json = run_inference(fp, task_type=prompt)
                return res_preview, res_raw

            for btn, prompt in [(btn_ocr, "Text Recognition"), (btn_formula, "Formula Recognition"), (btn_table, "Table Recognition"), (btn_chart, "Chart Recognition")]:
                btn.click(run_vl_wrapper, [file_vl, gr.State(prompt)], [md_preview_vl, md_raw_vl])

        # --- Tab 3: Spotting ---
        with gr.Tab("📍 Spotting"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_spot = gr.File(label="Upload Image for Detection", file_count="single", type="filepath", file_types=["image"])
                    preview_spot_html = gr.HTML(value="", elem_id="image_preview_spot", visible=False)
                    btn_run_spot = gr.Button("Run Spotting", variant="primary")

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("🖼️ Visualization"):
                            vis_image_spot = gr.HTML("<p style='text-align:center; color:#888; padding: 20px;'>Detection visualization.</p>")
                        with gr.Tab("💾 JSON Result"):
                            json_spot = gr.Code(label="Detection Results", language="json")

            file_spot.change(update_preview_visibility, file_spot, preview_spot_html)

            def run_spotting_wrapper(fp):
                """Run spotting; returns (visualization HTML, raw JSON)."""
                if not fp:
                    return "", ""
                res_preview, res_raw, res_vis, res_json = run_inference(fp, task_type="spotting")
                return res_vis, res_json

            btn_run_spot.click(run_spotting_wrapper, file_spot, [vis_image_spot, json_spot])

    # Footer Information
    gr.Markdown(
        """
---
### ℹ️ About Tachiwin 🦡

**Tachiwin** (from Totonac - "Language") is dedicated to bridging the digital divide for indigenous languages of Mexico through AI technology. This model represents a **world first in tech access and linguistic rights**, specifically trained to recognize the 68 indigenous languages of Mexico.

### Supported Language Families

**Uto-Aztecan:** Náhuatl, Yaqui, Mayo, Huichol, Tepehuán, Tarahumara
**Mayan:** Maya, Tzeltal, Tzotzil, Chol, Tojolabal, Q'anjob'al, Mam
**Oto-Manguean:** Zapoteco, Mixteco, Otomí, Mazateco, Chinanteco, Triqui
**Totonac-Tepehua:** Totonaco, Tepehua
**Mixe-Zoque:** Mixe, Zoque, Popoluca
**Other:** Purépecha, Huave, Seri, Kickapoo, Kiliwa

Made with ❤️ for linguistic diversity and indigenous rights 🦡
        """
    )

if __name__ == "__main__":
    demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
2
+ paddlepaddle==3.3.0
3
+ paddlex
4
+ paddleocr[doc-parser]
5
+ gradio
6
+ pillow
7
+ requests
8
+ numpy
9
+ psutil
10
+ librosa
11
+ pandas
12
+ torch
13
+ transformers