Luis J Camargo committed on
Commit
58fd993
·
1 Parent(s): dcf1d67

first commit

Browse files
Files changed (2) hide show
  1. app.py +365 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import io
import json
import base64
import re
import logging
import sys
import yaml
from typing import Dict, List, Tuple, Any, Optional
import time

import gradio as gr
from PIL import Image
import requests
from urllib.parse import urlparse

# Paddle imports — optional: the UI still loads without them, inference is disabled.
try:
    from paddleocr import PaddleOCRVL
    import paddlex
    PADDLE_AVAILABLE = True
except ImportError:
    PADDLE_AVAILABLE = False
    print("Warning: paddleocr or paddlex not found. Inference will be disabled.")

# --- Configuration ---
LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging.StreamHandler(sys.stderr)])
logger = logging.getLogger("TachiwinDocOCR")

# Hugging Face repo id (or local path) of the fine-tuned VL recognition weights.
CUSTOM_MODEL_PATH = "tachiwin/Tachiwin-OCR-1.5"
# Pipeline config exported on first run, then patched to point at CUSTOM_MODEL_PATH.
CONFIG_FILE = "custom_pipeline_config.yaml"
# Per-request inference artifacts (markdown/json/visualizations) land under here.
OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# LATEX Configuration for Gradio markdown rendering.
LATEX_DELIMS = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\[", "right": "\\]", "display": True},
]

# --- Model Initialization ---
# Populated by setup_pipeline(); stays None when Paddle is unavailable or init fails.
pipeline = None
47
def setup_pipeline():
    """Initialize the global PaddleOCR-VL pipeline with the custom fine-tuned model.

    Exports the stock ``PaddleOCR-VL`` pipeline config on first run, patches the
    ``VLRecognition`` sub-module's ``model_dir`` to ``CUSTOM_MODEL_PATH``, then
    builds the pipeline from the patched config.  On any failure the module-level
    ``pipeline`` stays ``None`` so the UI can degrade gracefully.
    """
    global pipeline
    if not PADDLE_AVAILABLE:
        return

    try:
        # 1. Generate default config if it doesn't exist.
        #    Equivalent to: paddlex --get_pipeline_config PaddleOCR-VL
        if not os.path.exists(CONFIG_FILE):
            logger.info("Generating default configuration file: %s", CONFIG_FILE)
            from paddlex import create_pipeline
            temp_pipeline = create_pipeline("PaddleOCR-VL")
            temp_pipeline.export_pipeline_config(save_path=CONFIG_FILE)
            logger.info("Default configuration exported.")

        # 2. Point the VLRecognition sub-module at the custom model weights.
        logger.info("Modifying configuration to use custom model: %s", CUSTOM_MODEL_PATH)
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            config_data = yaml.safe_load(f)

        if 'SubModules' in config_data and 'VLRecognition' in config_data['SubModules']:
            config_data['SubModules']['VLRecognition']['model_dir'] = CUSTOM_MODEL_PATH
            logger.info("Updated VLRecognition model_dir to %s", CUSTOM_MODEL_PATH)
        else:
            logger.warning("Could not find VLRecognition in config_data['SubModules']. Attempting fallback.")

            def update_model_dir(node) -> bool:
                # Depth-first search for a 'VLRecognition' mapping.  Also descends
                # lists, since PaddleX configs may nest dicts inside sequences.
                if isinstance(node, dict):
                    for key, value in node.items():
                        if key == 'VLRecognition' and isinstance(value, dict):
                            value['model_dir'] = CUSTOM_MODEL_PATH
                            return True
                        if update_model_dir(value):
                            return True
                elif isinstance(node, list):
                    for item in node:
                        if update_model_dir(item):
                            return True
                return False

            if not update_model_dir(config_data):
                # Surface the miss instead of silently running the base weights.
                logger.warning("VLRecognition entry not found anywhere in %s; default weights will be used.", CONFIG_FILE)

        with open(CONFIG_FILE, 'w', encoding='utf-8') as f:
            # sort_keys=False preserves the key order of the exported config
            # (the default alphabetical sort would rewrite the whole file layout).
            yaml.dump(config_data, f, sort_keys=False)

        # 3. Initialize pipeline with the modified config.
        logger.info("Initializing PaddleOCRVL with config: %s", CONFIG_FILE)
        pipeline = PaddleOCRVL(pipeline_config=CONFIG_FILE)
        logger.info("PaddleOCRVL initialized successfully.")

    except Exception:
        # Boundary handler: keep the app importable even when model init fails.
        # logger.exception records the full traceback (logger.error dropped it).
        logger.exception("Failed to initialize PaddleOCRVL")

if PADDLE_AVAILABLE:
    setup_pipeline()
99
+
100
+ # --- Helper Functions ---
101
+
102
def image_to_base64_data_url(filepath: str) -> str:
    """Read an image file and return it as an inline ``data:`` URL.

    The MIME type is chosen from the file extension (defaulting to JPEG).
    Returns an empty string when the file cannot be read or encoded.
    """
    extension_to_mime = {
        ".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png",
        ".gif": "image/gif", ".webp": "image/webp", ".bmp": "image/bmp"
    }
    try:
        extension = os.path.splitext(filepath)[1].lower()
        mime_type = extension_to_mime.get(extension, "image/jpeg")
        with open(filepath, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
        return f"data:{mime_type};base64,{encoded_string}"
    except Exception as e:
        logger.error(f"Error encoding image to Base64: {e}")
        return ""
116
+
117
+ def _escape_inequalities_in_math(md: str) -> str:
118
+ _MATH_PATTERNS = [
119
+ re.compile(r"\$\$([\s\S]+?)\$\$"),
120
+ re.compile(r"\$([^\$]+?)\$"),
121
+ re.compile(r"\\\[([\s\S]+?)\\\]"),
122
+ re.compile(r"\\\(([\s\S]+?)\\\)"),
123
+ ]
124
+ def fix(s: str) -> str:
125
+ s = s.replace("<=", r" \le ").replace(">=", r" \ge ")
126
+ s = s.replace("≤", r" \le ").replace("≥", r" \ge ")
127
+ s = s.replace("<", r" \lt ").replace(">", r" \gt ")
128
+ return s
129
+ for pat in _MATH_PATTERNS:
130
+ md = pat.sub(lambda m: m.group(0).replace(m.group(1), fix(m.group(1))), md)
131
+ return md
132
+
133
def update_preview_visibility(path_or_url: Optional[str]) -> Dict:
    """Build (or hide) the HTML image preview for an uploaded file or URL.

    Remote http(s) URLs are embedded directly; local file paths are inlined as
    base64 data URLs so the browser never touches the server filesystem.
    """
    if not path_or_url:
        # Nothing selected: clear and hide the preview panel.
        return gr.update(value="", visible=False)

    if isinstance(path_or_url, str) and path_or_url.startswith(("http://", "https://")):
        src = path_or_url
    else:
        src = image_to_base64_data_url(path_or_url)

    html_content = f"""
    <div class="uploaded-image">
        <img src="{src}" alt="Preview image" style="width:100%;height:100%;object-fit:contain;" loading="lazy"/>
    </div>
    """
    return gr.update(value=html_content, visible=True)
149
+
150
+ # --- Inference Logic ---
151
+
152
def run_inference(img_path, task_type="ocr"):
    """Run the PaddleOCR-VL pipeline on a single image.

    Args:
        img_path: Filesystem path of the image to process.
        task_type: Informational tag used only for logging; the pipeline
            currently runs the same ``predict`` call for every task.

    Returns:
        A 4-tuple ``(markdown_preview, raw_markdown, visualization_html,
        raw_json)``.  On error the first element carries the message and the
        remaining elements are empty strings.
    """
    if not PADDLE_AVAILABLE or pipeline is None:
        return "PaddleOCRVL is not available or failed to load. Ensure paddlex and paddleocr are installed.", "", "", ""

    if not img_path:
        return "Please upload an image.", "", "", ""

    try:
        logger.info(f"Running inference for {img_path} with task {task_type}")

        # PaddleOCRVL predict as per documentation.
        output = pipeline.predict(img_path)

        # Unique per-request directory.  Nanosecond resolution avoids the
        # collisions that second-resolution int(time.time()) ids allowed for
        # requests arriving within the same second.
        run_output_dir = os.path.join(OUTPUT_DIR, str(time.time_ns()))
        os.makedirs(run_output_dir, exist_ok=True)

        # Persist every result first...
        for res in output:
            res.save_to_json(save_path=run_output_dir)
            res.save_to_markdown(save_path=run_output_dir)
            res.print()  # echo to the server logs

        # ...then harvest the generated artifacts in a single pass.  (The
        # previous version walked the directory inside the result loop, which
        # re-read earlier results once per result and duplicated their
        # markdown/JSON in the output.)
        md_content = ""
        json_content = ""
        vis_html = ""
        vis_count = 0
        for root, _dirs, files in os.walk(run_output_dir):
            for file in files:
                file_full_path = os.path.join(root, file)
                if file.endswith(".md"):
                    with open(file_full_path, 'r', encoding='utf-8') as f:
                        md_content += f.read() + "\n\n"
                elif file.endswith(".json"):
                    with open(file_full_path, 'r', encoding='utf-8') as f:
                        json_content += f.read() + "\n\n"
                elif file.endswith((".png", ".jpg", ".jpeg")) and "res" in file:
                    # Found a visualization image; inline it as a data URL.
                    vis_count += 1
                    vis_src = image_to_base64_data_url(file_full_path)
                    vis_html += '<div style="margin-bottom:20px;">'
                    vis_html += f'<p style="font-weight:bold;">Visualization {vis_count}:</p>'
                    vis_html += f'<img src="{vis_src}" alt="Visualization {vis_count}" style="width:100%; border-radius: 8px; border: 1px solid #ddd;">'
                    vis_html += '</div>'

        if not md_content:
            md_content = "No text recognized."

        md_preview = _escape_inequalities_in_math(md_content)

        return md_preview, md_content, vis_html, json_content

    except Exception as e:
        # logger.exception records the full traceback for the server logs.
        logger.exception("Inference failed")
        return f"Error: {str(e)}", "", "", ""
210
# --- UI Components ---

# Custom CSS: gradient branding header, notice/quick-link cards, fixed-height
# scrollable image preview panes, and roomier markdown output areas.
css = """
body, .gradio-container { font-family: 'Inter', -apple-system, system-ui, sans-serif; }
.app-header {
    text-align: center;
    padding: 30px;
    background: linear-gradient(120deg, rgb(2, 132, 199) 0%, rgb(16, 185, 129) 60%, rgb(5, 150, 105) 100%);
    color: white;
    border-radius: 15px;
    margin-bottom: 25px;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
}
.app-header h1 { color: white !important; margin: 0; font-size: 2.5em; }
.app-header p { font-size: 1.2em; opacity: 0.9; margin-top: 10px; }
.notice { margin: 8px auto 0; max-width: 900px; padding: 10px 12px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f8fafc; font-size: 14px; line-height: 1.6; }
.quick-links { text-align: center; padding: 8px 0; border: 1px solid #e5e7eb; border-radius: 8px; margin: 8px auto; max-width: 900px; }
.quick-links a { margin: 0 12px; font-size: 14px; font-weight: 600; color: #3b82f6; text-decoration: none; }
.quick-links a:hover { text-decoration: underline; }
#image_preview_doc, #image_preview_vl, #image_preview_spot { height: 400px !important; overflow: auto; border: 1px solid #ddd; border-radius: 8px; background: #eee; }
#image_preview_doc img, #image_preview_vl img, #image_preview_spot img { width: 100% !important; height: auto !important; object-fit: contain !important; display: block; }
.output_markdown { min-height: 30rem !important; font-size: 1.1rem !important; line-height: 1.6 !important; }
.prose pre { background: #f1f5f9 !important; border-radius: 8px !important; padding: 10px !important; }
"""
235
+
236
with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
    # Header branding
    gr.HTML(
        """
<div class="app-header">
    <h1>🌎 Tachiwin Document Parsing OCR 🦡</h1>
    <p>Advancing Linguistic Rights for the 68 Indigenous Languages of Mexico</p>
</div>
        """
    )

    gr.HTML(f"""
<div class="notice">
    <strong>Powered by PaddleOCRVL 1.5:</strong> Optimized for in-the-wild document parsing and fine-tuned for indigenous languages.
    Initializing with custom weights: <code>{CUSTOM_MODEL_PATH}</code>
</div>
    """)

    gr.HTML("""<div class="quick-links"><a href="https://github.com/ljcamargo/tachiwin_paddleocrvl_finetuning" target="_blank">GitHub</a> | <a href="https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5" target="_blank">Base Model</a> | <a href="https://www.paddleocr.com" target="_blank">Documentation</a></div>""")

    with gr.Tabs():
        # --- Tab 1: Document Parsing ---
        with gr.Tab("📄 Document Parsing"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_doc = gr.File(label="Upload Document Image", file_count="single", type="filepath", file_types=["image"])
                    preview_doc_html = gr.HTML(value="", elem_id="image_preview_doc", visible=False)
                    with gr.Row(variant="panel"):
                        with gr.Column(scale=2):
                            btn_parse = gr.Button("🚀 Parse Document", variant="primary")
                        with gr.Column(scale=3):
                            with gr.Row():
                                # NOTE(review): these switches are collected but not yet
                                # forwarded to run_inference — wire them up when the
                                # pipeline exposes the corresponding options.
                                chart_switch = gr.Checkbox(label="Chart parsing", value=True)
                                unwarp_switch = gr.Checkbox(label="Doc unwarping", value=False)

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("📝 Markdown Preview"):
                            md_preview_doc = gr.Markdown(latex_delimiters=LATEX_DELIMS, elem_classes="output_markdown")
                        with gr.Tab("🖼️ Visualization"):
                            vis_image_doc = gr.HTML("<p style='text-align:center; color:#888; padding: 20px;'>Parsing results will be visualized here.</p>")
                        with gr.Tab("📜 Markdown Source"):
                            md_raw_doc = gr.Code(language="markdown")

            file_doc.change(update_preview_visibility, file_doc, preview_doc_html)

            def parse_doc_wrapper(fp, ch, uw):
                """Parse a document; returns (markdown preview, visualization HTML, raw markdown)."""
                # BUGFIX: the empty-input branch previously returned 4 values for
                # this handler's 3 outputs, which raised at runtime in Gradio.
                if not fp:
                    return "Please upload an image.", "", ""
                res_preview, res_raw, res_vis, res_json = run_inference(fp, task_type="document")
                return res_preview, res_vis, res_raw

            btn_parse.click(parse_doc_wrapper, [file_doc, chart_switch, unwarp_switch], [md_preview_doc, vis_image_doc, md_raw_doc])

        # --- Tab 2: Element Recognition ---
        with gr.Tab("🧩 Element Recognition"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_vl = gr.File(label="Upload Element Image", file_count="single", type="filepath", file_types=["image"])
                    preview_vl_html = gr.HTML(value="", elem_id="image_preview_vl", visible=False)
                    with gr.Row():
                        btn_ocr = gr.Button("Text Recognition", variant="secondary")
                        btn_formula = gr.Button("Formula Recognition", variant="secondary")
                    with gr.Row():
                        btn_table = gr.Button("Table Recognition", variant="secondary")
                        btn_chart = gr.Button("Chart Recognition", variant="secondary")

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("📊 Result"):
                            md_preview_vl = gr.Markdown(latex_delimiters=LATEX_DELIMS, elem_classes="output_markdown")
                        with gr.Tab("📜 Raw Output"):
                            md_raw_vl = gr.Code(language="markdown")

            file_vl.change(update_preview_visibility, file_vl, preview_vl_html)

            def run_vl_wrapper(fp, prompt):
                """Run element recognition; returns (markdown preview, raw markdown)."""
                # BUGFIX: the empty-input branch previously returned 3 values for
                # this handler's 2 outputs.
                if not fp:
                    return "Please upload an image.", ""
                res_preview, res_raw, res_vis, res_json = run_inference(fp, task_type=prompt)
                return res_preview, res_raw

            for btn, prompt in [(btn_ocr, "Text Recognition"), (btn_formula, "Formula Recognition"), (btn_table, "Table Recognition"), (btn_chart, "Chart Recognition")]:
                btn.click(run_vl_wrapper, [file_vl, gr.State(prompt)], [md_preview_vl, md_raw_vl])

        # --- Tab 3: Spotting ---
        with gr.Tab("📍 Spotting"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_spot = gr.File(label="Upload Image for Detection", file_count="single", type="filepath", file_types=["image"])
                    preview_spot_html = gr.HTML(value="", elem_id="image_preview_spot", visible=False)
                    btn_run_spot = gr.Button("Run Spotting", variant="primary")

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("🖼️ Visualization"):
                            vis_image_spot = gr.HTML("<p style='text-align:center; color:#888; padding: 20px;'>Detection visualization.</p>")
                        with gr.Tab("💾 JSON Result"):
                            json_spot = gr.Code(label="Detection Results", language="json")

            file_spot.change(update_preview_visibility, file_spot, preview_spot_html)

            def run_spotting_wrapper(fp):
                """Run spotting; returns (visualization HTML, raw JSON)."""
                if not fp:
                    return "", ""
                res_preview, res_raw, res_vis, res_json = run_inference(fp, task_type="spotting")
                return res_vis, res_json

            btn_run_spot.click(run_spotting_wrapper, file_spot, [vis_image_spot, json_spot])

    # Footer Information
    gr.Markdown(
        """
---
### ℹ️ About Tachiwin 🦡

**Tachiwin** (from Totonac - "Language") is dedicated to bridging the digital divide for indigenous languages of Mexico through AI technology. This model represents a **world first in tech access and linguistic rights**, specifically trained to recognize the 68 indigenous languages of Mexico.

### Supported Language Families

**Uto-Aztecan:** Náhuatl, Yaqui, Mayo, Huichol, Tepehuán, Tarahumara
**Mayan:** Maya, Tzeltal, Tzotzil, Chol, Tojolabal, Q'anjob'al, Mam
**Oto-Manguean:** Zapoteco, Mixteco, Otomí, Mazateco, Chinanteco, Triqui
**Totonac-Tepehua:** Totonaco, Tepehua
**Mixe-Zoque:** Mixe, Zoque, Popoluca
**Other:** Purépecha, Huave, Seri, Kickapoo, Kiliwa

Made with ❤️ for linguistic diversity and indigenous rights 🦡
        """
    )

if __name__ == "__main__":
    demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
2
+ paddlepaddle==3.3.0
3
+ paddlex
4
+ paddleocr[doc-parser]
5
+ gradio
6
+ pillow
7
+ requests
8
+ numpy
9
+ psutil
10
+ librosa
11
+ pandas
12
+ torch
13
+ transformers