Luis J Camargo committed on
Commit
0297456
Β·
1 Parent(s): acf8835

feat: Download OCR model locally from Hugging Face, update pipeline configuration to use the local path, and add `archive` to `.gitignore`.

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +14 -9
  3. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ archive
app.py CHANGED
@@ -15,13 +15,14 @@ import gradio as gr
15
  from PIL import Image
16
  import requests
17
  from urllib.parse import urlparse
 
18
 
19
  # --- Configuration ---
20
  LOGGING_FORMAT = '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
21
  logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging.StreamHandler(sys.stdout)])
22
  logger = logging.getLogger("TachiwinDocOCR")
23
 
24
- CUSTOM_MODEL_PATH = "tachiwin/Tachiwin-OCR-1.5"
25
  # The YAML file provided by the user or generated
26
  CONFIG_FILE = "default.yaml"
27
  # Fallback generated if default.yaml doesn't exist
@@ -62,6 +63,11 @@ def setup_pipeline():
62
  try:
63
  logger.info("πŸš€ Starting Tachiwin Doc OCR Pipeline Setup...")
64
 
 
 
 
 
 
65
  target_config = None
66
  # Use existing default.yaml if present
67
  if os.path.exists(CONFIG_FILE):
@@ -90,15 +96,15 @@ def setup_pipeline():
90
  with open(target_config, 'r', encoding='utf-8') as f:
91
  config_data = yaml.safe_load(f)
92
 
93
- # Update model_dir if it's not set correctly
94
  updated = False
95
  def update_config(d):
96
  nonlocal updated
97
  for k, v in d.items():
98
  if k == 'VLRecognition' and isinstance(v, dict):
99
- if v.get('model_dir') != CUSTOM_MODEL_PATH:
100
- logger.info(f"πŸ”§ Updating VLRecognition model_dir: {v.get('model_dir')} -> {CUSTOM_MODEL_PATH}")
101
- v['model_dir'] = CUSTOM_MODEL_PATH
102
  updated = True
103
  elif isinstance(v, dict):
104
  update_config(v)
@@ -117,7 +123,6 @@ def setup_pipeline():
117
 
118
  # Initialize pipeline using the recommended PaddleX way
119
  logger.info(f"βš™οΈ Initializing pipeline with create_pipeline(pipeline={target_config})")
120
- # According to help: create_pipeline can take a path to yaml
121
  pipeline = create_pipeline(pipeline=target_config)
122
  logger.info("✨ Pipeline initialized successfully!")
123
 
@@ -253,7 +258,7 @@ body, .gradio-container { font-family: 'Inter', system-ui, sans-serif; }
253
  .output-box { border: 1px solid #e2e8f0 !important; border-radius: 1rem !important; }
254
  """
255
 
256
- with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
257
  gr.HTML(
258
  """
259
  <div class="app-header">
@@ -265,7 +270,7 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
265
 
266
  with gr.Row(elem_classes=["notice"]):
267
  status_text = "Initialized" if pipeline else "Initializing/Failed"
268
- gr.Markdown(f"**⚑ Status:** {status_text} | **Model:** `{CUSTOM_MODEL_PATH}` | **Hardware:** CPU")
269
 
270
  with gr.Tabs():
271
  with gr.Tab("πŸ“„ Full Document Parsing"):
@@ -346,4 +351,4 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
346
  gr.Markdown("--- \n *Tachiwin Project: Indigenous Languages of Mexico.*")
347
 
348
  if __name__ == "__main__":
349
- demo.queue().launch()
 
15
  from PIL import Image
16
  import requests
17
  from urllib.parse import urlparse
18
+ from huggingface_hub import snapshot_download
19
 
20
  # --- Configuration ---
21
  LOGGING_FORMAT = '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
22
  logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging.StreamHandler(sys.stdout)])
23
  logger = logging.getLogger("TachiwinDocOCR")
24
 
25
+ REPO_ID = "tachiwin/Tachiwin-OCR-1.5"
26
  # The YAML file provided by the user or generated
27
  CONFIG_FILE = "default.yaml"
28
  # Fallback generated if default.yaml doesn't exist
 
63
  try:
64
  logger.info("πŸš€ Starting Tachiwin Doc OCR Pipeline Setup...")
65
 
66
+ # 1. Download Model from Hugging Face Hub
67
+ logger.info(f"πŸ“¦ Downloading custom model from HF: {REPO_ID}...")
68
+ local_model_path = snapshot_download(repo_id=REPO_ID)
69
+ logger.info(f"βœ… Model downloaded to: {local_model_path}")
70
+
71
  target_config = None
72
  # Use existing default.yaml if present
73
  if os.path.exists(CONFIG_FILE):
 
96
  with open(target_config, 'r', encoding='utf-8') as f:
97
  config_data = yaml.safe_load(f)
98
 
99
+ # Update model_dir to the LOCAL path
100
  updated = False
101
  def update_config(d):
102
  nonlocal updated
103
  for k, v in d.items():
104
  if k == 'VLRecognition' and isinstance(v, dict):
105
+ if v.get('model_dir') != local_model_path:
106
+ logger.info(f"πŸ”§ Updating VLRecognition model_dir to local path: {local_model_path}")
107
+ v['model_dir'] = local_model_path
108
  updated = True
109
  elif isinstance(v, dict):
110
  update_config(v)
 
123
 
124
  # Initialize pipeline using the recommended PaddleX way
125
  logger.info(f"βš™οΈ Initializing pipeline with create_pipeline(pipeline={target_config})")
 
126
  pipeline = create_pipeline(pipeline=target_config)
127
  logger.info("✨ Pipeline initialized successfully!")
128
 
 
258
  .output-box { border: 1px solid #e2e8f0 !important; border-radius: 1rem !important; }
259
  """
260
 
261
+ with gr.Blocks() as demo:
262
  gr.HTML(
263
  """
264
  <div class="app-header">
 
270
 
271
  with gr.Row(elem_classes=["notice"]):
272
  status_text = "Initialized" if pipeline else "Initializing/Failed"
273
+ gr.Markdown(f"**⚑ Status:** {status_text} | **Model:** `{REPO_ID}` | **Hardware:** CPU")
274
 
275
  with gr.Tabs():
276
  with gr.Tab("πŸ“„ Full Document Parsing"):
 
351
  gr.Markdown("--- \n *Tachiwin Project: Indigenous Languages of Mexico.*")
352
 
353
  if __name__ == "__main__":
354
+ demo.queue().launch(theme=gr.themes.Ocean(), css=custom_css)
requirements.txt CHANGED
@@ -11,3 +11,4 @@ librosa
11
  pandas
12
  torch
13
  transformers
 
 
11
  pandas
12
  torch
13
  transformers
14
+ huggingface_hub