Luis J Camargo committed on
Commit
0297456
Β·
1 Parent(s): acf8835

feat: Download OCR model locally from Hugging Face, update pipeline configuration to use the local path, and add `archive` to `.gitignore`.

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +14 -9
  3. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ archive
app.py CHANGED
@@ -15,13 +15,14 @@ import gradio as gr
15
  from PIL import Image
16
  import requests
17
  from urllib.parse import urlparse
 
18
 
19
  # --- Configuration ---
20
  LOGGING_FORMAT = '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
21
  logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging.StreamHandler(sys.stdout)])
22
  logger = logging.getLogger("TachiwinDocOCR")
23
 
24
- CUSTOM_MODEL_PATH = "tachiwin/Tachiwin-OCR-1.5"
25
  # The YAML file provided by the user or generated
26
  CONFIG_FILE = "default.yaml"
27
  # Fallback generated if default.yaml doesn't exist
@@ -62,6 +63,11 @@ def setup_pipeline():
62
  try:
63
  logger.info("πŸš€ Starting Tachiwin Doc OCR Pipeline Setup...")
64
 
 
 
 
 
 
65
  target_config = None
66
  # Use existing default.yaml if present
67
  if os.path.exists(CONFIG_FILE):
@@ -90,15 +96,15 @@ def setup_pipeline():
90
  with open(target_config, 'r', encoding='utf-8') as f:
91
  config_data = yaml.safe_load(f)
92
 
93
- # Update model_dir if it's not set correctly
94
  updated = False
95
  def update_config(d):
96
  nonlocal updated
97
  for k, v in d.items():
98
  if k == 'VLRecognition' and isinstance(v, dict):
99
- if v.get('model_dir') != CUSTOM_MODEL_PATH:
100
- logger.info(f"πŸ”§ Updating VLRecognition model_dir: {v.get('model_dir')} -> {CUSTOM_MODEL_PATH}")
101
- v['model_dir'] = CUSTOM_MODEL_PATH
102
  updated = True
103
  elif isinstance(v, dict):
104
  update_config(v)
@@ -117,7 +123,6 @@ def setup_pipeline():
117
 
118
  # Initialize pipeline using the recommended PaddleX way
119
  logger.info(f"βš™οΈ Initializing pipeline with create_pipeline(pipeline={target_config})")
120
- # According to help: create_pipeline can take a path to yaml
121
  pipeline = create_pipeline(pipeline=target_config)
122
  logger.info("✨ Pipeline initialized successfully!")
123
 
@@ -253,7 +258,7 @@ body, .gradio-container { font-family: 'Inter', system-ui, sans-serif; }
253
  .output-box { border: 1px solid #e2e8f0 !important; border-radius: 1rem !important; }
254
  """
255
 
256
- with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
257
  gr.HTML(
258
  """
259
  <div class="app-header">
@@ -265,7 +270,7 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
265
 
266
  with gr.Row(elem_classes=["notice"]):
267
  status_text = "Initialized" if pipeline else "Initializing/Failed"
268
- gr.Markdown(f"**⚑ Status:** {status_text} | **Model:** `{CUSTOM_MODEL_PATH}` | **Hardware:** CPU")
269
 
270
  with gr.Tabs():
271
  with gr.Tab("πŸ“„ Full Document Parsing"):
@@ -346,4 +351,4 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
346
  gr.Markdown("--- \n *Tachiwin Project: Indigenous Languages of Mexico.*")
347
 
348
  if __name__ == "__main__":
349
- demo.queue().launch()
 
15
  from PIL import Image
16
  import requests
17
  from urllib.parse import urlparse
18
+ from huggingface_hub import snapshot_download
19
 
20
  # --- Configuration ---
21
  LOGGING_FORMAT = '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
22
  logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging.StreamHandler(sys.stdout)])
23
  logger = logging.getLogger("TachiwinDocOCR")
24
 
25
+ REPO_ID = "tachiwin/Tachiwin-OCR-1.5"
26
  # The YAML file provided by the user or generated
27
  CONFIG_FILE = "default.yaml"
28
  # Fallback generated if default.yaml doesn't exist
 
63
  try:
64
  logger.info("πŸš€ Starting Tachiwin Doc OCR Pipeline Setup...")
65
 
66
+ # 1. Download Model from Hugging Face Hub
67
+ logger.info(f"πŸ“¦ Downloading custom model from HF: {REPO_ID}...")
68
+ local_model_path = snapshot_download(repo_id=REPO_ID)
69
+ logger.info(f"βœ… Model downloaded to: {local_model_path}")
70
+
71
  target_config = None
72
  # Use existing default.yaml if present
73
  if os.path.exists(CONFIG_FILE):
 
96
  with open(target_config, 'r', encoding='utf-8') as f:
97
  config_data = yaml.safe_load(f)
98
 
99
+ # Update model_dir to the LOCAL path
100
  updated = False
101
  def update_config(d):
102
  nonlocal updated
103
  for k, v in d.items():
104
  if k == 'VLRecognition' and isinstance(v, dict):
105
+ if v.get('model_dir') != local_model_path:
106
+ logger.info(f"πŸ”§ Updating VLRecognition model_dir to local path: {local_model_path}")
107
+ v['model_dir'] = local_model_path
108
  updated = True
109
  elif isinstance(v, dict):
110
  update_config(v)
 
123
 
124
  # Initialize pipeline using the recommended PaddleX way
125
  logger.info(f"βš™οΈ Initializing pipeline with create_pipeline(pipeline={target_config})")
 
126
  pipeline = create_pipeline(pipeline=target_config)
127
  logger.info("✨ Pipeline initialized successfully!")
128
 
 
258
  .output-box { border: 1px solid #e2e8f0 !important; border-radius: 1rem !important; }
259
  """
260
 
261
+ with gr.Blocks() as demo:
262
  gr.HTML(
263
  """
264
  <div class="app-header">
 
270
 
271
  with gr.Row(elem_classes=["notice"]):
272
  status_text = "Initialized" if pipeline else "Initializing/Failed"
273
+ gr.Markdown(f"**⚑ Status:** {status_text} | **Model:** `{REPO_ID}` | **Hardware:** CPU")
274
 
275
  with gr.Tabs():
276
  with gr.Tab("πŸ“„ Full Document Parsing"):
 
351
  gr.Markdown("--- \n *Tachiwin Project: Indigenous Languages of Mexico.*")
352
 
353
  if __name__ == "__main__":
354
+ demo.queue().launch(theme=gr.themes.Ocean(), css=custom_css)
requirements.txt CHANGED
@@ -11,3 +11,4 @@ librosa
11
  pandas
12
  torch
13
  transformers
 
 
11
  pandas
12
  torch
13
  transformers
14
+ huggingface_hub