Spaces:
Running
Running
Luis J Camargo committed on
Commit ·
0297456
1
Parent(s): acf8835
feat: Download OCR model locally from Hugging Face, update pipeline configuration to use the local path, and add `archive` to `.gitignore`.
Browse files- .gitignore +1 -0
- app.py +14 -9
- requirements.txt +1 -0
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
archive
|
app.py
CHANGED
|
@@ -15,13 +15,14 @@ import gradio as gr
|
|
| 15 |
from PIL import Image
|
| 16 |
import requests
|
| 17 |
from urllib.parse import urlparse
|
|
|
|
| 18 |
|
| 19 |
# --- Configuration ---
|
| 20 |
LOGGING_FORMAT = '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
|
| 21 |
logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging.StreamHandler(sys.stdout)])
|
| 22 |
logger = logging.getLogger("TachiwinDocOCR")
|
| 23 |
|
| 24 |
-
|
| 25 |
# The YAML file provided by the user or generated
|
| 26 |
CONFIG_FILE = "default.yaml"
|
| 27 |
# Fallback generated if default.yaml doesn't exist
|
|
@@ -62,6 +63,11 @@ def setup_pipeline():
|
|
| 62 |
try:
|
| 63 |
logger.info("🚀 Starting Tachiwin Doc OCR Pipeline Setup...")
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
target_config = None
|
| 66 |
# Use existing default.yaml if present
|
| 67 |
if os.path.exists(CONFIG_FILE):
|
|
@@ -90,15 +96,15 @@ def setup_pipeline():
|
|
| 90 |
with open(target_config, 'r', encoding='utf-8') as f:
|
| 91 |
config_data = yaml.safe_load(f)
|
| 92 |
|
| 93 |
-
# Update model_dir
|
| 94 |
updated = False
|
| 95 |
def update_config(d):
|
| 96 |
nonlocal updated
|
| 97 |
for k, v in d.items():
|
| 98 |
if k == 'VLRecognition' and isinstance(v, dict):
|
| 99 |
-
if v.get('model_dir') !=
|
| 100 |
-
logger.info(f"🔧 Updating VLRecognition model_dir
|
| 101 |
-
v['model_dir'] =
|
| 102 |
updated = True
|
| 103 |
elif isinstance(v, dict):
|
| 104 |
update_config(v)
|
|
@@ -117,7 +123,6 @@ def setup_pipeline():
|
|
| 117 |
|
| 118 |
# Initialize pipeline using the recommended PaddleX way
|
| 119 |
logger.info(f"⚙️ Initializing pipeline with create_pipeline(pipeline={target_config})")
|
| 120 |
-
# According to help: create_pipeline can take a path to yaml
|
| 121 |
pipeline = create_pipeline(pipeline=target_config)
|
| 122 |
logger.info("✨ Pipeline initialized successfully!")
|
| 123 |
|
|
@@ -253,7 +258,7 @@ body, .gradio-container { font-family: 'Inter', system-ui, sans-serif; }
|
|
| 253 |
.output-box { border: 1px solid #e2e8f0 !important; border-radius: 1rem !important; }
|
| 254 |
"""
|
| 255 |
|
| 256 |
-
with gr.Blocks(
|
| 257 |
gr.HTML(
|
| 258 |
"""
|
| 259 |
<div class="app-header">
|
|
@@ -265,7 +270,7 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
|
|
| 265 |
|
| 266 |
with gr.Row(elem_classes=["notice"]):
|
| 267 |
status_text = "Initialized" if pipeline else "Initializing/Failed"
|
| 268 |
-
gr.Markdown(f"**⚡ Status:** {status_text} | **Model:** `{
|
| 269 |
|
| 270 |
with gr.Tabs():
|
| 271 |
with gr.Tab("📄 Full Document Parsing"):
|
|
@@ -346,4 +351,4 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
|
|
| 346 |
gr.Markdown("--- \n *Tachiwin Project: Indigenous Languages of Mexico.*")
|
| 347 |
|
| 348 |
if __name__ == "__main__":
|
| 349 |
-
demo.queue().launch()
|
|
|
|
| 15 |
from PIL import Image
|
| 16 |
import requests
|
| 17 |
from urllib.parse import urlparse
|
| 18 |
+
from huggingface_hub import snapshot_download
|
| 19 |
|
| 20 |
# --- Configuration ---
|
| 21 |
LOGGING_FORMAT = '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
|
| 22 |
logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging.StreamHandler(sys.stdout)])
|
| 23 |
logger = logging.getLogger("TachiwinDocOCR")
|
| 24 |
|
| 25 |
+
REPO_ID = "tachiwin/Tachiwin-OCR-1.5"
|
| 26 |
# The YAML file provided by the user or generated
|
| 27 |
CONFIG_FILE = "default.yaml"
|
| 28 |
# Fallback generated if default.yaml doesn't exist
|
|
|
|
| 63 |
try:
|
| 64 |
logger.info("🚀 Starting Tachiwin Doc OCR Pipeline Setup...")
|
| 65 |
|
| 66 |
+
# 1. Download Model from Hugging Face Hub
|
| 67 |
+
logger.info(f"📦 Downloading custom model from HF: {REPO_ID}...")
|
| 68 |
+
local_model_path = snapshot_download(repo_id=REPO_ID)
|
| 69 |
+
logger.info(f"✅ Model downloaded to: {local_model_path}")
|
| 70 |
+
|
| 71 |
target_config = None
|
| 72 |
# Use existing default.yaml if present
|
| 73 |
if os.path.exists(CONFIG_FILE):
|
|
|
|
| 96 |
with open(target_config, 'r', encoding='utf-8') as f:
|
| 97 |
config_data = yaml.safe_load(f)
|
| 98 |
|
| 99 |
+
# Update model_dir to the LOCAL path
|
| 100 |
updated = False
|
| 101 |
def update_config(d):
|
| 102 |
nonlocal updated
|
| 103 |
for k, v in d.items():
|
| 104 |
if k == 'VLRecognition' and isinstance(v, dict):
|
| 105 |
+
if v.get('model_dir') != local_model_path:
|
| 106 |
+
logger.info(f"🔧 Updating VLRecognition model_dir to local path: {local_model_path}")
|
| 107 |
+
v['model_dir'] = local_model_path
|
| 108 |
updated = True
|
| 109 |
elif isinstance(v, dict):
|
| 110 |
update_config(v)
|
|
|
|
| 123 |
|
| 124 |
# Initialize pipeline using the recommended PaddleX way
|
| 125 |
logger.info(f"⚙️ Initializing pipeline with create_pipeline(pipeline={target_config})")
|
|
|
|
| 126 |
pipeline = create_pipeline(pipeline=target_config)
|
| 127 |
logger.info("✨ Pipeline initialized successfully!")
|
| 128 |
|
|
|
|
| 258 |
.output-box { border: 1px solid #e2e8f0 !important; border-radius: 1rem !important; }
|
| 259 |
"""
|
| 260 |
|
| 261 |
+
with gr.Blocks() as demo:
|
| 262 |
gr.HTML(
|
| 263 |
"""
|
| 264 |
<div class="app-header">
|
|
|
|
| 270 |
|
| 271 |
with gr.Row(elem_classes=["notice"]):
|
| 272 |
status_text = "Initialized" if pipeline else "Initializing/Failed"
|
| 273 |
+
gr.Markdown(f"**⚡ Status:** {status_text} | **Model:** `{REPO_ID}` | **Hardware:** CPU")
|
| 274 |
|
| 275 |
with gr.Tabs():
|
| 276 |
with gr.Tab("📄 Full Document Parsing"):
|
|
|
|
| 351 |
gr.Markdown("--- \n *Tachiwin Project: Indigenous Languages of Mexico.*")
|
| 352 |
|
| 353 |
if __name__ == "__main__":
|
| 354 |
+
demo.queue().launch(theme=gr.themes.Ocean(), css=custom_css)
|
requirements.txt
CHANGED
|
@@ -11,3 +11,4 @@ librosa
|
|
| 11 |
pandas
|
| 12 |
torch
|
| 13 |
transformers
|
|
|
|
|
|
| 11 |
pandas
|
| 12 |
torch
|
| 13 |
transformers
|
| 14 |
+
huggingface_hub
|