Luis J Camargo committed on
Commit
3d8f756
·
1 Parent(s): 7a79e26

first commit

Browse files
Files changed (2) hide show
  1. app.py +197 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import atexit
2
+ import functools
3
+ from queue import Queue
4
+ from threading import Event, Thread
5
+ from huggingface_hub import snapshot_download
6
+ import os
7
+
8
+ from paddleocr import PaddleOCR
9
+ import gradio as gr
10
+
11
+
12
# Upper bound on simultaneous inference calls; handed to gr.Interface as
# its concurrency_limit.
CONCURRENCY_LIMIT: int = 4
13
+
14
+
15
class PaddleOCRModelManager:
    """Serve inference requests from a fixed pool of worker threads.

    Every worker thread builds its own model with ``model_factory`` and then
    handles requests taken from one shared queue, so a model instance is only
    ever used by the thread that created it.
    """

    def __init__(self, num_workers, model_factory):
        """Start ``num_workers`` workers, building their models one at a time.

        Args:
            num_workers: Number of worker threads (and model instances).
            model_factory: Zero-argument callable returning an object that
                has a ``predict(*args, **kwargs)`` method.

        Raises:
            Exception: Whatever ``model_factory`` raised inside a worker.
                Workers that were already running are stopped first.
        """
        super().__init__()
        self._model_factory = model_factory
        self._queue = Queue()
        self._workers = []
        self._model_initialized_event = Event()
        # Set by a worker whose model_factory call failed, read by __init__.
        self._init_error = None
        for _ in range(num_workers):
            # Daemon threads on purpose: CPython joins non-daemon threads
            # before it runs atexit handlers, so a non-daemon worker blocked
            # in Queue.get() would hang interpreter exit and a close()
            # registered with atexit would never get to run.
            worker = Thread(target=self._worker, daemon=True)
            worker.start()
            # Wait for this worker's model before starting the next one so
            # the models are constructed sequentially.
            self._model_initialized_event.wait()
            self._model_initialized_event.clear()
            if self._init_error is not None:
                error, self._init_error = self._init_error, None
                worker.join()
                self.close()
                raise error
            self._workers.append(worker)

    def infer(self, *args, **kwargs):
        """Run ``model.predict(*args, **kwargs)`` on a worker and wait for it.

        Returns:
            Whatever the worker's ``predict`` call returned.

        Raises:
            Exception: The exception raised by ``predict`` in the worker,
                re-raised in the calling thread.
        """
        # One-slot queue used as a per-request reply channel.
        result_queue = Queue(maxsize=1)
        self._queue.put((args, kwargs, result_queue))
        success, payload = result_queue.get()
        if success:
            return payload
        raise payload

    def close(self):
        """Stop all workers and wait for them to finish.

        Safe to call more than once; later calls find no workers to stop.
        """
        workers, self._workers = self._workers, []
        # One ``None`` sentinel per worker; each worker consumes exactly one.
        for _ in workers:
            self._queue.put(None)
        for worker in workers:
            worker.join()

    def _worker(self):
        """Build a model, then serve queued requests until a sentinel arrives."""
        try:
            model = self._model_factory()
        except Exception as e:
            # Hand the failure to __init__ instead of leaving it waiting on
            # the event forever.
            self._init_error = e
            return
        finally:
            # Signal __init__ on success and on failure alike.
            self._model_initialized_event.set()
        while True:
            item = self._queue.get()
            if item is None:
                break
            args, kwargs, result_queue = item
            try:
                result = model.predict(*args, **kwargs)
                result_queue.put((True, result))
            except Exception as e:
                result_queue.put((False, e))
            finally:
                self._queue.task_done()
59
+
60
+
61
def download_model():
    """Fetch the model snapshot from the Hugging Face Hub into a local folder.

    Returns:
        The local directory path the snapshot was downloaded into.
    """
    # NOTE(review): the original marks this repo id with "Update this!" --
    # presumably it should point at the fine-tuned Tachiwin weights; confirm.
    repo_id = "PaddlePaddle/PaddleOCR-VL"
    target_dir = "./tachiwin_model"

    print(f"Downloading Tachiwin model from {repo_id}...")
    download_options = {
        "repo_id": repo_id,
        "local_dir": target_dir,
        "local_dir_use_symlinks": False,
    }
    snapshot_download(**download_options)
    print(f"Model downloaded successfully to {target_dir}")

    return target_dir
76
+
77
+
78
def create_model():
    """Build one PaddleOCR-VL pipeline backed by the downloaded weights.

    Returns:
        A ``PaddleOCRVL`` pipeline configured to run on CPU.
    """
    # Imported locally so this function is self-contained; the file-level
    # import only brings in ``PaddleOCR``.
    from paddleocr import PaddleOCRVL

    model_dir = download_model()

    # ``vl_rec_model_name`` / ``vl_rec_model_dir`` are options of the
    # PaddleOCR-VL pipeline (``PaddleOCRVL``), not of the general ``PaddleOCR``
    # pipeline. The 2.x-era ``use_gpu`` / ``show_log`` flags are rejected as
    # unknown arguments by PaddleOCR 3.x; the device is chosen with ``device``.
    # NOTE(review): confirm against the installed paddleocr version.
    return PaddleOCRVL(
        vl_rec_model_name="PaddleOCR-VL-0.9B",
        vl_rec_model_dir=model_dir,
        device="cpu",
    )
89
+
90
+
91
# Build the shared pool of two OCR workers once, at import time.
print("Initializing Tachiwin Indigenous Languages OCR...")
model_manager = PaddleOCRModelManager(num_workers=2, model_factory=create_model)
print("Model ready!")
95
+
96
+
97
# atexit.register returns the function it is given, so using it as a
# decorator both registers the hook and keeps the name bound to the function.
@atexit.register
def close_model_manager():
    """Shut down the OCR worker pool when the interpreter exits."""
    model_manager.close()
102
+
103
+
104
_NO_TEXT_MESSAGE = "No text detected in the image."


def _escape_table_cell(value):
    """Return ``value`` as text that fits in a one-line Markdown table cell."""
    # An unescaped "|" or a line break would split the cell or end the row.
    return " ".join(str(value).replace("|", "\\|").splitlines())


def _extract_markdown(result):
    """Return the ready-made Markdown carried by ``result``, or ``None``.

    NOTE(review): assumes document-parsing results expose a ``markdown``
    attribute that is a dict with a ``"markdown_texts"`` string -- confirm
    against the installed paddleocr version.
    """
    markdown = getattr(result, "markdown", None)
    if isinstance(markdown, dict):
        markdown = markdown.get("markdown_texts")
    return markdown if isinstance(markdown, str) else None


def _recognized_lines(result):
    """Return ``(text, score)`` pairs from one page result.

    Handles a dict-like result with ``rec_texts`` / ``rec_scores`` keys
    (NOTE(review): presumably the PaddleOCR 3.x shape -- confirm) and the
    ``[box, (text, score)]`` list shape the original code indexed into.
    """
    if isinstance(result, dict):
        texts = result.get("rec_texts")
        if texts is None:
            texts = []
        scores = result.get("rec_scores")
        # Explicit ``is None`` checks: scores may be an array-like whose
        # truth value is ambiguous.
        if scores is None:
            scores = [None] * len(texts)
        return list(zip(texts, scores))
    return [(line[1][0], line[1][1]) for line in result]


def inference(img):
    """Run OCR on ``img`` and return the extracted text as Markdown.

    Args:
        img: Value supplied by the Gradio image input (configured with
            ``type='filepath'``), or ``None`` when nothing was uploaded.

    Returns:
        A Markdown string: the result's own Markdown when it provides one,
        otherwise a text/confidence table, or a short message when no image
        was given, nothing was recognized, or processing failed.
    """
    if img is None:
        return "Please upload an image."

    # UI boundary: report any failure as text instead of raising into Gradio.
    try:
        pages = model_manager.infer(img)
        # Only the first page/result is shown; an empty prediction list is
        # treated as "nothing detected" rather than an indexing error.
        result = pages[0] if pages else None
        if not result:
            return _NO_TEXT_MESSAGE

        markdown_text = _extract_markdown(result)
        if markdown_text is not None:
            return markdown_text if markdown_text.strip() else _NO_TEXT_MESSAGE

        rows = _recognized_lines(result)
        if not rows:
            return _NO_TEXT_MESSAGE

        # Extract text and format as a Markdown table.
        output_lines = [
            "# Extracted Text\n",
            "| Text | Confidence |",
            "|------|-----------|",
        ]
        for text, score in rows:
            confidence = "n/a" if score is None else f"{float(score):.2%}"
            output_lines.append(f"| {_escape_table_cell(text)} | {confidence} |")

        return "\n".join(output_lines)

    except Exception as e:
        return f"Error during OCR processing: {str(e)}"
129
+
130
+
131
# Static page content for the Gradio UI.
title = '🌎 Tachiwin Indigenous Languages OCR'

description = '''
### PaddleOCR-VL Fine-tuned for the 68 Indigenous Languages of Mexico

This model represents a **world first in tech access and linguistic rights**, specifically trained to recognize
the diverse character and glyph repertoire of Mexico's 68 indigenous languages.

**How to use:** Simply upload an image containing text in any Mexican indigenous language, and the model will
detect and recognize the text.

🔗 [PaddleOCR Documentation](https://github.com/PaddlePaddle/PaddleOCR)
'''

# (image path, language, description) for each bundled example image.
_example_catalog = [
    ('example_nahuatl.jpg', 'Náhuatl', 'Classical Nahuatl text with traditional glyphs'),
    ('example_maya.jpg', 'Maya (Yucatec)', 'Contemporary Maya writing with diacritics'),
    ('example_zapoteco.jpg', 'Zapoteco (Istmo)', 'Zapotec text from Oaxaca region'),
]

# Only offer examples whose image file is actually present next to the app.
# NOTE(review): the example images are not part of this change set;
# presumably Gradio fails on an example path that does not exist -- confirm.
_available_examples = [
    entry for entry in _example_catalog if os.path.isfile(entry[0])
]

examples = [[path] for path, _language, _note in _available_examples]

# Table describing the available examples; empty when there are none.
if _available_examples:
    example_labels = "\n".join(
        [
            "",
            "### Example Images:",
            "| Image | Language | Description |",
            "|-------|----------|-------------|",
        ]
        + [
            f"| {path} | {language} | {note} |"
            for path, language, note in _available_examples
        ]
        + [""]
    )
else:
    example_labels = ""

css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"

# The language families are a bullet list: consecutive plain lines would be
# merged into a single paragraph by Markdown.
article = f"""
{example_labels}

### About Tachiwin

**Tachiwin** (from Totonac - "Language") is dedicated to bridging
the digital divide for indigenous languages of Mexico through AI technology.

### Supported Language Families

- **Uto-Aztecan:** Náhuatl, Yaqui, Mayo, Huichol, Tepehuán, Tarahumara
- **Mayan:** Maya, Tzeltal, Tzotzil, Chol, Tojolabal, Q'anjob'al, Mam
- **Oto-Manguean:** Zapoteco, Mixteco, Otomí, Mazateco, Chinanteco, Triqui
- **Totonac-Tepehua:** Totonaco, Tepehua
- **Mixe-Zoque:** Mixe, Zoque, Popoluca
- **Other:** Purépecha, Huave, Seri, Kickapoo, Kiliwa

...covering all 68 officially recognized indigenous languages of Mexico.

---

Made with ❤️ for linguistic diversity and indigenous rights
"""

demo = gr.Interface(
    inference,
    [
        gr.Image(type='filepath', label='Input'),
    ],
    gr.Markdown(label='Output'),
    title=title,
    description=description,
    # Pass None rather than an empty list when no example image is available.
    examples=examples or None,
    cache_examples=False,
    css=css,
    concurrency_limit=CONCURRENCY_LIMIT,
    article=article,
)
demo.launch(debug=False)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ paddlepaddle==3.0.0
2
+ paddleocr[doc-parser]>=3.0.0
3
+ gradio>=4.0.0
4
+ huggingface-hub>=0.19.0
5
+ Pillow>=10.0.0
6
+ opencv-python-headless>=4.8.0
7
+ numpy>=1.23.0
8
+ safetensors>=0.4.0
9
+ transformers>=4.30.0