Luis J Camargo committed on
Commit
8d56937
·
1 Parent(s): aa9595c

attempt for text

Browse files
Files changed (2) hide show
  1. app.py +22 -4
  2. lac.jpg +0 -0
app.py CHANGED
@@ -57,6 +57,22 @@ class PaddleOCRModelManager(object):
57
  finally:
58
  self._queue.task_done()
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  def create_model():
62
  """Initialize PaddleOCR-VL with the fine-tuned Tachiwin model"""
@@ -97,10 +113,10 @@ def inference(img):
97
  extracted_texts = []
98
 
99
  for page in result:
100
- if isinstance(page, dict) and 'parsing_res_list' in page:
101
- for block in page['parsing_res_list']:
102
- if 'content' in block and block['content']:
103
- extracted_texts.append(block['content'])
104
 
105
  if not extracted_texts:
106
  return "No text could be extracted from the image."
@@ -136,6 +152,7 @@ examples = [
136
  ['mir.jpg'],
137
  ['ote.jpg'],
138
  ['otm.jpg'],
 
139
  ]
140
 
141
  example_labels = """
@@ -147,6 +164,7 @@ example_labels = """
147
  | maj.jpg | Mazatec, Jalapa de Díaz | Kui xi já maña̱ xi ngakjá ku̱a̱kúya ni xi ts'e̱ Nti̱a̱ná. Kj'a̱í ni xi ku̱a̱kúyanu̱u, kui xi ts'i̱ínkatsúnnu̱u. Najmi ts'i̱ínkie yjoho̱ nga Nda̱ Nti̱a̱ná xi ts'asjejihi̱n. B'a̱ ts'ín ki̱tsa̱ ts'i̱ín nibánehe̱ ra̱ yjoho̱ nga n'e̱kje. Nkjin xi i̱ncha ts'i̱ín ni xi i̱ncha ts'ín jóo̱, ni xi tu̱ subahá maná. |
148
  | mir.jpg | Isthmus Mixe | Cab jaduhṉ yhahixøꞌøy coo jaꞌa naam̱dägøꞌøbä tiúnät wiindsǿṉ maa jaꞌa Diostøjcän, coo jaduhṉ ñäꞌä niguiumayǿøjät. |
149
  | otm.jpg | Eastern Highland Otomi | ma'ueque ma mbʉihʉ. Nɛ gätho gahʉ dyʉ mbäją gahʉ bi 'dac ma ts |
 
150
  """
151
 
152
  css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;} .output_markdown {min-height: 30rem !important;}"
 
57
  finally:
58
  self._queue.task_done()
59
 
60
+ def download_model():
61
+ """Download the fine-tuned Tachiwin model from Hugging Face"""
62
+ model_repo = "tachiwin/PaddleOCR-VL-Tachiwin" # Update this!
63
+ model_dir = "./tachiwin_model"
64
+
65
+ print(f"Downloading Tachiwin model from {model_repo}...")
66
+
67
+ snapshot_download(
68
+ repo_id=model_repo,
69
+ local_dir=model_dir,
70
+ local_dir_use_symlinks=False
71
+ )
72
+
73
+ print(f"Model downloaded successfully to {model_dir}")
74
+ return model_dir
75
+
76
 
77
  def create_model():
78
  """Initialize PaddleOCR-VL with the fine-tuned Tachiwin model"""
 
113
  extracted_texts = []
114
 
115
  for page in result:
116
+ if hasattr(page, 'parsing_res_list'):
117
+ for block in page.parsing_res_list:
118
+ if hasattr(block, 'content') and block.content:
119
+ extracted_texts.append(block.content)
120
 
121
  if not extracted_texts:
122
  return "No text could be extracted from the image."
 
152
  ['mir.jpg'],
153
  ['ote.jpg'],
154
  ['otm.jpg'],
155
+ ['lac.jpg'],
156
  ]
157
 
158
  example_labels = """
 
164
  | maj.jpg | Mazatec, Jalapa de Díaz | Kui xi já maña̱ xi ngakjá ku̱a̱kúya ni xi ts'e̱ Nti̱a̱ná. Kj'a̱í ni xi ku̱a̱kúyanu̱u, kui xi ts'i̱ínkatsúnnu̱u. Najmi ts'i̱ínkie yjoho̱ nga Nda̱ Nti̱a̱ná xi ts'asjejihi̱n. B'a̱ ts'ín ki̱tsa̱ ts'i̱ín nibánehe̱ ra̱ yjoho̱ nga n'e̱kje. Nkjin xi i̱ncha ts'i̱ín ni xi i̱ncha ts'ín jóo̱, ni xi tu̱ subahá maná. |
165
  | mir.jpg | Isthmus Mixe | Cab jaduhṉ yhahixøꞌøy coo jaꞌa naam̱dägøꞌøbä tiúnät wiindsǿṉ maa jaꞌa Diostøjcän, coo jaduhṉ ñäꞌä niguiumayǿøjät. |
166
  | otm.jpg | Eastern Highland Otomi | ma'ueque ma mbʉihʉ. Nɛ gätho gahʉ dyʉ mbäją gahʉ bi 'dac ma ts |
167
+ | lac.jpg | Lacandon | wa quin chen u'yicob a t'ʌnex, wa yʌn in wu'yicob a ba' cu ya'aric C'uj? Tin t'ʌn, mʌ' in wu'yicob a t'ʌnex, yʌn in wu'yicob a ba' cu ya'aric C'uj. Yʌn in man in wa'aricob a ba' caj in wirajob yejer a ba' caj in wu'yajob ―baxuc tu ya'araj Pedro ti' u jach ts'urirob. Jeroj tune', chich t'ʌn Pedro yejer Juan ten u jach ts'urirob u winiquirob judío, caj ts'oquij caj cha'b u binob ten u jach ts'urirob. |
168
  """
169
 
170
  css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;} .output_markdown {min-height: 30rem !important;}"
lac.jpg ADDED