kasper-boy committed on
Commit
5b8c56c
·
verified ·
1 Parent(s): 7f96a9b

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +182 -0
  2. example_file.pdf +0 -0
  3. pdf2text.py +17 -1
  4. requirements.txt +1 -1
app.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ import contextlib
5
+
6
+ logging.basicConfig(
7
+ level=logging.INFO,
8
+ format="%(asctime)s - %(levelname)s - %(message)s",
9
+ )
10
+
11
+
12
+ import gradio as gr
13
+ import nltk
14
+ import torch
15
+
16
+ from pdf2text import *
17
+
18
+ _here = Path(__file__).parent
19
+
20
+ nltk.download("stopwords") # TODO=find where this requirement originates from
21
+
22
+
23
def load_uploaded_file(file_obj, temp_dir: Path = None):
    """
    load_uploaded_file - store an uploaded file and report where it lives

    Args:
        file_obj (POTENTIALLY list): Gradio file object (anything exposing a
            ``.name`` path attribute), optionally wrapped in a list; only the
            first list item is used
        temp_dir (Path, optional): folder to copy the upload into; it is
            created when missing. Defaults to None, which leaves the upload
            in place and returns its own path.

    Returns:
        str, absolute path of the stored file, or None if reading/writing failed
    """

    # check if mysterious file object is a list
    if isinstance(file_obj, list):
        file_obj = file_obj[0]
    file_path = Path(file_obj.name)

    if temp_dir is None:
        # keep creating the default scratch folder, as earlier versions did
        (_here / "temp").mkdir(exist_ok=True)

    try:
        # read through a context manager so the handle is always closed
        with open(file_path, "rb") as src:
            pdf_bytes_obj = src.read()

        if temp_dir:
            target_dir = Path(temp_dir)
            target_dir.mkdir(parents=True, exist_ok=True)
            temp_path = target_dir / file_path.name
            # save to PDF file
            with open(temp_path, "wb") as f:
                f.write(pdf_bytes_obj)
        else:
            # no copy requested: never truncate and rewrite the upload onto itself
            temp_path = file_path

        logging.info(f"Saved uploaded file to {temp_path}")
        return str(temp_path.resolve())

    except Exception as e:
        # best-effort contract: report the problem and signal failure with None
        logging.error(f"Trying to load file with path {file_path}, error: {e}")
        print(f"Trying to load file with path {file_path}, error: {e}")
        return None
56
+
57
+
58
def convert_PDF(
    pdf_obj,
    language: str = "en",
    max_pages=20,
):
    """
    convert_PDF - convert a PDF file to text

    Args:
        pdf_obj: the PDF to convert - a Gradio file object exposing a ``.name``
            path (optionally wrapped in a list, first item used), or a
            str/Path pointing at the file
        language (str, optional): Language to use for OCR. Defaults to "en".
            Accepted for interface compatibility; not used by this function yet.
        max_pages (int, optional): page limit passed to convert_PDF_to_Text.
            Defaults to 20.

    Returns:
        tuple of (str, str, str or None): the extracted text (or an error
        message), an HTML status snippet, and the name of the text file
        written to the working directory (None when the input was rejected)
    """
    # imported locally so the block does not depend on a new file-level import
    from html import escape

    # clear local text cache
    rm_local_text_files()
    # the OCR model is created at module level when the app is started as a script
    global ocr_model
    st = time.perf_counter()
    if isinstance(pdf_obj, list):
        pdf_obj = pdf_obj[0]
    # plain paths are accepted as well as objects carrying a .name attribute
    if isinstance(pdf_obj, (str, Path)):
        file_path = Path(pdf_obj)
    else:
        file_path = Path(pdf_obj.name)

    # compare case-insensitively so "scan.PDF" is accepted too
    if file_path.suffix.lower() != ".pdf":
        logging.error(f"File {file_path} is not a PDF file")

        # the file name comes from the user; escape it before embedding in HTML
        html_error = f"""
        <div style="color: red; font-size: 20px; font-weight: bold;">
        File {escape(str(file_path))} is not a PDF file. Please upload a PDF file.
        </div>
        """
        return "File is not a PDF file", html_error, None

    conversion_stats = convert_PDF_to_Text(
        file_path,
        ocr_model=ocr_model,
        max_pages=max_pages,
    )
    converted_txt = conversion_stats["converted_text"]
    num_pages = conversion_stats["num_pages"]
    was_truncated = conversion_stats["truncated"]
    # if alt_lang: # TODO: fix this

    rt = round((time.perf_counter() - st) / 60, 2)
    print(f"Runtime: {rt} minutes")
    status_html = ""
    if was_truncated:
        status_html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
    status_html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"

    # the RESULT_ prefix matches the default pattern of rm_local_text_files
    _output_name = f"RESULT_{file_path.stem}_OCR.txt"
    with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
        f.write(converted_txt)

    return converted_txt, status_html, _output_name
112
+
113
+
114
if __name__ == "__main__":
    # Script entry point: load the OCR model once, then build and serve the Gradio UI.
    logging.info("Starting app")

    use_GPU = torch.cuda.is_available()
    logging.info(f"Using GPU status: {use_GPU}")
    logging.info("Loading OCR model")
    # silence whatever the model loader prints to stdout
    with contextlib.redirect_stdout(None):
        ocr_model = ocr_predictor(
            "db_resnet50",
            "crnn_mobilenet_v3_large",
            pretrained=True,
            assume_straight_pages=True,
        )

    # absolute path of the bundled sample PDF, used as the default upload below
    pdf_obj = _here / "example_file.pdf"
    pdf_obj = str(pdf_obj.resolve())
    _temp_dir = _here / "temp"
    _temp_dir.mkdir(exist_ok=True)

    logging.info("starting demo")
    demo = gr.Blocks()

    with demo:

        gr.Markdown("# PDF to Text")
        gr.Markdown(
            "A basic demo of pdf-to-text conversion using OCR from the [doctr](https://mindee.github.io/doctr/index.html) package"
        )
        gr.Markdown("---")

        with gr.Column():

            gr.Markdown("## Load Inputs")
            gr.Markdown("Upload your own file & replace the default. Files should be < 10MB to avoid upload issues - search for a PDF compressor online as needed.")
            gr.Markdown(
                "_If no file is uploaded, a sample PDF will be used. PDFs are truncated to 20 pages._"
            )

            uploaded_file = gr.File(
                label="Upload a PDF file",
                file_count="single",
                type="file",
                # pass the resolved string path rather than a Path object
                value=pdf_obj,
            )

        gr.Markdown("---")

        with gr.Column():
            gr.Markdown("## Convert PDF to Text")
            convert_button = gr.Button("Convert PDF!", variant="primary")
            out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
            gr.Markdown("### Output")
            OCR_text = gr.Textbox(
                label="OCR Result", placeholder="The OCR text will appear here"
            )
            text_file = gr.File(
                label="Download Text File",
                file_count="single",
                type="file",
                interactive=False,
            )

        # convert_PDF returns (text, status html, output file) in this order
        convert_button.click(
            fn=convert_PDF,
            inputs=[uploaded_file],
            outputs=[OCR_text, out_placeholder, text_file],
        )
    demo.launch(enable_queue=True)
example_file.pdf ADDED
Binary file (290 kB). View file
 
pdf2text.py CHANGED
@@ -1,5 +1,6 @@
1
  # -*- coding: utf-8 -*-
2
  """
 
3
  easyocr.py - A wrapper for easyocr to convert pdf to images to text
4
  """
5
 
@@ -40,6 +41,7 @@ def simple_rename(filepath, target_ext=".txt"):
40
  def rm_local_text_files(name_contains="RESULT_"):
41
  """
42
  rm_local_text_files - remove local text files
 
43
  Args:
44
  name_contains (str, optional): [description]. Defaults to "OCR_".
45
  """
@@ -60,10 +62,12 @@ def corr(
60
  exceptions=["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."],
61
  ) -> str:
62
  """corrects spacing in a string
 
63
  Args:
64
  s (str): the string to correct
65
  add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
66
  exceptions (list, optional): [do not change these substrings]. Defaults to ['e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.'].
 
67
  Returns:
68
  str: the corrected string
69
  """
@@ -90,9 +94,11 @@ def corr(
90
  def fix_punct_spaces(string):
91
  """
92
  fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
 
93
  Parameters
94
  ----------
95
  string : str, required, input string to be corrected
 
96
  Returns
97
  -------
98
  str, corrected string
@@ -108,9 +114,11 @@ def fix_punct_spaces(string):
108
  def clean_OCR(ugly_text: str):
109
  """
110
  clean_OCR - clean the OCR text files.
 
111
  Parameters
112
  ----------
113
  ugly_text : str, required, input string to be cleaned
 
114
  Returns
115
  -------
116
  str, cleaned string
@@ -154,6 +162,7 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
154
 
155
 
156
  """## pdf2text functions
 
157
  """
158
 
159
 
@@ -179,8 +188,10 @@ spell = SpellChecker()
179
  def check_word_spelling(word: str) -> bool:
180
  """
181
  check_word_spelling - check the spelling of a word
 
182
  Args:
183
  word (str): word to check
 
184
  Returns:
185
  bool: True if word is spelled correctly, False if not
186
  """
@@ -193,9 +204,11 @@ def check_word_spelling(word: str) -> bool:
193
  def eval_and_replace(text: str, match_token: str = "- ") -> str:
194
  """
195
  eval_and_replace - conditionally replace all instances of a substring in a string based on whether the eliminated substring results in a valid word
 
196
  Args:
197
  text (str): text to evaluate
198
  match_token (str, optional): token to replace. Defaults to "- ".
 
199
  Returns:
200
  str: text with replaced tokens
201
  """
@@ -228,10 +241,12 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
228
  def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
229
  """
230
  cleantxt_ocr - clean text from OCR
 
231
  Args:
232
  ugly_text (str): text to clean
233
  lower (bool, optional): _description_. Defaults to False.
234
  lang (str, optional): _description_. Defaults to "en".
 
235
  Returns:
236
  str: cleaned text
237
  """
@@ -363,6 +378,7 @@ def translate_text(text, source_l, target_l="en"):
363
 
364
  def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
365
  """translate a document from lang_start to lang_end
 
366
  {'code': 'en', 'name': 'English'},
367
  {'code': 'fr', 'name': 'French'},
368
  {'code': 'de', 'name': 'German'},
@@ -387,4 +403,4 @@ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
387
  f_o.writelines(translated_doc)
388
  if verbose:
389
  print("finished translating the document! - ", datetime.now())
390
- return out_path
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+
4
  easyocr.py - A wrapper for easyocr to convert pdf to images to text
5
  """
6
 
 
41
  def rm_local_text_files(name_contains="RESULT_"):
42
  """
43
  rm_local_text_files - remove local text files
44
+
45
  Args:
46
  name_contains (str, optional): [description]. Defaults to "OCR_".
47
  """
 
62
  exceptions=["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."],
63
  ) -> str:
64
  """corrects spacing in a string
65
+
66
  Args:
67
  s (str): the string to correct
68
  add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
69
  exceptions (list, optional): [do not change these substrings]. Defaults to ['e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.'].
70
+
71
  Returns:
72
  str: the corrected string
73
  """
 
94
  def fix_punct_spaces(string):
95
  """
96
  fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
97
+
98
  Parameters
99
  ----------
100
  string : str, required, input string to be corrected
101
+
102
  Returns
103
  -------
104
  str, corrected string
 
114
  def clean_OCR(ugly_text: str):
115
  """
116
  clean_OCR - clean the OCR text files.
117
+
118
  Parameters
119
  ----------
120
  ugly_text : str, required, input string to be cleaned
121
+
122
  Returns
123
  -------
124
  str, cleaned string
 
162
 
163
 
164
  """## pdf2text functions
165
+
166
  """
167
 
168
 
 
188
  def check_word_spelling(word: str) -> bool:
189
  """
190
  check_word_spelling - check the spelling of a word
191
+
192
  Args:
193
  word (str): word to check
194
+
195
  Returns:
196
  bool: True if word is spelled correctly, False if not
197
  """
 
204
  def eval_and_replace(text: str, match_token: str = "- ") -> str:
205
  """
206
  eval_and_replace - conditionally replace all instances of a substring in a string based on whether the eliminated substring results in a valid word
207
+
208
  Args:
209
  text (str): text to evaluate
210
  match_token (str, optional): token to replace. Defaults to "- ".
211
+
212
  Returns:
213
  str: text with replaced tokens
214
  """
 
241
  def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
242
  """
243
  cleantxt_ocr - clean text from OCR
244
+
245
  Args:
246
  ugly_text (str): text to clean
247
  lower (bool, optional): _description_. Defaults to False.
248
  lang (str, optional): _description_. Defaults to "en".
249
+
250
  Returns:
251
  str: cleaned text
252
  """
 
378
 
379
  def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
380
  """translate a document from lang_start to lang_end
381
+
382
  {'code': 'en', 'name': 'English'},
383
  {'code': 'fr', 'name': 'French'},
384
  {'code': 'de', 'name': 'German'},
 
403
  f_o.writelines(translated_doc)
404
  if verbose:
405
  print("finished translating the document! - ", datetime.now())
406
+ return out_path
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  clean-text[gpl]
2
  python-doctr[torch]
3
- gradio
4
  libretranslatepy
5
  natsort
6
  nltk
 
1
  clean-text[gpl]
2
  python-doctr[torch]
3
+ gradio==3.4.0
4
  libretranslatepy
5
  natsort
6
  nltk