Kevin Hu
committed on
Commit
·
7b6220c
1
Parent(s):
e758781
bigger resolution for OCR (#2919)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Performance Improvement
deepdoc/parser/pdf_parser.py
CHANGED
|
@@ -957,6 +957,8 @@ class RAGFlowPdfParser:
|
|
| 957 |
fnm, str) else pdfplumber.open(BytesIO(fnm))
|
| 958 |
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
| 959 |
enumerate(self.pdf.pages[page_from:page_to])]
|
|
|
|
|
|
|
| 960 |
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
|
| 961 |
self.pdf.pages[page_from:page_to]]
|
| 962 |
self.total_page = len(self.pdf.pages)
|
|
@@ -992,7 +994,7 @@ class RAGFlowPdfParser:
|
|
| 992 |
self.is_english = False
|
| 993 |
|
| 994 |
st = timer()
|
| 995 |
-
for i, img in enumerate(self.page_images):
|
| 996 |
chars = self.page_chars[i] if not self.is_english else []
|
| 997 |
self.mean_height.append(
|
| 998 |
np.median(sorted([c["height"] for c in chars])) if chars else 0
|
|
@@ -1000,7 +1002,7 @@ class RAGFlowPdfParser:
|
|
| 1000 |
self.mean_width.append(
|
| 1001 |
np.median(sorted([c["width"] for c in chars])) if chars else 8
|
| 1002 |
)
|
| 1003 |
-
self.page_cum_height.append(img.size[1] / zoomin)
|
| 1004 |
j = 0
|
| 1005 |
while j + 1 < len(chars):
|
| 1006 |
if chars[j]["text"] and chars[j + 1]["text"] \
|
|
@@ -1010,7 +1012,7 @@ class RAGFlowPdfParser:
|
|
| 1010 |
chars[j]["text"] += " "
|
| 1011 |
j += 1
|
| 1012 |
|
| 1013 |
-
self.__ocr(i + 1, img, chars, zoomin)
|
| 1014 |
if callback and i % 6 == 5:
|
| 1015 |
callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
|
| 1016 |
# print("OCR:", timer()-st)
|
|
|
|
| 957 |
fnm, str) else pdfplumber.open(BytesIO(fnm))
|
| 958 |
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
| 959 |
enumerate(self.pdf.pages[page_from:page_to])]
|
| 960 |
+
self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in
|
| 961 |
+
enumerate(self.pdf.pages[page_from:page_to])]
|
| 962 |
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
|
| 963 |
self.pdf.pages[page_from:page_to]]
|
| 964 |
self.total_page = len(self.pdf.pages)
|
|
|
|
| 994 |
self.is_english = False
|
| 995 |
|
| 996 |
st = timer()
|
| 997 |
+
for i, img in enumerate(self.page_images_x2):
|
| 998 |
chars = self.page_chars[i] if not self.is_english else []
|
| 999 |
self.mean_height.append(
|
| 1000 |
np.median(sorted([c["height"] for c in chars])) if chars else 0
|
|
|
|
| 1002 |
self.mean_width.append(
|
| 1003 |
np.median(sorted([c["width"] for c in chars])) if chars else 8
|
| 1004 |
)
|
| 1005 |
+
self.page_cum_height.append(img.size[1] / zoomin/2)
|
| 1006 |
j = 0
|
| 1007 |
while j + 1 < len(chars):
|
| 1008 |
if chars[j]["text"] and chars[j + 1]["text"] \
|
|
|
|
| 1012 |
chars[j]["text"] += " "
|
| 1013 |
j += 1
|
| 1014 |
|
| 1015 |
+
self.__ocr(i + 1, img, chars, zoomin*2)
|
| 1016 |
if callback and i % 6 == 5:
|
| 1017 |
callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
|
| 1018 |
# print("OCR:", timer()-st)
|