KevinHuSh committed
Commit 96a1a44 · 1 parent: 04aba1b

add paper & manual parser (#46)

Files changed:
- rag/app/__init__.py      +22  -0
- rag/app/laws.py           +7  -21
- rag/app/manual.py        +140  -0
- rag/app/paper.py         +240  -0
- rag/app/presentation.py    +8  -8
- rag/nlp/__init__.py        +4  -1
- rag/parser/pdf_parser.py  +96  -63
rag/app/__init__.py
CHANGED
@@ -1,5 +1,9 @@
 import re

+from nltk import word_tokenize
+
+from rag.nlp import stemmer, huqie
+

 def callback__(progress, msg, func):
     if not func :return
@@ -46,3 +50,21 @@ def bullets_category(sections):
         res = i
         maxium = h
     return res
+
+def is_english(texts):
+    eng = 0
+    for t in texts:
+        if re.match(r"[a-zA-Z]", t.strip()):
+            eng += 1
+    if eng / len(texts) > 0.8:
+        return True
+    return False
+
+def tokenize(d, t, eng):
+    d["content_with_weight"] = t
+    if eng:
+        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
+        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
+    else:
+        d["content_ltks"] = huqie.qie(t)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
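These two helpers become the shared tokenization path for every chunker touched in this commit. A minimal usage sketch (the sample strings and empty dict are illustrative, not from the repo):

    # is_english() votes by the share of items whose first character is a
    # Latin letter; tokenize() stems English with NLTK or segments Chinese
    # with huqie, then derives the fine-grained sub-tokens.
    from rag.app import is_english, tokenize

    sections = ["Article 1. General Provisions", "Article 2. Definitions"]
    d = {}  # the field dict that will be indexed into Elasticsearch
    tokenize(d, "\n".join(sections), is_english(sections))
    # d now carries content_with_weight, content_ltks and content_sm_ltks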
rag/app/laws.py
CHANGED
@@ -3,12 +3,13 @@ import re
 from io import BytesIO
 from docx import Document
 import numpy as np
-from rag.app import callback__, bullets_category, BULLET_PATTERN
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
 from rag.nlp import huqie
+from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser


-class Docx(
+class Docx(HuDocxParser):
     def __init__(self):
         pass

@@ -42,14 +43,7 @@ class Pdf(HuParser):
         print("paddle layouts:", timer()-start)
         bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
         # is it English
-        eng = 0
-        for b in bxs:
-            if re.match(r"[a-zA-Z]", b["text"].strip()):
-                eng += 1
-        if eng / len(bxs) > 0.8:
-            eng = True
-        else:
-            eng = False
+        eng = is_english([b["text"] for b in bxs])
         # Merge vertically
         i = 0
         while i + 1 < len(bxs):
@@ -59,7 +53,7 @@ class Pdf(HuParser):
                 bxs.pop(i)
                 continue
             concatting_feats = [
-                b["text"].strip()[-1] in ",;:'\",、‘“;:",
+                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                 len(b["text"].strip())>1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                 b["text"].strip()[0] in "。;?!?”)),,、:",
             ]
@@ -118,14 +112,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     sections = [l for l in sections if l]

     # is it English
-    eng = 0
-    for sec in sections:
-        if re.match(r"[a-zA-Z]", sec.strip()):
-            eng += 1
-    if eng / len(sections) > 0.8:
-        eng = True
-    else:
-        eng = False
+    eng = is_english(sections)
     # Remove 'Contents' part
     i = 0
     while i < len(sections):
@@ -181,8 +168,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         if pdf_parser:
             d["image"] = pdf_parser.crop(ck)
             ck = pdf_parser.remove_tag(ck)
-        d["content_ltks"] = huqie.qie(ck)
-        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+        tokenize(d, ck, eng)
         res.append(d)
     return res
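laws.chunk() keeps the same driver signature as the other chunkers, so one progress callback serves them all. A sketch of a driver, assuming callback__ simply forwards (progress, msg) to the function it is given (the file name is hypothetical):

    from rag.app import laws

    def report(progress, msg):
        # progress is a fraction of pages done, or -1 on an unsupported format
        print(("FAILED " if progress < 0 else f"{progress:.0%} ") + msg)

    chunks = laws.chunk("statute.pdf", callback=report)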
rag/app/manual.py
ADDED
@@ -0,0 +1,140 @@
+import copy
+import re
+from collections import Counter
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.nlp import huqie, stemmer
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+from nltk.tokenize import word_tokenize
+import numpy as np
+from rag.utils import num_tokens_from_string
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        # clean mess
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
+
+        # merge chunks with the same bullets
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            b_ = self.boxes[i + 1]
+            if b["text"].strip()[0] != b_["text"].strip()[0] \
+                    or b["page_number"]!=b_["page_number"] \
+                    or b["top"] > b_["bottom"]:
+                i += 1
+                continue
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+        # merge title with decent chunk
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            if b.get("layoutno","").find("title") < 0:
+                i += 1
+                continue
+            b_ = self.boxes[i + 1]
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+
+        for b in self.boxes: print(b["text"], b.get("layoutno"))
+
+        print(tbls)
+        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    pdf_parser = None
+    paper = {}
+
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        cks, tbls = pdf_parser(filename if not binary else binary,
+                               from_page=from_page, to_page=to_page, callback=callback)
+    doc = {
+        "docnm_kwd": filename
+    }
+    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    # is it English
+    eng = pdf_parser.is_english
+
+    res = []
+    # add tables
+    for img, rows in tbls:
+        bs = 10
+        de = ";" if eng else ";"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"] = pdf_parser.crop(ck)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(cks):
+        if tk_cnt > 128: add_chunk()
+        txt = cks[i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        chunk.append(txt)
+        tk_cnt += cnt
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    chunk(sys.argv[1])
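manual.py's chunk() packs consecutive line boxes until roughly 128 tokens accumulate, then flushes through the add_chunk closure. The same budget policy in isolation, with a whitespace token count standing in for rag.utils.num_tokens_from_string:

    def pack(lines, budget=128, count=lambda s: len(s.split())):
        chunks, cur, used = [], [], 0
        for ln in lines:
            if used > budget:            # flush once the budget is exceeded
                chunks.append("\n".join(cur))
                cur, used = [], 0
            cur.append(ln)
            used += count(ln)
        if cur:                          # flush the remainder
            chunks.append("\n".join(cur))
        return chunks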
rag/app/paper.py
ADDED
@@ -0,0 +1,240 @@
+import copy
+import re
+from collections import Counter
+from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.nlp import huqie, stemmer
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+from nltk.tokenize import word_tokenize
+import numpy as np
+from rag.utils import num_tokens_from_string
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
+                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        # clean mess
+        if column_width < self.page_images[0].size[0] / zoomin / 2:
+            print("two_column...................", column_width,
+                  self.page_images[0].size[0] / zoomin / 2)
+            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
+        freq = Counter([b["text"] for b in self.boxes])
+        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
+        i = 0
+        while i < len(self.boxes):
+            if self.boxes[i]["text"] in garbage \
+                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
+                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
+                self.boxes.pop(i)
+            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno",
+                                                                                                         '1'):
+                # merge within same layouts
+                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
+                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
+                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
+                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
+                self.boxes.pop(i)
+            else:
+                i += 1
+
+        def _begin(txt):
+            return re.match(
+                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
+                txt.lower().strip())
+
+        # get title and authors
+        title = ""
+        authors = []
+        i = 0
+        while i < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            if b.get("layoutno", "").find("title") >= 0:
+                title = b["text"]
+                if _begin(title):
+                    title = ""
+                    break
+                for j in range(3):
+                    if _begin(self.boxes[i + j]["text"]): break
+                    authors.append(self.boxes[i + j]["text"])
+                    break
+                break
+        # get abstract
+        abstr = ""
+        i = 0
+        while i + 1 < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            txt = b["text"].lower().strip()
+            if re.match("(abstract|摘要)", txt):
+                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(b, zoomin)
+                    i += 1
+                    break
+                txt = self.boxes[i + 1]["text"].lower().strip()
+                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
+                i += 1
+                break
+        if not abstr: i = 0
+
+        for b in self.boxes: print(b["text"], b.get("layoutno"))
+        print(tbls)
+
+        return {
+            "title": title if title else filename,
+            "authors": " ".join(authors),
+            "abstract": abstr,
+            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
+                      re.match(r"(text|title)", b.get("layoutno", "text"))],
+            "tables": tbls
+        }
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    pdf_parser = None
+    paper = {}
+
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        paper = pdf_parser(filename if not binary else binary,
+                           from_page=from_page, to_page=to_page, callback=callback)
+    doc = {
+        "docnm_kwd": paper["title"] if paper["title"] else filename,
+        "authors_tks": paper["authors"]
+    }
+    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
+    # is it English
+    eng = pdf_parser.is_english
+    print("It's English.....", eng)
+
+    res = []
+    # add tables
+    for img, rows in paper["tables"]:
+        bs = 10
+        de = ";" if eng else ";"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    if paper["abstract"]:
+        d = copy.deepcopy(doc)
+        txt = pdf_parser.remove_tag(paper["abstract"])
+        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
+        d["important_tks"] = " ".join(d["important_kwd"])
+        d["image"] = pdf_parser.crop(paper["abstract"])
+        tokenize(d, txt, eng)
+        res.append(d)
+
+    readed = [0] * len(paper["lines"])
+    # find colon firstly
+    i = 0
+    while i + 1 < len(paper["lines"]):
+        txt = pdf_parser.remove_tag(paper["lines"][i][0])
+        j = i
+        if txt.strip("\n").strip()[-1] not in "::":
+            i += 1
+            continue
+        i += 1
+        while i < len(paper["lines"]) and not paper["lines"][i][0]:
+            i += 1
+        if i >= len(paper["lines"]): break
+        proj = [paper["lines"][i][0].strip()]
+        i += 1
+        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
+            proj.append(paper["lines"][i][0])
+            i += 1
+        for k in range(j, i): readed[k] = True
+        txt = txt[::-1]
+        if eng:
+            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        else:
+            r = re.search(r"(.*?) ([。?;!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        for p in proj:
+            d = copy.deepcopy(doc)
+            txt += "\n" + pdf_parser.remove_tag(p)
+            d["image"] = pdf_parser.crop(p)
+            tokenize(d, txt, eng)
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"] = pdf_parser.crop(ck)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(paper["lines"]):
+        if tk_cnt > 128:
+            add_chunk()
+        if readed[i]:
+            i += 1
+            continue
+        readed[i] = True
+        txt, layouts = paper["lines"][i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        if any([
+            layouts.find("title") >= 0 and chunk,
+            cnt + tk_cnt > 128 and tk_cnt > 32,
+        ]):
+            add_chunk()
+            chunk = [txt]
+            tk_cnt = cnt
+        else:
+            chunk.append(txt)
+            tk_cnt += cnt
+
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    chunk(sys.argv[1])
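paper.py adds a layout-aware boundary on top of the token budget: a new chunk starts whenever a "title" box arrives, or when appending a line would exceed ~128 tokens while the current chunk already holds more than 32. The rule in isolation (count() is again a stand-in tokenizer):

    def split_on_boundaries(lines, count):
        # lines is a list of (text, layout_label) pairs, as in paper["lines"]
        out, chunk, tk_cnt = [], [], 0
        for txt, layout in lines:
            cnt = count(txt)
            if ("title" in layout and chunk) or (cnt + tk_cnt > 128 and tk_cnt > 32):
                out.append(chunk)
                chunk, tk_cnt = [txt], cnt
            else:
                chunk.append(txt)
                tk_cnt += cnt
        if chunk:
            out.append(chunk)
        return out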
rag/app/presentation.py
CHANGED
@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from pptx import Presentation

-from rag.app import callback__
+from rag.app import callback__, tokenize, is_english
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser

@@ -57,7 +57,7 @@ class Ppt(object):
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback__((min(to_page, self.total_page) - from_page) / self.total_page,
                    "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback)
-
+        self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]


@@ -103,19 +103,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
-        for txt,img in Ppt()(filename if not binary else binary, from_page, to_page, callback):
+        ppt_parser = Ppt()
+        for txt,img in ppt_parser(filename if not binary else binary, from_page, to_page, callback):
             d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
             d["image"] = img
+            tokenize(d, txt, ppt_parser.is_english)
             res.append(d)
         return res
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        for txt,img in Pdf()(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
+        pdf_parser = Pdf()
+        for txt,img in pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
            d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
            d["image"] = img
+            tokenize(d, txt, pdf_parser.is_english)
            res.append(d)
        return res
     callback__(-1, "This kind of presentation document did not support yet!", callback)
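Binding the parser to a name here is the point of the change: the loop needs the instance afterwards to read the is_english flag it set while extracting slides. Sketch (the deck name is hypothetical):

    ppt_parser = Ppt()
    pairs = ppt_parser("deck.pptx", 0, 100000, None)  # [(slide_text, image), ...]
    eng = ppt_parser.is_english  # only reachable because the instance was kept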
rag/nlp/__init__.py
CHANGED
@@ -1,4 +1,7 @@
 from . import search
 from rag.utils import ELASTICSEARCH

-retrievaler = search.Dealer(ELASTICSEARCH)
+retrievaler = search.Dealer(ELASTICSEARCH)
+
+from nltk.stem import PorterStemmer
+stemmer = PorterStemmer()
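The module-level PorterStemmer is the instance rag.app.tokenize() uses for English text; its effect in brief:

    from rag.nlp import stemmer
    print(stemmer.stem("merging"), stemmer.stem("parsers"))  # -> merg parser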
rag/parser/pdf_parser.py
CHANGED
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import random
+
 import fitz
 import xgboost as xgb
 from io import BytesIO
@@ -14,6 +16,7 @@ from copy import deepcopy
 from rag.cv.table_recognize import TableTransformer
 from rag.cv.ppdetection import PPDet
 from huggingface_hub import hf_hub_download
+
 logging.getLogger("pdfminer").setLevel(logging.WARNING)

@@ -22,8 +25,8 @@ class HuParser:
         from paddleocr import PaddleOCR
         logging.getLogger("ppocr").setLevel(logging.ERROR)
         self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")
-        self.layouter = PPDet()
-        self.tbl_det =
+        self.layouter = PPDet("/data/newpeak/medical-gpt/res/ppdet")
+        self.tbl_det = PPDet("/data/newpeak/medical-gpt/res/ppdet.tbl")

         self.updown_cnt_mdl = xgb.Booster()
         if torch.cuda.is_available():
@@ -55,7 +58,7 @@ class HuParser:
     def _y_dis(
             self, a, b):
         return (
-
+            b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

     def _match_proj(self, b):
         proj_patt = [
@@ -78,9 +81,9 @@ class HuParser:
         tks_down = huqie.qie(down["text"][:LEN]).split(" ")
         tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
         tks_all = up["text"][-LEN:].strip() \
-
-
-
+            + (" " if re.match(r"[a-zA-Z0-9]+",
+                               up["text"][-1] + down["text"][0]) else "") \
+            + down["text"][:LEN].strip()
         tks_all = huqie.qie(tks_all).split(" ")
         fea = [
             up.get("R", -1) == down.get("R", -1),
@@ -102,7 +105,7 @@ class HuParser:
             True if re.search(r"[,,][^。.]+$", up["text"]) else False,
             True if re.search(r"[,,][^。.]+$", up["text"]) else False,
             True if re.search(r"[\((][^\))]+$", up["text"])
-
+            and re.search(r"[\))]", down["text"]) else False,
             self._match_proj(down),
             True if re.match(r"[A-Z]", down["text"]) else False,
             True if re.match(r"[A-Z]", up["text"][-1]) else False,
@@ -141,6 +144,21 @@ class HuParser:
             arr[j + 1] = deepcopy(tmp)
         return arr

+    @staticmethod
+    def sort_X_by_page(arr, threashold):
+        # sort by page number, then x0, then top
+        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
+        for i in range(len(arr) - 1):
+            for j in range(i, -1, -1):
+                # restore the order using the threshold
+                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
+                        and arr[j + 1]["top"] < arr[j]["top"] \
+                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
+                    tmp = arr[j]
+                    arr[j] = arr[j + 1]
+                    arr[j + 1] = tmp
+        return arr
+
     @staticmethod
     def sort_R_firstly(arr, thr=0):
         # sort using y1 first and then x1
@@ -219,7 +237,7 @@ class HuParser:
         assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
             tp, btm, x0, x1, b)
         ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
-
+            x0 != 0 and btm - tp != 0 else 0
         if ov > 0 and ratio:
             ov /= (x1 - x0) * (btm - tp)
         return ov
@@ -326,7 +344,7 @@ class HuParser:
         return layouts

     def __table_paddle(self, images):
-        tbls = self.tbl_det([img for img in images],
+        tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
         res = []
         # align left&right for rows, align top&bottom for columns
         for tbl in tbls:
@@ -384,7 +402,7 @@ class HuParser:
                 continue
             for tb in tbls:  # for table
                 left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
-
+                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                 left *= ZM
                 top *= ZM
                 right *= ZM
@@ -482,10 +500,13 @@ class HuParser:
                 continue
             ch = c["bottom"] - c["top"]
             bh = bxs[ii]["bottom"] - bxs[ii]["top"]
-            if abs(ch - bh) / max(ch, bh) >= 0.7:
+            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                 self.lefted_chars.append(c)
                 continue
-            bxs[ii]["text"] += c["text"]
+            if c["text"] == " " and bxs[ii]["text"]:
+                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
+            else:
+                bxs[ii]["text"] += c["text"]

         for b in bxs:
             if not b["text"]:
@@ -629,7 +650,7 @@ class HuParser:
             i += 1
         self.boxes = bxs

-    def _concat_downward(self):
+    def _concat_downward(self, concat_between_pages=True):
         # count boxes in the same row as a feature
         for i in range(len(self.boxes)):
             mh = self.mean_height[self.boxes[i]["page_number"] - 1]
@@ -665,6 +686,8 @@ class HuParser:
                 if not smpg and ydis > mh * 16:
                     break
                 down = boxes[i]
+                if not concat_between_pages and down["page_number"] > up["page_number"]:
+                    break

                 if up.get("R", "") != down.get(
                         "R", "") and up["text"][-1] != ",":
@@ -735,43 +758,29 @@ class HuParser:

         self.boxes = self.sort_Y_firstly(boxes, 0)

-    def
+    def _filter_forpages(self):
         if not self.boxes:
             return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if possible(c):
-            pg_hits[c["page_number"] - 1] += 1
-
-        st, ed = -1, -1
-        for i in range(len(self.boxes)):
-            c = self.boxes[i]
-            if c["page_number"] >= to:
+        i = 0
+        while i < len(self.boxes):
+            if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
+                i += 1
+                continue
+            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
+            self.boxes.pop(i)
+            if i >= len(self.boxes): break
+            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            while not prefix:
+                self.boxes.pop(i)
+                if i >= len(self.boxes): break
+                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            self.boxes.pop(i)
+            if i >= len(self.boxes) or not prefix: break
+            for j in range(i, min(i + 128, len(self.boxes))):
+                if not re.match(prefix, self.boxes[j]["text"]):
+                    continue
+                for k in range(i, j): self.boxes.pop(i)
                 break
-            if pg_hits[c["page_number"] - 1] >= 3 and possible(c):
-                if st < 0:
-                    st = i
-                else:
-                    ed = i
-        for _ in range(st, ed + 1):
-            self.boxes.pop(st)

     def _blockType(self, b):
         patt = [
@@ -918,7 +927,7 @@ class HuParser:
             lst_r = rows[-1]
             if lst_r[-1].get("R", "") != b.get("R", "") \
                     or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
-
+                        ):  # new row
                 btm = b["bottom"]
                 b["rn"] += 1
                 rows.append([b])
@@ -968,9 +977,9 @@ class HuParser:
                 j += 1
                 continue
             f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
-
+                 [j - 1][0].get("text")) or j == 0
             ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
-
+                 [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
             if f and ff:
                 j += 1
                 continue
@@ -1031,9 +1040,9 @@ class HuParser:
                 i += 1
                 continue
             f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
-
+                 [jj][0].get("text")) or i == 0
             ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
-
+                 [jj][0].get("text")) or i + 1 >= len(tbl)
             if f and ff:
                 i += 1
                 continue
@@ -1153,6 +1162,7 @@ class HuParser:
         headers = {}
         hdrset = set()
         lst_hdr = []
+        de = "的" if not self.is_english else " for "
        for r in sorted(list(hdr_rowno)):
            headers[r] = ["" for _ in range(clmno)]
            for i in range(clmno):
@@ -1184,12 +1194,12 @@ class HuParser:
                 if headers[j][k].find(headers[j - 1][k]) >= 0:
                     continue
                 if len(headers[j][k]) > len(headers[j - 1][k]):
-                    headers[j][k] += (
+                    headers[j][k] += (de if headers[j][k]
                                       else "") + headers[j - 1][k]
                 else:
                     headers[j][k] = headers[j - 1][k] \
-
-
+                        + (de if headers[j - 1][k] else "") \
+                        + headers[j][k]

         logging.debug(
             f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
@@ -1241,7 +1251,11 @@ class HuParser:
             row_txt.append("; ".join(rtxt))

         if cap:
-
+            if self.is_english:
+                from_ = " in "
+            else:
+                from_ = "来自"
+            row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
         return row_txt

     @staticmethod
@@ -1254,7 +1268,7 @@ class HuParser:
             return True
         return False

-    def
+    def _extract_table_figure(self, need_image, ZM, return_html):
         tables = {}
         figures = {}
         # extract figure and table boxes
@@ -1266,7 +1280,7 @@ class HuParser:
                 i += 1
                 continue
             lout_no = str(self.boxes[i]["page_number"]) + \
-
+                "-" + str(self.boxes[i]["layoutno"])
             if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                                                   "figure caption", "reference"]:
                 nomerge_lout_no.append(lst_lout_no)
@@ -1574,8 +1588,14 @@ class HuParser:
             self.page_chars.append([])

         logging.info("Images converted.")
+        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=100))) for i in range(len(self.page_chars))]
+        if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
+            self.is_english = True
+        else:
+            self.is_english = False
+
         for i, img in enumerate(self.page_images):
-            chars = self.page_chars[i]
+            chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
                 np.median(sorted([c["height"] for c in chars])) if chars else 0
             )
@@ -1583,6 +1603,14 @@ class HuParser:
                 np.median(sorted([c["width"] for c in chars])) if chars else 8
             )
             self.page_cum_height.append(img.size[1] / zoomin)
+            j = 0
+            while j + 1 < len(chars):
+                if chars[j]["text"] and chars[j + 1]["text"] \
+                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
+                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
+                                                                       chars[j]["width"]) / 2:
+                    chars[j]["text"] += " "
+                j += 1
             # if i > 0:
             #     if not chars:
             #         self.page_cum_height.append(img.size[1] / zoomin)
@@ -1591,8 +1619,13 @@ class HuParser:
             #         np.max([c["bottom"] for c in chars]))
             self.__ocr_paddle(i + 1, img, chars, zoomin)

+        if not self.is_english and not all([c for c in self.page_chars]) and self.boxes:
+            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(self.boxes, k=30)]))
+
+        logging.info("Is it English: %s", self.is_english)
+
         self.page_cum_height = np.cumsum(self.page_cum_height)
-        assert len(self.page_cum_height) == len(self.page_images)+1
+        assert len(self.page_cum_height) == len(self.page_images) + 1

     def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
         self.__images__(fnm, zoomin)
@@ -1600,8 +1633,8 @@ class HuParser:
         self._table_transformer_job(zoomin)
         self._text_merge()
         self._concat_downward()
-        self.
-        tbls = self.
+        self._filter_forpages()
+        tbls = self._extract_table_figure(need_image, zoomin, return_html)
         return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

     def remove_tag(self, txt):
@@ -1622,7 +1655,7 @@ class HuParser:
             self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                            right *
                                            ZM, min(
-
+                bottom, self.page_images[pns[0]].size[1])
             ))
             )
             bottom -= self.page_images[pns[0]].size[1]