Place pdf's image at the correct position in QA parser (#1235)
Browse files
### What problem does this PR solve?
Place the PDF's images at the correct position in the QA parser.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- rag/app/qa.py +45 -6
rag/app/qa.py
CHANGED
|
@@ -100,27 +100,69 @@ class Pdf(PdfParser):
|
|
| 100 |
last_index = -1
|
| 101 |
last_box = {'text':''}
|
| 102 |
last_bull = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
for box in self.boxes:
|
| 104 |
section, line_tag = box['text'], self._line_tag(box, zoomin)
|
| 105 |
has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
|
| 106 |
last_box, last_index, last_bull = box, index, has_bull
|
|
|
|
|
|
|
|
|
|
| 107 |
if not has_bull: # No question bullet
|
| 108 |
if not last_q:
|
|
|
|
|
|
|
| 109 |
continue
|
| 110 |
else:
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
else:
|
| 114 |
if last_q:
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
last_q, last_a, last_tag = '', '', ''
|
| 117 |
last_q = has_bull.group()
|
| 118 |
_, end = has_bull.span()
|
| 119 |
last_a = section[end:]
|
| 120 |
last_tag = line_tag
|
|
|
|
|
|
|
| 121 |
if last_q:
|
| 122 |
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
|
| 123 |
return qai_list, tbls
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
class Docx(DocxParser):
|
| 125 |
def __init__(self):
|
| 126 |
pass
|
|
@@ -324,14 +366,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
| 324 |
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
| 325 |
callback(0.1, "Start to parse.")
|
| 326 |
pdf_parser = Pdf()
|
| 327 |
-
count = 0
|
| 328 |
qai_list, tbls = pdf_parser(filename if not binary else binary,
|
| 329 |
from_page=0, to_page=10000, callback=callback)
|
| 330 |
|
| 331 |
-
res = tokenize_table(tbls, doc, eng)
|
| 332 |
|
| 333 |
for q, a, image, poss in qai_list:
|
| 334 |
-
count += 1
|
| 335 |
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
|
| 336 |
return res
|
| 337 |
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
|
|
|
| 100 |
last_index = -1
|
| 101 |
last_box = {'text':''}
|
| 102 |
last_bull = None
|
| 103 |
+
def sort_key(element):
    """Return the ``(page, top)`` ordering key for one table entry.

    Both values are read from the first position tuple stored at
    ``element[1][0]``: index 0 is used as the page and index 3 as the top
    coordinate.
    """
    first_pos = element[1][0]
    return first_pos[0], first_pos[3]
|
| 107 |
+
tbls.sort(key=sort_key)
|
| 108 |
+
tbl_index = 0
|
| 109 |
+
last_pn, last_bottom = 0, 0
|
| 110 |
+
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
|
| 111 |
for box in self.boxes:
|
| 112 |
section, line_tag = box['text'], self._line_tag(box, zoomin)
|
| 113 |
has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
|
| 114 |
last_box, last_index, last_bull = box, index, has_bull
|
| 115 |
+
line_pn = float(line_tag.lstrip('@@').split('\t')[0])
|
| 116 |
+
line_top = float(line_tag.rstrip('##').split('\t')[3])
|
| 117 |
+
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
|
| 118 |
if not has_bull: # No question bullet
|
| 119 |
if not last_q:
|
| 120 |
+
if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top):  # image passed
    # The current table starts at or before this line, so step past it.
    # The cursor must be `tbl_index` -- the name initialised before the loop
    # and handed to get_tbls_info(); augmenting the never-assigned name
    # `tbls_index` raises UnboundLocalError the first time this branch runs.
    tbl_index += 1
|
| 122 |
continue
|
| 123 |
else:
|
| 124 |
+
sum_tag = line_tag
|
| 125 |
+
sum_section = section
|
| 126 |
+
while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
|
| 127 |
+
and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the middle of current answer
|
| 128 |
+
sum_tag = f'{tbl_tag}{sum_tag}'
|
| 129 |
+
sum_section = f'{tbl_text}{sum_section}'
|
| 130 |
+
tbl_index += 1
|
| 131 |
+
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
|
| 132 |
+
last_a = f'{last_a}{sum_section}'
|
| 133 |
+
last_tag = f'{last_tag}{sum_tag}'
|
| 134 |
else:
|
| 135 |
if last_q:
|
| 136 |
+
while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
|
| 137 |
+
and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the end of last answer
|
| 138 |
+
last_tag = f'{last_tag}{tbl_tag}'
|
| 139 |
+
last_a = f'{last_a}{tbl_text}'
|
| 140 |
+
tbl_index += 1
|
| 141 |
+
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
|
| 142 |
+
image, poss = self.crop(last_tag, need_position=True)
|
| 143 |
+
qai_list.append((last_q, last_a, image, poss))
|
| 144 |
last_q, last_a, last_tag = '', '', ''
|
| 145 |
last_q = has_bull.group()
|
| 146 |
_, end = has_bull.span()
|
| 147 |
last_a = section[end:]
|
| 148 |
last_tag = line_tag
|
| 149 |
+
last_bottom = float(line_tag.rstrip('##').split('\t')[4])
|
| 150 |
+
last_pn = line_pn
|
| 151 |
if last_q:
|
| 152 |
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
|
| 153 |
return qai_list, tbls
|
| 154 |
+
def get_tbls_info(self, tbls, tbl_index):
    """Describe the table at ``tbl_index`` as position values, tag and text.

    Args:
        tbls: Sequence of table entries. For each entry, ``entry[1][0]`` is
            read as ``(page, left, right, top, bottom)`` and ``entry[0][1]``
            is joined into the table text.
        tbl_index: Index of the entry to describe.

    Returns:
        A 7-tuple ``(page, left, right, top, bottom, tag, text)``. ``page``
        is the stored page value plus one (presumably converting a 0-based
        index to a 1-based page number -- confirm against the producer of
        ``tbls``). ``tag`` is the ``@@page\\tleft\\tright\\ttop\\tbottom##``
        string with coordinates formatted to one decimal place. When
        ``tbl_index`` is at or past the end of ``tbls`` a fixed placeholder
        tuple is returned instead.
    """
    # Past the last table: hand back the fixed placeholder tuple.
    if tbl_index >= len(tbls):
        return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''

    entry = tbls[tbl_index]
    first_pos = entry[1][0]

    page = first_pos[0] + 1
    left = first_pos[1]
    right = first_pos[2]
    top = first_pos[3]
    bottom = first_pos[4]

    tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(page, left, right, top, bottom)
    text = ''.join(entry[0][1])
    return page, left, right, top, bottom, tag, text
|
| 166 |
class Docx(DocxParser):
|
| 167 |
def __init__(self):
|
| 168 |
pass
|
|
|
|
| 366 |
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
| 367 |
callback(0.1, "Start to parse.")
|
| 368 |
pdf_parser = Pdf()
|
|
|
|
| 369 |
qai_list, tbls = pdf_parser(filename if not binary else binary,
|
| 370 |
from_page=0, to_page=10000, callback=callback)
|
| 371 |
|
|
|
|
| 372 |
|
| 373 |
for q, a, image, poss in qai_list:
|
|
|
|
| 374 |
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
|
| 375 |
return res
|
| 376 |
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|