Kevin Hu
committed on
Commit
·
64508f3
1
Parent(s):
e9c1552
let presentation do raptor (#2838)
Browse files
### What problem does this PR solve?
#2837
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/apps/document_app.py +3 -2
- rag/app/qa.py +10 -1
api/apps/document_app.py
CHANGED
|
@@ -439,8 +439,9 @@ def change_parser():
|
|
| 439 |
else:
|
| 440 |
return get_json_result(data=True)
|
| 441 |
|
| 442 |
-
if doc.type == FileType.VISUAL
|
| 443 |
-
|
|
|
|
| 444 |
return get_data_error_result(retmsg="Not supported yet!")
|
| 445 |
|
| 446 |
e = DocumentService.update_by_id(doc.id,
|
|
|
|
| 439 |
else:
|
| 440 |
return get_json_result(data=True)
|
| 441 |
|
| 442 |
+
if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
|
| 443 |
+
or (re.search(
|
| 444 |
+
r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
|
| 445 |
return get_data_error_result(retmsg="Not supported yet!")
|
| 446 |
|
| 447 |
e = DocumentService.update_by_id(doc.id,
|
rag/app/qa.py
CHANGED
|
@@ -68,6 +68,7 @@ class Excel(ExcelParser):
|
|
| 68 |
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
|
| 69 |
return res
|
| 70 |
|
|
|
|
| 71 |
class Pdf(PdfParser):
|
| 72 |
def __call__(self, filename, binary=None, from_page=0,
|
| 73 |
to_page=100000, zoomin=3, callback=None):
|
|
@@ -155,6 +156,7 @@ class Pdf(PdfParser):
|
|
| 155 |
if last_q:
|
| 156 |
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
|
| 157 |
return qai_list, tbls
|
|
|
|
| 158 |
def get_tbls_info(self, tbls, tbl_index):
|
| 159 |
if tbl_index >= len(tbls):
|
| 160 |
return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
|
|
@@ -166,10 +168,13 @@ class Pdf(PdfParser):
|
|
| 166 |
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
|
| 167 |
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
|
| 168 |
tbl_text = ''.join(tbls[tbl_index][0][1])
|
| 169 |
-
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,
|
|
|
|
|
|
|
| 170 |
class Docx(DocxParser):
|
| 171 |
def __init__(self):
|
| 172 |
pass
|
|
|
|
| 173 |
def get_picture(self, document, paragraph):
|
| 174 |
img = paragraph._element.xpath('.//pic:pic')
|
| 175 |
if not img:
|
|
@@ -242,6 +247,7 @@ class Docx(DocxParser):
|
|
| 242 |
tbls.append(((None, html), ""))
|
| 243 |
return qai_list, tbls
|
| 244 |
|
|
|
|
| 245 |
def rmPrefix(txt):
|
| 246 |
return re.sub(
|
| 247 |
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
|
|
@@ -258,6 +264,7 @@ def beAdocPdf(d, q, a, eng, image, poss):
|
|
| 258 |
add_positions(d, poss)
|
| 259 |
return d
|
| 260 |
|
|
|
|
| 261 |
def beAdocDocx(d, q, a, eng, image):
|
| 262 |
qprefix = "Question: " if eng else "问题:"
|
| 263 |
aprefix = "Answer: " if eng else "回答:"
|
|
@@ -268,6 +275,7 @@ def beAdocDocx(d, q, a, eng, image):
|
|
| 268 |
d["image"] = image
|
| 269 |
return d
|
| 270 |
|
|
|
|
| 271 |
def beAdoc(d, q, a, eng):
|
| 272 |
qprefix = "Question: " if eng else "问题:"
|
| 273 |
aprefix = "Answer: " if eng else "回答:"
|
|
@@ -282,6 +290,7 @@ def mdQuestionLevel(s):
|
|
| 282 |
match = re.match(r'#*', s)
|
| 283 |
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
|
| 284 |
|
|
|
|
| 285 |
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
| 286 |
"""
|
| 287 |
Excel and csv(txt) format files are supported.
|
|
|
|
| 68 |
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
|
| 69 |
return res
|
| 70 |
|
| 71 |
+
|
| 72 |
class Pdf(PdfParser):
|
| 73 |
def __call__(self, filename, binary=None, from_page=0,
|
| 74 |
to_page=100000, zoomin=3, callback=None):
|
|
|
|
| 156 |
if last_q:
|
| 157 |
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
|
| 158 |
return qai_list, tbls
|
| 159 |
+
|
| 160 |
def get_tbls_info(self, tbls, tbl_index):
|
| 161 |
if tbl_index >= len(tbls):
|
| 162 |
return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
|
|
|
|
| 168 |
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
|
| 169 |
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
|
| 170 |
tbl_text = ''.join(tbls[tbl_index][0][1])
|
| 171 |
+
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,
|
| 172 |
+
|
| 173 |
+
|
| 174 |
class Docx(DocxParser):
|
| 175 |
def __init__(self):
|
| 176 |
pass
|
| 177 |
+
|
| 178 |
def get_picture(self, document, paragraph):
|
| 179 |
img = paragraph._element.xpath('.//pic:pic')
|
| 180 |
if not img:
|
|
|
|
| 247 |
tbls.append(((None, html), ""))
|
| 248 |
return qai_list, tbls
|
| 249 |
|
| 250 |
+
|
| 251 |
def rmPrefix(txt):
|
| 252 |
return re.sub(
|
| 253 |
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
|
|
|
|
| 264 |
add_positions(d, poss)
|
| 265 |
return d
|
| 266 |
|
| 267 |
+
|
| 268 |
def beAdocDocx(d, q, a, eng, image):
|
| 269 |
qprefix = "Question: " if eng else "问题:"
|
| 270 |
aprefix = "Answer: " if eng else "回答:"
|
|
|
|
| 275 |
d["image"] = image
|
| 276 |
return d
|
| 277 |
|
| 278 |
+
|
| 279 |
def beAdoc(d, q, a, eng):
|
| 280 |
qprefix = "Question: " if eng else "问题:"
|
| 281 |
aprefix = "Answer: " if eng else "回答:"
|
|
|
|
| 290 |
match = re.match(r'#*', s)
|
| 291 |
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
|
| 292 |
|
| 293 |
+
|
| 294 |
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
| 295 |
"""
|
| 296 |
Excel and csv(txt) format files are supported.
|