Support displaying tables in the chunks of pdf file when using QA parser (#1263)
Browse files
### What problem does this PR solve?
Support displaying tables in the chunks of PDF files when using the QA parser.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- rag/app/qa.py +5 -6
- requirements.txt +1 -0
- requirements_arm.txt +2 -1
- requirements_dev.txt +1 -0
rag/app/qa.py
CHANGED
|
@@ -22,6 +22,7 @@ from rag.settings import cron_logger
|
|
| 22 |
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
| 23 |
from docx import Document
|
| 24 |
from PIL import Image
|
|
|
|
| 25 |
class Excel(ExcelParser):
|
| 26 |
def __call__(self, fnm, binary=None, callback=None):
|
| 27 |
if not binary:
|
|
@@ -374,8 +375,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
| 374 |
code_block = False
|
| 375 |
level_index = [-1] * 7
|
| 376 |
for index, l in enumerate(lines):
|
| 377 |
-
if not l.strip():
|
| 378 |
-
continue
|
| 379 |
if l.strip().startswith('```'):
|
| 380 |
code_block = not code_block
|
| 381 |
question_level, question = 0, ''
|
|
@@ -385,10 +384,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
| 385 |
if not question_level or question_level > 6: # not a question
|
| 386 |
last_answer = f'{last_answer}\n{l}'
|
| 387 |
else: # is a question
|
| 388 |
-
if last_answer:
|
| 389 |
sum_question = '\n'.join(question_stack)
|
| 390 |
if sum_question:
|
| 391 |
-
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
|
| 392 |
last_answer = ''
|
| 393 |
|
| 394 |
i = question_level
|
|
@@ -397,10 +396,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
| 397 |
level_stack.pop()
|
| 398 |
question_stack.append(question)
|
| 399 |
level_stack.append(question_level)
|
| 400 |
-
if last_answer:
|
| 401 |
sum_question = '\n'.join(question_stack)
|
| 402 |
if sum_question:
|
| 403 |
-
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
|
| 404 |
return res
|
| 405 |
elif re.search(r"\.docx$", filename, re.IGNORECASE):
|
| 406 |
docx_parser = Docx()
|
|
|
|
| 22 |
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
| 23 |
from docx import Document
|
| 24 |
from PIL import Image
|
| 25 |
+
from markdown import markdown
|
| 26 |
class Excel(ExcelParser):
|
| 27 |
def __call__(self, fnm, binary=None, callback=None):
|
| 28 |
if not binary:
|
|
|
|
| 375 |
code_block = False
|
| 376 |
level_index = [-1] * 7
|
| 377 |
for index, l in enumerate(lines):
|
|
|
|
|
|
|
| 378 |
if l.strip().startswith('```'):
|
| 379 |
code_block = not code_block
|
| 380 |
question_level, question = 0, ''
|
|
|
|
| 384 |
if not question_level or question_level > 6: # not a question
|
| 385 |
last_answer = f'{last_answer}\n{l}'
|
| 386 |
else: # is a question
|
| 387 |
+
if last_answer.strip():
|
| 388 |
sum_question = '\n'.join(question_stack)
|
| 389 |
if sum_question:
|
| 390 |
+
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
|
| 391 |
last_answer = ''
|
| 392 |
|
| 393 |
i = question_level
|
|
|
|
| 396 |
level_stack.pop()
|
| 397 |
question_stack.append(question)
|
| 398 |
level_stack.append(question_level)
|
| 399 |
+
if last_answer.strip():
|
| 400 |
sum_question = '\n'.join(question_stack)
|
| 401 |
if sum_question:
|
| 402 |
+
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
|
| 403 |
return res
|
| 404 |
elif re.search(r"\.docx$", filename, re.IGNORECASE):
|
| 405 |
docx_parser = Docx()
|
requirements.txt
CHANGED
|
@@ -143,3 +143,4 @@ webdriver-manager==4.0.1
|
|
| 143 |
cn2an==0.5.22
|
| 144 |
roman-numbers==1.0.2
|
| 145 |
word2number==1.1
|
|
|
|
|
|
| 143 |
cn2an==0.5.22
|
| 144 |
roman-numbers==1.0.2
|
| 145 |
word2number==1.1
|
| 146 |
+
markdown==3.6
|
requirements_arm.txt
CHANGED
|
@@ -143,4 +143,5 @@ selenium==4.21.0
|
|
| 143 |
webdriver-manager==4.0.1
|
| 144 |
cn2an==0.5.22
|
| 145 |
roman-numbers==1.0.2
|
| 146 |
-
word2number==1.1
|
|
|
|
|
|
| 143 |
webdriver-manager==4.0.1
|
| 144 |
cn2an==0.5.22
|
| 145 |
roman-numbers==1.0.2
|
| 146 |
+
word2number==1.1
|
| 147 |
+
markdown==3.6
|
requirements_dev.txt
CHANGED
|
@@ -129,3 +129,4 @@ html_text==0.6.2
|
|
| 129 |
cn2an==0.5.22
|
| 130 |
roman-numbers==1.0.2
|
| 131 |
word2number==1.1
|
|
|
|
|
|
| 129 |
cn2an==0.5.22
|
| 130 |
roman-numbers==1.0.2
|
| 131 |
word2number==1.1
|
| 132 |
+
markdown==3.6
|