|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
|
|
|
class RAGFlowMarkdownParser: |
|
|
def __init__(self, chunk_token_num=128): |
|
|
self.chunk_token_num = int(chunk_token_num) |
|
|
|
|
|
def extract_tables_and_remainder(self, markdown_text): |
|
|
|
|
|
table_pattern = re.compile( |
|
|
r''' |
|
|
(?:\n|^) |
|
|
(?:\|.*?\|.*?\|.*?\n) |
|
|
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) |
|
|
(?:\|.*?\|.*?\|.*?\n)+ |
|
|
''', re.VERBOSE) |
|
|
tables = table_pattern.findall(markdown_text) |
|
|
remainder = table_pattern.sub('', markdown_text) |
|
|
|
|
|
|
|
|
no_border_table_pattern = re.compile( |
|
|
r''' |
|
|
(?:\n|^) |
|
|
(?:\S.*?\|.*?\n) |
|
|
(?:(?:\s*[:-]+[-| :]*\s*).*?\n) |
|
|
(?:\S.*?\|.*?\n)+ |
|
|
''', re.VERBOSE) |
|
|
no_border_tables = no_border_table_pattern.findall(remainder) |
|
|
tables.extend(no_border_tables) |
|
|
remainder = no_border_table_pattern.sub('', remainder) |
|
|
|
|
|
return remainder, tables |
|
|
|