GYH
committed on
Commit
·
366c531
1
Parent(s):
c008ff2
Split Excel file into different chunks (#847)
Browse files
### What problem does this PR solve?
Split each Excel sheet into separate chunks of rows.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- deepdoc/parser/excel_parser.py +25 -16
- rag/app/naive.py +1 -1
- rag/app/one.py +1 -1
deepdoc/parser/excel_parser.py
CHANGED
|
@@ -7,30 +7,39 @@ from rag.nlp import find_codec
|
|
| 7 |
|
| 8 |
|
| 9 |
class RAGFlowExcelParser:
|
| 10 |
-
def html(self, fnm):
|
| 11 |
if isinstance(fnm, str):
|
| 12 |
wb = load_workbook(fnm)
|
| 13 |
else:
|
| 14 |
wb = load_workbook(BytesIO(fnm))
|
| 15 |
-
|
|
|
|
| 16 |
for sheetname in wb.sheetnames:
|
| 17 |
ws = wb[sheetname]
|
| 18 |
rows = list(ws.rows)
|
| 19 |
-
if not rows:continue
|
| 20 |
-
|
|
|
|
| 21 |
for t in list(rows[0]):
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def __call__(self, fnm):
|
| 36 |
if isinstance(fnm, str):
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class RAGFlowExcelParser:
|
| 10 |
+
def html(self, fnm, chunk_rows=256):
    """Render each non-empty worksheet as one or more HTML ``<table>`` strings.

    The first row of a sheet is treated as the header and is repeated at the
    top of every table produced for that sheet. The remaining rows are split
    into consecutive groups of at most ``chunk_rows`` rows, one table per
    group. A sheet that contains only the header row still yields a single
    header-only table.

    Args:
        fnm: A ``str`` is handed to ``load_workbook`` as-is; any other value
            is wrapped in ``BytesIO`` first (presumably the raw bytes of the
            workbook -- confirm against callers).
        chunk_rows: Maximum number of data rows per table. Must be >= 1.

    Returns:
        A list of HTML strings, one per table, in sheet order. Each string
        ends with a newline.

    Raises:
        ValueError: If ``chunk_rows`` is smaller than 1.
    """
    # Local import: keeps this block self-contained. The stdlib module merely
    # shares its name with this method; there is no clash at function scope.
    from html import escape

    if chunk_rows < 1:
        # 0 used to raise ZeroDivisionError and negative values silently
        # produced no output at all; fail loudly instead.
        raise ValueError(f"chunk_rows must be >= 1, got {chunk_rows!r}")

    if isinstance(fnm, str):
        wb = load_workbook(fnm)
    else:
        wb = load_workbook(BytesIO(fnm))

    def _text(value):
        # Empty cells render as empty markup; everything else is escaped so
        # that '<', '>' and '&' inside a cell cannot break the table.
        return "" if value is None else escape(str(value), quote=False)

    tb_chunks = []
    for sheetname in wb.sheetnames:
        rows = list(wb[sheetname].rows)
        if not rows:
            continue

        opening = f"<table><caption>{escape(str(sheetname), quote=False)}</caption>"
        header = "<tr>" + "".join(f"<th>{_text(t.value)}</th>" for t in rows[0]) + "</tr>"

        # Data rows start at index 1. ``max(..., 2)`` keeps exactly one
        # (header-only) table for a sheet without data rows, while a data-row
        # count that is an exact multiple of ``chunk_rows`` no longer produces
        # a trailing empty table.
        for start in range(1, max(len(rows), 2), chunk_rows):
            parts = [opening, header]
            for r in rows[start:start + chunk_rows]:
                parts.append("<tr>")
                parts.extend(f"<td>{_text(c.value)}</td>" for c in r)
                parts.append("</tr>")
            parts.append("</table>\n")
            tb_chunks.append("".join(parts))

    return tb_chunks
|
| 43 |
|
| 44 |
def __call__(self, fnm):
|
| 45 |
if isinstance(fnm, str):
|
rag/app/naive.py
CHANGED
|
@@ -134,7 +134,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 134 |
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
| 135 |
callback(0.1, "Start to parse.")
|
| 136 |
excel_parser = ExcelParser()
|
| 137 |
-
sections = [(excel_parser.html(binary)
|
| 138 |
|
| 139 |
elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
|
| 140 |
callback(0.1, "Start to parse.")
|
|
|
|
| 134 |
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
| 135 |
callback(0.1, "Start to parse.")
|
| 136 |
excel_parser = ExcelParser()
|
| 137 |
+
sections = [(l, "") for l in excel_parser.html(binary) if l]
|
| 138 |
|
| 139 |
elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
|
| 140 |
callback(0.1, "Start to parse.")
|
rag/app/one.py
CHANGED
|
@@ -78,7 +78,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 78 |
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
| 79 |
callback(0.1, "Start to parse.")
|
| 80 |
excel_parser = ExcelParser()
|
| 81 |
-
sections =
|
| 82 |
|
| 83 |
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
| 84 |
callback(0.1, "Start to parse.")
|
|
|
|
| 78 |
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
| 79 |
callback(0.1, "Start to parse.")
|
| 80 |
excel_parser = ExcelParser()
|
| 81 |
+
sections = excel_parser.html(binary , 10000000)
|
| 82 |
|
| 83 |
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
| 84 |
callback(0.1, "Start to parse.")
|