Spaces:

retopara
/

ragflow

Build error

GYH committed on May 20, 2024

Commit

366c531

1 Parent(s): c008ff2

Split Excel file into different chunks (#847)

### What problem does this PR solve?

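Split each Excel sheet into multiple HTML table chunks (256 data rows per chunk by default, with the header row repeated in every chunk) instead of returning the whole workbook as a single string.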
### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed (3) hide show

deepdoc/parser/excel_parser.py +25 -16
rag/app/naive.py +1 -1
rag/app/one.py +1 -1

deepdoc/parser/excel_parser.py CHANGED Viewed

@@ -7,30 +7,39 @@ from rag.nlp import find_codec
 class RAGFlowExcelParser:
-    def html(self, fnm):
         if isinstance(fnm, str):
             wb = load_workbook(fnm)
         else:
             wb = load_workbook(BytesIO(fnm))
-        tb = ""
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
             rows = list(ws.rows)
-            if not rows:continue
-            tb += f"<table><caption>{sheetname}</caption><tr>"
             for t in list(rows[0]):
-                tb += f"<th>{t.value}</th>"
-            tb += "</tr>"
-            for r in list(rows[1:]):
-                tb += "<tr>"
-                for i, c in enumerate(r):
-                    if c.value is None:
-                        tb += "<td></td>"
-                    else:
-                        tb += f"<td>{c.value}</td>"
-                tb += "</tr>"
-            tb += "</table>\n"
-        return tb
     def __call__(self, fnm):
         if isinstance(fnm, str):

 class RAGFlowExcelParser:
+    def html(self, fnm,chunk_rows=256):
         if isinstance(fnm, str):
             wb = load_workbook(fnm)
         else:
             wb = load_workbook(BytesIO(fnm))
+        tb_chunks = []
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
             rows = list(ws.rows)
+            if not rows: continue
+            tb_rows_0 = "<tr>"
             for t in list(rows[0]):
+                tb_rows_0 += f"<th>{t.value}</th>"
+            tb_rows_0 += "</tr>"
+            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
+                tb = ""
+                tb += f"<table><caption>{sheetname}</caption>"
+                tb += tb_rows_0
+                for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
+                    tb += "<tr>"
+                    for i, c in enumerate(r):
+                        if c.value is None:
+                            tb += "<td></td>"
+                        else:
+                            tb += f"<td>{c.value}</td>"
+                    tb += "</tr>"
+                tb += "</table>\n"
+                tb_chunks.append(tb)
+        return tb_chunks
     def __call__(self, fnm):
         if isinstance(fnm, str):

rag/app/naive.py CHANGED Viewed

@@ -134,7 +134,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
-        sections = [(excel_parser.html(binary), "")]
     elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")

     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
+        sections = [(l, "") for l in excel_parser.html(binary) if l]
     elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")

rag/app/one.py CHANGED Viewed

@@ -78,7 +78,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
-        sections = [excel_parser.html(binary)]
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")

     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
+        sections = excel_parser.html(binary , 10000000)
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")