Fix some bugs in text2sql.(#4279)(#4281) (#4280)
Browse files
Fix some bugs in text2sql.(#4279)(#4281)
### What problem does this PR solve?
- Fix incorrect results when parsing CSV files for the QA knowledge base in
the text2sql scenario: CSV files are now processed with the csv library, and
CSV parsing is decoupled from TXT parsing.
- Most LLMs return results in markdown format (```sql query ```). Fix the
execution error caused by SQL wrapped in markdown formatting in the LLM output.

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- agent/component/exesql.py +8 -12
- rag/app/qa.py +34 -2
agent/component/exesql.py
CHANGED
|
@@ -65,20 +65,16 @@ class ExeSQL(ComponentBase, ABC):
|
|
| 65 |
self._loop += 1
|
| 66 |
|
| 67 |
ans = self.get_input()
|
| 68 |
-
|
| 69 |
-
|
| 70 |
ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
else:
|
| 78 |
-
print("no markdown")
|
| 79 |
-
ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
|
| 80 |
else:
|
| 81 |
-
|
|
|
|
| 82 |
ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
|
| 83 |
ans = re.sub(r';[^;]*$', r';', ans)
|
| 84 |
if not ans:
|
|
|
|
| 65 |
self._loop += 1
|
| 66 |
|
| 67 |
ans = self.get_input()
|
|
|
|
|
|
|
| 68 |
ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
|
| 69 |
+
|
| 70 |
+
# improve the information extraction, most llm return results in markdown format ```sql query ```
|
| 71 |
+
match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
|
| 72 |
+
if match:
|
| 73 |
+
ans = match.group(1) # Query content
|
| 74 |
+
print(ans)
|
|
|
|
|
|
|
|
|
|
| 75 |
else:
|
| 76 |
+
print("no markdown")
|
| 77 |
+
ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
|
| 78 |
ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
|
| 79 |
ans = re.sub(r';[^;]*$', r';', ans)
|
| 80 |
if not ans:
|
rag/app/qa.py
CHANGED
|
@@ -12,6 +12,7 @@
|
|
| 12 |
#
|
| 13 |
import logging
|
| 14 |
import re
|
|
|
|
| 15 |
from copy import deepcopy
|
| 16 |
from io import BytesIO
|
| 17 |
from timeit import default_timer as timer
|
|
@@ -25,7 +26,6 @@ from docx import Document
|
|
| 25 |
from PIL import Image
|
| 26 |
from markdown import markdown
|
| 27 |
|
| 28 |
-
|
| 29 |
class Excel(ExcelParser):
|
| 30 |
def __call__(self, fnm, binary=None, callback=None):
|
| 31 |
if not binary:
|
|
@@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
| 320 |
res.append(beAdoc(deepcopy(doc), q, a, eng))
|
| 321 |
return res
|
| 322 |
|
| 323 |
-
elif re.search(r"\.(txt
|
| 324 |
callback(0.1, "Start to parse.")
|
| 325 |
txt = get_text(filename, binary)
|
| 326 |
lines = txt.split("\n")
|
|
@@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
| 359 |
|
| 360 |
return res
|
| 361 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
| 363 |
callback(0.1, "Start to parse.")
|
| 364 |
pdf_parser = Pdf()
|
|
|
|
| 12 |
#
|
| 13 |
import logging
|
| 14 |
import re
|
| 15 |
+
import csv
|
| 16 |
from copy import deepcopy
|
| 17 |
from io import BytesIO
|
| 18 |
from timeit import default_timer as timer
|
|
|
|
| 26 |
from PIL import Image
|
| 27 |
from markdown import markdown
|
| 28 |
|
|
|
|
| 29 |
class Excel(ExcelParser):
|
| 30 |
def __call__(self, fnm, binary=None, callback=None):
|
| 31 |
if not binary:
|
|
|
|
| 320 |
res.append(beAdoc(deepcopy(doc), q, a, eng))
|
| 321 |
return res
|
| 322 |
|
| 323 |
+
elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
|
| 324 |
callback(0.1, "Start to parse.")
|
| 325 |
txt = get_text(filename, binary)
|
| 326 |
lines = txt.split("\n")
|
|
|
|
| 359 |
|
| 360 |
return res
|
| 361 |
|
| 362 |
+
elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
|
| 363 |
+
callback(0.1, "Start to parse.")
|
| 364 |
+
txt = get_text(filename, binary)
|
| 365 |
+
lines = txt.split("\n")
|
| 366 |
+
delimiter = "\t" if any("\t" in line for line in lines) else ","
|
| 367 |
+
|
| 368 |
+
fails = []
|
| 369 |
+
question, answer = "", ""
|
| 370 |
+
res = []
|
| 371 |
+
reader = csv.reader(lines, delimiter=delimiter)
|
| 372 |
+
|
| 373 |
+
for i, row in enumerate(reader):
|
| 374 |
+
if len(row) != 2:
|
| 375 |
+
if question:
|
| 376 |
+
answer += "\n" + lines[i]
|
| 377 |
+
else:
|
| 378 |
+
fails.append(str(i + 1))
|
| 379 |
+
elif len(row) == 2:
|
| 380 |
+
if question and answer:
|
| 381 |
+
res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
| 382 |
+
question, answer = row
|
| 383 |
+
if len(res) % 999 == 0:
|
| 384 |
+
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
| 385 |
+
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
| 386 |
+
|
| 387 |
+
if question:
|
| 388 |
+
res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
| 389 |
+
|
| 390 |
+
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
| 391 |
+
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
| 392 |
+
return res
|
| 393 |
+
|
| 394 |
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
| 395 |
callback(0.1, "Start to parse.")
|
| 396 |
pdf_parser = Pdf()
|