Kevin Hu
committed on
Commit
·
c337e13
1
Parent(s):
d7fa9e2
Updates on parsing progress, including more detailed time cost inform… (#3402)
Browse files

### What problem does this PR solve?
#3401
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/validation.py +14 -3
- rag/svr/task_executor.py +8 -3
api/validation.py
CHANGED
|
@@ -32,7 +32,18 @@ def python_version_validation():
|
|
| 32 |
|
| 33 |
python_version_validation()
|
| 34 |
|
|
|
|
| 35 |
# Download nltk data
|
| 36 |
-
|
| 37 |
-
nltk
|
| 38 |
-
nltk.download('
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
python_version_validation()
|
| 34 |
|
| 35 |
+
|
| 36 |
# Download nltk data
|
| 37 |
+
def download_nltk_data():
|
| 38 |
+
import nltk
|
| 39 |
+
nltk.download('wordnet', halt_on_error=False, quiet=True)
|
| 40 |
+
nltk.download('punkt_tab', halt_on_error=False, quiet=True)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
try:
|
| 44 |
+
from multiprocessing import Pool
|
| 45 |
+
pool = Pool(processes=1)
|
| 46 |
+
thr = pool.apply_async(download_nltk_data)
|
| 47 |
+
binary = thr.get(timeout=60)
|
| 48 |
+
except Exception as e:
|
| 49 |
+
print('\x1b[6;37;41m WARNING \x1b[0m' + "Downloading NLTK data failure.", flush=True)
|
rag/svr/task_executor.py
CHANGED
|
@@ -218,14 +218,17 @@ def build(row):
|
|
| 218 |
logger.info("MINIO PUT({}):{}".format(row["name"], el))
|
| 219 |
|
| 220 |
if row["parser_config"].get("auto_keywords", 0):
|
|
|
|
| 221 |
callback(msg="Start to generate keywords for every chunk ...")
|
| 222 |
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
|
| 223 |
for d in docs:
|
| 224 |
d["important_kwd"] = keyword_extraction(chat_mdl, d["content_with_weight"],
|
| 225 |
row["parser_config"]["auto_keywords"]).split(",")
|
| 226 |
d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
|
|
|
|
| 227 |
|
| 228 |
if row["parser_config"].get("auto_questions", 0):
|
|
|
|
| 229 |
callback(msg="Start to generate questions for every chunk ...")
|
| 230 |
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
|
| 231 |
for d in docs:
|
|
@@ -236,6 +239,7 @@ def build(row):
|
|
| 236 |
d["content_ltks"] += " " + qst
|
| 237 |
if "content_sm_ltks" in d:
|
| 238 |
d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
|
|
|
|
| 239 |
|
| 240 |
return docs
|
| 241 |
|
|
@@ -364,8 +368,8 @@ def main():
|
|
| 364 |
# TODO: exception handler
|
| 365 |
## set_progress(r["did"], -1, "ERROR: ")
|
| 366 |
callback(
|
| 367 |
-
msg="Finished slicing files(
|
| 368 |
-
|
| 369 |
st = timer()
|
| 370 |
try:
|
| 371 |
tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
|
|
@@ -374,7 +378,7 @@ def main():
|
|
| 374 |
logger.exception("run_rembedding got exception")
|
| 375 |
tk_count = 0
|
| 376 |
logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
| 377 |
-
callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))
|
| 378 |
|
| 379 |
# logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
|
| 380 |
init_kb(r, vector_size)
|
|
@@ -396,6 +400,7 @@ def main():
|
|
| 396 |
if TaskService.do_cancel(r["id"]):
|
| 397 |
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
|
| 398 |
continue
|
|
|
|
| 399 |
callback(1., "Done!")
|
| 400 |
DocumentService.increment_chunk_num(
|
| 401 |
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|
|
|
|
| 218 |
logger.info("MINIO PUT({}):{}".format(row["name"], el))
|
| 219 |
|
| 220 |
if row["parser_config"].get("auto_keywords", 0):
|
| 221 |
+
st = timer()
|
| 222 |
callback(msg="Start to generate keywords for every chunk ...")
|
| 223 |
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
|
| 224 |
for d in docs:
|
| 225 |
d["important_kwd"] = keyword_extraction(chat_mdl, d["content_with_weight"],
|
| 226 |
row["parser_config"]["auto_keywords"]).split(",")
|
| 227 |
d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
|
| 228 |
+
callback(msg="Keywords generation completed in {:.2f}s".format(timer()-st))
|
| 229 |
|
| 230 |
if row["parser_config"].get("auto_questions", 0):
|
| 231 |
+
st = timer()
|
| 232 |
callback(msg="Start to generate questions for every chunk ...")
|
| 233 |
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
|
| 234 |
for d in docs:
|
|
|
|
| 239 |
d["content_ltks"] += " " + qst
|
| 240 |
if "content_sm_ltks" in d:
|
| 241 |
d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
|
| 242 |
+
callback(msg="Question generation completed in {:.2f}s".format(timer()-st))
|
| 243 |
|
| 244 |
return docs
|
| 245 |
|
|
|
|
| 368 |
# TODO: exception handler
|
| 369 |
## set_progress(r["did"], -1, "ERROR: ")
|
| 370 |
callback(
|
| 371 |
+
msg="Finished slicing files ({} chunks in {:.2f}s). Start to embedding the content.".format(len(cks), timer() - st)
|
| 372 |
+
)
|
| 373 |
st = timer()
|
| 374 |
try:
|
| 375 |
tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
|
|
|
|
| 378 |
logger.exception("run_rembedding got exception")
|
| 379 |
tk_count = 0
|
| 380 |
logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
| 381 |
+
callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st))
|
| 382 |
|
| 383 |
# logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
|
| 384 |
init_kb(r, vector_size)
|
|
|
|
| 400 |
if TaskService.do_cancel(r["id"]):
|
| 401 |
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
|
| 402 |
continue
|
| 403 |
+
callback(msg="Indexing elapsed in {:.2f}s.".format(timer() - st))
|
| 404 |
callback(1., "Done!")
|
| 405 |
DocumentService.increment_chunk_num(
|
| 406 |
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|