DocTalk

Runtime error

App Files Files Community

Raghav001 committed on Apr 10, 2023

Commit

ab98b8b

1 Parent(s): b616209

initial

Browse files

Translation

Files changed (1) hide show

app.py +12 -12

app.py CHANGED Viewed

@@ -44,7 +44,7 @@ def doc_emb(doc: str):
     #     emb_list.append(f.result())
     print('\n'.join(texts))
     return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
-        value="""操作说明 step 3：PDF解析提交成功！ 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
 def get_response(msg, bot, doc_text_list, doc_embeddings):
@@ -71,7 +71,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
             break
         index_set.add(s_i[1])
         now_len += len(doc)
-        # 可能段落截断错误，所以把上下段也加入进来
         if s_i[1] > 0 and s_i[1] -1 not in index_set:
             doc = doc_text_list[s_i[1]-1]
             if now_len + len(doc) > all_max_len:
@@ -107,12 +107,12 @@ def up_file(files):
         print(file.name)
         with pdfplumber.open(file.name) as pdf:
             for i in range(len(pdf.pages)):
-                # 读取PDF文档第i+1页
                 page = pdf.pages[i]
                 res_list = page.extract_text().split('\n')[:-1]
                 for j in range(len(page.images)):
-                    # 获取图片的二进制流
                     img = page.images[j]
                     file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
                     with open(file_name, mode='wb') as f:
@@ -126,7 +126,7 @@ def up_file(files):
                 tables = page.extract_tables()
                 for table in tables:
-                    # 第一列当成表头：
                     df = pd.DataFrame(table[1:], columns=table[0])
                     try:
                         records = json.loads(df.to_json(orient="records", force_ascii=False))
@@ -140,22 +140,22 @@ def up_file(files):
     print(doc_text_list)
     return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
         visible=True), gr.Markdown.update(
-        value="操作说明 step 2：确认PDF解析结果（可修正），点击“提交解析结果”，随后进行对话")
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            file = gr.File(file_types=['.pdf'], label='点击上传PDF，进行解析(支持多文档、表格、OCR)', file_count='multiple')
-            doc_bu = gr.Button(value='提交解析结果', visible=False)
-            txt = gr.Textbox(label='PDF解析结果', visible=False)
             doc_text_state = gr.State([])
             doc_emb_state = gr.State([])
         with gr.Column():
-            md = gr.Markdown("""操作说明 step 1：点击左侧区域，上传PDF，进行解析""")
             chat_bot = gr.Chatbot(visible=False)
-            msg_txt = gr.Textbox(label='消息框', placeholder='输入消息，点击发送', visible=False)
-            chat_bu = gr.Button(value='发送', visible=False)
     file.change(up_file, [file], [txt, doc_bu, md])
     doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])

     #     emb_list.append(f.result())
     print('\n'.join(texts))
     return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
+        value="""success ! Let's talk"""), gr.Chatbot.update(visible=True)
 def get_response(msg, bot, doc_text_list, doc_embeddings):
             break
         index_set.add(s_i[1])
         now_len += len(doc)
+       # The paragraph may have been split in the wrong place, so also add the preceding and following paragraphs
         if s_i[1] > 0 and s_i[1] -1 not in index_set:
             doc = doc_text_list[s_i[1]-1]
             if now_len + len(doc) > all_max_len:
         print(file.name)
         with pdfplumber.open(file.name) as pdf:
             for i in range(len(pdf.pages)):
+                # Read page i+1 of a PDF document
                 page = pdf.pages[i]
                 res_list = page.extract_text().split('\n')[:-1]
                 for j in range(len(page.images)):
+                   # Get the binary stream of the image
                     img = page.images[j]
                     file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
                     with open(file_name, mode='wb') as f:
                 tables = page.extract_tables()
                 for table in tables:
+                    # The first row is used as the header
                     df = pd.DataFrame(table[1:], columns=table[0])
                     try:
                         records = json.loads(df.to_json(orient="records", force_ascii=False))
     print(doc_text_list)
     return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
         visible=True), gr.Markdown.update(
+        value="Processing")
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
+            file = gr.File(file_types=['.pdf'], label='Click to upload Document', file_count='multiple')
+            doc_bu = gr.Button(value='Submit', visible=False)
+            txt = gr.Textbox(label='result', visible=False)
             doc_text_state = gr.State([])
             doc_emb_state = gr.State([])
         with gr.Column():
+            md = gr.Markdown("Please Upload the PDF")
             chat_bot = gr.Chatbot(visible=False)
+            msg_txt = gr.Textbox(label='Ask Questions', placeholder='write', visible=False)
+            chat_bu = gr.Button(value='Proceed', visible=False)
     file.change(up_file, [file], [txt, doc_bu, md])
     doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])