Raghav001 committed on
Commit
ab98b8b
·
1 Parent(s): b616209

Translation

Files changed (1) hide show
  1. app.py +12 -12
app.py CHANGED
@@ -44,7 +44,7 @@ def doc_emb(doc: str):
44
  # emb_list.append(f.result())
45
  print('\n'.join(texts))
46
  return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
47
- value="""操作说明 step 3:PDF解析提交成功! 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
48
 
49
 
50
  def get_response(msg, bot, doc_text_list, doc_embeddings):
@@ -71,7 +71,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
71
  break
72
  index_set.add(s_i[1])
73
  now_len += len(doc)
74
- # 可能段落截断错误,所以把上下段也加入进来
75
  if s_i[1] > 0 and s_i[1] -1 not in index_set:
76
  doc = doc_text_list[s_i[1]-1]
77
  if now_len + len(doc) > all_max_len:
@@ -107,12 +107,12 @@ def up_file(files):
107
  print(file.name)
108
  with pdfplumber.open(file.name) as pdf:
109
  for i in range(len(pdf.pages)):
110
- # 读取PDF文档第i+1
111
  page = pdf.pages[i]
112
  res_list = page.extract_text().split('\n')[:-1]
113
 
114
  for j in range(len(page.images)):
115
- # 获取图片的二进制流
116
  img = page.images[j]
117
  file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
118
  with open(file_name, mode='wb') as f:
@@ -126,7 +126,7 @@ def up_file(files):
126
 
127
  tables = page.extract_tables()
128
  for table in tables:
129
- # 第一列当成表头:
130
  df = pd.DataFrame(table[1:], columns=table[0])
131
  try:
132
  records = json.loads(df.to_json(orient="records", force_ascii=False))
@@ -140,22 +140,22 @@ def up_file(files):
140
  print(doc_text_list)
141
  return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
142
  visible=True), gr.Markdown.update(
143
- value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交解析结果”,随后进行对话")
144
 
145
 
146
  with gr.Blocks() as demo:
147
  with gr.Row():
148
  with gr.Column():
149
- file = gr.File(file_types=['.pdf'], label='点击上传PDF,进行解析(支持多文档、表格、OCR)', file_count='multiple')
150
- doc_bu = gr.Button(value='提交解析结果', visible=False)
151
- txt = gr.Textbox(label='PDF解析结果', visible=False)
152
  doc_text_state = gr.State([])
153
  doc_emb_state = gr.State([])
154
  with gr.Column():
155
- md = gr.Markdown("""操作说明 step 1:点击左侧区域,上传PDF,进行解析""")
156
  chat_bot = gr.Chatbot(visible=False)
157
- msg_txt = gr.Textbox(label='消息框', placeholder='输入消息,点击发送', visible=False)
158
- chat_bu = gr.Button(value='发送', visible=False)
159
 
160
  file.change(up_file, [file], [txt, doc_bu, md])
161
  doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
 
44
  # emb_list.append(f.result())
45
  print('\n'.join(texts))
46
  return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
47
+ value="""success ! Let's talk"""), gr.Chatbot.update(visible=True)
48
 
49
 
50
  def get_response(msg, bot, doc_text_list, doc_embeddings):
 
71
  break
72
  index_set.add(s_i[1])
73
  now_len += len(doc)
74
+ # Paragraph boundaries may have been cut in the wrong place, so also include the preceding and following paragraphs
75
  if s_i[1] > 0 and s_i[1] -1 not in index_set:
76
  doc = doc_text_list[s_i[1]-1]
77
  if now_len + len(doc) > all_max_len:
 
107
  print(file.name)
108
  with pdfplumber.open(file.name) as pdf:
109
  for i in range(len(pdf.pages)):
110
+ # Read page i+1 of a PDF document
111
  page = pdf.pages[i]
112
  res_list = page.extract_text().split('\n')[:-1]
113
 
114
  for j in range(len(page.images)):
115
+ # Get the binary stream of the image
116
  img = page.images[j]
117
  file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
118
  with open(file_name, mode='wb') as f:
 
126
 
127
  tables = page.extract_tables()
128
  for table in tables:
129
+ # The first row is used as the header
130
  df = pd.DataFrame(table[1:], columns=table[0])
131
  try:
132
  records = json.loads(df.to_json(orient="records", force_ascii=False))
 
140
  print(doc_text_list)
141
  return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
142
  visible=True), gr.Markdown.update(
143
+ value="Processing")
144
 
145
 
146
  with gr.Blocks() as demo:
147
  with gr.Row():
148
  with gr.Column():
149
+ file = gr.File(file_types=['.pdf'], label='Click to upload Document', file_count='multiple')
150
+ doc_bu = gr.Button(value='Submit', visible=False)
151
+ txt = gr.Textbox(label='result', visible=False)
152
  doc_text_state = gr.State([])
153
  doc_emb_state = gr.State([])
154
  with gr.Column():
155
+ md = gr.Markdown("Please Upload the PDF")
156
  chat_bot = gr.Chatbot(visible=False)
157
+ msg_txt = gr.Textbox(label='Ask Questions', placeholder='write', visible=False)
158
+ chat_bu = gr.Button(value='Proceed', visible=False)
159
 
160
  file.change(up_file, [file], [txt, doc_bu, md])
161
  doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])