initial
Browse filesTranslation
app.py
CHANGED
|
@@ -44,7 +44,7 @@ def doc_emb(doc: str):
|
|
| 44 |
# emb_list.append(f.result())
|
| 45 |
print('\n'.join(texts))
|
| 46 |
return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
|
| 47 |
-
value="""
|
| 48 |
|
| 49 |
|
| 50 |
def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
@@ -71,7 +71,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
| 71 |
break
|
| 72 |
index_set.add(s_i[1])
|
| 73 |
now_len += len(doc)
|
| 74 |
-
|
| 75 |
if s_i[1] > 0 and s_i[1] -1 not in index_set:
|
| 76 |
doc = doc_text_list[s_i[1]-1]
|
| 77 |
if now_len + len(doc) > all_max_len:
|
|
@@ -107,12 +107,12 @@ def up_file(files):
|
|
| 107 |
print(file.name)
|
| 108 |
with pdfplumber.open(file.name) as pdf:
|
| 109 |
for i in range(len(pdf.pages)):
|
| 110 |
-
#
|
| 111 |
page = pdf.pages[i]
|
| 112 |
res_list = page.extract_text().split('\n')[:-1]
|
| 113 |
|
| 114 |
for j in range(len(page.images)):
|
| 115 |
-
|
| 116 |
img = page.images[j]
|
| 117 |
file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
|
| 118 |
with open(file_name, mode='wb') as f:
|
|
@@ -126,7 +126,7 @@ def up_file(files):
|
|
| 126 |
|
| 127 |
tables = page.extract_tables()
|
| 128 |
for table in tables:
|
| 129 |
-
#
|
| 130 |
df = pd.DataFrame(table[1:], columns=table[0])
|
| 131 |
try:
|
| 132 |
records = json.loads(df.to_json(orient="records", force_ascii=False))
|
|
@@ -140,22 +140,22 @@ def up_file(files):
|
|
| 140 |
print(doc_text_list)
|
| 141 |
return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
|
| 142 |
visible=True), gr.Markdown.update(
|
| 143 |
-
value="
|
| 144 |
|
| 145 |
|
| 146 |
with gr.Blocks() as demo:
|
| 147 |
with gr.Row():
|
| 148 |
with gr.Column():
|
| 149 |
-
file = gr.File(file_types=['.pdf'], label='
|
| 150 |
-
doc_bu = gr.Button(value='
|
| 151 |
-
txt = gr.Textbox(label='
|
| 152 |
doc_text_state = gr.State([])
|
| 153 |
doc_emb_state = gr.State([])
|
| 154 |
with gr.Column():
|
| 155 |
-
md = gr.Markdown("
|
| 156 |
chat_bot = gr.Chatbot(visible=False)
|
| 157 |
-
msg_txt = gr.Textbox(label='
|
| 158 |
-
chat_bu = gr.Button(value='
|
| 159 |
|
| 160 |
file.change(up_file, [file], [txt, doc_bu, md])
|
| 161 |
doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
|
|
|
|
| 44 |
# emb_list.append(f.result())
|
| 45 |
print('\n'.join(texts))
|
| 46 |
return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
|
| 47 |
+
value="""success ! Let's talk"""), gr.Chatbot.update(visible=True)
|
| 48 |
|
| 49 |
|
| 50 |
def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
|
|
| 71 |
break
|
| 72 |
index_set.add(s_i[1])
|
| 73 |
now_len += len(doc)
|
| 74 |
+
# The paragraph may have been split in the wrong place, so also add the paragraphs above and below it
|
| 75 |
if s_i[1] > 0 and s_i[1] -1 not in index_set:
|
| 76 |
doc = doc_text_list[s_i[1]-1]
|
| 77 |
if now_len + len(doc) > all_max_len:
|
|
|
|
| 107 |
print(file.name)
|
| 108 |
with pdfplumber.open(file.name) as pdf:
|
| 109 |
for i in range(len(pdf.pages)):
|
| 110 |
+
# Read page i+1 of the PDF document (i is zero-based)
|
| 111 |
page = pdf.pages[i]
|
| 112 |
res_list = page.extract_text().split('\n')[:-1]
|
| 113 |
|
| 114 |
for j in range(len(page.images)):
|
| 115 |
+
# Get the binary stream of the image
|
| 116 |
img = page.images[j]
|
| 117 |
file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
|
| 118 |
with open(file_name, mode='wb') as f:
|
|
|
|
| 126 |
|
| 127 |
tables = page.extract_tables()
|
| 128 |
for table in tables:
|
| 129 |
+
# The first row is used as the header
|
| 130 |
df = pd.DataFrame(table[1:], columns=table[0])
|
| 131 |
try:
|
| 132 |
records = json.loads(df.to_json(orient="records", force_ascii=False))
|
|
|
|
| 140 |
print(doc_text_list)
|
| 141 |
return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
|
| 142 |
visible=True), gr.Markdown.update(
|
| 143 |
+
value="Processing")
|
| 144 |
|
| 145 |
|
| 146 |
with gr.Blocks() as demo:
|
| 147 |
with gr.Row():
|
| 148 |
with gr.Column():
|
| 149 |
+
file = gr.File(file_types=['.pdf'], label='Click to upload Document', file_count='multiple')
|
| 150 |
+
doc_bu = gr.Button(value='Submit', visible=False)
|
| 151 |
+
txt = gr.Textbox(label='result', visible=False)
|
| 152 |
doc_text_state = gr.State([])
|
| 153 |
doc_emb_state = gr.State([])
|
| 154 |
with gr.Column():
|
| 155 |
+
md = gr.Markdown("Please Upload the PDF")
|
| 156 |
chat_bot = gr.Chatbot(visible=False)
|
| 157 |
+
msg_txt = gr.Textbox(label='Ask Questions', placeholder='write', visible=False)
|
| 158 |
+
chat_bu = gr.Button(value='Proceed', visible=False)
|
| 159 |
|
| 160 |
file.change(up_file, [file], [txt, doc_bu, md])
|
| 161 |
doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
|