stephenz007 commited on
Commit
0fdadb1
Β·
1 Parent(s): 7af6287

initial commit

Browse files
Files changed (2) hide show
  1. app.py +204 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import gradio as gr
4
+ # from concurrent.futures import ThreadPoolExecutor
5
+ import pdfplumber
6
+ import pandas as pd
7
+ import langchain
8
+ import time
9
+ from cnocr import CnOcr
10
+
11
+ # from langchain.document_loaders import PyPDFLoader
12
+ from langchain.document_loaders import UnstructuredWordDocumentLoader
13
+ from sentence_transformers import SentenceTransformer, models, util
14
+ word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2', do_lower_case=True)
15
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
16
+ embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
17
+ ocr = CnOcr()
18
+ # chat_url = 'https://souljoy-my-api.hf.space/sale'
19
+ chat_url = 'https://souljoy-my-api.hf.space/chatpdf'
20
+ headers = {
21
+ 'Content-Type': 'application/json',
22
+ }
23
+ # thread_pool_executor = ThreadPoolExecutor(max_workers=4)
24
+ history_max_len = 500
25
+ all_max_len = 3000
26
+
27
+
28
+ def get_emb(text):
29
+ emb_url = 'https://souljoy-my-api.hf.space/embeddings'
30
+ data = {"content": text}
31
+ try:
32
+ result = requests.post(url=emb_url,
33
+ data=json.dumps(data),
34
+ headers=headers
35
+ )
36
+ return result.json()['data'][0]['embedding']
37
+ except Exception as e:
38
+ print('data', data, 'result json', result.json())
39
+
40
+
41
+ def doc_emb(doc: str):
42
+ texts = doc.split('\n')
43
+ # futures = []
44
+ emb_list = embedder.encode(texts)
45
+ # for text in texts:
46
+ # futures.append(thread_pool_executor.submit(get_emb, text))
47
+ # for f in futures:
48
+ # emb_list.append(f.result())
49
+ print('\n'.join(texts))
50
+ gr.Textbox.update(value="")
51
+ return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
52
+ value="""success ! Let's talk"""), gr.Chatbot.update(visible=True)
53
+
54
+
55
+ def get_response(msg, bot, doc_text_list, doc_embeddings):
56
+ # future = thread_pool_executor.submit(get_emb, msg)
57
+ gr.Textbox.update(value="")
58
+ now_len = len(msg)
59
+ req_json = {'question': msg}
60
+ his_bg = -1
61
+ for i in range(len(bot) - 1, -1, -1):
62
+ if now_len + len(bot[i][0]) + len(bot[i][1]) > history_max_len:
63
+ break
64
+ now_len += len(bot[i][0]) + len(bot[i][1])
65
+ his_bg = i
66
+ req_json['history'] = [] if his_bg == -1 else bot[his_bg:]
67
+ # query_embedding = future.result()
68
+ query_embedding = embedder.encode([msg])
69
+ cos_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
70
+ score_index = [[score, index] for score, index in zip(cos_scores, [i for i in range(len(cos_scores))])]
71
+ score_index.sort(key=lambda x: x[0], reverse=True)
72
+ print('score_index:\n', score_index)
73
+ index_set, sub_doc_list = set(), []
74
+ for s_i in score_index:
75
+ doc = doc_text_list[s_i[1]]
76
+ if now_len + len(doc) > all_max_len:
77
+ break
78
+ index_set.add(s_i[1])
79
+ now_len += len(doc)
80
+ # Maybe the paragraph is truncated wrong, so add the upper and lower paragraphs
81
+ if s_i[1] > 0 and s_i[1] -1 not in index_set:
82
+ doc = doc_text_list[s_i[1]-1]
83
+ if now_len + len(doc) > all_max_len:
84
+ break
85
+ index_set.add(s_i[1]-1)
86
+ now_len += len(doc)
87
+ if s_i[1] + 1 < len(doc_text_list) and s_i[1] + 1 not in index_set:
88
+ doc = doc_text_list[s_i[1]+1]
89
+ if now_len + len(doc) > all_max_len:
90
+ break
91
+ index_set.add(s_i[1]+1)
92
+ now_len += len(doc)
93
+
94
+ index_list = list(index_set)
95
+ index_list.sort()
96
+ for i in index_list:
97
+ sub_doc_list.append(doc_text_list[i])
98
+ req_json['doc'] = '' if len(sub_doc_list) == 0 else '\n'.join(sub_doc_list)
99
+ data = {"content": json.dumps(req_json)}
100
+ print('data:\n', req_json)
101
+ result = requests.post(url=chat_url,
102
+ data=json.dumps(data),
103
+ headers=headers
104
+ )
105
+ res = result.json()['content']
106
+ bot.append([msg, res])
107
+ return bot[max(0, len(bot) - 3):]
108
+
109
+
110
+ def up_file(fls):
111
+ doc_text_list = []
112
+ names = []
113
+
114
+ for i in fls:
115
+ names.append(str(i.name))
116
+
117
+ files = []
118
+ docs = []
119
+ for i in names:
120
+
121
+ if(i[-3:] == "pdf"):
122
+ files.append(i)
123
+ else:
124
+ docs.append(i)
125
+
126
+ for i in docs:
127
+ loader = UnstructuredWordDocumentLoader(i, mode="elements")
128
+ data = loader.load()
129
+ extracted = data[1]
130
+ doc_text_list.append(extracted)
131
+
132
+
133
+
134
+ for idx, file in enumerate(files):
135
+ print("11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111")
136
+ #print(file.name)
137
+ with pdfplumber.open(file) as pdf:
138
+ for i in range(len(pdf.pages)):
139
+ # Read page i+1 of a PDF document
140
+ page = pdf.pages[i]
141
+ res_list = page.extract_text().split('\n')[:-1]
142
+
143
+ for j in range(len(page.images)):
144
+ # Get the binary stream of the image
145
+ img = page.images[j]
146
+ file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
147
+ with open(file_name, mode='wb') as f:
148
+ f.write(img['stream'].get_data())
149
+ try:
150
+ res = ocr.ocr(file_name)
151
+ # res = PyPDFLoader(file_name)
152
+ except Exception as e:
153
+ res = []
154
+ if len(res) > 0:
155
+ res_list.append(' '.join([re['text'] for re in res]))
156
+
157
+ tables = page.extract_tables()
158
+ for table in tables:
159
+ # The first column is used as the header
160
+ df = pd.DataFrame(table[1:], columns=table[0])
161
+ try:
162
+ records = json.loads(df.to_json(orient="records", force_ascii=False))
163
+ for rec in records:
164
+ res_list.append(json.dumps(rec, ensure_ascii=False))
165
+ except Exception as e:
166
+ res_list.append(str(df))
167
+
168
+ doc_text_list += res_list
169
+ doc_text_list = [str(text).strip() for text in doc_text_list if len(str(text).strip()) > 0]
170
+ # print(doc_text_list)
171
+ return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
172
+ visible=True), gr.Markdown.update(
173
+ value="Processing")
174
+
175
+
176
+
177
+
178
+
179
+ with gr.Blocks(css=".gradio-container {background-color: #f7f377}, footer {visibility: hidden}") as demo:
180
+ with gr.Row():
181
+ with gr.Column():
182
+ file = gr.File(file_types=['.docx','.pdf'], label='Click to upload Document', file_count='multiple')
183
+ doc_bu = gr.Button(value='Submit', visible=False)
184
+
185
+
186
+ txt = gr.Textbox(label='result', visible=False)
187
+
188
+
189
+ doc_text_state = gr.State([])
190
+ doc_emb_state = gr.State([])
191
+ with gr.Column():
192
+ md = gr.Markdown("Please Upload the PDF")
193
+ chat_bot = gr.Chatbot(visible=False)
194
+ msg_txt = gr.Textbox(visible = False)
195
+ chat_bu = gr.Button(value='Clear', visible=False)
196
+
197
+ file.change(up_file, [file], [txt, doc_bu, md]) #hiding the text
198
+ doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
199
+ msg_txt.submit(get_response, [msg_txt, chat_bot,doc_text_state, doc_emb_state], [chat_bot],queue=False)
200
+ chat_bu.click(lambda: None, None, chat_bot, queue=False)
201
+
202
+ if __name__ == "__main__":
203
+ demo.queue().launch(show_api=False)
204
+ # demo.queue().launch(share=False, server_name='172.22.2.54', server_port=9191)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
pdfplumber
sentence_transformers
cnocr
langchain
unstructured
# The following are imported by app.py but were missing from the list:
gradio
requests
pandas