# WORD / app.py
# Raghav001's picture
# Update app.py
# 2aa4073
import json
import os
import time

import gradio as gr
import langchain
import openai
import pandas as pd
import pdfplumber
import pinecone
import requests
from langchain import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import UnstructuredPowerPointLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer, models, util
# Sentence-embedding model used by doc_emb (document lines) and get_response
# (user questions).
# NOTE(review): the pooling layer is configured with pooling_mode='cls';
# confirm that this is the pooling all-MiniLM-L6-v2 is meant to be used with.
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2', do_lower_case=True)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Remote endpoints and the headers sent with every JSON POST.
chat_url = 'https://Raghav001-API.hf.space/chatpdf'
chat_emd = 'https://Raghav001-API.hf.space/embedd'
headers = {
    'Content-Type': 'application/json',
}

# Character budgets used by get_response: history_max_len caps the question
# plus the chat history that is forwarded; all_max_len caps that total plus
# the document excerpts.
history_max_len = 500
all_max_len = 3000

# Initialize the Pinecone client and obtain a handle on the 'test' index.
# Credentials are read from the environment first so deployments do not need
# to edit this file.
# NOTE(review): the literal fallbacks are kept only so existing deployments
# keep working; a key committed to source should be rotated and the fallback
# removed.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY', 'd0a5b89b-b901-4b47-bc99-38b93695390d'),
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'asia-southeast1-gcp'),
)
index = pinecone.Index(index_name='test')
def get_emb(text):
    """Fetch the embedding for ``text`` from the remote embeddings endpoint.

    Posts ``{"content": text}`` as JSON and reads ``['data'][0]['embedding']``
    from the JSON reply.

    Args:
        text: Content to embed; it must be JSON-serialisable.

    Returns:
        The value found at ``['data'][0]['embedding']`` in the reply, or
        ``None`` when the request fails or the reply does not have that shape.
    """
    emb_url = 'https://Raghav001-API.hf.space/embeddings'
    data = {"content": text}
    # Pre-bind so the failure path can distinguish "no reply at all" from
    # "reply received but unusable" without touching an unbound name.
    result = None
    try:
        result = requests.post(url=emb_url,
                               data=json.dumps(data),
                               headers=headers
                               )
        embedding = result.json()['data'][0]['embedding']
    except Exception as e:
        # Best-effort: report the problem and return None. The raw body is
        # logged instead of re-parsing JSON, because parsing may be exactly
        # what failed.
        reply = result.text if result is not None else None
        print('data', data, 'error', repr(e), 'result', reply)
        return None
    print("--------------------------------Embeddings-----------------------------------")
    print(embedding)
    return embedding
def doc_emb(doc: str):
    """Embed every line of the extracted document and reveal the chat widgets.

    Args:
        doc: Extracted document text; it is split on newline characters.

    Returns:
        A 6-tuple matching the outputs wired to the Submit button: the list
        of lines, their embeddings from ``embedder.encode``, and Gradio
        updates that show the message box, show the button, set the status
        text, and show the chatbot.
    """
    lines = doc.split('\n')
    vectors = embedder.encode(lines)
    print('emb_list', vectors)
    print('\n'.join(lines))
    # The value built here is discarded; the call is kept as in the original.
    gr.Textbox.update(value="")
    show_message_box = gr.Textbox.update(visible=True)
    show_button = gr.Button.update(visible=True)
    status_text = gr.Markdown.update(
        value="""success ! Let's talk""")
    show_chatbot = gr.Chatbot.update(visible=True)
    return lines, vectors, show_message_box, show_button, status_text, show_chatbot
def get_response(msg, bot, doc_text_list, doc_embeddings):
    """Answer ``msg`` using recent chat history and the best-matching lines.

    The request sent to ``chat_url`` carries the question, as much of the
    most recent history as fits in ``history_max_len`` characters, and the
    document lines most similar to the question (plus their immediate
    neighbours) as long as the running total stays within ``all_max_len``.

    Args:
        msg: The user's question.
        bot: Chat history as a list of ``[question, answer]`` pairs. The new
            exchange is appended to this list in place.
        doc_text_list: Document lines, indexable by position.
        doc_embeddings: Embeddings compared against the question embedding;
            position ``i`` is treated as belonging to ``doc_text_list[i]``.

    Returns:
        The last three exchanges of ``bot`` (fewer if the history is shorter).
    """
    now_len = len(msg)
    req_json = {'question': msg}

    # Walk the history from newest to oldest and keep the longest recent
    # suffix whose length, together with the question, fits the budget.
    his_bg = -1
    for i in range(len(bot) - 1, -1, -1):
        turn_len = len(bot[i][0]) + len(bot[i][1])
        if now_len + turn_len > history_max_len:
            break
        now_len += turn_len
        his_bg = i
    req_json['history'] = [] if his_bg == -1 else bot[his_bg:]

    # Rank every document line by cosine similarity to the question.
    query_embedding = embedder.encode([msg])
    cos_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
    score_index = [[score, position] for position, score in enumerate(cos_scores)]
    score_index.sort(key=lambda x: x[0], reverse=True)
    print('score_index:\n', score_index)
    # Report on the embeddings actually passed in, not on the module-level
    # gr.State component that happens to share a similar name.
    print('doc_emb_state', len(doc_embeddings))

    ranked_positions = [position for _, position in score_index]
    index_list = _select_doc_indices(ranked_positions, doc_text_list, now_len, all_max_len)
    # Joining an empty selection yields '', so no special case is needed.
    req_json['doc'] = '\n'.join(doc_text_list[i] for i in index_list)

    data = {"content": json.dumps(req_json)}
    print('data:\n', req_json)
    result = requests.post(url=chat_url,
                           data=json.dumps(data),
                           headers=headers
                           )
    res = result.json()['content']
    bot.append([msg, res])
    return bot[max(0, len(bot) - 3):]


def _select_doc_indices(ranked_positions, doc_text_list, used_len, max_len):
    """Pick document positions, best match first, within a character budget.

    For each ranked position the line itself is taken, then the line above
    and the line below, because a paragraph may have been split in the wrong
    place. A line that is already selected is skipped and is not charged to
    the budget again. Selection stops at the first line that would push the
    total past ``max_len``.

    Returns:
        The selected positions in ascending (document) order.
    """
    chosen = set()
    for position in ranked_positions:
        for candidate in (position, position - 1, position + 1):
            if candidate < 0 or candidate >= len(doc_text_list):
                continue
            if candidate in chosen:
                continue
            doc = doc_text_list[candidate]
            if used_len + len(doc) > max_len:
                return sorted(chosen)
            chosen.add(candidate)
            used_len += len(doc)
    return sorted(chosen)
def up_file(fls):
    """Extract the text of the uploaded Word documents for the result box.

    Only files whose name ends in ``.docx`` (case-insensitive) are loaded;
    any other upload is ignored.

    Args:
        fls: The uploaded files, each exposing a ``name`` attribute that is
            passed to the loader as a path. ``None`` or an empty list (for
            example after the upload is cleared) yields empty text.

    Returns:
        A 3-tuple of Gradio updates for the outputs wired to the upload
        widget: the result textbox (filled with the extracted text and made
        visible), the Submit button (made visible) and the status Markdown.
    """
    names = [str(f.name) for f in (fls or [])]
    print(names)
    docs = [name for name in names if name.lower().endswith(".docx")]

    # Doc extracting: take the text of each loaded Document. Stringifying
    # the loader's return value would put the Document repr (with escaped
    # newlines and metadata) into the textbox, not the text itself.
    doc_text_list = []
    for path in docs:
        loader = UnstructuredWordDocumentLoader(path)
        for document in loader.load():
            text = str(document.page_content).strip()
            if text:
                doc_text_list.append(text)

    return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
        visible=True), gr.Markdown.update(
        value="Processing")
def get_answer(query_live):
    """Answer ``query_live`` with a LangChain "stuff" question-answering chain.

    Args:
        query_live: The user's question.

    Returns:
        Whatever ``qa_chain.run`` produces for the retrieved documents and
        the question.
    """
    # The credential is passed through the wrapper's ``openai_api_key``
    # keyword; 'aaa' is the placeholder value already present in this file.
    llm = OpenAI(temperature=0, openai_api_key='aaa')
    qa_chain = load_qa_chain(llm, chain_type='stuff')
    # NOTE(review): ``docstore`` is not defined in the visible part of this
    # file; it must be a vector store created elsewhere before this is called.
    docs = docstore.similarity_search(query_live)
    return qa_chain.run(input_documents=docs, question=query_live)
# UI layout: an upload/extract column next to a chat column. Most widgets
# start hidden and are revealed by the handlers as the user progresses.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            file = gr.File(file_types=['.docx'], label='Click to upload Document', file_count='multiple')
            doc_bu = gr.Button(value='Submit', visible=False)
            txt = gr.Textbox(label='result', visible=False)
            # Per-session storage for the document lines and their embeddings.
            doc_text_state = gr.State([])
            doc_emb_state = gr.State([])
        with gr.Column():
            # The uploader only accepts .docx, so the prompt says so.
            md = gr.Markdown("Please upload a Word document (.docx)")
            chat_bot = gr.Chatbot(visible=False)
            msg_txt = gr.Textbox(visible=False)
            chat_bu = gr.Button(value='Clear', visible=False)

    # Upload -> extract text, show it in the result box and reveal Submit.
    file.change(up_file, [file], [txt, doc_bu, md])
    # Submit -> embed the text, store it in the states and reveal the chat.
    doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
    # Enter in the message box -> ask the question and refresh the chatbot.
    msg_txt.submit(get_response, [msg_txt, chat_bot, doc_text_state, doc_emb_state], [chat_bot], queue=False)
    # Clear -> reset the chatbot value to None.
    chat_bu.click(lambda: None, None, chat_bot, queue=False)

if __name__ == "__main__":
    demo.queue().launch(show_api=False)
    # demo.queue().launch(share=False, server_name='172.22.2.54', server_port=9191)