Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -1,17 +1,109 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
def save_pdf_to_directory(uploaded_file, directory):
    """Persist an uploaded PDF into *directory*, creating the folder if needed.

    Args:
        uploaded_file: File-like upload object exposing ``.name`` and
            ``.getbuffer()`` (e.g. a Streamlit UploadedFile). ``None`` is
            a no-op.
        directory: Target folder path for the saved file.
    """
    if uploaded_file is None:
        return

    # Ensure the destination folder exists before writing.
    os.makedirs(directory, exist_ok=True)

    # Write the uploaded bytes to disk under the original file name.
    target_path = os.path.join(directory, uploaded_file.name)
    with open(target_path, "wb") as out_file:
        out_file.write(uploaded_file.getbuffer())
|
| 14 |
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
|
|
|
|
|
|
|
|
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
| 3 |
+
from pypdf import PdfReader
|
| 4 |
+
from langchain.schema import Document
|
| 5 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 6 |
+
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 7 |
+
from pinecone import Pinecone as PineconeClient
|
| 8 |
+
from langchain.chains.question_answering import load_qa_chain
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from langchain_community.vectorstores import Pinecone
|
| 11 |
import os
|
| 12 |
+
import time
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def get_pdf_text(pdf_doc):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_doc: A path or binary file-like object readable by pypdf's
            ``PdfReader``.

    Returns:
        str: Concatenated text of all pages; empty string when the PDF
        has no extractable text.
    """
    pdf_reader = PdfReader(pdf_doc)
    # extract_text() can yield ""/None for pages without a text layer
    # (older pypdf versions return None); guard so the concatenation
    # never raises TypeError on such pages.
    return "".join((page.extract_text() or "") for page in pdf_reader.pages)
|
| 21 |
+
|
| 22 |
+
def create_docs(user_pdf_list, unique_id):
    """Wrap each uploaded PDF's full text in a langchain ``Document``.

    Args:
        user_pdf_list: Iterable of uploaded file objects, each exposing
            ``.name``, ``.type`` and ``.size`` (e.g. Streamlit
            UploadedFile).
        unique_id: Session identifier stored in every document's metadata
            so this upload batch can be traced later.

    Returns:
        list[Document]: One Document per input PDF.
    """
    docs = []
    for pdf_file in user_pdf_list:
        text = get_pdf_text(pdf_file)
        docs.append(Document(
            page_content=text,
            metadata={
                "name": pdf_file.name,
                # Bug fix: the original key was "type=" (stray '='), so
                # the file type was stored under a malformed key.
                "type": pdf_file.type,
                "size": pdf_file.size,
                "unique_id": unique_id,
                "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            },
        ))
    return docs
|
| 34 |
+
|
| 35 |
+
# transform documents
|
| 36 |
+
def split_docs(documents, chunk_size=400, chunk_overlap=20):
    """Split documents into overlapping character chunks for embedding.

    Args:
        documents: Sequence of langchain Documents.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        list[Document]: The chunked documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
|
| 41 |
+
|
| 42 |
+
def get_embeddings():
    """Construct and return the OpenAI embeddings client used by this module."""
    return OpenAIEmbeddings()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def push_to_pinecone(docs, embedding):
    """Replace the contents of the 'rag_bot' namespace with *docs*.

    Wipes every vector in the namespace, then embeds and upserts the
    given documents through langchain's Pinecone wrapper.

    Args:
        docs: Chunked langchain Documents to index.
        embedding: Embedding model used to vectorize the documents.

    Returns:
        The langchain ``Pinecone`` vector store wrapping the index.
    """
    pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))
    index_name = os.environ.get("PINECONE_INDEX_NAME")
    index = pc.Index(index_name)

    # Start from a clean namespace so stale documents never match queries.
    index.delete(delete_all=True, namespace='rag_bot')

    # NOTE(review): the original also built a `vector` list here by calling
    # embedding.embed_query() on every chunk, but never used it —
    # from_documents() below re-embeds everything itself. That dead loop
    # (and its duplicated embedding-API calls) has been removed.
    store = Pinecone.from_documents(
        docs, embedding, index_name=index_name, namespace='rag_bot'
    )

    st.sidebar.write("This 30 seconds delay is added Manually... \n(because I'm using some free resources)")
    # Deliberate pause before the index is queried (free-tier resources).
    time.sleep(30)

    return store
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
#Function to pull index data from Pinecone
|
| 73 |
+
def pull_from_pinecone(embeddings):
    """Open the existing Pinecone index ('rag_bot' namespace) as a vector store.

    Args:
        embeddings: Embedding model used to embed queries against the index.

    Returns:
        The langchain ``Pinecone`` vector store bound to the existing index.
    """
    api_key = os.environ.get("PINECONE_API_KEY")
    index_name = os.environ.get("PINECONE_INDEX_NAME")

    # Instantiate the Pinecone SDK client before opening the index.
    PineconeClient(
        api_key=api_key
    )

    # `Pinecone` is the langchain vector-store wrapper — see the imports
    # at the top of the file.
    return Pinecone.from_existing_index(index_name, embeddings, namespace='rag_bot')
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def get_similar_doc(query, embedding, k=2):
    """Return the *k* documents most similar to *query* from the vector store.

    Args:
        query: Natural-language search string.
        embedding: Embedding model used to embed the query.
        k: Number of documents to return.

    Returns:
        list[Document]: The matched documents (similarity scores dropped).
    """
    # NOTE(review): the original also created a raw PineconeClient/Index
    # here and immediately overwrote it with pull_from_pinecone(); that
    # dead setup has been removed.
    index = pull_from_pinecone(embeddings=embedding)
    scored_docs = index.similarity_search_with_score(query, int(k))
    return [doc for doc, _score in scored_docs]
|
| 100 |
+
|
| 101 |
+
|
| 102 |
|
| 103 |
+
def get_answer(query, embedding, k=2):
    """Answer *query* with an LLM grounded on the *k* most relevant documents.

    Args:
        query: The user's question.
        embedding: Embedding model used for retrieval.
        k: Number of supporting documents to retrieve.

    Returns:
        The chain's response to the question.
    """
    llm = ChatOpenAI(temperature=0.5)
    chain = load_qa_chain(llm, chain_type="stuff")

    # Bug fix: the original hard-coded k=2 in this call, silently
    # ignoring the caller-supplied k parameter.
    relevant_docs = get_similar_doc(query, embedding, k=k)
    response = chain.run(input_documents=relevant_docs, question=query)
    return response
|