# chatpdf_app/utils.py
import os
import tempfile

import openai
import pinecone
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
# Load the embedding model once and cache it across Streamlit reruns.
@st.cache_resource()
def get_embeddings_model():
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return model, embeddings


model, embeddings = get_embeddings_model()
def ingest(
    uploaded_document,
    pinecone_api_key,
    pinecone_env,
    pinecone_index_namespace,
    chunk_size=500,
    chunk_overlap=20,
):
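    """Chunk an uploaded PDF, embed the chunks, and upsert them to a Pinecone
    index.

    `uploaded_document` is expected to be a Streamlit UploadedFile (e.g. from
    st.file_uploader); chunk_size and chunk_overlap are measured in characters,
    as used by RecursiveCharacterTextSplitter.
    """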
    # Streamlit's UploadedFile lives in memory; write it to a temporary file
    # so PyPDFLoader can read it from disk.
    with tempfile.NamedTemporaryFile(delete=False) as tf:
        tf.write(uploaded_document.getbuffer())
        file_path = tf.name

    loader = PyPDFLoader(file_path)
    documents = loader.load()
    os.remove(file_path)  # remove the temporary copy once it has been parsed

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)

    pinecone.init(
        api_key=pinecone_api_key,
        environment=pinecone_env,
    )
    index_name = pinecone_index_namespace

    try:
        Pinecone.from_documents(docs, embeddings, index_name=index_name)
        st.success('Document uploaded to Pinecone database successfully')
    except Exception as error_message:
        st.error(error_message)
# Alternative: store the embeddings on the local hard disk with Chroma
# instead of Pinecone:
#   # !pip install chromadb
#   from langchain.vectorstores import Chroma
#   persist_directory = './data/embeddings'
#   vStore = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)
def query_refiner(conversation, query):
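    """Rewrite the latest user query into a standalone retrieval question.

    Uses the legacy OpenAI completions API with text-davinci-003; assumes
    openai.api_key has been configured elsewhere in the app.
    """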
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation}\n\nQuery: {query}\n\nRefined Query:",
        temperature=0.7,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response['choices'][0]['text']
def find_match(query_text, pinecone_api_key, pinecone_env, pinecone_index_namespace):
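    """Embed the query with the cached SentenceTransformer model and return
    the text of the two best-matching chunks from the Pinecone index."""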
    pinecone.init(
        api_key=pinecone_api_key,
        environment=pinecone_env,
    )
    index = pinecone.Index(pinecone_index_namespace)
    # Encode the query with the same model used at ingestion time.
    input_em = model.encode(query_text).tolist()
    result = index.query(input_em, top_k=2, include_metadata=True)
    # Concatenate the stored text of the two closest chunks.
    return result['matches'][0]['metadata']['text'] + "\n" + result['matches'][1]['metadata']['text']
def get_conversation_string():
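    """Flatten the Streamlit chat history into a "Human:/Bot:" transcript.

    Assumes st.session_state['responses'] holds one more entry than
    st.session_state['requests'] (an initial bot greeting at index 0).
    """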
    conversation_string = ""
    # Pair requests[i] with responses[i + 1], skipping the initial greeting.
    for i in range(len(st.session_state['responses']) - 1):
        conversation_string += "Human: " + st.session_state['requests'][i] + "\n"
        conversation_string += "Bot: " + st.session_state['responses'][i + 1] + "\n"
    return conversation_string
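

# ---------------------------------------------------------------------------
# Minimal sketch of how these helpers chain together in the Streamlit app
# (commented out so nothing runs at import time). The secrets names and the
# index name below are assumptions, not part of this module:
#
#     uploaded = st.file_uploader("Upload a PDF", type="pdf")
#     if uploaded is not None:
#         ingest(
#             uploaded,
#             pinecone_api_key=st.secrets["PINECONE_API_KEY"],
#             pinecone_env=st.secrets["PINECONE_ENV"],
#             pinecone_index_namespace="chatpdf-index",
#         )
#
#     conversation = get_conversation_string()
#     refined = query_refiner(conversation, "What is the main contribution?")
#     context = find_match(
#         refined,
#         st.secrets["PINECONE_API_KEY"],
#         st.secrets["PINECONE_ENV"],
#         "chatpdf-index",
#     )
# ---------------------------------------------------------------------------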