Spaces:
Sleeping
Sleeping
| from langchain.document_loaders import DirectoryLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings import SentenceTransformerEmbeddings | |
| from sentence_transformers import SentenceTransformer | |
| import pinecone | |
| from langchain.vectorstores import Pinecone | |
| from langchain.document_loaders import PyPDFLoader | |
| import tempfile | |
| import streamlit as st | |
| import openai | |
| # To create embeddings on hard disk | |
| def get_embeddings_model(): | |
| model = SentenceTransformer('all-MiniLM-L6-v2') | |
| embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2") | |
| return model, embeddings | |
| model, embeddings = get_embeddings_model() | |
| def ingest( | |
| uploaded_document, | |
| pinecone_api_key, | |
| pinecone_env, | |
| pinecone_index_namespace, | |
| chunk_size=500, | |
| chunk_overlap=20 | |
| ): | |
| with tempfile.NamedTemporaryFile(delete=False) as tf: | |
| tf.write(uploaded_document.getbuffer()) | |
| file_path = tf.name | |
| loader = PyPDFLoader(file_path) | |
| documents = loader.load() | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) | |
| docs = text_splitter.split_documents(documents) | |
| # embeddings = get_embeddings_model() | |
| pinecone.init( | |
| api_key=pinecone_api_key, | |
| environment=pinecone_env | |
| ) | |
| index_name = pinecone_index_namespace | |
| try: | |
| index = Pinecone.from_documents(docs, embeddings, index_name=index_name) | |
| st.success('Document uploaded to Pinecone database successfully') | |
| except Exception as error_message: | |
| st.error(error_message) | |
| # # To create embeddings on hard disk | |
| # # !pip install chromadb | |
| # # from langchain.vectorstores import Chroma | |
| # # persist_directory = './data/embeddings' | |
| # # vStore = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory) | |
| def query_refiner(conversation, query): | |
| response = openai.Completion.create( | |
| model="text-davinci-003", | |
| prompt=f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation}\n\nQuery: {query}\n\nRefined Query:", | |
| temperature=0.7, | |
| max_tokens=256, | |
| top_p=1, | |
| frequency_penalty=0, | |
| presence_penalty=0 | |
| ) | |
| return response['choices'][0]['text'] | |
| def find_match(input, pinecone_api_key, pinecone_env, pinecone_index_namespace): | |
| pinecone.init( | |
| api_key=pinecone_api_key, | |
| environment=pinecone_env | |
| ) | |
| index = pinecone.Index(pinecone_index_namespace) | |
| input_em = model.encode(input).tolist() | |
| result = index.query(input_em, top_k=2, includeMetadata=True) | |
| return result['matches'][0]['metadata']['text']+"\n"+result['matches'][1]['metadata']['text'] | |
| def get_conversation_string(): | |
| conversation_string = "" | |
| for i in range(len(st.session_state['responses'])-1): | |
| conversation_string += "Human: "+st.session_state['requests'][i] + "\n" | |
| conversation_string += "Bot: "+ st.session_state['responses'][i+1] + "\n" | |
| return conversation_string |