# Import libraries
import os

from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, UnstructuredWordDocumentLoader
from langchain_community.vectorstores import Pinecone
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import pinecone

load_dotenv()


## Let's read the documents
def read_doc(directory):
    loader = DirectoryLoader(
        directory,
        glob="**/*.docx",  # match .docx files anywhere under the directory
        loader_cls=UnstructuredWordDocumentLoader,
    )
    documents = loader.load()
    return documents


doc = read_doc("documents/")
print(f"Loaded {len(doc)} documents")


## Split the documents into overlapping chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    # Split the documents while keeping each chunk's source metadata
    chunks = text_splitter.split_documents(docs)

    # Print information about the chunks
    print(f"Split {len(docs)} documents into {len(chunks)} chunks")
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: Source: {chunk.metadata['source']}, Length: {len(chunk.page_content)} chars")

    return chunks  # return the chunks instead of the original docs


documents = chunk_data(docs=doc)
print(len(documents))


## OpenAI embedding technique
embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

vectors = embeddings.embed_query("How are you?")
print(len(vectors))


## Vector search DB in Pinecone
# Read the Pinecone API key from the environment (e.g. the .env file loaded above)
# instead of hard-coding it in the source.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "advrag"

# Embed the chunks and upsert them into the Pinecone index; this returns a
# LangChain vector store wrapper around the index.
index = Pinecone.from_documents(
    documents,
    embeddings,
    index_name=index_name,
)


## Cosine similarity: retrieve results from the vector DB
def retrieve_query(query, k=2):
    matching_results = index.similarity_search(query, k=k)
    return matching_results


def initialize_qa_chain():
    llm = ChatOpenAI(
        model="gpt-4",
        temperature=0.5,
    )

    prompt_template = """
System: You are a helpful AI assistant that provides accurate and concise answers based on the given context.
Always cite the specific source document when providing information.

Context: {context}

Question: {question}

Please provide a clear and direct answer based on the context above.
If the information isn't available in the context, say so.
"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )

    chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)
    return chain


qa_chain = None


def retrieve_answers(query, k=2):
    global qa_chain
    if qa_chain is None:
        qa_chain = initialize_qa_chain()

    try:
        # Get the relevant documents
        matching_docs = retrieve_query(query, k=k)

        # Build the chain input
        chain_input = {
            "input_documents": matching_docs,
            "question": query,
        }

        # Use invoke instead of __call__
        result = qa_chain.invoke(chain_input)
        return result["output_text"]
    except Exception as e:
        return f"Error processing query: {str(e)}"


# Test the function
our_query = "Identify the homework items that the client agreed to complete in each of the two coaching sessions."
answer = retrieve_answers(our_query)
print("\nAnswer:", answer)
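

## Optional: the same retrieve-and-answer flow expressed with RetrievalQA.
## This is a minimal sketch, assuming the `index` vector store and the
## OPENAI_API_KEY / PINECONE_API_KEY environment variables set up above;
## the retriever performs the top-k similarity search internally, so
## retrieve_query is not needed in this variant.
from langchain.chains import RetrievalQA

retrieval_qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4", temperature=0.5),
    chain_type="stuff",
    retriever=index.as_retriever(search_kwargs={"k": 2}),  # top-2 chunks, matching retrieve_query's default
)
print(retrieval_qa.invoke({"query": our_query})["result"])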