Spaces:
Sleeping
Sleeping
File size: 3,925 Bytes
57f5dc8 b9479ec 57f5dc8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# import Libraries
import openai
import langchain
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
load_dotenv()
## Let's read the documents
def read_doc(directory):
    """Load every Word document found under ``directory``.

    Args:
        directory: Path of the folder to scan. The ``**/*.docx`` glob also
            matches files in nested sub-folders.

    Returns:
        The documents returned by ``DirectoryLoader.load()``, each parsed
        with ``UnstructuredWordDocumentLoader``.
    """
    return DirectoryLoader(
        directory,
        glob="**/*.docx",  # only .docx files are picked up
        loader_cls=UnstructuredWordDocumentLoader,
    ).load()
# os is used below to read the OpenAI API key from the environment.
import os
# Load the Word documents from the local "documents/" folder. This runs as
# soon as the script is executed (or imported).
doc = read_doc('documents/')
print(f"Loaded {len(doc)} documents")
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into smaller chunks and log a summary of the result.

    Args:
        docs: Documents to split (for example the output of ``read_doc``).
        chunk_size: ``chunk_size`` passed to the splitter; lengths are
            measured with ``len``, i.e. in characters.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_documents(docs)

    print(f"Split {len(docs)} documents into {len(chunks)} chunks")
    for i, chunk in enumerate(chunks):
        # A document is not guaranteed to carry a "source" entry in its
        # metadata; fall back to a placeholder so that this purely diagnostic
        # print can never abort the pipeline with a KeyError.
        source = chunk.metadata.get('source', 'unknown')
        print(f"Chunk {i}: Source: {source}, Length: {len(chunk.page_content)} chars")
    return chunks
# Split the loaded documents using chunk_data's default size and overlap.
documents=chunk_data(docs=doc)
# NOTE(review): bare expression - its value is discarded when this file runs
# as a script (looks like a leftover notebook cell output).
len(documents)
## Embedding technique of OpenAI
# The key is read from the environment; a missing OPENAI_API_KEY raises
# KeyError here.
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
# NOTE(review): bare expression, no effect outside an interactive session.
embeddings
# Embed a sample sentence. NOTE(review): presumably a sanity check of the
# embedding dimension; nothing else in this file reads `vectors` - confirm
# whether this call is still wanted.
vectors=embeddings.embed_query("How are you?")
# NOTE(review): bare expression, the length is computed and then discarded.
len(vectors)
## Vector Search DB In Pinecone
import os

import pinecone

# Security: the Pinecone API key is read from the PINECONE_API_KEY environment
# variable (the same mechanism used for OPENAI_API_KEY) instead of being
# hard-coded. A secret embedded in source is exposed to everyone who can read
# the file; the key that used to be written here should be treated as leaked
# and rotated. A missing variable raises KeyError immediately.
pc = pinecone.Pinecone(api_key=os.environ['PINECONE_API_KEY'])

index_name = "advrag"

# Build the LangChain vector store from the chunks and the embedding model,
# targeting the index named above. `index` is what retrieve_query searches.
# NOTE(review): `pc` is not passed to from_documents; presumably the wrapper
# picks up its credentials from the environment as well - confirm.
index = Pinecone.from_documents(
    documents,
    embeddings,
    index_name=index_name,
)
## Cosine Similarity Retrieve Results from VectorDB
def retrieve_query(query, k=2):
    """Run a similarity search for ``query`` against the module-level ``index``.

    Args:
        query: The text to search for.
        k: Number of results requested from the vector store.

    Returns:
        Whatever ``index.similarity_search`` returns for the query.
    """
    return index.similarity_search(query, k=k)
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
def initialize_qa_chain():
    """Build the question-answering chain.

    The chain is created with ``load_qa_chain`` using ``chain_type="stuff"``,
    a GPT-4 chat model at temperature 0.5, and a prompt that takes the
    ``context`` and ``question`` variables.

    Returns:
        The chain object returned by ``load_qa_chain``.
    """
    chat_model = ChatOpenAI(model="gpt-4", temperature=0.5)

    # The template text is runtime data: keep it exactly as written.
    qa_prompt = PromptTemplate(
        template="""
System: You are a helpful AI assistant that provides accurate and concise answers based on the given context. Always cite the specific source document when providing information.
Context: {context}
Question: {question}
Please provide a clear and direct answer based on the context above. If the information isn't available in the context, say so.
""",
        input_variables=["context", "question"],
    )

    return load_qa_chain(chat_model, chain_type="stuff", prompt=qa_prompt)
# Module-level cache for the QA chain; populated on first use.
qa_chain = None


def retrieve_answers(query, k=2):
    """Answer ``query`` from the documents retrieved for it.

    The QA chain is created on the first call and stored in the module-level
    ``qa_chain`` so later calls reuse it.

    Args:
        query: The question to answer.
        k: Number of documents to retrieve as context.

    Returns:
        The chain's ``output_text``, or a string starting with
        ``"Error processing query: "`` if retrieval or the chain call raises.
    """
    global qa_chain
    if qa_chain is None:
        # Chain construction happens outside the try block, so a failure
        # here propagates to the caller instead of becoming an error string.
        qa_chain = initialize_qa_chain()

    try:
        context_docs = retrieve_query(query, k=k)
        response = qa_chain.invoke(
            {"input_documents": context_docs, "question": query}
        )
        return response['output_text']
    except Exception as err:
        return f"Error processing query: {str(err)}"
# Test the function: send one sample question through retrieve_answers and
# print whatever it returns (the answer text or an error string).
our_query = "Identify the homework items that the client agreed to complete in each of the two coaching sessions."
answer = retrieve_answers(our_query)
print("\nAnswer:", answer)
|