Spaces:
Sleeping
Sleeping
File size: 6,661 Bytes
f95d6c7 cace76b f95d6c7 bb6d187 cace76b bb6d187 cace76b bb6d187 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# --- Standard library -------------------------------------------------------
import os
import time

# Silence noisy deprecation warnings from the LangChain stack.
import warnings
warnings.filterwarnings("ignore")

# --- Third-party: LangChain (loading / splitting / embedding / chains) ------
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.vectorstores import Pinecone as PineconeVectorStore

# --- Third-party: Pinecone client -------------------------------------------
# Imported AFTER langchain's vectorstore so that the name ``Pinecone`` refers
# to the *client* class (the original import order left it shadowed).
from pinecone import Pinecone, ServerlessSpec

# --- Third-party: UI / config -----------------------------------------------
# ``st`` and ``load_dotenv`` are used throughout this file but were never
# imported; without these lines the script fails with NameError at runtime.
import streamlit as st
from dotenv import load_dotenv

# --- Local ------------------------------------------------------------------
import keyfile
# Prompt for the RetrievalQA chain. ``{context}`` is filled with the
# retrieved document chunks and ``{question}`` with the user's query.
template = """
You are a MLOPs engineer. The user will ask you a question about Machine Learning Operations.
Use the following piece of context to answer the question.
If you don't know the answer, just say don't know.
Keep the answer brief
Context: {context}
Question: {question}
Answer:
"""
def setup_retrieval_qa_system(doc_directory, question, chunk_size=500, chunk_overlap=100):
    """Index every PDF in *doc_directory* into Pinecone and answer *question*.

    Builds (or reuses) a serverless Pinecone index, embeds the split
    document chunks, wires a HuggingFace LLM into a ``RetrievalQA``
    chain using the module-level ``template`` prompt, and runs the query.

    Args:
        doc_directory (str): Directory containing the PDF files to index.
        question (str): The user's question.
        chunk_size (int): Character count per split chunk.
        chunk_overlap (int): Overlapping characters between adjacent chunks.

    Returns:
        str: The answer text produced by the chain.

    Raises:
        ValueError: If either API key is missing from ``keyfile``.
    """
    # Local import so we always get the Pinecone *client* class even if a
    # module-level ``from langchain.vectorstores import Pinecone`` shadows it.
    from pinecone import Pinecone as PineconeClient

    hugging_face = keyfile.Hugging_face_key
    if not hugging_face:
        raise ValueError("HuggingFace API key is missing. Please set it in the .env file.")
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = hugging_face

    pinecone_key = keyfile.PCToken
    if not pinecone_key:
        raise ValueError("pc API key is missing. Please set it in the .env file.")
    os.environ['PCToken'] = pinecone_key

    # BUG FIX: the original assigned the raw key *string* to ``pc`` and then
    # called ``pc.list_indexes()`` on it. Construct a real client instead.
    pc = PineconeClient(api_key=pinecone_key)

    # Serverless placement; overridable via environment, defaulting to AWS us-east-1.
    cloud = os.environ.get("PINECONE_CLOUD") or "aws"
    region = os.environ.get("PINECONE_REGION") or "us-east-1"
    serv = ServerlessSpec(cloud=cloud, region=region)

    # Pinecone index names must be lowercase alphanumeric characters or '-'
    # (the original "Bhagya-27thoct" would be rejected).
    index_name = "bhagya-27thoct"
    index_is_new = index_name not in pc.list_indexes().names()
    if index_is_new:
        # 768 matches the all-mpnet-base-v2 embedding dimension used below.
        pc.create_index(
            name=index_name,
            dimension=768,
            metric="cosine",
            spec=serv,
        )
        # Block until the freshly created index is ready to accept writes.
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)

    # Sanity check: show current index stats before inserting anything.
    print("Index before inserting")
    print(pc.Index(index_name).describe_index_stats())

    # Load every PDF in the directory into a single document list.
    all_docs = []
    with st.spinner('Loading and processing documents...'):
        for file_name in os.listdir(doc_directory):
            file_path = os.path.join(doc_directory, file_name)
            all_docs.extend(PyPDFLoader(file_path).load())

    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    splitted_chunks = text_splitter.split_documents(all_docs)

    # 768-dimensional sentence-transformer — must agree with the index
    # dimension above. (The original passed a *chat* model id to an
    # embeddings class, which cannot work.)
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    # BUG FIX: index the *split chunks* of every file (the original indexed
    # only the last file's unsplit ``docs`` with an undefined ``embeddings``).
    if index_is_new:
        docsearch = PineconeVectorStore.from_documents(
            splitted_chunks, embedding_model, index_name=index_name
        )
    else:
        docsearch = PineconeVectorStore.from_existing_index(index_name, embedding_model)

    # The original built this LLM twice with an undefined ``model_id``;
    # build it once with a concrete repo id.
    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.8, "top_k": 50},
        huggingfacehub_api_token=hugging_face,
    )

    # Feed the module-level prompt into the chain (the original constructed
    # ``prompt``/``rag_chain`` but never used them).
    prompt = PromptTemplate(
        template=template,
        input_variables=["context", "question"],
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        chain_type_kwargs={"prompt": prompt},
    )

    # BUG FIX: the original ran ``qa_chain.run(query)`` with ``query``
    # undefined, then indexed the string result like a dict.
    with st.spinner('Finding the best answer...'):
        result = qa_chain({"query": question})
    return result['result']
def main():
    """Streamlit entry point: gather documents and a question, display the answer."""
    st.title("📝 Document-Based Question Answering System with Groq")
    st.sidebar.header("Configuration")

    # Documents come either from uploaded PDFs or from a directory path.
    uploaded_files = st.sidebar.file_uploader(
        "Upload PDF documents", type="pdf", accept_multiple_files=True
    )
    doc_directory = st.text_input("Or enter the document directory path directly:", "")

    # Splitter configuration and the question itself.
    chunk_size = st.sidebar.slider("Set chunk size", 100, 1000, 500)
    chunk_overlap = st.sidebar.slider("Set chunk overlap", 0, 200, 100)
    question = st.text_input("Enter your question:")

    # Nothing to do until the user asks for an answer.
    if not st.button("Get Answer"):
        return

    if uploaded_files:
        # Persist the uploads to disk so the QA pipeline can read them back.
        doc_directory = "/tmp/streamlit_uploaded_docs"
        os.makedirs(doc_directory, exist_ok=True)
        for upload in uploaded_files:
            destination = os.path.join(doc_directory, upload.name)
            with open(destination, "wb") as handle:
                handle.write(upload.getbuffer())
    elif not doc_directory:
        st.warning("Please upload PDF files or provide a document directory.")
        return

    if not question:
        st.warning("Please provide a question.")
        return

    try:
        answer = setup_retrieval_qa_system(doc_directory, question, chunk_size, chunk_overlap)
        st.success("Answer found!")
        st.write(f"**Answer:** {answer}")
    except Exception as exc:
        st.error(f"An error occurred: {exc}")
# Run the Streamlit app when executed directly. (A stray trailing "|"
# scrape artifact after main() made the whole file a SyntaxError.)
if __name__ == "__main__":
    main()