|
|
import gradio as gr |
|
|
import os |
|
|
import hashlib |
|
|
import pickle |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
from langchain_community.vectorstores import FAISS |
|
|
from langchain.chains import RetrievalQA |
|
|
from langchain_groq import ChatGroq |
|
|
|
|
|
|
|
|
# Groq API key read from the environment; will be None if the variable is unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
|
|
|
|
|
|
|
# Directory where built FAISS indexes are cached on disk, keyed by PDF hash.
CACHE_DIR = "vector_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
|
|
|
|
|
|
def get_pdf_hash(pdf_path: str) -> str:
    """Return the MD5 hex digest of the file at *pdf_path*.

    Used purely as a cache key (not for security), so MD5 is acceptable.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        32-character lowercase hex digest of the file contents.
    """
    hasher = hashlib.md5()
    with open(pdf_path, "rb") as f:
        # Hash in fixed-size chunks so large PDFs are never loaded into
        # memory all at once (the original read the whole file into RAM).
        for chunk in iter(lambda: f.read(65536), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
|
|
|
|
|
|
|
|
def build_vectorstore(pdf_path: str):
    """Build a FAISS vector index over the contents of a PDF.

    Loads the PDF page by page, splits the pages into overlapping text
    chunks, embeds each chunk with a sentence-transformers model, and
    returns the resulting FAISS store.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        A FAISS vectorstore built from the chunked document.
    """
    pages = PyPDFLoader(pdf_path).load()

    # Overlapping chunks preserve context that would otherwise be lost at
    # chunk boundaries.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    docs = splitter.split_documents(pages)

    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(docs, embedder)
|
|
|
|
|
|
|
|
def get_vectorstore(pdf_path: str):
    """Return a FAISS vectorstore for *pdf_path*, using an on-disk cache.

    The cache key is the MD5 hash of the PDF's bytes, so re-uploading the
    same document skips the expensive embedding step.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        A FAISS vectorstore for the document (cached or freshly built).
    """
    pdf_hash = get_pdf_hash(pdf_path)
    cache_file = os.path.join(CACHE_DIR, f"{pdf_hash}.pkl")

    if os.path.exists(cache_file):
        try:
            # NOTE(security): pickle.load can execute arbitrary code if the
            # cache file is tampered with; acceptable only because CACHE_DIR
            # is written exclusively by this process.
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        except Exception:
            # Corrupt or version-incompatible cache (e.g. after a library
            # upgrade) — discard it and rebuild below instead of crashing.
            try:
                os.remove(cache_file)
            except OSError:
                pass

    vectorstore = build_vectorstore(pdf_path)
    # Write to a temp file and rename so a crash mid-dump never leaves a
    # truncated cache file behind (os.replace is atomic on POSIX/Windows).
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, "wb") as f:
        pickle.dump(vectorstore, f)
    os.replace(tmp_file, cache_file)
    return vectorstore
|
|
|
|
|
|
|
|
def rag_bot(question: str, pdf_path: str):
    """Answer *question* using retrieval-augmented generation over a PDF.

    Args:
        question: The user's question.
        pdf_path: Filesystem path of the uploaded PDF (may be None/empty
            if nothing has been uploaded yet).

    Returns:
        The model's answer as a string, or a human-readable warning/error
        message (never raises — errors are returned as text for the UI).
    """
    # Guard clauses: fail fast with a friendly message instead of an
    # opaque exception from deep inside the chain.
    if not pdf_path:
        return "β οΈ Please upload a PDF first."
    if not question or not question.strip():
        return "β οΈ Please enter a question."
    if not GROQ_API_KEY:
        return "β οΈ GROQ_API_KEY environment variable is not set."

    try:
        vectorstore = get_vectorstore(pdf_path)
        # Retrieve the 3 most similar chunks as context for the LLM.
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

        llm = ChatGroq(
            groq_api_key=GROQ_API_KEY,
            model_name="llama-3.3-70b-versatile",
        )

        # "stuff" chain: concatenate all retrieved chunks into one prompt.
        qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
        )
        result = qa.run(question)
        return result
    except Exception as e:
        # Surface any failure (network, auth, parsing) as text in the UI.
        return f"β Error: {e}"
|
|
|
|
|
|
|
|
|
|
|
# UI layout: one column of rows — PDF upload, question box, read-only answer
# box — plus a submit button wired to the RAG pipeline.
with gr.Blocks() as demo:
    gr.Markdown("## π RAG Q&A Bot β Powered by Groq + HuggingFace Embeddings")

    with gr.Row():
        uploaded_pdf = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
    with gr.Row():
        user_question = gr.Textbox(label="Ask a Question")
    with gr.Row():
        answer_box = gr.Textbox(label="Answer", interactive=False)

    ask_button = gr.Button("Submit")
    # Clicking Submit runs rag_bot(question, pdf_path) and shows the result.
    ask_button.click(fn=rag_bot, inputs=[user_question, uploaded_pdf], outputs=answer_box)
|
|
|
|
|
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 so the app is reachable from
    # outside a container or remote host, not just localhost.
    demo.launch(server_name="0.0.0.0", server_port=7860)