File size: 3,178 Bytes
1aba22b
 
8ab7a24
1aba22b
 
 
f56d8ed
1aba22b
 
f56d8ed
1aba22b
f56d8ed
 
1aba22b
8ab7a24
 
 
 
 
 
 
 
 
 
 
 
 
1aba22b
 
 
 
f56d8ed
1aba22b
 
 
 
 
 
 
f56d8ed
1aba22b
 
 
 
8ab7a24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1aba22b
 
8ab7a24
 
1aba22b
 
f56d8ed
 
 
5b5526d
f56d8ed
 
1aba22b
f56d8ed
1aba22b
 
 
 
 
 
8ab7a24
1aba22b
8ab7a24
 
1aba22b
f56d8ed
8ab7a24
1aba22b
8ab7a24
1aba22b
 
8ab7a24
 
1aba22b
 
 
 
 
5b5526d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import gradio as gr
import os
import hashlib
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

# Load Groq API key once at import time; rag_bot validates it per call.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Directory to cache vectorstores
# One pickle per unique PDF (keyed by content hash) is stored here.
CACHE_DIR = "vector_cache"
os.makedirs(CACHE_DIR, exist_ok=True)


def get_pdf_hash(pdf_path: str) -> str:
    """Return the MD5 hex digest of the file at *pdf_path*.

    The digest is used only as a cache key (not for security), so MD5's
    speed is fine. The file is read in fixed-size chunks so that large
    PDFs do not need to fit in memory at once.
    """
    digest = hashlib.md5()
    with open(pdf_path, "rb") as f:
        # 64 KiB chunks keep memory usage flat regardless of file size.
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()


def build_vectorstore(pdf_path: str):
    """Parse a PDF into chunks, embed them, and return a FAISS index.

    Pages are split into overlapping ~1000-character chunks so retrieval
    returns passages small enough for the LLM context window, while the
    200-character overlap avoids cutting ideas mid-sentence.
    """
    pages = PyPDFLoader(pdf_path).load()

    # Chunking strategy: prefer paragraph breaks, then lines, then words.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    docs = splitter.split_documents(pages)

    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(docs, embedder)


def get_vectorstore(pdf_path: str):
    """Return a FAISS vectorstore for *pdf_path*, using a disk cache.

    The cache key is the MD5 hash of the PDF contents, so re-uploading the
    same file skips the expensive embedding step. A corrupt or unreadable
    cache entry is silently discarded and rebuilt instead of crashing.

    NOTE(review): the cache uses pickle, which executes arbitrary code on
    load — only safe while CACHE_DIR is written exclusively by this app.
    FAISS.save_local/load_local would be the safer persistence format.
    """
    pdf_hash = get_pdf_hash(pdf_path)
    cache_file = os.path.join(CACHE_DIR, f"{pdf_hash}.pkl")

    if os.path.exists(cache_file):
        try:
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        except (pickle.UnpicklingError, EOFError, AttributeError, OSError):
            # Stale, truncated, or version-incompatible cache entry:
            # fall through and rebuild it from the PDF.
            pass

    # Build and cache
    vectorstore = build_vectorstore(pdf_path)
    with open(cache_file, "wb") as f:
        pickle.dump(vectorstore, f)
    return vectorstore


def rag_bot(question: str, pdf_path: str):
    """Answer *question* using retrieval over the uploaded PDF.

    Always returns a plain string (the answer, or a user-facing
    warning/error message) so it can be wired directly to a Gradio
    output textbox.
    """
    if not pdf_path:
        return "⚠️ Please upload a PDF first."
    if not question or not question.strip():
        return "⚠️ Please enter a question."
    if not GROQ_API_KEY:
        # Fail with a clear message instead of a cryptic downstream
        # authentication error from the Groq client.
        return "❌ Error: GROQ_API_KEY environment variable is not set."

    try:
        # Load or build vectorstore for this exact PDF (content-hash cached).
        vectorstore = get_vectorstore(pdf_path)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})  # top-3 chunks

        # Use Groq LLM
        llm = ChatGroq(
            groq_api_key=GROQ_API_KEY,
            model_name="llama-3.3-70b-versatile",  # The updated model name
        )

        # "stuff" chain: concatenates the retrieved chunks into one prompt.
        qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
        )
        # Chain.invoke replaces the deprecated Chain.run API.
        result = qa.invoke({"query": question})
        return result["result"]
    except Exception as e:
        # Top-level UI boundary: surface the error to the user rather
        # than crashing the Gradio handler.
        return f"❌ Error: {e}"


# ------------------ Gradio UI ------------------
# Single-page layout: PDF upload, question box, read-only answer box.
with gr.Blocks() as demo:
    gr.Markdown("## 📖 RAG Q&A Bot – Powered by Groq + HuggingFace Embeddings")

    with gr.Row():
        # type="filepath" hands rag_bot a path string rather than file bytes.
        pdf_file = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
    with gr.Row():
        question = gr.Textbox(label="Ask a Question")
    with gr.Row():
        # Read-only: populated only by the submit handler.
        answer = gr.Textbox(label="Answer", interactive=False)

    submit = gr.Button("Submit")
    # Wire the button: (question text, pdf path) -> answer string.
    submit.click(fn=rag_bot, inputs=[question, pdf_file], outputs=answer)

if __name__ == "__main__":
    # 0.0.0.0 binds all interfaces (needed inside containers / Spaces);
    # 7860 is Gradio's conventional default port.
    demo.launch(server_name="0.0.0.0", server_port=7860)