Spaces:

imeesam
/

ragchatbot

Sleeping

App Files Files Community

imeesam committed on Apr 19

Commit

cbcf4a5

verified ·

1 Parent(s): ba45220

Create app.py

Browse files

Files changed (1) hide show

app.py +170 -0

app.py ADDED Viewed

	@@ -0,0 +1,170 @@

+# app.py — PDF upload version
+import os
+import gradio as gr
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_groq import ChatGroq
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema.runnable import RunnablePassthrough
+from langchain.schema.output_parser import StrOutputParser
+# ── Config ─────────────────────────────────────────────────────────────────────
+EMBED_MODEL   = "sentence-transformers/all-MiniLM-L6-v2"
+GROQ_MODEL    = "llama-3.1-8b-instant"
+CHUNK_SIZE    = 800    # larger chunks work better for dense PDFs
+CHUNK_OVERLAP = 100
+TOP_K         = 4
+RAG_PROMPT = ChatPromptTemplate.from_template("""
+You are a helpful assistant. Answer the question using ONLY the context below.
+If the answer is not in the context, say "I don't have enough information."
+Context:
+{context}
+Question: {question}
+Answer:
+""")
+# ── Load embedding model once at startup (slow, ~30s) ─────────────────────────
+print("Loading embedding model...")
+embeddings = HuggingFaceEmbeddings(
+    model_name=EMBED_MODEL,
+    model_kwargs={"device": "cpu"},
+    encode_kwargs={"normalize_embeddings": True}
+)
+print("Embeddings ready.")
+# Global state — replaced whenever new PDFs are uploaded
+rag_chain   = None
+vectorstore = None
+# ── Core logic ─────────────────────────────────────────────────────────────────
+def process_pdfs(pdf_files):
+    """
+    Called when user clicks 'Process PDFs'.
+    pdf_files: list of temp file paths Gradio provides.
+    Returns a status message.
+    """
+    global rag_chain, vectorstore
+    if not pdf_files:
+        return "No files uploaded. Please upload at least one PDF."
+    all_chunks = []
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP,
+        separators=["\n\n", "\n", ".", " ", ""]
+    )
+    for pdf_file in pdf_files:
+        try:
+            # pdf_file is a temp path string like /tmp/gradio/abc123/file.pdf
+            loader = PyPDFLoader(pdf_file)
+            pages  = loader.load()          # one Document per page
+            # Add filename to metadata for traceability
+            filename = os.path.basename(pdf_file)
+            for page in pages:
+                page.metadata["source"] = filename
+            chunks = splitter.split_documents(pages)
+            all_chunks.extend(chunks)
+            print(f"Loaded {filename}: {len(pages)} pages → {len(chunks)} chunks")
+        except Exception as e:
+            return f"Error loading {pdf_file}: {str(e)}"
+    if not all_chunks:
+        return "No text could be extracted. Check if the PDFs contain selectable text (not scanned images)."
+    # Build FAISS index from all chunks
+    print(f"Indexing {len(all_chunks)} chunks...")
+    vectorstore = FAISS.from_documents(all_chunks, embeddings)
+    retriever   = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
+    # Build LLM
+    llm = ChatGroq(
+        model=GROQ_MODEL,
+        temperature=0.2,
+        max_tokens=1024,
+        groq_api_key=os.environ["GROQ_API_KEY"]
+    )
+    def format_docs(docs):
+        # Include source filename in context so the LLM knows where info came from
+        return "\n\n".join(
+            f"[Source: {d.metadata.get('source', 'unknown')}, "
+            f"Page {d.metadata.get('page', '?')+1}]\n{d.page_content}"
+            for d in docs
+        )
+    rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | RAG_PROMPT
+        | llm
+        | StrOutputParser()
+    )
+    total_pages = sum(
+        len(PyPDFLoader(f).load()) for f in pdf_files
+    )
+    return (
+        f"Ready! Indexed {len(pdf_files)} PDF(s), "
+        f"{total_pages} pages, "
+        f"{len(all_chunks)} chunks. You can now ask questions."
+    )
+def chat(message, history):
+    if rag_chain is None:
+        return "", history + [[message, "Please upload and process PDFs first."]]
+    if not message.strip():
+        return "", history
+    try:
+        response = rag_chain.invoke(message)
+    except Exception as e:
+        response = f"Error: {str(e)}"
+    history.append([message, response])
+    return "", history
+# ── Gradio UI ──────────────────────────────────────────────────────────────────
+with gr.Blocks(title="PDF RAG Chatbot", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## PDF RAG Chatbot\nUpload your PDFs, then ask questions about them.")
+    with gr.Row():
+        with gr.Column(scale=1):
+            pdf_input = gr.File(
+                label="Upload PDFs",
+                file_types=[".pdf"],
+                file_count="multiple"   # allow multiple files at once
+            )
+            process_btn = gr.Button("Process PDFs", variant="primary")
+            status_box  = gr.Textbox(
+                label="Status",
+                interactive=False,
+                placeholder="Upload PDFs and click Process..."
+            )
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(height=450, label="Chat")
+            msg     = gr.Textbox(placeholder="Ask a question about your PDFs...", label="Question")
+            clear   = gr.Button("Clear chat")
+    # Wire up events
+    process_btn.click(
+        fn=process_pdfs,
+        inputs=[pdf_input],
+        outputs=[status_box]
+    )
+    msg.submit(chat, [msg, chatbot], [msg, chatbot])
+    clear.click(lambda: [], outputs=[chatbot])
+if __name__ == "__main__":
+    demo.launch()