Spaces:

simran40
/

RAG-CHATBOT

Sleeping

App Files Community

simran40 committed 18 days ago

Commit

a74d897

verified ·

1 Parent(s): fcd815e

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -43

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-import fitz
 import re
 import faiss
 import numpy as np
@@ -9,17 +9,17 @@ from transformers import pipeline
 # =================================================
-# MODELS
 # =================================================
-# Embedding model (for retrieval)
 embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
-# BART summarization model (used as answer generator)
-bart = pipeline(
-    "summarization",
-    model="facebook/bart-large-cnn",
-    tokenizer="facebook/bart-large-cnn"
 )
@@ -42,7 +42,7 @@ def clean_text(text):
     return text.strip()
-def chunk_text(text, chunk_size=400, overlap=80):
     chunks = []
     start = 0
     while start < len(text):
@@ -53,57 +53,63 @@ def chunk_text(text, chunk_size=400, overlap=80):
 # =================================================
-# VECTOR SEARCH
 # =================================================
 def build_faiss_index(chunks):
     embeddings = embedding_model.encode(chunks)
     embeddings = np.array(embeddings).astype("float32")
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(embeddings)
     return index, chunks
-def retrieve_chunks(question, index, chunks, top_k=3):
-    q_emb = embedding_model.encode([question]).astype("float32")
-    _, indices = index.search(q_emb, top_k)
-    return [chunks[i] for i in indices[0]]
 # =================================================
-# QUESTION–ANSWER USING BART
 # =================================================
 def generate_answer(question, context_chunks):
-    context = " ".join(context_chunks)
-    prompt = f"""
-Answer the following question using ONLY the given context.
-Context:
-{context}
-Question:
-{question}
-"""
-    result = bart(
-        prompt,
-        max_length=120,
-        min_length=30,
-        do_sample=False
-    )[0]["summary_text"]
-    return result
 # =================================================
 # MAIN PIPELINE
 # =================================================
-def pdf_qa(pdf_file, question):
     if pdf_file is None or question.strip() == "":
-        return "Please upload a PDF and ask a question."
     text = extract_text_from_pdf(pdf_file.name)
     text = clean_text(text)
@@ -111,7 +117,7 @@ def pdf_qa(pdf_file, question):
     chunks = chunk_text(text)
     index, chunks = build_faiss_index(chunks)
-    relevant_chunks = retrieve_chunks(question, index, chunks)
     answer = generate_answer(question, relevant_chunks)
     return answer
@@ -124,27 +130,41 @@ def pdf_qa(pdf_file, question):
 with gr.Blocks() as demo:
     gr.Markdown("""
-    # 📄 PDF Question Answering System (BART Based)
-    Upload a **PDF** and ask a **specific question**.
-    The system retrieves relevant content and generates a **focused answer**,
-    not a full summary.
     """)
     with gr.Row():
         with gr.Column(scale=1):
-            pdf_input = gr.File(label="📤 Upload PDF", file_types=[".pdf"])
             question_input = gr.Textbox(
                 label="❓ Ask your question",
-                placeholder="e.g. What is the objective of the project?",
                 lines=2
             )
-            btn = gr.Button("🔍 Get Answer")
         with gr.Column(scale=2):
-            output = gr.Textbox(label="📌 Answer", lines=8)
-    btn.click(pdf_qa, [pdf_input, question_input], output)
     gr.Markdown("""
     ---

 import gradio as gr
+import fitz  # PyMuPDF
 import re
 import faiss
 import numpy as np
 # =================================================
+# MODEL LOADING (ONCE)
 # =================================================
+# Embedding model for semantic search
 embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
+# Extractive Question Answering model (HIGH ACCURACY)
+qa_pipeline = pipeline(
+    "question-answering",
+    model="deepset/roberta-base-squad2",
+    tokenizer="deepset/roberta-base-squad2"
 )
     return text.strip()
+def chunk_text(text, chunk_size=350, overlap=80):
     chunks = []
     start = 0
     while start < len(text):
 # =================================================
+# VECTOR DATABASE (FAISS)
 # =================================================
 def build_faiss_index(chunks):
     embeddings = embedding_model.encode(chunks)
     embeddings = np.array(embeddings).astype("float32")
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(embeddings)
     return index, chunks
+def retrieve_relevant_chunks(question, index, chunks, top_k=5):
+    query_embedding = embedding_model.encode([question]).astype("float32")
+    distances, indices = index.search(query_embedding, top_k)
+    results = []
+    for i, idx in enumerate(indices[0]):
+        results.append((chunks[idx], distances[0][i]))
+    # sort by relevance
+    results.sort(key=lambda x: x[1])
+    return [r[0] for r in results]
 # =================================================
+# ANSWER GENERATION (ACCURATE)
 # =================================================
 def generate_answer(question, context_chunks):
+    best_answer = ""
+    best_score = 0.0
+    for chunk in context_chunks:
+        result = qa_pipeline(
+            question=question,
+            context=chunk
+        )
+        if result["score"] > best_score:
+            best_score = result["score"]
+            best_answer = result["answer"]
+    if best_score < 0.3 or best_answer.strip() == "":
+        return "Information not found in the document."
+    return best_answer
 # =================================================
 # MAIN PIPELINE
 # =================================================
+def pdf_qa_chat(pdf_file, question):
     if pdf_file is None or question.strip() == "":
+        return "Please upload a PDF and enter a valid question."
     text = extract_text_from_pdf(pdf_file.name)
     text = clean_text(text)
     chunks = chunk_text(text)
     index, chunks = build_faiss_index(chunks)
+    relevant_chunks = retrieve_relevant_chunks(question, index, chunks)
     answer = generate_answer(question, relevant_chunks)
     return answer
 with gr.Blocks() as demo:
     gr.Markdown("""
+    # 📄 PDF Question Answering System (High Accuracy)
+    Upload a **PDF document** and ask a **specific question**.
+    The system uses **semantic retrieval + extractive AI**, ensuring
+    **accurate answers directly from the document** (no hallucination).
+    ---
     """)
     with gr.Row():
         with gr.Column(scale=1):
+            pdf_input = gr.File(
+                label="📤 Upload PDF",
+                file_types=[".pdf"]
+            )
             question_input = gr.Textbox(
                 label="❓ Ask your question",
+                placeholder="e.g. Whose report is this?",
                 lines=2
             )
+            submit_btn = gr.Button("🔍 Get Answer")
         with gr.Column(scale=2):
+            answer_output = gr.Textbox(
+                label="📌 Answer",
+                lines=6
+            )
+    submit_btn.click(
+        fn=pdf_qa_chat,
+        inputs=[pdf_input, question_input],
+        outputs=answer_output
+    )
     gr.Markdown("""
     ---