Update app.py
app.py CHANGED
@@ -12,16 +12,23 @@ from transformers import pipeline
 # MODEL LOADING (ONCE)
 # =================================================
 
-# Embedding model for semantic
+# Embedding model for semantic retrieval
 embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
 
-# Extractive
+# Extractive QA model (accurate answers)
 qa_pipeline = pipeline(
     "question-answering",
     model="deepset/roberta-base-squad2",
     tokenizer="deepset/roberta-base-squad2"
 )
 
+# Summarization model (clean summary)
+summarizer = pipeline(
+    "summarization",
+    model="facebook/bart-large-cnn",
+    tokenizer="facebook/bart-large-cnn"
+)
+
 
 # =================================================
 # PDF PROCESSING

@@ -52,6 +59,16 @@ def chunk_text(text, chunk_size=350, overlap=80):
     return chunks
 
 
+def chunk_text_for_summary(text, chunk_size=900, overlap=100):
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = start + chunk_size
+        chunks.append(text[start:end])
+        start = end - overlap
+    return chunks
+
+
 # =================================================
 # VECTOR DATABASE (FAISS)
 # =================================================

@@ -59,10 +76,8 @@ def chunk_text(text, chunk_size=350, overlap=80):
 def build_faiss_index(chunks):
     embeddings = embedding_model.encode(chunks)
     embeddings = np.array(embeddings).astype("float32")
-
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(embeddings)
-
     return index, chunks
 
 

@@ -74,13 +89,12 @@ def retrieve_relevant_chunks(question, index, chunks, top_k=5):
     for i, idx in enumerate(indices[0]):
         results.append((chunks[idx], distances[0][i]))
 
-    # sort by relevance
     results.sort(key=lambda x: x[1])
     return [r[0] for r in results]
 
 
 # =================================================
-#
+# QUESTION ANSWERING (ACCURATE)
 # =================================================
 
 def generate_answer(question, context_chunks):

@@ -104,12 +118,32 @@ def generate_answer(question, context_chunks):
 
 
 # =================================================
-#
+# SUMMARIZATION
 # =================================================
 
-def pdf_qa_chat(pdf_file, question):
+def generate_summary(chunks):
+    summaries = []
+
+    for chunk in chunks:
+        summary = summarizer(
+            chunk,
+            max_length=150,
+            min_length=60,
+            do_sample=False
+        )[0]["summary_text"]
+
+        summaries.append(summary)
+
+    return " ".join(summaries)
+
+
+# =================================================
+# MAIN FUNCTIONS
+# =================================================
+
+def pdf_qa(pdf_file, question):
     if pdf_file is None or question.strip() == "":
-        return "Please upload a PDF and
+        return "Please upload a PDF and ask a question."
 
     text = extract_text_from_pdf(pdf_file.name)
     text = clean_text(text)

@@ -118,53 +152,54 @@ def pdf_qa_chat(pdf_file, question):
     index, chunks = build_faiss_index(chunks)
 
     relevant_chunks = retrieve_relevant_chunks(question, index, chunks)
-
+    return generate_answer(question, relevant_chunks)
+
 
-
+def pdf_summary(pdf_file):
+    if pdf_file is None:
+        return "Please upload a PDF document."
+
+    text = extract_text_from_pdf(pdf_file.name)
+    text = clean_text(text)
+
+    chunks = chunk_text_for_summary(text)
+    return generate_summary(chunks)
 
 
 # =================================================
-# GRADIO UI
+# GRADIO UI (QA + SUMMARY)
 # =================================================
 
 with gr.Blocks() as demo:
 
     gr.Markdown("""
-    # PDF Question Answering
+    # PDF Question Answering & Summarization System
 
-
-
-    **
+    This system supports **two functionalities**:
+    - **Ask Questions** (Accurate answers from PDF)
+    - **Generate Summary** (Concise document summary)
 
-
+    Built using **RAG architecture with open-source AI models**.
     """)
 
     with gr.Row():
         with gr.Column(scale=1):
-            pdf_input = gr.File(
-                label="Upload PDF",
-                file_types=[".pdf"]
-            )
+            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
 
             question_input = gr.Textbox(
-                label="Ask
+                label="Ask a question (for Q&A)",
                 placeholder="e.g. Whose report is this?",
                 lines=2
             )
 
-
+            qa_btn = gr.Button("Get Answer")
+            summary_btn = gr.Button("Generate Summary")
 
         with gr.Column(scale=2):
-            answer_output = gr.Textbox(
-                label="Answer",
-                lines=6
-            )
+            output_box = gr.Textbox(label="Output", lines=12)
 
-
-
-                inputs=[pdf_input, question_input],
-                outputs=answer_output
-            )
+            qa_btn.click(pdf_qa, [pdf_input, question_input], output_box)
+            summary_btn.click(pdf_summary, [pdf_input], output_box)
 
     gr.Markdown("""
     ---
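
For a quick local check of the two entry points added in this commit, a minimal sketch is shown below. It is an illustration, not part of the commit: it assumes it is appended to the end of app.py (so pdf_qa and pdf_summary are in scope), that "report.pdf" is a hypothetical local file, and that the small _Upload class only mimics the .name attribute of the file object Gradio passes to these functions.

# Illustrative only -- not part of this commit.
# Assumes it runs at the bottom of app.py, after pdf_qa / pdf_summary are defined,
# and that "report.pdf" is a hypothetical local file.
class _Upload:
    """Stand-in for the Gradio file object; only .name (the file path) is used."""
    def __init__(self, name):
        self.name = name

if __name__ == "__main__":
    doc = _Upload("report.pdf")
    print(pdf_qa(doc, "Whose report is this?"))  # extractive answer from retrieved chunks
    print(pdf_summary(doc))                      # joined BART summaries of the chunked text

Note that the first run will download the MiniLM, RoBERTa, and BART checkpoints, so it can take a while before any output appears.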