simran40 committed
Commit c0cb811 · verified · 1 Parent(s): 630d618

Update app.py

Files changed (1):
  1. app.py +115 -268
app.py CHANGED
@@ -1,324 +1,171 @@
  import gradio as gr
- import fitz # PyMuPDF
  import re
  import faiss
  import numpy as np
- import time
  from sentence_transformers import SentenceTransformer
- from transformers import pipeline
-
- # --- Global State and Initialization ---
- # These variables will hold the processed document data
- qa_index = None
- qa_chunks = []
- summarizer_chunks = []
- is_initialized = False
-
- # =================================================
- # MODEL LOADING (ONCE)
- # WARNING: This step is the primary cause of slow startup.
- # =================================================
-
- try:
-     # Embedding model for semantic retrieval
-     print("Loading Sentence Transformer model...")
-     embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
-
-     # Extractive QA model (accurate answers)
-     print("Loading Extractive QA model...")
-     qa_pipeline = pipeline(
-         "question-answering",
-         model="deepset/roberta-base-squad2",
-         tokenizer="deepset/roberta-base-squad2"
-     )

-     # Summarization model (clean summary)
-     print("Loading Summarization model...")
-     summarizer = pipeline(
-         "summarization",
-         model="facebook/bart-large-cnn",
-         tokenizer="facebook/bart-large-cnn"
-     )
-     is_initialized = True
-     print("All models loaded successfully.")
-
- except Exception as e:
-     print(f"ERROR: Failed to load required models. Please check dependencies (requirements.txt). Error: {e}")
-     # Set initialized to False so functions return an error message
-     is_initialized = False

- # =================================================
- # PDF PROCESSING UTILITIES
- # =================================================

  def extract_text_from_pdf(pdf_path):
-     """Extracts raw text content from a PDF file using PyMuPDF."""
      doc = fitz.open(pdf_path)
      text = ""
      for page in doc:
-         text += page.get_text() + "\n\n"
      return text


  def clean_text(text):
-     """Performs common cleanup on raw PDF text."""
-     # Remove excessive whitespace
-     text = re.sub(r"\s+", " ", text)
-     # Attempt to remove table of contents, headers, footers (often document-specific)
-     text = re.sub(r"Table of Contents.*?Introduction", "", text, flags=re.I | re.DOTALL)
-     text = re.sub(r"\bPage \d+ of \d+\b|\bPage \d+\b", "", text)
-     return text.strip()


- def chunk_text(text, chunk_size=400, overlap=100):
-     """Chunks text for QA retrieval (smaller chunks for better context focus)."""
      chunks = []
      start = 0
      while start < len(text):
          end = start + chunk_size
          chunks.append(text[start:end])
-         start = end - overlap if end < len(text) else len(text)
      return chunks


- def chunk_text_for_summary(text, chunk_size=1024, overlap=150):
-     """Chunks text for summarization (larger chunks to maintain context flow)."""
-     chunks = []
-     start = 0
-     while start < len(text):
-         end = start + chunk_size
-         chunks.append(text[start:end])
-         start = end - overlap if end < len(text) else len(text)
-     return chunks
-
-
- # =================================================
- # FAISS AND CONTEXT RETRIEVAL
- # =================================================

  def build_faiss_index(chunks):
-     """Builds a FAISS Index from text chunks."""
-     print(f"Encoding {len(chunks)} chunks...")
-     embeddings = embedding_model.encode(chunks, show_progress_bar=False)
      embeddings = np.array(embeddings).astype("float32")
-
-     # Initialize FAISS Index (L2 distance for 'multi-qa-MiniLM-L6-cos-v1')
      index = faiss.IndexFlatL2(embeddings.shape[1])
      index.add(embeddings)
-     print("FAISS Index built.")
      return index, chunks


- def retrieve_relevant_chunks(question, index, chunks, top_k=5):
-     """Retrieves the most relevant chunks for a given question."""
-     # Ensure FAISS index is ready
-     if index is None:
-         return []
-
-     # Encode the query
-     query_embedding = embedding_model.encode([question]).astype("float32")
-
-     # Search the index
-     distances, indices = index.search(query_embedding, top_k)
-
-     results = []
-     for i, idx in enumerate(indices[0]):
-         # Higher score (smaller distance) is better in L2
-         results.append((chunks[idx], distances[0][i]))
-
-     # Sort by distance (smallest distance first)
-     results.sort(key=lambda x: x[1])
-     return [r[0] for r in results]
-
-
- # =================================================
- # HANDLERS FOR GRADIO INPUT
- # =================================================
-
- def process_pdf(pdf_file):
-     """
-     Initial PDF processing step: extracts text, cleans it, chunks it,
-     and builds the FAISS index for retrieval. Updates global state.
-     """
-     global qa_index, qa_chunks, summarizer_chunks
-
-     if not is_initialized:
-         return "ERROR: AI models failed to load. Please check console for details."
-
-     if pdf_file is None:
-         # Clear state if no file is provided
-         qa_index = None
-         qa_chunks = []
-         summarizer_chunks = []
-         return "Please upload a PDF document."
-
-     try:
-         start_time = time.time()
-         print("Starting PDF processing...")
-
-         # 1. Extraction and Cleaning
-         raw_text = extract_text_from_pdf(pdf_file.name)
-         cleaned_text = clean_text(raw_text)
-
-         # 2. Chunking for QA and Summary
-         qa_chunks = chunk_text(cleaned_text)
-         # Summarizer chunks might be larger to keep sequential context
-         summarizer_chunks = chunk_text_for_summary(cleaned_text)
-
-         # 3. Building FAISS Index for QA
-         qa_index, qa_chunks = build_faiss_index(qa_chunks)
-
-         end_time = time.time()
-
-         return (f"Document successfully processed and indexed! "
-                 f"Total chunks: {len(qa_chunks)}. "
-                 f"Ready for Q&A and Summary. (Processing time: {end_time - start_time:.2f} seconds)")
-
-     except Exception as e:
-         return f"An error occurred during PDF processing: {e}"
-
-
- def get_answer(question):
-     """Handles the Question Answering functionality."""
-     if not is_initialized:
-         return "ERROR: AI models failed to load. Cannot answer questions."
-
-     if qa_index is None:
-         return "Please upload and process a document first."
-
-     if not question or question.strip() == "":
-         return "Please enter a question to get an answer."
-
-     try:
-         start_time = time.time()
-         # 1. Retrieval (RAG component)
-         relevant_chunks = retrieve_relevant_chunks(question, qa_index, qa_chunks)
-
-         # Combine the retrieved chunks into a single context
-         context = " ".join(relevant_chunks)
-
-         # 2. Generation (Extractive QA component)
-         # Pass the question and the combined, relevant context to the QA model
-         result = qa_pipeline(
-             question=question,
-             context=context,
-             # Set minimum answer length to avoid single-word outputs
-             max_answer_len=256,
          )

-         answer = result["answer"]
-         score = result["score"]
-
-         # Set a confidence threshold for a valid answer
-         if score < 0.4 or answer.strip() == "":
-             return "Information not found in the most relevant sections of the document (confidence too low)."
-
-         end_time = time.time()
-         return (f"Answer: {answer}\n\n"
-                 f"Confidence Score: {score:.2f}\n"
-                 f"Time taken: {end_time - start_time:.2f} seconds")
-
-     except Exception as e:
-         return f"An error occurred during Q&A generation: {e}"
-
-
- def get_summary():
-     """Handles the Summarization functionality."""
-     if not is_initialized:
-         return "ERROR: AI models failed to load. Cannot generate summary."
-
-     if not summarizer_chunks:
-         return "Please upload and process a document first."
-
-     try:
-         start_time = time.time()
-         summaries = []
-
-         # Summarize each chunk sequentially
-         for i, chunk in enumerate(summarizer_chunks):
-             print(f"Summarizing chunk {i+1}/{len(summarizer_chunks)}")
-             summary_output = summarizer(
-                 chunk,
-                 max_length=150,
-                 min_length=50,
-                 do_sample=False,
-                 truncation=True # Crucial to handle inputs slightly over the model's max length
-             )[0]["summary_text"]
-             summaries.append(summary_output)
-
-         # Join the sequential summaries and run a final merge summary
-         merged_summary_text = " ".join(summaries)
-
-         # If the merged summary is still too long, run a final summary pass
-         if len(merged_summary_text) > 1024:
-             print("Running final merge summary...")
-             final_summary_output = summarizer(
-                 merged_summary_text,
-                 max_length=400,
-                 min_length=150,
-                 do_sample=False,
-                 truncation=True
-             )[0]["summary_text"]
-         else:
-             final_summary_output = merged_summary_text
-
-         end_time = time.time()
-         return (f"--- Document Summary ---\n\n{final_summary_output}\n\n"
-                 f"Time taken: {end_time - start_time:.2f} seconds")
-
-     except Exception as e:
-         return f"An error occurred during summarization: {e}"
-
-
- # =================================================
- # GRADIO UI
- # =================================================

  with gr.Blocks() as demo:

      gr.Markdown("""
-     # 📄 Open-Source RAG Document Analysis System (Python/Gradio)
-
-     This system uses three best-in-class open-source models for **Retrieval-Augmented Generation (RAG)**:
-     1. **`multi-qa-MiniLM-L6-cos-v1`**: for fast, accurate context retrieval.
-     2. **`deepset/roberta-base-squad2`**: for highly accurate, extractive Question Answering.
-     3. **`facebook/bart-large-cnn`**: for multi-step, high-quality Summarization.
-
-     ⚠️ **Warning**: Initial model loading is very slow. Please be patient after the app starts.
-     """)

-     with gr.Row():
-         pdf_input = gr.File(label="📤 Upload PDF Document", file_types=[".pdf"])
-         process_status = gr.Textbox(label="Processing Status", interactive=False, value="Upload a PDF to begin.")
-
-     process_btn = gr.Button("1. Process & Index Document", variant="primary")
-     process_btn.click(process_pdf, [pdf_input], process_status)

-     gr.Markdown("---")
-
      with gr.Row():
          with gr.Column(scale=1):
              question_input = gr.Textbox(
-                 label="❓ Step 2: Ask a Question",
-                 placeholder="e.g. What were the Q4 revenue figures?",
                  lines=2
              )
-             qa_btn = gr.Button("🔍 Get Accurate Answer", variant="secondary")

-         with gr.Column(scale=1):
-             summary_btn = gr.Button("📝 Step 2: Generate Full Summary", variant="secondary")

-     output_box = gr.Textbox(label="📌 Output / Result", lines=10, interactive=False)

-     # Bind events
-     qa_btn.click(get_answer, [question_input], output_box)
-     summary_btn.click(get_summary, [], output_box)

      gr.Markdown("""
      ---
-     *Disclaimer: Due to the size of the models, expect longer processing times for Q&A and Summarization than API-based solutions.*
      """)

- # To run the Gradio application
- demo.launch()

  import gradio as gr
+ import fitz
  import re
  import faiss
+ import torch
  import numpy as np
+
  from sentence_transformers import SentenceTransformer
+ from transformers import AutoTokenizer, AutoModelForCausalLM


+ # ===============================
+ # MODEL LOADING
+ # ===============================
+
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ LLM_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+ tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
+ llm = AutoModelForCausalLM.from_pretrained(
+     LLM_NAME,
+     torch_dtype=torch.float32
+ )
+
+ llm.eval()

+
+ # ===============================
+ # PDF PROCESSING
+ # ===============================

  def extract_text_from_pdf(pdf_path):
      doc = fitz.open(pdf_path)
      text = ""
      for page in doc:
+         text += page.get_text()
      return text


  def clean_text(text):
+     return re.sub(r"\s+", " ", text).strip()


+ def chunk_text(text, chunk_size=500, overlap=50):
      chunks = []
      start = 0
      while start < len(text):
          end = start + chunk_size
          chunks.append(text[start:end])
+         start = end - overlap
      return chunks


+ # ===============================
+ # VECTOR DB (FAISS)
+ # ===============================

  def build_faiss_index(chunks):
+     embeddings = embedding_model.encode(chunks)
      embeddings = np.array(embeddings).astype("float32")
      index = faiss.IndexFlatL2(embeddings.shape[1])
      index.add(embeddings)
      return index, chunks


+ def retrieve_relevant_chunks(query, index, chunks, top_k=3):
+     query_embedding = embedding_model.encode([query]).astype("float32")
+     _, indices = index.search(query_embedding, top_k)
+     return [chunks[i] for i in indices[0]]
+
+
+ # ===============================
+ # LLM ANSWER
+ # ===============================
+
+ def generate_answer(question, context_chunks):
+     context = "\n\n".join(context_chunks)
+
+     prompt = f"""
+ Answer the question strictly using the given context.
+ If the answer is not found, say:
+ "Information not found in the document."
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ Answer:
+ """
+
+     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+
+     with torch.no_grad():
+         output = llm.generate(
+             **inputs,
+             max_new_tokens=200,
+             do_sample=True,  # temperature is ignored under greedy decoding, so sampling must be enabled
+             temperature=0.2
          )

+     decoded = tokenizer.decode(output[0], skip_special_tokens=True)
+     return decoded.split("Answer:")[-1].strip()
+
+
+ # ===============================
+ # MAIN PIPELINE
+ # ===============================
+
+ def pdf_rag_chat(pdf_file, question):
+     if pdf_file is None or question.strip() == "":
+         return "Please upload a PDF and enter a question."
+
+     text = extract_text_from_pdf(pdf_file.name)
+     text = clean_text(text)
+
+     chunks = chunk_text(text)
+     index, chunks = build_faiss_index(chunks)
+     context = retrieve_relevant_chunks(question, index, chunks)
+
+     return generate_answer(question, context)
+
+
+ # ===============================
+ # GRADIO UI (GRADIO 6 SAFE)
+ # ===============================

  with gr.Blocks() as demo:

      gr.Markdown("""
+     # 📄 PDF RAG Chatbot (Open-Source AI)

+     Upload a **PDF** and ask questions based **only on its content**.
+     Built using **Retrieval Augmented Generation (RAG)** and
+     **open-source Hugging Face models**, running on **free CPU**.
+     """)

      with gr.Row():
          with gr.Column(scale=1):
+             pdf_input = gr.File(
+                 label="📤 Upload PDF",
+                 file_types=[".pdf"]
+             )
+
              question_input = gr.Textbox(
+                 label="❓ Ask a question",
+                 placeholder="e.g. What is the objective of the project?",
                  lines=2
              )

+             submit_btn = gr.Button("🔍 Get Answer")

+         with gr.Column(scale=2):
+             answer_output = gr.Textbox(
+                 label="📌 Answer",
+                 lines=10
+             )

+     submit_btn.click(
+         fn=pdf_rag_chat,
+         inputs=[pdf_input, question_input],
+         outputs=answer_output
+     )

      gr.Markdown("""
      ---
+     **© Simranpreet Kaur**
+     **NIELIT Ropar | AIML Six Months Training | 2026**
      """)

+ demo.launch()
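
For anyone reviewing the rewrite, here is a minimal standalone sketch of its retrieval half (not part of the commit; it mirrors build_faiss_index and retrieve_relevant_chunks above, the toy chunks are invented for illustration, and it assumes faiss-cpu and sentence-transformers are installed):

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Same embedding model and index type as the new app.py.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Invented stand-in chunks; in the app these come from chunk_text().
chunks = [
    "The objective of the project is a PDF question-answering chatbot.",
    "FAISS stores one embedding per 500-character text chunk.",
    "TinyLlama generates the final answer from the retrieved context.",
]

embeddings = np.array(model.encode(chunks)).astype("float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

query = np.array(model.encode(["What is the objective of the project?"])).astype("float32")
_, indices = index.search(query, 2)   # top_k=2 is plenty for three chunks
print([chunks[i] for i in indices[0]])  # nearest (smallest L2 distance) chunk first

IndexFlatL2 performs an exact brute-force search, which is fine at this scale; the app then joins the retrieved chunks with blank lines and passes them to TinyLlama as the prompt context.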