Spaces:

murkasad
/

DeepLearningRAGchatbot

Sleeping

App Files Files Community

murkasad commited on Apr 7

Commit

22cff0b

verified ·

1 Parent(s): 48878c3

Upload 10 files

Browse files

Files changed (10) hide show

README.md +46 -13
app.py +51 -0
chatbot.py +75 -0
config.py +14 -0
error_logger.py +15 -0
graphs.py +34 -0
langchain_text_splitter.py +17 -0
summarizer.py +21 -0
text_extraction.py +32 -0
vector_store.py +34 -0

README.md CHANGED Viewed

@@ -1,13 +1,46 @@
----
-title: DeepLearningRAGchatbot
-emoji: 📈
-colorFrom: blue
-colorTo: purple
-sdk: gradio
-sdk_version: 6.11.0
-app_file: app.py
-pinned: false
-short_description: Chatbot that helps gain more knowledge about deep learning
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Text Retrieval and Summarizer ChatBot Framework
+### RAG (Retrieval-Augmented Generation) System
+**Project Summary**:
+This project is a text retrieval and summarization system that allows users to input a question and receive a concise summary based on relevant content.
+It works by first converting the user’s input into numerical embeddings using a sentence transformer model. These embeddings are then compared against a pre-built vector index (FAISS) to identify the most relevant text chunks from the dataset. The retrieved content is combined and passed to BART, a transformer-based summarization model, which generates a concise summary as the final output.
+The entire pipeline is integrated into an interactive user interface built using Gradio, allowing users to easily input queries and view summarized results in real time.
+Steps:
+1. Retrieves relevant text from a User's Document (FAISS)
+2. Converts the text chunks and the question into sentence embeddings (Sentence Transformer)
+3. Generates a Summarized output (HuggingFace Text Summarizer)
+**Use of SBERT**:
+Sentence Transformers (SBERT) provides pretrained embedding models: we pass in the chunks from the previous step and the model returns one vector per chunk. (source: Hugging Face)
+Embeddings are dense, lower-dimensional, numerical vector representations of data such as text, images, or audio that capture semantic meaning and relationships. (source: Google)
+Steps:
+1. Load an embedding model
+2. Feed text chunks into the model
+3. Convert each chunk into a vector of numbers
+Transformer Model (all-MiniLM-L6-v2):
+This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.(huggingface)
+**Use of FAISS**:
+FAISS, a super-fast “vector search engine”, stands for Facebook AI Similarity Search.
+It is an open-source library developed by Meta's Fundamental AI Research group (formerly Facebook AI Research) designed for the efficient similarity search and clustering of dense vectors. (google)
+The pipeline takes chunks of text from the document.
+Each chunk has already been converted to a 384-dimensional embedding by MiniLM.
+All of these embeddings are stored in FAISS,
+so when a user asks a question, the question is converted to a vector and FAISS finds the nearest embeddings (the most similar chunks of text from the document).
+Those chunks are then passed to the summarization model (BART) to generate the answer.
+**Final Pipeline**:
+Take PDF -> Get chunks -> Make embeddings -> Ask Question -> Retrieve Answer -> Summarize Result and Display Metrics
+*--by Murk Asad*

app.py ADDED Viewed

	@@ -0,0 +1,51 @@

+#setting up interface
+import gradio as gr
+from error_logger import setup_logger
+from text_extraction import load_pdf_text
+from langchain_text_splitter import clean_text, create_chunks
+from vector_store import build_vectorstore
+from summarizer import load_summarizer
+from chatbot import chat_answer
+from config import PDF_PATH
+setup_logger() #configure logging (log file + console) before the steps below run
+corpus = load_pdf_text(PDF_PATH)  #text extracted from the PDF at PDF_PATH
+cleaned = clean_text(corpus)  #whitespace-normalised, lower-cased text
+chunks = create_chunks(cleaned)  #chunks of the cleaned text; also read by respond() on every request
+embedding_model, index = build_vectorstore(chunks)  #embedding model plus the index built from the chunks
+summarizer = load_summarizer()  #created once here and reused by respond()
+def respond(message, history):
+    """Answer one chat message and return the updated UI state.
+
+    Delegates to chat_answer() with the module-level retrieval objects, then
+    records the question and the reply in ``history`` (modified in place).
+
+    Returns the history list, the metrics text and the three graph values
+    produced by chat_answer(), in that order.
+    """
+    reply, metrics, g1, g2, g3 = chat_answer(
+        message=message,
+        history=history,
+        embedding_model=embedding_model,
+        index=index,
+        chunks=chunks,
+        summarizer=summarizer,
+    )
+    # chat_answer() above saw the history without the current turn.
+    history.extend(
+        [
+            {"role": "user", "content": message},
+            {"role": "assistant", "content": reply},
+        ]
+    )
+    return history, metrics, g1, g2, g3
+with gr.Blocks() as demo:
+    # Layout: heading, chat window, question box, metrics text, three graphs.
+    gr.Markdown(value="## Deep Learning Chat with Metrics & Graphs")
+    chatbot = gr.Chatbot()
+    msg = gr.Textbox(label="Ask a question")
+    metrics_box = gr.Textbox(label="Metrics")
+    g1 = gr.Image(label="Graph 1")
+    g2 = gr.Image(label="Graph 2")
+    g3 = gr.Image(label="Graph 3")
+    # Submitting the question box runs respond() and refreshes every output.
+    msg.submit(
+        fn=respond,
+        inputs=[msg, chatbot],
+        outputs=[chatbot, metrics_box, g1, g2, g3],
+    )
+    gr.Markdown(value="RAG Project by Murk Asad")
+demo.launch()

chatbot.py ADDED Viewed

	@@ -0,0 +1,75 @@

+#getting replies
+import time
+import logging
+from config import TOP_K, MAX_RETRIEVED_WORDS
+from graphs import create_graphs
+from vector_store import retrieve_chunks
+from summarizer import summarize_text
+logger = logging.getLogger(__name__)
+def chat_answer(message, history, embedding_model, index, chunks, summarizer):
+    """Retrieve context for a question, summarize it and report run metrics.
+
+    Args:
+        message: The user's current question.
+        history: Earlier chat messages as dicts with "role" and "content"
+            keys; may be empty or None.
+        embedding_model: Model handed to retrieve_chunks() to embed the query.
+        index: Vector index handed to retrieve_chunks().
+        chunks: All document chunks (strings).
+        summarizer: Summarizer object handed to summarize_text().
+
+    Returns:
+        A 5-tuple (summary, metrics, g1, g2, g3). On any failure the error is
+        logged and a fixed message, an empty metrics string and three None
+        values are returned, so the caller always receives five values.
+    """
+    try:
+        # Fold the user's own questions from the last three history entries
+        # into the query so a follow-up question keeps its context.
+        # Assistant replies and entries without a "role" are ignored.
+        recent_questions = [
+            str(entry["content"])
+            for entry in (history or [])[-3:]
+            if entry.get("role") == "user"
+        ]
+        # strip() avoids a stray leading space when there is no context yet.
+        full_query = " ".join([*recent_questions, message]).strip()
+        # perf_counter() is meant for measuring intervals; time.time() can
+        # jump when the system clock is adjusted.
+        t_start = time.perf_counter()
+        retrieved_chunks = retrieve_chunks(
+            full_query,
+            embedding_model,
+            index,
+            chunks,
+            TOP_K
+        )
+        t_retrieved = time.perf_counter()
+        answer = " ".join(retrieved_chunks)
+        # Cap the text passed to the summarizer at MAX_RETRIEVED_WORDS words.
+        answer = " ".join(answer.split()[:MAX_RETRIEVED_WORDS])
+        summary = summarize_text(answer, summarizer)
+        t_summarized = time.perf_counter()
+        retrieved_len = len(answer.split())
+        summary_len = len(summary.split())
+        # Time spent in each stage of the pipeline, in seconds.
+        stage_times = {
+            "Retrieve": t_retrieved - t_start,
+            "Summarize": t_summarized - t_retrieved
+        }
+        # NOTE: recomputed over every chunk on each request.
+        chunk_lengths = [len(c.split()) for c in chunks]
+        g1, g2, g3 = create_graphs(
+            chunk_lengths,
+            retrieved_len,
+            summary_len,
+            stage_times
+        )
+        metrics = f"""
+                        Retrieved words: {retrieved_len}
+                        Summary words: {summary_len}
+                        Compression ratio: {round(summary_len / max(retrieved_len,1), 3)}
+                        Retrieval time: {round(stage_times['Retrieve'],3)}s
+                        Summarization time: {round(stage_times['Summarize'],3)}s
+                        """
+        return summary, metrics, g1, g2, g3
+    except Exception:
+        # Boundary handler for the UI: log the traceback and still return
+        # five values so the caller can unpack the result.
+        logger.exception("Chatbot error")
+        return "Error occurred. Please try again.", "", None, None, None

config.py ADDED Viewed

	@@ -0,0 +1,14 @@

+#basic configuration values imported by the other modules
+PDF_PATH = "data/Deep+Learning+Ian+Goodfellow.pdf"  #PDF that app.py loads at startup
+CHUNK_SIZE = 500  #passed as chunk_size to the text splitter
+CHUNK_OVERLAP = 50  #passed as chunk_overlap to the text splitter
+EMBEDDING_MODEL = "all-MiniLM-L6-v2"  #model name given to SentenceTransformer
+SUMMARIZER_MODEL = "facebook/bart-large-cnn"  #model name given to the summarization pipeline
+TOP_K = 2  #number of chunks requested from the index per question
+MAX_RETRIEVED_WORDS = 200  #retrieved text is cut to this many words before summarizing
+MIN_SUMMARY_LEN = 20  #passed as min_length to the summarizer (presumably tokens - confirm)
+MAX_SUMMARY_LEN = 50  #passed as max_length to the summarizer (presumably tokens - confirm)

error_logger.py ADDED Viewed

	@@ -0,0 +1,15 @@

+#for error handling
+import logging
+import os
+def setup_logger():
+    """Set up logging to logs/app.log and to the console.
+
+    Creates the ``logs`` directory when it is missing and configures
+    logging at INFO level with a timestamp | level | logger | message format.
+    """
+    os.makedirs("logs", exist_ok=True)
+    file_handler = logging.FileHandler("logs/app.log")
+    console_handler = logging.StreamHandler()
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+        handlers=[file_handler, console_handler],
+    )

graphs.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import matplotlib.pyplot as plt
+import tempfile
+import os
+import uuid  #helps generate temporary random ids for storing graphs with unique names randomly and save them
+def _save_figure(draw, title, path):
+    """Draw one chart via draw(ax), title it, save it to path and close it."""
+    fig, ax = plt.subplots()
+    try:
+        draw(ax)
+        ax.set_title(title)
+        fig.savefig(path)
+    finally:
+        # Close this specific figure even when drawing or saving fails;
+        # otherwise pyplot keeps it open and memory grows with each failure.
+        plt.close(fig)
+    return path
+def create_graphs(chunk_lengths, retrieved_len, summary_len, stage_times):
+    """Render the three metric charts as PNG files and return their paths.
+
+    Args:
+        chunk_lengths: Word count of every chunk (histogram input).
+        retrieved_len: Word count of the retrieved text.
+        summary_len: Word count of the summary.
+        stage_times: Mapping of stage name to elapsed time.
+
+    Returns:
+        A tuple (g1, g2, g3) of file paths in the system temp directory:
+        stage timings, chunk-length histogram, retrieved-vs-summary lengths.
+        The three files share one random id so calls do not overwrite each
+        other. The files are not deleted here.
+    """
+    temp_dir = tempfile.gettempdir()
+    uid = str(uuid.uuid4())
+    def target(prefix):
+        return os.path.join(temp_dir, f"{prefix}_{uid}.png")
+    # Each chart is drawn on its own explicit axes instead of pyplot's
+    # implicit "current figure".
+    g1 = _save_figure(
+        lambda ax: ax.bar(list(stage_times.keys()), list(stage_times.values())),
+        "Pipeline Stage Execution Time",
+        target("g1"),
+    )
+    g2 = _save_figure(
+        lambda ax: ax.hist(chunk_lengths, bins=20),
+        "Chunk Length Distribution",
+        target("g2"),
+    )
+    g3 = _save_figure(
+        lambda ax: ax.bar(["Retrieved", "Summary"], [retrieved_len, summary_len]),
+        "Retrieved vs Summary Length",
+        target("g3"),
+    )
+    return g1, g2, g3

langchain_text_splitter.py ADDED Viewed

	@@ -0,0 +1,17 @@

+#splitting text to chunks from the extracted pdf file, and overlapping chunks to keep some previous context
+import re
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from config import CHUNK_SIZE, CHUNK_OVERLAP  #getting values from configuration file
+def clean_text(corpus: str) -> str:
+    """Normalise extracted text before it is split into chunks.
+
+    Collapses every run of whitespace to one space, inserts a space where a
+    lower-case letter is directly followed by an upper-case one, and
+    lower-cases the result.
+    """
+    single_spaced = re.sub(r'\s+', ' ', corpus)
+    words_separated = re.sub(r'([a-z])([A-Z])', r'\1 \2', single_spaced)
+    return words_separated.lower()
+def create_chunks(text: str):
+    """Split text into chunks using the size and overlap set in config."""
+    return RecursiveCharacterTextSplitter(
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP,
+    ).split_text(text)

summarizer.py ADDED Viewed

	@@ -0,0 +1,21 @@

+#summarizing the closest 2 chunks extracted from vector store
+from transformers import pipeline
+from config import SUMMARIZER_MODEL, MIN_SUMMARY_LEN, MAX_SUMMARY_LEN
+def load_summarizer():
+    """Create the summarization pipeline for the model named in config."""
+    summarization_pipeline = pipeline("summarization", model=SUMMARIZER_MODEL)
+    return summarization_pipeline
+def summarize_text(text, summarizer):
+    """Summarize text with the given summarizer and return the summary string.
+
+    Raises:
+        ValueError: If text is empty or contains only whitespace.
+    """
+    if not (text and text.strip()):
+        raise ValueError("Input text for summarization is empty.")
+    generation_options = {
+        "repetition_penalty": 5.0,
+        "length_penalty": 0.3,
+        "min_length": MIN_SUMMARY_LEN,
+        "max_length": MAX_SUMMARY_LEN,
+    }
+    results = summarizer(text, **generation_options)
+    # Only the first result's "summary_text" field is needed.
+    first_result = results[0]
+    return first_result["summary_text"]

text_extraction.py ADDED Viewed

	@@ -0,0 +1,32 @@

+#extracting from the pdf book
+import pdfplumber
+import logging
+logger = logging.getLogger(__name__)
+def load_pdf_text(pdf_path: str) -> str:
+    """Extract the text of every page of a PDF into one string.
+
+    Pages that yield no text are skipped with a warning. The text of each
+    remaining page is followed by a single space.
+
+    Raises:
+        FileNotFoundError: Logged and re-raised when the PDF is missing.
+        ValueError: If no page yields any text.
+        Exception: Any other error is logged with its traceback and
+            re-raised unchanged.
+    """
+    try:
+        page_texts = []
+        with pdfplumber.open(pdf_path) as pdf:
+            # start=1 only makes the logged page numbers 1-based;
+            # it does not skip any page.
+            for page_num, page in enumerate(pdf.pages, start=1):
+                text = page.extract_text()
+                if text:
+                    page_texts.append(text)
+                else:
+                    logger.warning("No text found on page %s", page_num)
+        # Join once at the end instead of growing a string page by page.
+        corpus = "".join(f"{text} " for text in page_texts)
+        if not corpus.strip():
+            raise ValueError("Empty PDF content")
+        logger.info("PDF loaded successfully")
+        return corpus
+    except FileNotFoundError:
+        logger.error("PDF file not found")
+        raise
+    except Exception as e:
+        logger.exception("Error loading PDF: %s", e)
+        raise

vector_store.py ADDED Viewed

	@@ -0,0 +1,34 @@

+#transforming sentence chunks from langchain into vectors usin faiss
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from config import EMBEDDING_MODEL
+def load_embedding_model():
+    """Load the sentence-embedding model named by EMBEDDING_MODEL in config."""
+    encoder = SentenceTransformer(EMBEDDING_MODEL)
+    return encoder
+def build_vectorstore(chunks):
+    """Embed all chunks and store the vectors in a flat L2 FAISS index.
+
+    Returns:
+        A tuple (model, index): the embedding model that was used and the
+        index holding one vector per chunk, added in chunk order.
+
+    Raises:
+        ValueError: If chunks is empty.
+    """
+    if not chunks:
+        raise ValueError("Chunks list is empty.")
+    encoder = load_embedding_model()
+    vectors = encoder.encode(chunks)
+    # The index width is the second dimension of the embedding matrix.
+    store = faiss.IndexFlatL2(vectors.shape[1])
+    store.add(np.array(vectors).astype("float32"))
+    return encoder, store
+def retrieve_chunks(query, model, index, chunks, k):
+    """Return the chunks whose embeddings are closest to the query.
+
+    Args:
+        query: Question text to embed.
+        model: Object whose encode() turns a list of texts into vectors.
+        index: Index whose search() returns (distances, indices).
+        chunks: Chunk texts, positioned to match the ids stored in the index.
+        k: Number of neighbours to request. A larger k gives the summarizer
+            more context but makes each request slower.
+
+    Returns:
+        Up to k chunk strings, in the order the index returned them.
+
+    Raises:
+        ValueError: If index is None.
+    """
+    if index is None:
+        raise ValueError("FAISS index has not been built.")
+    query_embedding = np.asarray(model.encode([query]), dtype="float32")
+    _distances, indices = index.search(query_embedding, k)
+    # Only one query was sent, so row 0 holds its neighbours. FAISS pads the
+    # row with -1 when it finds fewer than k results; without this filter
+    # chunks[-1] would silently return the last chunk as if it were a hit.
+    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]