Spaces:

Sourbh510
/

Retrieval-Augmented-Generation_Question_Answering_System

Sleeping

Retrieval-Augmented-Generation_Question_Answering_System

File size: 3,776 Bytes

05c26b1
 
 
3bf116e
 
 
05c26b1
 
 
 
 
 
 
f2f6d01
 
cf6cbb7
f2f6d01
 
 
05c26b1
 
 
 
 
3bf116e
05c26b1
3bf116e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05c26b1
 
 
3bf116e
 
cf6cbb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05c26b1
 
 
 
 
 
 
 
3bf116e
05c26b1
cf6cbb7
 
 
05c26b1
 
3bf116e
05c26b1
 
 
cf6cbb7
05c26b1
 
 
 
cf6cbb7
05c26b1
 
 
cf6cbb7
05c26b1
 
 
 
 
 
 
cf6cbb7
 
 
 
05c26b1
 
 
 
 
 
 
cf6cbb7
05c26b1
 
f2f6d01
05c26b1
f2f6d01
 
 
 
05c26b1
f2f6d01
 
cf6cbb7
 
 
 
f2f6d01
 
 
 
 
cf6cbb7
 
 
 
05c26b1
f2f6d01
05c26b1
 
 
 
 
 
 
 
 
 
 
 
 
3bf116e
 
05c26b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf6cbb7
05c26b1
 
 
 
cf6cbb7
05c26b1

import gradio as gr
import faiss
import numpy as np
import fitz
import docx
from pptx import Presentation

from sentence_transformers import SentenceTransformer

# Sentence-embedding model; loaded once at import time and reused for every
# encode() call in this module.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One checkpoint name drives both the tokenizer and the seq2seq generator so
# the two can never get out of sync.
gen_model_name = "google/flan-t5-small"

gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)


# Module-level knowledge-base state.
chunks_store = []  # text chunks of the currently loaded document
index = None  # vector index over the chunks; None until one is built

def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, or PPTX file.

    Args:
        file: The uploaded file. Either an object exposing its on-disk path
            as ``.name`` or a plain path string. ``None`` (nothing uploaded)
            is accepted.

    Returns:
        The extracted text. An empty string when ``file`` is ``None``, and
        the literal message ``"Unsupported file format"`` when the file
        extension is not ``.pdf``, ``.docx`` or ``.pptx``.
    """
    if file is None:
        return ""

    # Accept bare path strings as well as wrappers carrying the path in .name.
    path = file if isinstance(file, str) else file.name
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        # The context manager closes the document handle even if reading a
        # page raises; previously the handle was never closed.
        with fitz.open(path) as pdf:
            return "".join(page.get_text() for page in pdf)

    if lowered.endswith(".docx"):
        document = docx.Document(path)
        return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)

    if lowered.endswith(".pptx"):
        deck = Presentation(path)
        parts = []
        for slide in deck.slides:
            for shape in slide.shapes:
                # Only shapes that expose text contribute to the output.
                if hasattr(shape, "text"):
                    parts.append(shape.text + "\n")
        return "".join(parts)

    return "Unsupported file format"
    
_SUPPORTED_EXTENSIONS = (".pdf", ".docx", ".pptx")


def _chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into ``chunk_size``-char windows sharing ``overlap`` chars."""
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Once a window reaches the end of the text, any further window would
        # only repeat overlap that this window already contains.
        if start + chunk_size >= len(text):
            break
    return chunks


def build_kb(file):
    """Build the in-memory knowledge base from an uploaded document.

    Extracts the document text, splits it into overlapping 500-character
    chunks, embeds the chunks and stores them in a FAISS L2 index. On success
    the module-level ``chunks_store`` and ``index`` are replaced together.

    Args:
        file: The uploaded file (object exposing the path as ``.name``, or a
            path string), or ``None`` when nothing was uploaded.

    Returns:
        A human-readable status message describing the outcome.
    """
    global chunks_store, index

    if file is None:
        return "Please upload a file first."

    # Reject unsupported formats up front so that an error message is never
    # chunked and indexed as if it were document content.
    file_name = file if isinstance(file, str) else getattr(file, "name", "")
    if not str(file_name).lower().endswith(_SUPPORTED_EXTENSIONS):
        return "Unsupported file format. Please upload a PDF, DOCX, or PPTX file."

    text = extract_text(file)
    if not text.strip():
        return "No text extracted from file."

    chunks = _chunk_text(text, chunk_size=500, overlap=100)
    if not chunks:
        # Defensive only: non-empty text always yields at least one chunk.
        return "No chunks created."

    # FAISS expects a contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embed_model.encode(chunks), dtype="float32")

    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish chunks and index together, only after everything succeeded, so
    # a failure above cannot leave new chunks paired with an old index.
    chunks_store = chunks
    index = new_index

    return f"Knowledge Base Created with {len(chunks)} chunks"

def ask_question(question):
    """Answer a question using the chunks retrieved from the knowledge base.

    Embeds the question, retrieves up to two nearest chunks from the
    module-level ``index``, and asks the generator model for a one-sentence
    answer grounded in those chunks.

    Args:
        question: The user's question text.

    Returns:
        A ``(answer, retrieved)`` tuple: the generated answer (or a status
        message when no answer can be produced) and the retrieved context
        text (empty when nothing was retrieved).
    """
    if not question or not question.strip():
        return "Please enter a question.", ""

    # Take one snapshot of the shared state so the index and the chunk list
    # used below belong together.
    kb_index, kb_chunks = index, chunks_store

    if kb_index is None or kb_index.ntotal == 0:
        return "Upload knowledge first.", ""

    q_emb = np.asarray(embed_model.encode([question]), dtype="float32")

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which would silently index the last chunk.
    top_k = min(2, kb_index.ntotal)
    _, neighbours = kb_index.search(q_emb, top_k)

    retrieved = "\n\n".join(
        kb_chunks[i] for i in neighbours[0] if 0 <= i < len(kb_chunks)
    )

    prompt=f"""
Use only the provided context.

If the answer is not found in the context, reply:
Information not found in document.

Context:
{retrieved}

Question:
{question}

Answer in one concise sentence:
"""

    inputs = gen_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=35,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
    )

    answer = gen_tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    ).strip()

    # Keep only the first sentence. Splitting on ". " rather than on every
    # "." leaves decimals and version numbers such as "3.14" intact.
    first_sentence, separator, _ = answer.partition(". ")
    if separator:
        answer = first_sentence + "."

    return answer, retrieved


# Two-tab Gradio interface: the first tab builds the knowledge base from an
# uploaded document, the second asks questions against it.
with gr.Blocks() as demo:

    gr.Markdown(
        "\n"
        "# 📚 RAG Question Answering System\n"
        "Ask questions over your own documents\n"
    )

    with gr.Tab("Build Knowledge Base"):
        doc = gr.File(label="Upload PDF / DOCX / PPTX")
        status = gr.Textbox(label="Status")
        build_btn = gr.Button("Create Knowledge Base")

        # The uploaded file goes to build_kb; its status message is displayed.
        build_btn.click(fn=build_kb, inputs=doc, outputs=status)

    with gr.Tab("Ask Questions"):
        question = gr.Textbox(label="Ask a Question")
        answer = gr.Textbox(label="Grounded Answer", lines=6)
        sources = gr.Textbox(label="Sources", lines=8)
        ask_btn = gr.Button("Ask")

        # ask_question returns (answer, retrieved context), in that order.
        ask_btn.click(fn=ask_question, inputs=question, outputs=[answer, sources])


demo.launch()