# Sourbh510's picture
# Update app.py
# cf6cbb7 verified
import gradio as gr
import faiss
import numpy as np
import fitz
import docx
from pptx import Presentation
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by build_kb (document chunks) and
# ask_question (queries), so both are embedded in the same vector space.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Seq2seq tokenizer/model pair that ask_question uses to generate the answer.
gen_model_name = "google/flan-t5-small"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)

# Module-level knowledge-base state, replaced by build_kb:
#   chunks_store -- list of text chunks, positionally aligned with the index
#   index        -- FAISS index over the chunk embeddings (None until built)
chunks_store = []
index = None
def extract_text(file):
    """Extract the plain text of a PDF, DOCX or PPTX document.

    Args:
        file: Either a filesystem path string, or an object exposing the
            path through its ``name`` attribute.

    Returns:
        The concatenated text of the document (DOCX paragraphs and PPTX
        shape texts are each followed by a newline). For any other file
        extension the literal string ``"Unsupported file format"`` is
        returned.
    """
    path = file if isinstance(file, str) else file.name
    # Extension matching is case-insensitive.
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        # The context manager closes the document handle once all pages
        # have been read, instead of leaking it.
        with fitz.open(path) as pdf:
            return "".join(page.get_text() for page in pdf)

    if lowered.endswith(".docx"):
        document = docx.Document(path)
        return "".join(p.text + "\n" for p in document.paragraphs)

    if lowered.endswith(".pptx"):
        presentation = Presentation(path)
        # Only shapes that expose a ``text`` attribute contribute text.
        return "".join(
            shape.text + "\n"
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    return "Unsupported file format"
def build_kb(file):
    """Build the in-memory knowledge base from an uploaded document.

    The document text is split into overlapping character windows, each
    window is embedded with ``embed_model``, and the embeddings are stored
    in a fresh FAISS L2 index. On success the module-level ``chunks_store``
    and ``index`` are replaced.

    Args:
        file: The uploaded file as handed over by the UI, or ``None`` when
            nothing was uploaded.

    Returns:
        A human-readable status string.
    """
    global chunks_store, index

    if file is None:
        return "Please upload a file first."

    text = extract_text(file)
    if text == "Unsupported file format":
        # extract_text signals an unsupported extension through this exact
        # string; report it as the status instead of indexing it as if it
        # were document content.
        return text
    if not text.strip():
        return "No text extracted from file."

    chunks = _split_into_chunks(text, chunk_size=500, overlap=100)
    if not chunks:
        return "No chunks created."

    # FAISS expects a contiguous float32 matrix.
    embeddings = np.ascontiguousarray(
        embed_model.encode(chunks), dtype="float32"
    )
    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both globals only after indexing succeeded, so a failure
    # above cannot leave chunks_store out of sync with index.
    chunks_store = chunks
    index = new_index
    return f"Knowledge Base Created with {len(chunks)} chunks"


def _split_into_chunks(text, chunk_size, overlap):
    """Slice text into chunk_size-character windows overlapping by overlap."""
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= len(text):
            # This window already reaches the end of the text; another
            # one would only repeat part of its tail.
            break
    return chunks
def ask_question(question):
    """Answer a question using the chunks stored in the knowledge base.

    The question is embedded with ``embed_model``, the nearest chunks (at
    most two) are looked up in the module-level FAISS ``index``, and
    ``gen_model`` generates an answer from a prompt containing those chunks.

    Args:
        question: The user's question text.

    Returns:
        A ``(answer, retrieved)`` tuple: the generated answer cut to its
        first sentence, and the retrieved chunk text joined by blank lines.
        When the question is empty or no knowledge base exists, ``answer``
        is a hint for the user and ``retrieved`` is ``""``.
    """
    if not question or not question.strip():
        return "Please enter a question.", ""
    if index is None or index.ntotal == 0:
        return "Upload knowledge first.", ""

    query = np.ascontiguousarray(
        embed_model.encode([question]), dtype="float32"
    )
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which as a list index would silently pick the last
    # chunk and duplicate it in the context.
    top_k = min(2, index.ntotal)
    _, neighbours = index.search(query, k=top_k)
    retrieved = "\n\n".join(
        chunks_store[i] for i in neighbours[0] if 0 <= i < len(chunks_store)
    )

    prompt = (
        "\n"
        "Use only the provided context.\n"
        "If the answer is not found in the context, reply:\n"
        "Information not found in document.\n"
        "Context:\n"
        f"{retrieved}\n"
        "Question:\n"
        f"{question}\n"
        "Answer in one concise sentence:\n"
    )
    inputs = gen_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=35,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
    )
    answer = gen_tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    ).strip()
    return _first_sentence(answer), retrieved


def _first_sentence(text):
    """Return text cut just after its first sentence-ending period."""
    import re  # local import keeps this block self-contained

    # Only a period followed by whitespace or end-of-text ends the sentence,
    # so decimals such as "3.14" are not cut in half.
    match = re.search(r"\.(?=\s|$)", text)
    return text[:match.end()] if match else text
# Two-tab Gradio interface: the first tab feeds an uploaded document to
# build_kb, the second sends a question to ask_question and shows the answer
# together with the retrieved source text.
with gr.Blocks() as demo:
    gr.Markdown(
        "\n# 📚 RAG Question Answering System\n"
        "Ask questions over your own documents\n"
    )

    with gr.Tab("Build Knowledge Base"):
        doc = gr.File(label="Upload PDF / DOCX / PPTX")
        status = gr.Textbox(label="Status")
        build_btn = gr.Button("Create Knowledge Base")
        # build_kb returns a single status string.
        build_btn.click(fn=build_kb, inputs=doc, outputs=status)

    with gr.Tab("Ask Questions"):
        question = gr.Textbox(label="Ask a Question")
        answer = gr.Textbox(label="Grounded Answer", lines=6)
        sources = gr.Textbox(label="Sources", lines=8)
        ask_btn = gr.Button("Ask")
        # ask_question returns (answer, retrieved), mapped onto the two
        # output boxes in that order.
        ask_btn.click(
            fn=ask_question,
            inputs=question,
            outputs=[answer, sources],
        )

# Start the web server for the interface.
demo.launch()