# RAG_Book_QA_System / gradio_app.py
# Gradio front end: upload a document, index it into FAISS, and ask questions.
# Provenance: uploaded by samithcs ("added gradio and app file", commit 6af3122).
import gradio as gr
from pathlib import Path
import os
import re
from pipeline.ingest.pdf_parser import PDFParser
from pipeline.ingest.docx_parser import DOCXParser
from pipeline.ingest.txt_parser import TXTParser
from pipeline.ingest.html_parser import HTMLParser
from pipeline.chunking.fixed_chunker import FixedChunker
from pipeline.embeddings.sentence_transformer_embed import embed_chunks
from pipeline.vector_store.faiss_store import FaissStore
from pipeline.rag.retrieval_engine import answer_question
# Path of the persisted FAISS index, shared across all uploads in this app.
FAISS_INDEX_PATH = "data/faiss.index"
# Embedding vector dimensionality; must match the embedding model used below
# ("all-MiniLM-L6-v2" produces 384-dim vectors).
EMBED_DIM = 384
def sanitize_filename(filename):
    """Return *filename* with every character outside ``[A-Za-z0-9_.-]`` replaced by ``_``.

    Keeps ASCII letters, digits, underscore, dot and hyphen; everything else
    (spaces, slashes, parentheses, non-ASCII, ...) becomes an underscore so the
    result is safe to use as a local file name.
    """
    # \w with re.ASCII is exactly [a-zA-Z0-9_]; add the literal '.' and '-'.
    return re.sub(r"[^\w.-]", "_", filename, flags=re.ASCII)
def process_and_qa(file, question):
    """Ingest an uploaded document into the FAISS index and answer *question*.

    Accepts the shapes Gradio may hand us: a file-like object with ``.read()``,
    an object exposing ``.data``, raw ``bytes``, or (Gradio >= 4) a path string
    to Gradio's own temp copy of the upload.

    Returns a 3-tuple of strings ``(preview, answer, context)``; on failure
    each slot carries an error message instead of raising.
    """
    try:
        save_dir = Path("data/raw/")
        save_dir.mkdir(parents=True, exist_ok=True)

        content = None
        if isinstance(file, str) and os.path.exists(file):
            # Gradio passed a path to its temp copy; parse it in place.
            # BUGFIX: derive the filename (and thus the extension) from the
            # real path — a plain str has no ``.name`` attribute, so the old
            # code fell back to "uploaded_file", whose empty suffix made every
            # path-string upload return "Unsupported filetype.".
            file_path = file
            filename = sanitize_filename(Path(file).name)
        else:
            filename = sanitize_filename(getattr(file, "name", "uploaded_file"))
            file_path = save_dir / Path(filename).name
            if hasattr(file, "read"):
                content = file.read()
            elif hasattr(file, "data"):
                content = file.data
            elif isinstance(file, bytes):
                content = file
            else:
                return "Invalid file object format!", "Error", "Error"

        # BUGFIX: test "is not None", not truthiness, so a legitimately empty
        # upload is still written to disk before parsing.
        if content is not None:
            with open(file_path, "wb") as f:
                f.write(content)

        # Pick a parser by file extension; bail out on anything unknown.
        ext = Path(filename).suffix.lower()
        if ext == ".pdf":
            parser = PDFParser()
        elif ext == ".docx":
            parser = DOCXParser()
        elif ext == ".txt":
            parser = TXTParser()
        elif ext in (".html", ".htm"):
            parser = HTMLParser()
        else:
            return "Unsupported filetype.", "", ""

        try:
            text, metadata = parser.extract_text_and_metadata(str(file_path))
            chunks = FixedChunker().chunk(text, chunk_size=512, overlap=64)
            embeddings = embed_chunks(chunks, model_name="all-MiniLM-L6-v2")
            metadatas = [{} for _ in chunks]
            store = FaissStore(dim=EMBED_DIM, index_path=FAISS_INDEX_PATH)
            if os.path.exists(FAISS_INDEX_PATH):
                # Extend the existing index rather than starting from scratch.
                store.load()
            store.add_documents(chunks, embeddings, metadatas)
            store.save()
        except Exception as e:
            return f"Failed to extract: {repr(e)}", "", ""

        qa_result = answer_question(
            question=question,
            embed_model="all-MiniLM-L6-v2",
            store_type="faiss",
            store_kwargs={"dim": EMBED_DIM, "index_path": FAISS_INDEX_PATH},
            llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            top_k=5,
        )
        answer = qa_result["answer"]
        matched_chunks = qa_result.get("chunks", [])
        context = (
            "\n\n---\n\n".join(c["text"] for c in matched_chunks)
            if matched_chunks
            else "No supporting context found."
        )
        return f"Preview (first 500 chars):\n{text[:500]}", answer, context
    except Exception as e:
        # Top-level UI boundary: surface the error to the user, never crash.
        return f"Error: {e}", "Error", "Error"
# Gradio UI wiring: one file upload and one question in; three text panels out
# (extracted-text preview, the model's answer, and the retrieved context).
_qa_inputs = [
    gr.File(label="Upload PDF, DOCX, TXT, or HTML"),
    gr.Textbox(label="Question"),
]
_qa_outputs = [
    gr.Textbox(label="Extracted/Text Preview", lines=10, show_copy_button=True),
    gr.Textbox(label="Answer", lines=6, show_copy_button=True),
    gr.Textbox(label="Matched Context", lines=12, show_copy_button=True),
]
iface = gr.Interface(
    fn=process_and_qa,
    inputs=_qa_inputs,
    outputs=_qa_outputs,
    title="Book/Document QA",
    description="Upload your document, ask a question, and see the answer with cited context!",
)
# Entry point: bind to all interfaces on port 7860 (the conventional
# Gradio / Hugging Face Spaces port) when run as a script.
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)