File size: 4,084 Bytes
6af3122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import gradio as gr
from pathlib import Path
import os
import re

from pipeline.ingest.pdf_parser import PDFParser
from pipeline.ingest.docx_parser import DOCXParser
from pipeline.ingest.txt_parser import TXTParser
from pipeline.ingest.html_parser import HTMLParser
from pipeline.chunking.fixed_chunker import FixedChunker
from pipeline.embeddings.sentence_transformer_embed import embed_chunks
from pipeline.vector_store.faiss_store import FaissStore
from pipeline.rag.retrieval_engine import answer_question

# On-disk location of the persisted FAISS index (reloaded across uploads).
FAISS_INDEX_PATH = "data/faiss.index"
# Embedding dimensionality — must match the sentence-transformer model used
# below ("all-MiniLM-L6-v2" emits 384-dim vectors).
EMBED_DIM = 384

def sanitize_filename(filename):
    """Return *filename* with every character outside [a-zA-Z0-9_.-] replaced by '_'."""
    unsafe = re.compile(r'[^a-zA-Z0-9_.-]')
    return unsafe.sub('_', filename)

def process_and_qa(file, question):
    """Ingest an uploaded document into the FAISS index and answer *question*.

    Accepts the several shapes Gradio may hand over: a file-like object with
    ``.read()``, an object exposing ``.data``, raw ``bytes``, or a plain path
    string to an already-saved temp file.

    Parameters:
        file: Uploaded document (PDF, DOCX, TXT, or HTML) in any of the
            forms above.
        question: Natural-language question to answer from the document.

    Returns:
        tuple[str, str, str]: (text preview, answer, supporting context).
        On failure, the first element carries the error message and the
        remaining elements are placeholders.
    """
    try:
        save_dir = Path("data/raw/")
        save_dir.mkdir(parents=True, exist_ok=True)

        # Determine the raw filename and payload first; sanitize AFTER taking
        # Path(...).name. Sanitizing the whole path up front (as before) turned
        # '/' into '_', so the directory components could never be stripped and
        # the saved name was the entire mangled temp path.
        if isinstance(file, str) and os.path.exists(file):
            # Gradio handed us a path to a temp file it already wrote.
            # (Previously this branch always failed: a str has no .name
            # attribute, so the extension came out empty and every such
            # upload was rejected as "Unsupported filetype.")
            raw_name = file
            content = None
            file_path = file
        elif hasattr(file, "read"):
            raw_name = getattr(file, "name", "uploaded_file")
            content = file.read()
            file_path = None
        elif hasattr(file, "data"):
            raw_name = getattr(file, "name", "uploaded_file")
            content = file.data
            file_path = None
        elif isinstance(file, bytes):
            raw_name = "uploaded_file"
            content = file
            file_path = None
        else:
            return "Invalid file object format!", "Error", "Error"

        filename = sanitize_filename(Path(raw_name).name)

        # `is not None` (not truthiness): an empty upload (b"") is still a
        # real payload and must produce a file on disk.
        if content is not None:
            file_path = save_dir / filename
            with open(file_path, "wb") as f:
                f.write(content)

        # Dispatch parser by extension.
        parser_by_ext = {
            ".pdf": PDFParser,
            ".docx": DOCXParser,
            ".txt": TXTParser,
            ".html": HTMLParser,
            ".htm": HTMLParser,
        }
        ext = Path(filename).suffix.lower()
        parser_cls = parser_by_ext.get(ext)
        if parser_cls is None:
            return "Unsupported filetype.", "", ""
        parser = parser_cls()

        try:
            # Extract -> chunk -> embed -> upsert into the shared FAISS index.
            text, metadata = parser.extract_text_and_metadata(str(file_path))
            chunks = FixedChunker().chunk(text, chunk_size=512, overlap=64)
            embeddings = embed_chunks(chunks, model_name="all-MiniLM-L6-v2")
            metadatas = [{} for _ in chunks]
            store = FaissStore(dim=EMBED_DIM, index_path=FAISS_INDEX_PATH)
            # Reuse the existing index so earlier uploads stay searchable.
            if os.path.exists(FAISS_INDEX_PATH):
                store.load()
            store.add_documents(chunks, embeddings, metadatas)
            store.save()
        except Exception as e:
            return f"Failed to extract: {repr(e)}", "", ""

        qa_result = answer_question(
            question=question,
            embed_model="all-MiniLM-L6-v2",
            store_type="faiss",
            store_kwargs={"dim": EMBED_DIM, "index_path": FAISS_INDEX_PATH},
            llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            top_k=5,
        )
        answer = qa_result["answer"]
        matched_chunks = qa_result.get("chunks", [])
        context = "\n\n---\n\n".join([c["text"] for c in matched_chunks]) if matched_chunks else "No supporting context found."
        return f"Preview (first 500 chars):\n{text[:500]}", answer, context

    except Exception as e:
        # Top-level UI boundary: surface any unexpected failure to the user
        # rather than crashing the Gradio worker.
        return f"Error: {e}", "Error", "Error"

# Gradio UI: one form that takes a document upload plus a question and shows
# three outputs — the extracted-text preview, the model's answer, and the
# retrieved supporting context.
iface = gr.Interface(
    fn=process_and_qa,
    inputs=[
        gr.File(label="Upload PDF, DOCX, TXT, or HTML"),
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Extracted/Text Preview", lines=10, show_copy_button=True),
        gr.Textbox(label="Answer", lines=6, show_copy_button=True),
        gr.Textbox(label="Matched Context", lines=12, show_copy_button=True)
    ],
    title="Book/Document QA",
    description="Upload your document, ask a question, and see the answer with cited context!"
)

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside a
    # container/VM; port 7860 is the Gradio convention.
    iface.launch(server_name="0.0.0.0", server_port=7860)