Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from pathlib import Path | |
| import os | |
| import re | |
| from pipeline.ingest.pdf_parser import PDFParser | |
| from pipeline.ingest.docx_parser import DOCXParser | |
| from pipeline.ingest.txt_parser import TXTParser | |
| from pipeline.ingest.html_parser import HTMLParser | |
| from pipeline.chunking.fixed_chunker import FixedChunker | |
| from pipeline.embeddings.sentence_transformer_embed import embed_chunks | |
| from pipeline.vector_store.faiss_store import FaissStore | |
| from pipeline.rag.retrieval_engine import answer_question | |
# On-disk location of the persistent FAISS index (reloaded across uploads).
FAISS_INDEX_PATH = "data/faiss.index"
# Embedding vector dimension — must match the output size of the
# "all-MiniLM-L6-v2" model used by embed_chunks/answer_question below.
EMBED_DIM = 384
def sanitize_filename(filename):
    """Return *filename* with every character outside [A-Za-z0-9_.-] replaced by '_'.

    Keeps dots and dashes so file extensions survive sanitization.
    """
    unsafe = re.compile(r'[^a-zA-Z0-9_.-]')
    return unsafe.sub('_', filename)
def process_and_qa(file, question):
    """Ingest one uploaded document, add it to the FAISS index, and answer *question*.

    Accepts the several shapes Gradio may hand us: a file-like object with
    ``.read()``, an object with raw ``.data`` bytes, plain ``bytes``, or a
    path string to an already-saved temp file (Gradio >= 4 behavior).

    Returns a 3-tuple of strings: (text preview, answer, supporting context).
    On failure, returns (error message, "Error", "Error").
    """
    try:
        save_dir = Path("data/raw/")
        save_dir.mkdir(parents=True, exist_ok=True)

        content = None
        if hasattr(file, "read"):
            content = file.read()
            # Take the basename BEFORE sanitizing, otherwise path separators
            # are turned into underscores and the whole path leaks into the name.
            filename = sanitize_filename(Path(getattr(file, "name", "uploaded_file")).name)
            file_path = save_dir / filename
        elif hasattr(file, "data"):
            content = file.data
            filename = sanitize_filename(Path(getattr(file, "name", "uploaded_file")).name)
            file_path = save_dir / filename
        elif isinstance(file, bytes):
            # Raw bytes carry no name; extension check below will reject them
            # unless the caller provides a named object instead.
            filename = "uploaded_file"
            file_path = save_dir / filename
            content = file
        elif isinstance(file, str) and os.path.exists(file):
            # Already on disk (Gradio temp path): derive the real filename from
            # the path so the extension dispatch below works, and parse in place.
            filename = Path(file).name
            file_path = file
        else:
            return "Invalid file object format!", "Error", "Error"

        # `is not None` (not truthiness): an empty upload (b"") must still be written.
        if content is not None:
            with open(file_path, "wb") as f:
                f.write(content)

        ext = Path(filename).suffix.lower()
        if ext == ".pdf":
            parser = PDFParser()
        elif ext == ".docx":
            parser = DOCXParser()
        elif ext == ".txt":
            parser = TXTParser()
        elif ext in (".html", ".htm"):
            parser = HTMLParser()
        else:
            return "Unsupported filetype.", "Error", "Error"

        try:
            text, metadata = parser.extract_text_and_metadata(str(file_path))
            chunks = FixedChunker().chunk(text, chunk_size=512, overlap=64)
            embeddings = embed_chunks(chunks, model_name="all-MiniLM-L6-v2")
            metadatas = [{} for _ in chunks]
            store = FaissStore(dim=EMBED_DIM, index_path=FAISS_INDEX_PATH)
            # Append to the existing index rather than rebuilding from scratch.
            if os.path.exists(FAISS_INDEX_PATH):
                store.load()
            store.add_documents(chunks, embeddings, metadatas)
            store.save()
        except Exception as e:
            return f"Failed to extract: {repr(e)}", "Error", "Error"

        qa_result = answer_question(
            question=question,
            embed_model="all-MiniLM-L6-v2",
            store_type="faiss",
            store_kwargs={"dim": EMBED_DIM, "index_path": FAISS_INDEX_PATH},
            llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            top_k=5,
        )
        answer = qa_result["answer"]
        matched_chunks = qa_result.get("chunks", [])
        context = "\n\n---\n\n".join([c["text"] for c in matched_chunks]) if matched_chunks else "No supporting context found."
        return f"Preview (first 500 chars):\n{text[:500]}", answer, context
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the error as text.
        return f"Error: {e}", "Error", "Error"
# Gradio UI wiring: one upload + one question in, three text panes out
# (text preview, model answer, retrieved context). Field order here must
# match the parameter order and return tuple of process_and_qa.
iface = gr.Interface(
    fn=process_and_qa,
    inputs=[
        gr.File(label="Upload PDF, DOCX, TXT, or HTML"),
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Extracted/Text Preview", lines=10, show_copy_button=True),
        gr.Textbox(label="Answer", lines=6, show_copy_button=True),
        gr.Textbox(label="Matched Context", lines=12, show_copy_button=True)
    ],
    title="Book/Document QA",
    description="Upload your document, ask a question, and see the answer with cited context!"
)
# Bind to all interfaces on port 7860 — the standard setup for running
# inside a container (e.g. a Hugging Face Space).
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)