"""Gradio app: upload a document, index it into a FAISS store, and answer questions via RAG."""

import os
import re
from pathlib import Path

import gradio as gr

from pipeline.chunking.fixed_chunker import FixedChunker
from pipeline.embeddings.sentence_transformer_embed import embed_chunks
from pipeline.ingest.docx_parser import DOCXParser
from pipeline.ingest.html_parser import HTMLParser
from pipeline.ingest.pdf_parser import PDFParser
from pipeline.ingest.txt_parser import TXTParser
from pipeline.rag.retrieval_engine import answer_question
from pipeline.vector_store.faiss_store import FaissStore

# Path where the FAISS index is persisted between uploads.
FAISS_INDEX_PATH = "data/faiss.index"
# Embedding dimensionality of all-MiniLM-L6-v2.
EMBED_DIM = 384

# Dispatch table: supported file extension -> parser class.
_PARSERS = {
    ".pdf": PDFParser,
    ".docx": DOCXParser,
    ".txt": TXTParser,
    ".html": HTMLParser,
    ".htm": HTMLParser,
}


def sanitize_filename(filename):
    """Replace every character outside [a-zA-Z0-9_.-] with '_' to make a safe filename."""
    return re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)


def process_and_qa(file, question):
    """Ingest an uploaded document into the FAISS index and answer *question* over it.

    Parameters:
        file: whatever Gradio hands over — a file-like object (has .read()),
              an object with .data, raw bytes, or a temp-file path string.
        question: the user's natural-language question.

    Returns:
        A 3-tuple (text preview, answer, matched context). On failure the same
        3-slot shape is returned with error strings so the Gradio outputs line up.
    """
    try:
        save_dir = Path("data/raw/")
        save_dir.mkdir(parents=True, exist_ok=True)

        filename = sanitize_filename(getattr(file, "name", "uploaded_file"))
        file_path = save_dir / Path(filename).name

        # Normalize the various upload shapes into (content bytes, file_path).
        content = None
        if hasattr(file, "read"):
            content = file.read()
        elif hasattr(file, "data"):
            content = file.data
        elif isinstance(file, bytes):
            content = file
        elif isinstance(file, str) and os.path.exists(file):
            # Already on disk (Gradio temp file) — use it in place.
            file_path = Path(file)
        else:
            return "Invalid file object format!", "Error", "Error"

        # `is not None` rather than truthiness: a legitimately empty upload
        # (b"") should still be written to disk.
        if content is not None:
            with open(file_path, "wb") as f:
                f.write(content)

        # BUGFIX: derive the extension from the actual file path. For a
        # path-string upload, `filename` was the "uploaded_file" fallback
        # (str has no .name attribute) and carried no extension, so every
        # such upload was wrongly rejected as an unsupported filetype.
        ext = Path(file_path).suffix.lower()
        parser_cls = _PARSERS.get(ext)
        if parser_cls is None:
            return "Unsupported filetype.", "", ""
        parser = parser_cls()

        try:
            text, metadata = parser.extract_text_and_metadata(str(file_path))
            chunks = FixedChunker().chunk(text, chunk_size=512, overlap=64)
            embeddings = embed_chunks(chunks, model_name="all-MiniLM-L6-v2")
            metadatas = [{} for _ in chunks]

            # Load the existing index (if any) so new documents accumulate.
            store = FaissStore(dim=EMBED_DIM, index_path=FAISS_INDEX_PATH)
            if os.path.exists(FAISS_INDEX_PATH):
                store.load()
            store.add_documents(chunks, embeddings, metadatas)
            store.save()
        except Exception as e:
            return f"Failed to extract: {repr(e)}", "", ""

        qa_result = answer_question(
            question=question,
            embed_model="all-MiniLM-L6-v2",
            store_type="faiss",
            store_kwargs={"dim": EMBED_DIM, "index_path": FAISS_INDEX_PATH},
            llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            top_k=5,
        )
        answer = qa_result["answer"]
        matched_chunks = qa_result.get("chunks", [])
        context = (
            "\n\n---\n\n".join(c["text"] for c in matched_chunks)
            if matched_chunks
            else "No supporting context found."
        )
        return f"Preview (first 500 chars):\n{text[:500]}", answer, context
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the error in
        # all three output slots instead of crashing the UI.
        return f"Error: {e}", "Error", "Error"


iface = gr.Interface(
    fn=process_and_qa,
    inputs=[
        gr.File(label="Upload PDF, DOCX, TXT, or HTML"),
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Extracted/Text Preview", lines=10, show_copy_button=True),
        gr.Textbox(label="Answer", lines=6, show_copy_button=True),
        gr.Textbox(label="Matched Context", lines=12, show_copy_button=True),
    ],
    title="Book/Document QA",
    description="Upload your document, ask a question, and see the answer with cited context!",
)

if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)