Spaces:
Sleeping
Sleeping
app folder added
Browse files- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-313.pyc +0 -0
- app/__pycache__/gradio_app.cpython-313.pyc +0 -0
- app/__pycache__/logging.cpython-313.pyc +0 -0
- app/__pycache__/main.cpython-313.pyc +0 -0
- app/api/__init__.py +5 -0
- app/api/__pycache__/__init__.cpython-313.pyc +0 -0
- app/api/__pycache__/routes.cpython-313.pyc +0 -0
- app/api/routes.py +67 -0
- app/api/schemas.py +0 -0
- app/app.py +4 -0
- app/gradio_app.py +108 -0
- app/logger.py +0 -0
- app/main.py +12 -0
app/__init__.py
ADDED
|
File without changes
|
app/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (148 Bytes). View file
|
|
|
app/__pycache__/gradio_app.cpython-313.pyc
ADDED
|
Binary file (5.27 kB). View file
|
|
|
app/__pycache__/logging.cpython-313.pyc
ADDED
|
Binary file (147 Bytes). View file
|
|
|
app/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (322 Bytes). View file
|
|
|
app/api/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Package-level FastAPI application for app.api: builds an app instance and
# mounts the shared router so `app.api:app` can be served directly.
# NOTE(review): app/main.py constructs a second FastAPI instance with the same
# router -- confirm which of the two entrypoints is the canonical one.
from fastapi import FastAPI
from app.api.routes import router

app = FastAPI()
app.include_router(router)
app/api/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (152 Bytes). View file
|
|
|
app/api/__pycache__/routes.cpython-313.pyc
ADDED
|
Binary file (1.94 kB). View file
|
|
|
app/api/routes.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, File, UploadFile
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from pipeline.ingest.pdf_parser import PDFParser
|
| 4 |
+
from pipeline.ingest.docx_parser import DOCXParser
|
| 5 |
+
from pipeline.ingest.txt_parser import TXTParser
|
| 6 |
+
from pipeline.ingest.html_parser import HTMLParser
|
| 7 |
+
from fastapi import Request
|
| 8 |
+
from pipeline.rag.retrieval_engine import answer_question
|
| 9 |
+
from app.logger import logging
|
| 10 |
+
|
| 11 |
+
|
router = APIRouter()

# Parser class per supported extension; keeps the route free of if/elif chains
# and gives one place to register new formats.
_PARSERS = {
    ".pdf": PDFParser,
    ".docx": DOCXParser,
    ".txt": TXTParser,
    ".html": HTMLParser,
    ".htm": HTMLParser,
}


@router.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Accept a document upload, persist it under data/raw/, and return a preview.

    Returns a dict with the original filename, the first 500 characters of the
    extracted text, and parser metadata; for an unsupported extension, a dict
    with an "error" key (HTTP status stays 200 -- existing client contract).
    """
    save_dir = Path("data/raw/")
    save_dir.mkdir(parents=True, exist_ok=True)

    # SECURITY FIX: the client controls `file.filename`. Keep only the basename
    # so a crafted name like "../../etc/cron.d/x" cannot escape save_dir.
    safe_name = Path(file.filename or "uploaded_file").name
    ext = Path(safe_name).suffix.lower()
    file_path = save_dir / safe_name

    with open(file_path, "wb") as f:
        f.write(await file.read())

    parser_cls = _PARSERS.get(ext)
    if parser_cls is None:
        return {"error": "Unsupported file type!"}

    text, metadata = parser_cls().extract_text_and_metadata(str(file_path))
    return {"filename": file.filename, "preview": text[:500], "metadata": metadata}
| 37 |
+
|
| 38 |
+
|
@router.post("/ask")
async def ask_question(request: Request):
    """Answer a question against the indexed corpus via the RAG pipeline.

    Expects a JSON body carrying a "question" field; responds with the
    generated answer plus the retrieved chunks and assembled context.
    """
    payload = await request.json()
    question = payload.get("question")
    if not question:
        return {"error": "No question provided."}

    # Retrieval parameters are fixed here; tune them as the pipeline evolves.
    result = answer_question(
        question=question,
        embed_model="all-MiniLM-L6-v2",
        store_type="faiss",
        store_kwargs={"dim": 384},
        llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        top_k=3,
    )
    logging.info(f"Question answered: '{question}'")
    return {
        "answer": result["answer"],
        "chunks": result["chunks"],
        "context": result["context"],
    }
| 60 |
+
|
@router.post("/feedback")
async def feedback(request: Request):
    """Append one feedback record (question, answer, rating) to feedback.csv.

    BUG FIX: the previous raw f-string write corrupted the file whenever a
    question or answer contained a comma or newline; the csv module quotes
    such fields correctly.
    """
    import csv

    data = await request.json()
    row = [data.get("question", ""), data.get("answer", ""), data.get("rating", "")]
    # newline="" is required by the csv docs; append mode keeps earlier rows.
    with open("feedback.csv", "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(row)
    logging.info(f"Feedback received for: '{data.get('question','')}'")
    return {"success": True}
app/api/schemas.py
ADDED
|
File without changes
|
app/app.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Thin launcher for the Gradio UI defined in gradio_app.py.
# NOTE(review): this imports `gradio_app` as a top-level module, which only
# resolves when run from inside the app/ directory; from the project root the
# import would be `app.gradio_app` -- confirm the intended working directory.
from gradio_app import iface

if __name__ == "__main__":
    iface.launch()
app/gradio_app.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
from pipeline.ingest.pdf_parser import PDFParser
|
| 7 |
+
from pipeline.ingest.docx_parser import DOCXParser
|
| 8 |
+
from pipeline.ingest.txt_parser import TXTParser
|
| 9 |
+
from pipeline.ingest.html_parser import HTMLParser
|
| 10 |
+
from pipeline.chunking.fixed_chunker import FixedChunker
|
| 11 |
+
from pipeline.embeddings.sentence_transformer_embed import embed_chunks
|
| 12 |
+
from pipeline.vector_store.faiss_store import FaissStore
|
| 13 |
+
from pipeline.rag.retrieval_engine import answer_question
|
| 14 |
+
|
# Location where the FAISS index is persisted between uploads.
FAISS_INDEX_PATH = "data/faiss.index"
# Embedding dimensionality passed to FaissStore; must match the output size of
# the "all-MiniLM-L6-v2" sentence-transformer used throughout this module.
EMBED_DIM = 384
| 17 |
+
|
# Anything outside this allow-list is replaced in stored filenames.
_UNSAFE_CHARS = re.compile(r"[^a-zA-Z0-9_.-]")


def sanitize_filename(filename):
    """Replace every character outside [a-zA-Z0-9_.-] with an underscore."""
    return _UNSAFE_CHARS.sub("_", filename)
| 20 |
+
|
def _select_parser(ext):
    """Return a parser instance for the given lowercase extension, or None."""
    parsers = {
        ".pdf": PDFParser,
        ".docx": DOCXParser,
        ".txt": TXTParser,
        ".html": HTMLParser,
        ".htm": HTMLParser,
    }
    cls = parsers.get(ext)
    return cls() if cls is not None else None


def process_and_qa(file, question):
    """Ingest an uploaded document, index it, and answer `question` against it.

    Returns a 3-tuple (preview/status text, answer, matched context) feeding
    the three Gradio output boxes; on failure the tuple carries error strings
    so the UI shows the problem instead of crashing.
    """
    try:
        save_dir = Path("data/raw/")
        save_dir.mkdir(parents=True, exist_ok=True)

        if isinstance(file, str) and os.path.exists(file):
            # Gradio handed us a path on disk: parse it in place, no copy.
            content = None
            file_path = file
            # BUG FIX: derive the extension from the actual path. Previously
            # ext came from the "uploaded_file" fallback name (str has no
            # .name attribute), which has no suffix, so every path-style
            # upload was reported as "Unsupported filetype."
            ext = Path(file).suffix.lower()
        else:
            # File-like / bytes input: normalize to raw bytes we can persist.
            if hasattr(file, "read"):
                content = file.read()
            elif hasattr(file, "data"):
                content = file.data
            elif isinstance(file, bytes):
                content = file
            else:
                return "Invalid file object format!", "Error", "Error"
            # Keep only the basename before sanitizing so a full temp-file
            # path does not collapse into one long underscore-riddled name.
            raw_name = getattr(file, "name", "uploaded_file")
            filename = sanitize_filename(Path(raw_name).name)
            file_path = save_dir / filename
            ext = Path(filename).suffix.lower()

        if content:
            with open(file_path, "wb") as f:
                f.write(content)

        parser = _select_parser(ext)
        if parser is None:
            return "Unsupported filetype.", "", ""

        try:
            text, _metadata = parser.extract_text_and_metadata(str(file_path))
            chunks = FixedChunker().chunk(text, chunk_size=512, overlap=64)
            embeddings = embed_chunks(chunks, model_name="all-MiniLM-L6-v2")
            metadatas = [{} for _ in chunks]
            store = FaissStore(dim=EMBED_DIM, index_path=FAISS_INDEX_PATH)
            if os.path.exists(FAISS_INDEX_PATH):
                # Extend the persisted index instead of rebuilding from scratch.
                store.load()
            store.add_documents(chunks, embeddings, metadatas)
            store.save()
        except Exception as e:
            return f"Failed to extract: {repr(e)}", "", ""

        qa_result = answer_question(
            question=question,
            embed_model="all-MiniLM-L6-v2",
            store_type="faiss",
            store_kwargs={"dim": EMBED_DIM, "index_path": FAISS_INDEX_PATH},
            llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            top_k=5,
        )
        answer = qa_result["answer"]
        matched_chunks = qa_result.get("chunks", [])
        if matched_chunks:
            context = "\n\n---\n\n".join(c["text"] for c in matched_chunks)
        else:
            context = "No supporting context found."
        return f"Preview (first 500 chars):\n{text[:500]}", answer, context

    except Exception as e:
        # Surface any unexpected failure in the UI rather than crashing Gradio.
        return f"Error: {e}", "Error", "Error"
| 91 |
+
|
# Single-page Gradio UI: a file upload plus a question in; document preview,
# generated answer, and the retrieved supporting context out.
iface = gr.Interface(
    fn=process_and_qa,
    inputs=[
        gr.File(label="Upload PDF, DOCX, TXT, or HTML"),
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Extracted/Text Preview", lines=10, show_copy_button=True),
        gr.Textbox(label="Answer", lines=6, show_copy_button=True),
        gr.Textbox(label="Matched Context", lines=12, show_copy_button=True)
    ],
    title="Book/Document QA",
    description="Upload your document, ask a question, and see the answer with cited context!"
)

# Bind to all interfaces on port 7860 so the app is reachable from outside a
# container (the standard setup for a hosted Space).
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)
app/logger.py
ADDED
|
File without changes
|
app/main.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FastAPI entrypoint: exposes the API routes plus a liveness probe.
# NOTE(review): app/api/__init__.py builds another FastAPI instance with the
# same router -- confirm which entrypoint deployments should serve.
from fastapi import FastAPI
from app.api.routes import router

app = FastAPI(
    title="RAG Book QA System API",
    docs_url="/docs"
)
app.include_router(router)

@app.get("/health")
def health_check():
    # Simple liveness/readiness probe for deployment health checks.
    return {"status": "ok"}