Spaces:
Sleeping
Sleeping
app folder added
Browse files- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-313.pyc +0 -0
- app/__pycache__/gradio_app.cpython-313.pyc +0 -0
- app/__pycache__/logging.cpython-313.pyc +0 -0
- app/__pycache__/main.cpython-313.pyc +0 -0
- app/api/__init__.py +5 -0
- app/api/__pycache__/__init__.cpython-313.pyc +0 -0
- app/api/__pycache__/routes.cpython-313.pyc +0 -0
- app/api/routes.py +67 -0
- app/api/schemas.py +0 -0
- app/app.py +4 -0
- app/gradio_app.py +108 -0
- app/logger.py +0 -0
- app/main.py +12 -0
app/__init__.py
ADDED
|
File without changes
|
app/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (148 Bytes). View file
|
|
|
app/__pycache__/gradio_app.cpython-313.pyc
ADDED
|
Binary file (5.27 kB). View file
|
|
|
app/__pycache__/logging.cpython-313.pyc
ADDED
|
Binary file (147 Bytes). View file
|
|
|
app/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (322 Bytes). View file
|
|
|
app/api/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Package-level FastAPI application for app.api: builds an app instance and
# mounts the shared router so `app.api:app` can be served directly.
# NOTE(review): app/main.py constructs a second FastAPI instance with the same
# router -- confirm which of the two entrypoints is the canonical one.
from fastapi import FastAPI
from app.api.routes import router

app = FastAPI()
app.include_router(router)
app/api/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (152 Bytes). View file
|
|
|
app/api/__pycache__/routes.cpython-313.pyc
ADDED
|
Binary file (1.94 kB). View file
|
|
|
app/api/routes.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, File, UploadFile
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from pipeline.ingest.pdf_parser import PDFParser
|
| 4 |
+
from pipeline.ingest.docx_parser import DOCXParser
|
| 5 |
+
from pipeline.ingest.txt_parser import TXTParser
|
| 6 |
+
from pipeline.ingest.html_parser import HTMLParser
|
| 7 |
+
from fastapi import Request
|
| 8 |
+
from pipeline.rag.retrieval_engine import answer_question
|
| 9 |
+
from app.logger import logging
|
| 10 |
+
|
| 11 |
+
|
router = APIRouter()

# Parser class per supported extension; keeps the route free of if/elif chains
# and gives one place to register new formats.
_PARSERS = {
    ".pdf": PDFParser,
    ".docx": DOCXParser,
    ".txt": TXTParser,
    ".html": HTMLParser,
    ".htm": HTMLParser,
}


@router.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Accept a document upload, persist it under data/raw/, and return a preview.

    Returns a dict with the original filename, the first 500 characters of the
    extracted text, and parser metadata; for an unsupported extension, a dict
    with an "error" key (HTTP status stays 200 -- existing client contract).
    """
    save_dir = Path("data/raw/")
    save_dir.mkdir(parents=True, exist_ok=True)

    # SECURITY FIX: the client controls `file.filename`. Keep only the basename
    # so a crafted name like "../../etc/cron.d/x" cannot escape save_dir.
    safe_name = Path(file.filename or "uploaded_file").name
    ext = Path(safe_name).suffix.lower()
    file_path = save_dir / safe_name

    with open(file_path, "wb") as f:
        f.write(await file.read())

    parser_cls = _PARSERS.get(ext)
    if parser_cls is None:
        return {"error": "Unsupported file type!"}

    text, metadata = parser_cls().extract_text_and_metadata(str(file_path))
    return {"filename": file.filename, "preview": text[:500], "metadata": metadata}
| 37 |
+
|
| 38 |
+
|
@router.post("/ask")
async def ask_question(request: Request):
    """Answer a question against the indexed corpus via the RAG pipeline.

    Expects a JSON body carrying a "question" field; responds with the
    generated answer plus the retrieved chunks and assembled context.
    """
    payload = await request.json()
    question = payload.get("question")
    if not question:
        return {"error": "No question provided."}

    # Retrieval parameters are fixed here; tune them as the pipeline evolves.
    result = answer_question(
        question=question,
        embed_model="all-MiniLM-L6-v2",
        store_type="faiss",
        store_kwargs={"dim": 384},
        llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        top_k=3,
    )
    logging.info(f"Question answered: '{question}'")
    return {
        "answer": result["answer"],
        "chunks": result["chunks"],
        "context": result["context"],
    }
| 60 |
+
|
@router.post("/feedback")
async def feedback(request: Request):
    """Append one feedback record (question, answer, rating) to feedback.csv.

    BUG FIX: the previous raw f-string write corrupted the file whenever a
    question or answer contained a comma or newline; the csv module quotes
    such fields correctly.
    """
    import csv

    data = await request.json()
    row = [data.get("question", ""), data.get("answer", ""), data.get("rating", "")]
    # newline="" is required by the csv docs; append mode keeps earlier rows.
    with open("feedback.csv", "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(row)
    logging.info(f"Feedback received for: '{data.get('question','')}'")
    return {"success": True}
app/api/schemas.py
ADDED
|
File without changes
|
app/app.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Thin launcher for the Gradio UI defined in gradio_app.py.
# NOTE(review): this imports `gradio_app` as a top-level module, which only
# resolves when run from inside the app/ directory; from the project root the
# import would be `app.gradio_app` -- confirm the intended working directory.
from gradio_app import iface

if __name__ == "__main__":
    iface.launch()
app/gradio_app.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
from pipeline.ingest.pdf_parser import PDFParser
|
| 7 |
+
from pipeline.ingest.docx_parser import DOCXParser
|
| 8 |
+
from pipeline.ingest.txt_parser import TXTParser
|
| 9 |
+
from pipeline.ingest.html_parser import HTMLParser
|
| 10 |
+
from pipeline.chunking.fixed_chunker import FixedChunker
|
| 11 |
+
from pipeline.embeddings.sentence_transformer_embed import embed_chunks
|
| 12 |
+
from pipeline.vector_store.faiss_store import FaissStore
|
| 13 |
+
from pipeline.rag.retrieval_engine import answer_question
|
| 14 |
+
|
# Location where the FAISS index is persisted between uploads.
FAISS_INDEX_PATH = "data/faiss.index"
# Embedding dimensionality passed to FaissStore; must match the output size of
# the "all-MiniLM-L6-v2" sentence-transformer used throughout this module.
EMBED_DIM = 384
| 17 |
+
|
# Anything outside this allow-list is replaced in stored filenames.
_UNSAFE_CHARS = re.compile(r"[^a-zA-Z0-9_.-]")


def sanitize_filename(filename):
    """Replace every character outside [a-zA-Z0-9_.-] with an underscore."""
    return _UNSAFE_CHARS.sub("_", filename)
| 20 |
+
|
def _select_parser(ext):
    """Return a parser instance for the given lowercase extension, or None."""
    parsers = {
        ".pdf": PDFParser,
        ".docx": DOCXParser,
        ".txt": TXTParser,
        ".html": HTMLParser,
        ".htm": HTMLParser,
    }
    cls = parsers.get(ext)
    return cls() if cls is not None else None


def process_and_qa(file, question):
    """Ingest an uploaded document, index it, and answer `question` against it.

    Returns a 3-tuple (preview/status text, answer, matched context) feeding
    the three Gradio output boxes; on failure the tuple carries error strings
    so the UI shows the problem instead of crashing.
    """
    try:
        save_dir = Path("data/raw/")
        save_dir.mkdir(parents=True, exist_ok=True)

        if isinstance(file, str) and os.path.exists(file):
            # Gradio handed us a path on disk: parse it in place, no copy.
            content = None
            file_path = file
            # BUG FIX: derive the extension from the actual path. Previously
            # ext came from the "uploaded_file" fallback name (str has no
            # .name attribute), which has no suffix, so every path-style
            # upload was reported as "Unsupported filetype."
            ext = Path(file).suffix.lower()
        else:
            # File-like / bytes input: normalize to raw bytes we can persist.
            if hasattr(file, "read"):
                content = file.read()
            elif hasattr(file, "data"):
                content = file.data
            elif isinstance(file, bytes):
                content = file
            else:
                return "Invalid file object format!", "Error", "Error"
            # Keep only the basename before sanitizing so a full temp-file
            # path does not collapse into one long underscore-riddled name.
            raw_name = getattr(file, "name", "uploaded_file")
            filename = sanitize_filename(Path(raw_name).name)
            file_path = save_dir / filename
            ext = Path(filename).suffix.lower()

        if content:
            with open(file_path, "wb") as f:
                f.write(content)

        parser = _select_parser(ext)
        if parser is None:
            return "Unsupported filetype.", "", ""

        try:
            text, _metadata = parser.extract_text_and_metadata(str(file_path))
            chunks = FixedChunker().chunk(text, chunk_size=512, overlap=64)
            embeddings = embed_chunks(chunks, model_name="all-MiniLM-L6-v2")
            metadatas = [{} for _ in chunks]
            store = FaissStore(dim=EMBED_DIM, index_path=FAISS_INDEX_PATH)
            if os.path.exists(FAISS_INDEX_PATH):
                # Extend the persisted index instead of rebuilding from scratch.
                store.load()
            store.add_documents(chunks, embeddings, metadatas)
            store.save()
        except Exception as e:
            return f"Failed to extract: {repr(e)}", "", ""

        qa_result = answer_question(
            question=question,
            embed_model="all-MiniLM-L6-v2",
            store_type="faiss",
            store_kwargs={"dim": EMBED_DIM, "index_path": FAISS_INDEX_PATH},
            llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            top_k=5,
        )
        answer = qa_result["answer"]
        matched_chunks = qa_result.get("chunks", [])
        if matched_chunks:
            context = "\n\n---\n\n".join(c["text"] for c in matched_chunks)
        else:
            context = "No supporting context found."
        return f"Preview (first 500 chars):\n{text[:500]}", answer, context

    except Exception as e:
        # Surface any unexpected failure in the UI rather than crashing Gradio.
        return f"Error: {e}", "Error", "Error"
| 91 |
+
|
# Single-page Gradio UI: a file upload plus a question in; document preview,
# generated answer, and the retrieved supporting context out.
iface = gr.Interface(
    fn=process_and_qa,
    inputs=[
        gr.File(label="Upload PDF, DOCX, TXT, or HTML"),
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Extracted/Text Preview", lines=10, show_copy_button=True),
        gr.Textbox(label="Answer", lines=6, show_copy_button=True),
        gr.Textbox(label="Matched Context", lines=12, show_copy_button=True)
    ],
    title="Book/Document QA",
    description="Upload your document, ask a question, and see the answer with cited context!"
)

# Bind to all interfaces on port 7860 so the app is reachable from outside a
# container (the standard setup for a hosted Space).
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)
app/logger.py
ADDED
|
File without changes
|
app/main.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FastAPI entrypoint: exposes the API routes plus a liveness probe.
# NOTE(review): app/api/__init__.py builds another FastAPI instance with the
# same router -- confirm which entrypoint deployments should serve.
from fastapi import FastAPI
from app.api.routes import router

app = FastAPI(
    title="RAG Book QA System API",
    docs_url="/docs"
)
app.include_router(router)

@app.get("/health")
def health_check():
    # Simple liveness/readiness probe for deployment health checks.
    return {"status": "ok"}