samithcs committed on
Commit
d97666e
·
verified ·
1 Parent(s): 63105da

app folder added

Browse files
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (148 Bytes). View file
 
app/__pycache__/gradio_app.cpython-313.pyc ADDED
Binary file (5.27 kB). View file
 
app/__pycache__/logging.cpython-313.pyc ADDED
Binary file (147 Bytes). View file
 
app/__pycache__/main.cpython-313.pyc ADDED
Binary file (322 Bytes). View file
 
app/api/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
# app/api/__init__.py — package init that also builds a FastAPI app and
# mounts the shared router.
# NOTE(review): app/main.py constructs its own FastAPI app with this same
# router; presumably only one of the two is the real ASGI entry point —
# confirm which one the server is pointed at.
from fastapi import FastAPI
from app.api.routes import router

app = FastAPI()
app.include_router(router)
app/api/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (152 Bytes). View file
 
app/api/__pycache__/routes.cpython-313.pyc ADDED
Binary file (1.94 kB). View file
 
app/api/routes.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""API routes for document upload, question answering, and feedback."""
# BUGFIX: the original `from app.logger import logging` cannot work —
# app/logger.py is an empty module (added with no content in this commit),
# so importing the name `logging` from it raises ImportError at startup.
# The stdlib logging module provides the same `logging.info(...)` calls the
# handlers below rely on.
import logging
from pathlib import Path

from fastapi import APIRouter, File, Request, UploadFile

from pipeline.ingest.pdf_parser import PDFParser
from pipeline.ingest.docx_parser import DOCXParser
from pipeline.ingest.txt_parser import TXTParser
from pipeline.ingest.html_parser import HTMLParser
from pipeline.rag.retrieval_engine import answer_question


# Shared router mounted by the application (see app/main.py).
router = APIRouter()
13
+
14
@router.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Persist an uploaded document under data/raw/ and return a preview.

    Returns a dict with the original filename, the first 500 extracted
    characters, and parser metadata; or {"error": ...} for unsupported types.
    """
    save_dir = Path("data/raw/")
    save_dir.mkdir(parents=True, exist_ok=True)

    # SECURITY FIX: the original joined the client-supplied filename directly
    # into save_dir, so a name like "../../etc/cron.d/x" escaped the upload
    # directory. Keep only the basename; also tolerate a missing filename.
    safe_name = Path(file.filename or "uploaded_file").name
    ext = Path(safe_name).suffix.lower()
    file_path = save_dir / safe_name

    with open(file_path, "wb") as f:
        f.write(await file.read())

    # Extension -> parser dispatch table (replaces the if/elif chain).
    parser_classes = {
        ".pdf": PDFParser,
        ".docx": DOCXParser,
        ".txt": TXTParser,
        ".html": HTMLParser,
        ".htm": HTMLParser,
    }
    parser_cls = parser_classes.get(ext)
    if parser_cls is None:
        return {"error": "Unsupported file type!"}
    parser = parser_cls()

    text, metadata = parser.extract_text_and_metadata(str(file_path))
    return {"filename": file.filename, "preview": text[:500], "metadata": metadata}
37
+
38
+
39
@router.post("/ask")
async def ask_question(request: Request):
    """Answer a question against the indexed documents via the RAG pipeline.

    Expects a JSON body like {"question": "..."}; returns the answer plus
    the retrieved chunks and the concatenated context, or {"error": ...}.
    """
    data = await request.json()
    # ROBUSTNESS: a non-object JSON payload (list, string, number) has no
    # .get() and previously surfaced as an unhandled 500; treat it the same
    # as a missing question.
    if not isinstance(data, dict):
        return {"error": "No question provided."}
    question = data.get("question")
    if not question:
        return {"error": "No question provided."}
    # NOTE(review): gradio_app.py passes index_path="data/faiss.index" in
    # store_kwargs but this endpoint does not — confirm both sides open the
    # same FAISS index, otherwise /ask may query an empty default index.
    answer_pack = answer_question(
        question=question,
        embed_model="all-MiniLM-L6-v2",
        store_type="faiss",
        store_kwargs={"dim": 384},
        llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        top_k=3,
    )
    # Lazy %-style args skip formatting when this log level is filtered out.
    logging.info("Question answered: %r", question)
    return {
        "answer": answer_pack["answer"],
        "chunks": answer_pack["chunks"],
        "context": answer_pack["context"],
    }
60
+
61
@router.post("/feedback")
async def feedback(request: Request):
    """Append one feedback row (question, answer, rating) to feedback.csv."""
    # Local import keeps this fix self-contained in the handler.
    import csv

    data = await request.json()
    row = [
        data.get("question", ""),
        data.get("answer", ""),
        data.get("rating", ""),
    ]
    # BUGFIX: the original wrote a raw f-string, so any comma or newline in
    # a field corrupted the CSV. csv.writer quotes such fields correctly;
    # newline="" is required by the csv module on text files.
    with open("feedback.csv", "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(row)
    logging.info("Feedback received for: %r", data.get("question", ""))
    return {"success": True}
app/api/schemas.py ADDED
File without changes
app/app.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Entry point for the Gradio UI.
# ROBUSTNESS: the bare `from gradio_app import iface` only resolves when the
# process is started from inside the app/ directory. Try the
# package-qualified import used elsewhere in this project first, then fall
# back to the original form so existing launch commands keep working.
try:
    from app.gradio_app import iface
except ImportError:
    from gradio_app import iface

if __name__ == "__main__":
    iface.launch()
app/gradio_app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pathlib import Path
3
+ import os
4
+ import re
5
+
6
+ from pipeline.ingest.pdf_parser import PDFParser
7
+ from pipeline.ingest.docx_parser import DOCXParser
8
+ from pipeline.ingest.txt_parser import TXTParser
9
+ from pipeline.ingest.html_parser import HTMLParser
10
+ from pipeline.chunking.fixed_chunker import FixedChunker
11
+ from pipeline.embeddings.sentence_transformer_embed import embed_chunks
12
+ from pipeline.vector_store.faiss_store import FaissStore
13
+ from pipeline.rag.retrieval_engine import answer_question
14
+
15
# Location of the persisted FAISS index shared by ingestion and QA below.
FAISS_INDEX_PATH = "data/faiss.index"
# Vector dimensionality used for the FAISS store; matches the embeddings
# produced by the "all-MiniLM-L6-v2" model used in process_and_qa.
EMBED_DIM = 384
17
+
18
def sanitize_filename(filename):
    """Replace every character outside [A-Za-z0-9_.-] with an underscore."""
    unsafe_chars = re.compile(r'[^a-zA-Z0-9_.-]')
    return unsafe_chars.sub('_', filename)
20
+
21
def process_and_qa(file, question):
    """Gradio callback: save the upload, index it, and answer the question.

    Returns a 3-tuple of strings (text preview, answer, matched context);
    on failure each slot carries an error message instead.
    """
    try:
        save_dir = Path("data/raw/")
        save_dir.mkdir(parents=True, exist_ok=True)
        # Gradio may hand over several different file-object shapes; the
        # branches below probe them in order of likelihood.
        filename = sanitize_filename(getattr(file, "name", "uploaded_file"))
        file_path = save_dir / Path(filename).name

        content = None
        if hasattr(file, "read"):
            content = file.read()
        elif hasattr(file, "data"):
            content = file.data
        elif isinstance(file, bytes):
            content = file
        elif isinstance(file, str) and os.path.exists(file):
            # Already a path on disk: parse it in place, nothing to write.
            content = None
            file_path = file
        else:
            return "Invalid file object format!", "Error", "Error"

        # NOTE(review): truthiness check means an empty (0-byte) upload is
        # never written, so the parser below would read a nonexistent path —
        # confirm whether `content is not None` was intended.
        if content:
            with open(file_path, "wb") as f:
                f.write(content)

        ext = Path(filename).suffix.lower()
        if ext == ".pdf":
            parser = PDFParser()
        elif ext == ".docx":
            parser = DOCXParser()
        elif ext == ".txt":
            parser = TXTParser()
        elif ext in [".html", ".htm"]:
            parser = HTMLParser()
        else:
            return "Unsupported filetype.", "", ""


        # Ingestion stage: extract text, chunk, embed, and update the index.
        try:
            text, metadata = parser.extract_text_and_metadata(str(file_path))
            chunks = FixedChunker().chunk(text, chunk_size=512, overlap=64)
            #print(f"Chunks parsed: {len(chunks)}")
            embeddings = embed_chunks(chunks, model_name="all-MiniLM-L6-v2")
            #print(f"Embeddings computed: {len(embeddings)}")
            metadatas = [{} for _ in chunks]
            store = FaissStore(dim=EMBED_DIM, index_path=FAISS_INDEX_PATH)
            # Load the persisted index first so new chunks are appended
            # rather than replacing prior uploads.
            if os.path.exists(FAISS_INDEX_PATH):
                store.load()
            store.add_documents(chunks, embeddings, metadatas)
            store.save()
            #print("Index updated.")
        except Exception as e:
            return f"Failed to extract: {repr(e)}", "", ""

        # Retrieval/QA stage against the index just updated above.
        qa_result = answer_question(
            question=question,
            embed_model="all-MiniLM-L6-v2",
            store_type="faiss",
            store_kwargs={"dim": EMBED_DIM, "index_path": FAISS_INDEX_PATH},
            llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            top_k=5,
        )
        answer = qa_result["answer"]
        matched_chunks = qa_result.get("chunks", [])
        #print("QA chunks:", matched_chunks)
        # presumably each chunk is a dict with a "text" key; verify against
        # the retrieval_engine's return shape.
        context = "\n\n---\n\n".join([c["text"] for c in matched_chunks]) if matched_chunks else "No supporting context found."
        return f"Preview (first 500 chars):\n{text[:500]}", answer, context

    except Exception as e:
        # Broad catch keeps the UI responsive; the error text is surfaced
        # in all three output boxes.
        # print("GRADIO ERROR:", str(e))
        return f"Error: {e}", "Error", "Error"
91
+
92
# Single-page Gradio UI: a file upload plus a question in, and three text
# panes out (document preview, answer, matched context).
iface = gr.Interface(
    fn=process_and_qa,
    inputs=[
        gr.File(label="Upload PDF, DOCX, TXT, or HTML"),
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Extracted/Text Preview", lines=10, show_copy_button=True),
        gr.Textbox(label="Answer", lines=6, show_copy_button=True),
        gr.Textbox(label="Matched Context", lines=12, show_copy_button=True)
    ],
    title="Book/Document QA",
    description="Upload your document, ask a question, and see the answer with cited context!"
)

if __name__ == "__main__":
    # Bind to all interfaces on 7860 — the conventional Gradio/HF Spaces port.
    iface.launch(server_name="0.0.0.0", server_port=7860)
app/logger.py ADDED
File without changes
app/main.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from app.api.routes import router

# Top-level FastAPI application: serves the /upload, /ask, and /feedback
# routes from app.api.routes, with interactive docs at /docs.
app = FastAPI(
    title="RAG Book QA System API",
    docs_url="/docs"
)
app.include_router(router)
9
+
10
@app.get("/health")
def health_check():
    """Liveness probe: report that the API process is up."""
    status_payload = {"status": "ok"}
    return status_payload