Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- Dockerfile +22 -0
- RAG.py +1285 -0
- README.md +48 -11
- __pycache__/RAG.cpython-312.pyc +0 -0
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/rag_system.cpython-312.pyc +0 -0
- app.py +1379 -0
- rag_storage/metadata.pkl +3 -0
- rag_storage/vector_store.faiss +3 -0
- requirements.txt +13 -0
- templates/index.html +1338 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
rag_storage/vector_store.faiss filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y \
|
| 6 |
+
gcc \
|
| 7 |
+
g++ \
|
| 8 |
+
build-essential \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
COPY requirements.txt .
|
| 12 |
+
RUN pip install --upgrade pip
|
| 13 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
+
|
| 15 |
+
COPY . .
|
| 16 |
+
|
| 17 |
+
RUN mkdir -p /app/templates /app/static
|
| 18 |
+
RUN mkdir -p /app/uploads /app/documents
|
| 19 |
+
|
| 20 |
+
EXPOSE 7860
|
| 21 |
+
|
| 22 |
+
CMD ["python", "app.py"]
|
RAG.py
ADDED
|
@@ -0,0 +1,1285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
# import re
|
| 3 |
+
# import fitz
|
| 4 |
+
# import nltk
|
| 5 |
+
# import numpy as np
|
| 6 |
+
# import pandas as pd
|
| 7 |
+
# from typing import List, Dict, Tuple, Any, Optional
|
| 8 |
+
# from sentence_transformers import SentenceTransformer
|
| 9 |
+
# from nltk.tokenize import sent_tokenize
|
| 10 |
+
# import logging
|
| 11 |
+
# import json
|
| 12 |
+
# from sklearn.metrics.pairwise import cosine_similarity
|
| 13 |
+
# import torch
|
| 14 |
+
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, logging as hf_logging
|
| 15 |
+
# from pathlib import Path
|
| 16 |
+
# import faiss
|
| 17 |
+
# from unstructured.partition.auto import partition
|
| 18 |
+
# import tempfile
|
| 19 |
+
# import pickle
|
| 20 |
+
# import shutil
|
| 21 |
+
|
| 22 |
+
# hf_logging.set_verbosity_error()
|
| 23 |
+
|
| 24 |
+
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 25 |
+
# logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
# EMBEDDING_MODEL_NAME = 'all-MiniLM-L12-v2'
|
| 28 |
+
# GENERATIVE_MODEL_NAME = "microsoft/phi-2"
|
| 29 |
+
# DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 30 |
+
# PHI_MAX_NEW_TOKENS = 250
|
| 31 |
+
# PHI_TEMPERATURE = 0.3
|
| 32 |
+
# QUERY_SIMILARITY_THRESHOLD = 0.50
|
| 33 |
+
# CHUNK_SIZE = 100
|
| 34 |
+
# CHUNK_OVERLAP = 30
|
| 35 |
+
# STORAGE_DIR = "rag_storage"
|
| 36 |
+
|
| 37 |
+
# try:
|
| 38 |
+
# nltk.download('punkt', quiet=True)
|
| 39 |
+
# logger.info("NLTK punkt found or downloaded successfully")
|
| 40 |
+
# except Exception as e:
|
| 41 |
+
# logger.warning(f"Failed to download or find NLTK punkt: {e}. Using fallback tokenization.")
|
| 42 |
+
|
| 43 |
+
# def simple_sent_tokenize(text):
|
| 44 |
+
# sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
|
| 45 |
+
# return [s for s in sentences if s.strip()]
|
| 46 |
+
|
| 47 |
+
# sent_tokenize = simple_sent_tokenize
|
| 48 |
+
|
| 49 |
+
# class DocumentProcessor:
|
| 50 |
+
# def __init__(self, embedding_model_name: str = EMBEDDING_MODEL_NAME, device: str = DEVICE):
|
| 51 |
+
# try:
|
| 52 |
+
# self.embedding_model = SentenceTransformer(embedding_model_name, device=device)
|
| 53 |
+
# logger.info(f"Initialized embedding model: {embedding_model_name} on device: {device}")
|
| 54 |
+
# self.device = device
|
| 55 |
+
# self.vector_store = None
|
| 56 |
+
# self.chunks = []
|
| 57 |
+
# self.doc_metadata = []
|
| 58 |
+
# self.storage_dir = STORAGE_DIR
|
| 59 |
+
# os.makedirs(self.storage_dir, exist_ok=True)
|
| 60 |
+
# except Exception as e:
|
| 61 |
+
# logger.error(f"Failed to load embedding model {embedding_model_name}: {e}")
|
| 62 |
+
# raise
|
| 63 |
+
|
| 64 |
+
# def save_state(self):
|
| 65 |
+
# """Save the current state to disk"""
|
| 66 |
+
# try:
|
| 67 |
+
# # Save FAISS index if it exists
|
| 68 |
+
# if self.vector_store is not None:
|
| 69 |
+
# faiss.write_index(self.vector_store, os.path.join(self.storage_dir, "vector_store.faiss"))
|
| 70 |
+
|
| 71 |
+
# # Save chunks and metadata
|
| 72 |
+
# state = {
|
| 73 |
+
# "chunks": self.chunks,
|
| 74 |
+
# "doc_metadata": self.doc_metadata
|
| 75 |
+
# }
|
| 76 |
+
|
| 77 |
+
# with open(os.path.join(self.storage_dir, "metadata.pkl"), "wb") as f:
|
| 78 |
+
# pickle.dump(state, f)
|
| 79 |
+
|
| 80 |
+
# logger.info("Successfully saved document processor state")
|
| 81 |
+
# return True
|
| 82 |
+
# except Exception as e:
|
| 83 |
+
# logger.error(f"Failed to save state: {e}")
|
| 84 |
+
# return False
|
| 85 |
+
|
| 86 |
+
# def load_state(self) -> bool:
|
| 87 |
+
# """Load state from disk if available"""
|
| 88 |
+
# try:
|
| 89 |
+
# faiss_path = os.path.join(self.storage_dir, "vector_store.faiss")
|
| 90 |
+
# metadata_path = os.path.join(self.storage_dir, "metadata.pkl")
|
| 91 |
+
|
| 92 |
+
# if os.path.exists(faiss_path) and os.path.exists(metadata_path):
|
| 93 |
+
# # Load FAISS index
|
| 94 |
+
# self.vector_store = faiss.read_index(faiss_path)
|
| 95 |
+
|
| 96 |
+
# # Load metadata and chunks
|
| 97 |
+
# with open(metadata_path, "rb") as f:
|
| 98 |
+
# state = pickle.load(f)
|
| 99 |
+
# self.chunks = state["chunks"]
|
| 100 |
+
# self.doc_metadata = state["doc_metadata"]
|
| 101 |
+
|
| 102 |
+
# logger.info(f"Successfully loaded state with {len(self.chunks)} chunks and {len(self.doc_metadata)} documents")
|
| 103 |
+
# return True
|
| 104 |
+
# else:
|
| 105 |
+
# logger.info("No saved state found - starting fresh")
|
| 106 |
+
# return False
|
| 107 |
+
# except Exception as e:
|
| 108 |
+
# logger.error(f"Failed to load state: {e}")
|
| 109 |
+
# return False
|
| 110 |
+
|
| 111 |
+
# def clear_state(self) -> bool:
|
| 112 |
+
# """Clear all stored data"""
|
| 113 |
+
# try:
|
| 114 |
+
# if os.path.exists(self.storage_dir):
|
| 115 |
+
# shutil.rmtree(self.storage_dir)
|
| 116 |
+
# os.makedirs(self.storage_dir, exist_ok=True)
|
| 117 |
+
|
| 118 |
+
# self.vector_store = None
|
| 119 |
+
# self.chunks = []
|
| 120 |
+
# self.doc_metadata = []
|
| 121 |
+
|
| 122 |
+
# logger.info("Successfully cleared all stored data")
|
| 123 |
+
# return True
|
| 124 |
+
# except Exception as e:
|
| 125 |
+
# logger.error(f"Failed to clear state: {e}")
|
| 126 |
+
# return False
|
| 127 |
+
|
| 128 |
+
# def _process_file(self, file_path: str) -> Tuple[str, str]:
|
| 129 |
+
# """Process different file types and extract text"""
|
| 130 |
+
# try:
|
| 131 |
+
# # Try unstructured first
|
| 132 |
+
# try:
|
| 133 |
+
# elements = partition(filename=file_path)
|
| 134 |
+
# text = "\n\n".join([str(el) for el in elements])
|
| 135 |
+
# title = Path(file_path).stem
|
| 136 |
+
# return text, title
|
| 137 |
+
# except ImportError:
|
| 138 |
+
# # Fallback to PyMuPDF for PDFs
|
| 139 |
+
# if file_path.lower().endswith('.pdf'):
|
| 140 |
+
# doc = fitz.open(file_path)
|
| 141 |
+
# text = ""
|
| 142 |
+
# for page in doc:
|
| 143 |
+
# text += page.get_text() + "\n\n"
|
| 144 |
+
# doc.close()
|
| 145 |
+
# title = Path(file_path).stem
|
| 146 |
+
# return text, title
|
| 147 |
+
# else:
|
| 148 |
+
# raise
|
| 149 |
+
# except Exception as e:
|
| 150 |
+
# logger.error(f"Error processing file {file_path}: {e}")
|
| 151 |
+
# return "", Path(file_path).stem
|
| 152 |
+
|
| 153 |
+
# def chunk_text(self, text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
|
| 154 |
+
# """Split text into chunks with overlap using sentence boundaries"""
|
| 155 |
+
# if not text:
|
| 156 |
+
# return []
|
| 157 |
+
|
| 158 |
+
# try:
|
| 159 |
+
# sentences = sent_tokenize(text)
|
| 160 |
+
# except Exception as e:
|
| 161 |
+
# logger.error(f"Sentence tokenization failed: {e}. Using simple split.")
|
| 162 |
+
# sentences = re.split(r'[\n\.\?\!]+', text)
|
| 163 |
+
# sentences = [s.strip() for s in sentences if s.strip()]
|
| 164 |
+
|
| 165 |
+
# if not sentences:
|
| 166 |
+
# logger.warning("No sentences found after tokenization.")
|
| 167 |
+
# return [text] if len(text) <= chunk_size else [text[i:i+chunk_size] for i in range(0, len(text), chunk_size-overlap)]
|
| 168 |
+
|
| 169 |
+
# chunks = []
|
| 170 |
+
# current_chunk = []
|
| 171 |
+
# current_length = 0
|
| 172 |
+
|
| 173 |
+
# for sentence in sentences:
|
| 174 |
+
# sentence_len = len(sentence)
|
| 175 |
+
# if current_length + sentence_len > chunk_size:
|
| 176 |
+
# if current_chunk:
|
| 177 |
+
# chunks.append(" ".join(current_chunk))
|
| 178 |
+
# current_chunk = current_chunk[-max(1, len(current_chunk)*overlap//chunk_size):] # Keep overlap
|
| 179 |
+
# current_length = sum(len(s) for s in current_chunk)
|
| 180 |
+
|
| 181 |
+
# if sentence_len <= chunk_size:
|
| 182 |
+
# current_chunk.append(sentence)
|
| 183 |
+
# current_length += sentence_len
|
| 184 |
+
# else:
|
| 185 |
+
# logger.warning(f"Sentence length ({sentence_len}) exceeds chunk size ({chunk_size}). Adding as its own chunk.")
|
| 186 |
+
# chunks.append(sentence)
|
| 187 |
+
# else:
|
| 188 |
+
# current_chunk.append(sentence)
|
| 189 |
+
# current_length += sentence_len
|
| 190 |
+
|
| 191 |
+
# if current_chunk:
|
| 192 |
+
# chunks.append(" ".join(current_chunk))
|
| 193 |
+
|
| 194 |
+
# chunks = [c for c in chunks if c.strip()]
|
| 195 |
+
# logger.info(f"Split text into {len(chunks)} chunks.")
|
| 196 |
+
# return chunks
|
| 197 |
+
|
| 198 |
+
# def generate_embedding(self, text: str) -> Optional[np.ndarray]:
|
| 199 |
+
# """Generate embedding for a single text chunk"""
|
| 200 |
+
# if not text or not isinstance(text, str):
|
| 201 |
+
# logger.warning("generate_embedding called with invalid text.")
|
| 202 |
+
# return None
|
| 203 |
+
# try:
|
| 204 |
+
# self.embedding_model.to(self.device)
|
| 205 |
+
# embedding = self.embedding_model.encode(text, convert_to_numpy=True, show_progress_bar=False)
|
| 206 |
+
# return embedding.astype(np.float32)
|
| 207 |
+
# except Exception as e:
|
| 208 |
+
# logger.error(f"Error generating embedding: {e}")
|
| 209 |
+
# return None
|
| 210 |
+
|
| 211 |
+
# def add_document(self, file_path: str) -> bool:
|
| 212 |
+
# """Process and add a document to the vector store"""
|
| 213 |
+
# logger.info(f"Processing document: {file_path}")
|
| 214 |
+
|
| 215 |
+
# try:
|
| 216 |
+
# # Check if document already exists
|
| 217 |
+
# for doc in self.doc_metadata:
|
| 218 |
+
# if os.path.normpath(doc["path"]) == os.path.normpath(file_path):
|
| 219 |
+
# logger.info(f"Document '{doc['title']}' already exists in the index - skipping")
|
| 220 |
+
# return True
|
| 221 |
+
|
| 222 |
+
# text, title = self._process_file(file_path)
|
| 223 |
+
# if not text:
|
| 224 |
+
# logger.warning(f"No text extracted from {file_path}")
|
| 225 |
+
# return False
|
| 226 |
+
|
| 227 |
+
# chunks = self.chunk_text(text)
|
| 228 |
+
# if not chunks:
|
| 229 |
+
# logger.warning(f"No chunks created for {file_path}")
|
| 230 |
+
# return False
|
| 231 |
+
|
| 232 |
+
# # Generate embeddings for all chunks
|
| 233 |
+
# embeddings = []
|
| 234 |
+
# valid_chunks = []
|
| 235 |
+
# for i, chunk in enumerate(chunks):
|
| 236 |
+
# emb = self.generate_embedding(chunk)
|
| 237 |
+
# if emb is not None:
|
| 238 |
+
# embeddings.append(emb)
|
| 239 |
+
# valid_chunks.append({
|
| 240 |
+
# "text": chunk,
|
| 241 |
+
# "doc_title": title,
|
| 242 |
+
# "doc_path": file_path,
|
| 243 |
+
# "chunk_index": i
|
| 244 |
+
# })
|
| 245 |
+
|
| 246 |
+
# if not embeddings:
|
| 247 |
+
# logger.warning(f"No valid embeddings generated for {file_path}")
|
| 248 |
+
# return False
|
| 249 |
+
|
| 250 |
+
# embeddings = np.array(embeddings)
|
| 251 |
+
|
| 252 |
+
# # Initialize or update FAISS index
|
| 253 |
+
# if self.vector_store is None:
|
| 254 |
+
# self.vector_store = faiss.IndexFlatL2(embeddings.shape[1])
|
| 255 |
+
# self.vector_store.add(embeddings)
|
| 256 |
+
# else:
|
| 257 |
+
# self.vector_store.add(embeddings)
|
| 258 |
+
|
| 259 |
+
# # Store metadata
|
| 260 |
+
# start_idx = len(self.chunks)
|
| 261 |
+
# self.chunks.extend(valid_chunks)
|
| 262 |
+
|
| 263 |
+
# self.doc_metadata.append({
|
| 264 |
+
# "title": title,
|
| 265 |
+
# "path": file_path,
|
| 266 |
+
# "chunk_count": len(valid_chunks),
|
| 267 |
+
# "start_idx": start_idx,
|
| 268 |
+
# "end_idx": start_idx + len(valid_chunks) - 1
|
| 269 |
+
# })
|
| 270 |
+
|
| 271 |
+
# # Save state after each document addition
|
| 272 |
+
# self.save_state()
|
| 273 |
+
|
| 274 |
+
# logger.info(f"Successfully added document '{title}' with {len(valid_chunks)} chunks")
|
| 275 |
+
# return True
|
| 276 |
+
|
| 277 |
+
# except Exception as e:
|
| 278 |
+
# logger.error(f"Failed to process document {file_path}: {e}")
|
| 279 |
+
# return False
|
| 280 |
+
|
| 281 |
+
# def search_chunks(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
|
| 282 |
+
# """Search for relevant chunks using semantic similarity"""
|
| 283 |
+
# if self.vector_store is None or not self.chunks:
|
| 284 |
+
# logger.warning("No documents have been indexed yet")
|
| 285 |
+
# return []
|
| 286 |
+
|
| 287 |
+
# query_embedding = self.generate_embedding(query)
|
| 288 |
+
# if query_embedding is None:
|
| 289 |
+
# logger.error("Failed to generate embedding for the query")
|
| 290 |
+
# return []
|
| 291 |
+
|
| 292 |
+
# query_embedding = np.array([query_embedding]) # Convert to 2D array
|
| 293 |
+
|
| 294 |
+
# # Search FAISS index
|
| 295 |
+
# distances, indices = self.vector_store.search(query_embedding, top_k)
|
| 296 |
+
|
| 297 |
+
# # Convert to similarity scores (FAISS returns squared L2 distances)
|
| 298 |
+
# similarities = 1 / (1 + distances[0])
|
| 299 |
+
|
| 300 |
+
# results = []
|
| 301 |
+
# for idx, sim in zip(indices[0], similarities):
|
| 302 |
+
# if idx < 0 or idx >= len(self.chunks): # Invalid index
|
| 303 |
+
# continue
|
| 304 |
+
|
| 305 |
+
# chunk_data = self.chunks[idx]
|
| 306 |
+
# results.append({
|
| 307 |
+
# "text": chunk_data["text"],
|
| 308 |
+
# "similarity": float(sim),
|
| 309 |
+
# "doc_title": chunk_data["doc_title"],
|
| 310 |
+
# "doc_path": chunk_data["doc_path"],
|
| 311 |
+
# "chunk_index": chunk_data["chunk_index"]
|
| 312 |
+
# })
|
| 313 |
+
|
| 314 |
+
# # Sort by similarity (highest first)
|
| 315 |
+
# results.sort(key=lambda x: x["similarity"], reverse=True)
|
| 316 |
+
|
| 317 |
+
# # Apply threshold
|
| 318 |
+
# results = [r for r in results if r["similarity"] >= QUERY_SIMILARITY_THRESHOLD]
|
| 319 |
+
|
| 320 |
+
# if not results and top_k > 0:
|
| 321 |
+
# logger.info("No chunks met similarity threshold, returning top result anyway")
|
| 322 |
+
# return results[:1]
|
| 323 |
+
|
| 324 |
+
# return results
|
| 325 |
+
|
| 326 |
+
# class RAGSystem:
|
| 327 |
+
# def __init__(self):
|
| 328 |
+
# logger.info("Initializing RAG System...")
|
| 329 |
+
# try:
|
| 330 |
+
# self.doc_processor = DocumentProcessor(embedding_model_name=EMBEDDING_MODEL_NAME, device=DEVICE)
|
| 331 |
+
|
| 332 |
+
# # Try to load existing state
|
| 333 |
+
# if self.doc_processor.load_state():
|
| 334 |
+
# logger.info("Successfully loaded existing document index")
|
| 335 |
+
# else:
|
| 336 |
+
# logger.info("Starting with a fresh document index")
|
| 337 |
+
|
| 338 |
+
# logger.info(f"Loading Generative LLM: {GENERATIVE_MODEL_NAME} on {DEVICE}...")
|
| 339 |
+
# try:
|
| 340 |
+
# phi_tokenizer = AutoTokenizer.from_pretrained(GENERATIVE_MODEL_NAME, trust_remote_code=True)
|
| 341 |
+
# model_kwargs = {"trust_remote_code": True}
|
| 342 |
+
# if DEVICE == 'cuda':
|
| 343 |
+
# if torch.cuda.is_bf16_supported():
|
| 344 |
+
# logger.info("Using bfloat16 for Phi-2 model.")
|
| 345 |
+
# model_kwargs["torch_dtype"] = torch.bfloat16
|
| 346 |
+
# else:
|
| 347 |
+
# logger.info("Using float16 for Phi-2 model.")
|
| 348 |
+
# model_kwargs["torch_dtype"] = torch.float16
|
| 349 |
+
# else:
|
| 350 |
+
# logger.info("Using float32 for Phi-2 model on CPU.")
|
| 351 |
+
# model_kwargs["torch_dtype"] = torch.float32
|
| 352 |
+
|
| 353 |
+
# phi_model = AutoModelForCausalLM.from_pretrained(GENERATIVE_MODEL_NAME, **model_kwargs)
|
| 354 |
+
# phi_model = phi_model.to(DEVICE)
|
| 355 |
+
|
| 356 |
+
# pipeline_device_index = 0 if DEVICE == "cuda" else -1
|
| 357 |
+
# self.phi_pipe = pipeline(
|
| 358 |
+
# "text-generation",
|
| 359 |
+
# model=phi_model,
|
| 360 |
+
# tokenizer=phi_tokenizer,
|
| 361 |
+
# device=pipeline_device_index
|
| 362 |
+
# )
|
| 363 |
+
# logger.info(f"✅ Generative LLM ({GENERATIVE_MODEL_NAME}) loaded successfully on {DEVICE}.")
|
| 364 |
+
# except Exception as e:
|
| 365 |
+
# logger.error(f"❌ Critical Error loading Phi-2 model: {e}")
|
| 366 |
+
# logger.error("RAG Q&A functionality will be disabled.")
|
| 367 |
+
# self.phi_pipe = None
|
| 368 |
+
|
| 369 |
+
# logger.info("✅ RAG System initialized successfully.")
|
| 370 |
+
|
| 371 |
+
# except Exception as e:
|
| 372 |
+
# logger.critical(f"Failed to initialize RAG System: {e}", exc_info=True)
|
| 373 |
+
# raise RuntimeError("System initialization failed.") from e
|
| 374 |
+
|
| 375 |
+
# def add_document(self, file_path: str) -> bool:
|
| 376 |
+
# """Add a document to the system"""
|
| 377 |
+
# return self.doc_processor.add_document(file_path)
|
| 378 |
+
|
| 379 |
+
# def ask_question(self, question: str, top_k: int = 3) -> Dict[str, Any]:
|
| 380 |
+
# """Answer a question using RAG"""
|
| 381 |
+
# if self.phi_pipe is None:
|
| 382 |
+
# return {
|
| 383 |
+
# "answer": "Error: The AI model is not available. Please check the logs.",
|
| 384 |
+
# "sources": []
|
| 385 |
+
# }
|
| 386 |
+
|
| 387 |
+
# logger.info(f"Processing question: '{question[:100]}...'")
|
| 388 |
+
|
| 389 |
+
# # Step 1: Retrieve relevant chunks
|
| 390 |
+
# relevant_chunks = self.doc_processor.search_chunks(question, top_k)
|
| 391 |
+
# if not relevant_chunks:
|
| 392 |
+
# return {
|
| 393 |
+
# "answer": "No relevant information found in documents to answer this question.",
|
| 394 |
+
# "sources": []
|
| 395 |
+
# }
|
| 396 |
+
|
| 397 |
+
# # Step 2: Prepare context for generation
|
| 398 |
+
# context = "\n\n---\n\n".join([
|
| 399 |
+
# f"Document: {chunk['doc_title']}\nChunk {chunk['chunk_index']} (Similarity: {chunk['similarity']:.2f})\n\n{chunk['text']}"
|
| 400 |
+
# for chunk in relevant_chunks
|
| 401 |
+
# ])
|
| 402 |
+
|
| 403 |
+
# # Step 3: Generate answer with Phi-2
|
| 404 |
+
# prompt = f"""You are a helpful assistant.Answer the question ONLY from the provided context.If the context is insufficient, just say you don't know.
|
| 405 |
+
|
| 406 |
+
# Context:
|
| 407 |
+
# {context}
|
| 408 |
+
|
| 409 |
+
# Question: {question}
|
| 410 |
+
|
| 411 |
+
# Answer: """
|
| 412 |
+
|
| 413 |
+
# try:
|
| 414 |
+
# output = self.phi_pipe(
|
| 415 |
+
# prompt,
|
| 416 |
+
# max_new_tokens=PHI_MAX_NEW_TOKENS,
|
| 417 |
+
# temperature=PHI_TEMPERATURE,
|
| 418 |
+
# do_sample=True,
|
| 419 |
+
# return_full_text=False,
|
| 420 |
+
# pad_token_id=self.phi_pipe.tokenizer.eos_token_id
|
| 421 |
+
# )
|
| 422 |
+
|
| 423 |
+
# generated_text = output[0]["generated_text"].strip()
|
| 424 |
+
|
| 425 |
+
# # Post-processing to clean up the response
|
| 426 |
+
# if "Question:" in generated_text:
|
| 427 |
+
# generated_text = generated_text.split("Question:")[0].strip()
|
| 428 |
+
|
| 429 |
+
# # Extract sources
|
| 430 |
+
# sources = []
|
| 431 |
+
# seen_docs = set()
|
| 432 |
+
# for chunk in relevant_chunks:
|
| 433 |
+
# if chunk['doc_title'] not in seen_docs:
|
| 434 |
+
# sources.append({
|
| 435 |
+
# "document": chunk['doc_title'],
|
| 436 |
+
# "path": chunk['doc_path'],
|
| 437 |
+
# "similarity": chunk['similarity']
|
| 438 |
+
# })
|
| 439 |
+
# seen_docs.add(chunk['doc_title'])
|
| 440 |
+
|
| 441 |
+
# return {
|
| 442 |
+
# "answer": generated_text,
|
| 443 |
+
# "sources": sources,
|
| 444 |
+
# "relevant_chunks": relevant_chunks # For debugging/explanation
|
| 445 |
+
# }
|
| 446 |
+
|
| 447 |
+
# except Exception as e:
|
| 448 |
+
# logger.error(f"Error generating answer: {e}")
|
| 449 |
+
# return {
|
| 450 |
+
# "answer": f"Error generating answer: {str(e)}",
|
| 451 |
+
# "sources": []
|
| 452 |
+
# }
|
| 453 |
+
|
| 454 |
+
# def explain_retrieval(self, question: str):
|
| 455 |
+
# """Explain the retrieval process for educational purposes"""
|
| 456 |
+
# print("\n=== RAG Process Explanation ===")
|
| 457 |
+
# print(f"Question: {question}")
|
| 458 |
+
|
| 459 |
+
# # Step 1: Show query embedding
|
| 460 |
+
# print("\n1. Query Embedding:")
|
| 461 |
+
# query_embedding = self.doc_processor.generate_embedding(question)
|
| 462 |
+
# if query_embedding is not None:
|
| 463 |
+
# print(f"- Generated {len(query_embedding)}-dimensional embedding vector")
|
| 464 |
+
# print(f"- Sample values: {query_embedding[:5]}...")
|
| 465 |
+
# else:
|
| 466 |
+
# print("Failed to generate query embedding")
|
| 467 |
+
# return
|
| 468 |
+
|
| 469 |
+
# # Step 2: Show retrieval
|
| 470 |
+
# print("\n2. Document Chunk Retrieval:")
|
| 471 |
+
# chunks = self.doc_processor.search_chunks(question, top_k=3)
|
| 472 |
+
# if not chunks:
|
| 473 |
+
# print("No relevant chunks found")
|
| 474 |
+
# return
|
| 475 |
+
|
| 476 |
+
# print(f"Found {len(chunks)} relevant chunks:")
|
| 477 |
+
# for i, chunk in enumerate(chunks, 1):
|
| 478 |
+
# print(f"\nChunk {i}:")
|
| 479 |
+
# print(f"- Source: {chunk['doc_title']}")
|
| 480 |
+
# print(f"- Chunk Index: {chunk['chunk_index']}")
|
| 481 |
+
# print(f"- Similarity Score: {chunk['similarity']:.4f}")
|
| 482 |
+
# print(f"- Text Preview: {chunk['text'][:150]}...")
|
| 483 |
+
|
| 484 |
+
# # Step 3: Show context preparation
|
| 485 |
+
# print("\n3. Context Preparation:")
|
| 486 |
+
# print("The top chunks are combined into a context that will be sent to the LLM")
|
| 487 |
+
|
| 488 |
+
# # Step 4: Show generation
|
| 489 |
+
# print("\n4. Generation with Phi-2:")
|
| 490 |
+
# print("The LLM is prompted to answer the question using ONLY the provided context")
|
| 491 |
+
# print("This helps prevent hallucination by grounding the response in the retrieved documents")
|
| 492 |
+
|
| 493 |
+
# # Show actual answer
|
| 494 |
+
# result = self.ask_question(question)
|
| 495 |
+
# print("\nFinal Answer:")
|
| 496 |
+
# print(result['answer'])
|
| 497 |
+
|
| 498 |
+
# print("\nSources:")
|
| 499 |
+
# for source in result['sources']:
|
| 500 |
+
# print(f"- {source['document']} (similarity: {source['similarity']:.2f})")
|
| 501 |
+
|
| 502 |
+
# def list_documents(self) -> List[Dict[str, Any]]:
|
| 503 |
+
# """List all indexed documents"""
|
| 504 |
+
# return [{
|
| 505 |
+
# "title": doc["title"],
|
| 506 |
+
# "path": doc["path"],
|
| 507 |
+
# "chunk_count": doc["chunk_count"]
|
| 508 |
+
# } for doc in self.doc_processor.doc_metadata]
|
| 509 |
+
|
| 510 |
+
# def clear_index(self) -> bool:
|
| 511 |
+
# """Clear all indexed documents"""
|
| 512 |
+
# return self.doc_processor.clear_state()
|
| 513 |
+
|
| 514 |
+
# def close(self):
|
| 515 |
+
# """Clean up resources"""
|
| 516 |
+
# logger.info("Shutting down RAG System...")
|
| 517 |
+
# # Save state before closing
|
| 518 |
+
# self.doc_processor.save_state()
|
| 519 |
+
|
| 520 |
+
# if hasattr(self, 'phi_pipe') and self.phi_pipe:
|
| 521 |
+
# del self.phi_pipe
|
| 522 |
+
# if hasattr(self.doc_processor, 'embedding_model'):
|
| 523 |
+
# del self.doc_processor.embedding_model
|
| 524 |
+
# if DEVICE == 'cuda':
|
| 525 |
+
# torch.cuda.empty_cache()
|
| 526 |
+
# logger.info("Cleared CUDA cache.")
|
| 527 |
+
# logger.info("RAG System shut down.")
|
| 528 |
+
|
| 529 |
+
# def main():
|
| 530 |
+
# rag_system = RAGSystem()
|
| 531 |
+
|
| 532 |
+
# while True:
|
| 533 |
+
# print("\n1. Add Document")
|
| 534 |
+
# print("2. Ask Question")
|
| 535 |
+
# print("3. Explain Retrieval Process")
|
| 536 |
+
# print("4. List Indexed Documents")
|
| 537 |
+
# print("5. Clear All Documents")
|
| 538 |
+
# print("6. Exit")
|
| 539 |
+
|
| 540 |
+
# choice = input("Enter your choice: ")
|
| 541 |
+
|
| 542 |
+
# if choice == "1":
|
| 543 |
+
# file_path = input("Enter document path (CSV, DOCX, PDF, etc.): ").strip('"')
|
| 544 |
+
# if not os.path.exists(file_path):
|
| 545 |
+
# print("File not found!")
|
| 546 |
+
# continue
|
| 547 |
+
|
| 548 |
+
# if rag_system.add_document(file_path):
|
| 549 |
+
# print("Document added successfully!")
|
| 550 |
+
# else:
|
| 551 |
+
# print("Failed to add document")
|
| 552 |
+
|
| 553 |
+
# elif choice == "2":
|
| 554 |
+
# question = input("Enter your question: ")
|
| 555 |
+
# result = rag_system.ask_question(question)
|
| 556 |
+
# print("\nAnswer:", result["answer"])
|
| 557 |
+
# if result["sources"]:
|
| 558 |
+
# print("\nSources:")
|
| 559 |
+
# for src in result["sources"]:
|
| 560 |
+
# print(f"- {src['document']} (similarity: {src['similarity']:.2f})")
|
| 561 |
+
# else:
|
| 562 |
+
# print("(No sources cited)")
|
| 563 |
+
|
| 564 |
+
# elif choice == "3":
|
| 565 |
+
# question = input("Enter a question to explain the retrieval process: ")
|
| 566 |
+
# rag_system.explain_retrieval(question)
|
| 567 |
+
|
| 568 |
+
# elif choice == "4":
|
| 569 |
+
# docs = rag_system.list_documents()
|
| 570 |
+
# if docs:
|
| 571 |
+
# print("\nIndexed Documents:")
|
| 572 |
+
# for i, doc in enumerate(docs, 1):
|
| 573 |
+
# print(f"{i}. {doc['title']} ({doc['chunk_count']} chunks)")
|
| 574 |
+
# print(f" Path: {doc['path']}")
|
| 575 |
+
# else:
|
| 576 |
+
# print("No documents indexed yet")
|
| 577 |
+
|
| 578 |
+
# elif choice == "5":
|
| 579 |
+
# confirm = input("Are you sure you want to clear ALL documents? (y/n): ")
|
| 580 |
+
# if confirm.lower() == 'y':
|
| 581 |
+
# if rag_system.clear_index():
|
| 582 |
+
# print("All documents cleared")
|
| 583 |
+
# else:
|
| 584 |
+
# print("Failed to clear documents")
|
| 585 |
+
|
| 586 |
+
# elif choice == "6":
|
| 587 |
+
# rag_system.close()
|
| 588 |
+
# break
|
| 589 |
+
|
| 590 |
+
# else:
|
| 591 |
+
# print("Invalid choice")
|
| 592 |
+
|
| 593 |
+
# if __name__ == "__main__":
|
| 594 |
+
# main()
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
import os
|
| 605 |
+
import re
|
| 606 |
+
import fitz
|
| 607 |
+
import nltk
|
| 608 |
+
import numpy as np
|
| 609 |
+
import pandas as pd
|
| 610 |
+
from typing import List, Dict, Tuple, Any, Optional
|
| 611 |
+
from sentence_transformers import SentenceTransformer
|
| 612 |
+
from nltk.tokenize import sent_tokenize
|
| 613 |
+
import logging
|
| 614 |
+
import json
|
| 615 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 616 |
+
import torch
|
| 617 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, logging as hf_logging
|
| 618 |
+
from pathlib import Path
|
| 619 |
+
import faiss
|
| 620 |
+
from unstructured.partition.auto import partition
|
| 621 |
+
import tempfile
|
| 622 |
+
import pickle
|
| 623 |
+
import shutil
|
| 624 |
+
|
| 625 |
+
hf_logging.set_verbosity_error()
|
| 626 |
+
|
| 627 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 628 |
+
logger = logging.getLogger(__name__)
|
| 629 |
+
|
| 630 |
+
# --- Configuration constants for the RAG pipeline ---
EMBEDDING_MODEL_NAME = 'all-MiniLM-L12-v2'   # SentenceTransformer used for chunk/query embeddings
GENERATIVE_MODEL_NAME = "microsoft/phi-2"    # causal LM used to generate grounded answers
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
PHI_MAX_NEW_TOKENS = 250                     # generation budget per answer
PHI_TEMPERATURE = 0.3                        # low temperature -> mostly deterministic answers
QUERY_SIMILARITY_THRESHOLD = 0.50            # minimum similarity for a retrieved chunk to be kept
CHUNK_SIZE = 100                             # target chunk size (characters, compared against sentence lengths)
CHUNK_OVERLAP = 30                           # overlap carried between consecutive chunks (characters)
STORAGE_DIR = "rag_storage"                  # on-disk persistence for FAISS index + metadata
|
| 639 |
+
|
| 640 |
+
# Best-effort download of NLTK's 'punkt' tokenizer data; failure is non-fatal.
# NOTE(review): `sent_tokenize` is unconditionally rebound to the regex
# fallback a few lines below, so the outcome of this download is effectively
# unused — confirm whether the NLTK tokenizer was meant to be preferred.
try:
    nltk.download('punkt', quiet=True)
    logger.info("NLTK punkt found or downloaded successfully")
except Exception as e:
    logger.warning(f"Failed to download or find NLTK punkt: {e}. Using fallback tokenization.")
|
| 645 |
+
|
| 646 |
+
def simple_sent_tokenize(text):
    """Regex-based sentence splitter used instead of NLTK's tokenizer.

    Splits on whitespace that follows '.', '?' or '!', while avoiding
    breaks inside abbreviations like 'e.g.' or 'Mr.'. Empty/whitespace-only
    fragments are dropped.
    """
    fragments = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    return list(filter(str.strip, fragments))

# The regex splitter is used unconditionally throughout this module.
sent_tokenize = simple_sent_tokenize
|
| 651 |
+
|
| 652 |
+
class DocumentProcessor:
    """Ingests documents for the RAG system.

    Responsibilities:
      * extract text (with per-page tracking for PDFs),
      * split text into overlapping, sentence-aligned chunks,
      * embed chunks with a SentenceTransformer model,
      * store/search embeddings in a FAISS L2 index,
      * persist the index and metadata under ``STORAGE_DIR``.
    """

    def __init__(self, embedding_model_name: str = EMBEDDING_MODEL_NAME, device: str = DEVICE):
        """Load the embedding model and initialize empty in-memory state.

        Raises:
            Exception: re-raised if the embedding model cannot be loaded.
        """
        try:
            self.embedding_model = SentenceTransformer(embedding_model_name, device=device)
            logger.info(f"Initialized embedding model: {embedding_model_name} on device: {device}")
            self.device = device
            self.vector_store = None   # faiss.IndexFlatL2, created lazily on first add_document
            self.chunks = []           # chunk dicts; list position == FAISS vector id
            self.doc_metadata = []     # one metadata dict per indexed document
            self.storage_dir = STORAGE_DIR
            os.makedirs(self.storage_dir, exist_ok=True)
        except Exception as e:
            logger.error(f"Failed to load embedding model {embedding_model_name}: {e}")
            raise

    def save_state(self):
        """Persist the FAISS index plus chunks/metadata to ``self.storage_dir``.

        Returns:
            bool: True on success, False if anything failed (logged).
        """
        try:
            if self.vector_store is not None:
                faiss.write_index(self.vector_store, os.path.join(self.storage_dir, "vector_store.faiss"))

            state = {
                "chunks": self.chunks,
                "doc_metadata": self.doc_metadata
            }
            with open(os.path.join(self.storage_dir, "metadata.pkl"), "wb") as f:
                pickle.dump(state, f)

            logger.info("Successfully saved document processor state")
            return True
        except Exception as e:
            logger.error(f"Failed to save state: {e}")
            return False

    def load_state(self) -> bool:
        """Load a previously saved index + metadata from disk, if present.

        Returns:
            bool: True if a saved state was found and loaded, False otherwise.
        """
        try:
            faiss_path = os.path.join(self.storage_dir, "vector_store.faiss")
            metadata_path = os.path.join(self.storage_dir, "metadata.pkl")

            if os.path.exists(faiss_path) and os.path.exists(metadata_path):
                self.vector_store = faiss.read_index(faiss_path)
                # NOTE: pickle.load is only safe because we wrote this file
                # ourselves; do not load untrusted metadata.pkl files.
                with open(metadata_path, "rb") as f:
                    state = pickle.load(f)
                    self.chunks = state["chunks"]
                    self.doc_metadata = state["doc_metadata"]

                logger.info(f"Successfully loaded state with {len(self.chunks)} chunks and {len(self.doc_metadata)} documents")
                return True
            else:
                logger.info("No saved state found - starting fresh")
                return False
        except Exception as e:
            logger.error(f"Failed to load state: {e}")
            return False

    def clear_state(self) -> bool:
        """Delete all persisted data and reset in-memory state.

        Returns:
            bool: True on success, False on failure (logged).
        """
        try:
            if os.path.exists(self.storage_dir):
                shutil.rmtree(self.storage_dir)
                os.makedirs(self.storage_dir, exist_ok=True)

            self.vector_store = None
            self.chunks = []
            self.doc_metadata = []

            logger.info("Successfully cleared all stored data")
            return True
        except Exception as e:
            logger.error(f"Failed to clear state: {e}")
            return False

    def _extract_pdf_pages(self, file_path: str) -> List[Dict[str, Any]]:
        """Extract text from a PDF, one entry per non-empty page.

        Returns:
            list of {"page_number": int (1-based), "text": str}; empty on error.
        """
        pages = []
        try:
            doc = fitz.open(file_path)
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text = page.get_text()
                if text.strip():  # skip blank pages
                    pages.append({
                        "page_number": page_num + 1,
                        "text": text
                    })
            doc.close()
            logger.info(f"Extracted {len(pages)} pages from PDF")
            return pages
        except Exception as e:
            logger.error(f"Error extracting PDF pages: {e}")
            return []

    def _process_file(self, file_path: str) -> Tuple[str, str, List[Dict[str, Any]]]:
        """Extract text from a file of any supported type.

        Returns:
            (full_text, title, pages) where pages is a list of
            {"page_number", "text"} dicts. Non-PDF files are treated as a
            single page. On failure returns ("", title, []).
        """
        try:
            title = Path(file_path).stem

            if file_path.lower().endswith('.pdf'):
                # PDFs get real page numbers via PyMuPDF.
                pages = self._extract_pdf_pages(file_path)
                text = "\n\n".join(page["text"] for page in pages)
                return text, title, pages

            # Non-PDF: try `unstructured` first, then a plain-text fallback.
            try:
                elements = partition(filename=file_path)
                text = "\n\n".join(str(el) for el in elements)
                pages = [{"page_number": 1, "text": text}]
                return text, title, pages
            except ImportError:
                if file_path.lower().endswith(('.txt', '.csv')):
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    pages = [{"page_number": 1, "text": text}]
                    return text, title, pages
                raise
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {e}")
            return "", Path(file_path).stem, []

    def _find_chunk_page(self, chunk_text: str, pages: List[Dict[str, Any]]) -> int:
        """Heuristically map a chunk back to a source page.

        Scores each page by the fraction of the chunk's first 10 words it
        contains; returns the best-scoring page number (default 1).
        """
        chunk_words = set(chunk_text.lower().split()[:10])

        best_page = 1
        best_score = 0

        for page in pages:
            page_words = set(page["text"].lower().split())
            common_words = chunk_words.intersection(page_words)
            score = len(common_words) / len(chunk_words) if chunk_words else 0

            if score > best_score:
                best_score = score
                best_page = page["page_number"]

        return best_page

    def chunk_text(self, text: str, pages: List[Dict[str, Any]], chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[Dict[str, Any]]:
        """Split text into overlapping chunks along sentence boundaries.

        Each chunk dict carries the page number it most likely came from.
        Sentences longer than ``chunk_size`` become standalone chunks.

        Returns:
            list of {"text": str, "page_number": int}.
        """
        if not text:
            return []

        try:
            sentences = sent_tokenize(text)
        except Exception as e:
            logger.error(f"Sentence tokenization failed: {e}. Using simple split.")
            sentences = re.split(r'[\n\.\?\!]+', text)
            sentences = [s.strip() for s in sentences if s.strip()]

        if not sentences:
            logger.warning("No sentences found after tokenization.")
            # Fall back to raw character slicing with overlap.
            return [{"text": text, "page_number": 1}] if len(text) <= chunk_size else [{"text": text[i:i+chunk_size], "page_number": 1} for i in range(0, len(text), chunk_size-overlap)]

        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_len = len(sentence)
            if current_length + sentence_len > chunk_size:
                if current_chunk:
                    # Flush the accumulated chunk, then keep a tail of
                    # sentences proportional to overlap/chunk_size.
                    joined = " ".join(current_chunk)
                    chunks.append({
                        "text": joined,
                        "page_number": self._find_chunk_page(joined, pages)
                    })
                    current_chunk = current_chunk[-max(1, len(current_chunk)*overlap//chunk_size):]
                    current_length = sum(len(s) for s in current_chunk)

                if sentence_len <= chunk_size:
                    current_chunk.append(sentence)
                    current_length += sentence_len
                else:
                    logger.warning(f"Sentence length ({sentence_len}) exceeds chunk size ({chunk_size}). Adding as its own chunk.")
                    chunks.append({
                        "text": sentence,
                        "page_number": self._find_chunk_page(sentence, pages)
                    })
            else:
                current_chunk.append(sentence)
                current_length += sentence_len

        if current_chunk:
            joined = " ".join(current_chunk)
            chunks.append({
                "text": joined,
                "page_number": self._find_chunk_page(joined, pages)
            })

        chunks = [c for c in chunks if c["text"].strip()]
        logger.info(f"Split text into {len(chunks)} chunks with page numbers.")
        return chunks

    def generate_embedding(self, text: str) -> Optional[np.ndarray]:
        """Embed a single text as a float32 numpy vector, or None on failure."""
        if not text or not isinstance(text, str):
            logger.warning("generate_embedding called with invalid text.")
            return None
        try:
            self.embedding_model.to(self.device)
            embedding = self.embedding_model.encode(text, convert_to_numpy=True, show_progress_bar=False)
            return embedding.astype(np.float32)
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return None

    def add_document(self, file_path: str) -> bool:
        """Process a file, embed its chunks, and add them to the index.

        Duplicate paths (already indexed) are skipped and reported as
        success. State is persisted after every successful addition.

        Returns:
            bool: True if the document is indexed (or already was).
        """
        logger.info(f"Processing document: {file_path}")

        try:
            # Skip documents that are already indexed (path-normalized compare).
            for doc in self.doc_metadata:
                if os.path.normpath(doc["path"]) == os.path.normpath(file_path):
                    logger.info(f"Document '{doc['title']}' already exists in the index - skipping")
                    return True

            text, title, pages = self._process_file(file_path)
            if not text:
                logger.warning(f"No text extracted from {file_path}")
                return False

            chunks = self.chunk_text(text, pages)
            if not chunks:
                logger.warning(f"No chunks created for {file_path}")
                return False

            # Embed every chunk, dropping any that fail to embed.
            embeddings = []
            valid_chunks = []
            for i, chunk_data in enumerate(chunks):
                emb = self.generate_embedding(chunk_data["text"])
                if emb is not None:
                    embeddings.append(emb)
                    valid_chunks.append({
                        "text": chunk_data["text"],
                        "page_number": chunk_data["page_number"],
                        "doc_title": title,
                        "doc_path": file_path,
                        "chunk_index": i
                    })

            if not embeddings:
                logger.warning(f"No valid embeddings generated for {file_path}")
                return False

            embeddings = np.array(embeddings)

            # Create the FAISS index on first use, then append.
            if self.vector_store is None:
                self.vector_store = faiss.IndexFlatL2(embeddings.shape[1])
            self.vector_store.add(embeddings)

            # Record chunk metadata; FAISS ids line up with self.chunks order.
            start_idx = len(self.chunks)
            self.chunks.extend(valid_chunks)

            self.doc_metadata.append({
                "title": title,
                "path": file_path,
                "chunk_count": len(valid_chunks),
                "start_idx": start_idx,
                "end_idx": start_idx + len(valid_chunks) - 1,
                "total_pages": max(page["page_number"] for page in pages) if pages else 1
            })

            # Persist after each document so a crash loses at most one doc.
            self.save_state()

            logger.info(f"Successfully added document '{title}' with {len(valid_chunks)} chunks")
            return True

        except Exception as e:
            logger.error(f"Failed to process document {file_path}: {e}")
            return False

    def search_chunks(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return the most semantically similar chunks for a query.

        Chunks below ``QUERY_SIMILARITY_THRESHOLD`` are filtered out; if
        none qualify, the single best unfiltered hit is returned so callers
        always get something when an index exists.
        """
        if self.vector_store is None or not self.chunks:
            logger.warning("No documents have been indexed yet")
            return []

        query_embedding = self.generate_embedding(query)
        if query_embedding is None:
            logger.error("Failed to generate embedding for the query")
            return []

        query_embedding = np.array([query_embedding])  # FAISS expects a 2-D array

        distances, indices = self.vector_store.search(query_embedding, top_k)

        # Map squared L2 distances to a (0, 1] similarity score.
        similarities = 1 / (1 + distances[0])

        results = []
        for idx, sim in zip(indices[0], similarities):
            if idx < 0 or idx >= len(self.chunks):  # FAISS pads with -1 when fewer hits exist
                continue

            chunk_data = self.chunks[idx]
            results.append({
                "text": chunk_data["text"],
                "similarity": float(sim),
                "doc_title": chunk_data["doc_title"],
                "doc_path": chunk_data["doc_path"],
                "chunk_index": chunk_data["chunk_index"],
                "page_number": chunk_data["page_number"]
            })

        results.sort(key=lambda x: x["similarity"], reverse=True)

        filtered = [r for r in results if r["similarity"] >= QUERY_SIMILARITY_THRESHOLD]

        # BUG FIX: the previous code sliced the already-filtered (empty) list
        # here, so the "return top result anyway" fallback always returned [].
        # Fall back to the best *unfiltered* hit instead.
        if not filtered and results and top_k > 0:
            logger.info("No chunks met similarity threshold, returning top result anyway")
            return results[:1]

        return filtered
|
| 990 |
+
|
| 991 |
+
class RAGSystem:
    """End-to-end retrieval-augmented Q&A: document indexing + Phi-2 generation.

    Wraps a DocumentProcessor for retrieval and a transformers text-generation
    pipeline for answering. If the generative model fails to load, Q&A is
    disabled but indexing still works (``self.phi_pipe`` is None).
    """

    def __init__(self):
        """Load the document index (if persisted) and the generative model.

        Raises:
            RuntimeError: if the DocumentProcessor itself cannot be created.
        """
        logger.info("Initializing RAG System...")
        try:
            self.doc_processor = DocumentProcessor(embedding_model_name=EMBEDDING_MODEL_NAME, device=DEVICE)

            # Resume from any previously persisted index.
            if self.doc_processor.load_state():
                logger.info("Successfully loaded existing document index")
            else:
                logger.info("Starting with a fresh document index")

            logger.info(f"Loading Generative LLM: {GENERATIVE_MODEL_NAME} on {DEVICE}...")
            try:
                phi_tokenizer = AutoTokenizer.from_pretrained(GENERATIVE_MODEL_NAME, trust_remote_code=True)
                model_kwargs = {"trust_remote_code": True}
                # Pick the cheapest dtype the hardware supports.
                if DEVICE == 'cuda':
                    if torch.cuda.is_bf16_supported():
                        logger.info("Using bfloat16 for Phi-2 model.")
                        model_kwargs["torch_dtype"] = torch.bfloat16
                    else:
                        logger.info("Using float16 for Phi-2 model.")
                        model_kwargs["torch_dtype"] = torch.float16
                else:
                    logger.info("Using float32 for Phi-2 model on CPU.")
                    model_kwargs["torch_dtype"] = torch.float32

                phi_model = AutoModelForCausalLM.from_pretrained(GENERATIVE_MODEL_NAME, **model_kwargs)
                phi_model = phi_model.to(DEVICE)

                # transformers pipeline wants a device index: 0 = first GPU, -1 = CPU.
                pipeline_device_index = 0 if DEVICE == "cuda" else -1
                self.phi_pipe = pipeline(
                    "text-generation",
                    model=phi_model,
                    tokenizer=phi_tokenizer,
                    device=pipeline_device_index
                )
                logger.info(f"✅ Generative LLM ({GENERATIVE_MODEL_NAME}) loaded successfully on {DEVICE}.")
            except Exception as e:
                # Degrade gracefully: retrieval still works without the LLM.
                logger.error(f"❌ Critical Error loading Phi-2 model: {e}")
                logger.error("RAG Q&A functionality will be disabled.")
                self.phi_pipe = None

            logger.info("✅ RAG System initialized successfully.")

        except Exception as e:
            logger.critical(f"Failed to initialize RAG System: {e}", exc_info=True)
            raise RuntimeError("System initialization failed.") from e

    def add_document(self, file_path: str) -> bool:
        """Index one document; delegates to DocumentProcessor.add_document."""
        return self.doc_processor.add_document(file_path)

    def ask_question(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        """Answer a question grounded in the indexed documents.

        Retrieves up to ``top_k`` chunks, builds a context-bound prompt, and
        generates an answer with Phi-2.

        Returns:
            dict with keys "answer", "sources" (deduped per document+page),
            "question_chunks" (preview info per retrieved chunk), and — on
            the success path only — "relevant_chunks" (raw retrieval output).
        """
        if self.phi_pipe is None:
            return {
                "answer": "Error: The AI model is not available. Please check the logs.",
                "sources": [],
                "question_chunks": []
            }

        logger.info(f"Processing question: '{question[:100]}...'")

        # Step 1: Retrieve relevant chunks
        relevant_chunks = self.doc_processor.search_chunks(question, top_k)
        if not relevant_chunks:
            return {
                "answer": "No relevant information found in documents to answer this question.",
                "sources": [],
                "question_chunks": []
            }

        # Step 2: Prepare context for generation (one labeled section per chunk)
        context = "\n\n---\n\n".join([
            f"Document: {chunk['doc_title']} (Page {chunk['page_number']})\nChunk {chunk['chunk_index']} (Similarity: {chunk['similarity']:.2f})\n\n{chunk['text']}"
            for chunk in relevant_chunks
        ])

        # Step 3: Generate answer with Phi-2, instructed to stay within context
        prompt = f"""You are a helpful assistant. Answer the question ONLY from the provided context. If the context is insufficient, just say you don't know.

Context:
{context}

Question: {question}

Answer: """

        try:
            output = self.phi_pipe(
                prompt,
                max_new_tokens=PHI_MAX_NEW_TOKENS,
                temperature=PHI_TEMPERATURE,
                do_sample=True,
                return_full_text=False,
                pad_token_id=self.phi_pipe.tokenizer.eos_token_id
            )

            generated_text = output[0]["generated_text"].strip()

            # Post-processing: the model sometimes continues with a new
            # "Question:" — truncate anything after it.
            if "Question:" in generated_text:
                generated_text = generated_text.split("Question:")[0].strip()

            # Extract sources, deduplicated per (document, page) pair
            sources = []
            seen_docs = set()
            for chunk in relevant_chunks:
                doc_key = f"{chunk['doc_title']}_page_{chunk['page_number']}"
                if doc_key not in seen_docs:
                    sources.append({
                        "document": chunk['doc_title'],
                        "page_number": chunk['page_number'],
                        "path": chunk['doc_path'],
                        "similarity": chunk['similarity']
                    })
                    seen_docs.add(doc_key)

            # Prepare chunk previews (first 200 chars) for display
            question_chunks = []
            for chunk in relevant_chunks:
                question_chunks.append({
                    "document": chunk['doc_title'],
                    "page_number": chunk['page_number'],
                    "chunk_index": chunk['chunk_index'],
                    "similarity": chunk['similarity'],
                    "text_preview": chunk['text'][:200] + "..." if len(chunk['text']) > 200 else chunk['text']
                })

            return {
                "answer": generated_text,
                "sources": sources,
                "question_chunks": question_chunks,
                "relevant_chunks": relevant_chunks  # For debugging/explanation
            }

        except Exception as e:
            logger.error(f"Error generating answer: {e}")
            return {
                "answer": f"Error generating answer: {str(e)}",
                "sources": [],
                "question_chunks": []
            }

    def explain_retrieval(self, question: str):
        """Print a step-by-step walkthrough of the RAG pipeline for a question.

        Educational/debugging aid; writes to stdout. Note that it performs
        retrieval once for the walkthrough and again inside ask_question.
        """
        print("\n=== RAG Process Explanation ===")
        print(f"Question: {question}")

        # Step 1: Show query embedding
        print("\n1. Query Embedding:")
        query_embedding = self.doc_processor.generate_embedding(question)
        if query_embedding is not None:
            print(f"- Generated {len(query_embedding)}-dimensional embedding vector")
            print(f"- Sample values: {query_embedding[:5]}...")
        else:
            print("Failed to generate query embedding")
            return

        # Step 2: Show retrieval
        print("\n2. Document Chunk Retrieval:")
        chunks = self.doc_processor.search_chunks(question, top_k=3)
        if not chunks:
            print("No relevant chunks found")
            return

        print(f"Found {len(chunks)} relevant chunks:")
        for i, chunk in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print(f"- Source: {chunk['doc_title']} (Page {chunk['page_number']})")
            print(f"- Chunk Index: {chunk['chunk_index']}")
            print(f"- Similarity Score: {chunk['similarity']:.4f}")
            print(f"- Text Preview: {chunk['text'][:150]}...")

        # Step 3: Show context preparation
        print("\n3. Context Preparation:")
        print("The top chunks are combined into a context that will be sent to the LLM")

        # Step 4: Show generation
        print("\n4. Generation with Phi-2:")
        print("The LLM is prompted to answer the question using ONLY the provided context")
        print("This helps prevent hallucination by grounding the response in the retrieved documents")

        # Show actual answer
        result = self.ask_question(question)
        print("\nFinal Answer:")
        print(result['answer'])

        print("\nSources with Page Numbers:")
        for source in result['sources']:
            print(f"- {source['document']} (Page {source['page_number']}, similarity: {source['similarity']:.2f})")

    def list_documents(self) -> List[Dict[str, Any]]:
        """Return summary info (title, path, chunk/page counts) for each indexed doc."""
        return [{
            "title": doc["title"],
            "path": doc["path"],
            "chunk_count": doc["chunk_count"],
            "total_pages": doc.get("total_pages", 1)
        } for doc in self.doc_processor.doc_metadata]

    def clear_index(self) -> bool:
        """Remove all indexed documents (in memory and on disk)."""
        return self.doc_processor.clear_state()

    def close(self):
        """Persist state and release model/GPU resources."""
        logger.info("Shutting down RAG System...")
        # Save state before closing
        self.doc_processor.save_state()

        if hasattr(self, 'phi_pipe') and self.phi_pipe:
            del self.phi_pipe
        if hasattr(self.doc_processor, 'embedding_model'):
            del self.doc_processor.embedding_model
        if DEVICE == 'cuda':
            torch.cuda.empty_cache()
            logger.info("Cleared CUDA cache.")
        logger.info("RAG System shut down.")
|
| 1211 |
+
|
| 1212 |
+
def main():
    """Interactive CLI loop for the RAG system.

    Presents a numbered menu (add document / ask / explain / list / clear /
    exit) and dispatches to the corresponding RAGSystem method until the
    user chooses Exit.
    """
    rag_system = RAGSystem()

    while True:
        print("\n1. Add Document")
        print("2. Ask Question")
        print("3. Explain Retrieval Process")
        print("4. List Indexed Documents")
        print("5. Clear All Documents")
        print("6. Exit")

        choice = input("Enter your choice: ")

        if choice == "1":
            # strip('"') handles paths pasted with surrounding quotes
            file_path = input("Enter document path (CSV, DOCX, PDF, etc.): ").strip('"')
            if not os.path.exists(file_path):
                print("File not found!")
                continue

            if rag_system.add_document(file_path):
                print("Document added successfully!")
            else:
                print("Failed to add document")

        elif choice == "2":
            question = input("Enter your question: ")
            result = rag_system.ask_question(question)
            print("\nAnswer:", result["answer"])
            if result["sources"]:
                print("\nSources:")
                for src in result["sources"]:
                    print(f"- {src['document']} (Page {src['page_number']}, similarity: {src['similarity']:.2f})")
            else:
                print("(No sources cited)")

            if result["question_chunks"]:
                print("\nRelevant Chunks:")
                for i, chunk in enumerate(result["question_chunks"], 1):
                    print(f"{i}. {chunk['document']} (Page {chunk['page_number']}, Chunk {chunk['chunk_index']})")
                    print(f"   Similarity: {chunk['similarity']:.2f}")
                    print(f"   Preview: {chunk['text_preview']}")
                    print()

        elif choice == "3":
            question = input("Enter a question to explain the retrieval process: ")
            rag_system.explain_retrieval(question)

        elif choice == "4":
            docs = rag_system.list_documents()
            if docs:
                print("\nIndexed Documents:")
                for i, doc in enumerate(docs, 1):
                    print(f"{i}. {doc['title']} ({doc['chunk_count']} chunks, {doc['total_pages']} pages)")
                    print(f"   Path: {doc['path']}")
            else:
                print("No documents indexed yet")

        elif choice == "5":
            # Destructive operation: require explicit confirmation.
            confirm = input("Are you sure you want to clear ALL documents? (y/n): ")
            if confirm.lower() == 'y':
                if rag_system.clear_index():
                    print("All documents cleared")
                else:
                    print("Failed to clear documents")

        elif choice == "6":
            rag_system.close()
            break

        else:
            print("Invalid choice")

if __name__ == "__main__":
    main()
|
README.md
CHANGED
|
@@ -1,11 +1,48 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: RAG
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Document Q&A with RAG
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "4.28.0"
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Document Q&A with RAG System
|
| 14 |
+
|
| 15 |
+
This is a Retrieval-Augmented Generation (RAG) system deployed on Hugging Face Spaces. It allows you to:
|
| 16 |
+
|
| 17 |
+
1. Upload documents (PDF, DOCX, TXT, CSV)
|
| 18 |
+
2. Ask questions about the content
|
| 19 |
+
3. Get answers grounded in your documents
|
| 20 |
+
|
| 21 |
+
## Features
|
| 22 |
+
|
| 23 |
+
- Supports multiple document formats
|
| 24 |
+
- Semantic search for relevant content
|
| 25 |
+
- Generative answers using Phi-2 model
|
| 26 |
+
- Persistent document storage
|
| 27 |
+
- Web interface and API endpoints
|
| 28 |
+
|
| 29 |
+
## How to Use
|
| 30 |
+
|
| 31 |
+
1. Upload documents using the upload form
|
| 32 |
+
2. Ask questions in natural language
|
| 33 |
+
3. View answers with cited sources
|
| 34 |
+
|
| 35 |
+
## Technical Details
|
| 36 |
+
|
| 37 |
+
- Embedding model: `all-MiniLM-L12-v2`
|
| 38 |
+
- Generative model: `microsoft/phi-2`
|
| 39 |
+
- Vector store: FAISS
|
| 40 |
+
- Web framework: FastAPI + Gradio
|
| 41 |
+
|
| 42 |
+
## Deployment
|
| 43 |
+
|
| 44 |
+
This app is automatically deployed on Hugging Face Spaces. To run locally:
|
| 45 |
+
|
| 46 |
+
```bash
|
| 47 |
+
pip install -r requirements.txt
|
| 48 |
+
python app.py
|
__pycache__/RAG.cpython-312.pyc
ADDED
|
Binary file (33.6 kB). View file
|
|
|
__pycache__/app.cpython-312.pyc
ADDED
|
Binary file (22.9 kB). View file
|
|
|
__pycache__/rag_system.cpython-312.pyc
ADDED
|
Binary file (25.1 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,1379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# from fastapi import FastAPI, File, UploadFile, HTTPException, Form, Request
|
| 2 |
+
# from fastapi.responses import HTMLResponse, JSONResponse
|
| 3 |
+
# from fastapi.staticfiles import StaticFiles
|
| 4 |
+
# from fastapi.templating import Jinja2Templates
|
| 5 |
+
# from pydantic import BaseModel
|
| 6 |
+
# import os
|
| 7 |
+
# import tempfile
|
| 8 |
+
# import shutil
|
| 9 |
+
# from typing import List, Dict, Any
|
| 10 |
+
# import logging
|
| 11 |
+
|
| 12 |
+
# import sys
|
| 13 |
+
# import os
|
| 14 |
+
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 15 |
+
|
| 16 |
+
# try:
|
| 17 |
+
# from RAG import RAGSystem
|
| 18 |
+
# except ImportError:
|
| 19 |
+
# print("Error: Cannot import RAGSystem from RAG.py")
|
| 20 |
+
# print("Make sure RAG.py is in the same directory as app.py")
|
| 21 |
+
# sys.exit(1)
|
| 22 |
+
|
| 23 |
+
# logging.basicConfig(level=logging.DEBUG)
|
| 24 |
+
# logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# app = FastAPI(title="RAG PDF QA System")
|
| 27 |
+
|
| 28 |
+
# # Setup templates directory
|
| 29 |
+
# templates = Jinja2Templates(directory="templates")
|
| 30 |
+
|
| 31 |
+
# # Try to mount static files directory
|
| 32 |
+
# try:
|
| 33 |
+
# app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 34 |
+
# except Exception as e:
|
| 35 |
+
# logger.warning(f"Static files directory not found: {e}")
|
| 36 |
+
|
| 37 |
+
# # Initialize RAG System
|
| 38 |
+
# try:
|
| 39 |
+
# rag_system = RAGSystem()
|
| 40 |
+
# logger.info("RAG System initialized successfully")
|
| 41 |
+
# except Exception as e:
|
| 42 |
+
# logger.error(f"Failed to initialize RAG System: {e}")
|
| 43 |
+
# rag_system = None
|
| 44 |
+
|
| 45 |
+
# class QuestionRequest(BaseModel):
|
| 46 |
+
# question: str
|
| 47 |
+
|
| 48 |
+
# @app.get("/", response_class=HTMLResponse)
|
| 49 |
+
# async def read_root(request: Request):
|
| 50 |
+
# try:
|
| 51 |
+
# return templates.TemplateResponse("index.html", {"request": request})
|
| 52 |
+
# except Exception as e:
|
| 53 |
+
# logger.error(f"Error serving index.html from templates folder: {e}")
|
| 54 |
+
# return HTMLResponse(content=f"""
|
| 55 |
+
# <html>
|
| 56 |
+
# <body>
|
| 57 |
+
# <h1>RAG PDF QA System</h1>
|
| 58 |
+
# <p>Error: Could not load index.html from templates folder</p>
|
| 59 |
+
# <p>Error details: {str(e)}</p>
|
| 60 |
+
# <p>Make sure you have:</p>
|
| 61 |
+
# <ul>
|
| 62 |
+
# <li>A 'templates' folder in the same directory as app.py</li>
|
| 63 |
+
# <li>index.html file inside the templates folder</li>
|
| 64 |
+
# <li>Installed jinja2: pip install jinja2</li>
|
| 65 |
+
# </ul>
|
| 66 |
+
# </body>
|
| 67 |
+
# </html>
|
| 68 |
+
# """)
|
| 69 |
+
|
| 70 |
+
# @app.post("/upload")
|
| 71 |
+
# async def upload_document(file: UploadFile = File(...)):
|
| 72 |
+
# try:
|
| 73 |
+
# if rag_system is None:
|
| 74 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 75 |
+
|
| 76 |
+
# if not file.filename:
|
| 77 |
+
# raise HTTPException(status_code=400, detail="No file selected")
|
| 78 |
+
|
| 79 |
+
# allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
|
| 80 |
+
# file_extension = os.path.splitext(file.filename)[1].lower()
|
| 81 |
+
|
| 82 |
+
# if file_extension not in allowed_extensions:
|
| 83 |
+
# raise HTTPException(status_code=400, detail=f"File type {file_extension} not supported. Supported types: {', '.join(allowed_extensions)}")
|
| 84 |
+
|
| 85 |
+
# # Create temporary file
|
| 86 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
|
| 87 |
+
# shutil.copyfileobj(file.file, temp_file)
|
| 88 |
+
# temp_path = temp_file.name
|
| 89 |
+
|
| 90 |
+
# # Process document
|
| 91 |
+
# success = rag_system.add_document(temp_path)
|
| 92 |
+
|
| 93 |
+
# # Clean up temporary file
|
| 94 |
+
# try:
|
| 95 |
+
# os.unlink(temp_path)
|
| 96 |
+
# except Exception as cleanup_error:
|
| 97 |
+
# logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
|
| 98 |
+
|
| 99 |
+
# if success:
|
| 100 |
+
# return JSONResponse(content={"message": f"Document '{file.filename}' uploaded and processed successfully"})
|
| 101 |
+
# else:
|
| 102 |
+
# raise HTTPException(status_code=500, detail="Failed to process document")
|
| 103 |
+
|
| 104 |
+
# except HTTPException:
|
| 105 |
+
# raise
|
| 106 |
+
# except Exception as e:
|
| 107 |
+
# logger.error(f"Upload error: {e}", exc_info=True)
|
| 108 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 109 |
+
|
| 110 |
+
# @app.post("/ask")
|
| 111 |
+
# async def ask_question(request: QuestionRequest):
|
| 112 |
+
# try:
|
| 113 |
+
# if rag_system is None:
|
| 114 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 115 |
+
|
| 116 |
+
# if not request.question.strip():
|
| 117 |
+
# raise HTTPException(status_code=400, detail="Question cannot be empty")
|
| 118 |
+
|
| 119 |
+
# result = rag_system.ask_question(request.question)
|
| 120 |
+
|
| 121 |
+
# return JSONResponse(content={
|
| 122 |
+
# "answer": result["answer"],
|
| 123 |
+
# "sources": result["sources"]
|
| 124 |
+
# })
|
| 125 |
+
|
| 126 |
+
# except HTTPException:
|
| 127 |
+
# raise
|
| 128 |
+
# except Exception as e:
|
| 129 |
+
# logger.error(f"Question error: {e}", exc_info=True)
|
| 130 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 131 |
+
|
| 132 |
+
# @app.get("/documents")
|
| 133 |
+
# async def get_documents():
|
| 134 |
+
# try:
|
| 135 |
+
# if rag_system is None:
|
| 136 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 137 |
+
|
| 138 |
+
# docs = rag_system.list_documents()
|
| 139 |
+
# return JSONResponse(content={"documents": docs})
|
| 140 |
+
# except HTTPException:
|
| 141 |
+
# raise
|
| 142 |
+
# except Exception as e:
|
| 143 |
+
# logger.error(f"Documents list error: {e}", exc_info=True)
|
| 144 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 145 |
+
|
| 146 |
+
# @app.delete("/clear")
|
| 147 |
+
# async def clear_documents():
|
| 148 |
+
# try:
|
| 149 |
+
# if rag_system is None:
|
| 150 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 151 |
+
|
| 152 |
+
# success = rag_system.clear_index()
|
| 153 |
+
# if success:
|
| 154 |
+
# return JSONResponse(content={"message": "All documents cleared successfully"})
|
| 155 |
+
# else:
|
| 156 |
+
# raise HTTPException(status_code=500, detail="Failed to clear documents")
|
| 157 |
+
# except HTTPException:
|
| 158 |
+
# raise
|
| 159 |
+
# except Exception as e:
|
| 160 |
+
# logger.error(f"Clear error: {e}", exc_info=True)
|
| 161 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 162 |
+
|
| 163 |
+
# @app.get("/health")
|
| 164 |
+
# async def health_check():
|
| 165 |
+
# return {
|
| 166 |
+
# "status": "healthy",
|
| 167 |
+
# "rag_system_initialized": rag_system is not None,
|
| 168 |
+
# "message": "RAG PDF QA System is running"
|
| 169 |
+
# }
|
| 170 |
+
|
| 171 |
+
# if __name__ == "__main__":
|
| 172 |
+
# import uvicorn
|
| 173 |
+
# logger.info("Starting FastAPI server...")
|
| 174 |
+
# uvicorn.run(app, host="0.0.0.0", port=8000, log_level="debug")
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# # second code
|
| 188 |
+
# from fastapi import FastAPI, File, UploadFile, HTTPException, Request
|
| 189 |
+
# from fastapi.responses import HTMLResponse, JSONResponse
|
| 190 |
+
# from fastapi.staticfiles import StaticFiles
|
| 191 |
+
# from fastapi.templating import Jinja2Templates
|
| 192 |
+
# from pydantic import BaseModel
|
| 193 |
+
# import os
|
| 194 |
+
# import tempfile
|
| 195 |
+
# import shutil
|
| 196 |
+
# from typing import List, Dict, Any
|
| 197 |
+
# import logging
|
| 198 |
+
|
| 199 |
+
# import sys
|
| 200 |
+
# import os
|
| 201 |
+
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 202 |
+
|
| 203 |
+
# try:
|
| 204 |
+
# from RAG import RAGSystem
|
| 205 |
+
# except ImportError:
|
| 206 |
+
# print("Error: Cannot import RAGSystem from RAG.py")
|
| 207 |
+
# print("Make sure RAG.py is in the same directory as app.py")
|
| 208 |
+
# sys.exit(1)
|
| 209 |
+
|
| 210 |
+
# logging.basicConfig(level=logging.INFO)
|
| 211 |
+
# logger = logging.getLogger(__name__)
|
| 212 |
+
|
| 213 |
+
# app = FastAPI(title="RAG PDF QA System")
|
| 214 |
+
|
| 215 |
+
# # Setup templates directory
|
| 216 |
+
# templates = Jinja2Templates(directory="templates")
|
| 217 |
+
|
| 218 |
+
# # Try to mount static files directory
|
| 219 |
+
# try:
|
| 220 |
+
# app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 221 |
+
# except Exception as e:
|
| 222 |
+
# logger.warning(f"Static files directory not found: {e}")
|
| 223 |
+
|
| 224 |
+
# # Initialize RAG System
|
| 225 |
+
# try:
|
| 226 |
+
# rag_system = RAGSystem()
|
| 227 |
+
# logger.info("RAG System initialized successfully")
|
| 228 |
+
# except Exception as e:
|
| 229 |
+
# logger.error(f"Failed to initialize RAG System: {e}")
|
| 230 |
+
# rag_system = None
|
| 231 |
+
|
| 232 |
+
# class QuestionRequest(BaseModel):
|
| 233 |
+
# question: str
|
| 234 |
+
# top_k: int = 3
|
| 235 |
+
|
| 236 |
+
# @app.get("/", response_class=HTMLResponse)
|
| 237 |
+
# async def read_root(request: Request):
|
| 238 |
+
# try:
|
| 239 |
+
# return templates.TemplateResponse("index.html", {"request": request})
|
| 240 |
+
# except Exception as e:
|
| 241 |
+
# logger.error(f"Error serving index.html from templates folder: {e}")
|
| 242 |
+
# return HTMLResponse(content=f"""
|
| 243 |
+
# <html>
|
| 244 |
+
# <body>
|
| 245 |
+
# <h1>RAG PDF QA System</h1>
|
| 246 |
+
# <p>Error: Could not load index.html from templates folder</p>
|
| 247 |
+
# <p>Error details: {str(e)}</p>
|
| 248 |
+
# <p>Make sure you have:</p>
|
| 249 |
+
# <ul>
|
| 250 |
+
# <li>A 'templates' folder in the same directory as app.py</li>
|
| 251 |
+
# <li>index.html file inside the templates folder</li>
|
| 252 |
+
# <li>Installed jinja2: pip install jinja2</li>
|
| 253 |
+
# </ul>
|
| 254 |
+
# </body>
|
| 255 |
+
# </html>
|
| 256 |
+
# """)
|
| 257 |
+
|
| 258 |
+
# @app.post("/upload")
|
| 259 |
+
# async def upload_document(file: UploadFile = File(...)):
|
| 260 |
+
# try:
|
| 261 |
+
# if rag_system is None:
|
| 262 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 263 |
+
|
| 264 |
+
# if not file.filename:
|
| 265 |
+
# raise HTTPException(status_code=400, detail="No file selected")
|
| 266 |
+
|
| 267 |
+
# allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
|
| 268 |
+
# file_extension = os.path.splitext(file.filename)[1].lower()
|
| 269 |
+
|
| 270 |
+
# if file_extension not in allowed_extensions:
|
| 271 |
+
# raise HTTPException(status_code=400, detail=f"File type {file_extension} not supported. Supported types: {', '.join(allowed_extensions)}")
|
| 272 |
+
|
| 273 |
+
# # Create temporary file
|
| 274 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
|
| 275 |
+
# shutil.copyfileobj(file.file, temp_file)
|
| 276 |
+
# temp_path = temp_file.name
|
| 277 |
+
|
| 278 |
+
# # Process document
|
| 279 |
+
# success = rag_system.add_document(temp_path)
|
| 280 |
+
|
| 281 |
+
# # Clean up temporary file
|
| 282 |
+
# try:
|
| 283 |
+
# os.unlink(temp_path)
|
| 284 |
+
# except Exception as cleanup_error:
|
| 285 |
+
# logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
|
| 286 |
+
|
| 287 |
+
# if success:
|
| 288 |
+
# return JSONResponse(content={"message": f"Document '{file.filename}' uploaded and processed successfully"})
|
| 289 |
+
# else:
|
| 290 |
+
# raise HTTPException(status_code=500, detail="Failed to process document")
|
| 291 |
+
|
| 292 |
+
# except HTTPException:
|
| 293 |
+
# raise
|
| 294 |
+
# except Exception as e:
|
| 295 |
+
# logger.error(f"Upload error: {e}", exc_info=True)
|
| 296 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 297 |
+
|
| 298 |
+
# @app.post("/ask")
|
| 299 |
+
# async def ask_question(request: QuestionRequest):
|
| 300 |
+
# try:
|
| 301 |
+
# if rag_system is None:
|
| 302 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 303 |
+
|
| 304 |
+
# if not request.question.strip():
|
| 305 |
+
# raise HTTPException(status_code=400, detail="Question cannot be empty")
|
| 306 |
+
|
| 307 |
+
# result = rag_system.ask_question(request.question, top_k=request.top_k)
|
| 308 |
+
|
| 309 |
+
# return JSONResponse(content={
|
| 310 |
+
# "answer": result["answer"],
|
| 311 |
+
# "sources": result["sources"],
|
| 312 |
+
# "question_chunks": result.get("question_chunks", []),
|
| 313 |
+
# "relevant_chunks": result.get("relevant_chunks", [])
|
| 314 |
+
# })
|
| 315 |
+
|
| 316 |
+
# except HTTPException:
|
| 317 |
+
# raise
|
| 318 |
+
# except Exception as e:
|
| 319 |
+
# logger.error(f"Question error: {e}", exc_info=True)
|
| 320 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 321 |
+
|
| 322 |
+
# @app.get("/documents")
|
| 323 |
+
# async def get_documents():
|
| 324 |
+
# try:
|
| 325 |
+
# if rag_system is None:
|
| 326 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 327 |
+
|
| 328 |
+
# docs = rag_system.list_documents()
|
| 329 |
+
# return JSONResponse(content={"documents": docs})
|
| 330 |
+
# except HTTPException:
|
| 331 |
+
# raise
|
| 332 |
+
# except Exception as e:
|
| 333 |
+
# logger.error(f"Documents list error: {e}", exc_info=True)
|
| 334 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 335 |
+
|
| 336 |
+
# @app.delete("/clear")
|
| 337 |
+
# async def clear_documents():
|
| 338 |
+
# try:
|
| 339 |
+
# if rag_system is None:
|
| 340 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 341 |
+
|
| 342 |
+
# success = rag_system.clear_index()
|
| 343 |
+
# if success:
|
| 344 |
+
# return JSONResponse(content={"message": "All documents cleared successfully"})
|
| 345 |
+
# else:
|
| 346 |
+
# raise HTTPException(status_code=500, detail="Failed to clear documents")
|
| 347 |
+
# except HTTPException:
|
| 348 |
+
# raise
|
| 349 |
+
# except Exception as e:
|
| 350 |
+
# logger.error(f"Clear error: {e}", exc_info=True)
|
| 351 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 352 |
+
|
| 353 |
+
# @app.post("/search")
|
| 354 |
+
# async def search_chunks(request: QuestionRequest):
|
| 355 |
+
# try:
|
| 356 |
+
# if rag_system is None:
|
| 357 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 358 |
+
|
| 359 |
+
# if not request.question.strip():
|
| 360 |
+
# raise HTTPException(status_code=400, detail="Search query cannot be empty")
|
| 361 |
+
|
| 362 |
+
# chunks = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)
|
| 363 |
+
|
| 364 |
+
# return JSONResponse(content={
|
| 365 |
+
# "query": request.question,
|
| 366 |
+
# "chunks": chunks,
|
| 367 |
+
# "total_found": len(chunks)
|
| 368 |
+
# })
|
| 369 |
+
|
| 370 |
+
# except HTTPException:
|
| 371 |
+
# raise
|
| 372 |
+
# except Exception as e:
|
| 373 |
+
# logger.error(f"Search error: {e}", exc_info=True)
|
| 374 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 375 |
+
|
| 376 |
+
# @app.get("/health")
|
| 377 |
+
# async def health_check():
|
| 378 |
+
# return {
|
| 379 |
+
# "status": "healthy",
|
| 380 |
+
# "rag_system_initialized": rag_system is not None,
|
| 381 |
+
# "message": "RAG PDF QA System is running",
|
| 382 |
+
# "indexed_documents": len(rag_system.list_documents()) if rag_system else 0
|
| 383 |
+
# }
|
| 384 |
+
|
| 385 |
+
# @app.get("/stats")
|
| 386 |
+
# async def get_stats():
|
| 387 |
+
# try:
|
| 388 |
+
# if rag_system is None:
|
| 389 |
+
# raise HTTPException(status_code=500, detail="RAG System not initialized")
|
| 390 |
+
|
| 391 |
+
# docs = rag_system.list_documents()
|
| 392 |
+
|
| 393 |
+
# total_chunks = sum(doc.get("chunk_count", 0) for doc in docs)
|
| 394 |
+
# total_pages = sum(doc.get("total_pages", 1) for doc in docs)
|
| 395 |
+
|
| 396 |
+
# return JSONResponse(content={
|
| 397 |
+
# "total_documents": len(docs),
|
| 398 |
+
# "total_chunks": total_chunks,
|
| 399 |
+
# "total_pages": total_pages,
|
| 400 |
+
# "documents": docs
|
| 401 |
+
# })
|
| 402 |
+
|
| 403 |
+
# except HTTPException:
|
| 404 |
+
# raise
|
| 405 |
+
# except Exception as e:
|
| 406 |
+
# logger.error(f"Stats error: {e}", exc_info=True)
|
| 407 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 408 |
+
|
| 409 |
+
# @app.on_event("shutdown")
|
| 410 |
+
# async def shutdown_event():
|
| 411 |
+
# if rag_system:
|
| 412 |
+
# rag_system.close()
|
| 413 |
+
# logger.info("RAG System closed gracefully")
|
| 414 |
+
|
| 415 |
+
# if __name__ == "__main__":
|
| 416 |
+
# import uvicorn
|
| 417 |
+
# logger.info("Starting FastAPI server...")
|
| 418 |
+
# uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# complete code
|
| 423 |
+
# from fastapi import FastAPI, File, UploadFile, HTTPException, Request
|
| 424 |
+
# from fastapi.responses import HTMLResponse, JSONResponse
|
| 425 |
+
# from fastapi.staticfiles import StaticFiles
|
| 426 |
+
# from fastapi.templating import Jinja2Templates
|
| 427 |
+
# from pydantic import BaseModel
|
| 428 |
+
# import os
|
| 429 |
+
# import tempfile
|
| 430 |
+
# import shutil
|
| 431 |
+
# from typing import List, Dict, Any, Optional, Union
|
| 432 |
+
# import logging
|
| 433 |
+
# from datetime import datetime
|
| 434 |
+
# import mimetypes
|
| 435 |
+
|
| 436 |
+
# # Import your RAG system
|
| 437 |
+
# import sys
|
| 438 |
+
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 439 |
+
# try:
|
| 440 |
+
# from RAG import RAGSystem
|
| 441 |
+
# except ImportError:
|
| 442 |
+
# print("Error: Cannot import RAGSystem from RAG.py")
|
| 443 |
+
# print("Make sure RAG.py is in the same directory as app.py")
|
| 444 |
+
# sys.exit(1)
|
| 445 |
+
|
| 446 |
+
# # Configure logging
|
| 447 |
+
# logging.basicConfig(
|
| 448 |
+
# level=logging.INFO,
|
| 449 |
+
# format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 450 |
+
# )
|
| 451 |
+
# logger = logging.getLogger(__name__)
|
| 452 |
+
|
| 453 |
+
# # Initialize FastAPI app
|
| 454 |
+
# app = FastAPI(
|
| 455 |
+
# title="Scholar's Archive - Document Intelligence System",
|
| 456 |
+
# description="A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology",
|
| 457 |
+
# version="1.0.0",
|
| 458 |
+
# docs_url="/api/docs",
|
| 459 |
+
# redoc_url="/api/redoc"
|
| 460 |
+
# )
|
| 461 |
+
|
| 462 |
+
# # Setup templates directory
|
| 463 |
+
# templates = Jinja2Templates(directory="templates")
|
| 464 |
+
|
| 465 |
+
# # Try to mount static files directory
|
| 466 |
+
# try:
|
| 467 |
+
# app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 468 |
+
# except Exception as e:
|
| 469 |
+
# logger.warning(f"Static files directory not found: {e}")
|
| 470 |
+
|
| 471 |
+
# # Initialize RAG System
|
| 472 |
+
# try:
|
| 473 |
+
# rag_system = RAGSystem()
|
| 474 |
+
# logger.info("Scholar's Archive RAG System initialized successfully")
|
| 475 |
+
# except Exception as e:
|
| 476 |
+
# logger.error(f"Failed to initialize RAG System: {e}")
|
| 477 |
+
# rag_system = None
|
| 478 |
+
|
| 479 |
+
# # Pydantic models
|
| 480 |
+
# class QuestionRequest(BaseModel):
|
| 481 |
+
# question: str
|
| 482 |
+
# top_k: int = 3
|
| 483 |
+
|
| 484 |
+
# class DocumentInfo(BaseModel):
|
| 485 |
+
# title: str
|
| 486 |
+
# file_type: str
|
| 487 |
+
# upload_date: str
|
| 488 |
+
# chunk_count: int
|
| 489 |
+
# total_pages: Optional[int] = None
|
| 490 |
+
|
| 491 |
+
# # Fixed AnswerResponse model to handle both strings and dictionaries
|
| 492 |
+
# class AnswerResponse(BaseModel):
|
| 493 |
+
# answer: str
|
| 494 |
+
# sources: List[Dict[str, Any]]
|
| 495 |
+
# question_chunks: List[Union[str, Dict[str, Any]]] = []
|
| 496 |
+
# relevant_chunks: List[Union[str, Dict[str, Any]]] = []
|
| 497 |
+
|
| 498 |
+
# class StatsResponse(BaseModel):
|
| 499 |
+
# total_documents: int
|
| 500 |
+
# total_chunks: int
|
| 501 |
+
# total_pages: int
|
| 502 |
+
# documents: List[DocumentInfo]
|
| 503 |
+
|
| 504 |
+
# # Utility functions
|
| 505 |
+
# def get_file_type_icon(filename: str) -> str:
|
| 506 |
+
# """Get appropriate icon for file type"""
|
| 507 |
+
# ext = os.path.splitext(filename)[1].lower()
|
| 508 |
+
# icons = {
|
| 509 |
+
# '.pdf': 'fas fa-file-pdf',
|
| 510 |
+
# '.docx': 'fas fa-file-word',
|
| 511 |
+
# '.txt': 'fas fa-file-alt',
|
| 512 |
+
# '.csv': 'fas fa-file-csv'
|
| 513 |
+
# }
|
| 514 |
+
# return icons.get(ext, 'fas fa-file')
|
| 515 |
+
|
| 516 |
+
# def format_file_size(size_bytes: int) -> str:
|
| 517 |
+
# """Format file size in human readable format"""
|
| 518 |
+
# if size_bytes == 0:
|
| 519 |
+
# return "0 B"
|
| 520 |
+
# size_names = ["B", "KB", "MB", "GB"]
|
| 521 |
+
# i = 0
|
| 522 |
+
# while size_bytes >= 1024 and i < len(size_names) - 1:
|
| 523 |
+
# size_bytes /= 1024.0
|
| 524 |
+
# i += 1
|
| 525 |
+
# return f"{size_bytes:.1f} {size_names[i]}"
|
| 526 |
+
|
| 527 |
+
# def extract_content_from_chunks(chunks):
|
| 528 |
+
# """Extract string content from chunk data structures"""
|
| 529 |
+
# if not chunks:
|
| 530 |
+
# return []
|
| 531 |
+
|
| 532 |
+
# extracted = []
|
| 533 |
+
# for chunk in chunks:
|
| 534 |
+
# if isinstance(chunk, str):
|
| 535 |
+
# extracted.append(chunk)
|
| 536 |
+
# elif isinstance(chunk, dict):
|
| 537 |
+
# # Try different possible keys for text content
|
| 538 |
+
# content = chunk.get('text') or chunk.get('content') or chunk.get('document') or str(chunk)
|
| 539 |
+
# extracted.append(content)
|
| 540 |
+
# else:
|
| 541 |
+
# extracted.append(str(chunk))
|
| 542 |
+
|
| 543 |
+
# return extracted
|
| 544 |
+
|
| 545 |
+
# # Routes
|
| 546 |
+
# @app.get("/", response_class=HTMLResponse)
|
| 547 |
+
# async def read_root(request: Request):
|
| 548 |
+
# """Serve the main classical interface"""
|
| 549 |
+
# try:
|
| 550 |
+
# return templates.TemplateResponse("index.html", {"request": request})
|
| 551 |
+
# except Exception as e:
|
| 552 |
+
# logger.error(f"Error serving index.html from templates folder: {e}")
|
| 553 |
+
# # Return the embedded HTML if templates folder is not available
|
| 554 |
+
# with open("scholar_archive.html", "r", encoding="utf-8") as f:
|
| 555 |
+
# html_content = f.read()
|
| 556 |
+
# return HTMLResponse(content=html_content)
|
| 557 |
+
|
| 558 |
+
# @app.post("/upload")
|
| 559 |
+
# async def upload_document(file: UploadFile = File(...)):
|
| 560 |
+
# """Upload and process a document"""
|
| 561 |
+
# try:
|
| 562 |
+
# if rag_system is None:
|
| 563 |
+
# raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
|
| 564 |
+
|
| 565 |
+
# if not file.filename:
|
| 566 |
+
# raise HTTPException(status_code=400, detail="No file selected")
|
| 567 |
+
|
| 568 |
+
# # Validate file type
|
| 569 |
+
# allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
|
| 570 |
+
# file_extension = os.path.splitext(file.filename)[1].lower()
|
| 571 |
+
|
| 572 |
+
# if file_extension not in allowed_extensions:
|
| 573 |
+
# raise HTTPException(
|
| 574 |
+
# status_code=400,
|
| 575 |
+
# detail=f"File type {file_extension} not supported. Supported formats: {', '.join(allowed_extensions)}"
|
| 576 |
+
# )
|
| 577 |
+
|
| 578 |
+
# # Check file size (limit to 50MB)
|
| 579 |
+
# file_size = 0
|
| 580 |
+
# content = await file.read()
|
| 581 |
+
# file_size = len(content)
|
| 582 |
+
|
| 583 |
+
# if file_size > 50 * 1024 * 1024: # 50MB limit
|
| 584 |
+
# raise HTTPException(status_code=400, detail="File size too large. Maximum size is 50MB")
|
| 585 |
+
|
| 586 |
+
# # Create temporary file
|
| 587 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
|
| 588 |
+
# temp_file.write(content)
|
| 589 |
+
# temp_path = temp_file.name
|
| 590 |
+
|
| 591 |
+
# logger.info(f"Processing document: {file.filename} ({format_file_size(file_size)})")
|
| 592 |
+
|
| 593 |
+
# # Process document
|
| 594 |
+
# success = rag_system.add_document(temp_path)
|
| 595 |
+
|
| 596 |
+
# # Clean up temporary file
|
| 597 |
+
# try:
|
| 598 |
+
# os.unlink(temp_path)
|
| 599 |
+
# except Exception as cleanup_error:
|
| 600 |
+
# logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
|
| 601 |
+
|
| 602 |
+
# if success:
|
| 603 |
+
# logger.info(f"Successfully processed document: {file.filename}")
|
| 604 |
+
# return JSONResponse(content={
|
| 605 |
+
# "message": f"Document '{file.filename}' has been successfully added to the Scholar's Archive",
|
| 606 |
+
# "filename": file.filename,
|
| 607 |
+
# "size": format_file_size(file_size),
|
| 608 |
+
# "type": file_extension
|
| 609 |
+
# })
|
| 610 |
+
# else:
|
| 611 |
+
# raise HTTPException(status_code=500, detail="Failed to process document")
|
| 612 |
+
|
| 613 |
+
# except HTTPException:
|
| 614 |
+
# raise
|
| 615 |
+
# except Exception as e:
|
| 616 |
+
# logger.error(f"Upload error: {e}", exc_info=True)
|
| 617 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 618 |
+
|
| 619 |
+
# @app.post("/ask", response_model=AnswerResponse)
|
| 620 |
+
# async def ask_question(request: QuestionRequest):
|
| 621 |
+
# """Ask a question about the uploaded documents"""
|
| 622 |
+
# try:
|
| 623 |
+
# if rag_system is None:
|
| 624 |
+
# raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
|
| 625 |
+
|
| 626 |
+
# if not request.question.strip():
|
| 627 |
+
# raise HTTPException(status_code=400, detail="Question cannot be empty")
|
| 628 |
+
|
| 629 |
+
# logger.info(f"Processing question: {request.question[:100]}...")
|
| 630 |
+
|
| 631 |
+
# # Get answer from RAG system
|
| 632 |
+
# result = rag_system.ask_question(request.question, top_k=request.top_k)
|
| 633 |
+
|
| 634 |
+
# # Handle the chunks data properly
|
| 635 |
+
# question_chunks = result.get("question_chunks", [])
|
| 636 |
+
# relevant_chunks = result.get("relevant_chunks", [])
|
| 637 |
+
|
| 638 |
+
# # Log the structure to understand what we're getting
|
| 639 |
+
# logger.info(f"Question chunks type: {type(question_chunks)}")
|
| 640 |
+
# logger.info(f"Relevant chunks type: {type(relevant_chunks)}")
|
| 641 |
+
# if question_chunks:
|
| 642 |
+
# logger.info(f"First question chunk type: {type(question_chunks[0])}")
|
| 643 |
+
# if relevant_chunks:
|
| 644 |
+
# logger.info(f"First relevant chunk type: {type(relevant_chunks[0])}")
|
| 645 |
+
|
| 646 |
+
# # Format the response - keep original structure but ensure it's serializable
|
| 647 |
+
# response = AnswerResponse(
|
| 648 |
+
# answer=result["answer"],
|
| 649 |
+
# sources=result["sources"],
|
| 650 |
+
# question_chunks=question_chunks,
|
| 651 |
+
# relevant_chunks=relevant_chunks
|
| 652 |
+
# )
|
| 653 |
+
|
| 654 |
+
# logger.info(f"Successfully answered question with {len(result['sources'])} sources")
|
| 655 |
+
|
| 656 |
+
# return response
|
| 657 |
+
|
| 658 |
+
# except HTTPException:
|
| 659 |
+
# raise
|
| 660 |
+
# except Exception as e:
|
| 661 |
+
# logger.error(f"Question processing error: {e}", exc_info=True)
|
| 662 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 663 |
+
|
| 664 |
+
# @app.get("/documents")
|
| 665 |
+
# async def get_documents():
|
| 666 |
+
# """Get list of all uploaded documents"""
|
| 667 |
+
# try:
|
| 668 |
+
# if rag_system is None:
|
| 669 |
+
# raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
|
| 670 |
+
|
| 671 |
+
# docs = rag_system.list_documents()
|
| 672 |
+
|
| 673 |
+
# # Format documents with additional metadata
|
| 674 |
+
# formatted_docs = []
|
| 675 |
+
# for doc in docs:
|
| 676 |
+
# formatted_doc = {
|
| 677 |
+
# "title": doc.get("title", "Unknown Document"),
|
| 678 |
+
# "chunk_count": doc.get("chunk_count", 0),
|
| 679 |
+
# "total_pages": doc.get("total_pages"),
|
| 680 |
+
# "file_type": os.path.splitext(doc.get("title", ""))[1].lower(),
|
| 681 |
+
# "upload_date": doc.get("upload_date", datetime.now().isoformat()),
|
| 682 |
+
# "icon": get_file_type_icon(doc.get("title", ""))
|
| 683 |
+
# }
|
| 684 |
+
# formatted_docs.append(formatted_doc)
|
| 685 |
+
|
| 686 |
+
# return JSONResponse(content={"documents": formatted_docs})
|
| 687 |
+
|
| 688 |
+
# except HTTPException:
|
| 689 |
+
# raise
|
| 690 |
+
# except Exception as e:
|
| 691 |
+
# logger.error(f"Documents list error: {e}", exc_info=True)
|
| 692 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 693 |
+
|
| 694 |
+
# @app.get("/stats", response_model=StatsResponse)
|
| 695 |
+
# async def get_stats():
|
| 696 |
+
# """Get statistics about the document collection"""
|
| 697 |
+
# try:
|
| 698 |
+
# if rag_system is None:
|
| 699 |
+
# raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
|
| 700 |
+
|
| 701 |
+
# docs = rag_system.list_documents()
|
| 702 |
+
|
| 703 |
+
# total_chunks = sum(doc.get("chunk_count", 0) for doc in docs)
|
| 704 |
+
# total_pages = sum(doc.get("total_pages", 1) for doc in docs if doc.get("total_pages"))
|
| 705 |
+
|
| 706 |
+
# # Format documents
|
| 707 |
+
# formatted_docs = []
|
| 708 |
+
# for doc in docs:
|
| 709 |
+
# formatted_doc = DocumentInfo(
|
| 710 |
+
# title=doc.get("title", "Unknown Document"),
|
| 711 |
+
# file_type=os.path.splitext(doc.get("title", ""))[1].lower(),
|
| 712 |
+
# upload_date=doc.get("upload_date", datetime.now().isoformat()),
|
| 713 |
+
# chunk_count=doc.get("chunk_count", 0),
|
| 714 |
+
# total_pages=doc.get("total_pages")
|
| 715 |
+
# )
|
| 716 |
+
# formatted_docs.append(formatted_doc)
|
| 717 |
+
|
| 718 |
+
# stats = StatsResponse(
|
| 719 |
+
# total_documents=len(docs),
|
| 720 |
+
# total_chunks=total_chunks,
|
| 721 |
+
# total_pages=total_pages,
|
| 722 |
+
# documents=formatted_docs
|
| 723 |
+
# )
|
| 724 |
+
|
| 725 |
+
# return stats
|
| 726 |
+
|
| 727 |
+
# except HTTPException:
|
| 728 |
+
# raise
|
| 729 |
+
# except Exception as e:
|
| 730 |
+
# logger.error(f"Stats error: {e}", exc_info=True)
|
| 731 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 732 |
+
|
| 733 |
+
# @app.delete("/clear")
|
| 734 |
+
# async def clear_documents():
|
| 735 |
+
# """Clear all documents from the archive"""
|
| 736 |
+
# try:
|
| 737 |
+
# if rag_system is None:
|
| 738 |
+
# raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
|
| 739 |
+
|
| 740 |
+
# logger.info("Clearing all documents from Scholar's Archive")
|
| 741 |
+
|
| 742 |
+
# success = rag_system.clear_index()
|
| 743 |
+
# if success:
|
| 744 |
+
# logger.info("Successfully cleared all documents")
|
| 745 |
+
# return JSONResponse(content={
|
| 746 |
+
# "message": "All documents have been successfully removed from the Scholar's Archive"
|
| 747 |
+
# })
|
| 748 |
+
# else:
|
| 749 |
+
# raise HTTPException(status_code=500, detail="Failed to clear documents")
|
| 750 |
+
|
| 751 |
+
# except HTTPException:
|
| 752 |
+
# raise
|
| 753 |
+
# except Exception as e:
|
| 754 |
+
# logger.error(f"Clear error: {e}", exc_info=True)
|
| 755 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 756 |
+
|
| 757 |
+
# @app.post("/search")
|
| 758 |
+
# async def search_chunks(request: QuestionRequest):
|
| 759 |
+
# """Search for relevant document chunks"""
|
| 760 |
+
# try:
|
| 761 |
+
# if rag_system is None:
|
| 762 |
+
# raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
|
| 763 |
+
|
| 764 |
+
# if not request.question.strip():
|
| 765 |
+
# raise HTTPException(status_code=400, detail="Search query cannot be empty")
|
| 766 |
+
|
| 767 |
+
# logger.info(f"Searching chunks for: {request.question[:100]}...")
|
| 768 |
+
|
| 769 |
+
# chunks = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)
|
| 770 |
+
|
| 771 |
+
# # Format chunks with additional metadata
|
| 772 |
+
# formatted_chunks = []
|
| 773 |
+
# for chunk in chunks:
|
| 774 |
+
# formatted_chunk = {
|
| 775 |
+
# "content": chunk.get("content", ""),
|
| 776 |
+
# "document": chunk.get("document", "Unknown"),
|
| 777 |
+
# "similarity": chunk.get("similarity", 0.0),
|
| 778 |
+
# "page": chunk.get("page"),
|
| 779 |
+
# "chunk_index": chunk.get("chunk_index")
|
| 780 |
+
# }
|
| 781 |
+
# formatted_chunks.append(formatted_chunk)
|
| 782 |
+
|
| 783 |
+
# return JSONResponse(content={
|
| 784 |
+
# "query": request.question,
|
| 785 |
+
# "chunks": formatted_chunks,
|
| 786 |
+
# "total_found": len(formatted_chunks)
|
| 787 |
+
# })
|
| 788 |
+
|
| 789 |
+
# except HTTPException:
|
| 790 |
+
# raise
|
| 791 |
+
# except Exception as e:
|
| 792 |
+
# logger.error(f"Search error: {e}", exc_info=True)
|
| 793 |
+
# raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 794 |
+
|
| 795 |
+
# @app.get("/health")
|
| 796 |
+
# async def health_check():
|
| 797 |
+
# """Health check endpoint"""
|
| 798 |
+
# try:
|
| 799 |
+
# doc_count = len(rag_system.list_documents()) if rag_system else 0
|
| 800 |
+
|
| 801 |
+
# return {
|
| 802 |
+
# "status": "healthy",
|
| 803 |
+
# "service": "Scholar's Archive - Document Intelligence System",
|
| 804 |
+
# "version": "1.0.0",
|
| 805 |
+
# "rag_system_initialized": rag_system is not None,
|
| 806 |
+
# "indexed_documents": doc_count,
|
| 807 |
+
# "timestamp": datetime.now().isoformat(),
|
| 808 |
+
# "message": "Scholar's Archive is operational and ready to serve"
|
| 809 |
+
# }
|
| 810 |
+
# except Exception as e:
|
| 811 |
+
# logger.error(f"Health check error: {e}")
|
| 812 |
+
# return {
|
| 813 |
+
# "status": "degraded",
|
| 814 |
+
# "service": "Scholar's Archive - Document Intelligence System",
|
| 815 |
+
# "error": str(e),
|
| 816 |
+
# "timestamp": datetime.now().isoformat()
|
| 817 |
+
# }
|
| 818 |
+
|
| 819 |
+
# @app.get("/api/info")
|
| 820 |
+
# async def api_info():
|
| 821 |
+
# """Get API information"""
|
| 822 |
+
# return {
|
| 823 |
+
# "name": "Scholar's Archive API",
|
| 824 |
+
# "description": "Document Intelligence System API",
|
| 825 |
+
# "version": "1.0.0",
|
| 826 |
+
# "endpoints": {
|
| 827 |
+
# "upload": "POST /upload - Upload documents",
|
| 828 |
+
# "ask": "POST /ask - Ask questions",
|
| 829 |
+
# "documents": "GET /documents - List documents",
|
| 830 |
+
# "stats": "GET /stats - Get statistics",
|
| 831 |
+
# "search": "POST /search - Search chunks",
|
| 832 |
+
# "clear": "DELETE /clear - Clear all documents",
|
| 833 |
+
# "health": "GET /health - Health check"
|
| 834 |
+
# },
|
| 835 |
+
# "supported_formats": [".pdf", ".docx", ".txt", ".csv"],
|
| 836 |
+
# "max_file_size": "50MB"
|
| 837 |
+
# }
|
| 838 |
+
|
| 839 |
+
# # Event handlers
|
| 840 |
+
# @app.on_event("startup")
|
| 841 |
+
# async def startup_event():
|
| 842 |
+
# """Application startup event"""
|
| 843 |
+
# logger.info("Starting Scholar's Archive - Document Intelligence System")
|
| 844 |
+
# logger.info("System initialized and ready to serve scholarly inquiries")
|
| 845 |
+
|
| 846 |
+
# @app.on_event("shutdown")
|
| 847 |
+
# async def shutdown_event():
|
| 848 |
+
# """Application shutdown event"""
|
| 849 |
+
# if rag_system:
|
| 850 |
+
# rag_system.close()
|
| 851 |
+
# logger.info("Scholar's Archive system closed gracefully")
|
| 852 |
+
# logger.info("Scholar's Archive shutdown complete")
|
| 853 |
+
|
| 854 |
+
# # Error handlers
|
| 855 |
+
# @app.exception_handler(404)
|
| 856 |
+
# async def not_found_handler(request: Request, exc):
|
| 857 |
+
# """Custom 404 handler"""
|
| 858 |
+
# return JSONResponse(
|
| 859 |
+
# status_code=404,
|
| 860 |
+
# content={
|
| 861 |
+
# "detail": "The requested resource was not found in the Scholar's Archive",
|
| 862 |
+
# "path": str(request.url.path)
|
| 863 |
+
# }
|
| 864 |
+
# )
|
| 865 |
+
|
| 866 |
+
# @app.exception_handler(500)
|
| 867 |
+
# async def internal_error_handler(request: Request, exc):
|
| 868 |
+
# """Custom 500 handler"""
|
| 869 |
+
# logger.error(f"Internal server error: {exc}")
|
| 870 |
+
# return JSONResponse(
|
| 871 |
+
# status_code=500,
|
| 872 |
+
# content={
|
| 873 |
+
# "detail": "An internal error occurred in the Scholar's Archive system",
|
| 874 |
+
# "message": "Please try again later or contact support"
|
| 875 |
+
# }
|
| 876 |
+
# )
|
| 877 |
+
|
| 878 |
+
# # Main execution
|
| 879 |
+
# if __name__ == "__main__":
|
| 880 |
+
# import uvicorn
|
| 881 |
+
|
| 882 |
+
# logger.info("Launching Scholar's Archive - Document Intelligence System")
|
| 883 |
+
# logger.info("Access the interface at: http://localhost:8000")
|
| 884 |
+
# logger.info("API documentation at: http://localhost:8000/api/docs")
|
| 885 |
+
|
| 886 |
+
# uvicorn.run(
|
| 887 |
+
# app,
|
| 888 |
+
# host="0.0.0.0",
|
| 889 |
+
# port=7860,
|
| 890 |
+
# log_level="info",
|
| 891 |
+
# reload=False,
|
| 892 |
+
# access_log=True
|
| 893 |
+
# )
|
| 894 |
+
|
| 895 |
+
|
| 896 |
+
|
| 897 |
+
|
| 898 |
+
|
| 899 |
+
# NOTE: the commented-out block above is an earlier draft of this app,
# superseded by the active implementation below.
|
| 900 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException, Request
|
| 901 |
+
from fastapi.responses import HTMLResponse, JSONResponse
|
| 902 |
+
from fastapi.staticfiles import StaticFiles
|
| 903 |
+
from fastapi.templating import Jinja2Templates
|
| 904 |
+
from pydantic import BaseModel
|
| 905 |
+
import os
|
| 906 |
+
import tempfile
|
| 907 |
+
import shutil
|
| 908 |
+
from typing import List, Dict, Any, Optional, Union
|
| 909 |
+
import logging
|
| 910 |
+
from datetime import datetime
|
| 911 |
+
import mimetypes
|
| 912 |
+
|
| 913 |
+
# Import your RAG system
|
| 914 |
+
import sys
|
| 915 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 916 |
+
try:
|
| 917 |
+
from RAG import RAGSystem
|
| 918 |
+
except ImportError:
|
| 919 |
+
print("Error: Cannot import RAGSystem from RAG.py")
|
| 920 |
+
print("Make sure RAG.py is in the same directory as app.py")
|
| 921 |
+
sys.exit(1)
|
| 922 |
+
|
| 923 |
+
# Configure logging
# Timestamped records at INFO and above for the whole process.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by every route handler below.
logger = logging.getLogger(__name__)
|
| 929 |
+
|
| 930 |
+
# Initialize FastAPI app
# Swagger/ReDoc are served under /api/* so "/" stays free for the HTML UI.
app = FastAPI(
    title="Scholar's Archive - Document Intelligence System",
    description="A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology",
    version="1.0.0",
    docs_url="/api/docs",
    redoc_url="/api/redoc"
)
|
| 938 |
+
|
| 939 |
+
# Setup templates directory
# Jinja2 templates; read_root() expects templates/index.html here.
templates = Jinja2Templates(directory="templates")

# Try to mount static files directory
# Static assets are optional: a missing "static" dir only logs a warning.
try:
    app.mount("/static", StaticFiles(directory="static"), name="static")
except Exception as e:
    logger.warning(f"Static files directory not found: {e}")
|
| 947 |
+
|
| 948 |
+
# Initialize RAG System
# On failure rag_system stays None; every route guards against that and
# answers with HTTP 500 instead of crashing at import time.
try:
    rag_system = RAGSystem()
    logger.info("Scholar's Archive RAG System initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize RAG System: {e}")
    rag_system = None
|
| 955 |
+
|
| 956 |
+
# Pydantic models
class QuestionRequest(BaseModel):
    """Request body for /ask and /search."""
    question: str  # natural-language query text
    top_k: int = 3  # number of top-ranked chunks to retrieve
|
| 960 |
+
|
| 961 |
+
class DocumentInfo(BaseModel):
    """Summary metadata for one indexed document (used by /stats)."""
    title: str  # original filename
    file_type: str  # lowercase extension, e.g. ".pdf"
    upload_date: str  # ISO-8601 timestamp
    chunk_count: int  # number of indexed chunks
    total_pages: Optional[int] = None  # page count, when known
|
| 967 |
+
|
| 968 |
+
class AnswerResponse(BaseModel):
    """Response body for /ask."""
    answer: str  # generated answer text
    sources: List[Dict[str, Any]]  # metadata for the cited source chunks
    question_chunks: List[Union[str, Dict[str, Any]]] = []  # chunks derived from the question
    relevant_chunks: List[Union[str, Dict[str, Any]]] = []  # retrieved context chunks
|
| 973 |
+
|
| 974 |
+
class StatsResponse(BaseModel):
    """Response body for /stats."""
    total_documents: int  # number of indexed documents
    total_chunks: int  # sum of per-document chunk counts
    total_pages: int  # sum of the known page counts
    documents: List[DocumentInfo]  # per-document summaries
|
| 979 |
+
|
| 980 |
+
# Utility functions
def get_file_type_icon(filename: str) -> str:
    """Return the Font Awesome icon class for a filename's extension.

    Unknown or missing extensions fall back to the generic file icon.
    """
    icon_map = {
        '.pdf': 'fas fa-file-pdf',
        '.docx': 'fas fa-file-word',
        '.txt': 'fas fa-file-alt',
        '.csv': 'fas fa-file-csv',
    }
    extension = os.path.splitext(filename)[1].lower()
    return icon_map.get(extension, 'fas fa-file')
|
| 991 |
+
|
| 992 |
+
def format_file_size(size_bytes: int) -> str:
    """Format a byte count as a human-readable string (B, KB, MB, GB).

    Values are divided by 1024 until they fit the largest unit; output
    always carries one decimal place except for the zero case.
    """
    if size_bytes == 0:
        return "0 B"
    units = ["B", "KB", "MB", "GB"]
    value = float(size_bytes)
    unit_index = 0
    # Scale down until the value fits the unit, capping at GB.
    while value >= 1024 and unit_index < len(units) - 1:
        value /= 1024.0
        unit_index += 1
    return f"{value:.1f} {units[unit_index]}"
|
| 1002 |
+
|
| 1003 |
+
def extract_content_from_chunks(chunks):
    """Extract string content from chunk data structures.

    Accepts a list whose elements may be plain strings, dicts keyed by
    'text'/'content'/'document', or arbitrary objects, and returns a flat
    list of strings in the same order.

    Fix: the previous `a or b or c or str(chunk)` chain treated any falsy
    value (notably the empty string "") as missing and silently replaced
    it with the dict's repr, and could also append non-string values raw.
    Keys are now checked for presence (value is not None), and values are
    always returned as strings.
    """
    if not chunks:
        return []

    extracted = []
    for chunk in chunks:
        if isinstance(chunk, str):
            extracted.append(chunk)
        elif isinstance(chunk, dict):
            # Probe the known content keys in priority order; accept the
            # first one that is present, even if its value is "".
            for key in ('text', 'content', 'document'):
                value = chunk.get(key)
                if value is not None:
                    extracted.append(value if isinstance(value, str) else str(value))
                    break
            else:
                # No recognised key — fall back to the dict's repr.
                extracted.append(str(chunk))
        else:
            extracted.append(str(chunk))

    return extracted
|
| 1020 |
+
|
| 1021 |
+
|
| 1022 |
+
|
| 1023 |
+
# Routes
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Serve the main classical interface"""
    try:
        return templates.TemplateResponse("index.html", {"request": request})
    except Exception as e:
        logger.error(f"Error serving index.html from templates folder: {e}")
        # Fall back to the standalone HTML file when the templates
        # directory is unavailable.
        with open("scholar_archive.html", "r", encoding="utf-8") as fallback:
            return HTMLResponse(content=fallback.read())
|
| 1035 |
+
|
| 1036 |
+
@app.post("/upload")
async def upload_document(file: UploadFile = File(...)):
    """Upload and process a document.

    Validates the filename, extension and size (50MB cap), spools the
    payload to a temporary file, hands it to the RAG system for indexing,
    and always removes the temp file afterwards.

    Raises:
        HTTPException: 400 for client errors (no file, bad type, too big),
            500 when the system is unavailable or processing fails.
    """
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        if not file.filename:
            raise HTTPException(status_code=400, detail="No file selected")

        # Validate file type
        allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
        file_extension = os.path.splitext(file.filename)[1].lower()

        if file_extension not in allowed_extensions:
            raise HTTPException(
                status_code=400,
                detail=f"File type {file_extension} not supported. Supported formats: {', '.join(allowed_extensions)}"
            )

        # Check file size (limit to 50MB)
        content = await file.read()
        file_size = len(content)

        if file_size > 50 * 1024 * 1024:  # 50MB limit
            raise HTTPException(status_code=400, detail="File size too large. Maximum size is 50MB")

        # Spool the upload to disk so the RAG system can read a real path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
            temp_file.write(content)
            temp_path = temp_file.name

        logger.info(f"Processing document: {file.filename} ({format_file_size(file_size)})")

        try:
            # Process document
            success = rag_system.add_document(temp_path)
        finally:
            # Fix: cleanup now runs even when add_document raises;
            # previously a processing error leaked the temp file on disk.
            try:
                os.unlink(temp_path)
            except Exception as cleanup_error:
                logger.warning(f"Failed to cleanup temp file: {cleanup_error}")

        if success:
            logger.info(f"Successfully processed document: {file.filename}")
            return JSONResponse(content={
                "message": f"Document '{file.filename}' has been successfully added to the Scholar's Archive",
                "filename": file.filename,
                "size": format_file_size(file_size),
                "type": file_extension
            })
        else:
            raise HTTPException(status_code=500, detail="Failed to process document")

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Upload error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 1096 |
+
|
| 1097 |
+
@app.post("/ask", response_model=AnswerResponse)
async def ask_question(request: QuestionRequest):
    """Ask a question about the uploaded documents"""
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        if not request.question.strip():
            raise HTTPException(status_code=400, detail="Question cannot be empty")

        logger.info(f"Processing question: {request.question[:100]}...")

        # Delegate retrieval + answer generation to the RAG system.
        rag_result = rag_system.ask_question(request.question, top_k=request.top_k)

        question_chunks = rag_result.get("question_chunks", [])
        relevant_chunks = rag_result.get("relevant_chunks", [])

        # Attach a human-readable page reference to each dict-shaped source.
        sources = rag_result.get("sources", [])
        for source in sources:
            if isinstance(source, dict) and source.get('page'):
                source['page_reference'] = f"Page {source.get('page')}"

        # Trace the chunk payload shapes for debugging.
        logger.info(f"Question chunks type: {type(question_chunks)}")
        logger.info(f"Relevant chunks type: {type(relevant_chunks)}")
        if question_chunks:
            logger.info(f"First question chunk type: {type(question_chunks[0])}")
        if relevant_chunks:
            logger.info(f"First relevant chunk type: {type(relevant_chunks[0])}")

        # Wrap the result in the response model (keeps it serializable).
        answer_payload = AnswerResponse(
            answer=rag_result["answer"],
            sources=sources,
            question_chunks=question_chunks,
            relevant_chunks=relevant_chunks
        )

        logger.info(f"Successfully answered question with {len(sources)} sources")
        return answer_payload

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Question processing error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 1149 |
+
|
| 1150 |
+
@app.get("/documents")
async def get_documents():
    """Get list of all uploaded documents"""
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        # Enrich each document record with display metadata for the UI.
        formatted_docs = [
            {
                "title": doc.get("title", "Unknown Document"),
                "chunk_count": doc.get("chunk_count", 0),
                "total_pages": doc.get("total_pages"),
                "file_type": os.path.splitext(doc.get("title", ""))[1].lower(),
                "upload_date": doc.get("upload_date", datetime.now().isoformat()),
                "icon": get_file_type_icon(doc.get("title", "")),
            }
            for doc in rag_system.list_documents()
        ]

        return JSONResponse(content={"documents": formatted_docs})

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Documents list error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 1179 |
+
|
| 1180 |
+
@app.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Get statistics about the document collection"""
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        docs = rag_system.list_documents()

        # Aggregate collection-wide counters.
        chunk_total = sum(d.get("chunk_count", 0) for d in docs)
        page_total = sum(d.get("total_pages", 1) for d in docs if d.get("total_pages"))

        # Build one DocumentInfo model per document.
        document_models = [
            DocumentInfo(
                title=d.get("title", "Unknown Document"),
                file_type=os.path.splitext(d.get("title", ""))[1].lower(),
                upload_date=d.get("upload_date", datetime.now().isoformat()),
                chunk_count=d.get("chunk_count", 0),
                total_pages=d.get("total_pages"),
            )
            for d in docs
        ]

        return StatsResponse(
            total_documents=len(docs),
            total_chunks=chunk_total,
            total_pages=page_total,
            documents=document_models,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Stats error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 1218 |
+
|
| 1219 |
+
@app.delete("/clear")
async def clear_documents():
    """Clear all documents from the archive"""
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        logger.info("Clearing all documents from Scholar's Archive")

        # Guard clause: surface index-reset failure immediately.
        if not rag_system.clear_index():
            raise HTTPException(status_code=500, detail="Failed to clear documents")

        logger.info("Successfully cleared all documents")
        return JSONResponse(content={
            "message": "All documents have been successfully removed from the Scholar's Archive"
        })

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Clear error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 1242 |
+
|
| 1243 |
+
@app.post("/search")
async def search_chunks(request: QuestionRequest):
    """Search for relevant document chunks"""
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        if not request.question.strip():
            raise HTTPException(status_code=400, detail="Search query cannot be empty")

        logger.info(f"Searching chunks for: {request.question[:100]}...")

        raw_hits = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)

        # Normalise each hit into a flat, JSON-friendly record.
        formatted_chunks = [
            {
                "content": hit.get("content", ""),
                "document": hit.get("document", "Unknown"),
                "similarity": hit.get("similarity", 0.0),
                "page": hit.get("page"),
                "chunk_index": hit.get("chunk_index"),
            }
            for hit in raw_hits
        ]

        return JSONResponse(content={
            "query": request.question,
            "chunks": formatted_chunks,
            "total_found": len(formatted_chunks)
        })

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Search error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 1280 |
+
|
| 1281 |
+
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    try:
        # Document count is 0 when the RAG system failed to initialise.
        doc_count = len(rag_system.list_documents()) if rag_system else 0

        payload = {
            "status": "healthy",
            "service": "Scholar's Archive - Document Intelligence System",
            "version": "1.0.0",
            "rag_system_initialized": rag_system is not None,
            "indexed_documents": doc_count,
            "timestamp": datetime.now().isoformat(),
            "message": "Scholar's Archive is operational and ready to serve"
        }
        return payload
    except Exception as e:
        logger.error(f"Health check error: {e}")
        return {
            "status": "degraded",
            "service": "Scholar's Archive - Document Intelligence System",
            "error": str(e),
            "timestamp": datetime.now().isoformat()
        }
|
| 1304 |
+
|
| 1305 |
+
@app.get("/api/info")
async def api_info():
    """Get API information"""
    # Static endpoint catalogue shown to API consumers.
    endpoint_summary = {
        "upload": "POST /upload - Upload documents",
        "ask": "POST /ask - Ask questions",
        "documents": "GET /documents - List documents",
        "stats": "GET /stats - Get statistics",
        "search": "POST /search - Search chunks",
        "clear": "DELETE /clear - Clear all documents",
        "health": "GET /health - Health check"
    }
    return {
        "name": "Scholar's Archive API",
        "description": "Document Intelligence System API",
        "version": "1.0.0",
        "endpoints": endpoint_summary,
        "supported_formats": [".pdf", ".docx", ".txt", ".csv"],
        "max_file_size": "50MB"
    }
|
| 1324 |
+
|
| 1325 |
+
# Event handlers
|
| 1326 |
+
# NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
# favor of lifespan handlers — confirm the pinned FastAPI version before
# migrating, since requirements.txt does not pin one.
@app.on_event("startup")
async def startup_event():
    """Log readiness messages when the application boots."""
    for line in (
        "Starting Scholar's Archive - Document Intelligence System",
        "System initialized and ready to serve scholarly inquiries",
    ):
        logger.info(line)
|
| 1331 |
+
|
| 1332 |
+
@app.on_event("shutdown")
async def shutdown_event():
    """Release RAG-system resources and log the shutdown sequence."""
    if rag_system:
        # Close the vector store / model handles held by the RAG system.
        rag_system.close()
        logger.info("Scholar's Archive system closed gracefully")
    logger.info("Scholar's Archive shutdown complete")
|
| 1339 |
+
|
| 1340 |
+
# Error handlers
|
| 1341 |
+
@app.exception_handler(404)
async def not_found_handler(request: Request, exc):
    """Return a themed JSON body for unknown routes instead of the default 404."""
    payload = {
        "detail": "The requested resource was not found in the Scholar's Archive",
        "path": str(request.url.path),
    }
    return JSONResponse(status_code=404, content=payload)
|
| 1351 |
+
|
| 1352 |
+
@app.exception_handler(500)
async def internal_error_handler(request: Request, exc):
    """Log unhandled server errors and return a generic, themed 500 body."""
    # Log the underlying exception; the client only sees a generic message.
    logger.error(f"Internal server error: {exc}")
    payload = {
        "detail": "An internal error occurred in the Scholar's Archive system",
        "message": "Please try again later or contact support",
    }
    return JSONResponse(status_code=500, content=payload)
|
| 1363 |
+
|
| 1364 |
+
# Main execution
|
| 1365 |
+
# Main execution
if __name__ == "__main__":
    import uvicorn

    # Bug fix: the startup banner previously advertised http://localhost:8000
    # while uvicorn binds port 7860 below (the port the Dockerfile exposes for
    # Hugging Face Spaces). Keep the advertised URLs in sync with the bind.
    logger.info("Launching Scholar's Archive - Document Intelligence System")
    logger.info("Access the interface at: http://localhost:7860")
    logger.info("API documentation at: http://localhost:7860/api/docs")

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        log_level="info",
        reload=False,  # reload requires an import string, not an app object
        access_log=True,
    )
|
rag_storage/metadata.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea47f16e743fe3b0236ad60c5bf23262ac2a654bf3471d0ef3da50af2813bd3f
|
| 3 |
+
size 53947
|
rag_storage/vector_store.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51fa7faf223455f98a8e566eca56bcbb9604a37dbb2a41c80bf32bfd60454d8d
|
| 3 |
+
size 365613
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
gradio==4.44.1
|
| 4 |
+
sentence-transformers
|
| 5 |
+
nltk
|
| 6 |
+
pymupdf
|
| 7 |
+
numpy
|
| 8 |
+
faiss-cpu
|
| 9 |
+
torch
|
| 10 |
+
transformers
|
| 11 |
+
unstructured
|
| 12 |
+
python-multipart
|
| 13 |
+
Jinja2
|
templates/index.html
ADDED
|
@@ -0,0 +1,1338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- <!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>RAG PDF Question Answering System</title>
|
| 7 |
+
<style>
|
| 8 |
+
* {
|
| 9 |
+
margin: 0;
|
| 10 |
+
padding: 0;
|
| 11 |
+
box-sizing: border-box;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
body {
|
| 15 |
+
font-family: 'Times New Roman', serif;
|
| 16 |
+
line-height: 1.6;
|
| 17 |
+
color: #333;
|
| 18 |
+
background-color: #f8f9fa;
|
| 19 |
+
padding: 20px;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
.container {
|
| 23 |
+
max-width: 1000px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
background: white;
|
| 26 |
+
border-radius: 8px;
|
| 27 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
| 28 |
+
overflow: hidden;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
.header {
|
| 32 |
+
background: linear-gradient(135deg, #2c3e50, #34495e);
|
| 33 |
+
color: white;
|
| 34 |
+
padding: 30px;
|
| 35 |
+
text-align: center;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
.header h1 {
|
| 39 |
+
font-size: 2.2em;
|
| 40 |
+
margin-bottom: 10px;
|
| 41 |
+
font-weight: normal;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.header p {
|
| 45 |
+
font-size: 1.1em;
|
| 46 |
+
opacity: 0.9;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.main-content {
|
| 50 |
+
padding: 30px;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
.section {
|
| 54 |
+
margin-bottom: 30px;
|
| 55 |
+
padding: 25px;
|
| 56 |
+
border: 1px solid #e0e0e0;
|
| 57 |
+
border-radius: 6px;
|
| 58 |
+
background: #fafafa;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
.section h2 {
|
| 62 |
+
color: #2c3e50;
|
| 63 |
+
margin-bottom: 15px;
|
| 64 |
+
font-size: 1.4em;
|
| 65 |
+
border-bottom: 2px solid #3498db;
|
| 66 |
+
padding-bottom: 8px;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
.upload-area {
|
| 70 |
+
border: 2px dashed #bdc3c7;
|
| 71 |
+
border-radius: 6px;
|
| 72 |
+
padding: 20px;
|
| 73 |
+
text-align: center;
|
| 74 |
+
background: white;
|
| 75 |
+
margin-bottom: 15px;
|
| 76 |
+
transition: border-color 0.3s ease;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.upload-area:hover {
|
| 80 |
+
border-color: #3498db;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
.file-input {
|
| 84 |
+
margin: 10px 0;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
input[type="file"] {
|
| 88 |
+
padding: 8px;
|
| 89 |
+
border: 1px solid #ddd;
|
| 90 |
+
border-radius: 4px;
|
| 91 |
+
font-family: inherit;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
.btn {
|
| 95 |
+
background: #3498db;
|
| 96 |
+
color: white;
|
| 97 |
+
border: none;
|
| 98 |
+
padding: 12px 24px;
|
| 99 |
+
border-radius: 4px;
|
| 100 |
+
cursor: pointer;
|
| 101 |
+
font-size: 1em;
|
| 102 |
+
font-family: inherit;
|
| 103 |
+
transition: background 0.3s ease;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
.btn:hover {
|
| 107 |
+
background: #2980b9;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
.btn:disabled {
|
| 111 |
+
background: #bdc3c7;
|
| 112 |
+
cursor: not-allowed;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
.btn-danger {
|
| 116 |
+
background: #e74c3c;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.btn-danger:hover {
|
| 120 |
+
background: #c0392b;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.question-area {
|
| 124 |
+
background: white;
|
| 125 |
+
padding: 20px;
|
| 126 |
+
border-radius: 6px;
|
| 127 |
+
border: 1px solid #ddd;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
textarea {
|
| 131 |
+
width: 100%;
|
| 132 |
+
padding: 12px;
|
| 133 |
+
border: 1px solid #ddd;
|
| 134 |
+
border-radius: 4px;
|
| 135 |
+
font-family: inherit;
|
| 136 |
+
font-size: 1em;
|
| 137 |
+
resize: vertical;
|
| 138 |
+
min-height: 80px;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
.answer-section {
|
| 142 |
+
background: white;
|
| 143 |
+
padding: 20px;
|
| 144 |
+
border-radius: 6px;
|
| 145 |
+
border: 1px solid #ddd;
|
| 146 |
+
margin-top: 15px;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.answer {
|
| 150 |
+
background: #f8f9fa;
|
| 151 |
+
padding: 15px;
|
| 152 |
+
border-left: 4px solid #3498db;
|
| 153 |
+
margin-bottom: 15px;
|
| 154 |
+
border-radius: 0 4px 4px 0;
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
.sources {
|
| 158 |
+
background: #fff;
|
| 159 |
+
border: 1px solid #e0e0e0;
|
| 160 |
+
border-radius: 4px;
|
| 161 |
+
padding: 15px;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
.sources h4 {
|
| 165 |
+
color: #2c3e50;
|
| 166 |
+
margin-bottom: 10px;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
.source-item {
|
| 170 |
+
padding: 8px 0;
|
| 171 |
+
border-bottom: 1px solid #eee;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.source-item:last-child {
|
| 175 |
+
border-bottom: none;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
.documents-list {
|
| 179 |
+
background: white;
|
| 180 |
+
border-radius: 6px;
|
| 181 |
+
border: 1px solid #ddd;
|
| 182 |
+
max-height: 200px;
|
| 183 |
+
overflow-y: auto;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
.document-item {
|
| 187 |
+
padding: 12px 15px;
|
| 188 |
+
border-bottom: 1px solid #eee;
|
| 189 |
+
display: flex;
|
| 190 |
+
justify-content: space-between;
|
| 191 |
+
align-items: center;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
.document-item:last-child {
|
| 195 |
+
border-bottom: none;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
.document-name {
|
| 199 |
+
font-weight: bold;
|
| 200 |
+
color: #2c3e50;
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
.document-chunks {
|
| 204 |
+
color: #7f8c8d;
|
| 205 |
+
font-size: 0.9em;
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
.status-message {
|
| 209 |
+
padding: 12px;
|
| 210 |
+
border-radius: 4px;
|
| 211 |
+
margin: 10px 0;
|
| 212 |
+
font-weight: bold;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.status-success {
|
| 216 |
+
background: #d4edda;
|
| 217 |
+
color: #155724;
|
| 218 |
+
border: 1px solid #c3e6cb;
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
.status-error {
|
| 222 |
+
background: #f8d7da;
|
| 223 |
+
color: #721c24;
|
| 224 |
+
border: 1px solid #f5c6cb;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
.loading {
|
| 228 |
+
display: inline-block;
|
| 229 |
+
width: 20px;
|
| 230 |
+
height: 20px;
|
| 231 |
+
border: 3px solid #f3f3f3;
|
| 232 |
+
border-top: 3px solid #3498db;
|
| 233 |
+
border-radius: 50%;
|
| 234 |
+
animation: spin 1s linear infinite;
|
| 235 |
+
margin-right: 10px;
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
@keyframes spin {
|
| 239 |
+
0% { transform: rotate(0deg); }
|
| 240 |
+
100% { transform: rotate(360deg); }
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
.hidden {
|
| 244 |
+
display: none;
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
.controls {
|
| 248 |
+
display: flex;
|
| 249 |
+
gap: 10px;
|
| 250 |
+
align-items: center;
|
| 251 |
+
flex-wrap: wrap;
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
.no-documents {
|
| 255 |
+
text-align: center;
|
| 256 |
+
color: #7f8c8d;
|
| 257 |
+
padding: 20px;
|
| 258 |
+
font-style: italic;
|
| 259 |
+
}
|
| 260 |
+
</style>
|
| 261 |
+
</head>
|
| 262 |
+
<body>
|
| 263 |
+
<div class="container">
|
| 264 |
+
<div class="header">
|
| 265 |
+
<h1>RAG PDF Question Answering System</h1>
|
| 266 |
+
<p>Upload documents and ask questions to get AI-powered answers</p>
|
| 267 |
+
</div>
|
| 268 |
+
|
| 269 |
+
<div class="main-content">
|
| 270 |
+
<div class="section">
|
| 271 |
+
<h2>📁 Upload Documents</h2>
|
| 272 |
+
<div class="upload-area">
|
| 273 |
+
<p>Select a document to upload (PDF, DOCX, TXT, CSV)</p>
|
| 274 |
+
<div class="file-input">
|
| 275 |
+
<input type="file" id="fileInput" accept=".pdf,.docx,.txt,.csv">
|
| 276 |
+
</div>
|
| 277 |
+
<div class="controls">
|
| 278 |
+
<button class="btn" onclick="uploadDocument()" id="uploadBtn">Upload Document</button>
|
| 279 |
+
<button class="btn btn-danger" onclick="clearAllDocuments()" id="clearBtn">Clear All Documents</button>
|
| 280 |
+
</div>
|
| 281 |
+
</div>
|
| 282 |
+
<div id="uploadStatus"></div>
|
| 283 |
+
</div>
|
| 284 |
+
|
| 285 |
+
<div class="section">
|
| 286 |
+
<h2>📚 Indexed Documents</h2>
|
| 287 |
+
<div id="documentsList" class="documents-list">
|
| 288 |
+
<div class="no-documents">No documents uploaded yet</div>
|
| 289 |
+
</div>
|
| 290 |
+
</div>
|
| 291 |
+
|
| 292 |
+
<div class="section">
|
| 293 |
+
<h2>❓ Ask Questions</h2>
|
| 294 |
+
<div class="question-area">
|
| 295 |
+
<textarea id="questionInput" placeholder="Enter your question about the uploaded documents..."></textarea>
|
| 296 |
+
<div style="margin-top: 15px;">
|
| 297 |
+
<button class="btn" onclick="askQuestion()" id="askBtn">Ask Question</button>
|
| 298 |
+
</div>
|
| 299 |
+
</div>
|
| 300 |
+
<div id="answerSection" class="answer-section hidden">
|
| 301 |
+
<div id="answerContent" class="answer"></div>
|
| 302 |
+
<div id="sourcesContent" class="sources"></div>
|
| 303 |
+
</div>
|
| 304 |
+
</div>
|
| 305 |
+
</div>
|
| 306 |
+
</div>
|
| 307 |
+
|
| 308 |
+
<script>
|
| 309 |
+
let isUploading = false;
|
| 310 |
+
let isAsking = false;
|
| 311 |
+
|
| 312 |
+
function showMessage(message, type) {
|
| 313 |
+
const statusDiv = document.getElementById('uploadStatus');
|
| 314 |
+
statusDiv.innerHTML = `<div class="status-message status-${type}">${message}</div>`;
|
| 315 |
+
setTimeout(() => {
|
| 316 |
+
statusDiv.innerHTML = '';
|
| 317 |
+
}, 5000);
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
function setLoadingState(isLoading, buttonId, loadingText, normalText) {
|
| 321 |
+
const button = document.getElementById(buttonId);
|
| 322 |
+
if (isLoading) {
|
| 323 |
+
button.innerHTML = `<span class="loading"></span>${loadingText}`;
|
| 324 |
+
button.disabled = true;
|
| 325 |
+
} else {
|
| 326 |
+
button.innerHTML = normalText;
|
| 327 |
+
button.disabled = false;
|
| 328 |
+
}
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
async function uploadDocument() {
|
| 332 |
+
const fileInput = document.getElementById('fileInput');
|
| 333 |
+
const file = fileInput.files[0];
|
| 334 |
+
|
| 335 |
+
if (!file) {
|
| 336 |
+
showMessage('Please select a file first', 'error');
|
| 337 |
+
return;
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
isUploading = true;
|
| 341 |
+
setLoadingState(true, 'uploadBtn', 'Uploading...', 'Upload Document');
|
| 342 |
+
|
| 343 |
+
const formData = new FormData();
|
| 344 |
+
formData.append('file', file);
|
| 345 |
+
|
| 346 |
+
try {
|
| 347 |
+
const response = await fetch('/upload', {
|
| 348 |
+
method: 'POST',
|
| 349 |
+
body: formData
|
| 350 |
+
});
|
| 351 |
+
|
| 352 |
+
const result = await response.json();
|
| 353 |
+
|
| 354 |
+
if (response.ok) {
|
| 355 |
+
showMessage(result.message, 'success');
|
| 356 |
+
fileInput.value = '';
|
| 357 |
+
loadDocuments();
|
| 358 |
+
} else {
|
| 359 |
+
showMessage(result.detail || 'Upload failed', 'error');
|
| 360 |
+
}
|
| 361 |
+
} catch (error) {
|
| 362 |
+
showMessage('Network error: ' + error.message, 'error');
|
| 363 |
+
} finally {
|
| 364 |
+
isUploading = false;
|
| 365 |
+
setLoadingState(false, 'uploadBtn', 'Uploading...', 'Upload Document');
|
| 366 |
+
}
|
| 367 |
+
}
|
| 368 |
+
|
| 369 |
+
async function askQuestion() {
|
| 370 |
+
const questionInput = document.getElementById('questionInput');
|
| 371 |
+
const question = questionInput.value.trim();
|
| 372 |
+
|
| 373 |
+
if (!question) {
|
| 374 |
+
showMessage('Please enter a question', 'error');
|
| 375 |
+
return;
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
isAsking = true;
|
| 379 |
+
setLoadingState(true, 'askBtn', 'Processing...', 'Ask Question');
|
| 380 |
+
|
| 381 |
+
try {
|
| 382 |
+
const response = await fetch('/ask', {
|
| 383 |
+
method: 'POST',
|
| 384 |
+
headers: {
|
| 385 |
+
'Content-Type': 'application/json',
|
| 386 |
+
},
|
| 387 |
+
body: JSON.stringify({ question: question })
|
| 388 |
+
});
|
| 389 |
+
|
| 390 |
+
const result = await response.json();
|
| 391 |
+
|
| 392 |
+
if (response.ok) {
|
| 393 |
+
displayAnswer(result.answer, result.sources);
|
| 394 |
+
} else {
|
| 395 |
+
showMessage(result.detail || 'Failed to get answer', 'error');
|
| 396 |
+
}
|
| 397 |
+
} catch (error) {
|
| 398 |
+
showMessage('Network error: ' + error.message, 'error');
|
| 399 |
+
} finally {
|
| 400 |
+
isAsking = false;
|
| 401 |
+
setLoadingState(false, 'askBtn', 'Processing...', 'Ask Question');
|
| 402 |
+
}
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
function displayAnswer(answer, sources) {
|
| 406 |
+
const answerSection = document.getElementById('answerSection');
|
| 407 |
+
const answerContent = document.getElementById('answerContent');
|
| 408 |
+
const sourcesContent = document.getElementById('sourcesContent');
|
| 409 |
+
|
| 410 |
+
answerContent.innerHTML = `<strong>Answer:</strong><br>${answer}`;
|
| 411 |
+
|
| 412 |
+
if (sources && sources.length > 0) {
|
| 413 |
+
let sourcesHtml = '<h4>Sources:</h4>';
|
| 414 |
+
sources.forEach((source, index) => {
|
| 415 |
+
sourcesHtml += `
|
| 416 |
+
<div class="source-item">
|
| 417 |
+
<strong>${index + 1}. ${source.document}</strong>
|
| 418 |
+
<br><small>Similarity: ${(source.similarity * 100).toFixed(1)}%</small>
|
| 419 |
+
</div>
|
| 420 |
+
`;
|
| 421 |
+
});
|
| 422 |
+
sourcesContent.innerHTML = sourcesHtml;
|
| 423 |
+
} else {
|
| 424 |
+
sourcesContent.innerHTML = '<h4>Sources:</h4><p>No sources found</p>';
|
| 425 |
+
}
|
| 426 |
+
|
| 427 |
+
answerSection.classList.remove('hidden');
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
async function loadDocuments() {
|
| 431 |
+
try {
|
| 432 |
+
const response = await fetch('/documents');
|
| 433 |
+
const result = await response.json();
|
| 434 |
+
|
| 435 |
+
const documentsList = document.getElementById('documentsList');
|
| 436 |
+
|
| 437 |
+
if (result.documents && result.documents.length > 0) {
|
| 438 |
+
let html = '';
|
| 439 |
+
result.documents.forEach(doc => {
|
| 440 |
+
html += `
|
| 441 |
+
<div class="document-item">
|
| 442 |
+
<div>
|
| 443 |
+
<div class="document-name">${doc.title}</div>
|
| 444 |
+
<div class="document-chunks">${doc.chunk_count} chunks</div>
|
| 445 |
+
</div>
|
| 446 |
+
</div>
|
| 447 |
+
`;
|
| 448 |
+
});
|
| 449 |
+
documentsList.innerHTML = html;
|
| 450 |
+
} else {
|
| 451 |
+
documentsList.innerHTML = '<div class="no-documents">No documents uploaded yet</div>';
|
| 452 |
+
}
|
| 453 |
+
} catch (error) {
|
| 454 |
+
console.error('Error loading documents:', error);
|
| 455 |
+
}
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
async function clearAllDocuments() {
|
| 459 |
+
if (!confirm('Are you sure you want to clear all documents? This action cannot be undone.')) {
|
| 460 |
+
return;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
setLoadingState(true, 'clearBtn', 'Clearing...', 'Clear All Documents');
|
| 464 |
+
|
| 465 |
+
try {
|
| 466 |
+
const response = await fetch('/clear', {
|
| 467 |
+
method: 'DELETE'
|
| 468 |
+
});
|
| 469 |
+
|
| 470 |
+
const result = await response.json();
|
| 471 |
+
|
| 472 |
+
if (response.ok) {
|
| 473 |
+
showMessage(result.message, 'success');
|
| 474 |
+
loadDocuments();
|
| 475 |
+
document.getElementById('answerSection').classList.add('hidden');
|
| 476 |
+
} else {
|
| 477 |
+
showMessage(result.detail || 'Failed to clear documents', 'error');
|
| 478 |
+
}
|
| 479 |
+
} catch (error) {
|
| 480 |
+
showMessage('Network error: ' + error.message, 'error');
|
| 481 |
+
} finally {
|
| 482 |
+
setLoadingState(false, 'clearBtn', 'Clearing...', 'Clear All Documents');
|
| 483 |
+
}
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
document.getElementById('questionInput').addEventListener('keypress', function(e) {
|
| 487 |
+
if (e.key === 'Enter' && e.ctrlKey) {
|
| 488 |
+
askQuestion();
|
| 489 |
+
}
|
| 490 |
+
});
|
| 491 |
+
|
| 492 |
+
window.onload = function() {
|
| 493 |
+
loadDocuments();
|
| 494 |
+
};
|
| 495 |
+
</script>
|
| 496 |
+
</body>
|
| 497 |
+
</html> -->
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
<!-- perfect index.html -->
|
| 501 |
+
<!DOCTYPE html>
|
| 502 |
+
<html lang="en">
|
| 503 |
+
<head>
|
| 504 |
+
<meta charset="UTF-8">
|
| 505 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 506 |
+
<title>Scholar's Archive - Document Intelligence System</title>
|
| 507 |
+
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css" rel="stylesheet">
|
| 508 |
+
<style>
|
| 509 |
+
:root {
|
| 510 |
+
--primary-color: #1a365d;
|
| 511 |
+
--secondary-color: #2d5a87;
|
| 512 |
+
--accent-color: #c7955b;
|
| 513 |
+
--light-bg: #f8f6f0;
|
| 514 |
+
--cream: #faf8f2;
|
| 515 |
+
--text-dark: #2c3e50;
|
| 516 |
+
--text-muted: #718096;
|
| 517 |
+
--border-color: #e2d8cc;
|
| 518 |
+
--shadow: 0 4px 20px rgba(26, 54, 93, 0.1);
|
| 519 |
+
--shadow-hover: 0 8px 30px rgba(26, 54, 93, 0.15);
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
* {
|
| 523 |
+
margin: 0;
|
| 524 |
+
padding: 0;
|
| 525 |
+
box-sizing: border-box;
|
| 526 |
+
}
|
| 527 |
+
|
| 528 |
+
body {
|
| 529 |
+
font-family: 'Georgia', 'Times New Roman', serif;
|
| 530 |
+
line-height: 1.7;
|
| 531 |
+
background: linear-gradient(135deg, var(--light-bg) 0%, var(--cream) 100%);
|
| 532 |
+
color: var(--text-dark);
|
| 533 |
+
min-height: 100vh;
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
.container {
|
| 537 |
+
max-width: 1200px;
|
| 538 |
+
margin: 0 auto;
|
| 539 |
+
padding: 20px;
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
.header {
|
| 543 |
+
background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
|
| 544 |
+
color: white;
|
| 545 |
+
text-align: center;
|
| 546 |
+
padding: 3rem 2rem;
|
| 547 |
+
border-radius: 15px 15px 0 0;
|
| 548 |
+
box-shadow: var(--shadow);
|
| 549 |
+
position: relative;
|
| 550 |
+
overflow: hidden;
|
| 551 |
+
}
|
| 552 |
+
|
| 553 |
+
.header::before {
|
| 554 |
+
content: '';
|
| 555 |
+
position: absolute;
|
| 556 |
+
top: 0;
|
| 557 |
+
left: 0;
|
| 558 |
+
right: 0;
|
| 559 |
+
bottom: 0;
|
| 560 |
+
background: url('data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><defs><pattern id="grain" patternUnits="userSpaceOnUse" width="100" height="100"><circle cx="20" cy="20" r="1" fill="rgba(255,255,255,0.05)"/><circle cx="80" cy="40" r="1" fill="rgba(255,255,255,0.03)"/><circle cx="40" cy="80" r="1" fill="rgba(255,255,255,0.04)"/></pattern></defs><rect width="100" height="100" fill="url(%23grain)"/></svg>');
|
| 561 |
+
}
|
| 562 |
+
|
| 563 |
+
.header-content {
|
| 564 |
+
position: relative;
|
| 565 |
+
z-index: 1;
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
.header h1 {
|
| 569 |
+
font-size: 2.8rem;
|
| 570 |
+
margin-bottom: 0.5rem;
|
| 571 |
+
font-weight: 300;
|
| 572 |
+
letter-spacing: 1px;
|
| 573 |
+
}
|
| 574 |
+
|
| 575 |
+
.header .subtitle {
|
| 576 |
+
font-size: 1.2rem;
|
| 577 |
+
opacity: 0.9;
|
| 578 |
+
font-style: italic;
|
| 579 |
+
margin-bottom: 1rem;
|
| 580 |
+
}
|
| 581 |
+
|
| 582 |
+
.header .description {
|
| 583 |
+
font-size: 1rem;
|
| 584 |
+
opacity: 0.8;
|
| 585 |
+
max-width: 600px;
|
| 586 |
+
margin: 0 auto;
|
| 587 |
+
}
|
| 588 |
+
|
| 589 |
+
.main-content {
|
| 590 |
+
background: white;
|
| 591 |
+
border-radius: 0 0 15px 15px;
|
| 592 |
+
box-shadow: var(--shadow);
|
| 593 |
+
overflow: hidden;
|
| 594 |
+
}
|
| 595 |
+
|
| 596 |
+
.section {
|
| 597 |
+
padding: 2.5rem;
|
| 598 |
+
border-bottom: 1px solid var(--border-color);
|
| 599 |
+
position: relative;
|
| 600 |
+
}
|
| 601 |
+
|
| 602 |
+
.section:last-child {
|
| 603 |
+
border-bottom: none;
|
| 604 |
+
}
|
| 605 |
+
|
| 606 |
+
.section-header {
|
| 607 |
+
display: flex;
|
| 608 |
+
align-items: center;
|
| 609 |
+
margin-bottom: 2rem;
|
| 610 |
+
padding-bottom: 1rem;
|
| 611 |
+
border-bottom: 2px solid var(--accent-color);
|
| 612 |
+
}
|
| 613 |
+
|
| 614 |
+
.section-header i {
|
| 615 |
+
font-size: 1.5rem;
|
| 616 |
+
color: var(--accent-color);
|
| 617 |
+
margin-right: 1rem;
|
| 618 |
+
}
|
| 619 |
+
|
| 620 |
+
.section-header h2 {
|
| 621 |
+
font-size: 1.8rem;
|
| 622 |
+
color: var(--primary-color);
|
| 623 |
+
font-weight: 400;
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
.upload-zone {
|
| 627 |
+
border: 2px dashed var(--border-color);
|
| 628 |
+
border-radius: 12px;
|
| 629 |
+
padding: 3rem 2rem;
|
| 630 |
+
text-align: center;
|
| 631 |
+
background: var(--cream);
|
| 632 |
+
transition: all 0.3s ease;
|
| 633 |
+
cursor: pointer;
|
| 634 |
+
position: relative;
|
| 635 |
+
overflow: hidden;
|
| 636 |
+
}
|
| 637 |
+
|
| 638 |
+
.upload-zone:hover {
|
| 639 |
+
border-color: var(--accent-color);
|
| 640 |
+
background: white;
|
| 641 |
+
transform: translateY(-2px);
|
| 642 |
+
box-shadow: var(--shadow-hover);
|
| 643 |
+
}
|
| 644 |
+
|
| 645 |
+
.upload-zone.dragover {
|
| 646 |
+
border-color: var(--primary-color);
|
| 647 |
+
background: rgba(26, 54, 93, 0.05);
|
| 648 |
+
}
|
| 649 |
+
|
| 650 |
+
.upload-icon {
|
| 651 |
+
font-size: 3rem;
|
| 652 |
+
color: var(--accent-color);
|
| 653 |
+
margin-bottom: 1rem;
|
| 654 |
+
}
|
| 655 |
+
|
| 656 |
+
.upload-text {
|
| 657 |
+
font-size: 1.1rem;
|
| 658 |
+
color: var(--text-muted);
|
| 659 |
+
margin-bottom: 1rem;
|
| 660 |
+
}
|
| 661 |
+
|
| 662 |
+
.file-types {
|
| 663 |
+
font-size: 0.9rem;
|
| 664 |
+
color: var(--text-muted);
|
| 665 |
+
font-style: italic;
|
| 666 |
+
}
|
| 667 |
+
|
| 668 |
+
.btn {
|
| 669 |
+
background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
|
| 670 |
+
color: white;
|
| 671 |
+
border: none;
|
| 672 |
+
padding: 0.8rem 2rem;
|
| 673 |
+
border-radius: 8px;
|
| 674 |
+
font-family: inherit;
|
| 675 |
+
font-size: 1rem;
|
| 676 |
+
cursor: pointer;
|
| 677 |
+
transition: all 0.3s ease;
|
| 678 |
+
box-shadow: 0 2px 10px rgba(26, 54, 93, 0.2);
|
| 679 |
+
position: relative;
|
| 680 |
+
overflow: hidden;
|
| 681 |
+
}
|
| 682 |
+
|
| 683 |
+
.btn:hover {
|
| 684 |
+
transform: translateY(-2px);
|
| 685 |
+
box-shadow: 0 4px 20px rgba(26, 54, 93, 0.3);
|
| 686 |
+
}
|
| 687 |
+
|
| 688 |
+
.btn:active {
|
| 689 |
+
transform: translateY(0);
|
| 690 |
+
}
|
| 691 |
+
|
| 692 |
+
.btn-secondary {
|
| 693 |
+
background: linear-gradient(135deg, var(--accent-color) 0%, #d4a574 100%);
|
| 694 |
+
}
|
| 695 |
+
|
| 696 |
+
.btn-danger {
|
| 697 |
+
background: linear-gradient(135deg, #c53030 0%, #e53e3e 100%);
|
| 698 |
+
}
|
| 699 |
+
|
| 700 |
+
.btn:disabled {
|
| 701 |
+
background: #cbd5e0;
|
| 702 |
+
color: #a0aec0;
|
| 703 |
+
cursor: not-allowed;
|
| 704 |
+
transform: none;
|
| 705 |
+
box-shadow: none;
|
| 706 |
+
}
|
| 707 |
+
|
| 708 |
+
.input-group {
|
| 709 |
+
margin-bottom: 1.5rem;
|
| 710 |
+
}
|
| 711 |
+
|
| 712 |
+
.form-control {
|
| 713 |
+
width: 100%;
|
| 714 |
+
padding: 1rem;
|
| 715 |
+
border: 2px solid var(--border-color);
|
| 716 |
+
border-radius: 8px;
|
| 717 |
+
font-family: inherit;
|
| 718 |
+
font-size: 1rem;
|
| 719 |
+
transition: border-color 0.3s ease;
|
| 720 |
+
background: var(--cream);
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.form-control:focus {
|
| 724 |
+
outline: none;
|
| 725 |
+
border-color: var(--primary-color);
|
| 726 |
+
background: white;
|
| 727 |
+
box-shadow: 0 0 0 3px rgba(26, 54, 93, 0.1);
|
| 728 |
+
}
|
| 729 |
+
|
| 730 |
+
.question-textarea {
|
| 731 |
+
min-height: 120px;
|
| 732 |
+
resize: vertical;
|
| 733 |
+
}
|
| 734 |
+
|
| 735 |
+
.documents-grid {
|
| 736 |
+
display: grid;
|
| 737 |
+
gap: 1rem;
|
| 738 |
+
margin-top: 1rem;
|
| 739 |
+
}
|
| 740 |
+
|
| 741 |
+
.document-card {
|
| 742 |
+
background: var(--cream);
|
| 743 |
+
border: 1px solid var(--border-color);
|
| 744 |
+
border-radius: 10px;
|
| 745 |
+
padding: 1.5rem;
|
| 746 |
+
transition: all 0.3s ease;
|
| 747 |
+
position: relative;
|
| 748 |
+
}
|
| 749 |
+
|
| 750 |
+
.document-card:hover {
|
| 751 |
+
background: white;
|
| 752 |
+
box-shadow: var(--shadow);
|
| 753 |
+
transform: translateY(-2px);
|
| 754 |
+
}
|
| 755 |
+
|
| 756 |
+
.document-header {
|
| 757 |
+
display: flex;
|
| 758 |
+
justify-content: space-between;
|
| 759 |
+
align-items: flex-start;
|
| 760 |
+
margin-bottom: 0.5rem;
|
| 761 |
+
}
|
| 762 |
+
|
| 763 |
+
.document-title {
|
| 764 |
+
font-weight: 600;
|
| 765 |
+
color: var(--primary-color);
|
| 766 |
+
font-size: 1.1rem;
|
| 767 |
+
}
|
| 768 |
+
|
| 769 |
+
.document-meta {
|
| 770 |
+
color: var(--text-muted);
|
| 771 |
+
font-size: 0.9rem;
|
| 772 |
+
}
|
| 773 |
+
|
| 774 |
+
.answer-container {
|
| 775 |
+
background: white;
|
| 776 |
+
border: 1px solid var(--border-color);
|
| 777 |
+
border-radius: 12px;
|
| 778 |
+
margin-top: 1.5rem;
|
| 779 |
+
overflow: hidden;
|
| 780 |
+
box-shadow: var(--shadow);
|
| 781 |
+
}
|
| 782 |
+
|
| 783 |
+
.answer-header {
|
| 784 |
+
background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
|
| 785 |
+
color: white;
|
| 786 |
+
padding: 1rem 1.5rem;
|
| 787 |
+
font-weight: 500;
|
| 788 |
+
}
|
| 789 |
+
|
| 790 |
+
.answer-content {
|
| 791 |
+
padding: 2rem;
|
| 792 |
+
background: var(--cream);
|
| 793 |
+
}
|
| 794 |
+
|
| 795 |
+
.answer-text {
|
| 796 |
+
font-size: 1.1rem;
|
| 797 |
+
line-height: 1.8;
|
| 798 |
+
margin-bottom: 2rem;
|
| 799 |
+
}
|
| 800 |
+
|
| 801 |
+
.sources-section {
|
| 802 |
+
background: white;
|
| 803 |
+
border-top: 1px solid var(--border-color);
|
| 804 |
+
padding: 1.5rem;
|
| 805 |
+
}
|
| 806 |
+
|
| 807 |
+
.sources-title {
|
| 808 |
+
color: var(--primary-color);
|
| 809 |
+
font-size: 1.2rem;
|
| 810 |
+
margin-bottom: 1rem;
|
| 811 |
+
display: flex;
|
| 812 |
+
align-items: center;
|
| 813 |
+
}
|
| 814 |
+
|
| 815 |
+
.sources-title i {
|
| 816 |
+
margin-right: 0.5rem;
|
| 817 |
+
}
|
| 818 |
+
|
| 819 |
+
.source-item {
|
| 820 |
+
background: var(--cream);
|
| 821 |
+
border: 1px solid var(--border-color);
|
| 822 |
+
border-radius: 8px;
|
| 823 |
+
padding: 1rem;
|
| 824 |
+
margin-bottom: 0.8rem;
|
| 825 |
+
transition: all 0.3s ease;
|
| 826 |
+
}
|
| 827 |
+
|
| 828 |
+
.source-item:hover {
|
| 829 |
+
background: white;
|
| 830 |
+
box-shadow: 0 2px 10px rgba(26, 54, 93, 0.05);
|
| 831 |
+
}
|
| 832 |
+
|
| 833 |
+
.source-name {
|
| 834 |
+
font-weight: 600;
|
| 835 |
+
color: var(--primary-color);
|
| 836 |
+
margin-bottom: 0.3rem;
|
| 837 |
+
}
|
| 838 |
+
|
| 839 |
+
.source-similarity {
|
| 840 |
+
color: var(--text-muted);
|
| 841 |
+
font-size: 0.9rem;
|
| 842 |
+
}
|
| 843 |
+
|
| 844 |
+
.status-message {
|
| 845 |
+
padding: 1rem 1.5rem;
|
| 846 |
+
border-radius: 8px;
|
| 847 |
+
margin: 1rem 0;
|
| 848 |
+
font-weight: 500;
|
| 849 |
+
display: flex;
|
| 850 |
+
align-items: center;
|
| 851 |
+
}
|
| 852 |
+
|
| 853 |
+
.status-message i {
|
| 854 |
+
margin-right: 0.5rem;
|
| 855 |
+
}
|
| 856 |
+
|
| 857 |
+
.status-success {
|
| 858 |
+
background: #f0fff4;
|
| 859 |
+
color: #22543d;
|
| 860 |
+
border: 1px solid #9ae6b4;
|
| 861 |
+
}
|
| 862 |
+
|
| 863 |
+
.status-error {
|
| 864 |
+
background: #fed7d7;
|
| 865 |
+
color: #742a2a;
|
| 866 |
+
border: 1px solid #fc8181;
|
| 867 |
+
}
|
| 868 |
+
|
| 869 |
+
.loading-spinner {
|
| 870 |
+
display: inline-block;
|
| 871 |
+
width: 20px;
|
| 872 |
+
height: 20px;
|
| 873 |
+
border: 2px solid rgba(255, 255, 255, 0.3);
|
| 874 |
+
border-radius: 50%;
|
| 875 |
+
border-top-color: white;
|
| 876 |
+
animation: spin 0.8s linear infinite;
|
| 877 |
+
margin-right: 0.5rem;
|
| 878 |
+
}
|
| 879 |
+
|
| 880 |
+
@keyframes spin {
|
| 881 |
+
to { transform: rotate(360deg); }
|
| 882 |
+
}
|
| 883 |
+
|
| 884 |
+
.controls {
|
| 885 |
+
display: flex;
|
| 886 |
+
gap: 1rem;
|
| 887 |
+
align-items: center;
|
| 888 |
+
flex-wrap: wrap;
|
| 889 |
+
margin-top: 1.5rem;
|
| 890 |
+
}
|
| 891 |
+
|
| 892 |
+
.hidden {
|
| 893 |
+
display: none;
|
| 894 |
+
}
|
| 895 |
+
|
| 896 |
+
.empty-state {
|
| 897 |
+
text-align: center;
|
| 898 |
+
padding: 2rem;
|
| 899 |
+
color: var(--text-muted);
|
| 900 |
+
font-style: italic;
|
| 901 |
+
}
|
| 902 |
+
|
| 903 |
+
.empty-state i {
|
| 904 |
+
font-size: 3rem;
|
| 905 |
+
color: var(--accent-color);
|
| 906 |
+
margin-bottom: 1rem;
|
| 907 |
+
display: block;
|
| 908 |
+
}
|
| 909 |
+
|
| 910 |
+
.stats-bar {
|
| 911 |
+
background: var(--cream);
|
| 912 |
+
padding: 1rem 1.5rem;
|
| 913 |
+
border-radius: 8px;
|
| 914 |
+
display: flex;
|
| 915 |
+
justify-content: space-between;
|
| 916 |
+
align-items: center;
|
| 917 |
+
margin-bottom: 1.5rem;
|
| 918 |
+
border: 1px solid var(--border-color);
|
| 919 |
+
}
|
| 920 |
+
|
| 921 |
+
.stat-item {
|
| 922 |
+
text-align: center;
|
| 923 |
+
}
|
| 924 |
+
|
| 925 |
+
.stat-value {
|
| 926 |
+
font-size: 1.5rem;
|
| 927 |
+
font-weight: 600;
|
| 928 |
+
color: var(--primary-color);
|
| 929 |
+
}
|
| 930 |
+
|
| 931 |
+
.stat-label {
|
| 932 |
+
font-size: 0.9rem;
|
| 933 |
+
color: var(--text-muted);
|
| 934 |
+
}
|
| 935 |
+
|
| 936 |
+
@media (max-width: 768px) {
|
| 937 |
+
.container {
|
| 938 |
+
padding: 10px;
|
| 939 |
+
}
|
| 940 |
+
|
| 941 |
+
.section {
|
| 942 |
+
padding: 1.5rem;
|
| 943 |
+
}
|
| 944 |
+
|
| 945 |
+
.header h1 {
|
| 946 |
+
font-size: 2rem;
|
| 947 |
+
}
|
| 948 |
+
|
| 949 |
+
.controls {
|
| 950 |
+
flex-direction: column;
|
| 951 |
+
align-items: stretch;
|
| 952 |
+
}
|
| 953 |
+
|
| 954 |
+
.btn {
|
| 955 |
+
width: 100%;
|
| 956 |
+
}
|
| 957 |
+
}
|
| 958 |
+
</style>
|
| 959 |
+
</head>
|
| 960 |
+
<body>
|
| 961 |
+
<div class="container">
|
| 962 |
+
<div class="header">
|
| 963 |
+
<div class="header-content">
|
| 964 |
+
<h1><i class="fas fa-university"></i> Scholar's Archive</h1>
|
| 965 |
+
<p class="subtitle">Document Intelligence System</p>
|
| 966 |
+
<p class="description">A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology</p>
|
| 967 |
+
</div>
|
| 968 |
+
</div>
|
| 969 |
+
|
| 970 |
+
<div class="main-content">
|
| 971 |
+
<div class="section">
|
| 972 |
+
<div class="section-header">
|
| 973 |
+
<i class="fas fa-cloud-upload-alt"></i>
|
| 974 |
+
<h2>Document Repository</h2>
|
| 975 |
+
</div>
|
| 976 |
+
|
| 977 |
+
<div class="upload-zone" id="uploadZone">
|
| 978 |
+
<div class="upload-icon">
|
| 979 |
+
<i class="fas fa-file-upload"></i>
|
| 980 |
+
</div>
|
| 981 |
+
<div class="upload-text">
|
| 982 |
+
<strong>Drop your documents here</strong> or click to browse
|
| 983 |
+
</div>
|
| 984 |
+
<div class="file-types">
|
| 985 |
+
Supported formats: PDF, DOCX, TXT, CSV
|
| 986 |
+
</div>
|
| 987 |
+
<input type="file" id="fileInput" accept=".pdf,.docx,.txt,.csv" style="display: none;">
|
| 988 |
+
</div>
|
| 989 |
+
|
| 990 |
+
<div class="controls">
|
| 991 |
+
<button class="btn" onclick="uploadDocument()" id="uploadBtn">
|
| 992 |
+
<i class="fas fa-upload"></i> Upload Document
|
| 993 |
+
</button>
|
| 994 |
+
<button class="btn btn-danger" onclick="clearAllDocuments()" id="clearBtn">
|
| 995 |
+
<i class="fas fa-trash-alt"></i> Clear Repository
|
| 996 |
+
</button>
|
| 997 |
+
</div>
|
| 998 |
+
|
| 999 |
+
<div id="uploadStatus"></div>
|
| 1000 |
+
</div>
|
| 1001 |
+
|
| 1002 |
+
<div class="section">
|
| 1003 |
+
<div class="section-header">
|
| 1004 |
+
<i class="fas fa-book"></i>
|
| 1005 |
+
<h2>Document Collection</h2>
|
| 1006 |
+
</div>
|
| 1007 |
+
|
| 1008 |
+
<div class="stats-bar" id="statsBar">
|
| 1009 |
+
<div class="stat-item">
|
| 1010 |
+
<div class="stat-value" id="docCount">0</div>
|
| 1011 |
+
<div class="stat-label">Documents</div>
|
| 1012 |
+
</div>
|
| 1013 |
+
<div class="stat-item">
|
| 1014 |
+
<div class="stat-value" id="chunkCount">0</div>
|
| 1015 |
+
<div class="stat-label">Text Chunks</div>
|
| 1016 |
+
</div>
|
| 1017 |
+
<div class="stat-item">
|
| 1018 |
+
<div class="stat-value" id="pageCount">0</div>
|
| 1019 |
+
<div class="stat-label">Total Pages</div>
|
| 1020 |
+
</div>
|
| 1021 |
+
</div>
|
| 1022 |
+
|
| 1023 |
+
<div id="documentsList" class="documents-grid">
|
| 1024 |
+
<div class="empty-state">
|
| 1025 |
+
<i class="fas fa-folder-open"></i>
|
| 1026 |
+
<div>No documents in repository</div>
|
| 1027 |
+
</div>
|
| 1028 |
+
</div>
|
| 1029 |
+
</div>
|
| 1030 |
+
|
| 1031 |
+
<div class="section">
|
| 1032 |
+
<div class="section-header">
|
| 1033 |
+
<i class="fas fa-search"></i>
|
| 1034 |
+
<h2>Intelligent Inquiry</h2>
|
| 1035 |
+
</div>
|
| 1036 |
+
|
| 1037 |
+
<div class="input-group">
|
| 1038 |
+
<textarea
|
| 1039 |
+
id="questionInput"
|
| 1040 |
+
class="form-control question-textarea"
|
| 1041 |
+
placeholder="Enter your scholarly inquiry about the uploaded documents..."
|
| 1042 |
+
rows="4"
|
| 1043 |
+
></textarea>
|
| 1044 |
+
</div>
|
| 1045 |
+
|
| 1046 |
+
<div class="controls">
|
| 1047 |
+
<button class="btn" onclick="askQuestion()" id="askBtn">
|
| 1048 |
+
<i class="fas fa-brain"></i> Submit Inquiry
|
| 1049 |
+
</button>
|
| 1050 |
+
<button class="btn btn-secondary" onclick="clearAnswer()" id="clearAnswerBtn">
|
| 1051 |
+
<i class="fas fa-eraser"></i> Clear Response
|
| 1052 |
+
</button>
|
| 1053 |
+
</div>
|
| 1054 |
+
|
| 1055 |
+
<div id="answerContainer" class="answer-container hidden">
|
| 1056 |
+
<div class="answer-header">
|
| 1057 |
+
<i class="fas fa-lightbulb"></i> Scholarly Response
|
| 1058 |
+
</div>
|
| 1059 |
+
<div class="answer-content">
|
| 1060 |
+
<div id="answerText" class="answer-text"></div>
|
| 1061 |
+
</div>
|
| 1062 |
+
<div id="sourcesSection" class="sources-section">
|
| 1063 |
+
<div class="sources-title">
|
| 1064 |
+
<i class="fas fa-quote-left"></i> Referenced Sources
|
| 1065 |
+
</div>
|
| 1066 |
+
<div id="sourcesList"></div>
|
| 1067 |
+
</div>
|
| 1068 |
+
</div>
|
| 1069 |
+
</div>
|
| 1070 |
+
</div>
|
| 1071 |
+
</div>
|
| 1072 |
+
|
| 1073 |
+
<script>
|
| 1074 |
+
let isUploading = false;
|
| 1075 |
+
let isAsking = false;
|
| 1076 |
+
|
| 1077 |
+
|
| 1078 |
+
// Initialise the page once the DOM is ready: populate the document list
// and attach all upload / inquiry event handlers.
document.addEventListener('DOMContentLoaded', () => {
    loadDocuments();
    setupEventListeners();
});
|
| 1082 |
+
|
| 1083 |
+
/**
 * Wire up all interactive behaviour: the drag-and-drop upload zone,
 * the hidden file picker, and the Ctrl/Cmd+Enter inquiry shortcut.
 */
function setupEventListeners() {
    const zone = document.getElementById('uploadZone');
    const picker = document.getElementById('fileInput');
    const question = document.getElementById('questionInput');

    // Clicking anywhere on the zone opens the hidden file picker.
    zone.addEventListener('click', () => picker.click());

    zone.addEventListener('dragover', (event) => {
        event.preventDefault();
        zone.classList.add('dragover');
    });

    zone.addEventListener('dragleave', () => zone.classList.remove('dragover'));

    zone.addEventListener('drop', (event) => {
        event.preventDefault();
        zone.classList.remove('dragover');
        const dropped = event.dataTransfer.files;
        if (dropped.length > 0) {
            // Hand the dropped file(s) to the picker so uploadDocument()
            // can read them from the same place as a click-selected file.
            picker.files = dropped;
            uploadDocument();
        }
    });

    // Selecting a file via the picker uploads it immediately.
    picker.addEventListener('change', uploadDocument);

    // Ctrl+Enter (or Cmd+Enter on macOS) submits the inquiry.
    question.addEventListener('keydown', (event) => {
        if (event.key === 'Enter' && (event.ctrlKey || event.metaKey)) {
            askQuestion();
        }
    });
}
|
| 1119 |
+
|
| 1120 |
+
/**
 * Show a transient status banner in the #uploadStatus area.
 *
 * @param {string} message - Text to display (HTML-escaped before rendering,
 *   since backend error details may contain markup).
 * @param {string} type - Visual style suffix, e.g. 'success' or 'error'
 *   (maps to the .status-success / .status-error CSS classes).
 * @param {?string} icon - Optional Font Awesome icon name (without 'fa-').
 */
function showMessage(message, type, icon = null) {
    const statusDiv = document.getElementById('uploadStatus');
    // Escape interpolated text: the message can echo server responses.
    const escaped = String(message).replace(/[&<>"']/g, c => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[c]));
    const iconHtml = icon ? `<i class="fas fa-${icon}"></i>` : '';
    statusDiv.innerHTML = `<div class="status-message status-${type}">${iconHtml}${escaped}</div>`;
    // Cancel any pending auto-dismiss so an OLDER message's 5-second timer
    // cannot wipe out this newer message early.
    if (showMessage._timer) {
        clearTimeout(showMessage._timer);
    }
    showMessage._timer = setTimeout(() => {
        statusDiv.innerHTML = '';
        showMessage._timer = null;
    }, 5000);
}
|
| 1128 |
+
|
| 1129 |
+
/**
 * Toggle a button between a disabled spinner state and its normal state.
 *
 * @param {boolean} isLoading - true to show the spinner, false to restore.
 * @param {string} buttonId - DOM id of the button to update.
 * @param {string} loadingText - Label shown next to the spinner.
 * @param {string} normalText - Label restored when loading ends.
 * @param {?string} normalIcon - Optional Font Awesome icon (without 'fa-')
 *   prepended to the restored label.
 */
function setLoadingState(isLoading, buttonId, loadingText, normalText, normalIcon = null) {
    const btn = document.getElementById(buttonId);
    if (isLoading) {
        btn.innerHTML = `<span class="loading-spinner"></span>${loadingText}`;
        btn.disabled = true;
        return;
    }
    const prefix = normalIcon ? `<i class="fas fa-${normalIcon}"></i> ` : '';
    btn.innerHTML = prefix + normalText;
    btn.disabled = false;
}
|
| 1140 |
+
|
| 1141 |
+
/**
 * Upload the currently-selected file to the /upload endpoint, then refresh
 * the document list. Shows success or error banners via showMessage().
 *
 * Fix: this function is wired to BOTH the upload button and the file
 * input's 'change' event, so it can be invoked twice for one selection;
 * the isUploading flag was previously set but never checked.
 */
async function uploadDocument() {
    // Guard against re-entry while a previous upload is still in flight.
    if (isUploading) return;

    const fileInput = document.getElementById('fileInput');
    const file = fileInput.files[0];
    if (!file) return;

    isUploading = true;
    setLoadingState(true, 'uploadBtn', 'Processing Document...', 'Upload Document', 'upload');

    const formData = new FormData();
    formData.append('file', file);

    try {
        const response = await fetch('/upload', {
            method: 'POST',
            body: formData
        });

        const result = await response.json();

        if (response.ok) {
            showMessage(result.message, 'success', 'check-circle');
            // Clear the picker so selecting the same file again re-triggers
            // the 'change' event.
            fileInput.value = '';
            await loadDocuments();
        } else {
            showMessage(result.detail || 'Upload failed', 'error', 'exclamation-triangle');
        }
    } catch (error) {
        showMessage('Network error: ' + error.message, 'error', 'exclamation-triangle');
    } finally {
        isUploading = false;
        setLoadingState(false, 'uploadBtn', 'Processing Document...', 'Upload Document', 'upload');
    }
}
|
| 1175 |
+
|
| 1176 |
+
/**
 * Send the inquiry text to the /ask endpoint and render the answer with
 * its sources, or show an error banner on failure.
 *
 * Fix: the isAsking flag was set but never checked, so the submit button
 * and the Ctrl/Cmd+Enter shortcut could fire concurrent requests.
 */
async function askQuestion() {
    // Guard against re-entry while a previous inquiry is still in flight.
    if (isAsking) return;

    const questionInput = document.getElementById('questionInput');
    const question = questionInput.value.trim();

    if (!question) {
        showMessage('Please enter a question', 'error', 'exclamation-triangle');
        return;
    }

    isAsking = true;
    setLoadingState(true, 'askBtn', 'Analyzing Documents...', 'Submit Inquiry', 'brain');

    try {
        const response = await fetch('/ask', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({ question: question })
        });

        const result = await response.json();

        if (response.ok) {
            displayAnswer(result.answer, result.sources);
        } else {
            showMessage(result.detail || 'Failed to get answer', 'error', 'exclamation-triangle');
        }
    } catch (error) {
        showMessage('Network error: ' + error.message, 'error', 'exclamation-triangle');
    } finally {
        isAsking = false;
        setLoadingState(false, 'askBtn', 'Analyzing Documents...', 'Submit Inquiry', 'brain');
    }
}
|
| 1211 |
+
|
| 1212 |
+
/**
 * Render the answer and its referenced sources, then reveal and scroll to
 * the answer container.
 *
 * Fix: source.document is derived from user-supplied file names and was
 * interpolated unescaped into innerHTML (HTML-injection vector); it is now
 * escaped before rendering.
 *
 * @param {string} answer - Answer text/markup from the backend.
 * @param {Array<{document: string, similarity: number}>} sources
 */
function displayAnswer(answer, sources) {
    const answerContainer = document.getElementById('answerContainer');
    const answerText = document.getElementById('answerText');
    const sourcesList = document.getElementById('sourcesList');

    // Minimal HTML-escaper for values interpolated into innerHTML below.
    const esc = (s) => String(s).replace(/[&<>"']/g, c => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[c]));

    // NOTE(review): the answer is rendered as HTML on the assumption that
    // the backend emits intentional markup; if the backend ever echoes
    // user input verbatim this becomes an XSS vector — confirm.
    answerText.innerHTML = answer;

    if (sources && sources.length > 0) {
        let sourcesHtml = '';
        sources.forEach((source, index) => {
            const similarity = Math.round(source.similarity * 100);
            sourcesHtml += `
                <div class="source-item">
                    <div class="source-name">
                        <i class="fas fa-file-alt"></i> ${esc(source.document)}
                    </div>
                    <div class="source-similarity">
                        Relevance: ${similarity}% • Chunk ${index + 1}
                    </div>
                </div>
            `;
        });
        sourcesList.innerHTML = sourcesHtml;
    } else {
        sourcesList.innerHTML = `
            <div class="empty-state">
                <i class="fas fa-search"></i>
                <div>No specific sources referenced</div>
            </div>
        `;
    }

    answerContainer.classList.remove('hidden');
    answerContainer.scrollIntoView({ behavior: 'smooth' });
}
|
| 1247 |
+
|
| 1248 |
+
/**
 * Hide the scholarly-response panel and reset the inquiry textarea.
 */
function clearAnswer() {
    document.getElementById('answerContainer').classList.add('hidden');
    document.getElementById('questionInput').value = '';
}
|
| 1253 |
+
|
| 1254 |
+
/**
 * Refresh both the document list and the repository statistics by fetching
 * /documents and /stats in parallel. Errors are logged, not surfaced.
 */
async function loadDocuments() {
    try {
        const [docsResponse, statsResponse] = await Promise.all([
            fetch('/documents'),
            fetch('/stats')
        ]);

        const docsPayload = await docsResponse.json();
        const statsPayload = await statsResponse.json();

        updateDocumentsList(docsPayload.documents || []);
        updateStats(statsPayload);
    } catch (err) {
        console.error('Error loading documents:', err);
    }
}
|
| 1270 |
+
|
| 1271 |
+
/**
 * Render the document cards into #documentsList, or an empty-state panel
 * when the repository has no documents.
 *
 * Fix: doc.title comes from user-supplied file names and was interpolated
 * unescaped into innerHTML (HTML-injection vector); it is now escaped.
 *
 * @param {Array<{title: string, chunk_count: number, total_pages?: number}>} documents
 */
function updateDocumentsList(documents) {
    const documentsList = document.getElementById('documentsList');

    if (documents.length === 0) {
        documentsList.innerHTML = `
            <div class="empty-state">
                <i class="fas fa-folder-open"></i>
                <div>No documents in repository</div>
            </div>
        `;
        return;
    }

    // Minimal HTML-escaper for values interpolated into innerHTML below.
    const esc = (s) => String(s).replace(/[&<>"']/g, c => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[c]));

    let html = '';
    documents.forEach(doc => {
        html += `
            <div class="document-card">
                <div class="document-header">
                    <div class="document-title">
                        <i class="fas fa-file-alt"></i> ${esc(doc.title)}
                    </div>
                </div>
                <div class="document-meta">
                    <i class="fas fa-layer-group"></i> ${doc.chunk_count} chunks
                    ${doc.total_pages ? ` • <i class="fas fa-file-pdf"></i> ${doc.total_pages} pages` : ''}
                </div>
            </div>
        `;
    });
    documentsList.innerHTML = html;
}
|
| 1302 |
+
|
| 1303 |
+
/**
 * Paint the three repository counters in the stats bar, falling back to 0
 * for any missing field.
 *
 * @param {{total_documents?: number, total_chunks?: number, total_pages?: number}} stats
 */
function updateStats(stats) {
    const counters = {
        docCount: stats.total_documents,
        chunkCount: stats.total_chunks,
        pageCount: stats.total_pages
    };
    for (const [elementId, value] of Object.entries(counters)) {
        document.getElementById(elementId).textContent = value || 0;
    }
}
|
| 1308 |
+
|
| 1309 |
+
/**
 * Delete every document from the repository via DELETE /clear, after an
 * explicit user confirmation. On success, reloads the (now empty) list
 * and clears any displayed answer.
 */
async function clearAllDocuments() {
    const confirmed = confirm('Are you sure you want to clear all documents from the repository? This action cannot be undone.');
    if (!confirmed) return;

    setLoadingState(true, 'clearBtn', 'Clearing Repository...', 'Clear Repository', 'trash-alt');

    try {
        const response = await fetch('/clear', { method: 'DELETE' });
        const payload = await response.json();

        if (!response.ok) {
            showMessage(payload.detail || 'Failed to clear documents', 'error', 'exclamation-triangle');
        } else {
            showMessage(payload.message, 'success', 'check-circle');
            await loadDocuments();
            clearAnswer();
        }
    } catch (err) {
        showMessage('Network error: ' + err.message, 'error', 'exclamation-triangle');
    } finally {
        setLoadingState(false, 'clearBtn', 'Clearing Repository...', 'Clear Repository', 'trash-alt');
    }
}
|
| 1336 |
+
</script>
|
| 1337 |
+
</body>
|
| 1338 |
+
</html>
|