Spaces:
Sleeping
Sleeping
Upload 43 files
Browse files- .gitattributes +25 -0
- Dockerfile +35 -0
- app.py +1037 -64
- chunker.py +189 -0
- env +31 -0
- faiss_storage/faiss_index/index.faiss +3 -0
- faiss_storage/faiss_index/index.pkl +3 -0
- gitattributes +59 -0
- llm_handling.py +542 -0
- requirements.txt +30 -1
- sources/Endodontics%20appendix%201.pdf +0 -0
- sources/Endodontics%20appendix%202.pdf +0 -0
- sources/Endodontics%20appendix%203.pdf +0 -0
- sources/Endodontics%20appendix%204.pdf +3 -0
- sources/Endodontics%20book.zip +3 -0
- sources/Endodontics%20cap%201.pdf +3 -0
- sources/Endodontics%20cap%2010.pdf +3 -0
- sources/Endodontics%20cap%2011.pdf +3 -0
- sources/Endodontics%20cap%2012.pdf +3 -0
- sources/Endodontics%20cap%2013.pdf +3 -0
- sources/Endodontics%20cap%2014.pdf +3 -0
- sources/Endodontics%20cap%2015.pdf +3 -0
- sources/Endodontics%20cap%2016.pdf +3 -0
- sources/Endodontics%20cap%2017.pdf +3 -0
- sources/Endodontics%20cap%2018.pdf +3 -0
- sources/Endodontics%20cap%2019.pdf +3 -0
- sources/Endodontics%20cap%202.pdf +3 -0
- sources/Endodontics%20cap%2020.pdf +3 -0
- sources/Endodontics%20cap%2021.pdf +3 -0
- sources/Endodontics%20cap%2022.pdf +3 -0
- sources/Endodontics%20cap%203.pdf +3 -0
- sources/Endodontics%20cap%204.pdf +3 -0
- sources/Endodontics%20cap%205.pdf +3 -0
- sources/Endodontics%20cap%206.pdf +3 -0
- sources/Endodontics%20cap%207.pdf +3 -0
- sources/Endodontics%20cap%208.pdf +3 -0
- sources/Endodontics%20cap%209.pdf +3 -0
- sources/_%24preguntas%20chatbot_01.xlsx +0 -0
- sources/database.csv +1 -0
- sources/general_qa.csv +1 -0
- sources/greetings.csv +1 -0
- sources/personal_qa.csv +1 -0
- sources/preguntas chatbot_01.xlsx +3 -0
- system_prompts.py +67 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
faiss_storage/faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
sources/Endodontics%20appendix%204.pdf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
sources/Endodontics%20cap%201.pdf filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
sources/Endodontics%20cap%2010.pdf filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
sources/Endodontics%20cap%2011.pdf filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
sources/Endodontics%20cap%2012.pdf filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
sources/Endodontics%20cap%2013.pdf filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
sources/Endodontics%20cap%2014.pdf filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
sources/Endodontics%20cap%2015.pdf filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
sources/Endodontics%20cap%2016.pdf filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
sources/Endodontics%20cap%2017.pdf filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
sources/Endodontics%20cap%2018.pdf filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
sources/Endodontics%20cap%2019.pdf filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
sources/Endodontics%20cap%202.pdf filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
sources/Endodontics%20cap%2020.pdf filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
sources/Endodontics%20cap%2021.pdf filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
sources/Endodontics%20cap%2022.pdf filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
sources/Endodontics%20cap%203.pdf filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
sources/Endodontics%20cap%204.pdf filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
sources/Endodontics%20cap%205.pdf filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
sources/Endodontics%20cap%206.pdf filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
sources/Endodontics%20cap%207.pdf filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
sources/Endodontics%20cap%208.pdf filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
sources/Endodontics%20cap%209.pdf filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
sources/preguntas[[:space:]]chatbot_01.xlsx filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /app

# Install system dependencies (OpenGL / GLib shared libraries commonly required
# by imaging and ML wheels), then drop the apt cache to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1-mesa-glx \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Copy the requirements file first so the dependency-install layer is cached
# independently of application-code changes.
COPY requirements.txt requirements.txt

# Install Python packages
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . /app

# Create a non-root user
RUN useradd -m -u 1000 user

# Change ownership so the unprivileged user can read/write the app files
RUN chown -R user:user /app

# Switch to the non-root user
USER user

# Expose the port the app listens on (7860, matching CMD below).
# NOTE(review): the original comment mentioned Gunicorn, but CMD runs the plain
# `python app.py` entry point — confirm which server is actually intended.
EXPOSE 7860

# Command to run the app
CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
CHANGED
|
@@ -1,64 +1,1037 @@
|
|
| 1 |
-
import
|
| 2 |
-
from
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
)
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, request, send_file, abort, jsonify, url_for, render_template, Response
|
| 2 |
+
from flask_cors import CORS
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sentence_transformers import SentenceTransformer, util
|
| 5 |
+
import torch
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import List, Dict, Tuple, Optional, Any, Iterator
|
| 8 |
+
from collections import deque
|
| 9 |
+
import os
|
| 10 |
+
import logging
|
| 11 |
+
import atexit
|
| 12 |
+
from threading import Thread, Lock
|
| 13 |
+
import time
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
from uuid import uuid4 as generate_uuid
|
| 16 |
+
import csv as csv_lib
|
| 17 |
+
import functools
|
| 18 |
+
import json
|
| 19 |
+
import re
|
| 20 |
+
import subprocess
|
| 21 |
+
import sys
|
| 22 |
+
import sqlite3
|
| 23 |
+
import io
|
| 24 |
+
|
| 25 |
+
from dotenv import load_dotenv
|
| 26 |
+
|
| 27 |
+
# Load environment variables from .env file AT THE VERY TOP
|
| 28 |
+
load_dotenv()
|
| 29 |
+
|
| 30 |
+
# Import RAG system and Fallback LLM from llm_handling AFTER load_dotenv
|
| 31 |
+
# MODIFIED: Imported new functions and prompts
|
| 32 |
+
from llm_handling import (
|
| 33 |
+
initialize_and_get_rag_system,
|
| 34 |
+
KnowledgeRAG,
|
| 35 |
+
groq_bot_instance,
|
| 36 |
+
RAG_SOURCES_DIR,
|
| 37 |
+
RAG_STORAGE_PARENT_DIR,
|
| 38 |
+
RAG_CHUNKED_SOURCES_FILENAME,
|
| 39 |
+
get_answer_from_context
|
| 40 |
+
)
|
| 41 |
+
from system_prompts import QA_FORMATTER_PROMPT
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Setup logging (remains global for the app)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app_hybrid_rag.log"),  # persistent log file next to the app
        logging.StreamHandler()                     # mirrored to console
    ]
)
logger = logging.getLogger(__name__)  # Main app logger

# --- Application Constants and Configuration ---
# NOTE(review): both credentials default to 'admin' — make sure real values are
# provided via FLASK_ADMIN_USERNAME / FLASK_ADMIN_PASSWORD in any deployment.
ADMIN_USERNAME = os.getenv('FLASK_ADMIN_USERNAME', 'admin')
ADMIN_PASSWORD = os.getenv('FLASK_ADMIN_PASSWORD', 'admin')
FLASK_APP_HOST = os.getenv("FLASK_HOST", "0.0.0.0")
FLASK_APP_PORT = int(os.getenv("FLASK_PORT", "7860"))
# Debug mode is ON by default; only an explicit non-"true" FLASK_DEBUG disables it.
FLASK_DEBUG_MODE = os.getenv("FLASK_DEBUG", "True").lower() == "true"
_APP_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEXT_EXTRACTIONS_DIR = os.path.join(_APP_BASE_DIR, 'text_extractions')
RELATED_QUESTIONS_TO_SHOW = 10    # related questions surfaced to the user
QUESTIONS_TO_SEND_TO_GROQ_QA = 3  # top QA hits forwarded as LLM context
# MODIFIED: Replaced separate confidence values with a single configurable one for the LLM formatter.
LLM_FORMATTER_CONFIDENCE_THRESHOLD = int(os.getenv("LLM_FORMATTER_CONFIDENCE_THRESHOLD", "95"))
HIGH_CONFIDENCE_THRESHOLD = 90  # For greetings, which are answered directly without LLM formatting.
# MODIFIED: Made CHAT_HISTORY_TO_SEND configurable via environment variable
CHAT_HISTORY_TO_SEND = int(os.getenv("CHAT_HISTORY_TO_SEND", "5"))  # Defines how many *pairs* of (user, assistant) messages to send
CHAT_LOG_FILE = os.path.join(_APP_BASE_DIR, 'chat_history.csv')

# Global RAG handle; presumably assigned during startup elsewhere in this file — TODO confirm.
rag_system: Optional[KnowledgeRAG] = None
|
| 73 |
+
|
| 74 |
+
# --- Persistent Chat History Management using SQLite ---
|
| 75 |
+
class ChatHistoryManager:
    """Persist per-session chat transcripts in a SQLite database.

    Each session's transcript is stored as a single JSON-encoded list of
    {'role': ..., 'content': ...} message dicts, keyed by session_id. A fresh
    connection is opened (and closed) for every operation; writers are
    additionally serialized with an in-process lock.
    """

    def __init__(self, db_path):
        self.db_path = db_path  # filesystem path of the SQLite database file
        self.lock = Lock()      # serializes writers within this process
        self._create_table()
        logger.info(f"SQLite chat history manager initialized at: {self.db_path}")

    def _get_connection(self):
        """Open a new connection; the caller is responsible for closing it."""
        return sqlite3.connect(self.db_path, timeout=10)

    def _create_table(self):
        """Create the chat_histories table on first use (idempotent)."""
        with self.lock:
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS chat_histories (
                        session_id TEXT PRIMARY KEY,
                        history TEXT NOT NULL
                    )
                """)
                conn.commit()
            finally:
                # BUGFIX: the previous `with conn:` form only manages the
                # transaction — it never closes the connection, leaking one
                # file handle per call. Close explicitly instead.
                conn.close()

    def get_history(self, session_id: str, limit_turns: int = 5) -> list:
        """Return the last `limit_turns` (user, assistant) pairs for a session.

        Returns [] for unknown sessions, non-positive limits, or on any error
        (errors are logged, never raised, so request handlers stay responsive).
        """
        if limit_turns <= 0:
            # BUGFIX: the old slice `[-(0*2):]` == `[-0:]` returned the ENTIRE
            # history when zero turns were requested.
            return []
        try:
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT history FROM chat_histories WHERE session_id = ?", (session_id,))
                row = cursor.fetchone()
                if row:
                    history_list = json.loads(row[0])
                    # Two stored messages (user + assistant) per conversational turn.
                    return history_list[-(limit_turns * 2):]
                return []
            finally:
                conn.close()
        except Exception as e:
            logger.error(f"Error fetching history for session {session_id}: {e}", exc_info=True)
            return []

    def update_history(self, session_id: str, query: str, answer: str):
        """Append one user/assistant exchange to the session's transcript."""
        with self.lock:
            try:
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute("SELECT history FROM chat_histories WHERE session_id = ?", (session_id,))
                    row = cursor.fetchone()

                    history = json.loads(row[0]) if row else []
                    history.append({'role': 'user', 'content': query})
                    history.append({'role': 'assistant', 'content': answer})

                    cursor.execute("""
                        INSERT OR REPLACE INTO chat_histories (session_id, history)
                        VALUES (?, ?)
                    """, (session_id, json.dumps(history)))
                    conn.commit()
                finally:
                    conn.close()
            except Exception as e:
                logger.error(f"Error updating history for session {session_id}: {e}", exc_info=True)

    def clear_history(self, session_id: str):
        """Reset a session's transcript to an empty list (the row is kept)."""
        with self.lock:
            try:
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute("""
                        INSERT OR REPLACE INTO chat_histories (session_id, history)
                        VALUES (?, ?)
                    """, (session_id, json.dumps([])))
                    conn.commit()
                    logger.info(f"Chat history cleared for session: {session_id}")
                finally:
                    conn.close()
            except Exception as e:
                logger.error(f"Error clearing history for session {session_id}: {e}", exc_info=True)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# --- EmbeddingManager for CSV QA (remains in app.py) ---
|
| 152 |
+
@dataclass
class QAEmbeddings:
    """Bundle of one QA table and its precomputed question embeddings.

    The list fields are parallel: questions[i] was embedded as row i of
    `embeddings` and originated from DataFrame row question_map[i] of df_qa.
    """
    questions: List[str]           # cleaned question strings that were embedded
    question_map: List[int]        # df_qa row index for each entry in `questions`
    embeddings: torch.Tensor       # embedding matrix aligned with `questions`; may be None when the question set is empty — TODO confirm all readers handle None
    df_qa: pd.DataFrame            # full QA table (answers, optional image column)
    original_questions: List[str]  # original question text, parallel to `questions`
|
| 159 |
+
|
| 160 |
+
class EmbeddingManager:
    """Semantic search over the CSV-backed QA sets (general / personal / greetings).

    Questions from each set are embedded once via update_embeddings(); queries
    are then matched with cosine similarity in find_best_answers().
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        # Sentence-transformer used for both corpus and query encoding.
        self.model = SentenceTransformer(model_name)
        # One QAEmbeddings bundle per category; filled in by update_embeddings().
        self.embeddings = {'general': None, 'personal': None, 'greetings': None}
        logger.info(f"EmbeddingManager initialized with model: {model_name}")

    def _process_questions(self, df: pd.DataFrame) -> Tuple[List[str], List[int], List[str]]:
        """Extract usable question strings from `df`, remembering their row indices."""
        cleaned: List[str] = []
        row_map: List[int] = []
        originals: List[str] = []

        if 'Question' not in df.columns:
            logger.warning(f"DataFrame for EmbeddingManager is missing 'Question' column. Cannot process questions from it.")
            return cleaned, row_map, originals

        for row_idx, raw in enumerate(df['Question']):
            # Skip NaN cells, blank strings, and the literal text "nan".
            if pd.isna(raw):
                continue
            text = str(raw).strip()
            if not text or text.lower() == "nan":
                continue
            cleaned.append(text)
            row_map.append(row_idx)
            originals.append(text)

        return cleaned, row_map, originals

    def update_embeddings(self, general_qa: pd.DataFrame, personal_qa: pd.DataFrame, greetings_qa: pd.DataFrame):
        """Re-encode all three QA sets and replace the stored bundles."""
        named_frames = (
            ('general', general_qa),
            ('personal', personal_qa),
            ('greetings', greetings_qa),
        )
        for category, frame in named_frames:
            qs, qmap, originals = self._process_questions(frame)
            vectors = self.model.encode(qs, convert_to_tensor=True, show_progress_bar=False) if qs else None
            self.embeddings[category] = QAEmbeddings(
                questions=qs, question_map=qmap, embeddings=vectors,
                df_qa=frame, original_questions=originals
            )
        logger.info("CSV QA embeddings updated in EmbeddingManager.")

    def find_best_answers(self, user_query: str, qa_type: str, top_n: int = 5) -> Tuple[List[float], List[str], List[str], List[str], List[int]]:
        """Return (confidences, questions, answers, images, df_indices) for the
        top_n stored questions most similar to `user_query`, best first.

        Confidences are cosine similarity * 100. All five lists are empty when
        the requested QA set has no embeddings.
        """
        bundle = self.embeddings[qa_type]
        if bundle is None or bundle.embeddings is None or len(bundle.embeddings) == 0:
            return [], [], [], [], []

        query_vec = self.model.encode([user_query], convert_to_tensor=True, show_progress_bar=False)
        if not isinstance(bundle.embeddings, torch.Tensor):
            bundle.embeddings = torch.tensor(bundle.embeddings)  # Safeguard

        similarities = util.cos_sim(query_vec, bundle.embeddings)[0]

        k = min(top_n, len(similarities))
        if k == 0:
            return [], [], [], [], []

        scores, hit_positions = torch.topk(similarities, k=k)

        confidences = [s.item() * 100 for s in scores]
        df_indices: List[int] = []
        questions: List[str] = []

        for hit in hit_positions:
            pos = hit.item()
            if pos >= len(bundle.question_map) or pos >= len(bundle.original_questions):
                logger.warning(f"Index out of bounds: item_idx {pos} for question_map/original_questions")
                continue
            df_row = bundle.question_map[pos]
            if df_row >= len(bundle.df_qa):
                logger.warning(f"Index out of bounds: original_df_idx {df_row} for df_qa length {len(bundle.df_qa)}")
                continue
            df_indices.append(df_row)
            questions.append(bundle.original_questions[pos])

        # Trim the confidence list to the entries that survived validation.
        confidences = confidences[:len(df_indices)]
        questions = questions[:len(df_indices)]

        # MODIFIED: Changed Answer to Respuesta to match new loading logic for xlsx
        answer_col = 'Respuesta' if 'Respuesta' in bundle.df_qa.columns else 'Answer'
        answers = [str(bundle.df_qa[answer_col].iloc[i]) for i in df_indices]
        images = [
            str(bundle.df_qa['Image'].iloc[i])
            if 'Image' in bundle.df_qa.columns and pd.notna(bundle.df_qa['Image'].iloc[i])
            else None
            for i in df_indices
        ]

        return confidences, questions, answers, images, df_indices
|
| 259 |
+
|
| 260 |
+
# --- DatabaseMonitor for personal_qa.csv placeholders (remains in app.py) ---
|
| 261 |
+
class DatabaseMonitor:
    """Watches the personal-data CSV on disk and keeps an in-memory DataFrame in sync.

    A daemon thread polls mtime/size roughly every second and reloads the file
    when either changes. get_data(user_id) looks up a single row by the 'id'
    column, coercing the id to the column's numeric type when needed.
    """

    def __init__(self, database_path):
        self.logger = logging.getLogger(__name__ + ".DatabaseMonitor")
        self.database_path = database_path  # CSV file to watch
        self.last_modified = None           # mtime observed at last successful load
        self.last_size = None               # size observed at last successful load
        self.df = None                      # current contents, or None when unavailable
        self.lock = Lock()                  # guards self.df against concurrent reload/read
        self.running = True                 # cleared by stop() to end the poll loop
        self._load_database()
        # Daemon thread so the watcher never blocks interpreter shutdown.
        self.monitor_thread = Thread(target=self._monitor_database, daemon=True)
        self.monitor_thread.start()
        self.logger.info(f"DatabaseMonitor initialized for: {database_path}")

    def _load_database(self):
        """(Re)read the CSV into self.df; on any failure leave self.df = None."""
        try:
            if not os.path.exists(self.database_path):
                self.logger.warning(f"Personal data file not found: {self.database_path}.")
                self.df = None
                return
            with self.lock:
                # NOTE(review): cp1252 encoding is assumed here — confirm it
                # matches how the CSV is produced upstream.
                self.df = pd.read_csv(self.database_path, encoding='cp1252')
                self.last_modified = os.path.getmtime(self.database_path)
                self.last_size = os.path.getsize(self.database_path)
                self.logger.info(f"Personal data file reloaded: {self.database_path}")
        except Exception as e:
            self.logger.error(f"Error loading personal data file '{self.database_path}': {e}", exc_info=True)
            self.df = None

    def _monitor_database(self):
        """Poll loop (runs on the daemon thread): reload when mtime or size changes."""
        while self.running:
            try:
                if not os.path.exists(self.database_path):
                    if self.df is not None:
                        self.logger.warning(f"Personal data file disappeared: {self.database_path}")
                        self.df = None; self.last_modified = None; self.last_size = None
                    time.sleep(5)  # back off while the file is missing
                    continue
                current_modified = os.path.getmtime(self.database_path); current_size = os.path.getsize(self.database_path)
                if (self.last_modified is None or current_modified != self.last_modified or
                        self.last_size is None or current_size != self.last_size):
                    self.logger.info("Personal data file change detected.")
                    self._load_database()
                time.sleep(1)  # normal polling interval
            except Exception as e:
                self.logger.error(f"Error monitoring personal data file: {e}", exc_info=True)
                time.sleep(5)  # back off after an unexpected error

    def get_data(self, user_id):
        """Return the row whose 'id' equals user_id as a dict, or None.

        Coerces user_id to the dtype of existing ids for numeric id columns so
        string ids from HTTP requests still match.
        """
        with self.lock:
            if self.df is not None and user_id:
                try:
                    if 'id' not in self.df.columns:
                        self.logger.warning("'id' column not found in personal_data.csv")
                        return None
                    id_col_type = self.df['id'].dtype
                    target_user_id = user_id
                    if pd.api.types.is_numeric_dtype(id_col_type):
                        try:
                            if user_id is None: return None
                            valid_ids = self.df['id'].dropna()
                            if not valid_ids.empty:
                                # Cast to the concrete type of existing ids (e.g. numpy int64).
                                target_user_id = type(valid_ids.iloc[0])(user_id)
                            else:
                                target_user_id = int(user_id)
                        except (ValueError, TypeError):
                            self.logger.warning(f"Could not convert user_id '{user_id}' to numeric type {id_col_type}")
                            return None
                    user_data = self.df[self.df['id'] == target_user_id]
                    if not user_data.empty: return user_data.iloc[0].to_dict()
                except Exception as e:
                    self.logger.error(f"Error retrieving data for user_id {user_id}: {e}", exc_info=True)
            return None

    def stop(self):
        """Signal the poll loop to exit and wait briefly for the thread."""
        self.running = False
        if hasattr(self, 'monitor_thread') and self.monitor_thread.is_alive():
            self.monitor_thread.join(timeout=5)
        self.logger.info("DatabaseMonitor stopped.")
|
| 340 |
+
|
| 341 |
+
# --- Flask App Initialization ---
app = Flask(__name__)
# NOTE(review): wildcard CORS origins combined with supports_credentials=True is
# very permissive — confirm this is intended before public exposure.
CORS(app, resources={r"/*": {"origins": "*"}}, supports_credentials=True)

# --- Initialize Managers ---
# Sentence-embedding search over the CSV QA sets.
embedding_manager = EmbeddingManager()
# SQLite-backed per-session chat transcripts.
history_manager = ChatHistoryManager('chat_history.db')
# The personal-data CSV lives alongside the RAG source documents.
database_csv_path = os.path.join(RAG_SOURCES_DIR, 'database.csv')
personal_data_monitor = DatabaseMonitor(database_csv_path)
|
| 350 |
+
|
| 351 |
+
# --- Helper Functions (App specific) ---
|
| 352 |
+
def clean_html_from_text(text: str) -> str:
    """Strip anything that looks like an HTML tag and trim whitespace.

    Non-string inputs are returned unchanged.
    """
    if isinstance(text, str):
        return re.sub(r'<[^>]+>', '', text).strip()
    return text
|
| 358 |
+
|
| 359 |
+
def normalize_text(text):
    """Replace stray cp1252 control bytes and common Unicode punctuation with
    plain-ASCII equivalents; non-string input is returned unchanged."""
    if isinstance(text, str):
        # The \x9x entries are Windows-1252 smart-quote/dash bytes that survive
        # a mis-decoded read; the Unicode entries are their proper code points.
        # NOTE(review): two of the double-quote keys render identically here and
        # were likely curly quotes in the original source — confirm against the
        # repository copy before relying on them.
        replacements = {
            '\x91': "'", '\x92': "'", '\x93': '"', '\x94': '"',
            '\x96': '-', '\x97': '-', '\x85': '...', '\x95': '-',
            '"': '"', '"': '"', '‘': "'", '’': "'",
            '–': '-', '—': '-', '…': '...', '•': '-',
        }
        for old, new in replacements.items(): text = text.replace(old, new)
    return text
|
| 369 |
+
|
| 370 |
+
def require_admin_auth(f):
    """Decorator enforcing HTTP Basic auth against the configured admin credentials.

    Wrapped views run only when both username and password match; otherwise a
    401 challenge response is returned.
    """
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        creds = request.authorization
        is_admin = bool(creds) and creds.username == ADMIN_USERNAME and creds.password == ADMIN_PASSWORD
        if is_admin:
            return f(*args, **kwargs)
        return Response('Admin auth failed.', 401, {'WWW-Authenticate': 'Basic realm="Admin Login Required"'})
    return wrapper
|
| 378 |
+
|
| 379 |
+
def initialize_chat_log():
    """Create the CSV chat log with its header row if it does not already exist."""
    if os.path.exists(CHAT_LOG_FILE):
        return
    header = ['sl', 'date_time', 'session_id', 'user_id', 'query', 'answer']
    with open(CHAT_LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        csv_lib.writer(log_file).writerow(header)
|
| 384 |
+
|
| 385 |
+
def store_chat_history(sid: str, uid: Optional[str], query: str, resp: Dict[str, Any]):
    """Persist one exchange to both the SQLite session history and the CSV audit log.

    Best-effort: every failure is logged and swallowed so the request path
    never breaks on logging problems.
    """
    try:
        # This now gets the final response key, which is 'answer' in the old logic
        answer = str(resp.get('answer', ''))
        history_manager.update_history(sid, query, answer)

        initialize_chat_log()
        # Compute the next serial number by re-reading the CSV; falls back to 1
        # when the log is empty or unreadable.
        next_sl = 1
        try:
            if os.path.exists(CHAT_LOG_FILE) and os.path.getsize(CHAT_LOG_FILE) > 0:
                df_log = pd.read_csv(CHAT_LOG_FILE, on_bad_lines='skip')
                if not df_log.empty and 'sl' in df_log.columns and pd.api.types.is_numeric_dtype(df_log['sl'].dropna()):
                    if not df_log['sl'].dropna().empty:
                        next_sl = int(df_log['sl'].dropna().max()) + 1
        except Exception as e:
            logger.error(f"Error reading SL from {CHAT_LOG_FILE}: {e}", exc_info=True)

        # Append the audit row; uid may be absent for anonymous sessions.
        with open(CHAT_LOG_FILE, 'a', newline='', encoding='utf-8') as f:
            csv_lib.writer(f).writerow([next_sl, datetime.now().strftime('%Y-%m-%d %H:%M:%S'), sid, uid or "N/A", query, answer])

    except Exception as e:
        logger.error(f"Error in store_chat_history for session {sid}: {e}", exc_info=True)
|
| 407 |
+
|
| 408 |
+
def get_formatted_chat_history(session_id: str) -> List[Dict[str, str]]:
    """Return recent turns for *session_id*; an empty/missing id yields []."""
    if not session_id:
        return []
    return history_manager.get_history(session_id, limit_turns=CHAT_HISTORY_TO_SEND)
|
| 412 |
+
|
| 413 |
+
def get_qa_context_for_groq(all_questions: List[Dict]) -> str:
    """Build a Q/A context string for the Groq prompt.

    Greetings entries are excluded, remaining matches are taken in descending
    confidence order (capped at QUESTIONS_TO_SEND_TO_GROQ_QA), and pairs with
    empty or "not available" answers are dropped.
    """
    by_confidence = sorted(
        (q for q in all_questions if q.get('source_type') != 'greetings'),
        key=lambda item: item.get('confidence', 0),
        reverse=True,
    )

    qa_lines = []
    for qa in by_confidence[:QUESTIONS_TO_SEND_TO_GROQ_QA]:
        answer = qa.get('answer')
        usable = (not pd.isna(answer) and isinstance(answer, str) and answer.strip()
                  and "not available" not in answer.lower())
        if usable:
            qa_lines.append(f"Q: {qa.get('question')}\nA: {answer}")
    return '\n'.join(qa_lines)
|
| 424 |
+
|
| 425 |
+
def replace_placeholders_in_answer(answer, db_data):
    """Fill ``{field}`` placeholders in *answer* from the user's DB row.

    Returns apology strings when the answer is blank, the user data is
    unavailable, or every placeholder is missing; otherwise the answer with
    placeholders substituted (missing fields become "not available").
    """
    if pd.isna(answer) or str(answer).strip() == '':
        return "Sorry, this information is not available yet"

    answer_str = str(answer)
    placeholders = re.findall(r'\{(\w+)\}', answer_str)
    if not placeholders:
        return answer_str
    if db_data is None:
        return "To get this specific information, please ensure you are logged in or have provided your user ID."

    missing_count = 0
    replacements_made = 0
    for placeholder in set(placeholders):
        key = placeholder.strip()
        value = db_data.get(key)
        is_blank = value is None or (isinstance(value, float) and pd.isna(value)) or str(value).strip() == ''
        if is_blank:
            answer_str = answer_str.replace(f'{{{key}}}', "not available")
            missing_count += 1
        else:
            answer_str = answer_str.replace(f'{{{key}}}', str(value))
            replacements_made += 1

    # Every placeholder missing -> nothing personal to show.
    if placeholders and missing_count == len(placeholders):
        return "Sorry, some specific details for you are not available at the moment."

    if "not available" in answer_str.lower() and replacements_made < len(placeholders):
        if answer_str == "not available" and len(placeholders) == 1:
            return "Sorry, this information is not available yet."
        if re.search(r'\{(\w+)\}', answer_str):
            logger.warning(f"Unresolved placeholders remain after replacement attempt: {answer_str}")
            answer_str = re.sub(r'\{(\w+)\}', "a specific detail", answer_str)
            if "a specific detail" in answer_str and not "Sorry" in answer_str:
                return "Sorry, I couldn't retrieve all the specific details for this answer. " + answer_str
        return "Sorry, I couldn't retrieve all the specific details for this answer. Some information has been generalized."

    return answer_str
|
| 455 |
+
|
| 456 |
+
# --- Non-Streaming Logic (Preserved from original) ---
|
| 457 |
+
def get_hybrid_response_logic_non_streaming(user_query: str, session_id: str, user_id: Optional[str], chat_history: Optional[List[Dict]] = None) -> Dict[str, Any]:
    """Resolve a query without streaming, trying sources in priority order:

    1. CSV QA (greetings answered directly; personal/general reformatted by LLM)
    2. FAISS document RAG
    3. Groq general-knowledge fallback

    Returns a response dict with at least 'query', 'answer', 'confidence',
    'source' and 'related_questions'; errors return an {'error': ...} dict.
    """
    global rag_system

    if not user_query: return {'error': 'No query provided'}
    if not session_id: return {'error': 'session_id is required'}

    personal_db_data = personal_data_monitor.get_data(user_id) if user_id else None

    # Semantic search over each CSV source; indices let us recover full rows.
    conf_greet, q_greet, a_greet, img_greet, idx_greet = embedding_manager.find_best_answers(user_query, 'greetings', top_n=1)
    conf_pers, q_pers, a_pers, img_pers, idx_pers = embedding_manager.find_best_answers(user_query, 'personal', top_n=RELATED_QUESTIONS_TO_SHOW)
    conf_gen, q_gen, a_gen, img_gen, idx_gen = embedding_manager.find_best_answers(user_query, 'general', top_n=RELATED_QUESTIONS_TO_SHOW)

    csv_candidates = []
    if conf_greet and conf_greet[0] >= HIGH_CONFIDENCE_THRESHOLD:
        csv_candidates.append({'question': q_greet[0], 'answer': a_greet[0], 'image': img_greet[0] if img_greet else None, 'confidence': conf_greet[0], 'source_type': 'greetings', 'original_index': idx_greet[0]})
    if conf_pers:
        for c, q, a, img, idx in zip(conf_pers, q_pers, a_pers, img_pers, idx_pers):
            processed_a = replace_placeholders_in_answer(a, personal_db_data)
            # Keep only answers whose placeholders resolved successfully.
            if not ("Sorry, this information is not available yet" in processed_a or "To get this specific information" in processed_a):
                csv_candidates.append({'question': q, 'answer': processed_a, 'image': img, 'confidence': c, 'source_type': 'personal', 'original_index': idx})
    if conf_gen:
        for c, q, a, img, idx in zip(conf_gen, q_gen, a_gen, img_gen, idx_gen):
            if not (pd.isna(a) or str(a).strip() == '' or str(a).lower() == 'nan'):
                csv_candidates.append({'question': q, 'answer': str(a), 'image': img, 'confidence': c, 'source_type': 'general', 'original_index': idx})

    csv_candidates.sort(key=lambda item: item['confidence'], reverse=True)

    related = []

    if csv_candidates:
        top_match = csv_candidates[0]
        is_direct_csv_answer = False
        source_name = ""

        top_source = top_match['source_type']
        top_confidence = top_match['confidence']

        # Greetings use their own threshold; personal/general use the stricter
        # LLM-formatter threshold before being rerouted through the LLM.
        if top_source == 'greetings' and top_confidence >= HIGH_CONFIDENCE_THRESHOLD:
            is_direct_csv_answer = True
            source_name = 'greetings_qa'
        elif top_source in ['personal', 'general'] and top_confidence >= LLM_FORMATTER_CONFIDENCE_THRESHOLD:
            is_direct_csv_answer = True
            source_name = f"{top_source}_qa"

        if is_direct_csv_answer:
            if top_source == 'greetings':
                # Greetings are returned verbatim, no LLM involvement.
                response_data = {'query': user_query, 'answer': top_match['answer'], 'confidence': top_match['confidence'], 'original_question': top_match['question'], 'source': source_name}
                if top_match.get('image'):
                    response_data['image_url'] = url_for('static', filename=top_match['image'], _external=True)
            else:
                # Recover the full matched row and let the LLM phrase the answer.
                source_df = embedding_manager.embeddings[top_source].df_qa
                matched_row = source_df.iloc[top_match['original_index']]
                # 'Question' duplicates 'Pregunta', so it is dropped from context.
                row_fields = matched_row.drop('Question', errors='ignore').to_dict()
                context_str = "\n".join([f"'{key}': '{value}'" for key, value in row_fields.items() if pd.notna(value) and str(value).strip() != ''])

                final_answer = get_answer_from_context(
                    question=user_query,
                    context=context_str,
                    system_prompt=QA_FORMATTER_PROMPT
                )

                response_data = {
                    'query': user_query,
                    'answer': final_answer,
                    'confidence': top_match['confidence'],
                    'original_question': top_match['question'],
                    'source': f'{source_name}_llm_formatted'
                }
                if top_match.get('image'):
                    response_data['image_url'] = url_for('static', filename=top_match['image'], _external=True)

            # Offer the runner-up (non-greeting) matches as related questions.
            for i, cand_q in enumerate(csv_candidates):
                if i == 0: continue
                if cand_q['source_type'] != 'greetings':
                    related.append({'question': cand_q['question'], 'answer': cand_q['answer'], 'match': cand_q['confidence']})
                    if len(related) >= RELATED_QUESTIONS_TO_SHOW: break
            response_data['related_questions'] = related
            store_chat_history(session_id, user_id, user_query, response_data)
            return response_data

    if rag_system and rag_system.retriever:
        logger.info(f"Attempting FAISS RAG query for: {user_query[:50]}...")
        rag_result = rag_system.invoke(user_query)  # non-streaming invocation
        rag_answer = rag_result.get("answer")

        if rag_answer and "the provided bibliography does not contain specific information" not in rag_answer.lower():
            logger.info(f"FAISS RAG system provided a valid answer: {rag_answer[:100]}...")
            response_data = {
                'query': user_query, 'answer': rag_answer, 'confidence': 85,
                'source': 'document_rag_faiss', 'related_questions': [],
                'document_sources_details': rag_result.get("cited_source_details")
            }
            store_chat_history(session_id, user_id, user_query, response_data)
            return response_data

    # Last resort: general-knowledge Groq answer, gathered from its stream.
    logger.info(f"No high-confidence answer. Using Groq fallback.")
    history_for_groq = chat_history if chat_history is not None else get_formatted_chat_history(session_id)
    groq_context = {'current_query': user_query, 'chat_history': history_for_groq, 'qa_related_info': ""}
    groq_answer = "".join([chunk for chunk in groq_bot_instance.stream_response(groq_context)])

    response_data = {'query': user_query, 'answer': groq_answer, 'confidence': 75, 'source': 'groq_general_fallback', 'related_questions': []}
    store_chat_history(session_id, user_id, user_query, response_data)
    return response_data
|
| 579 |
+
|
| 580 |
+
# --- Streaming Logic ---
|
| 581 |
+
def generate_streaming_response(user_query: str, session_id: str, user_id: Optional[str], chat_history: Optional[List[Dict]] = None) -> Iterator[str]:
    """Yield the response text in chunks, using the same source priority as the
    non-streaming path: CSV QA, then FAISS RAG, then the Groq fallback.
    """
    global rag_system

    personal_db_data = personal_data_monitor.get_data(user_id) if user_id else None
    conf_greet, _, a_greet, _, idx_greet = embedding_manager.find_best_answers(user_query, 'greetings', top_n=1)
    conf_pers, _, a_pers, _, idx_pers = embedding_manager.find_best_answers(user_query, 'personal', top_n=1)
    conf_gen, _, a_gen, _, idx_gen = embedding_manager.find_best_answers(user_query, 'general', top_n=1)

    candidates = []

    # Greetings qualify at their own threshold for a direct, non-LLM answer.
    if conf_greet and conf_greet[0] >= HIGH_CONFIDENCE_THRESHOLD:
        candidates.append({'answer': a_greet[0], 'confidence': conf_greet[0], 'source': 'greetings', 'index': idx_greet[0]})

    # Personal/general require the stricter LLM-formatter threshold.
    if conf_pers and conf_pers[0] >= LLM_FORMATTER_CONFIDENCE_THRESHOLD:
        processed_a = replace_placeholders_in_answer(a_pers[0], personal_db_data)
        # Only usable if the placeholder substitution succeeded.
        if not ("Sorry, this information is not available yet" in processed_a or "To get this specific information" in processed_a):
            candidates.append({'answer': processed_a, 'confidence': conf_pers[0], 'source': 'personal', 'index': idx_pers[0]})

    if conf_gen and conf_gen[0] >= LLM_FORMATTER_CONFIDENCE_THRESHOLD:
        # Skip empty/NaN answers.
        if not (pd.isna(a_gen[0]) or str(a_gen[0]).strip() == '' or str(a_gen[0]).lower() == 'nan'):
            candidates.append({'answer': a_gen[0], 'confidence': conf_gen[0], 'source': 'general', 'index': idx_gen[0]})

    if candidates:
        winner = max(candidates, key=lambda item: item['confidence'])
        winner_source = winner['source']
        logger.info(f"High-confidence match from CSV source: {winner_source}")

        if winner_source == 'greetings':
            # Greetings are yielded verbatim.
            yield winner['answer']
            return

        # Personal/general: rebuild context from the matched row and let the
        # LLM phrase a conversational answer.
        source_df = embedding_manager.embeddings[winner_source].df_qa
        matched_row = source_df.iloc[winner['index']]
        row_fields = matched_row.drop('Question', errors='ignore').to_dict()
        context_str = "\n".join([f"'{key}': '{value}'" for key, value in row_fields.items() if pd.notna(value) and str(value).strip() != ''])

        final_answer = get_answer_from_context(
            question=user_query,
            context=context_str,
            system_prompt=QA_FORMATTER_PROMPT
        )
        yield final_answer
        return

    if rag_system and rag_system.retriever:
        logger.info(f"Attempting to stream from FAISS RAG for: {user_query[:50]}...")
        rag_stream = rag_system.stream(user_query)
        # Peek at the first chunk to detect a "no information" refusal.
        first_chunk = next(rag_stream, None)

        if first_chunk and "the provided bibliography does not contain specific information" not in first_chunk.lower():
            logger.info("FAISS RAG streaming valid answer...")
            yield first_chunk
            yield from rag_stream
            return

    logger.info(f"No high-confidence CSV or RAG answer. Streaming from Groq fallback.")
    history_for_groq = chat_history if chat_history is not None else get_formatted_chat_history(session_id)
    groq_context = {'current_query': user_query, 'chat_history': history_for_groq, 'qa_related_info': ""}
    yield from groq_bot_instance.stream_response(groq_context)
|
| 649 |
+
|
| 650 |
+
def stream_formatter(logic_generator: Iterator[str], session_id: str, user_id: Optional[str], query: str) -> Iterator[str]:
    """Wrap raw text chunks in OpenAI-style SSE events, then log the full reply.

    Emits one ``chat.completion.chunk`` event per chunk, a final event with
    ``finish_reason: "stop"``, and the ``[DONE]`` sentinel; afterwards the
    assembled response is printed and saved to the session history.
    """
    completion_id = f"chatcmpl-{str(generate_uuid())}"
    model_name = "MedicalAssisstantBot/v1"
    collected = []

    for chunk in logic_generator:
        if not chunk:
            continue
        collected.append(chunk)
        event = {
            "id": completion_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": model_name,
            "choices": [{"index": 0, "delta": {"content": chunk}, "finish_reason": None}]
        }
        yield f"data: {json.dumps(event)}\n\n"

    terminator = {
        "id": completion_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": model_name,
        "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
    }
    yield f"data: {json.dumps(terminator)}\n\n"
    yield "data: [DONE]\n\n"

    # Streaming finished: persist the complete conversation turn.
    full_response = "".join(collected)

    print(f"\n--- STREAMED FULL RESPONSE ---")
    print(full_response)
    print(f"------------------------------\n")

    history_manager.update_history(session_id, query, full_response)
|
| 683 |
+
|
| 684 |
+
# --- Original Chat Endpoint (Preserved) ---
|
| 685 |
+
@app.route('/chat-bot', methods=['POST'])
def get_answer_hybrid():
    """Original non-streaming chat endpoint: JSON in, hybrid-pipeline JSON out."""
    data = request.json
    user_query = clean_html_from_text(data.get('query', ''))
    user_id = data.get('user_id')
    session_id = data.get('session_id')

    if not user_query or not session_id:
        return jsonify({'error': 'query and session_id are required'}), 400

    return jsonify(get_hybrid_response_logic_non_streaming(user_query, session_id, user_id, None))
|
| 698 |
+
|
| 699 |
+
# --- OpenAI Compatible Endpoints (Added) ---
|
| 700 |
+
@app.route('/v1/models', methods=['GET'])
def list_models():
    """OpenAI-compatible model listing; advertises the single local model."""
    return jsonify({
        "object": "list",
        "data": [{"id": "MedicalAssisstantBot/v1", "object": "model", "created": int(time.time()), "owned_by": "user"}]
    })
|
| 707 |
+
|
| 708 |
+
@app.route('/v1/chat/completions', methods=['POST'])
def openai_compatible_chat_endpoint():
    """OpenAI-compatible chat endpoint supporting both streaming and plain replies.

    The last message is treated as the query; earlier messages become the chat
    history. A session id is taken from ``conversation_id`` or generated.
    """
    data = request.json
    is_streaming = data.get("stream", False)

    messages = data.get("messages", [])
    if not messages:
        return jsonify({"error": "No messages provided"}), 400

    user_query = clean_html_from_text(messages[-1].get("content", ""))
    chat_history = messages[:-1]
    session_id = data.get("conversation_id", f"webui-session-{str(generate_uuid())}")
    user_id = None  # OpenAI-style clients don't carry a personal-data user id

    if is_streaming:
        logic_generator = generate_streaming_response(user_query, session_id, user_id, chat_history)
        return Response(stream_formatter(logic_generator, session_id, user_id, user_query), mimetype='text/event-stream')

    full_response_dict = get_hybrid_response_logic_non_streaming(user_query, session_id, user_id, chat_history)
    response_content = full_response_dict.get("answer", "Sorry, an error occurred.")

    openai_response = {
        "id": f"chatcmpl-{str(generate_uuid())}", "object": "chat.completion", "created": int(time.time()),
        "model": "MedicalAssisstantBot/v1",
        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_content}, "finish_reason": "stop"}],
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    }
    history_manager.update_history(session_id, user_query, response_content)
    return jsonify(openai_response)
|
| 737 |
+
|
| 738 |
+
|
| 739 |
+
# --- Admin and Utility Routes (Unchanged) ---
|
| 740 |
+
@app.route('/')
def index_route():
    """Serve the chat UI, or a 404 message when the template is absent."""
    template_to_render = 'chat-bot.html'
    template_path = os.path.join(app.root_path, 'templates', template_to_render)
    if not os.path.exists(template_path):
        logger.warning(f"Template '{template_to_render}' not found. Serving basic message.")
        return "Chatbot interface not found. Please ensure 'templates/chat-bot.html' exists.", 404
    return render_template(template_to_render)
|
| 747 |
+
|
| 748 |
+
@app.route('/admin/faiss_rag_status', methods=['GET'])
@require_admin_auth
def get_faiss_rag_status():
    """Admin endpoint reporting configuration and health of the FAISS RAG system."""
    global rag_system
    if not rag_system:
        return jsonify({"error": "FAISS RAG system not initialized."}), 500
    try:
        retriever = rag_system.retriever
        status = {
            "status": "Initialized" if retriever else "Initialized (Retriever not ready)",
            "index_storage_dir": rag_system.index_storage_dir,
            "embedding_model": rag_system.embedding_model_name,
            "groq_model": rag_system.groq_model_name,
            "retriever_k": retriever.k if retriever else "N/A",
            "processed_source_files": rag_system.processed_source_files,
            "index_type": "FAISS",
            "index_loaded_or_built": rag_system.vector_store is not None
        }

        store = rag_system.vector_store
        if store and hasattr(store, 'index') and store.index:
            try:
                status["num_vectors_in_index"] = store.index.ntotal
            except Exception:
                status["num_vectors_in_index"] = "N/A (Could not get count)"
        else:
            status["num_vectors_in_index"] = "N/A (Vector store or index not available)"

        return jsonify(status)
    except Exception as e:
        logger.error(f"Error getting FAISS RAG status: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
|
| 776 |
+
|
| 777 |
+
# NEW FUNCTION: Endpoint to download the combined QA databases as an Excel file
|
| 778 |
+
@app.route('/admin/download_qa_database', methods=['GET'])
@require_admin_auth
def download_qa_database():
    """Export the loaded General/Personal/Greetings QA dataframes as one Excel
    workbook (one sheet per source) and send it as a download.

    Returns a 500 JSON error if the workbook cannot be produced.
    """
    try:
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            # Safely access the dataframes from the embedding manager.
            if embedding_manager.embeddings['general'] and embedding_manager.embeddings['general'].df_qa is not None:
                embedding_manager.embeddings['general'].df_qa.to_excel(writer, sheet_name='General_QA', index=False)

            if embedding_manager.embeddings['personal'] and embedding_manager.embeddings['personal'].df_qa is not None:
                embedding_manager.embeddings['personal'].df_qa.to_excel(writer, sheet_name='Personal_QA', index=False)

            if embedding_manager.embeddings['greetings'] and embedding_manager.embeddings['greetings'].df_qa is not None:
                # BUG FIX: previously indexed embeddings['g greetings'] (key with
                # a stray space), which raised KeyError whenever greetings data
                # was loaded and made this endpoint always return 500.
                embedding_manager.embeddings['greetings'].df_qa.to_excel(writer, sheet_name='Greetings', index=False)

        output.seek(0)  # rewind so send_file streams from the start

        return send_file(
            output,
            mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            as_attachment=True,
            download_name=f'qa_database_{datetime.now().strftime("%Y%m%d")}.xlsx'
        )
    except Exception as e:
        logger.error(f"Error generating QA database file: {e}", exc_info=True)
        return jsonify({'error': 'Failed to generate QA database file'}), 500
|
| 805 |
+
|
| 806 |
+
@app.route('/admin/rebuild_faiss_index', methods=['POST'])
@require_admin_auth
def rebuild_faiss_index_route():
    """Admin endpoint: rebuild the RAG index in two steps.

    Step 1 runs ``chunker.py`` as a subprocess to regenerate the chunked
    sources; step 2 rebuilds the FAISS index and swaps in the new RAG system.
    """
    global rag_system
    logger.info("Admin request to rebuild FAISS RAG index received. Starting two-step process.")

    logger.info("Step 1: Running chunker.py to pre-process source documents.")
    chunker_script_path = os.path.join(_APP_BASE_DIR, 'chunker.py')
    chunked_json_output_path = os.path.join(RAG_STORAGE_PARENT_DIR, RAG_CHUNKED_SOURCES_FILENAME)

    os.makedirs(TEXT_EXTRACTIONS_DIR, exist_ok=True)

    if not os.path.exists(chunker_script_path):
        logger.error(f"Chunker script not found at '{chunker_script_path}'. Aborting rebuild.")
        return jsonify({"error": f"chunker.py not found. Cannot proceed with rebuild."}), 500

    # Run the chunker with the same interpreter as the server.
    command = [
        sys.executable,
        chunker_script_path,
        '--sources-dir', RAG_SOURCES_DIR,
        '--output-file', chunked_json_output_path,
        '--text-output-dir', TEXT_EXTRACTIONS_DIR
    ]

    try:
        process = subprocess.run(command, capture_output=True, text=True, check=True)
        logger.info("Chunker script executed successfully.")
        logger.info(f"Chunker stdout:\n{process.stdout}")
    except subprocess.CalledProcessError as e:
        logger.error(f"Chunker script failed with exit code {e.returncode}.")
        logger.error(f"Chunker stderr:\n{e.stderr}")
        return jsonify({"error": "Step 1 (Chunking) failed.", "details": e.stderr}), 500
    except Exception as e:
        logger.error(f"An unexpected error occurred while running the chunker script: {e}", exc_info=True)
        return jsonify({"error": f"An unexpected error occurred during the chunking step: {str(e)}"}), 500

    logger.info("Step 2: Rebuilding FAISS index from the newly generated chunks.")
    try:
        new_rag_system_instance = initialize_and_get_rag_system(force_rebuild=True)

        if new_rag_system_instance and new_rag_system_instance.vector_store:
            # Swap the live RAG system only after a successful rebuild.
            rag_system = new_rag_system_instance
            logger.info("FAISS RAG index rebuild completed and new RAG system instance is active.")
            updated_status_response = get_faiss_rag_status()
            return jsonify({"message": "FAISS RAG index rebuild completed.", "status": updated_status_response.get_json()}), 200

        logger.error("FAISS RAG index rebuild failed during the indexing phase.")
        return jsonify({"error": "Step 2 (Indexing) failed. Check logs."}), 500

    except Exception as e:
        logger.error(f"Error during admin FAISS index rebuild (indexing phase): {e}", exc_info=True)
        return jsonify({"error": f"Failed to rebuild index during indexing phase: {str(e)}"}), 500
|
| 858 |
+
|
| 859 |
+
@app.route('/db/status', methods=['GET'])
@require_admin_auth
def get_personal_db_status():
    """Admin endpoint reporting the personal-data CSV monitor's status.

    Returns whether the CSV exists, whether it is loaded, and its last
    modification time (ISO 8601) when available.
    """
    try:
        db_path = personal_data_monitor.database_path
        status_info = {
            'personal_data_csv_monitor_status': 'running',
            'file_exists': os.path.exists(db_path),
            'data_loaded': personal_data_monitor.df is not None,
            'last_update': None
        }
        if status_info['file_exists']:
            # FIX: the old code called os.path.getmtime twice and compared the
            # float result against None (always true); compute it once instead.
            mtime = os.path.getmtime(db_path)
            status_info['last_update'] = datetime.fromtimestamp(mtime).isoformat()
        return jsonify(status_info)
    except Exception as e:
        return jsonify({'status': 'error', 'error': str(e)}), 500
|
| 872 |
+
|
| 873 |
+
@app.route('/report', methods=['GET'])
@require_admin_auth
def download_report():
    """Send the chat-log CSV as a timestamped download; 404 when empty/missing."""
    try:
        log_missing = not os.path.exists(CHAT_LOG_FILE) or os.path.getsize(CHAT_LOG_FILE) == 0
        if log_missing:
            return jsonify({'error': 'No chat history available.'}), 404
        download_name = f'chat_history_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        return send_file(CHAT_LOG_FILE, mimetype='text/csv', as_attachment=True, download_name=download_name)
    except Exception as e:
        logger.error(f"Error downloading report: {e}", exc_info=True)
        return jsonify({'error': 'Failed to generate report'}), 500
|
| 883 |
+
|
| 884 |
+
@app.route('/create-session', methods=['POST'])
def create_session_route():
    """Mint a fresh session id (UUID) for a new conversation."""
    try:
        new_session_id = str(generate_uuid())
        logger.info(f"New session created: {new_session_id}")
        return jsonify({'status': 'success', 'session_id': new_session_id}), 200
    except Exception as e:
        logger.error(f"Session creation error: {e}", exc_info=True)
        return jsonify({'status': 'error', 'message': str(e)}), 500
|
| 893 |
+
|
| 894 |
+
@app.route('/version', methods=['GET'])
def get_version_route():
    """Report the running application version string."""
    return jsonify({'version': '3.9.8-Env-Chat-History'}), 200
|
| 897 |
+
|
| 898 |
+
@app.route('/clear-history', methods=['POST'])
def clear_session_history_route():
    """Drop the stored chat history for the given session id."""
    session_id = request.json.get('session_id')
    if not session_id:
        return jsonify({'status': 'error', 'message': 'session_id is required'}), 400
    history_manager.clear_history(session_id)
    return jsonify({'status': 'success', 'message': f'History cleared for session {session_id}'})
|
| 906 |
+
|
| 907 |
+
# --- App Cleanup and Startup ---
|
| 908 |
+
def cleanup_application():
    """Shutdown hook: stop the personal-data file monitor if it was started."""
    if personal_data_monitor:
        personal_data_monitor.stop()
    logger.info("Application cleanup finished.")


atexit.register(cleanup_application)
|
| 912 |
+
|
| 913 |
+
def _read_optional_csv(path: str) -> 'pd.DataFrame':
    """Read an optional cp1252-encoded QA CSV.

    Returns an empty frame with the expected QA columns when the file is
    missing or unreadable, so callers never have to special-case failures.
    """
    df = pd.DataFrame(columns=['Question', 'Answer', 'Image'])
    if os.path.exists(path):
        try:
            df = pd.read_csv(path, encoding='cp1252')
            print(f"- Loaded: {os.path.basename(path)}")
        except Exception as e_csv:
            logger.error(f"Error reading {os.path.basename(path)}: {e_csv}")
    else:
        logger.warning(f"Optional file '{os.path.basename(path)}' not found in '{RAG_SOURCES_DIR}'.")
    return df


def load_qa_data_on_startup():
    """Load QA CSV/XLSX sources, normalize them, and rebuild embeddings.

    Reads the three optional CSVs (general/personal/greetings), merges any
    XLSX files carrying 'Pregunta'/'Respuesta' columns into the general set,
    normalizes all text, and pushes the frames into the embedding manager.
    Errors are logged but never propagated, so startup always completes.
    """
    global embedding_manager
    print("\n--- Loading QA Source Files ---")
    try:
        # The three copy-pasted CSV loaders are factored into one helper.
        general_qa_df = _read_optional_csv(os.path.join(RAG_SOURCES_DIR, 'general_qa.csv'))
        personal_qa_df = _read_optional_csv(os.path.join(RAG_SOURCES_DIR, 'personal_qa.csv'))
        greetings_qa_df = _read_optional_csv(os.path.join(RAG_SOURCES_DIR, 'greetings.csv'))

        logger.info(f"Scanning for additional QA sources (.xlsx) in '{RAG_SOURCES_DIR}'...")
        if os.path.isdir(RAG_SOURCES_DIR):
            xlsx_files_found = [
                f for f in os.listdir(RAG_SOURCES_DIR)
                if f.endswith('.xlsx') and os.path.isfile(os.path.join(RAG_SOURCES_DIR, f))
            ]
            if xlsx_files_found:
                all_general_dfs = [general_qa_df] if not general_qa_df.empty else []
                for xlsx_file in xlsx_files_found:
                    try:
                        xlsx_path = os.path.join(RAG_SOURCES_DIR, xlsx_file)
                        logger.info(f"Processing XLSX source file: {xlsx_file}")
                        df_excel = pd.read_excel(xlsx_path)
                        if 'Pregunta' in df_excel.columns and 'Respuesta' in df_excel.columns:
                            logger.info(f"Found 'Pregunta' and 'Respuesta' in {xlsx_file}. Preserving all columns.")
                            # 'Question' is required by the EmbeddingManager for
                            # semantic search; mirror 'Pregunta' but keep every
                            # original column intact.
                            df_excel['Question'] = df_excel['Pregunta']
                            all_general_dfs.append(df_excel)
                            print(f"- Loaded and processing: {xlsx_file}")
                        else:
                            logger.warning(f"Skipping XLSX file '{xlsx_file}' as it lacks the required 'Pregunta' and 'Respuesta' columns.")
                    except Exception as e_xlsx:
                        logger.error(f"Error processing XLSX file '{xlsx_file}': {e_xlsx}")
                if all_general_dfs:
                    general_qa_df = pd.concat(all_general_dfs, ignore_index=True)
                    logger.info(f"Successfully merged data from {len(xlsx_files_found)} XLSX file(s) into the general QA set.")
        else:
            logger.warning(f"Sources directory '{RAG_SOURCES_DIR}' not found. Cannot scan for additional QA files.")

        dataframes_to_process = {
            "general": general_qa_df,
            "personal": personal_qa_df,
            "greetings": greetings_qa_df,
        }

        for df_name, df_val in dataframes_to_process.items():
            if df_val.empty:
                continue
            # Normalize text in all populated columns to prevent issues.
            for col in df_val.columns:
                if not df_val[col].isnull().all():
                    df_val[col] = df_val[col].astype(str).apply(normalize_text)
            # BUG FIX: the original re-tested "'Question' in df_val.columns"
            # inside the branch where it was already known to be absent, making
            # that inner check dead code. The intent is simply: add the column
            # when it is missing.
            if 'Question' not in df_val.columns:
                df_val['Question'] = None
                logger.warning(f"'Question' column missing in {df_name} data. Added empty column.")

        embedding_manager.update_embeddings(
            dataframes_to_process["general"],
            dataframes_to_process["personal"],
            dataframes_to_process["greetings"],
        )
        logger.info("CSV & XLSX QA data loaded and embeddings initialized.")

    except Exception as e:
        logger.critical(f"CRITICAL: Error loading or processing QA data: {e}. Semantic QA may not function.", exc_info=True)
    print("-----------------------------\n")
|
| 1015 |
+
|
| 1016 |
+
if __name__ == '__main__':
    # Make sure every runtime directory exists before anything touches it.
    required_dirs = (
        os.path.join(_APP_BASE_DIR, 'templates'),
        os.path.join(_APP_BASE_DIR, 'static'),
        TEXT_EXTRACTIONS_DIR,
    )
    for folder_path in required_dirs:
        os.makedirs(folder_path, exist_ok=True)

    load_qa_data_on_startup()
    initialize_chat_log()

    logger.info("Attempting to initialize RAG system from llm_handling module...")
    rag_system = initialize_and_get_rag_system()
    if rag_system:
        logger.info("RAG system initialized successfully via llm_handling module.")
    else:
        logger.warning("RAG system failed to initialize. Document RAG functionality will be unavailable.")

    logger.info(f"Flask application starting with Hybrid RAG on {FLASK_APP_HOST}:{FLASK_APP_PORT} Debug: {FLASK_DEBUG_MODE}...")
    if not FLASK_DEBUG_MODE:
        # Quiet werkzeug's per-request access logging outside debug mode.
        logging.getLogger('werkzeug').setLevel(logging.ERROR)

    app.run(host=FLASK_APP_HOST, port=FLASK_APP_PORT, debug=FLASK_DEBUG_MODE, use_reloader=False)
|
chunker.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import json
|
| 4 |
+
import argparse
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
+
|
| 7 |
+
from pypdf import PdfReader
|
| 8 |
+
import docx as python_docx
|
| 9 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
+
|
| 11 |
+
# --- Logging Setup ---
# Console-only logging; this script is meant to be run as a standalone CLI.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# --- Text Extraction Helper Functions ---
|
| 22 |
+
# Note: These are duplicated from llm_handling.py to make this a standalone script.
|
| 23 |
+
def extract_text_from_file(file_path: str, file_type: str) -> Optional[str]:
    """Extract plain text from a PDF, DOCX, or TXT file.

    Returns the stripped text, or None when the type is unsupported,
    nothing could be extracted, or extraction raised.
    """
    base_name = os.path.basename(file_path)
    logger.info(f"Extracting text from {file_type.upper()} file: {base_name}")
    try:
        if file_type == 'pdf':
            reader = PdfReader(file_path)
            # Join per-page text, skipping pages that yield nothing.
            text_content = "".join(
                page.extract_text() + "\n" for page in reader.pages if page.extract_text()
            )
        elif file_type == 'docx':
            document = python_docx.Document(file_path)
            text_content = "\n".join(p.text for p in document.paragraphs if p.text)
        elif file_type == 'txt':
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
                text_content = fh.read()
        else:
            logger.warning(f"Unsupported file type for text extraction: {file_type} for file {base_name}")
            return None

        if not text_content or not text_content.strip():
            logger.warning(f"No text content extracted from {base_name}")
            return None
        return text_content.strip()
    except Exception as e:
        logger.error(f"Error extracting text from {base_name} ({file_type.upper()}): {e}", exc_info=True)
        return None
|
| 47 |
+
|
| 48 |
+
# Dispatch table: file extension -> callable extracting text from that format.
# The extension is bound as a default argument to avoid late-binding closures.
SUPPORTED_EXTENSIONS = {
    ext: (lambda path, _ext=ext: extract_text_from_file(path, _ext))
    for ext in ('pdf', 'docx', 'txt')
}
|
| 53 |
+
|
| 54 |
+
def process_sources_and_create_chunks(
    sources_dir: str,
    output_file: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
    text_output_dir: Optional[str] = None,
) -> None:
    """
    Scan a directory for source files, extract text, split it into chunks,
    and save the chunks to a single JSON file.

    Optionally saves the raw extracted text of each document (for debugging)
    to ``text_output_dir``.

    Raises FileNotFoundError when ``sources_dir`` does not exist.
    """
    if not os.path.isdir(sources_dir):
        logger.error(f"Source directory not found: '{sources_dir}'")
        raise FileNotFoundError(f"Source directory not found: '{sources_dir}'")

    logger.info(f"Starting chunking process. Sources: '{sources_dir}', Output: '{output_file}'")

    if text_output_dir:
        os.makedirs(text_output_dir, exist_ok=True)
        logger.info(f"Will save raw extracted text to: '{text_output_dir}'")

    all_chunks_for_json: List[Dict] = []
    processed_files_count = 0
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for filename in os.listdir(sources_dir):
        file_path = os.path.join(sources_dir, filename)
        if not os.path.isfile(file_path):
            continue

        file_ext = filename.split('.')[-1].lower()
        if file_ext not in SUPPORTED_EXTENSIONS:
            # BUG FIX: the log/output strings in this function contained the
            # literal placeholder "(unknown)" instead of interpolating the
            # filename; restored {filename} throughout.
            logger.debug(f"Skipping unsupported file: {filename}")
            continue

        logger.info(f"Processing source file: {filename}")
        text_content = SUPPORTED_EXTENSIONS[file_ext](file_path)

        if text_content:
            # Save the raw extracted text when a debug directory was given.
            if text_output_dir:
                try:
                    text_output_path = os.path.join(text_output_dir, f"{filename}.txt")
                    with open(text_output_path, 'w', encoding='utf-8') as f_text:
                        f_text.write(text_content)
                    logger.info(f"Saved extracted text for '{filename}' to '{text_output_path}'")
                except Exception as e_text_save:
                    logger.error(f"Could not save extracted text for '{filename}': {e_text_save}")

            chunks = text_splitter.split_text(text_content)
            if not chunks:
                logger.warning(f"No chunks generated from {filename}. Skipping.")
                continue

            for i, chunk_text in enumerate(chunks):
                all_chunks_for_json.append({
                    "page_content": chunk_text,
                    "metadata": {
                        "source_document_name": filename,
                        "chunk_index": i,
                        "full_location": f"{filename}, Chunk {i+1}",
                    },
                })
            processed_files_count += 1
        else:
            logger.warning(f"Could not extract text from {filename}. Skipping.")

    if not all_chunks_for_json:
        logger.warning(f"No processable documents found or no text extracted in '{sources_dir}'. JSON file will be empty.")

    # BUG FIX: dirname('') is empty when output_file is a bare filename and
    # os.makedirs('') raises; only create the parent directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_chunks_for_json, f, indent=2)

    logger.info(f"Chunking complete. Processed {processed_files_count} files.")
    logger.info(f"Created a total of {len(all_chunks_for_json)} chunks.")
    logger.info(f"Chunked JSON output saved to: {output_file}")
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def main():
    """CLI entry point: parse arguments and run the chunking pipeline."""
    parser = argparse.ArgumentParser(
        description="Process source documents into a JSON file of text chunks for RAG."
    )
    parser.add_argument('--sources-dir', type=str, required=True,
                        help="The directory containing source files (PDFs, DOCX, TXT).")
    parser.add_argument('--output-file', type=str, required=True,
                        help="The full path for the output JSON file containing the chunks.")
    parser.add_argument('--text-output-dir', type=str, default=None,
                        help="Optional: The directory to save raw extracted text files for debugging.")
    parser.add_argument('--chunk-size', type=int, default=1000,
                        help="The character size for each text chunk.")
    parser.add_argument('--chunk-overlap', type=int, default=150,
                        help="The character overlap between consecutive chunks.")
    args = parser.parse_args()

    try:
        process_sources_and_create_chunks(
            sources_dir=args.sources_dir,
            output_file=args.output_file,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap,
            text_output_dir=args.text_output_dir,
        )
    except Exception as e:
        logger.critical(f"A critical error occurred during the chunking process: {e}", exc_info=True)
        exit(1)


if __name__ == "__main__":
    main()
|
env
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
FLASK_HOST=0.0.0.0
|
| 3 |
+
FLASK_PORT=7860
|
| 4 |
+
# FLASK_DEBUG="False" # Gradio uses debug in launch()
|
| 5 |
+
|
| 6 |
+
# --- Groq LLM Models ---
|
| 7 |
+
GROQ_FALLBACK_MODEL=qwen/qwen3-32b
|
| 8 |
+
GROQ_AUXILIARY_MODEL=llama3-8b-8192
|
| 9 |
+
|
| 10 |
+
RAG_LLM_MODEL=qwen/qwen3-32b
|
| 11 |
+
RAG_TEMPERATURE=0.1
|
| 12 |
+
|
| 13 |
+
# --- RAG System Configuration ---
|
| 14 |
+
# Embedding model for RAG
|
| 15 |
+
RAG_EMBEDDING_MODEL=all-MiniLM-L6-v2
|
| 16 |
+
|
| 17 |
+
# Whether to use GPU for RAG embeddings (if available and faiss-gpu is installed)
|
| 18 |
+
|
| 19 |
+
RAG_EMBEDDING_GPU=false
|
| 20 |
+
|
| 21 |
+
# Whether to attempt loading an existing RAG index on startup.
|
| 22 |
+
RAG_LOAD_INDEX=true
|
| 23 |
+
|
| 24 |
+
# Default number of documents the RAG retriever should fetch.
|
| 25 |
+
RAG_RETRIEVER_K=5
|
| 26 |
+
|
| 27 |
+
GDRIVE_SOURCES_ENABLED=False
|
| 28 |
+
GDRIVE_FOLDER_URL=&HBGGtZ4TJA
|
| 29 |
+
|
| 30 |
+
LLM_FORMATTER_CONFIDENCE_THRESHOLD=95
|
| 31 |
+
CHAT_HISTORY_TO_SEND=5
|
faiss_storage/faiss_index/index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c627f912c7eead10f1ed14a68211eaad41b98e8920deb999f79f8671dc01979
|
| 3 |
+
size 3640365
|
faiss_storage/faiss_index/index.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7f4e06d1c5d0de27cae7d9a9328c94073faa58705ceea28242166dca11c581e
|
| 3 |
+
size 2569631
|
gitattributes
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
faiss_storage/faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
sources/Endodontics[[:space:]]appendix[[:space:]]4.pdf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
sources/Endodontics[[:space:]]cap[[:space:]]1.pdf filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
sources/Endodontics[[:space:]]cap[[:space:]]10.pdf filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
sources/Endodontics[[:space:]]cap[[:space:]]11.pdf filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
sources/Endodontics[[:space:]]cap[[:space:]]12.pdf filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
sources/Endodontics[[:space:]]cap[[:space:]]13.pdf filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
sources/Endodontics[[:space:]]cap[[:space:]]14.pdf filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
sources/Endodontics[[:space:]]cap[[:space:]]15.pdf filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
sources/Endodontics[[:space:]]cap[[:space:]]16.pdf filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
sources/Endodontics[[:space:]]cap[[:space:]]17.pdf filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
sources/Endodontics[[:space:]]cap[[:space:]]18.pdf filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
sources/Endodontics[[:space:]]cap[[:space:]]19.pdf filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
sources/Endodontics[[:space:]]cap[[:space:]]2.pdf filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
sources/Endodontics[[:space:]]cap[[:space:]]20.pdf filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
sources/Endodontics[[:space:]]cap[[:space:]]21.pdf filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
sources/Endodontics[[:space:]]cap[[:space:]]22.pdf filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
sources/Endodontics[[:space:]]cap[[:space:]]3.pdf filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
sources/Endodontics[[:space:]]cap[[:space:]]4.pdf filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
sources/Endodontics[[:space:]]cap[[:space:]]5.pdf filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
sources/Endodontics[[:space:]]cap[[:space:]]6.pdf filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
sources/Endodontics[[:space:]]cap[[:space:]]7.pdf filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
sources/Endodontics[[:space:]]cap[[:space:]]8.pdf filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
sources/Endodontics[[:space:]]cap[[:space:]]9.pdf filter=lfs diff=lfs merge=lfs -text
|
llm_handling.py
ADDED
|
@@ -0,0 +1,542 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import json
|
| 4 |
+
from typing import List, Dict, Tuple, Optional, Any, Iterator
|
| 5 |
+
import shutil
|
| 6 |
+
import re
|
| 7 |
+
import time
|
| 8 |
+
import requests
|
| 9 |
+
import zipfile
|
| 10 |
+
import tempfile
|
| 11 |
+
import gdown
|
| 12 |
+
|
| 13 |
+
import torch
|
| 14 |
+
from sentence_transformers import SentenceTransformer
|
| 15 |
+
from pypdf import PdfReader
|
| 16 |
+
import docx as python_docx
|
| 17 |
+
|
| 18 |
+
from llama_index.core.llms import ChatMessage
|
| 19 |
+
from llama_index.llms.groq import Groq as LlamaIndexGroqClient
|
| 20 |
+
|
| 21 |
+
from langchain_groq import ChatGroq
|
| 22 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 23 |
+
from langchain_community.vectorstores import FAISS
|
| 24 |
+
from langchain.prompts import ChatPromptTemplate
|
| 25 |
+
from langchain.schema import Document, BaseRetriever
|
| 26 |
+
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
|
| 27 |
+
from langchain.schema.runnable import RunnablePassthrough, RunnableParallel
|
| 28 |
+
from langchain.schema.output_parser import StrOutputParser
|
| 29 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 30 |
+
# MODIFIED: Import the new prompt
|
| 31 |
+
from system_prompts import RAG_SYSTEM_PROMPT, FALLBACK_SYSTEM_PROMPT, QA_FORMATTER_PROMPT
|
| 32 |
+
|
| 33 |
+
logger = logging.getLogger(__name__)
# Configure root logging only when no handler has been installed yet, so this
# module does not clobber the host application's logging configuration.
if not logger.handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )
|
| 39 |
+
|
| 40 |
+
# --- Configuration Constants ---
# Groq API key; every LLM call depends on it, so its absence is fatal in practice.
GROQ_API_KEY = os.getenv('BOT_API_KEY')
if not GROQ_API_KEY:
    logger.critical("CRITICAL: BOT_API_KEY environment variable not found. Services will fail.")

# Chat model used when the RAG path cannot answer.
FALLBACK_LLM_MODEL_NAME = os.getenv("GROQ_FALLBACK_MODEL", "llama-3.1-70b-versatile")
# ADDED: New constant for the auxiliary model (smaller/cheaper helper model).
AUXILIARY_LLM_MODEL_NAME = os.getenv("GROQ_AUXILIARY_MODEL", "llama3-8b-8192")
# All filesystem paths are resolved relative to this module's directory.
_MODULE_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
RAG_FAISS_INDEX_SUBDIR_NAME = "faiss_index"
# Parent directory holding the FAISS index; overridable via RAG_STORAGE_DIR.
RAG_STORAGE_PARENT_DIR = os.getenv("RAG_STORAGE_DIR", os.path.join(_MODULE_BASE_DIR, "faiss_storage"))
# Directory of raw source documents; overridable via SOURCES_DIR.
RAG_SOURCES_DIR = os.getenv("SOURCES_DIR", os.path.join(_MODULE_BASE_DIR, "sources"))
RAG_CHUNKED_SOURCES_FILENAME = "pre_chunked_sources.json"
os.makedirs(RAG_SOURCES_DIR, exist_ok=True)
os.makedirs(RAG_STORAGE_PARENT_DIR, exist_ok=True)
# Sentence-transformers model used for embeddings.
RAG_EMBEDDING_MODEL_NAME = os.getenv("RAG_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
# Case-insensitive "true" enables GPU embeddings (requires CUDA availability).
RAG_EMBEDDING_USE_GPU = os.getenv("RAG_EMBEDDING_GPU", "False").lower() == "true"
RAG_LLM_MODEL_NAME = os.getenv("RAG_LLM_MODEL", "llama-3.1-70b-versatile")
RAG_LLM_TEMPERATURE = float(os.getenv("RAG_TEMPERATURE", 0.0))
# Whether to load an existing FAISS index at startup instead of rebuilding.
RAG_LOAD_INDEX_ON_STARTUP = os.getenv("RAG_LOAD_INDEX", "True").lower() == "true"
# Default number of documents the retriever fetches per query.
RAG_DEFAULT_RETRIEVER_K = int(os.getenv("RAG_RETRIEVER_K", 3))
# Optional Google Drive source download support.
GDRIVE_SOURCES_ENABLED = os.getenv("GDRIVE_SOURCES_ENABLED", "False").lower() == "true"
GDRIVE_FOLDER_ID_OR_URL = os.getenv("GDRIVE_FOLDER_URL")
|
| 63 |
+
|
| 64 |
+
# --- Text Extraction Helper Function ---
|
| 65 |
+
def extract_text_from_file(file_path: str, file_type: str) -> Optional[str]:
    """Extract plain text from a PDF, DOCX, or TXT file; None on failure
    or when the type is unsupported."""
    logger.info(f"Extracting text from {file_type.upper()} file: {os.path.basename(file_path)}")
    try:
        if file_type == 'pdf':
            pages = PdfReader(file_path).pages
            # Concatenate non-empty pages, newline-terminated.
            return "".join(page.extract_text() + "\n" for page in pages if page.extract_text())
        if file_type == 'docx':
            paragraphs = python_docx.Document(file_path).paragraphs
            return "\n".join(para.text for para in paragraphs if para.text)
        if file_type == 'txt':
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
                return fh.read()
        logger.warning(f"Unsupported file type for text extraction: {file_type}")
        return None
    except Exception as e:
        logger.error(f"Error extracting text from {os.path.basename(file_path)}: {e}", exc_info=True)
        return None
|
| 82 |
+
|
| 83 |
+
# Extensions the FAISS RAG pipeline ingests (maps extension to extractor tag).
FAISS_RAG_SUPPORTED_EXTENSIONS = {ext: ext for ext in ('pdf', 'docx', 'txt')}
|
| 84 |
+
|
| 85 |
+
# --- FAISS RAG System ---
|
| 86 |
+
class FAISSRetrieverWithScore(BaseRetriever):
    """Retriever that records each hit's FAISS similarity score in its metadata."""

    vectorstore: FAISS
    k: int = RAG_DEFAULT_RETRIEVER_K

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        hits = self.vectorstore.similarity_search_with_score(query, k=self.k)
        results: List[Document] = []
        for document, similarity in hits:
            # Expose the raw score so downstream code can threshold/rank on it.
            document.metadata["retrieval_score"] = float(similarity)
            results.append(document)
        return results
|
| 99 |
+
|
| 100 |
+
class KnowledgeRAG:
|
| 101 |
+
def __init__(
|
| 102 |
+
self,
|
| 103 |
+
index_storage_dir: str,
|
| 104 |
+
embedding_model_name: str,
|
| 105 |
+
groq_model_name_for_rag: str,
|
| 106 |
+
use_gpu_for_embeddings: bool,
|
| 107 |
+
groq_api_key_for_rag: str,
|
| 108 |
+
temperature: float,
|
| 109 |
+
):
|
| 110 |
+
self.logger = logging.getLogger(__name__ + ".KnowledgeRAG")
|
| 111 |
+
self.index_storage_dir = index_storage_dir
|
| 112 |
+
self.embedding_model_name = embedding_model_name
|
| 113 |
+
self.groq_model_name = groq_model_name_for_rag
|
| 114 |
+
self.temperature = temperature
|
| 115 |
+
|
| 116 |
+
device = "cuda" if use_gpu_for_embeddings and torch.cuda.is_available() else "cpu"
|
| 117 |
+
self.logger.info(f"Initializing Hugging Face embedding model: {self.embedding_model_name} on device: {device}")
|
| 118 |
+
try:
|
| 119 |
+
self.embeddings = HuggingFaceEmbeddings(
|
| 120 |
+
model_name=self.embedding_model_name,
|
| 121 |
+
model_kwargs={"device": device},
|
| 122 |
+
encode_kwargs={"normalize_embeddings": True}
|
| 123 |
+
)
|
| 124 |
+
except Exception as e:
|
| 125 |
+
self.logger.critical(f"Failed to load embedding model: {e}", exc_info=True)
|
| 126 |
+
raise
|
| 127 |
+
|
| 128 |
+
self.logger.info(f"Initializing Langchain ChatGroq LLM for RAG: {self.groq_model_name}")
|
| 129 |
+
if not groq_api_key_for_rag:
|
| 130 |
+
raise ValueError("Groq API Key for RAG is missing.")
|
| 131 |
+
try:
|
| 132 |
+
self.llm = ChatGroq(
|
| 133 |
+
temperature=self.temperature,
|
| 134 |
+
groq_api_key=groq_api_key_for_rag,
|
| 135 |
+
model_name=self.groq_model_name
|
| 136 |
+
)
|
| 137 |
+
except Exception as e:
|
| 138 |
+
self.logger.critical(f"Failed to initialize Langchain ChatGroq LLM: {e}", exc_info=True)
|
| 139 |
+
raise
|
| 140 |
+
|
| 141 |
+
self.vector_store: Optional[FAISS] = None
|
| 142 |
+
self.retriever: Optional[FAISSRetrieverWithScore] = None
|
| 143 |
+
self.rag_chain = None
|
| 144 |
+
self.processed_source_files: List[str] = []
|
| 145 |
+
|
| 146 |
+
def build_index_from_source_files(self, source_folder_path: str, k: int = RAG_DEFAULT_RETRIEVER_K):
    """Build the FAISS index from source documents and persist it to disk.

    Prefers a pre-chunked JSON file (RAG_CHUNKED_SOURCES_FILENAME) inside the
    index storage directory; falls back to splitting raw PDF/DOCX/TXT files
    found in `source_folder_path`. On success the vector store is saved to
    disk, the retriever is (re)created with `k`, and the RAG chain is set up.

    Args:
        source_folder_path: Folder containing raw source documents.
        k: Number of chunks the retriever should return per query.
    """
    all_docs_for_vectorstore: List[Document] = []
    processed_files_this_build: List[str] = []
    pre_chunked_json_path = os.path.join(self.index_storage_dir, RAG_CHUNKED_SOURCES_FILENAME)

    if os.path.exists(pre_chunked_json_path):
        self.logger.info(f"Loading documents from pre-chunked file: {pre_chunked_json_path}")
        try:
            with open(pre_chunked_json_path, 'r', encoding='utf-8') as f:
                chunk_data_list = json.load(f)
            source_filenames = set()
            for chunk_data in chunk_data_list:
                doc = Document(page_content=chunk_data.get("page_content", ""), metadata=chunk_data.get("metadata", {}))
                all_docs_for_vectorstore.append(doc)
                if 'source_document_name' in doc.metadata:
                    source_filenames.add(doc.metadata['source_document_name'])
            processed_files_this_build = sorted(list(source_filenames))
        except Exception as e:
            self.logger.error(f"Error processing pre-chunked JSON, falling back to raw files: {e}")
            # Discard any partially-loaded docs so the raw-file path below runs cleanly.
            all_docs_for_vectorstore.clear()

    if not all_docs_for_vectorstore:
        self.logger.info(f"Processing raw files from '{source_folder_path}' to build index.")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        for filename in os.listdir(source_folder_path):
            file_path = os.path.join(source_folder_path, filename)
            file_ext = filename.split('.')[-1].lower()
            if os.path.isfile(file_path) and file_ext in FAISS_RAG_SUPPORTED_EXTENSIONS:
                text_content = extract_text_from_file(file_path, file_ext)
                if text_content:
                    chunks = text_splitter.split_text(text_content)
                    for i, chunk_text in enumerate(chunks):
                        metadata = {"source_document_name": filename, "chunk_index": i}
                        all_docs_for_vectorstore.append(Document(page_content=chunk_text, metadata=metadata))
                    processed_files_this_build.append(filename)

        if not all_docs_for_vectorstore:
            self.logger.warning(f"No processable PDF/DOCX/TXT documents found in '{source_folder_path}'. RAG index will only contain other sources if available.")

    self.processed_source_files = processed_files_this_build

    # This print statement is kept for console visibility on startup/rebuild
    print("\n--- Document Files Used for RAG Index ---")
    if self.processed_source_files:
        for filename in self.processed_source_files:
            # BUG FIX: previously printed a hard-coded placeholder ("(unknown)")
            # instead of the actual file name, leaving the loop variable unused.
            print(f"- {filename}")
    else:
        print("No PDF/DOCX/TXT source files were processed for the RAG index.")
    print("---------------------------------------\n")

    if not all_docs_for_vectorstore:
        self.logger.warning("No documents to build FAISS index from. Skipping FAISS build.")
        return

    self.logger.info(f"Creating FAISS index from {len(all_docs_for_vectorstore)} document chunks...")
    self.vector_store = FAISS.from_documents(all_docs_for_vectorstore, self.embeddings)
    faiss_index_path = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
    self.vector_store.save_local(faiss_index_path)
    self.logger.info(f"FAISS index built and saved to '{faiss_index_path}'.")
    self.retriever = FAISSRetrieverWithScore(vectorstore=self.vector_store, k=k)
    self.setup_rag_chain()
|
| 208 |
+
|
| 209 |
+
def load_index_from_disk(self, k: int = RAG_DEFAULT_RETRIEVER_K):
    """Load a previously saved FAISS index and wire up the retriever and chain.

    Args:
        k: Number of chunks the retriever should return per query.

    Raises:
        FileNotFoundError: If no saved index directory exists on disk.
    """
    index_dir = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
    if not os.path.isdir(index_dir):
        raise FileNotFoundError(f"FAISS index directory not found at '{index_dir}'.")

    self.logger.info(f"Loading FAISS index from: {index_dir}")
    # allow_dangerous_deserialization is required because FAISS metadata is
    # pickled; only load indexes this application wrote itself.
    self.vector_store = FAISS.load_local(
        folder_path=index_dir,
        embeddings=self.embeddings,
        allow_dangerous_deserialization=True,
    )
    self.retriever = FAISSRetrieverWithScore(vectorstore=self.vector_store, k=k)
    self.setup_rag_chain()
|
| 222 |
+
|
| 223 |
+
def format_docs(self, docs: List[Document]) -> str:
    """Render retrieved chunks as one context string, separated by '---' rules.

    Each chunk is labelled with its source document name and chunk index
    (falling back to 'N/A' when the metadata is missing).
    """
    rendered = []
    for doc in docs:
        source_name = doc.metadata.get('source_document_name', 'N/A')
        chunk_idx = doc.metadata.get('chunk_index', 'N/A')
        rendered.append(f"[Excerpt from {source_name}, Chunk {chunk_idx}]\nContent:\n{doc.page_content}")
    return "\n\n---\n\n".join(rendered)
|
| 225 |
+
|
| 226 |
+
def setup_rag_chain(self):
    """Assemble the LCEL pipeline: retrieve -> format -> prompt -> LLM -> string.

    The chain's input is the raw question string; `RunnableParallel` feeds it
    both to the retriever (whose documents `format_docs` flattens into the
    prompt's {context}) and straight through as {question}.

    Raises:
        RuntimeError: If the retriever or LLM has not been initialized yet.
    """
    if not self.retriever or not self.llm:
        raise RuntimeError("Retriever and LLM must be initialized.")

    # RAG_SYSTEM_PROMPT carries the {context} and {question} placeholders.
    prompt = ChatPromptTemplate.from_template(RAG_SYSTEM_PROMPT)

    self.rag_chain = (
        RunnableParallel(
            context=(self.retriever | self.format_docs),
            question=RunnablePassthrough()
        )
        | prompt
        | self.llm
        | StrOutputParser()
    )
    self.logger.info("RAG LCEL chain set up successfully with dental assistant persona.")
|
| 242 |
+
|
| 243 |
+
def invoke(self, query: str, top_k: Optional[int] = None) -> Dict[str, Any]:
    """Answer `query` through the RAG chain and return a structured result.

    Args:
        query: The user's question.
        top_k: Optional retriever-k override; falls back to the retriever's
            configured k when None or non-positive.

    Returns:
        Dict with 'answer', 'source' (one of 'bibliography_based',
        'no_answer_in_bibliography', 'no_docs_found', 'system_error'),
        'cited_source_details' (per-chunk name/index/score dicts), and
        'query' on success.
    """
    if not self.rag_chain:
        # MODIFIED: Changed severity
        self.logger.warning("RAG system not fully initialized. Cannot invoke.")
        return {"answer": "The provided bibliography does not contain specific information on this topic.", "source": "system_error", "cited_source_details": []}

    if not query or not query.strip():
        return {"answer": "Please provide a valid question.", "source": "system_error", "cited_source_details": []}

    k_to_use = top_k if top_k is not None and top_k > 0 else self.retriever.k
    self.logger.info(f"Processing RAG query with k={k_to_use}: '{query[:100]}...'")

    # Temporarily override the retriever's k; restored in the finally block.
    original_k = self.retriever.k
    if k_to_use != original_k:
        self.retriever.k = k_to_use

    try:
        # NOTE(review): documents are retrieved twice — once here (for logging
        # and citations) and again inside the chain's RunnableParallel.
        retrieved_docs = self.retriever.get_relevant_documents(query)
        if not retrieved_docs:
            return {"answer": "The provided bibliography does not contain specific information on this topic.", "source": "no_docs_found", "cited_source_details": []}

        context_str = self.format_docs(retrieved_docs)

        # MODIFIED: Added full logging as per user request
        print(f"\n--- RAG INVOKE ---")
        print(f"QUESTION: {query}")
        print(f"CONTEXT:\n{context_str}")

        # NOTE(review): the 'context' config key is presumably not consumed by
        # the LCEL chain (which recomputes context via the retriever) — verify.
        llm_answer = self.rag_chain.invoke(query, config={"context": context_str})

        print(f"LLM_ANSWER: {llm_answer}")
        print(f"--------------------\n")

        structured_sources = [{
            "source_document_name": doc.metadata.get('source_document_name', 'Unknown'),
            "chunk_index": doc.metadata.get('chunk_index', 'N/A'),
            "retrieval_score": doc.metadata.get("retrieval_score"),
        } for doc in retrieved_docs]

        # Detect the model's canned "no answer" phrasing to tag the response.
        if "the provided bibliography does not contain specific information" in llm_answer.lower():
            final_answer = llm_answer
            source_tag = "no_answer_in_bibliography"
        else:
            final_answer = f"{llm_answer}\n\n*Source: Bibliography-Based*"
            source_tag = "bibliography_based"

        return {
            "query": query,
            "answer": final_answer.strip(),
            "source": source_tag,
            "cited_source_details": structured_sources,
        }

    except Exception as e:
        self.logger.error(f"Error during RAG query processing: {e}", exc_info=True)
        return {"answer": "An error occurred while processing your request.", "source": "system_error", "cited_source_details": []}
    finally:
        if k_to_use != original_k:
            self.retriever.k = original_k
|
| 302 |
+
|
| 303 |
+
def stream(self, query: str, top_k: Optional[int] = None) -> Iterator[str]:
    """Stream the RAG answer for `query` as text chunks.

    Mirrors invoke() but yields text deltas instead of returning a dict;
    error and "no documents" cases are yielded as single plain-text messages.

    Args:
        query: The user's question.
        top_k: Optional retriever-k override; falls back to the retriever's
            configured k when None or non-positive.
    """
    if not self.rag_chain:
        self.logger.error("RAG system not fully initialized for streaming.")
        yield "Error: RAG system is not ready."
        return

    k_to_use = top_k if top_k is not None and top_k > 0 else self.retriever.k
    self.logger.info(f"Processing RAG stream with k={k_to_use}: '{query[:100]}...'")

    # Temporarily override the retriever's k; restored in the finally block.
    original_k = self.retriever.k
    if k_to_use != original_k:
        self.retriever.k = k_to_use

    try:
        # Check for docs first to avoid streaming "no info" message
        # NOTE(review): as in invoke(), retrieval runs twice — here and inside the chain.
        retrieved_docs = self.retriever.get_relevant_documents(query)
        if not retrieved_docs:
            yield "The provided bibliography does not contain specific information on this topic."
            return

        # MODIFIED: Added full logging for streaming as per user request
        context_str = self.format_docs(retrieved_docs)
        print(f"\n--- RAG STREAM ---")
        print(f"QUESTION: {query}")
        print(f"CONTEXT:\n{context_str}")
        print(f"STREAMING LLM_ANSWER...")
        print(f"--------------------\n")

        yield from self.rag_chain.stream(query, config={"context": context_str})
    except Exception as e:
        self.logger.error(f"Error during RAG stream processing: {e}", exc_info=True)
        yield "An error occurred while processing your request."
    finally:
        if k_to_use != original_k:
            self.retriever.k = original_k
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
# --- Groq Fallback Bot ---
class GroqBot:
    """Fallback chatbot that answers via the Groq API when RAG cannot.

    Uses the LlamaIndex Groq client with FALLBACK_LLM_MODEL_NAME and the
    FALLBACK_SYSTEM_PROMPT persona. When no API key is configured (or client
    construction fails), the bot stays disabled (self.client is None) and
    stream_response() yields a canned apology.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__ + ".GroqBot")
        if not GROQ_API_KEY:
            # Disabled mode: stream_response() will short-circuit.
            self.client = None
            self.logger.critical("GroqBot not initialized: BOT_API_KEY is missing.")
            return
        try:
            self.client = LlamaIndexGroqClient(model=FALLBACK_LLM_MODEL_NAME, api_key=GROQ_API_KEY)
            self.system_prompt = FALLBACK_SYSTEM_PROMPT
        except Exception as e:
            self.logger.error(f"Failed to initialize LlamaIndexGroqClient for Fallback Bot: {e}", exc_info=True)
            self.client = None

    def stream_response(self, context: dict) -> Iterator[str]:
        """Stream a fallback answer built from the given conversation context.

        Args:
            context: Dict with optional keys 'current_query' (str),
                'chat_history' (list of role/content dicts passed to
                ChatMessage(**msg)), and 'qa_related_info' (str).

        Yields:
            Response text deltas, or a single error message on failure.
        """
        if not self.client:
            yield "The system is currently unable to process this request."
            return

        current_query = context.get('current_query', '')
        chat_history = context.get('chat_history', [])
        qa_info = context.get('qa_related_info', '')

        # Message order: system persona, prior turns, optional Q&A hints, then the query.
        messages = [ChatMessage(role="system", content=self.system_prompt)]
        if chat_history:
            messages.extend([ChatMessage(**msg) for msg in chat_history])
        if qa_info:
            messages.append(ChatMessage(role="system", content=f"**Potentially Relevant Q&A Information from other sources:**\n{qa_info}"))
        messages.append(ChatMessage(role="user", content=f"**Current User Query:**\n{current_query}"))

        # MODIFIED: Added full logging as per user request
        # The conversion to dict is necessary because ChatMessage is not directly JSON serializable
        # NOTE(review): .dict() is deprecated in pydantic v2 (use .model_dump()) — confirm installed version.
        messages_for_print = [msg.dict() for msg in messages]
        print(f"\n--- FALLBACK STREAM ---")
        print(f"MESSAGES SENT TO LLM:\n{json.dumps(messages_for_print, indent=2)}")
        print(f"STREAMING LLM_ANSWER...")
        print(f"-----------------------\n")

        try:
            response_stream = self.client.stream_chat(messages)
            for r_chunk in response_stream:
                yield r_chunk.delta
        except Exception as e:
            self.logger.error(f"Groq API error in get_response (Fallback): {e}", exc_info=True)
            yield "I am currently unable to process this request due to a technical issue."
|
| 386 |
+
|
| 387 |
+
# ADDED: New function for formatting QA answers
def get_answer_from_context(question: str, context: str, system_prompt: str) -> str:
    """Calls the LLM with a specific question and context from a QA source (CSV/XLSX).

    Builds a one-shot prompt -> LLM -> string chain on the auxiliary model and
    returns the stripped reply, or a fixed apology string on any failure.
    """
    logger.info(f"Formatting answer for question '{question[:50]}...' using QA context.")
    try:
        # Use the auxiliary model for this task for speed and cost-efficiency
        formatter_llm = ChatGroq(
            temperature=0.1,
            groq_api_key=GROQ_API_KEY,
            model_name=AUXILIARY_LLM_MODEL_NAME,
        )
        qa_chain = ChatPromptTemplate.from_template(system_prompt) | formatter_llm | StrOutputParser()

        # MODIFIED: Added full logging as per user request
        print(f"\n--- QA FORMATTER ---")
        print(f"QUESTION: {question}")
        print(f"CONTEXT:\n{context}")

        llm_reply = qa_chain.invoke({"context": context, "question": question})

        print(f"LLM_ANSWER: {llm_reply}")
        print(f"--------------------\n")

        return llm_reply.strip()
    except Exception as e:
        logger.error(f"Error in get_answer_from_context: {e}", exc_info=True)
        return "Sorry, I was unable to formulate an answer based on the available information."
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
# --- Initialization and Interface Functions ---
|
| 426 |
+
def get_id_from_gdrive_input(url_or_id: str) -> Optional[str]:
    """Extract a Google Drive folder/file ID from a URL, or accept a bare ID.

    Recognized URL shapes: .../folders/<id>, .../d/<id>, ...?id=<id>.
    A string with no '/' or '=' and more than 15 characters is treated as a
    bare ID. Returns None for empty input or anything unrecognized.
    """
    if not url_or_id:
        return None
    # Try the known URL shapes first.
    for url_pattern in (r"/folders/([a-zA-Z0-9_-]+)", r"/d/([a-zA-Z0-9_-]+)", r"id=([a-zA-Z0-9_-]+)"):
        found = re.search(url_pattern, url_or_id)
        if found is not None:
            return found.group(1)
    # Otherwise accept a long, separator-free token as a bare ID.
    looks_like_bare_id = "/" not in url_or_id and "=" not in url_or_id and len(url_or_id) > 15
    return url_or_id if looks_like_bare_id else None
|
| 435 |
+
|
| 436 |
+
def download_and_unzip_gdrive_folder(folder_id_or_url: str, target_dir: str) -> bool:
    """Download a Google Drive folder with gdown and copy its contents to `target_dir`.

    The folder is first fetched into a temporary directory; if gdown wrapped
    everything in a single top-level subfolder, that subfolder's contents are
    used instead. Any existing `target_dir` is deleted and replaced.

    Args:
        folder_id_or_url: Drive folder ID or any URL accepted by
            get_id_from_gdrive_input().
        target_dir: Destination directory (overwritten if present).

    Returns:
        True on success, False on an invalid ID or any download/copy failure.
    """
    folder_id = get_id_from_gdrive_input(folder_id_or_url)
    if not folder_id:
        logger.error(f"Invalid Google Drive Folder ID or URL: {folder_id_or_url}")
        return False

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            logger.info(f"Attempting to download GDrive folder ID: {folder_id}")
            download_path = gdown.download_folder(id=folder_id, output=temp_dir, quiet=False, use_cookies=False)
            if not download_path or not os.listdir(temp_dir):
                logger.error("gdown failed to download or extract the folder.")
                return False

            # If gdown created a single wrapper directory, descend into it.
            source_content_root = temp_dir
            items_in_temp = os.listdir(temp_dir)
            if len(items_in_temp) == 1 and os.path.isdir(os.path.join(temp_dir, items_in_temp[0])):
                source_content_root = os.path.join(temp_dir, items_in_temp[0])

            logger.info(f"Moving contents from {source_content_root} to {target_dir}")
            if os.path.exists(target_dir):
                shutil.rmtree(target_dir)
            shutil.copytree(source_content_root, target_dir)
            logger.info(f"Successfully moved GDrive contents to {target_dir}")
            return True
        except Exception as e:
            # MODIFIED: Corrected self.logger to logger
            logger.error(f"Error during GDrive download/processing: {e}", exc_info=True)
            return False
|
| 465 |
+
|
| 466 |
+
def initialize_and_get_rag_system(force_rebuild: bool = False) -> Optional[KnowledgeRAG]:
    """Create the KnowledgeRAG instance: fetch sources, then load or build the index.

    Steps: optionally refresh the sources folder from Google Drive, optionally
    wipe the stored index (force_rebuild), then either load the FAISS index
    from disk (when RAG_LOAD_INDEX_ON_STARTUP allows) or rebuild it from the
    source files.

    Args:
        force_rebuild: When True, delete the stored index and rebuild from scratch.

    Returns:
        A ready KnowledgeRAG instance, or None when the API key is missing or
        initialization fails.
    """
    if not GROQ_API_KEY:
        logger.error("RAG system cannot be initialized without BOT_API_KEY.")
        return None

    if GDRIVE_SOURCES_ENABLED and GDRIVE_FOLDER_ID_OR_URL:
        logger.info("Google Drive sources enabled. Downloading...")
        if os.path.isdir(RAG_SOURCES_DIR):
            logger.info(f"Clearing existing RAG sources directory: {RAG_SOURCES_DIR}")
            shutil.rmtree(RAG_SOURCES_DIR)
        os.makedirs(RAG_SOURCES_DIR)

        download_successful = download_and_unzip_gdrive_folder(GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR)
        if not download_successful:
            # Best-effort: keep going with whatever local files exist.
            logger.error("Failed to download sources from Google Drive. Using local files if available.")

    # NOTE(review): faiss_index_path appears unused in this function — confirm.
    faiss_index_path = os.path.join(RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME)
    if force_rebuild and os.path.exists(RAG_STORAGE_PARENT_DIR):
        logger.info(f"Force Rebuild: Deleting existing index storage directory at '{RAG_STORAGE_PARENT_DIR}'")
        shutil.rmtree(RAG_STORAGE_PARENT_DIR)
        os.makedirs(RAG_STORAGE_PARENT_DIR)

    try:
        rag_instance = KnowledgeRAG(
            index_storage_dir=RAG_STORAGE_PARENT_DIR,
            embedding_model_name=RAG_EMBEDDING_MODEL_NAME,
            groq_model_name_for_rag=RAG_LLM_MODEL_NAME,
            use_gpu_for_embeddings=RAG_EMBEDDING_USE_GPU,
            groq_api_key_for_rag=GROQ_API_KEY,
            temperature=RAG_LLM_TEMPERATURE,
        )

        # Prefer loading the persisted index; fall back to a fresh build.
        should_build = True
        if RAG_LOAD_INDEX_ON_STARTUP and not force_rebuild:
            try:
                rag_instance.load_index_from_disk(k=RAG_DEFAULT_RETRIEVER_K)
                logger.info("RAG index loaded successfully from disk.")
                should_build = False
            except FileNotFoundError:
                logger.warning("Index not found on disk. Will attempt to build.")
            except Exception as e:
                logger.error(f"Error loading index: {e}. Will attempt to rebuild.", exc_info=True)

        if should_build:
            logger.info("Building new RAG index from source data...")
            rag_instance.build_index_from_source_files(
                source_folder_path=RAG_SOURCES_DIR,
                k=RAG_DEFAULT_RETRIEVER_K
            )

        return rag_instance

    except Exception as e:
        logger.critical(f"FATAL: Failed to initialize RAG system: {e}", exc_info=True)
        return None
|
| 521 |
+
|
| 522 |
+
groq_bot_instance = GroqBot()
|
| 523 |
+
|
| 524 |
+
# ADDED: New function to handle auxiliary model calls (This function is no longer used, replaced by get_answer_from_context)
def get_auxiliary_chat_response(messages: List[Dict]) -> str:
    """Run an auxiliary-model chat completion (titles, follow-up questions, ...).

    A smaller, cheaper model than the main RAG one handles these side tasks.

    Args:
        messages: Chat messages in the format accepted by ChatGroq.invoke().

    Returns:
        The model's reply text, or a fixed apology string on any failure.
    """
    logger.info(f"Routing auxiliary request to model: {AUXILIARY_LLM_MODEL_NAME}")
    try:
        # Initialize a dedicated client for this call to use the specific auxiliary model
        auxiliary_llm = ChatGroq(
            temperature=0.2,  # A bit more creative than RAG, but still grounded
            groq_api_key=GROQ_API_KEY,
            model_name=AUXILIARY_LLM_MODEL_NAME,
        )
        return auxiliary_llm.invoke(messages).content
    except Exception as e:
        logger.error(f"Error with auxiliary model call: {e}", exc_info=True)
        return "Could not generate suggestions."
|
requirements.txt
CHANGED
|
@@ -1 +1,30 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask==3.0.3
|
| 2 |
+
Flask_Cors==5.0.0
|
| 3 |
+
numpy
|
| 4 |
+
pandas==2.2.3
|
| 5 |
+
#rapidfuzz==3.10.1
|
| 6 |
+
Requests==2.32.3
|
| 7 |
+
#scikit_learn==1.4.1.post1
|
| 8 |
+
#scikit_learn==1.5.2
|
| 9 |
+
psycopg2-binary==2.9.10
|
| 10 |
+
python-dotenv==1.0.1
|
| 11 |
+
apscheduler==3.11.0
|
| 12 |
+
redis==3.5.3
|
| 13 |
+
faiss-cpu==1.10.0
|
| 14 |
+
groq==0.15.0
|
| 15 |
+
llama_index==0.12.13
|
| 16 |
+
llama_index.llms.groq==0.3.1
|
| 17 |
+
#langchain_groq==0.2.4
|
| 18 |
+
#langchain_core==0.3.39
|
| 19 |
+
sentence_transformers==3.4.0
|
| 20 |
+
gunicorn
|
| 21 |
+
llama-index-embeddings-huggingface==0.5.4
|
| 22 |
+
onnxruntime==1.22.0
|
| 23 |
+
langchain-groq==0.3.2
|
| 24 |
+
python-docx==1.1.2
|
| 25 |
+
langchain_community==0.3.23
|
| 26 |
+
requests==2.32.3  # NOTE: duplicate of 'Requests==2.32.3' above — pip treats package names case-insensitively; keep only one pin
|
| 27 |
+
gdown==5.2.0
|
| 28 |
+
pymupdf==1.25.5
|
| 29 |
+
openpyxl==3.1.5
|
| 30 |
+
# must install https://aka.ms/vs/17/release/vc_redist.x64.exe
|
sources/Endodontics%20appendix%201.pdf
ADDED
|
Binary file (90.7 kB). View file
|
|
|
sources/Endodontics%20appendix%202.pdf
ADDED
|
Binary file (66.3 kB). View file
|
|
|
sources/Endodontics%20appendix%203.pdf
ADDED
|
Binary file (92 kB). View file
|
|
|
sources/Endodontics%20appendix%204.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8b972d927a7a01f466cfe30e24834e653009926891919df1078f3563b718d1b
|
| 3 |
+
size 188535
|
sources/Endodontics%20book.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:045c84eef3be88136f67bea03299a5e54ff121a017b8d6575d2fa7dd1269d460
|
| 3 |
+
size 19170850
|
sources/Endodontics%20cap%201.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2956cff53ab6755bd5769d96ea6e021d21992d66898fa7ad19d59a651db08552
|
| 3 |
+
size 1393891
|
sources/Endodontics%20cap%2010.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d60888b81a8ba43e14a85dee20a485120c6a168e75d8cb44f51f98ea1dc86c6
|
| 3 |
+
size 858903
|
sources/Endodontics%20cap%2011.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6cdfa6604cc9bdad78c8ffb4ec61e6ae5f12c3b75f0b52d45bab57c43983e676
|
| 3 |
+
size 1132015
|
sources/Endodontics%20cap%2012.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b42e9dca8977115f11342fd3d264576c5b7224a51b82f3646d3a766669417f91
|
| 3 |
+
size 1539100
|
sources/Endodontics%20cap%2013.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed0b9c196e3afe7477709996a261580721c42c391ecab6a0183ef4261b5a86fd
|
| 3 |
+
size 1625649
|
sources/Endodontics%20cap%2014.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92e0c462d8eb85d94b08ba84ad2270c86e87a678dea4be0d700f910af89c64e7
|
| 3 |
+
size 1357984
|
sources/Endodontics%20cap%2015.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:54d2a579eed28d2bb58c8aca669dc6baaad1824a156517212494b0557f4837d7
|
| 3 |
+
size 944072
|
sources/Endodontics%20cap%2016.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ab55472e1887827d5f55e51993ebb5ab9a031fafb6e81db258b82277c7ce4e5
|
| 3 |
+
size 681335
|
sources/Endodontics%20cap%2017.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f5c395668f607f37c3f67131bdb0f82bc792b3bdbabb03071abd623d066db27
|
| 3 |
+
size 528079
|
sources/Endodontics%20cap%2018.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8cb86f5fa0e481b2d91db3e93dcabe580a447630f7716d0f8309d5a018bbc64
|
| 3 |
+
size 746444
|
sources/Endodontics%20cap%2019.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5746dc123bc59f7373699d48e753ad05552acaa18c743dc5026990e603686b2
|
| 3 |
+
size 830885
|
sources/Endodontics%20cap%202.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b06e9d058a6f05afb04bfef398b8d0f36c02e4d78aba873dc294652c9fe2517c
|
| 3 |
+
size 612225
|
sources/Endodontics%20cap%2020.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:127d6f0efcac6fc7d12cbee72f9f0cbf05136be860e5837662037d267e8c621b
|
| 3 |
+
size 609023
|
sources/Endodontics%20cap%2021.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d3b2e5699b2a92f2bb5187e825aa5f11afc1d2ed54bce56e6ba56fab699f2a4
|
| 3 |
+
size 464443
|
sources/Endodontics%20cap%2022.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a9dee2fd5d783cdf4f53b8e3d79ab012369e1c55ade79c4e5ee7544c83d690f
|
| 3 |
+
size 561329
|
sources/Endodontics%20cap%203.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a79808a3da34f9817136e441254449dc5504f115cae1190505b8ce5f3c3f9b51
|
| 3 |
+
size 786616
|
sources/Endodontics%20cap%204.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3223d4b6c6ae9b1b66bd5f0ac750bf8e3a514e7941555632ad98e67202929993
|
| 3 |
+
size 810838
|
sources/Endodontics%20cap%205.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe63ba988c8c88207a7482c677983cd5f1a0b06cb5bc210b650a657b20924ebf
|
| 3 |
+
size 761083
|
sources/Endodontics%20cap%206.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c490f38f73cd72e8bde876bb082964d43d1080abd1593ab7c4e53ca97a4f492
|
| 3 |
+
size 1056526
|
sources/Endodontics%20cap%207.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23b1f2f03a024f0520a798a7e7bd2c128cc5eda3d929a6b56d9a2b9dde184d14
|
| 3 |
+
size 1619923
|
sources/Endodontics%20cap%208.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:159d99387578b0d9164e243ae27f7ee77ee998aa8c71dc15310e6ca274d96f7e
|
| 3 |
+
size 751157
|
sources/Endodontics%20cap%209.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db47bb570fa521c1e7462e07bd22f9ce38485c2b262b0c774d16ba7795844179
|
| 3 |
+
size 498083
|
sources/_%24preguntas%20chatbot_01.xlsx
ADDED
|
Binary file (165 Bytes). View file
|
|
|
sources/database.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Question,Answer,Image
|
sources/general_qa.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Question,Answer,Image
|
sources/greetings.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Question,Answer,Image
|
sources/personal_qa.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Question,Answer,Image
|
sources/preguntas chatbot_01.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65e4dfe9f1e7dad838718a016a486ca8a9e3c99068fcc22ae6e2e2064fd86943
|
| 3 |
+
size 156246
|
system_prompts.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
This module centralizes all system prompts for the specialized dental chatbot application.
|
| 4 |
+
This allows for easy management and updating of prompts without altering core logic.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
# --- RAG System Prompt for Bibliography-Based Answers ---
|
| 8 |
+
# This prompt instructs the LLM to answer based *only* on the context provided
|
| 9 |
+
# by the RAG system from scientific documents (PDFs, etc.).
|
| 10 |
+
# Placeholders {context} and {question} will be filled by the LangChain pipeline.
|
| 11 |
+
RAG_SYSTEM_PROMPT = """You are a specialized dental assistant AI. Your role is to provide accurate, evidence-based information on a specific dental topic.
|
| 12 |
+
|
| 13 |
+
**Your Task:**
|
| 14 |
+
Your primary task is to answer the user's question accurately and concisely, based *exclusively* on the "Provided Document Excerpts" below. These excerpts are from vetted scientific and dental publications.
|
| 15 |
+
|
| 16 |
+
**Provided Document Excerpts:**
|
| 17 |
+
{context}
|
| 18 |
+
|
| 19 |
+
**User Question:**
|
| 20 |
+
{question}
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
**Core Instructions:**
|
| 24 |
+
1. **Language:** Your default language is **Spanish**. But follow the language of user. If they ask question in Spanish, reply in Spanish. If they ask in English, reply in English, even if the context is Spanish.
|
| 25 |
+
2. **Strictly Adhere to Context:** Your answer **must** be derived solely from the "Provided Document Excerpts." Do not use any external knowledge or make assumptions beyond what is presented in the text.
|
| 26 |
+
3. **Professional Tone:** Maintain a clinical, objective, and professional tone suitable for a dental context.
|
| 27 |
+
4. **Do Not Speculate:** If the provided excerpts do not contain the information needed to answer the question, you must not invent an answer.
|
| 28 |
+
5. **Handling Unanswerable Questions:** If you cannot answer the question based on the provided excerpts, respond with: "The provided bibliography does not contain specific information on this topic." Do not attempt to guide the user elsewhere or apologize.
|
| 29 |
+
6. **No Self-Reference:** Do not mention that you are an AI, that you are "looking at documents," or refer to the "provided excerpts" in your final answer. Simply present the information as requested.
|
| 30 |
+
|
| 31 |
+
**Answer Format:**
|
| 32 |
+
Provide a direct answer to the user's question based on the information available.
|
| 33 |
+
|
| 34 |
+
**Answer:**"""
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# --- Fallback System Prompt for General/Triage Purposes ---
|
| 38 |
+
# REVISED: This prompt is now much stricter and will only handle dental-related queries.
|
| 39 |
+
FALLBACK_SYSTEM_PROMPT = """You are a specialized dental assistant AI. Your one and only role is to answer questions strictly related to dentistry.
|
| 40 |
+
|
| 41 |
+
**Core Instructions:**
|
| 42 |
+
1. **Dental Focus Only:** You MUST NOT engage in any general conversation, small talk, or answer questions outside the scope of dentistry.
|
| 43 |
+
2. **Handle Out-of-Scope Questions:** If the user's question is unrelated to dentistry, you must respond with the following exact phrase: "I am a dental assistant AI and my capabilities are limited to dental topics. Do you have a question about oral health?"
|
| 44 |
+
3. **Stateful Conversation:** Pay attention to the `Prior Conversation History` to understand the context of the user's dental inquiries.
|
| 45 |
+
4. **Professional Tone:** Always be polite, helpful, and professional.
|
| 46 |
+
5. **Do Not Make Up Clinical Advice:** Do not provide medical diagnoses or treatment plans. You can provide general information but should always recommend consulting a professional for personal health concerns.
|
| 47 |
+
|
| 48 |
+
**Response Guidance:**
|
| 49 |
+
- Review the `Prior Conversation History` to understand the context.
|
| 50 |
+
- Formulate a helpful, professional answer to the `Current User Query` if it is about dentistry.
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
# ADDED: New prompt to format answers based on structured data from CSV/XLSX files.
|
| 54 |
+
QA_FORMATTER_PROMPT = """You are a helpful assistant. You will be given a user's question and structured data from a database row that is highly relevant to the question.
|
| 55 |
+
Your task is to formulate a natural, conversational answer to the user's question based *only* on the provided data.
|
| 56 |
+
|
| 57 |
+
- Synthesize the information from the data fields into a coherent response.
|
| 58 |
+
- Do not just list the data. Create a proper sentence or paragraph.
|
| 59 |
+
- If the data contains a 'Fuente' or 'Source' field, cite it at the end of your answer like this: (Source: [source_value]).
|
| 60 |
+
|
| 61 |
+
**Provided Data:**
|
| 62 |
+
{context}
|
| 63 |
+
|
| 64 |
+
**User Question:**
|
| 65 |
+
{question}
|
| 66 |
+
|
| 67 |
+
**Answer:**"""
|