SergioI1991 commited on
Commit
606fa93
·
verified ·
1 Parent(s): dc940d4

Upload 43 files

Browse files
Files changed (44) hide show
  1. .gitattributes +25 -0
  2. Dockerfile +35 -0
  3. app.py +1037 -64
  4. chunker.py +189 -0
  5. env +31 -0
  6. faiss_storage/faiss_index/index.faiss +3 -0
  7. faiss_storage/faiss_index/index.pkl +3 -0
  8. gitattributes +59 -0
  9. llm_handling.py +542 -0
  10. requirements.txt +30 -1
  11. sources/Endodontics%20appendix%201.pdf +0 -0
  12. sources/Endodontics%20appendix%202.pdf +0 -0
  13. sources/Endodontics%20appendix%203.pdf +0 -0
  14. sources/Endodontics%20appendix%204.pdf +3 -0
  15. sources/Endodontics%20book.zip +3 -0
  16. sources/Endodontics%20cap%201.pdf +3 -0
  17. sources/Endodontics%20cap%2010.pdf +3 -0
  18. sources/Endodontics%20cap%2011.pdf +3 -0
  19. sources/Endodontics%20cap%2012.pdf +3 -0
  20. sources/Endodontics%20cap%2013.pdf +3 -0
  21. sources/Endodontics%20cap%2014.pdf +3 -0
  22. sources/Endodontics%20cap%2015.pdf +3 -0
  23. sources/Endodontics%20cap%2016.pdf +3 -0
  24. sources/Endodontics%20cap%2017.pdf +3 -0
  25. sources/Endodontics%20cap%2018.pdf +3 -0
  26. sources/Endodontics%20cap%2019.pdf +3 -0
  27. sources/Endodontics%20cap%202.pdf +3 -0
  28. sources/Endodontics%20cap%2020.pdf +3 -0
  29. sources/Endodontics%20cap%2021.pdf +3 -0
  30. sources/Endodontics%20cap%2022.pdf +3 -0
  31. sources/Endodontics%20cap%203.pdf +3 -0
  32. sources/Endodontics%20cap%204.pdf +3 -0
  33. sources/Endodontics%20cap%205.pdf +3 -0
  34. sources/Endodontics%20cap%206.pdf +3 -0
  35. sources/Endodontics%20cap%207.pdf +3 -0
  36. sources/Endodontics%20cap%208.pdf +3 -0
  37. sources/Endodontics%20cap%209.pdf +3 -0
  38. sources/_%24preguntas%20chatbot_01.xlsx +0 -0
  39. sources/database.csv +1 -0
  40. sources/general_qa.csv +1 -0
  41. sources/greetings.csv +1 -0
  42. sources/personal_qa.csv +1 -0
  43. sources/preguntas chatbot_01.xlsx +3 -0
  44. system_prompts.py +67 -0
.gitattributes CHANGED
@@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ faiss_storage/faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
37
+ sources/Endodontics%20appendix%204.pdf filter=lfs diff=lfs merge=lfs -text
38
+ sources/Endodontics%20cap%201.pdf filter=lfs diff=lfs merge=lfs -text
39
+ sources/Endodontics%20cap%2010.pdf filter=lfs diff=lfs merge=lfs -text
40
+ sources/Endodontics%20cap%2011.pdf filter=lfs diff=lfs merge=lfs -text
41
+ sources/Endodontics%20cap%2012.pdf filter=lfs diff=lfs merge=lfs -text
42
+ sources/Endodontics%20cap%2013.pdf filter=lfs diff=lfs merge=lfs -text
43
+ sources/Endodontics%20cap%2014.pdf filter=lfs diff=lfs merge=lfs -text
44
+ sources/Endodontics%20cap%2015.pdf filter=lfs diff=lfs merge=lfs -text
45
+ sources/Endodontics%20cap%2016.pdf filter=lfs diff=lfs merge=lfs -text
46
+ sources/Endodontics%20cap%2017.pdf filter=lfs diff=lfs merge=lfs -text
47
+ sources/Endodontics%20cap%2018.pdf filter=lfs diff=lfs merge=lfs -text
48
+ sources/Endodontics%20cap%2019.pdf filter=lfs diff=lfs merge=lfs -text
49
+ sources/Endodontics%20cap%202.pdf filter=lfs diff=lfs merge=lfs -text
50
+ sources/Endodontics%20cap%2020.pdf filter=lfs diff=lfs merge=lfs -text
51
+ sources/Endodontics%20cap%2021.pdf filter=lfs diff=lfs merge=lfs -text
52
+ sources/Endodontics%20cap%2022.pdf filter=lfs diff=lfs merge=lfs -text
53
+ sources/Endodontics%20cap%203.pdf filter=lfs diff=lfs merge=lfs -text
54
+ sources/Endodontics%20cap%204.pdf filter=lfs diff=lfs merge=lfs -text
55
+ sources/Endodontics%20cap%205.pdf filter=lfs diff=lfs merge=lfs -text
56
+ sources/Endodontics%20cap%206.pdf filter=lfs diff=lfs merge=lfs -text
57
+ sources/Endodontics%20cap%207.pdf filter=lfs diff=lfs merge=lfs -text
58
+ sources/Endodontics%20cap%208.pdf filter=lfs diff=lfs merge=lfs -text
59
+ sources/Endodontics%20cap%209.pdf filter=lfs diff=lfs merge=lfs -text
60
+ sources/preguntas[[:space:]]chatbot_01.xlsx filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.10-slim
3
+
4
+ # Set the working directory in the container
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ libgl1-mesa-glx \
10
+ libglib2.0-0 \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Copy the requirements file
14
+ COPY requirements.txt requirements.txt
15
+
16
+ # Install Python packages
17
+ RUN pip install --no-cache-dir -r requirements.txt
18
+
19
+ # Copy application code
20
+ COPY . /app
21
+
22
+ # Create a non-root user
23
+ RUN useradd -m -u 1000 user
24
+
25
+ # Change ownership
26
+ RUN chown -R user:user /app
27
+
28
+ # Switch to the non-root user
29
+ USER user
30
+
31
+ # Expose the port Gunicorn will run on (Using 7860 as in CMD)
32
+ EXPOSE 7860
33
+
34
+ # Command to run the app
35
+ CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,64 +1,1037 @@
1
- import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
-
62
-
63
- if __name__ == "__main__":
64
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, send_file, abort, jsonify, url_for, render_template, Response
2
+ from flask_cors import CORS
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer, util
5
+ import torch
6
+ from dataclasses import dataclass
7
+ from typing import List, Dict, Tuple, Optional, Any, Iterator
8
+ from collections import deque
9
+ import os
10
+ import logging
11
+ import atexit
12
+ from threading import Thread, Lock
13
+ import time
14
+ from datetime import datetime
15
+ from uuid import uuid4 as generate_uuid
16
+ import csv as csv_lib
17
+ import functools
18
+ import json
19
+ import re
20
+ import subprocess
21
+ import sys
22
+ import sqlite3
23
+ import io
24
+
25
+ from dotenv import load_dotenv
26
+
27
+ # Load environment variables from .env file AT THE VERY TOP
28
+ load_dotenv()
29
+
30
+ # Import RAG system and Fallback LLM from llm_handling AFTER load_dotenv
31
+ # MODIFIED: Imported new functions and prompts
32
+ from llm_handling import (
33
+ initialize_and_get_rag_system,
34
+ KnowledgeRAG,
35
+ groq_bot_instance,
36
+ RAG_SOURCES_DIR,
37
+ RAG_STORAGE_PARENT_DIR,
38
+ RAG_CHUNKED_SOURCES_FILENAME,
39
+ get_answer_from_context
40
+ )
41
+ from system_prompts import QA_FORMATTER_PROMPT
42
+
43
+
44
+ # Setup logging (remains global for the app)
45
+ logging.basicConfig(
46
+ level=logging.INFO,
47
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
48
+ handlers=[
49
+ logging.FileHandler("app_hybrid_rag.log"),
50
+ logging.StreamHandler()
51
+ ]
52
+ )
53
+ logger = logging.getLogger(__name__) # Main app logger
54
+
55
+ # --- Application Constants and Configuration ---
56
# Credentials checked by the admin-only endpoints. The 'admin'/'admin' fallback
# is only suitable for local development; set both variables when deploying.
ADMIN_USERNAME = os.getenv('FLASK_ADMIN_USERNAME', 'admin')
ADMIN_PASSWORD = os.getenv('FLASK_ADMIN_PASSWORD', 'admin')
FLASK_APP_HOST = os.getenv("FLASK_HOST", "0.0.0.0")
FLASK_APP_PORT = int(os.getenv("FLASK_PORT", "7860"))
# Debug mode is opt-in (FLASK_DEBUG=true). The interactive debugger must never be
# reachable on a publicly bound host, so it is off unless explicitly requested.
FLASK_DEBUG_MODE = os.getenv("FLASK_DEBUG", "False").lower() == "true"
_APP_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEXT_EXTRACTIONS_DIR = os.path.join(_APP_BASE_DIR, 'text_extractions')
RELATED_QUESTIONS_TO_SHOW = 10
QUESTIONS_TO_SEND_TO_GROQ_QA = 3
# Minimum similarity (0-100) for a personal/general CSV match to be handed to the LLM formatter.
LLM_FORMATTER_CONFIDENCE_THRESHOLD = int(os.getenv("LLM_FORMATTER_CONFIDENCE_THRESHOLD", "95"))
HIGH_CONFIDENCE_THRESHOLD = 90  # For greetings, which are answered directly without LLM formatting.
# Number of (user, assistant) message *pairs* of history to send along with a query.
CHAT_HISTORY_TO_SEND = int(os.getenv("CHAT_HISTORY_TO_SEND", "5"))
CHAT_LOG_FILE = os.path.join(_APP_BASE_DIR, 'chat_history.csv')
71
+
72
+ rag_system: Optional[KnowledgeRAG] = None
73
+
74
+ # --- Persistent Chat History Management using SQLite ---
75
class ChatHistoryManager:
    """Persist per-session chat history as JSON in a SQLite table.

    Each row of ``chat_histories`` holds one session: ``session_id`` is the
    primary key and ``history`` is a JSON-encoded list of
    ``{'role': ..., 'content': ...}`` messages. Writes are serialised with a
    process-local lock. Every operation opens its own connection and closes it
    when done (``sqlite3``'s ``with conn:`` only commits/rolls back; it does
    not close the connection).
    """

    def __init__(self, db_path):
        """Remember *db_path* and make sure the history table exists."""
        self.db_path = db_path
        self.lock = Lock()
        self._create_table()
        logger.info(f"SQLite chat history manager initialized at: {self.db_path}")

    def _get_connection(self):
        """Open a new connection to the history database (10 s busy timeout)."""
        return sqlite3.connect(self.db_path, timeout=10)

    def _create_table(self):
        """Create the ``chat_histories`` table if it is missing."""
        with self.lock:
            conn = self._get_connection()
            try:
                with conn:  # commit on success, roll back on error
                    conn.execute("""
                        CREATE TABLE IF NOT EXISTS chat_histories (
                            session_id TEXT PRIMARY KEY,
                            history TEXT NOT NULL
                        )
                    """)
            finally:
                conn.close()

    def get_history(self, session_id: str, limit_turns: int = 5) -> list:
        """Return the last *limit_turns* (user, assistant) pairs for a session.

        Returns an empty list for unknown sessions, for a non-positive
        *limit_turns*, and when the lookup fails (the error is logged).
        """
        # Guard explicitly: ``lst[-0:]`` would return the whole list, not nothing.
        if limit_turns <= 0:
            return []
        try:
            conn = self._get_connection()
            try:
                row = conn.execute(
                    "SELECT history FROM chat_histories WHERE session_id = ?",
                    (session_id,),
                ).fetchone()
            finally:
                conn.close()
            if not row:
                return []
            # Two stored messages (user + assistant) make up one turn.
            return json.loads(row[0])[-(limit_turns * 2):]
        except Exception as e:
            logger.error(f"Error fetching history for session {session_id}: {e}", exc_info=True)
            return []

    def update_history(self, session_id: str, query: str, answer: str):
        """Append one user/assistant exchange to the session's stored history."""
        with self.lock:
            try:
                conn = self._get_connection()
                try:
                    with conn:
                        row = conn.execute(
                            "SELECT history FROM chat_histories WHERE session_id = ?",
                            (session_id,),
                        ).fetchone()
                        history = json.loads(row[0]) if row else []
                        history.append({'role': 'user', 'content': query})
                        history.append({'role': 'assistant', 'content': answer})
                        conn.execute("""
                            INSERT OR REPLACE INTO chat_histories (session_id, history)
                            VALUES (?, ?)
                        """, (session_id, json.dumps(history)))
                finally:
                    conn.close()
            except Exception as e:
                logger.error(f"Error updating history for session {session_id}: {e}", exc_info=True)

    def clear_history(self, session_id: str):
        """Reset the session's stored history to an empty list."""
        with self.lock:
            try:
                conn = self._get_connection()
                try:
                    with conn:
                        conn.execute("""
                            INSERT OR REPLACE INTO chat_histories (session_id, history)
                            VALUES (?, ?)
                        """, (session_id, json.dumps([])))
                finally:
                    conn.close()
                logger.info(f"Chat history cleared for session: {session_id}")
            except Exception as e:
                logger.error(f"Error clearing history for session {session_id}: {e}", exc_info=True)
149
+
150
+
151
+ # --- EmbeddingManager for CSV QA (remains in app.py) ---
152
@dataclass
class QAEmbeddings:
    """Questions of one QA dataset together with their sentence embeddings."""

    # Cleaned question strings, in the order they were encoded.
    questions: List[str]
    # For each entry of ``questions``, the positional row index in ``df_qa``.
    question_map: List[int]
    # Encoded questions; ``None`` when the dataset produced no usable questions.
    embeddings: Optional[torch.Tensor]
    # The DataFrame the questions were extracted from.
    df_qa: pd.DataFrame
    # Question strings returned to callers as the matched question text.
    original_questions: List[str]
159
+
160
class EmbeddingManager:
    """Embed the questions of the CSV QA datasets and rank them against a query.

    Three datasets are tracked under the keys ``'general'``, ``'personal'`` and
    ``'greetings'``; each is stored as a ``QAEmbeddings`` (or ``None`` until
    ``update_embeddings`` has been called).
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-transformer *model_name* and start with no datasets."""
        self.model = SentenceTransformer(model_name)
        self.embeddings = {
            'general': None,
            'personal': None,
            'greetings': None,
        }
        logger.info(f"EmbeddingManager initialized with model: {model_name}")

    def _process_questions(self, df: pd.DataFrame) -> Tuple[List[str], List[int], List[str]]:
        """Extract usable questions from the ``Question`` column of *df*.

        Returns ``(questions, question_map, original_questions)`` where
        ``question_map[i]`` is the positional row index of ``questions[i]``.
        Missing values, blank strings and the literal text "nan" are skipped.
        All three lists are empty when *df* has no ``Question`` column.
        """
        questions: List[str] = []
        question_map: List[int] = []

        if 'Question' not in df.columns:
            logger.warning("DataFrame for EmbeddingManager is missing 'Question' column. Cannot process questions from it.")
            return questions, question_map, []

        for idx, question_text_raw in enumerate(df['Question']):
            if pd.isna(question_text_raw):
                continue
            question_text_cleaned = str(question_text_raw).strip()
            if not question_text_cleaned or question_text_cleaned.lower() == "nan":
                continue
            questions.append(question_text_cleaned)
            question_map.append(idx)

        # The original-question list carries the same text; return it as an
        # independent list so callers can mutate one without affecting the other.
        return questions, question_map, list(questions)

    def _embed_dataset(self, df: pd.DataFrame):
        """Build the ``QAEmbeddings`` for one dataset (embeddings are ``None`` if it has no questions)."""
        questions, question_map, original_questions = self._process_questions(df)
        vectors = (
            self.model.encode(questions, convert_to_tensor=True, show_progress_bar=False)
            if questions else None
        )
        return QAEmbeddings(
            questions=questions, question_map=question_map, embeddings=vectors,
            df_qa=df, original_questions=original_questions,
        )

    def update_embeddings(self, general_qa: pd.DataFrame, personal_qa: pd.DataFrame, greetings_qa: pd.DataFrame):
        """Re-encode all three datasets and replace the stored embeddings."""
        # Encode everything first so a failure leaves the previous state untouched.
        general = self._embed_dataset(general_qa)
        personal = self._embed_dataset(personal_qa)
        greetings = self._embed_dataset(greetings_qa)

        self.embeddings['general'] = general
        self.embeddings['personal'] = personal
        self.embeddings['greetings'] = greetings
        logger.info("CSV QA embeddings updated in EmbeddingManager.")

    def find_best_answers(self, user_query: str, qa_type: str, top_n: int = 5) -> Tuple[List[float], List[str], List[str], List[str], List[int]]:
        """Return the *top_n* entries of dataset *qa_type* most similar to *user_query*.

        The result is five parallel lists: confidences (cosine similarity
        scaled by 100), matched questions, answers (from the ``Respuesta``
        column when present, otherwise ``Answer``), image values (``None``
        where absent) and the positional row indices in the source DataFrame.
        All lists are empty when the dataset has no embeddings.
        """
        qa_data = self.embeddings[qa_type]
        if qa_data is None or qa_data.embeddings is None or len(qa_data.embeddings) == 0:
            return [], [], [], [], []

        query_embedding_tensor = self.model.encode([user_query], convert_to_tensor=True, show_progress_bar=False)
        if not isinstance(qa_data.embeddings, torch.Tensor):
            qa_data.embeddings = torch.tensor(qa_data.embeddings)  # Safeguard

        cos_scores = util.cos_sim(query_embedding_tensor, qa_data.embeddings)[0]

        top_k = min(top_n, len(cos_scores))
        if top_k == 0:
            return [], [], [], [], []

        top_scores_tensor, indices_tensor = torch.topk(cos_scores, k=top_k)

        top_confidences = []
        top_indices_mapped = []
        top_questions = []

        # Collect score, question and row index together so the lists stay
        # aligned even when an out-of-range hit in the middle is skipped.
        for score_tensor, idx_tensor in zip(top_scores_tensor, indices_tensor):
            item_idx = idx_tensor.item()
            if item_idx >= len(qa_data.question_map) or item_idx >= len(qa_data.original_questions):
                logger.warning(f"Index out of bounds: item_idx {item_idx} for question_map/original_questions")
                continue
            original_df_idx = qa_data.question_map[item_idx]
            if original_df_idx >= len(qa_data.df_qa):
                logger.warning(f"Index out of bounds: original_df_idx {original_df_idx} for df_qa length {len(qa_data.df_qa)}")
                continue
            top_confidences.append(score_tensor.item() * 100)
            top_indices_mapped.append(original_df_idx)
            top_questions.append(qa_data.original_questions[item_idx])

        df_qa = qa_data.df_qa
        # Spreadsheet-sourced data uses 'Respuesta'; CSV-sourced data uses 'Answer'.
        answer_col = 'Respuesta' if 'Respuesta' in df_qa.columns else 'Answer'
        top_answers = [str(df_qa[answer_col].iloc[i]) for i in top_indices_mapped]
        has_image_col = 'Image' in df_qa.columns
        top_images = [
            str(df_qa['Image'].iloc[i]) if has_image_col and pd.notna(df_qa['Image'].iloc[i]) else None
            for i in top_indices_mapped
        ]

        return top_confidences, top_questions, top_answers, top_images, top_indices_mapped
259
+
260
+ # --- DatabaseMonitor for personal_qa.csv placeholders (remains in app.py) ---
261
class DatabaseMonitor:
    """Keep an in-memory copy of a CSV file and reload it when the file changes.

    A daemon thread polls the file's modification time and size roughly once a
    second. ``get_data`` looks up a row by its ``id`` column. Access to the
    loaded DataFrame is guarded by ``self.lock``.
    """

    def __init__(self, database_path):
        """Load *database_path* once and start the background polling thread."""
        self.logger = logging.getLogger(__name__ + ".DatabaseMonitor")
        self.database_path = database_path
        self.last_modified = None
        self.last_size = None
        self.df = None
        self.lock = Lock()
        self.running = True
        self._load_database()
        self.monitor_thread = Thread(target=self._monitor_database, daemon=True)
        self.monitor_thread.start()
        self.logger.info(f"DatabaseMonitor initialized for: {database_path}")

    def _set_df(self, df):
        """Swap the loaded DataFrame under the lock used by readers."""
        with self.lock:
            self.df = df

    def _load_database(self):
        """(Re)read the CSV file; on any failure the loaded data becomes ``None``."""
        try:
            if not os.path.exists(self.database_path):
                self.logger.warning(f"Personal data file not found: {self.database_path}.")
                self._set_df(None)
                return
            # Record the file's stat *before* parsing: a file that fails to parse
            # is then retried only after it changes again, instead of being
            # re-read (and the traceback re-logged) on every poll.
            self.last_modified = os.path.getmtime(self.database_path)
            self.last_size = os.path.getsize(self.database_path)
            # Parse outside the lock so lookups are not blocked during file I/O.
            loaded = pd.read_csv(self.database_path, encoding='cp1252')
            self._set_df(loaded)
            self.logger.info(f"Personal data file reloaded: {self.database_path}")
        except Exception as e:
            self.logger.error(f"Error loading personal data file '{self.database_path}': {e}", exc_info=True)
            self._set_df(None)

    def _monitor_database(self):
        """Polling loop run by the background thread until ``stop`` is called."""
        while self.running:
            try:
                if not os.path.exists(self.database_path):
                    if self.df is not None:
                        self.logger.warning(f"Personal data file disappeared: {self.database_path}")
                        self._set_df(None)
                        # Forget the stat so a re-created file is always reloaded.
                        self.last_modified = None
                        self.last_size = None
                    time.sleep(5)
                    continue
                current_modified = os.path.getmtime(self.database_path)
                current_size = os.path.getsize(self.database_path)
                if (self.last_modified is None or current_modified != self.last_modified or
                        self.last_size is None or current_size != self.last_size):
                    self.logger.info("Personal data file change detected.")
                    self._load_database()
                time.sleep(1)
            except Exception as e:
                self.logger.error(f"Error monitoring personal data file: {e}", exc_info=True)
                time.sleep(5)

    def get_data(self, user_id):
        """Return the first row whose ``id`` equals *user_id* as a dict, else ``None``.

        When the ``id`` column is numeric, *user_id* is converted to the
        column's element type first; an unconvertible value yields ``None``.
        """
        with self.lock:
            if self.df is None or not user_id:
                return None
            try:
                if 'id' not in self.df.columns:
                    self.logger.warning("'id' column not found in personal_data.csv")
                    return None
                id_col_type = self.df['id'].dtype
                target_user_id = user_id
                if pd.api.types.is_numeric_dtype(id_col_type):
                    try:
                        valid_ids = self.df['id'].dropna()
                        if not valid_ids.empty:
                            # Convert with the column's own scalar type so the comparison matches.
                            target_user_id = type(valid_ids.iloc[0])(user_id)
                        else:
                            target_user_id = int(user_id)
                    except (ValueError, TypeError):
                        self.logger.warning(f"Could not convert user_id '{user_id}' to numeric type {id_col_type}")
                        return None
                user_data = self.df[self.df['id'] == target_user_id]
                if not user_data.empty:
                    return user_data.iloc[0].to_dict()
            except Exception as e:
                self.logger.error(f"Error retrieving data for user_id {user_id}: {e}", exc_info=True)
            return None

    def stop(self):
        """Ask the polling thread to exit and wait up to 5 seconds for it."""
        self.running = False
        if hasattr(self, 'monitor_thread') and self.monitor_thread.is_alive():
            self.monitor_thread.join(timeout=5)
        self.logger.info("DatabaseMonitor stopped.")
340
+
341
+ # --- Flask App Initialization ---
342
+ app = Flask(__name__)
343
+ CORS(app, resources={r"/*": {"origins": "*"}}, supports_credentials=True)
344
+
345
+ # --- Initialize Managers ---
346
+ embedding_manager = EmbeddingManager()
347
+ history_manager = ChatHistoryManager('chat_history.db')
348
+ database_csv_path = os.path.join(RAG_SOURCES_DIR, 'database.csv')
349
+ personal_data_monitor = DatabaseMonitor(database_csv_path)
350
+
351
+ # --- Helper Functions (App specific) ---
352
# Matches HTML comments, opening/closing tags (which must start with a letter)
# and declarations such as <!DOCTYPE ...>. Requiring a tag-like shape keeps
# ordinary text such as "x < 3 and y > 2" intact.
_HTML_TAG_RE = re.compile(r'<!--.*?-->|</?[A-Za-z][^>]*>|<![^>]*>', re.DOTALL)


def clean_html_from_text(text: str) -> str:
    """Remove HTML tags and comments from *text* and strip surrounding whitespace.

    Non-string input is returned unchanged. Only tag-shaped spans are removed;
    bare ``<`` / ``>`` characters used as comparison signs are preserved.
    """
    if not isinstance(text, str):
        return text
    return _HTML_TAG_RE.sub('', text).strip()
358
+
359
# Typographic punctuation -> plain ASCII. The \x85-\x97 entries are the C1
# control characters that appear when cp1252 "smart" punctuation is decoded as
# Latin-1; the \u20xx entries are the same marks decoded correctly.
_PUNCTUATION_TABLE = str.maketrans({
    '\x91': "'", '\x92': "'", '\x93': '"', '\x94': '"',
    '\x96': '-', '\x97': '-', '\x85': '...', '\x95': '-',
    '\u201c': '"', '\u201d': '"', '\u2018': "'", '\u2019': "'",
    '\u2013': '-', '\u2014': '-', '\u2026': '...', '\u2022': '-',
})


def normalize_text(text):
    """Replace curly quotes, dashes, ellipses and bullets in *text* with ASCII.

    Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        # One pass over the string instead of a chain of str.replace calls.
        return text.translate(_PUNCTUATION_TABLE)
    return text
369
+
370
def require_admin_auth(f):
    """Decorator that protects a view with HTTP Basic auth against the admin credentials.

    Requests without credentials, or with credentials that do not match
    ``ADMIN_USERNAME`` / ``ADMIN_PASSWORD``, receive a 401 response carrying a
    ``WWW-Authenticate`` challenge; otherwise the wrapped view is called.
    """
    import hmac  # local import: only this decorator needs it

    def _matches(supplied, expected):
        # Constant-time comparison so response timing does not leak how much of
        # the credential was correct. Compared as bytes because compare_digest
        # rejects non-ASCII str; a missing value is treated as empty.
        return hmac.compare_digest((supplied or '').encode('utf-8'), expected.encode('utf-8'))

    @functools.wraps(f)
    def decorated(*args, **kwargs):
        auth = request.authorization
        authorized = False
        if auth:
            # Evaluate both checks before combining them (no short-circuit).
            username_ok = _matches(auth.username, ADMIN_USERNAME)
            password_ok = _matches(auth.password, ADMIN_PASSWORD)
            authorized = username_ok and password_ok
        if not authorized:
            return Response('Admin auth failed.', 401, {'WWW-Authenticate': 'Basic realm="Admin Login Required"'})
        return f(*args, **kwargs)
    return decorated
378
+
379
def initialize_chat_log():
    """Create the chat log CSV with its header row if it does not exist yet."""
    try:
        # Mode 'x' creates the file atomically and fails if it already exists,
        # so a concurrent caller can never truncate a log that is already in use.
        with open(CHAT_LOG_FILE, 'x', newline='', encoding='utf-8') as f:
            csv_lib.writer(f).writerow(['sl', 'date_time', 'session_id', 'user_id', 'query', 'answer'])
    except FileExistsError:
        pass  # already initialised
384
+
385
def _next_chat_log_serial() -> int:
    """Return the next 'sl' value for the chat log: highest numeric value + 1, else 1."""
    try:
        if not (os.path.exists(CHAT_LOG_FILE) and os.path.getsize(CHAT_LOG_FILE) > 0):
            return 1
        log_df = pd.read_csv(CHAT_LOG_FILE, on_bad_lines='skip')
        if log_df.empty or 'sl' not in log_df.columns:
            return 1
        serials = log_df['sl'].dropna()
        if not pd.api.types.is_numeric_dtype(serials) or serials.empty:
            return 1
        return int(serials.max()) + 1
    except Exception as e:
        logger.error(f"Error reading SL from {CHAT_LOG_FILE}: {e}", exc_info=True)
        return 1


def store_chat_history(sid: str, uid: Optional[str], query: str, resp: Dict[str, Any]):
    """Record one exchange in the session history store and append it to the CSV log.

    The answer is taken from ``resp['answer']`` (empty string if absent). Any
    failure is logged and swallowed so that logging never breaks a response.
    """
    try:
        reply_text = str(resp.get('answer', ''))
        history_manager.update_history(sid, query, reply_text)

        initialize_chat_log()
        serial = _next_chat_log_serial()

        with open(CHAT_LOG_FILE, 'a', newline='', encoding='utf-8') as log_file:
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            log_row = [serial, timestamp, sid, uid or "N/A", query, reply_text]
            csv_lib.writer(log_file).writerow(log_row)

    except Exception as e:
        logger.error(f"Error in store_chat_history for session {sid}: {e}", exc_info=True)
407
+
408
def get_formatted_chat_history(session_id: str) -> List[Dict[str, str]]:
    """Return the most recent stored messages for *session_id*.

    At most ``CHAT_HISTORY_TO_SEND`` turns are requested from the history
    manager; a falsy *session_id* yields an empty list without a lookup.
    """
    if session_id:
        return history_manager.get_history(session_id, limit_turns=CHAT_HISTORY_TO_SEND)
    return []
412
+
413
def get_qa_context_for_groq(all_questions: List[Dict]) -> str:
    """Build a "Q: ... / A: ..." context string from the best non-greeting matches.

    The ``QUESTIONS_TO_SEND_TO_GROQ_QA`` highest-confidence candidates are
    considered; of those, entries whose answer is missing, not a string, blank,
    or contains "not available" are left out.
    """
    candidates = [item for item in all_questions if item.get('source_type') != 'greetings']
    candidates.sort(key=lambda item: item.get('confidence', 0), reverse=True)

    context_lines = []
    for entry in candidates[:QUESTIONS_TO_SEND_TO_GROQ_QA]:
        text = entry.get('answer')
        if pd.isna(text) or not isinstance(text, str):
            continue
        if not text.strip() or "not available" in text.lower():
            continue
        context_lines.append(f"Q: {entry.get('question')}\nA: {text}")
    return '\n'.join(context_lines)
424
+
425
def replace_placeholders_in_answer(answer, db_data):
    """Fill ``{placeholder}`` tokens in *answer* with values from *db_data*.

    Args:
        answer: Answer template; may be missing (NaN/None) or blank.
        db_data: Mapping of placeholder name to value for the current user,
            or ``None`` when no user data is available.

    Returns:
        The answer with every placeholder substituted. Placeholders whose
        value is missing or blank become "not available". A fixed apology
        string is returned instead when the answer is empty, when
        placeholders exist but *db_data* is ``None``, or when every distinct
        placeholder is missing.
    """
    if pd.isna(answer) or str(answer).strip() == '':
        return "Sorry, this information is not available yet"
    answer_str = str(answer)

    # Distinct placeholder names in order of first appearance, so a repeated
    # token counts once and substitution order is deterministic.
    placeholders = list(dict.fromkeys(re.findall(r'\{(\w+)\}', answer_str)))
    if not placeholders:
        return answer_str
    if db_data is None:
        return "To get this specific information, please ensure you are logged in or have provided your user ID."

    missing_count = 0
    for key in placeholders:
        value = db_data.get(key)
        if value is None or (isinstance(value, float) and pd.isna(value)) or str(value).strip() == '':
            answer_str = answer_str.replace(f'{{{key}}}', "not available")
            missing_count += 1
        else:
            answer_str = answer_str.replace(f'{{{key}}}', str(value))

    # Compare against the number of *distinct* placeholders: "{a} and {a}"
    # with `a` missing must be treated as fully unavailable.
    if missing_count == len(placeholders):
        return "Sorry, some specific details for you are not available at the moment."

    # Tokens can only remain here if a substituted value itself contained one.
    if re.search(r'\{(\w+)\}', answer_str):
        logger.warning(f"Unresolved placeholders remain after replacement attempt: {answer_str}")
        answer_str = re.sub(r'\{(\w+)\}', "a specific detail", answer_str)
        if "a specific detail" in answer_str and "Sorry" not in answer_str:
            return "Sorry, I couldn't retrieve all the specific details for this answer. " + answer_str
        return "Sorry, I couldn't retrieve all the specific details for this answer. Some information has been generalized."
    return answer_str
455
+
456
+ # --- Non-Streaming Logic (Preserved from original) ---
457
+ def get_hybrid_response_logic_non_streaming(user_query: str, session_id: str, user_id: Optional[str], chat_history: Optional[List[Dict]] = None) -> Dict[str, Any]:
458
+ global rag_system
459
+
460
+ if not user_query: return {'error': 'No query provided'}
461
+ if not session_id: return {'error': 'session_id is required'}
462
+
463
+ personal_db_data = personal_data_monitor.get_data(user_id) if user_id else None
464
+
465
+ # MODIFIED: Capture indices from the search
466
+ conf_greet, q_greet, a_greet, img_greet, idx_greet = embedding_manager.find_best_answers(user_query, 'greetings', top_n=1)
467
+ conf_pers, q_pers, a_pers, img_pers, idx_pers = embedding_manager.find_best_answers(user_query, 'personal', top_n=RELATED_QUESTIONS_TO_SHOW)
468
+ conf_gen, q_gen, a_gen, img_gen, idx_gen = embedding_manager.find_best_answers(user_query, 'general', top_n=RELATED_QUESTIONS_TO_SHOW)
469
+
470
+ all_csv_candidate_answers = []
471
+ if conf_greet and conf_greet[0] >= HIGH_CONFIDENCE_THRESHOLD:
472
+ all_csv_candidate_answers.append({'question': q_greet[0], 'answer': a_greet[0], 'image': img_greet[0] if img_greet else None, 'confidence': conf_greet[0], 'source_type': 'greetings', 'original_index': idx_greet[0]})
473
+ if conf_pers:
474
+ # MODIFIED: Add original_index to candidates
475
+ for c, q, a, img, idx in zip(conf_pers, q_pers, a_pers, img_pers, idx_pers):
476
+ processed_a = replace_placeholders_in_answer(a, personal_db_data)
477
+ if not ("Sorry, this information is not available yet" in processed_a or "To get this specific information" in processed_a):
478
+ all_csv_candidate_answers.append({'question': q, 'answer': processed_a, 'image': img, 'confidence': c, 'source_type': 'personal', 'original_index': idx})
479
+ if conf_gen:
480
+ # MODIFIED: Add original_index to candidates
481
+ for c, q, a, img, idx in zip(conf_gen, q_gen, a_gen, img_gen, idx_gen):
482
+ if not (pd.isna(a) or str(a).strip() == '' or str(a).lower() == 'nan'):
483
+ all_csv_candidate_answers.append({'question': q, 'answer': str(a), 'image': img, 'confidence': c, 'source_type': 'general', 'original_index': idx})
484
+
485
+ all_csv_candidate_answers.sort(key=lambda x: x['confidence'], reverse=True)
486
+
487
+ related_questions_list = []
488
+
489
+ if all_csv_candidate_answers:
490
+ best_csv_match = all_csv_candidate_answers[0]
491
+ is_direct_csv_answer = False
492
+ source_name = ""
493
+
494
+ # MODIFIED: Use new configurable threshold for LLM formatting
495
+ best_source_type = best_csv_match['source_type']
496
+ best_confidence = best_csv_match['confidence']
497
+
498
+ if best_source_type == 'greetings' and best_confidence >= HIGH_CONFIDENCE_THRESHOLD:
499
+ is_direct_csv_answer = True
500
+ source_name = 'greetings_qa'
501
+ elif best_source_type in ['personal', 'general'] and best_confidence >= LLM_FORMATTER_CONFIDENCE_THRESHOLD:
502
+ is_direct_csv_answer = True
503
+ source_name = f"{best_source_type}_qa"
504
+
505
+ if is_direct_csv_answer:
506
+ # MODIFICATION START: Reroute high-confidence matches to the LLM for formatting
507
+ best_match_source = best_csv_match['source_type']
508
+
509
+ # For greetings, we still provide a direct answer without LLM formatting
510
+ if best_match_source == 'greetings':
511
+ response_data = {'query': user_query, 'answer': best_csv_match['answer'], 'confidence': best_csv_match['confidence'], 'original_question': best_csv_match['question'], 'source': source_name}
512
+ if best_csv_match.get('image'):
513
+ response_data['image_url'] = url_for('static', filename=best_csv_match['image'], _external=True)
514
+ else:
515
+ # For 'personal' and 'general', use the LLM to format the answer from the full row
516
+ best_match_index = best_csv_match['original_index']
517
+
518
+ # Retrieve the full row from the original dataframe stored in the embedding manager
519
+ original_df = embedding_manager.embeddings[best_match_source].df_qa
520
+ matched_row_data = original_df.iloc[best_match_index]
521
+
522
+ # Format the row data as a string context for the LLM
523
+ # We drop the 'Question' column as it's a duplicate of 'Pregunta' and not needed in the context
524
+ context_dict = matched_row_data.drop('Question', errors='ignore').to_dict()
525
+ context_str = "\n".join([f"'{key}': '{value}'" for key, value in context_dict.items() if pd.notna(value) and str(value).strip() != ''])
526
+
527
+ # Call the LLM to generate a conversational answer based on the row data
528
+ final_answer = get_answer_from_context(
529
+ question=user_query,
530
+ context=context_str,
531
+ system_prompt=QA_FORMATTER_PROMPT
532
+ )
533
+
534
+ response_data = {
535
+ 'query': user_query,
536
+ 'answer': final_answer,
537
+ 'confidence': best_csv_match['confidence'],
538
+ 'original_question': best_csv_match['question'],
539
+ 'source': f'{source_name}_llm_formatted'
540
+ }
541
+ if best_csv_match.get('image'):
542
+ response_data['image_url'] = url_for('static', filename=best_csv_match['image'], _external=True)
543
+
544
+ # MODIFICATION END
545
+
546
+ for i, cand_q in enumerate(all_csv_candidate_answers):
547
+ if i == 0: continue
548
+ if cand_q['source_type'] != 'greetings':
549
+ related_questions_list.append({'question': cand_q['question'], 'answer': cand_q['answer'], 'match': cand_q['confidence']})
550
+ if len(related_questions_list) >= RELATED_QUESTIONS_TO_SHOW: break
551
+ response_data['related_questions'] = related_questions_list
552
+ store_chat_history(session_id, user_id, user_query, response_data)
553
+ return response_data
554
+
555
+ if rag_system and rag_system.retriever:
556
+ logger.info(f"Attempting FAISS RAG query for: {user_query[:50]}...")
557
+ rag_result = rag_system.invoke(user_query) # Use invoke for non-streaming
558
+ rag_answer = rag_result.get("answer")
559
+
560
+ if rag_answer and "the provided bibliography does not contain specific information" not in rag_answer.lower():
561
+ logger.info(f"FAISS RAG system provided a valid answer: {rag_answer[:100]}...")
562
+ response_data = {
563
+ 'query': user_query, 'answer': rag_answer, 'confidence': 85,
564
+ 'source': 'document_rag_faiss', 'related_questions': [],
565
+ 'document_sources_details': rag_result.get("cited_source_details")
566
+ }
567
+ store_chat_history(session_id, user_id, user_query, response_data)
568
+ return response_data
569
+
570
+ logger.info(f"No high-confidence answer. Using Groq fallback.")
571
+ chat_history_messages_for_groq = chat_history if chat_history is not None else get_formatted_chat_history(session_id)
572
+ groq_context = {'current_query': user_query, 'chat_history': chat_history_messages_for_groq, 'qa_related_info': ""}
573
+ groq_stream = groq_bot_instance.stream_response(groq_context)
574
+ groq_answer = "".join([chunk for chunk in groq_stream])
575
+
576
+ response_data = {'query': user_query, 'answer': groq_answer, 'confidence': 75, 'source': 'groq_general_fallback', 'related_questions': []}
577
+ store_chat_history(session_id, user_id, user_query, response_data)
578
+ return response_data
579
+
580
+ # --- Streaming Logic ---
581
def generate_streaming_response(user_query: str, session_id: str, user_id: Optional[str], chat_history: Optional[List[Dict]] = None) -> Iterator[str]:
    """
    Produce the answer to ``user_query`` as a stream of text chunks.

    Sources are tried in this order and the first one that answers wins:
      1. the best semantic match among the greetings / personal / general QA sets,
      2. the FAISS RAG stream (when a retriever is available),
      3. the Groq fallback stream.

    ``chat_history`` is only used by the Groq fallback; when it is None the
    history is looked up from ``session_id``.
    """
    global rag_system

    personal_db_data = None
    if user_id:
        personal_db_data = personal_data_monitor.get_data(user_id)

    greet_conf, _, greet_ans, _, greet_idx = embedding_manager.find_best_answers(user_query, 'greetings', top_n=1)
    pers_conf, _, pers_ans, _, pers_idx = embedding_manager.find_best_answers(user_query, 'personal', top_n=1)
    gen_conf, _, gen_ans, _, gen_idx = embedding_manager.find_best_answers(user_query, 'general', top_n=1)

    matches = []

    # Greetings are answered verbatim, so they use their own threshold.
    if greet_conf and greet_conf[0] >= HIGH_CONFIDENCE_THRESHOLD:
        matches.append({'answer': greet_ans[0], 'confidence': greet_conf[0], 'source': 'greetings', 'index': greet_idx[0]})

    # Personal / general matches go through the LLM formatter and use its threshold.
    if pers_conf and pers_conf[0] >= LLM_FORMATTER_CONFIDENCE_THRESHOLD:
        filled_answer = replace_placeholders_in_answer(pers_ans[0], personal_db_data)
        placeholder_unresolved = (
            "Sorry, this information is not available yet" in filled_answer
            or "To get this specific information" in filled_answer
        )
        if not placeholder_unresolved:
            matches.append({'answer': filled_answer, 'confidence': pers_conf[0], 'source': 'personal', 'index': pers_idx[0]})

    if gen_conf and gen_conf[0] >= LLM_FORMATTER_CONFIDENCE_THRESHOLD:
        raw_answer = gen_ans[0]
        # Skip rows whose answer is missing, blank or the literal text 'nan'.
        if not pd.isna(raw_answer) and str(raw_answer).strip() != '' and str(raw_answer).lower() != 'nan':
            matches.append({'answer': raw_answer, 'confidence': gen_conf[0], 'source': 'general', 'index': gen_idx[0]})

    if matches:
        winner = max(matches, key=lambda match: match['confidence'])
        winner_source = winner['source']
        logger.info(f"High-confidence match from CSV source: {winner_source}")

        if winner_source != 'greetings':
            # Hand the whole matched row to the LLM so it can phrase the answer.
            qa_frame = embedding_manager.embeddings[winner_source].df_qa
            matched_row = qa_frame.iloc[winner['index']]
            row_fields = matched_row.drop('Question', errors='ignore').to_dict()
            context_lines = [
                f"'{key}': '{value}'"
                for key, value in row_fields.items()
                if pd.notna(value) and str(value).strip() != ''
            ]
            yield get_answer_from_context(
                question=user_query,
                context="\n".join(context_lines),
                system_prompt=QA_FORMATTER_PROMPT
            )
        else:
            yield winner['answer']
        return

    if rag_system and rag_system.retriever:
        logger.info(f"Attempting to stream from FAISS RAG for: {user_query[:50]}...")
        rag_stream = rag_system.stream(user_query)
        # Peek at the first chunk to decide whether the RAG answer is usable.
        opening_chunk = next(rag_stream, None)

        if opening_chunk and "the provided bibliography does not contain specific information" not in opening_chunk.lower():
            logger.info("FAISS RAG streaming valid answer...")
            yield opening_chunk
            yield from rag_stream
            return

    logger.info(f"No high-confidence CSV or RAG answer. Streaming from Groq fallback.")
    if chat_history is not None:
        history_for_groq = chat_history
    else:
        history_for_groq = get_formatted_chat_history(session_id)
    groq_context = {'current_query': user_query, 'chat_history': history_for_groq, 'qa_related_info': ""}
    yield from groq_bot_instance.stream_response(groq_context)
649
+
650
def stream_formatter(logic_generator: Iterator[str], session_id: str, user_id: Optional[str], query: str) -> Iterator[str]:
    """
    Turn raw text chunks into Server-Sent Events in the chat-completion chunk shape.

    Empty chunks are dropped. After the last chunk a ``finish_reason: "stop"``
    event and a ``[DONE]`` sentinel are emitted, then the concatenated answer
    is printed and recorded through ``history_manager``.
    """
    chunk_id = f"chatcmpl-{str(generate_uuid())}"
    model_name = "MedicalAssisstantBot/v1"

    def _sse_event(delta: Dict, finish_reason: Optional[str]) -> str:
        # One SSE "data:" frame; every frame of a response shares chunk_id.
        envelope = {
            "id": chunk_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": model_name,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}]
        }
        return f"data: {json.dumps(envelope)}\n\n"

    collected_chunks = []
    for piece in logic_generator:
        if piece:
            collected_chunks.append(piece)
            yield _sse_event({"content": piece}, None)

    yield _sse_event({}, "stop")
    yield "data: [DONE]\n\n"

    # The stream is finished: dump and persist the complete answer.
    full_response = "".join(collected_chunks)

    print(f"\n--- STREAMED FULL RESPONSE ---")
    print(full_response)
    print(f"------------------------------\n")

    history_manager.update_history(session_id, query, full_response)
683
+
684
+ # --- Original Chat Endpoint (Preserved) ---
685
@app.route('/chat-bot', methods=['POST'])
def get_answer_hybrid():
    """
    Answer a single chat query posted as JSON.

    Expects a JSON object with ``query`` and ``session_id`` (both required)
    and an optional ``user_id``. Returns the hybrid response dictionary as
    JSON, or a 400 error when the body is not a JSON object or a required
    field is missing.
    """
    # silent=True yields None instead of raising on a missing/invalid body;
    # a JSON null or array must not reach the .get() calls below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'query and session_id are required'}), 400

    # "query": null is treated like a missing query.
    user_query = clean_html_from_text(data.get('query') or '')
    user_id = data.get('user_id')
    session_id = data.get('session_id')

    if not user_query or not session_id:
        return jsonify({'error': 'query and session_id are required'}), 400

    response_data = get_hybrid_response_logic_non_streaming(user_query, session_id, user_id, None)
    return jsonify(response_data)
698
+
699
+ # --- OpenAI Compatible Endpoints (Added) ---
700
@app.route('/v1/models', methods=['GET'])
def list_models():
    """Return the single model exposed by this service as an OpenAI-style model list."""
    model_entry = {"id": "MedicalAssisstantBot/v1", "object": "model", "created": int(time.time()), "owned_by": "user"}
    return jsonify({"object": "list", "data": [model_entry]})
707
+
708
@app.route('/v1/chat/completions', methods=['POST'])
def openai_compatible_chat_endpoint():
    """
    OpenAI-compatible chat completion endpoint.

    The last entry of ``messages`` is taken as the user query and the
    preceding entries as chat history. With ``stream`` set, the answer is
    returned as a ``text/event-stream`` response; otherwise a single
    ``chat.completion`` JSON object is returned. A body that is not a JSON
    object, or that has no usable ``messages`` list, yields a 400 error.
    """
    # silent=True yields None instead of raising; reject null/array bodies
    # before calling .get() on them.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "No messages provided"}), 400

    is_streaming = data.get("stream", False)

    messages = data.get("messages", [])
    if not isinstance(messages, list) or not messages or not isinstance(messages[-1], dict):
        return jsonify({"error": "No messages provided"}), 400

    # "content": null is treated as an empty query.
    user_query = clean_html_from_text(messages[-1].get("content") or "")
    chat_history = messages[:-1]
    session_id = data.get("conversation_id", f"webui-session-{str(generate_uuid())}")
    user_id = None

    if is_streaming:
        logic_generator = generate_streaming_response(user_query, session_id, user_id, chat_history)
        return Response(stream_formatter(logic_generator, session_id, user_id, user_query), mimetype='text/event-stream')

    full_response_dict = get_hybrid_response_logic_non_streaming(user_query, session_id, user_id, chat_history)
    response_content = full_response_dict.get("answer", "Sorry, an error occurred.")

    openai_response = {
        "id": f"chatcmpl-{str(generate_uuid())}", "object": "chat.completion", "created": int(time.time()),
        "model": "MedicalAssisstantBot/v1",
        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_content}, "finish_reason": "stop"}],
        # Token accounting is not tracked; zeros keep the response shape complete.
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    }
    history_manager.update_history(session_id, user_query, response_content)
    return jsonify(openai_response)
737
+
738
+
739
+ # --- Admin and Utility Routes (Unchanged) ---
740
@app.route('/')
def index_route():
    """Serve the chat UI template, or a plain 404 message when the template file is absent."""
    template_to_render = 'chat-bot.html'
    template_path = os.path.join(app.root_path, 'templates', template_to_render)
    if os.path.exists(template_path):
        return render_template(template_to_render)

    logger.warning(f"Template '{template_to_render}' not found. Serving basic message.")
    return "Chatbot interface not found. Please ensure 'templates/chat-bot.html' exists.", 404
747
+
748
@app.route('/admin/faiss_rag_status', methods=['GET'])
@require_admin_auth
def get_faiss_rag_status():
    """
    Report the state of the FAISS RAG system as JSON.

    Returns a 500 error when the RAG system is not initialized or when
    collecting the status raises.
    """
    global rag_system
    if not rag_system:
        return jsonify({"error": "FAISS RAG system not initialized."}), 500
    try:
        retriever = rag_system.retriever
        if retriever:
            state_label = "Initialized"
            retriever_k = retriever.k
        else:
            state_label = "Initialized (Retriever not ready)"
            retriever_k = "N/A"

        status = {
            "status": state_label,
            "index_storage_dir": rag_system.index_storage_dir,
            "embedding_model": rag_system.embedding_model_name,
            "groq_model": rag_system.groq_model_name,
            "retriever_k": retriever_k,
            "processed_source_files": rag_system.processed_source_files,
            "index_type": "FAISS",
            "index_loaded_or_built": rag_system.vector_store is not None
        }

        vector_store = rag_system.vector_store
        index_available = vector_store and hasattr(vector_store, 'index') and vector_store.index
        if not index_available:
            vector_count = "N/A (Vector store or index not available)"
        else:
            try:
                vector_count = vector_store.index.ntotal
            except Exception:
                vector_count = "N/A (Could not get count)"
        status["num_vectors_in_index"] = vector_count

        return jsonify(status)
    except Exception as e:
        logger.error(f"Error getting FAISS RAG status: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
776
+
777
+ # NEW FUNCTION: Endpoint to download the combined QA databases as an Excel file
778
@app.route('/admin/download_qa_database', methods=['GET'])
@require_admin_auth
def download_qa_database():
    """
    Download the loaded QA sets as one Excel workbook.

    Each QA set held by the embedding manager that has a dataframe is
    written to its own sheet (General_QA, Personal_QA, Greetings). Returns
    a 500 JSON error when the workbook cannot be generated.
    """
    # (embedding-manager key, sheet name) pairs, in workbook order.
    sheets = (
        ('general', 'General_QA'),
        ('personal', 'Personal_QA'),
        ('greetings', 'Greetings'),
    )
    try:
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            for source_key, sheet_name in sheets:
                # Look the entry up once so the guard and the write can never
                # disagree on the key (the greetings write used a mistyped key).
                qa_source = embedding_manager.embeddings[source_key]
                if qa_source and qa_source.df_qa is not None:
                    qa_source.df_qa.to_excel(writer, sheet_name=sheet_name, index=False)

        # Rewind so send_file streams the workbook from its first byte.
        output.seek(0)

        return send_file(
            output,
            mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            as_attachment=True,
            download_name=f'qa_database_{datetime.now().strftime("%Y%m%d")}.xlsx'
        )
    except Exception as e:
        logger.error(f"Error generating QA database file: {e}", exc_info=True)
        return jsonify({'error': 'Failed to generate QA database file'}), 500
805
+
806
@app.route('/admin/rebuild_faiss_index', methods=['POST'])
@require_admin_auth
def rebuild_faiss_index_route():
    """
    Rebuild the FAISS RAG index in two steps.

    Step 1 runs ``chunker.py`` as a subprocess to turn the source documents
    into a chunked JSON file. Step 2 re-initializes the RAG system with
    ``force_rebuild=True`` and, on success, swaps it in as the active
    ``rag_system``. Each failure mode returns a 500 JSON error.
    """
    global rag_system
    logger.info("Admin request to rebuild FAISS RAG index received. Starting two-step process.")

    logger.info("Step 1: Running chunker.py to pre-process source documents.")
    chunker_script_path = os.path.join(_APP_BASE_DIR, 'chunker.py')
    chunked_json_output_path = os.path.join(RAG_STORAGE_PARENT_DIR, RAG_CHUNKED_SOURCES_FILENAME)

    os.makedirs(TEXT_EXTRACTIONS_DIR, exist_ok=True)

    if not os.path.exists(chunker_script_path):
        logger.error(f"Chunker script not found at '{chunker_script_path}'. Aborting rebuild.")
        return jsonify({"error": f"chunker.py not found. Cannot proceed with rebuild."}), 500

    # Argument list (no shell) run with the interpreter that runs this app.
    command = [
        sys.executable,
        chunker_script_path,
        '--sources-dir', RAG_SOURCES_DIR,
        '--output-file', chunked_json_output_path,
        '--text-output-dir', TEXT_EXTRACTIONS_DIR
    ]

    try:
        process = subprocess.run(command, capture_output=True, text=True, check=True)
        logger.info("Chunker script executed successfully.")
        logger.info(f"Chunker stdout:\n{process.stdout}")
    except subprocess.CalledProcessError as e:
        logger.error(f"Chunker script failed with exit code {e.returncode}.")
        logger.error(f"Chunker stderr:\n{e.stderr}")
        return jsonify({"error": "Step 1 (Chunking) failed.", "details": e.stderr}), 500
    except Exception as e:
        logger.error(f"An unexpected error occurred while running the chunker script: {e}", exc_info=True)
        return jsonify({"error": f"An unexpected error occurred during the chunking step: {str(e)}"}), 500

    logger.info("Step 2: Rebuilding FAISS index from the newly generated chunks.")
    try:
        new_rag_system_instance = initialize_and_get_rag_system(force_rebuild=True)

        if new_rag_system_instance and new_rag_system_instance.vector_store:
            rag_system = new_rag_system_instance
            logger.info("FAISS RAG index rebuild completed and new RAG system instance is active.")
            # The status view returns either a response or a (response, code)
            # tuple on its error paths; unwrap it so a status hiccup does not
            # turn a successful rebuild into a reported failure.
            status_result = get_faiss_rag_status()
            status_response = status_result[0] if isinstance(status_result, tuple) else status_result
            return jsonify({"message": "FAISS RAG index rebuild completed.", "status": status_response.get_json()}), 200
        else:
            logger.error("FAISS RAG index rebuild failed during the indexing phase.")
            return jsonify({"error": "Step 2 (Indexing) failed. Check logs."}), 500

    except Exception as e:
        logger.error(f"Error during admin FAISS index rebuild (indexing phase): {e}", exc_info=True)
        return jsonify({"error": f"Failed to rebuild index during indexing phase: {str(e)}"}), 500
858
+
859
@app.route('/db/status', methods=['GET'])
@require_admin_auth
def get_personal_db_status():
    """
    Report whether the personal-data CSV exists, whether it is loaded, and
    its last modification time (ISO format, or None when the file is absent).
    """
    try:
        db_path = personal_data_monitor.database_path
        file_exists = os.path.exists(db_path)
        status_info = {
            'personal_data_csv_monitor_status': 'running',
            'file_exists': file_exists,
            'data_loaded': personal_data_monitor.df is not None,
            'last_update': None
        }
        if file_exists:
            modified_at = os.path.getmtime(db_path)
            status_info['last_update'] = datetime.fromtimestamp(modified_at).isoformat()
        return jsonify(status_info)
    except Exception as e:
        return jsonify({'status': 'error', 'error': str(e)}), 500
872
+
873
@app.route('/report', methods=['GET'])
@require_admin_auth
def download_report():
    """Send the chat log CSV as a timestamped attachment; 404 when it is missing or empty."""
    try:
        log_missing = not os.path.exists(CHAT_LOG_FILE)
        if log_missing or os.path.getsize(CHAT_LOG_FILE) == 0:
            return jsonify({'error': 'No chat history available.'}), 404

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return send_file(
            CHAT_LOG_FILE,
            mimetype='text/csv',
            as_attachment=True,
            download_name=f'chat_history_{timestamp}.csv'
        )
    except Exception as e:
        logger.error(f"Error downloading report: {e}", exc_info=True)
        return jsonify({'error': 'Failed to generate report'}), 500
883
+
884
@app.route('/create-session', methods=['POST'])
def create_session_route():
    """Generate a fresh session id and return it to the client."""
    try:
        new_session_id = str(generate_uuid())
        logger.info(f"New session created: {new_session_id}")
        payload = {'status': 'success', 'session_id': new_session_id}
        return jsonify(payload), 200
    except Exception as e:
        logger.error(f"Session creation error: {e}", exc_info=True)
        return jsonify({'status': 'error', 'message': str(e)}), 500
893
+
894
@app.route('/version', methods=['GET'])
def get_version_route():
    """Return the application version string."""
    version_payload = {'version': '3.9.8-Env-Chat-History'}  # Updated version
    return jsonify(version_payload), 200
897
+
898
@app.route('/clear-history', methods=['POST'])
def clear_session_history_route():
    """
    Clear the stored chat history of one session.

    Expects a JSON object with ``session_id``; answers 400 when the body is
    not a JSON object or the id is missing.
    """
    # silent=True yields None instead of raising; a null or array body must
    # not reach .get().
    data = request.get_json(silent=True)
    session_id = data.get('session_id') if isinstance(data, dict) else None
    if not session_id:
        return jsonify({'status': 'error', 'message': 'session_id is required'}), 400
    history_manager.clear_history(session_id)
    return jsonify({'status': 'success', 'message': f'History cleared for session {session_id}'})
906
+
907
+ # --- App Cleanup and Startup ---
908
@atexit.register
def cleanup_application():
    """Stop the personal-data monitor (if there is one) when the interpreter exits."""
    if personal_data_monitor:
        personal_data_monitor.stop()
    logger.info("Application cleanup finished.")
912
+
913
def _read_optional_qa_csv(filename: str) -> "pd.DataFrame":
    """Read one optional QA CSV from RAG_SOURCES_DIR; return an empty Question/Answer/Image frame if it is missing or unreadable."""
    qa_df = pd.DataFrame(columns=['Question', 'Answer', 'Image'])
    csv_path = os.path.join(RAG_SOURCES_DIR, filename)
    if not os.path.exists(csv_path):
        logger.warning(f"Optional file '{filename}' not found in '{RAG_SOURCES_DIR}'.")
        return qa_df
    try:
        qa_df = pd.read_csv(csv_path, encoding='cp1252')
        print(f"- Loaded: {os.path.basename(csv_path)}")
    except Exception as e_csv:
        logger.error(f"Error reading {filename}: {e_csv}")
    return qa_df


def load_qa_data_on_startup():
    """
    Load the QA source files and (re)build the semantic-search embeddings.

    Reads ``general_qa.csv``, ``personal_qa.csv`` and ``greetings.csv`` from
    ``RAG_SOURCES_DIR`` (each optional), merges every ``.xlsx`` file that has
    both 'Pregunta' and 'Respuesta' columns into the general set, normalizes
    the text of all columns and passes the three dataframes to the embedding
    manager. Any failure is logged as critical and swallowed so that startup
    can continue.
    """
    global embedding_manager
    print("\n--- Loading QA Source Files ---")
    try:
        general_qa_df = _read_optional_qa_csv('general_qa.csv')
        personal_qa_df = _read_optional_qa_csv('personal_qa.csv')
        greetings_qa_df = _read_optional_qa_csv('greetings.csv')

        logger.info(f"Scanning for additional QA sources (.xlsx) in '{RAG_SOURCES_DIR}'...")
        if os.path.isdir(RAG_SOURCES_DIR):
            xlsx_files_found = [f for f in os.listdir(RAG_SOURCES_DIR) if f.endswith('.xlsx') and os.path.isfile(os.path.join(RAG_SOURCES_DIR, f))]

            if xlsx_files_found:
                all_general_dfs = [general_qa_df] if not general_qa_df.empty else []
                for xlsx_file in xlsx_files_found:
                    try:
                        xlsx_path = os.path.join(RAG_SOURCES_DIR, xlsx_file)
                        logger.info(f"Processing XLSX source file: {xlsx_file}")
                        df_excel = pd.read_excel(xlsx_path)

                        if 'Pregunta' in df_excel.columns and 'Respuesta' in df_excel.columns:
                            logger.info(f"Found 'Pregunta' and 'Respuesta' in {xlsx_file}. Preserving all columns.")
                            # The 'Question' column is required by the EmbeddingManager for semantic search.
                            # It is created from 'Pregunta' while all original columns are kept.
                            df_excel['Question'] = df_excel['Pregunta']
                            all_general_dfs.append(df_excel)
                            print(f"- Loaded and processing: {xlsx_file}")
                        else:
                            logger.warning(f"Skipping XLSX file '{xlsx_file}' as it lacks the required 'Pregunta' and 'Respuesta' columns.")
                    except Exception as e_xlsx:
                        # One unreadable workbook must not prevent the others from loading.
                        logger.error(f"Error processing XLSX file '{xlsx_file}': {e_xlsx}")

                if all_general_dfs:
                    general_qa_df = pd.concat(all_general_dfs, ignore_index=True)
                    logger.info(f"Successfully merged data from {len(xlsx_files_found)} XLSX file(s) into the general QA set.")
        else:
            logger.warning(f"Sources directory '{RAG_SOURCES_DIR}' not found. Cannot scan for additional QA files.")

        dataframes_to_process = {
            "general": general_qa_df,
            "personal": personal_qa_df,
            "greetings": greetings_qa_df
        }

        for df_name, df_val in dataframes_to_process.items():
            if df_val.empty:
                continue

            # Normalize text in all columns to prevent issues.
            # NOTE(review): astype(str) turns missing cells into the text 'nan'
            # in any column that is not entirely empty - confirm that is intended.
            for col in df_val.columns:
                if not df_val[col].isnull().all():
                    df_val[col] = df_val[col].astype(str).apply(normalize_text)

            # Ensure 'Question' column exists for embedding manager compatibility.
            if 'Question' not in df_val.columns:
                df_val['Question'] = None
                logger.warning(f"'Question' column missing in {df_name} data. Added empty column.")

        embedding_manager.update_embeddings(
            dataframes_to_process["general"],
            dataframes_to_process["personal"],
            dataframes_to_process["greetings"]
        )
        logger.info("CSV & XLSX QA data loaded and embeddings initialized.")

    except Exception as e:
        logger.critical(f"CRITICAL: Error loading or processing QA data: {e}. Semantic QA may not function.", exc_info=True)
    print("-----------------------------\n")
1015
+
1016
if __name__ == '__main__':
    # Create the templates, static and text-extraction folders if they are missing.
    required_dirs = (
        os.path.join(_APP_BASE_DIR, 'templates'),
        os.path.join(_APP_BASE_DIR, 'static'),
        TEXT_EXTRACTIONS_DIR,
    )
    for required_dir in required_dirs:
        os.makedirs(required_dir, exist_ok=True)

    load_qa_data_on_startup()
    initialize_chat_log()

    logger.info("Attempting to initialize RAG system from llm_handling module...")
    rag_system = initialize_and_get_rag_system()
    if not rag_system:
        logger.warning("RAG system failed to initialize. Document RAG functionality will be unavailable.")
    else:
        logger.info("RAG system initialized successfully via llm_handling module.")

    logger.info(f"Flask application starting with Hybrid RAG on {FLASK_APP_HOST}:{FLASK_APP_PORT} Debug: {FLASK_DEBUG_MODE}...")
    if not FLASK_DEBUG_MODE:
        # Outside debug mode only werkzeug errors are logged.
        logging.getLogger('werkzeug').setLevel(logging.ERROR)

    app.run(host=FLASK_APP_HOST, port=FLASK_APP_PORT, debug=FLASK_DEBUG_MODE, use_reloader=False)
chunker.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import json
4
+ import argparse
5
+ from typing import List, Dict, Optional
6
+
7
+ from pypdf import PdfReader
8
+ import docx as python_docx
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+
11
+ # --- Logging Setup ---
12
+ logging.basicConfig(
13
+ level=logging.INFO,
14
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
15
+ handlers=[
16
+ logging.StreamHandler()
17
+ ]
18
+ )
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # --- Text Extraction Helper Functions ---
22
+ # Note: These are duplicated from llm_handling.py to make this a standalone script.
23
def extract_text_from_file(file_path: str, file_type: str) -> Optional[str]:
    """
    Extract the plain text of a PDF, DOCX or TXT file.

    Args:
        file_path: Path of the file to read.
        file_type: One of 'pdf', 'docx' or 'txt'.

    Returns:
        The extracted text with surrounding whitespace stripped, or None when
        the type is unsupported, no text could be extracted, or reading the
        file raised (the error is logged, not propagated).
    """
    logger.info(f"Extracting text from {file_type.upper()} file: {os.path.basename(file_path)}")
    text_content = None
    try:
        if file_type == 'pdf':
            reader = PdfReader(file_path)
            # Extract each page exactly once; pages without text are skipped.
            page_texts = (page.extract_text() for page in reader.pages)
            text_content = "".join(page_text + "\n" for page_text in page_texts if page_text)
        elif file_type == 'docx':
            doc = python_docx.Document(file_path)
            text_content = "\n".join(para.text for para in doc.paragraphs if para.text)
        elif file_type == 'txt':
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text_content = f.read()
        else:
            logger.warning(f"Unsupported file type for text extraction: {file_type} for file {os.path.basename(file_path)}")
            return None

        if not text_content or not text_content.strip():
            logger.warning(f"No text content extracted from {os.path.basename(file_path)}")
            return None
        return text_content.strip()
    except Exception as e:
        # Best effort: a single unreadable document must not abort the whole run.
        logger.error(f"Error extracting text from {os.path.basename(file_path)} ({file_type.upper()}): {e}", exc_info=True)
        return None
47
+
48
def _extractor_for(file_type: str):
    """Build a one-argument extractor that reads a path as ``file_type``."""
    return lambda path: extract_text_from_file(path, file_type)


# Maps a lower-case file extension to the callable that extracts its text.
SUPPORTED_EXTENSIONS = {
    file_type: _extractor_for(file_type)
    for file_type in ('pdf', 'docx', 'txt')
}
53
+
54
def process_sources_and_create_chunks(
    sources_dir: str,
    output_file: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
    text_output_dir: Optional[str] = None
) -> None:
    """
    Scan a directory for source files, extract their text, split it into
    chunks and save all chunks to a single JSON file.

    Args:
        sources_dir: Directory holding the source documents.
        output_file: Path of the JSON file to write.
        chunk_size: Character size of each chunk.
        chunk_overlap: Character overlap between consecutive chunks.
        text_output_dir: Optional directory that receives the raw extracted
            text of every document as ``<filename>.txt``.

    Raises:
        FileNotFoundError: If ``sources_dir`` is not an existing directory.
    """
    if not os.path.isdir(sources_dir):
        logger.error(f"Source directory not found: '{sources_dir}'")
        raise FileNotFoundError(f"Source directory not found: '{sources_dir}'")

    logger.info(f"Starting chunking process. Sources: '{sources_dir}', Output: '{output_file}'")

    if text_output_dir:
        os.makedirs(text_output_dir, exist_ok=True)
        logger.info(f"Will save raw extracted text to: '{text_output_dir}'")

    all_chunks_for_json: List[Dict] = []
    processed_files_count = 0

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # os.listdir order is arbitrary; sorting keeps the chunk order reproducible.
    for filename in sorted(os.listdir(sources_dir)):
        file_path = os.path.join(sources_dir, filename)
        if not os.path.isfile(file_path):
            continue

        # Require a real '.' separator so an extension-less file that happens
        # to be named 'pdf' or 'txt' is not mistaken for a supported document.
        _, dot, extension = filename.rpartition('.')
        file_ext = extension.lower()
        if not dot or file_ext not in SUPPORTED_EXTENSIONS:
            logger.debug(f"Skipping unsupported file: {filename}")
            continue

        logger.info(f"Processing source file: {filename}")
        text_content = SUPPORTED_EXTENSIONS[file_ext](file_path)

        if not text_content:
            logger.warning(f"Could not extract text from {filename}. Skipping.")
            continue

        if text_output_dir:
            # Saving the raw text is a debugging aid; a failure here is logged
            # and does not stop the chunking of this document.
            try:
                text_output_path = os.path.join(text_output_dir, f"{filename}.txt")
                with open(text_output_path, 'w', encoding='utf-8') as f_text:
                    f_text.write(text_content)
                logger.info(f"Saved extracted text for '{filename}' to '{text_output_path}'")
            except Exception as e_text_save:
                logger.error(f"Could not save extracted text for '{filename}': {e_text_save}")

        chunks = text_splitter.split_text(text_content)
        if not chunks:
            logger.warning(f"No chunks generated from {filename}. Skipping.")
            continue

        all_chunks_for_json.extend(
            {
                "page_content": chunk_text,
                "metadata": {
                    "source_document_name": filename,
                    "chunk_index": i,
                    "full_location": f"{filename}, Chunk {i+1}"
                }
            }
            for i, chunk_text in enumerate(chunks)
        )
        processed_files_count += 1

    if not all_chunks_for_json:
        logger.warning(f"No processable documents found or no text extracted in '{sources_dir}'. JSON file will be empty.")

    # A bare file name has no directory part, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_chunks_for_json, f, indent=2)

    logger.info(f"Chunking complete. Processed {processed_files_count} files.")
    logger.info(f"Created a total of {len(all_chunks_for_json)} chunks.")
    logger.info(f"Chunked JSON output saved to: {output_file}")
138
+
139
+
140
def main():
    """
    Command-line entry point: parse the arguments and run the chunking process.

    Exits with status 1 when the chunking process raises.
    """
    parser = argparse.ArgumentParser(description="Process source documents into a JSON file of text chunks for RAG.")
    parser.add_argument(
        '--sources-dir',
        type=str,
        required=True,
        help="The directory containing source files (PDFs, DOCX, TXT)."
    )
    parser.add_argument(
        '--output-file',
        type=str,
        required=True,
        help="The full path for the output JSON file containing the chunks."
    )
    parser.add_argument(
        '--text-output-dir',
        type=str,
        default=None,
        help="Optional: The directory to save raw extracted text files for debugging."
    )
    parser.add_argument(
        '--chunk-size',
        type=int,
        default=1000,
        help="The character size for each text chunk."
    )
    parser.add_argument(
        '--chunk-overlap',
        type=int,
        default=150,
        help="The character overlap between consecutive chunks."
    )

    args = parser.parse_args()

    try:
        process_sources_and_create_chunks(
            sources_dir=args.sources_dir,
            output_file=args.output_file,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap,
            text_output_dir=args.text_output_dir
        )
    except Exception as e:
        logger.critical(f"A critical error occurred during the chunking process: {e}", exc_info=True)
        # The bare exit() helper is injected by the site module and is not
        # always available; raising SystemExit always works.
        raise SystemExit(1)
187
+
188
+ if __name__ == "__main__":
189
+ main()
env ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ FLASK_HOST=0.0.0.0
3
+ FLASK_PORT=7860
4
+ # FLASK_DEBUG="False" # Gradio uses debug in launch()
5
+
6
+ # --- Groq LLM Models ---
7
+ GROQ_FALLBACK_MODEL=qwen/qwen3-32b
8
+ GROQ_AUXILIARY_MODEL=llama3-8b-8192
9
+
10
+ RAG_LLM_MODEL=qwen/qwen3-32b
11
+ RAG_TEMPERATURE=0.1
12
+
13
+ # --- RAG System Configuration ---
14
+ # Embedding model for RAG
15
+ RAG_EMBEDDING_MODEL=all-MiniLM-L6-v2
16
+
17
+ # Whether to run the RAG embedding model on GPU (takes effect only when PyTorch reports CUDA as available; otherwise CPU is used)
18
+
19
+ RAG_EMBEDDING_GPU=false
20
+
21
+ # Whether to attempt loading an existing RAG index on startup.
22
+ RAG_LOAD_INDEX=true
23
+
24
+ # Default number of documents the RAG retriever should fetch.
25
+ RAG_RETRIEVER_K=5
26
+
27
+ GDRIVE_SOURCES_ENABLED=False
28
+ GDRIVE_FOLDER_URL=&HBGGtZ4TJA
29
+
30
+ LLM_FORMATTER_CONFIDENCE_THRESHOLD=95
31
+ CHAT_HISTORY_TO_SEND=5
faiss_storage/faiss_index/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c627f912c7eead10f1ed14a68211eaad41b98e8920deb999f79f8671dc01979
3
+ size 3640365
faiss_storage/faiss_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7f4e06d1c5d0de27cae7d9a9328c94073faa58705ceea28242166dca11c581e
3
+ size 2569631
gitattributes ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ faiss_storage/faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
37
+ sources/Endodontics[[:space:]]appendix[[:space:]]4.pdf filter=lfs diff=lfs merge=lfs -text
38
+ sources/Endodontics[[:space:]]cap[[:space:]]1.pdf filter=lfs diff=lfs merge=lfs -text
39
+ sources/Endodontics[[:space:]]cap[[:space:]]10.pdf filter=lfs diff=lfs merge=lfs -text
40
+ sources/Endodontics[[:space:]]cap[[:space:]]11.pdf filter=lfs diff=lfs merge=lfs -text
41
+ sources/Endodontics[[:space:]]cap[[:space:]]12.pdf filter=lfs diff=lfs merge=lfs -text
42
+ sources/Endodontics[[:space:]]cap[[:space:]]13.pdf filter=lfs diff=lfs merge=lfs -text
43
+ sources/Endodontics[[:space:]]cap[[:space:]]14.pdf filter=lfs diff=lfs merge=lfs -text
44
+ sources/Endodontics[[:space:]]cap[[:space:]]15.pdf filter=lfs diff=lfs merge=lfs -text
45
+ sources/Endodontics[[:space:]]cap[[:space:]]16.pdf filter=lfs diff=lfs merge=lfs -text
46
+ sources/Endodontics[[:space:]]cap[[:space:]]17.pdf filter=lfs diff=lfs merge=lfs -text
47
+ sources/Endodontics[[:space:]]cap[[:space:]]18.pdf filter=lfs diff=lfs merge=lfs -text
48
+ sources/Endodontics[[:space:]]cap[[:space:]]19.pdf filter=lfs diff=lfs merge=lfs -text
49
+ sources/Endodontics[[:space:]]cap[[:space:]]2.pdf filter=lfs diff=lfs merge=lfs -text
50
+ sources/Endodontics[[:space:]]cap[[:space:]]20.pdf filter=lfs diff=lfs merge=lfs -text
51
+ sources/Endodontics[[:space:]]cap[[:space:]]21.pdf filter=lfs diff=lfs merge=lfs -text
52
+ sources/Endodontics[[:space:]]cap[[:space:]]22.pdf filter=lfs diff=lfs merge=lfs -text
53
+ sources/Endodontics[[:space:]]cap[[:space:]]3.pdf filter=lfs diff=lfs merge=lfs -text
54
+ sources/Endodontics[[:space:]]cap[[:space:]]4.pdf filter=lfs diff=lfs merge=lfs -text
55
+ sources/Endodontics[[:space:]]cap[[:space:]]5.pdf filter=lfs diff=lfs merge=lfs -text
56
+ sources/Endodontics[[:space:]]cap[[:space:]]6.pdf filter=lfs diff=lfs merge=lfs -text
57
+ sources/Endodontics[[:space:]]cap[[:space:]]7.pdf filter=lfs diff=lfs merge=lfs -text
58
+ sources/Endodontics[[:space:]]cap[[:space:]]8.pdf filter=lfs diff=lfs merge=lfs -text
59
+ sources/Endodontics[[:space:]]cap[[:space:]]9.pdf filter=lfs diff=lfs merge=lfs -text
llm_handling.py ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import json
4
+ from typing import List, Dict, Tuple, Optional, Any, Iterator
5
+ import shutil
6
+ import re
7
+ import time
8
+ import requests
9
+ import zipfile
10
+ import tempfile
11
+ import gdown
12
+
13
+ import torch
14
+ from sentence_transformers import SentenceTransformer
15
+ from pypdf import PdfReader
16
+ import docx as python_docx
17
+
18
+ from llama_index.core.llms import ChatMessage
19
+ from llama_index.llms.groq import Groq as LlamaIndexGroqClient
20
+
21
+ from langchain_groq import ChatGroq
22
+ from langchain_community.embeddings import HuggingFaceEmbeddings
23
+ from langchain_community.vectorstores import FAISS
24
+ from langchain.prompts import ChatPromptTemplate
25
+ from langchain.schema import Document, BaseRetriever
26
+ from langchain.callbacks.manager import CallbackManagerForRetrieverRun
27
+ from langchain.schema.runnable import RunnablePassthrough, RunnableParallel
28
+ from langchain.schema.output_parser import StrOutputParser
29
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
30
+ # MODIFIED: Import the new prompt
31
+ from system_prompts import RAG_SYSTEM_PROMPT, FALLBACK_SYSTEM_PROMPT, QA_FORMATTER_PROMPT
32
+
33
logger = logging.getLogger(__name__)
if len(logger.handlers) == 0:
    # No handler attached to this module's logger yet: apply a default
    # root configuration so messages are visible.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )


def _env_flag(variable: str, fallback: str) -> bool:
    """Return True only when the env value (or fallback) equals "true", case-insensitively."""
    return os.environ.get(variable, fallback).lower() == "true"


# --- Configuration Constants ---
# API credentials (the Groq key is read from BOT_API_KEY).
GROQ_API_KEY = os.environ.get('BOT_API_KEY')
if not GROQ_API_KEY:
    logger.critical("CRITICAL: BOT_API_KEY environment variable not found. Services will fail.")

# Model names for the fallback bot and for auxiliary/formatting calls.
FALLBACK_LLM_MODEL_NAME = os.environ.get("GROQ_FALLBACK_MODEL", "llama-3.1-70b-versatile")
AUXILIARY_LLM_MODEL_NAME = os.environ.get("GROQ_AUXILIARY_MODEL", "llama3-8b-8192")

# Filesystem layout; both directories default to folders next to this module
# and are created at import time if missing.
_MODULE_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
RAG_FAISS_INDEX_SUBDIR_NAME = "faiss_index"
RAG_STORAGE_PARENT_DIR = os.environ.get(
    "RAG_STORAGE_DIR", os.path.join(_MODULE_BASE_DIR, "faiss_storage")
)
RAG_SOURCES_DIR = os.environ.get(
    "SOURCES_DIR", os.path.join(_MODULE_BASE_DIR, "sources")
)
RAG_CHUNKED_SOURCES_FILENAME = "pre_chunked_sources.json"
for _required_dir in (RAG_SOURCES_DIR, RAG_STORAGE_PARENT_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# RAG embedding / LLM / retriever settings.
RAG_EMBEDDING_MODEL_NAME = os.environ.get("RAG_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
RAG_EMBEDDING_USE_GPU = _env_flag("RAG_EMBEDDING_GPU", "False")
RAG_LLM_MODEL_NAME = os.environ.get("RAG_LLM_MODEL", "llama-3.1-70b-versatile")
RAG_LLM_TEMPERATURE = float(os.environ.get("RAG_TEMPERATURE", 0.0))
RAG_LOAD_INDEX_ON_STARTUP = _env_flag("RAG_LOAD_INDEX", "True")
RAG_DEFAULT_RETRIEVER_K = int(os.environ.get("RAG_RETRIEVER_K", 3))

# Optional Google Drive source synchronisation.
GDRIVE_SOURCES_ENABLED = _env_flag("GDRIVE_SOURCES_ENABLED", "False")
GDRIVE_FOLDER_ID_OR_URL = os.environ.get("GDRIVE_FOLDER_URL")
63
+
64
+ # --- Text Extraction Helper Function ---
65
def extract_text_from_file(file_path: str, file_type: str) -> Optional[str]:
    """Extract plain text from a PDF, DOCX or TXT file.

    Args:
        file_path: Path of the file to read.
        file_type: Lower-case type key: ``'pdf'``, ``'docx'`` or ``'txt'``.

    Returns:
        The extracted text, or ``None`` when the type is unsupported or any
        error occurs while reading (the error is logged, never raised).
    """
    logger.info(f"Extracting text from {file_type.upper()} file: {os.path.basename(file_path)}")
    try:
        if file_type == 'pdf':
            reader = PdfReader(file_path)
            # Extract each page exactly once; the previous version called
            # extract_text() twice per page (filter + value), doubling the
            # parsing cost. Pages yielding no text are skipped.
            page_texts = (page.extract_text() for page in reader.pages)
            return "".join(text + "\n" for text in page_texts if text)
        elif file_type == 'docx':
            doc = python_docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
        elif file_type == 'txt':
            # Undecodable bytes are dropped rather than aborting the read.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        logger.warning(f"Unsupported file type for text extraction: {file_type}")
        return None
    except Exception as e:
        logger.error(f"Error extracting text from {os.path.basename(file_path)}: {e}", exc_info=True)
        return None


# Maps a lower-case file extension to the `file_type` key understood by
# extract_text_from_file.
FAISS_RAG_SUPPORTED_EXTENSIONS = {'pdf': 'pdf', 'docx': 'docx', 'txt': 'txt'}
84
+
85
+ # --- FAISS RAG System ---
86
class FAISSRetrieverWithScore(BaseRetriever):
    """Retriever over a FAISS vector store that keeps each hit's score.

    Every returned document gets the score reported by
    ``similarity_search_with_score`` stored under the ``"retrieval_score"``
    metadata key (as a float). NOTE(review): for LangChain's FAISS store this
    score is presumably a distance (lower = closer) — confirm before ranking
    or thresholding on it.
    """
    # Vector store that is queried for nearest neighbours.
    vectorstore: FAISS
    # Number of documents fetched per query.
    k: int = RAG_DEFAULT_RETRIEVER_K

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Return the top-``k`` documents for ``query``, annotated with their scores."""
        scored_hits = self.vectorstore.similarity_search_with_score(query, k=self.k)
        annotated: List[Document] = []
        for document, raw_score in scored_hits:
            document.metadata.update(retrieval_score=float(raw_score))
            annotated.append(document)
        return annotated
99
+
100
class KnowledgeRAG:
    """Retrieval-augmented question answering over a FAISS index of document chunks.

    The instance owns the embedding model, the Groq chat model, the FAISS
    vector store and the LCEL chains built from them. An index is either
    built from source files (``build_index_from_source_files``) or loaded
    from disk (``load_index_from_disk``); until one of those succeeds,
    ``invoke`` and ``stream`` answer with a "not ready" message.
    """

    def __init__(
        self,
        index_storage_dir: str,
        embedding_model_name: str,
        groq_model_name_for_rag: str,
        use_gpu_for_embeddings: bool,
        groq_api_key_for_rag: str,
        temperature: float,
    ):
        """Load the embedding model and create the Groq chat client.

        Args:
            index_storage_dir: Directory holding the FAISS index sub-folder
                and, optionally, the pre-chunked sources JSON file.
            embedding_model_name: Hugging Face embedding model name.
            groq_model_name_for_rag: Groq model used to answer RAG queries.
            use_gpu_for_embeddings: Use CUDA for embeddings when available.
            groq_api_key_for_rag: Groq API key.
            temperature: Sampling temperature for the RAG LLM.

        Raises:
            ValueError: If ``groq_api_key_for_rag`` is empty.
            Exception: Any error raised while loading the embedding model or
                creating the LLM client is logged and re-raised.
        """
        self.logger = logging.getLogger(__name__ + ".KnowledgeRAG")
        self.index_storage_dir = index_storage_dir
        self.embedding_model_name = embedding_model_name
        self.groq_model_name = groq_model_name_for_rag
        self.temperature = temperature

        # GPU is used only when requested AND actually available to PyTorch.
        device = "cuda" if use_gpu_for_embeddings and torch.cuda.is_available() else "cpu"
        self.logger.info(f"Initializing Hugging Face embedding model: {self.embedding_model_name} on device: {device}")
        try:
            self.embeddings = HuggingFaceEmbeddings(
                model_name=self.embedding_model_name,
                model_kwargs={"device": device},
                encode_kwargs={"normalize_embeddings": True}
            )
        except Exception as e:
            self.logger.critical(f"Failed to load embedding model: {e}", exc_info=True)
            raise

        self.logger.info(f"Initializing Langchain ChatGroq LLM for RAG: {self.groq_model_name}")
        if not groq_api_key_for_rag:
            raise ValueError("Groq API Key for RAG is missing.")
        try:
            self.llm = ChatGroq(
                temperature=self.temperature,
                groq_api_key=groq_api_key_for_rag,
                model_name=self.groq_model_name
            )
        except Exception as e:
            self.logger.critical(f"Failed to initialize Langchain ChatGroq LLM: {e}", exc_info=True)
            raise

        self.vector_store: Optional[FAISS] = None
        self.retriever: Optional[FAISSRetrieverWithScore] = None
        # Full chain (retrieve -> prompt -> LLM -> str); kept for callers that
        # use it directly or check it for readiness.
        self.rag_chain = None
        # Prompt -> LLM -> str only; used by invoke()/stream(), which retrieve
        # the documents themselves so they can log and report the sources.
        self._answer_chain = None
        self.processed_source_files: List[str] = []

    def _load_pre_chunked_documents(self, json_path: str) -> Tuple[List[Document], List[str]]:
        """Load chunk documents and their sorted source names from the pre-chunked JSON.

        Returns two empty lists (after logging the error) if the file cannot
        be read or parsed, so the caller can fall back to raw files.
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                chunk_data_list = json.load(f)
            documents = [
                Document(page_content=chunk_data.get("page_content", ""), metadata=chunk_data.get("metadata", {}))
                for chunk_data in chunk_data_list
            ]
            source_filenames = {
                doc.metadata['source_document_name']
                for doc in documents
                if 'source_document_name' in doc.metadata
            }
            return documents, sorted(source_filenames)
        except Exception as e:
            self.logger.error(f"Error processing pre-chunked JSON, falling back to raw files: {e}")
            return [], []

    def _chunk_raw_source_files(self, source_folder_path: str) -> Tuple[List[Document], List[str]]:
        """Extract and split every supported file in the folder into chunk documents."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        documents: List[Document] = []
        processed_files: List[str] = []
        # Sorted so the resulting index does not depend on directory order.
        for filename in sorted(os.listdir(source_folder_path)):
            file_path = os.path.join(source_folder_path, filename)
            # splitext (rather than split('.')[-1]) so an extension-less file
            # that happens to be named "pdf"/"txt" is not mistaken for one.
            file_ext = os.path.splitext(filename)[1].lstrip('.').lower()
            if not (os.path.isfile(file_path) and file_ext in FAISS_RAG_SUPPORTED_EXTENSIONS):
                continue
            text_content = extract_text_from_file(file_path, file_ext)
            if not text_content:
                continue
            for i, chunk_text in enumerate(text_splitter.split_text(text_content)):
                metadata = {"source_document_name": filename, "chunk_index": i}
                documents.append(Document(page_content=chunk_text, metadata=metadata))
            processed_files.append(filename)
        return documents, processed_files

    def build_index_from_source_files(self, source_folder_path: str, k: int = RAG_DEFAULT_RETRIEVER_K):
        """Build the FAISS index, save it to disk and set up the RAG chain.

        Documents come from the pre-chunked JSON file in the storage directory
        when it exists and loads cleanly; otherwise the raw PDF/DOCX/TXT files
        in ``source_folder_path`` are extracted and split. If no documents are
        found, a warning is logged and no index is built.

        Args:
            source_folder_path: Directory containing the raw source files.
            k: Number of documents the retriever fetches per query.
        """
        all_docs_for_vectorstore: List[Document] = []
        processed_files_this_build: List[str] = []
        pre_chunked_json_path = os.path.join(self.index_storage_dir, RAG_CHUNKED_SOURCES_FILENAME)

        if os.path.exists(pre_chunked_json_path):
            self.logger.info(f"Loading documents from pre-chunked file: {pre_chunked_json_path}")
            all_docs_for_vectorstore, processed_files_this_build = self._load_pre_chunked_documents(pre_chunked_json_path)

        if not all_docs_for_vectorstore:
            self.logger.info(f"Processing raw files from '{source_folder_path}' to build index.")
            all_docs_for_vectorstore, processed_files_this_build = self._chunk_raw_source_files(source_folder_path)
            if not all_docs_for_vectorstore:
                self.logger.warning(f"No processable PDF/DOCX/TXT documents found in '{source_folder_path}'. RAG index will only contain other sources if available.")

        self.processed_source_files = processed_files_this_build

        # This print statement is kept for console visibility on startup/rebuild
        print("\n--- Document Files Used for RAG Index ---")
        if self.processed_source_files:
            for filename in self.processed_source_files:
                print(f"- {filename}")
        else:
            print("No PDF/DOCX/TXT source files were processed for the RAG index.")
        print("---------------------------------------\n")

        if not all_docs_for_vectorstore:
            self.logger.warning("No documents to build FAISS index from. Skipping FAISS build.")
            return

        self.logger.info(f"Creating FAISS index from {len(all_docs_for_vectorstore)} document chunks...")
        self.vector_store = FAISS.from_documents(all_docs_for_vectorstore, self.embeddings)
        faiss_index_path = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
        self.vector_store.save_local(faiss_index_path)
        self.logger.info(f"FAISS index built and saved to '{faiss_index_path}'.")
        self.retriever = FAISSRetrieverWithScore(vectorstore=self.vector_store, k=k)
        self.setup_rag_chain()

    def load_index_from_disk(self, k: int = RAG_DEFAULT_RETRIEVER_K):
        """Load a previously saved FAISS index and set up the RAG chain.

        Args:
            k: Number of documents the retriever fetches per query.

        Raises:
            FileNotFoundError: If the index directory does not exist.
        """
        faiss_index_path = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
        if not os.path.isdir(faiss_index_path):
            raise FileNotFoundError(f"FAISS index directory not found at '{faiss_index_path}'.")

        self.logger.info(f"Loading FAISS index from: {faiss_index_path}")
        # SECURITY NOTE: allow_dangerous_deserialization=True makes the loader
        # unpickle index.pkl. Only load index directories produced by this
        # application or another trusted source.
        self.vector_store = FAISS.load_local(
            folder_path=faiss_index_path,
            embeddings=self.embeddings,
            allow_dangerous_deserialization=True
        )
        self.retriever = FAISSRetrieverWithScore(vectorstore=self.vector_store, k=k)
        self.setup_rag_chain()

    def format_docs(self, docs: List[Document]) -> str:
        """Render retrieved documents as one context string, each excerpt labelled with its source and chunk index."""
        return "\n\n---\n\n".join([f"[Excerpt from {doc.metadata.get('source_document_name', 'N/A')}, Chunk {doc.metadata.get('chunk_index', 'N/A')}]\nContent:\n{doc.page_content}" for doc in docs])

    def setup_rag_chain(self):
        """Build the LCEL chains from the current retriever and LLM.

        Raises:
            RuntimeError: If the retriever or the LLM is not initialized.
        """
        if not self.retriever or not self.llm:
            raise RuntimeError("Retriever and LLM must be initialized.")

        prompt = ChatPromptTemplate.from_template(RAG_SYSTEM_PROMPT)

        # Takes {"context": ..., "question": ...} directly, so callers that
        # already hold the retrieved documents do not trigger a second search.
        self._answer_chain = prompt | self.llm | StrOutputParser()

        self.rag_chain = (
            RunnableParallel(
                context=(self.retriever | self.format_docs),
                question=RunnablePassthrough()
            )
            | self._answer_chain
        )
        self.logger.info("RAG LCEL chain set up successfully with dental assistant persona.")

    def _resolve_k(self, top_k: Optional[int]) -> int:
        """Return ``top_k`` when it is a positive override, else the retriever's default."""
        return top_k if top_k is not None and top_k > 0 else self.retriever.k

    def _retrieve(self, query: str, k: int) -> List[Document]:
        """Fetch the top-``k`` documents without mutating the shared retriever.

        A per-call ``k`` different from the default uses a throw-away
        retriever over the same vector store, so concurrent requests cannot
        observe each other's override.
        """
        retriever = self.retriever
        if k != retriever.k:
            retriever = FAISSRetrieverWithScore(vectorstore=retriever.vectorstore, k=k)
        # `invoke` replaces the deprecated `get_relevant_documents`.
        return retriever.invoke(query)

    def invoke(self, query: str, top_k: Optional[int] = None) -> Dict[str, Any]:
        """Answer ``query`` from the indexed bibliography.

        Args:
            query: The user's question.
            top_k: Optional per-call override of the number of retrieved chunks.

        Returns:
            A dict with ``"answer"``, ``"source"`` (a status tag) and
            ``"cited_source_details"``; successful answers also carry
            ``"query"``. Errors are reported in the dict, never raised.
        """
        if not self.rag_chain or self._answer_chain is None:
            self.logger.warning("RAG system not fully initialized. Cannot invoke.")
            return {"answer": "The provided bibliography does not contain specific information on this topic.", "source": "system_error", "cited_source_details": []}

        if not query or not query.strip():
            return {"answer": "Please provide a valid question.", "source": "system_error", "cited_source_details": []}

        k_to_use = self._resolve_k(top_k)
        self.logger.info(f"Processing RAG query with k={k_to_use}: '{query[:100]}...'")

        try:
            retrieved_docs = self._retrieve(query, k_to_use)
            if not retrieved_docs:
                return {"answer": "The provided bibliography does not contain specific information on this topic.", "source": "no_docs_found", "cited_source_details": []}

            context_str = self.format_docs(retrieved_docs)

            print(f"\n--- RAG INVOKE ---")
            print(f"QUESTION: {query}")
            print(f"CONTEXT:\n{context_str}")

            # Feed the already-retrieved context to the prompt. Passing it via
            # `config=` (as before) does not reach the prompt and made the
            # chain run the retrieval a second time.
            llm_answer = self._answer_chain.invoke({"context": context_str, "question": query})

            print(f"LLM_ANSWER: {llm_answer}")
            print(f"--------------------\n")

            structured_sources = [{
                "source_document_name": doc.metadata.get('source_document_name', 'Unknown'),
                "chunk_index": doc.metadata.get('chunk_index', 'N/A'),
                "retrieval_score": doc.metadata.get("retrieval_score"),
            } for doc in retrieved_docs]

            # The LLM signals "no answer" with a fixed phrase; such replies are
            # returned untagged so they are not presented as sourced answers.
            if "the provided bibliography does not contain specific information" in llm_answer.lower():
                final_answer = llm_answer
                source_tag = "no_answer_in_bibliography"
            else:
                final_answer = f"{llm_answer}\n\n*Source: Bibliography-Based*"
                source_tag = "bibliography_based"

            return {
                "query": query,
                "answer": final_answer.strip(),
                "source": source_tag,
                "cited_source_details": structured_sources,
            }

        except Exception as e:
            self.logger.error(f"Error during RAG query processing: {e}", exc_info=True)
            return {"answer": "An error occurred while processing your request.", "source": "system_error", "cited_source_details": []}

    def stream(self, query: str, top_k: Optional[int] = None) -> Iterator[str]:
        """Stream the answer to ``query`` as text chunks.

        Args:
            query: The user's question.
            top_k: Optional per-call override of the number of retrieved chunks.

        Yields:
            Pieces of the LLM answer, or a single explanatory message when the
            system is not ready, the query is empty, nothing is retrieved, or
            an error occurs.
        """
        if not self.rag_chain or self._answer_chain is None:
            self.logger.error("RAG system not fully initialized for streaming.")
            yield "Error: RAG system is not ready."
            return

        # Same validation as invoke(): do not query the index with blank input.
        if not query or not query.strip():
            yield "Please provide a valid question."
            return

        k_to_use = self._resolve_k(top_k)
        self.logger.info(f"Processing RAG stream with k={k_to_use}: '{query[:100]}...'")

        try:
            # Check for docs first to avoid streaming "no info" message
            retrieved_docs = self._retrieve(query, k_to_use)
            if not retrieved_docs:
                yield "The provided bibliography does not contain specific information on this topic."
                return

            context_str = self.format_docs(retrieved_docs)
            print(f"\n--- RAG STREAM ---")
            print(f"QUESTION: {query}")
            print(f"CONTEXT:\n{context_str}")
            print(f"STREAMING LLM_ANSWER...")
            print(f"--------------------\n")

            # See invoke(): the context must be prompt input, not config.
            yield from self._answer_chain.stream({"context": context_str, "question": query})
        except Exception as e:
            self.logger.error(f"Error during RAG stream processing: {e}", exc_info=True)
            yield "An error occurred while processing your request."
338
+
339
+
340
+ # --- Groq Fallback Bot ---
341
class GroqBot:
    """Fallback chat bot that streams answers from a Groq-hosted model.

    If the API key is missing or the client cannot be created, ``client`` is
    left as ``None`` and ``stream_response`` yields a fixed "unavailable"
    message instead of raising.
    """

    def __init__(self):
        """Create the Groq client; failures are logged and leave ``client`` as ``None``."""
        self.logger = logging.getLogger(__name__ + ".GroqBot")
        self.client = None
        # Set unconditionally so the attribute exists on every code path.
        self.system_prompt = FALLBACK_SYSTEM_PROMPT
        if not GROQ_API_KEY:
            self.logger.critical("GroqBot not initialized: BOT_API_KEY is missing.")
            return
        try:
            self.client = LlamaIndexGroqClient(model=FALLBACK_LLM_MODEL_NAME, api_key=GROQ_API_KEY)
        except Exception as e:
            self.logger.error(f"Failed to initialize LlamaIndexGroqClient for Fallback Bot: {e}", exc_info=True)
            self.client = None

    def stream_response(self, context: dict) -> Iterator[str]:
        """Stream the model's reply for the given conversation context.

        Args:
            context: Dict read with ``.get`` for ``'current_query'`` (str),
                ``'chat_history'`` (list of keyword dicts for ``ChatMessage``)
                and ``'qa_related_info'`` (str); all keys are optional.

        Yields:
            Non-empty text deltas from the model, or a single apology message
            if the client is unavailable or any error occurs.
        """
        if not self.client:
            yield "The system is currently unable to process this request."
            return

        current_query = context.get('current_query', '')
        chat_history = context.get('chat_history', [])
        qa_info = context.get('qa_related_info', '')

        # Message construction and debug printing are inside the try block so
        # that a malformed history entry or a non-serializable field degrades
        # to the apology message instead of crashing the response stream.
        try:
            messages = [ChatMessage(role="system", content=self.system_prompt)]
            if chat_history:
                messages.extend(ChatMessage(**msg) for msg in chat_history)
            if qa_info:
                messages.append(ChatMessage(role="system", content=f"**Potentially Relevant Q&A Information from other sources:**\n{qa_info}"))
            messages.append(ChatMessage(role="user", content=f"**Current User Query:**\n{current_query}"))

            # ChatMessage is not directly JSON serializable; convert to dicts
            # and stringify anything json cannot encode (debug output only).
            messages_for_print = [msg.dict() for msg in messages]
            print(f"\n--- FALLBACK STREAM ---")
            print(f"MESSAGES SENT TO LLM:\n{json.dumps(messages_for_print, indent=2, default=str)}")
            print(f"STREAMING LLM_ANSWER...")
            print(f"-----------------------\n")

            for r_chunk in self.client.stream_chat(messages):
                # Skip empty/None deltas so consumers can safely concatenate.
                if r_chunk.delta:
                    yield r_chunk.delta
        except Exception as e:
            self.logger.error(f"Groq API error in stream_response (Fallback): {e}", exc_info=True)
            yield "I am currently unable to process this request due to a technical issue."
386
+
387
+ # ADDED: New function for formatting QA answers
388
def get_answer_from_context(question: str, context: str, system_prompt: str) -> str:
    """Ask the auxiliary LLM to answer ``question`` from a QA-source ``context``.

    Args:
        question: The user's question.
        context: Text taken from a QA source (CSV/XLSX) to answer from.
        system_prompt: Prompt template text; it is filled with the
            ``context`` and ``question`` values.

    Returns:
        The stripped LLM reply, or a fixed apology string if anything fails
        (the error is logged, never raised).
    """
    logger.info(f"Formatting answer for question '{question[:50]}...' using QA context.")
    try:
        # The auxiliary model is used here for speed and cost-efficiency.
        aux_llm = ChatGroq(
            temperature=0.1,
            groq_api_key=GROQ_API_KEY,
            model_name=AUXILIARY_LLM_MODEL_NAME
        )
        formatter_chain = ChatPromptTemplate.from_template(system_prompt) | aux_llm | StrOutputParser()

        print(f"\n--- QA FORMATTER ---")
        print(f"QUESTION: {question}")
        print(f"CONTEXT:\n{context}")

        prompt_inputs = {"context": context, "question": question}
        llm_reply = formatter_chain.invoke(prompt_inputs)

        print(f"LLM_ANSWER: {llm_reply}")
        print(f"--------------------\n")

        return llm_reply.strip()
    except Exception as exc:
        logger.error(f"Error in get_answer_from_context: {exc}", exc_info=True)
        return "Sorry, I was unable to formulate an answer based on the available information."
423
+
424
+
425
+ # --- Initialization and Interface Functions ---
426
def get_id_from_gdrive_input(url_or_id: str) -> Optional[str]:
    """Extract a Google Drive ID from a URL, or accept a bare ID.

    URL forms are tried in priority order: ``/folders/<id>``, ``/d/<id>``,
    then ``id=<id>``. Input with no ``/`` or ``=`` and more than 15
    characters is treated as an ID and returned unchanged.

    Returns:
        The ID string, or ``None`` when the input is empty or unrecognised.
    """
    if not url_or_id:
        return None

    id_patterns = (
        r"/folders/([a-zA-Z0-9_-]+)",
        r"/d/([a-zA-Z0-9_-]+)",
        r"id=([a-zA-Z0-9_-]+)",
    )
    # Lazily evaluate the patterns in order and keep the first one that hits.
    hits = filter(None, (re.search(pattern, url_or_id) for pattern in id_patterns))
    first_hit = next(hits, None)
    if first_hit is not None:
        return first_hit.group(1)

    has_url_chars = "/" in url_or_id or "=" in url_or_id
    if not has_url_chars and len(url_or_id) > 15:
        return url_or_id
    return None
435
+
436
def download_and_unzip_gdrive_folder(folder_id_or_url: str, target_dir: str) -> bool:
    """Download a Google Drive folder and replace ``target_dir`` with its contents.

    The folder is first downloaded into a temporary directory; only after a
    successful download is ``target_dir`` removed (if present) and recreated
    as a copy of the downloaded content. If the download yields a single
    top-level directory, that directory's contents are copied instead.

    Args:
        folder_id_or_url: Drive folder ID or a URL containing it.
        target_dir: Destination directory to (re)create.

    Returns:
        ``True`` on success; ``False`` if the ID is invalid, the download
        produced nothing, or any error occurred (errors are logged).
    """
    drive_id = get_id_from_gdrive_input(folder_id_or_url)
    if not drive_id:
        logger.error(f"Invalid Google Drive Folder ID or URL: {folder_id_or_url}")
        return False

    with tempfile.TemporaryDirectory() as staging_dir:
        try:
            logger.info(f"Attempting to download GDrive folder ID: {drive_id}")
            downloaded = gdown.download_folder(id=drive_id, output=staging_dir, quiet=False, use_cookies=False)
            if not (downloaded and os.listdir(staging_dir)):
                logger.error("gdown failed to download or extract the folder.")
                return False

            # Unwrap a lone top-level directory so its files land directly in
            # the target.
            staged_entries = os.listdir(staging_dir)
            content_root = staging_dir
            if len(staged_entries) == 1:
                only_entry = os.path.join(staging_dir, staged_entries[0])
                if os.path.isdir(only_entry):
                    content_root = only_entry

            logger.info(f"Moving contents from {content_root} to {target_dir}")
            if os.path.exists(target_dir):
                shutil.rmtree(target_dir)
            shutil.copytree(content_root, target_dir)
            logger.info(f"Successfully moved GDrive contents to {target_dir}")
            return True
        except Exception as exc:
            logger.error(f"Error during GDrive download/processing: {exc}", exc_info=True)
            return False
465
+
466
def initialize_and_get_rag_system(force_rebuild: bool = False) -> Optional[KnowledgeRAG]:
    """Create the RAG system, loading an existing index or building a new one.

    Steps: optionally refresh the sources directory from Google Drive,
    optionally wipe the index storage directory (``force_rebuild``), create
    the ``KnowledgeRAG`` instance, then load the index from disk when allowed
    and possible, otherwise build it from the sources.

    Args:
        force_rebuild: Delete the index storage directory and rebuild the
            index instead of loading it.

    Returns:
        The ``KnowledgeRAG`` instance, or ``None`` if the API key is missing
        or initialization fails (the error is logged).
    """
    if not GROQ_API_KEY:
        logger.error("RAG system cannot be initialized without BOT_API_KEY.")
        return None

    if GDRIVE_SOURCES_ENABLED and GDRIVE_FOLDER_ID_OR_URL:
        logger.info("Google Drive sources enabled. Downloading...")
        # Do NOT clear RAG_SOURCES_DIR up front: the download helper replaces
        # it only after a successful download, so a failed download keeps the
        # existing local files available as the fallback the log promises.
        download_successful = download_and_unzip_gdrive_folder(GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR)
        if not download_successful:
            logger.error("Failed to download sources from Google Drive. Using local files if available.")
        # Guarantee the directory exists even if the helper failed part-way
        # through replacing it.
        os.makedirs(RAG_SOURCES_DIR, exist_ok=True)

    # NOTE: a forced rebuild removes the whole storage directory, which also
    # deletes any pre-chunked sources JSON stored there.
    if force_rebuild and os.path.exists(RAG_STORAGE_PARENT_DIR):
        logger.info(f"Force Rebuild: Deleting existing index storage directory at '{RAG_STORAGE_PARENT_DIR}'")
        shutil.rmtree(RAG_STORAGE_PARENT_DIR)
        os.makedirs(RAG_STORAGE_PARENT_DIR)

    try:
        rag_instance = KnowledgeRAG(
            index_storage_dir=RAG_STORAGE_PARENT_DIR,
            embedding_model_name=RAG_EMBEDDING_MODEL_NAME,
            groq_model_name_for_rag=RAG_LLM_MODEL_NAME,
            use_gpu_for_embeddings=RAG_EMBEDDING_USE_GPU,
            groq_api_key_for_rag=GROQ_API_KEY,
            temperature=RAG_LLM_TEMPERATURE,
        )

        should_build = True
        if RAG_LOAD_INDEX_ON_STARTUP and not force_rebuild:
            try:
                rag_instance.load_index_from_disk(k=RAG_DEFAULT_RETRIEVER_K)
                logger.info("RAG index loaded successfully from disk.")
                should_build = False
            except FileNotFoundError:
                logger.warning("Index not found on disk. Will attempt to build.")
            except Exception as e:
                # A corrupt or incompatible index is not fatal: rebuild it.
                logger.error(f"Error loading index: {e}. Will attempt to rebuild.", exc_info=True)

        if should_build:
            logger.info("Building new RAG index from source data...")
            rag_instance.build_index_from_source_files(
                source_folder_path=RAG_SOURCES_DIR,
                k=RAG_DEFAULT_RETRIEVER_K
            )

        return rag_instance

    except Exception as e:
        logger.critical(f"FATAL: Failed to initialize RAG system: {e}", exc_info=True)
        return None
521
+
522
# Shared fallback bot, constructed once when this module is imported.
# GroqBot.__init__ does not raise when BOT_API_KEY is missing; it leaves
# `client` as None and stream_response() then yields an "unable to process" message.
groq_bot_instance = GroqBot()
523
+
524
+ # ADDED: New function to handle auxiliary model calls (This function is no longer used, replaced by get_answer_from_context)
525
def get_auxiliary_chat_response(messages: List[Dict]) -> str:
    """Send ``messages`` to the auxiliary Groq model and return the reply text.

    Intended for side tasks such as generating titles or follow-up
    questions, using the smaller auxiliary model.

    Returns:
        The reply's ``content``, or a fixed fallback string if the call
        fails (the error is logged, never raised).
    """
    logger.info(f"Routing auxiliary request to model: {AUXILIARY_LLM_MODEL_NAME}")
    try:
        # A dedicated client per call, bound to the auxiliary model; slightly
        # higher temperature than the RAG model, but still grounded.
        auxiliary_llm = ChatGroq(
            temperature=0.2,
            groq_api_key=GROQ_API_KEY,
            model_name=AUXILIARY_LLM_MODEL_NAME
        )
        reply = auxiliary_llm.invoke(messages)
        return reply.content
    except Exception as exc:
        logger.error(f"Error with auxiliary model call: {exc}", exc_info=True)
        return "Could not generate suggestions."
requirements.txt CHANGED
@@ -1 +1,30 @@
1
- huggingface_hub==0.25.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask==3.0.3
2
+ Flask_Cors==5.0.0
3
+ numpy
4
+ pandas==2.2.3
5
+ #rapidfuzz==3.10.1
6
+ Requests==2.32.3
7
+ #scikit_learn==1.4.1.post1
8
+ #scikit_learn==1.5.2
9
+ psycopg2-binary==2.9.10
10
+ python-dotenv==1.0.1
11
+ apscheduler==3.11.0
12
+ redis==3.5.3
13
+ faiss-cpu==1.10.0
14
+ groq==0.15.0
15
+ llama_index==0.12.13
16
+ llama_index.llms.groq==0.3.1
17
+ #langchain_groq==0.2.4
18
+ #langchain_core==0.3.39
19
+ sentence_transformers==3.4.0
20
+ gunicorn
21
+ llama-index-embeddings-huggingface==0.5.4
22
+ onnxruntime==1.22.0
23
+ langchain-groq==0.3.2
24
+ python-docx==1.1.2
25
+ langchain_community==0.3.23
26
+ requests==2.32.3
27
+ gdown==5.2.0
28
+ pymupdf==1.25.5
29
+ openpyxl==3.1.5
30
+ # NOTE: on Windows, the Microsoft Visual C++ Redistributable must be installed first: https://aka.ms/vs/17/release/vc_redist.x64.exe
sources/Endodontics%20appendix%201.pdf ADDED
Binary file (90.7 kB). View file
 
sources/Endodontics%20appendix%202.pdf ADDED
Binary file (66.3 kB). View file
 
sources/Endodontics%20appendix%203.pdf ADDED
Binary file (92 kB). View file
 
sources/Endodontics%20appendix%204.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8b972d927a7a01f466cfe30e24834e653009926891919df1078f3563b718d1b
3
+ size 188535
sources/Endodontics%20book.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:045c84eef3be88136f67bea03299a5e54ff121a017b8d6575d2fa7dd1269d460
3
+ size 19170850
sources/Endodontics%20cap%201.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2956cff53ab6755bd5769d96ea6e021d21992d66898fa7ad19d59a651db08552
3
+ size 1393891
sources/Endodontics%20cap%2010.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d60888b81a8ba43e14a85dee20a485120c6a168e75d8cb44f51f98ea1dc86c6
3
+ size 858903
sources/Endodontics%20cap%2011.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cdfa6604cc9bdad78c8ffb4ec61e6ae5f12c3b75f0b52d45bab57c43983e676
3
+ size 1132015
sources/Endodontics%20cap%2012.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b42e9dca8977115f11342fd3d264576c5b7224a51b82f3646d3a766669417f91
3
+ size 1539100
sources/Endodontics%20cap%2013.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed0b9c196e3afe7477709996a261580721c42c391ecab6a0183ef4261b5a86fd
3
+ size 1625649
sources/Endodontics%20cap%2014.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92e0c462d8eb85d94b08ba84ad2270c86e87a678dea4be0d700f910af89c64e7
3
+ size 1357984
sources/Endodontics%20cap%2015.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54d2a579eed28d2bb58c8aca669dc6baaad1824a156517212494b0557f4837d7
3
+ size 944072
sources/Endodontics%20cap%2016.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ab55472e1887827d5f55e51993ebb5ab9a031fafb6e81db258b82277c7ce4e5
3
+ size 681335
sources/Endodontics%20cap%2017.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f5c395668f607f37c3f67131bdb0f82bc792b3bdbabb03071abd623d066db27
3
+ size 528079
sources/Endodontics%20cap%2018.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8cb86f5fa0e481b2d91db3e93dcabe580a447630f7716d0f8309d5a018bbc64
3
+ size 746444
sources/Endodontics%20cap%2019.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5746dc123bc59f7373699d48e753ad05552acaa18c743dc5026990e603686b2
3
+ size 830885
sources/Endodontics%20cap%202.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b06e9d058a6f05afb04bfef398b8d0f36c02e4d78aba873dc294652c9fe2517c
3
+ size 612225
sources/Endodontics%20cap%2020.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:127d6f0efcac6fc7d12cbee72f9f0cbf05136be860e5837662037d267e8c621b
3
+ size 609023
sources/Endodontics%20cap%2021.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d3b2e5699b2a92f2bb5187e825aa5f11afc1d2ed54bce56e6ba56fab699f2a4
3
+ size 464443
sources/Endodontics%20cap%2022.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a9dee2fd5d783cdf4f53b8e3d79ab012369e1c55ade79c4e5ee7544c83d690f
3
+ size 561329
sources/Endodontics%20cap%203.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a79808a3da34f9817136e441254449dc5504f115cae1190505b8ce5f3c3f9b51
3
+ size 786616
sources/Endodontics%20cap%204.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3223d4b6c6ae9b1b66bd5f0ac750bf8e3a514e7941555632ad98e67202929993
3
+ size 810838
sources/Endodontics%20cap%205.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe63ba988c8c88207a7482c677983cd5f1a0b06cb5bc210b650a657b20924ebf
3
+ size 761083
sources/Endodontics%20cap%206.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c490f38f73cd72e8bde876bb082964d43d1080abd1593ab7c4e53ca97a4f492
3
+ size 1056526
sources/Endodontics%20cap%207.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23b1f2f03a024f0520a798a7e7bd2c128cc5eda3d929a6b56d9a2b9dde184d14
3
+ size 1619923
sources/Endodontics%20cap%208.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:159d99387578b0d9164e243ae27f7ee77ee998aa8c71dc15310e6ca274d96f7e
3
+ size 751157
sources/Endodontics%20cap%209.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db47bb570fa521c1e7462e07bd22f9ce38485c2b262b0c774d16ba7795844179
3
+ size 498083
sources/_%24preguntas%20chatbot_01.xlsx ADDED
Binary file (165 Bytes). View file
 
sources/database.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ Question,Answer,Image
sources/general_qa.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ Question,Answer,Image
sources/greetings.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ Question,Answer,Image
sources/personal_qa.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ Question,Answer,Image
sources/preguntas chatbot_01.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65e4dfe9f1e7dad838718a016a486ca8a9e3c99068fcc22ae6e2e2064fd86943
3
+ size 156246
system_prompts.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ This module centralizes all system prompts for the specialized dental chatbot application.
4
+ This allows for easy management and updating of prompts without altering core logic.
5
+ """
6
+
7
+ # --- RAG System Prompt for Bibliography-Based Answers ---
8
+ # This prompt instructs the LLM to answer based *only* on the context provided
9
+ # by the RAG system from scientific documents (PDFs, etc.).
10
+ # Placeholders {context} and {question} will be filled by the LangChain pipeline.
11
+ RAG_SYSTEM_PROMPT = """You are a specialized dental assistant AI. Your role is to provide accurate, evidence-based information on a specific dental topic.
12
+
13
+ **Your Task:**
14
+ Your primary task is to answer the user's question accurately and concisely, based *exclusively* on the "Provided Document Excerpts" below. These excerpts are from vetted scientific and dental publications.
15
+
16
+ **Provided Document Excerpts:**
17
+ {context}
18
+
19
+ **User Question:**
20
+ {question}
21
+
22
+ ---
23
+ **Core Instructions:**
24
+ 1. **Language:** Your default language is **Spanish**, but always follow the language of the user. If they ask a question in Spanish, reply in Spanish. If they ask in English, reply in English, even if the context is in Spanish.
25
+ 2. **Strictly Adhere to Context:** Your answer **must** be derived solely from the "Provided Document Excerpts." Do not use any external knowledge or make assumptions beyond what is presented in the text.
26
+ 3. **Professional Tone:** Maintain a clinical, objective, and professional tone suitable for a dental context.
27
+ 4. **Do Not Speculate:** If the provided excerpts do not contain the information needed to answer the question, you must not invent an answer.
28
+ 5. **Handling Unanswerable Questions:** If you cannot answer the question based on the provided excerpts, respond with: "The provided bibliography does not contain specific information on this topic." Do not attempt to guide the user elsewhere or apologize.
29
+ 6. **No Self-Reference:** Do not mention that you are an AI, that you are "looking at documents," or refer to the "provided excerpts" in your final answer. Simply present the information as requested.
30
+
31
+ **Answer Format:**
32
+ Provide a direct answer to the user's question based on the information available.
33
+
34
+ **Answer:**"""
35
+
36
+
37
+ # --- Fallback System Prompt for General/Triage Purposes ---
38
+ # REVISED: This prompt is now much stricter and will only handle dental-related queries.
39
+ FALLBACK_SYSTEM_PROMPT = """You are a specialized dental assistant AI. Your one and only role is to answer questions strictly related to dentistry.
40
+
41
+ **Core Instructions:**
42
+ 1. **Dental Focus Only:** You MUST NOT engage in any general conversation, small talk, or answer questions outside the scope of dentistry.
43
+ 2. **Handle Out-of-Scope Questions:** If the user's question is unrelated to dentistry, you must respond with the following exact phrase: "I am a dental assistant AI and my capabilities are limited to dental topics. Do you have a question about oral health?"
44
+ 3. **Stateful Conversation:** Pay attention to the `Prior Conversation History` to understand the context of the user's dental inquiries.
45
+ 4. **Professional Tone:** Always be polite, helpful, and professional.
46
+ 5. **Do Not Make Up Clinical Advice:** Do not provide medical diagnoses or treatment plans. You can provide general information but should always recommend consulting a professional for personal health concerns.
47
+
48
+ **Response Guidance:**
49
+ - Review the `Prior Conversation History` to understand the context.
50
+ - Formulate a helpful, professional answer to the `Current User Query` if it is about dentistry.
51
+ """
52
+
53
+ # ADDED: New prompt to format answers based on structured data from CSV/XLSX files.
54
+ QA_FORMATTER_PROMPT = """You are a helpful assistant. You will be given a user's question and structured data from a database row that is highly relevant to the question.
55
+ Your task is to formulate a natural, conversational answer to the user's question based *only* on the provided data.
56
+
57
+ - Synthesize the information from the data fields into a coherent response.
58
+ - Do not just list the data. Create a proper sentence or paragraph.
59
+ - If the data contains a 'Fuente' or 'Source' field, cite it at the end of your answer like this: (Source: [source_value]).
60
+
61
+ **Provided Data:**
62
+ {context}
63
+
64
+ **User Question:**
65
+ {question}
66
+
67
+ **Answer:**"""