Nadezhda Komarova committed on
Commit
4be6b01
·
1 Parent(s): af4d3bd

first commit

Browse files
Files changed (10) hide show
  1. .gitignore +213 -0
  2. README copy.md +227 -0
  3. app.css +485 -0
  4. generate.py +271 -0
  5. logo_mono.png +0 -0
  6. prepare.py +896 -0
  7. rag_execute.py +647 -0
  8. rag_on_prem.py +216 -0
  9. requirements.txt +19 -0
  10. retrieve.py +180 -0
.gitignore ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # Project-specific ignores
210
+ #/context/
211
+
212
+ # Ignore gradio
213
+ .gradio/
README copy.md ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: RAG-LangChain-Gradio
3
+ app_file: rag_execute.py
4
+ sdk: gradio
5
+ sdk_version: 5.46.0
6
+ ---
7
+ # Retrieval-Augmented Generation (RAG)
8
+
9
+ Dieses Projekt implementiert ein **Retrieval-Augmented Generation (RAG)**-System unter Verwendung von LangChain, Mistral/OpenAI LLMs und Pinecone für die Vektordatenbanksuche. Das System ermöglicht es, Dokumente abzufragen und kontextbewusste Antworten über eine Chat-ähnliche Schnittstelle zu generieren.
10
+
11
+ ---
12
+
13
+ ## Inhaltsverzeichnis
14
+
15
+ 1. [Übersicht](#übersicht)
16
+ 2. [Anforderungen](#anforderungen)
17
+ 3. [Einrichtung](#einrichtung)
18
+ 4. [Vorbereitung der API-Schlüssel](#vorbereitung-der-api-schlüssel)
19
+ 5. [Erstellung des Pinecone-Indexes](#erstellung-des-pinecone-indexes)
20
+ 6. [Ausführung des RAG-Systems](#ausführung-des-rag-systems)
21
+ 7. [Code-Struktur](#code-struktur)
22
+ 8. [Prozessbeschreibung](#prozessbeschreibung)
23
+ 9. [Evaluation](#evaluation)
24
+
25
+ ---
26
+
27
+ ## Übersicht
28
+
29
+ Dieses RAG-System arbeitet in drei Schritten:
30
+
31
+ 1. **Dokumente einbetten**: Rohtextdateien in Chunks aufteilen und mit dem `llama-text-embed-v2` Embedding-Modell in Pinecone einbetten.
32
+ 2. **Abruf (Retrieval)**: Semantische Suche in der Pinecone-Vektordatenbank durchführen, um die relevantesten Chunks für eine Anfrage abzurufen.
33
+ 3. **Generierung**: Übergabe der Anfrage und des abgerufenen Kontexts an ein LLM (Mistral oder OpenAI), um die Antwort zu generieren.
34
+
35
+ ---
36
+
37
+ ## Anforderungen
38
+
39
+ Python 3.10+ empfohlen.
40
+
41
+ Enthaltene Abhängigkeiten:
42
+
43
+ * `langchain-community`
44
+ * `langchain-core`
45
+ * `langchain-text-splitters`
46
+ * `langchain-mistralai`
47
+ * `langchain-openai`
48
+ * `ragas`
49
+ * `datasets`
50
+ * `pinecone-client`
51
+ * `gradio`
52
+ * `python-dotenv`
53
+ * `pypdf`
54
+ * `pandas`
55
+ * `nbformat`
56
+ * `nbconvert`
57
+ * `unstructured` (mit Extras für `docx`, `pptx`, `html`, `md`)
58
+
59
+ Alle Abhängigkeiten sollten installiert werden, z. B. mit dem folgenden Befehl.
60
+ ```
61
+ pip install langchain-mistralai langchain-community datasets ragas langchain-openai langchain-text-splitters langchain-core pinecone-client langgraph pypdf gradio python-dotenv nbformat nbconvert "unstructured[docx,pptx,html,md]"
62
+ ```
63
+ Da eine requirements.txt-Datei vorhanden ist, kann die Installation auch durch folgenden Befehl durchgeführt werden:
64
+ ```
65
+ pip install -r requirements.txt
66
+ ```
67
+ ---
68
+
69
+ ## Einrichtung
70
+
71
+ 1. Repository klonen.
72
+ 2. Erstellen Sie eine `.env`-Datei im Hauptverzeichnis mit den folgenden Schlüsseln:
73
+
74
+ ```dotenv
75
+ MISTRAL_API_KEY=<Ihr-mistral-api-schlüssel>
76
+ OPENAI_API_KEY=<Ihr-openai-api-schlüssel>
77
+ PINECONE_API=<Ihr-pinecone-api-schlüssel>
78
+ INDEX_NAME=<Ihr-pinecone-index-name>
79
+ DIRNAME=<Pfad-zu-Kontextdokumenten>
80
+ MODELNAME=<LLM-Modellname>
81
+ ```
82
+
83
+ * `MISTRAL_API_KEY` – Ihr API-Schlüssel für Mistral-Modelle.
84
+ * `OPENAI_API_KEY` – Ihr API-Schlüssel für OpenAI-Modelle. Es reicht aus, einen der OpenAI-/Mistral-Schlüssel (je nach ausgewähltem Modell) festzulegen.
85
+ * `PINECONE_API` – API-Schlüssel für Pinecone, um Vektoren zu speichern und abzufragen.
86
+ * `INDEX_NAME` – Name des Pinecone-Indexes, in dem die Dokumente abgelegt werden.
87
+ * `DIRNAME` – Unterordner innerhalb des festen `context/`-Verzeichnisses.
88
+
89
+ * Wenn leer (`DIRNAME=`), werden alle Dokumente in `context/` verarbeitet.
90
+ * Beispiel: `DIRNAME=llm_context` verarbeitet nur `context/llm_context/`.
91
+ * `MODELNAME` – LLM-Modell, z. B. `gpt-5-nano` für OpenAI oder `mistral-large-latest` für Mistral.
92
+ ---
93
+
94
+ ## Vorbereitung der API-Schlüssel
95
+
96
+ ### Mistral
97
+
98
+ 1. Registrieren bei [Mistral AI](https://www.mistral.ai).
99
+ 2. API-Schlüssel erstellen.
100
+ 3. In `.env` unter `MISTRAL_API_KEY` eintragen.
101
+
102
+ ### OpenAI
103
+
104
+ 1. Registrieren bei [OpenAI](https://platform.openai.com/).
105
+ 2. API-Schlüssel erstellen.
106
+ 3. In `.env` unter `OPENAI_API_KEY` eintragen.
107
+
108
+ ### Pinecone
109
+
110
+ 1. Registrieren bei [Pinecone](https://www.pinecone.io/).
111
+ 2. API-Schlüssel erstellen.
112
+ 3. In `.env` unter `PINECONE_API` eintragen.
113
+ 4. Einen **Index** erstellen (z. B. `use-cases-index`) mit dem Embedding-Modell `llama-text-embed-v2`.
114
+
115
+ ---
116
+
117
+ ## Erstellung des Pinecone-Indexes
118
+
119
+ Das RAG-Vorbereitungsskript (`rag_func.py`) führt automatisch folgende Schritte aus:
120
+
121
+ 1. Verbindung zu Pinecone über den API-Schlüssel herstellen.
122
+ 2. Dokumente aus dem Verzeichnis `DIRNAME` laden.
123
+ 3. Dokumente in Chunks aufteilen mit `RecursiveCharacterTextSplitter` (Standard: 1800 Tokens pro Chunk, 200 Tokens Überlappung).
124
+ 4. Chunks mit `llama-text-embed-v2` einbetten.
125
+ 5. Einbettungen in Batches in den Pinecone-Index hochladen.
126
+
127
+ > **Tipp:** Wenn alle Dokumente bereits hochgeladen wurden, kann `DIRNAME` leer bleiben, und das System überspringt die Dokumentenverarbeitung.
128
+
129
+ ---
130
+
131
+ ## Ausführung des RAG-Systems
132
+
133
+ Hauptskript ausführen:
134
+
135
+ ```bash
136
+ python rag_execute.py
137
+ ```
138
+
139
+ Dies bewirkt:
140
+
141
+ 1. Laden der Dokumente (falls vorhanden) und Vorbereitung des RAG-Systems.
142
+ 2. Starten einer **Chat-Schnittstelle** mit Gradio, in der Sie Fragen stellen können.
143
+ 3. Abrufen relevanter Chunks aus Pinecone.
144
+ 4. Generieren von Antworten mit dem ausgewählten LLM.
145
+
146
+ ### Chat-Schnittstelle
147
+
148
+ * Geben Sie eine Anfrage in das Textfeld ein.
149
+ * Das System ruft den Kontext ab und erstellt eine Antwort.
150
+ * Antworten werden im Chat-Format angezeigt.
151
+
152
+ ---
153
+
154
+ ## Code-Struktur
155
+
156
+ ```
157
+ ├─ rag_execute.py # Hauptskript für RAG mit Gradio-Schnittstelle
158
+ ├─ rag_func.py # Funktionen für RAG-Vorbereitung, Abruf und Generierung
159
+ ├─ .env # Umgebungsvariablen (API-Schlüssel, Index, Modell, Verzeichnis)
160
+ ├─ context/ # Ordner mit Rohdokumenten (kann leer sein, falls bereits hochgeladen)
161
+ ```
162
+
163
+ ---
164
+
165
+ ## Prozessbeschreibung
166
+
167
+ 1. **LLM auswählen**: Mistral oder OpenAI.
168
+ 2. **Dokumente vorbereiten**:
169
+
170
+ * Text- oder JSON-Dokumente in den Ordner `DIRNAME` legen.
171
+ * Wenn alle Dokumente bereits hochgeladen sind, kann der Ordner leer bleiben.
172
+ 3. **Pinecone-Index erstellen**:
173
+
174
+ * Index in `.env` benennen (`INDEX_NAME`).
175
+ * Embedding-Modell `llama-text-embed-v2` verwenden.
176
+ 4. **Dokumente aufteilen**: Mit `RecursiveCharacterTextSplitter`.
177
+ 5. **Chunks einbetten**: Zur semantischen Suche an Pinecone senden.
178
+ 6. **Relevante Chunks abrufen**: Bei einer Anfrage.
179
+ 7. **Antwort generieren**: Anfrage + Kontext an LLM übergeben.
180
+ 8. **Ergebnis zurückgeben**: Im Chat-Interface anzeigen.
181
+
182
+ ---
183
+
184
+ ## Evaluation
185
+
186
+ Das System unterstützt Evaluation mit **RAGAS**:
187
+
188
+ 1. `generate_dataset()` ruft Kontext ab und generiert Antworten.
189
+ 2. `evaluate_RAG()` berechnet **Faithfulness** und andere Metriken.
190
+ 3. Ergebnisse werden zur Analyse ausgegeben.
191
+
192
+ ---
193
+
194
+ ## Hinweise
195
+
196
+ * Retry-Logik für Pinecone-Operationen ist implementiert, um Netzwerkfehler abzufangen.
197
+ * Chunk-Größe und Überlappung können in `prepare_RAG()` für größere oder kleinere Kontextgranularität angepasst werden.
198
+
199
+ ---
200
+
201
+ ## Beispiel
202
+
203
+ ```python
204
+ from rag_func import prepare_RAG, retrieve_RAG, generate_RAG
205
+ import os
206
+
207
+ index, pc, llm = prepare_RAG(
208
+ pinecone_API=os.getenv("PINECONE_API"),
209
+ index_name=os.getenv("INDEX_NAME"),
210
+ llm_model=os.getenv("MODELNAME"),
211
+ dir_name=os.getenv("DIRNAME")
212
+ )
213
+
214
+ query = "Liste typische Anwendungsfälle von GenAI im Telekommunikationsbereich auf."
215
+ retrieved_chunks = retrieve_RAG(query, pc, index)
216
+ response = generate_RAG(query, llm, retrieved_chunks)
217
+ print(response.content)
218
+ ```
219
+
220
+ ---
221
+
222
+ ## Referenzen
223
+
224
+ * [LangChain RAG Tutorial](https://python.langchain.com/docs/tutorials/rag/)
225
+ * [Pinecone Dokumentation](https://docs.pinecone.io)
226
+ * [RAGAS Evaluation](https://docs.ragas.io/en/stable/getstarted/evals/)
227
+
app.css ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --brand-blue: #17428f;
3
+ --brand-orange: #f39719;
4
+ --text-dark: #111827; /* very dark grey (near black) */
5
+ --text-gray: #4B5563; /* medium grey for messages */
6
+ color-scheme: only light;
7
+ }
8
+
9
+ /* Base */
10
+ body, .gradio-container {
11
+ /* Default Gradio font will be used */
12
+ background: linear-gradient(135deg, var(--brand-blue) 0%, var(--brand-orange) 100%);
13
+ min-height: 100vh;
14
+ color: var(--text-dark);
15
+ }
16
+
17
+ /* Logo size */
18
+
19
+ /* Logo size */
20
+ /*#company-logo img {
21
+ width: 40px !important;
22
+ min-width: 40px !important;
23
+ height: auto !important;
24
+ object-fit: contain !important;
25
+ display: block !important;
26
+ }*/
27
+
28
+
29
+ /* Top bar transparent */
30
+ #topbar { background: transparent !important; }
31
+
32
+ /* Header text over gradient */
33
+ #header h1, #header h2, #header h3, #header h4, #header h5, #header h6,
34
+ #header p {
35
+ color: #ffffff;
36
+ text-align: center;
37
+ }
38
+
39
+ #header2 h1, #header2 h2, #header2 h3, #header2 h4, #header2 h5, #header2 h6,
40
+ #header2 p {
41
+ color: #ffffff;
42
+ text-align: center;
43
+ }
44
+
45
+ /* Chatbox container */
46
+ #chatbot {
47
+ height: 100%;
48
+ border-radius: 14px;
49
+ border: 2px solid var(--brand-blue);
50
+ background-color: #ffffff;
51
+ padding: 8px;
52
+ overflow-y: auto;
53
+
54
+ /* Icon tint tokens for local use (chat area) */
55
+ --icon-light: #9CA3AF;
56
+ --icon-hover: #6B7280;
57
+ }
58
+
59
+ /* ----------------------------- */
60
+ /* TEXT COLORING (SAFE FOR CODE) */
61
+ /* ----------------------------- */
62
+
63
+ /* Apply the gray text color only at the message container level.
64
+ Do NOT set color on descendants or code tokens. */
65
+ #chatbot .message {
66
+ color: var(--text-gray); /* no !important */
67
+ }
68
+
69
+ /* Bubble styling */
70
+ #chatbot .message.user {
71
+ background: #fff4e1;
72
+ border-radius: 10px;
73
+ padding: 6px 12px;
74
+ text-align: right;
75
+ }
76
+ #chatbot .message.bot {
77
+ background: #f0f0f0;
78
+ border-radius: 10px;
79
+ padding: 6px 12px;
80
+ text-align: left;
81
+ }
82
+
83
+ /* Markdown horizontal rules inside chatbot */
84
+ #chatbot hr {
85
+ margin: 6px 0; /* reduce extra space */
86
+ border: none; /* remove default bevel */
87
+ border-top: 1px solid #d1d5db; /* subtle gray line */
88
+ }
89
+
90
+ /* Fallback selectors for other Gradio versions */
91
+ #chatbot [data-testid*="message"] {
92
+ border-radius: 10px;
93
+ padding: 6px 12px;
94
+ }
95
+ #chatbot [data-testid="user-message"] {
96
+ background: #fff4e1;
97
+ text-align: right;
98
+ }
99
+ #chatbot [data-testid="assistant-message"] {
100
+ background: #f0f0f0;
101
+ text-align: left;
102
+ }
103
+
104
+ /* ----------------------------- */
105
+ /* CODE BLOCKS (DO NOT SET COLOR)*/
106
+ /* ----------------------------- */
107
+
108
+ /* Give code blocks a readable container without touching colors.
109
+ This preserves syntax highlighting from highlight.js or Prism. */
110
+ #chatbot pre,
111
+ #chatbot pre code,
112
+ #chatbot code[class*="language-"],
113
+ #chatbot pre[class*="language-"],
114
+ #chatbot code.hljs {
115
+ background: #f8fafc; /* light neutral background */
116
+ border-radius: 8px;
117
+ padding: 10px 12px;
118
+ display: block;
119
+ overflow-x: auto;
120
+ /* IMPORTANT: no 'color' declaration here */
121
+ }
122
+
123
+ /* Inline code (single backticks) */
124
+ #chatbot :not(pre) > code {
125
+ background: #f1f5f9;
126
+ padding: 0.15rem 0.35rem;
127
+ border-radius: 6px;
128
+ /* no 'color' here */
129
+ }
130
+
131
+ /* ---------------------------------- */
132
+ /* Inputs */
133
+ /* ---------------------------------- */
134
+ input[type="text"], textarea, .gr-text-input input, .gr-textbox textarea {
135
+ border-radius: 10px;
136
+ padding: 10px;
137
+ font-size: 16px;
138
+ border: 2px solid var(--brand-orange);
139
+ }
140
+ input:focus, textarea:focus, .gr-text-input input:focus, .gr-textbox textarea:focus {
141
+ border-color: var(--brand-blue);
142
+ outline: none;
143
+ box-shadow: 0 0 6px rgba(23, 66, 143, 0.5);
144
+ }
145
+
146
+ /* ---------------------------------- */
147
+ /* Buttons (global gradient) */
148
+ /* ---------------------------------- */
149
+ .gr-button, button {
150
+ border-radius: 10px;
151
+ font-weight: 600;
152
+ background: linear-gradient(90deg, var(--brand-blue), var(--brand-orange));
153
+ color: white;
154
+ border: none;
155
+ }
156
+ .gr-button:hover, button:hover {
157
+ transform: translateY(-2px);
158
+ box-shadow: 0 4px 8px rgba(0,0,0,0.2);
159
+ }
160
+
161
+ /* ---------------------------------- */
162
+ /* Chat area: icon-only buttons */
163
+ /* ---------------------------------- */
164
+
165
+ /* Tint SVG icons */
166
+ #chatbot button svg,
167
+ #chatbot [role="button"] svg,
168
+ #chatbot .icon svg,
169
+ #chatbot [class*="icon"] svg,
170
+ #chatbot [data-testid*="icon"] svg,
171
+ #chatbot [data-testid*="message"] .tools svg,
172
+ #chatbot .message-tools svg {
173
+ color: var(--icon-light) !important;
174
+ fill: var(--icon-light) !important;
175
+ stroke: var(--icon-light) !important;
176
+ opacity: 0.95;
177
+ }
178
+
179
+ /* Remove gradient background only on small icon-only buttons */
180
+ #chatbot :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient) {
181
+ background: transparent !important;
182
+ background-image: none !important;
183
+ border: none !important;
184
+ box-shadow: none !important;
185
+ padding: 6px !important;
186
+ border-radius: 8px !important;
187
+ color: var(--icon-light) !important;
188
+ }
189
+
190
+ /* Hover/focus/active states */
191
+ #chatbot :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):hover {
192
+ background-color: rgba(0,0,0,0.05) !important;
193
+ }
194
+ #chatbot :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):focus-visible {
195
+ outline: none !important;
196
+ box-shadow: 0 0 0 2px rgba(23, 66, 143, 0.35) !important;
197
+ background-color: rgba(0,0,0,0.06) !important;
198
+ }
199
+ #chatbot :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):active {
200
+ background-color: rgba(0,0,0,0.08) !important;
201
+ }
202
+
203
+ /* Optional 'danger' icons */
204
+ #chatbot .danger svg {
205
+ color: var(--icon-light) !important;
206
+ fill: var(--icon-light) !important;
207
+ stroke: var(--icon-light) !important;
208
+ }
209
+ #chatbot .danger:hover svg {
210
+ color: #ef4444 !important;
211
+ fill: #ef4444 !important;
212
+ stroke: #ef4444 !important;
213
+ }
214
+
215
+ /* ---------------------------------- */
216
+ /* TOP BAR (logo block): icon-only */
217
+ /* ---------------------------------- */
218
+ #topbar { background: transparent !important; }
219
+ #topbar { --icon-light: #9CA3AF; --icon-hover: #6B7280; }
220
+
221
+ #topbar .gr-button.keep-gradient,
222
+ #topbar .gr-button:not(:has(svg)) {
223
+ background: linear-gradient(90deg, var(--brand-blue), var(--brand-orange)) !important;
224
+ color: #fff !important;
225
+ }
226
+
227
+ /* Icon-only buttons in topbar: transparent */
228
+ #topbar :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient) {
229
+ background: transparent !important;
230
+ border: none !important;
231
+ box-shadow: none !important;
232
+ padding: 6px !important;
233
+ border-radius: 8px !important;
234
+ color: var(--icon-light) !important;
235
+ }
236
+
237
+ /* Tint SVGs in topbar */
238
+ #topbar :is(button,[role="button"]):has(> svg) > svg {
239
+ color: var(--icon-light) !important;
240
+ fill: var(--icon-light) !important;
241
+ stroke: var(--icon-light) !important;
242
+ opacity: 0.95;
243
+ }
244
+
245
+ /* Hover/focus/active for topbar icons */
246
+ #topbar :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):hover {
247
+ background-color: rgba(0,0,0,0.05) !important;
248
+ }
249
+ #topbar :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):focus-visible {
250
+ outline: none !important;
251
+ box-shadow: 0 0 0 2px rgba(23, 66, 143, 0.35) !important;
252
+ background-color: rgba(0,0,0,0.06) !important;
253
+ }
254
+ #topbar :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):active {
255
+ background-color: rgba(0,0,0,0.08) !important;
256
+ }
257
+
258
+
259
+
260
+
261
+
262
+ /* Ensure the textbox wrapper is relative */
263
+ #message-box {
264
+ width: 100%;
265
+ border-radius: 9999px;
266
+ border: 2px solid var(--brand-orange);
267
+ font-size: 16px;
268
+ outline: none;
269
+ position: relative; /* needed for absolute button */
270
+ }
271
+
272
+ #message-box:focus {
273
+ border-color: var(--brand-blue);
274
+ box-shadow: 0 0 6px rgba(23, 66, 143, 0.3);
275
+ }
276
+
277
+ /* Send button positioned inside textbox */
278
+ #send-button {
279
+ position: absolute;
280
+
281
+ right: 34px; /* move left by increasing this value */
282
+ top: 48px; /* move down by increasing this value */
283
+
284
+ transform: translateY(-16%);
285
+ width: 36px;
286
+ height: 36px;
287
+ min-width: 0 !important;
288
+ padding: 0 !important;
289
+ border-radius: 50%;
290
+ background: linear-gradient(90deg, var(--brand-blue), var(--brand-orange));
291
+ display: flex;
292
+ align-items: center;
293
+ justify-content: center;
294
+ border: none;
295
+ color: white;
296
+ font-size: 18px;
297
+ cursor: pointer;
298
+ z-index: 2;
299
+ }
300
+
301
+ #send-button:hover {
302
+ transform: translateY(-16%) scale(1.05);
303
+ box-shadow: 0 4px 12px rgba(0,0,0,0.2);
304
+ }
305
+
306
+ #send-button:active {
307
+ transform: translateY(-16%) scale(0.95);
308
+ }
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+
317
+ /* Compact upload area - lighter and transparent */
318
+ #file-upload .upload-box,
319
+ #file-upload .file-wrap,
320
+ #file-upload .wrap {
321
+ min-height: 80px;
322
+ padding: 8px 12px;
323
+ color: #666 !important; /* Softer text color */
324
+ background: transparent !important; /* No white block, shows gradient */
325
+ border: none !important; /* Remove any border */
326
+ }
327
+
328
+ /* Inner placeholder (remove gray square) */
329
+ #file-upload .upload-box div,
330
+ #file-upload .upload-box span {
331
+ background: transparent !important; /* Remove gray background */
332
+ }
333
+
334
+ /* Text and icons slightly lighter */
335
+ #file-upload .upload-box,
336
+ #file-upload .file-wrap,
337
+ #file-upload .wrap {
338
+ color: #666 !important;
339
+ fill: #666 !important;
340
+ stroke: #666 !important;
341
+ }
342
+
343
+ /* Buttons remain clean */
344
+ #file-upload button,
345
+ #file-upload [role="button"] {
346
+ background: transparent !important;
347
+ border: none !important;
348
+ box-shadow: none !important;
349
+ border-radius: 0 !important;
350
+ padding: 4px !important;
351
+ color: #666 !important;
352
+ }
353
+
354
+ /* SVG icons */
355
+ #file-upload button svg,
356
+ #file-upload [role="button"] svg {
357
+ color: #666 !important;
358
+ fill: #666 !important;
359
+ stroke: #666 !important;
360
+ background: none !important;
361
+ }
362
+
363
+ /* Hover effect */
364
+ #file-upload button:hover svg,
365
+ #file-upload [role="button"]:hover svg {
366
+ color: #2a5db0 !important;
367
+ fill: #2a5db0 !important;
368
+ stroke: #2a5db0 !important;
369
+ }
370
+
371
+
372
+
373
+
374
+ /* Hide Gradio's default control buttons in the header/topbar */
375
+ #topbar .gr-button,
376
+ #topbar [role="button"],
377
+ #header-container .gr-button,
378
+ #header-container [role="button"] {
379
+ display: none !important;
380
+ }
381
+
382
+
383
+
384
+ #upload-note, #upload-note * {
385
+ color: #ffffff !important;
386
+ }
387
+
388
+
389
+
390
+
391
+ .gradio-container [id="left-column"] {
392
+ min-width: 40px !important; /* Increased from 10px */
393
+ max-width: 320px !important; /* You can go up to 400px if you want even wider */
394
+ width: 180px !important; /* Increased from 100px */
395
+ flex: 0 0 220px !important; /* Increased from 180px */
396
+ padding-right: 12px !important; /* Slightly more padding for visual separation */
397
+ }
398
+
399
+ .gradio-container [id="right-column"] {
400
+ flex: 1 1 0 !important;
401
+ width: auto !important;
402
+ min-width: 0 !important;
403
+ padding-left: 0 !important;
404
+ }
405
+
406
+
407
+
408
+
409
+
410
+ /* Branding layout */
411
+ #branding {
412
+ display: flex;
413
+ align-items: center;
414
+ gap: 8px; /* space between text and logo */
415
+ justify-content: center;
416
+ }
417
+
418
+ /* Text style */
419
+ #brand-text {
420
+ font-size: 1.6rem;
421
+ font-weight: 700;
422
+ color: white;
423
+ line-height: 1;
424
+ }
425
+
426
+ /* Logo sizing */
427
+ #company-logo {
428
+ width: 40px; /* set exact width */
429
+ height: auto; /* preserve proportions */
430
+ display: block;
431
+ object-fit: contain;
432
+ }
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
+ /* ---- Viewport-safe base (handles mobile address bar) ---- */
441
+ html, body, .gradio-container {
442
+ height: 100dvh; /* dynamic viewport height */
443
+ min-height: 100dvh;
444
+ }
445
+
446
+ /* ---- Make sure all flex ancestors allow their children to shrink ---- */
447
+ .gradio-container .gr-row,
448
+ .gradio-container .gr-row > .gr-column,
449
+ .gradio-container > .gr-row,
450
+ .gradio-container .gr-column,
451
+ #right-column,
452
+ #chat-area {
453
+ min-height: 0 !important; /* critical so inner scroll can happen */
454
+ }
455
+
456
+ /* ---- Chatbot outer box: cap height and scroll inside ---- */
457
+ #chatbot {
458
+ box-sizing: border-box;
459
+ flex: 1 1 auto;
460
+ min-height: 0 !important;
461
+ height: auto !important;
462
+ max-height: calc(100dvh - var(--topbar-offset, 0px) - 120px) !important;
463
+ overflow-y: auto !important;
464
+ }
465
+
466
+ /* ---- Gradio’s inner wrappers sometimes need explicit constraints ---- */
467
+ /* Gradio v4: Chatbot renders inside .gr-chatbot -> .wrap / .message-wrap depending on version */
468
+ #chatbot .wrap,
469
+ #chatbot .message-wrap,
470
+ #chatbot [data-testid="chatbot"] {
471
+ max-height: 100%;
472
+ overflow-y: auto;
473
+ min-height: 0;
474
+ }
475
+
476
+ /* Optional: if you see the input row pushing the chat up/down on small screens,
477
+ let the input take only its content height. */
478
+ #input-row {
479
+ flex: 0 0 auto;
480
+ }
481
+
482
+ /* If your header/progress box is above the chat, ensure it doesn't consume flex growth */
483
+ #chat-area > *:not(#chatbot) {
484
+ flex: 0 0 auto;
485
+ }
generate.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+ import time
4
+ import re
5
+ from pinecone import Pinecone
6
+
7
+ from langchain_mistralai import ChatMistralAI
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain_core.messages import HumanMessage, SystemMessage
10
+ from langchain.schema import Document
11
+ from langchain_community.document_loaders import (
12
+ CSVLoader, PyPDFLoader, UnstructuredWordDocumentLoader,
13
+ UnstructuredPowerPointLoader, UnstructuredMarkdownLoader,
14
+ UnstructuredHTMLLoader, NotebookLoader
15
+ )
16
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
17
+
18
+ from llama_index.core.memory import Memory
19
+
20
+ import pickle
21
+
22
+ import json
23
+ from typing import List, Any
24
+ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage
25
+
26
+ from typing import List, Any
27
+ from pydantic import BaseModel, ValidationError
28
+
29
+
30
# Module-level conversation memory (constructed with a 2048-token limit).
# generate_RAG reads the most recent messages from it and appends each new turn to it,
# so every call in this process shares the same history.
memory = Memory(token_limit=2048)
31
+
32
+
33
def generate_RAG(
    prompt_message,
    llm,
    retrieved_chunks,
    graph_context="",
    graphRAG=False,
    info=True
):
    """
    Answer one chat turn with a two-stage (resolve, then answer) RAG flow.

    Stage 1 (resolver): a non-streaming, callback-free copy of ``llm`` decides whether
    this turn must be answered from chat history only, and rewrites the request into a
    ``resolved_task``.
    Stage 2 (answer): ``llm`` is invoked with the assembled prompt; retrieved context is
    included only when the resolver allows it. If retrieval was allowed and the answer
    contains no ``Source:`` / ``Sources:`` pattern, the answer is regenerated once.

    Message order sent to the answering model:
        System rules -> (optional) System message with Retrieved Context
        -> (optional) Chat History header + history -> Human (last)

    Side effects: reads the last 8 messages from the module-level ``memory`` and appends
    the user message, the retrieved context (when non-empty and allowed) and the answer.

    Args:
        prompt_message: The user's request for this turn.
        llm: Chat model exposing ``invoke(messages)``; its class is re-instantiated for
            the resolver step.
        retrieved_chunks: Iterable of dicts read via ``chunk.get("source")`` and
            ``chunk.get("text")``; ``None`` is treated as empty.
        graph_context: Extra context text, appended only when ``graphRAG`` is truthy.
        graphRAG: Whether ``graph_context`` should be added to the retrieved context.
        info: When true, print diagnostic output (inputs, resolver plan, final prompt,
            final response and current memory).

    Returns:
        The object returned by ``llm.invoke`` (the regenerated one if a retry happened).
    """

    if info:
        print("Generate RAG with", prompt_message, llm)

    # ---------- Helpers ----------
    def _last_ai_text(msgs: List[BaseMessage]) -> str:
        """Return the content of the most recent AIMessage, or "" if there is none."""
        for m in reversed(msgs):
            if isinstance(m, AIMessage):
                return m.content
        return ""

    def _safe_json_loads(raw: str) -> dict:
        """Parse a JSON object, tolerating extra text around the outermost braces."""
        try:
            parsed = json.loads(raw)
        except Exception:
            start, end = raw.find("{"), raw.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(raw[start:end + 1])
        # Valid JSON that is not an object (e.g. a list or a bare string) cannot carry
        # the resolver keys; reject it so the caller falls back to its defaults.
        if not isinstance(parsed, dict):
            raise ValueError("Resolver output is not a JSON object.")
        return parsed

    def _as_bool(value: Any) -> bool:
        """Interpret a resolver flag; bool("false") would otherwise be True."""
        if isinstance(value, str):
            return value.strip().lower() in ("true", "1", "yes")
        return bool(value)

    def _answer_text(resp: Any) -> str:
        """Return the response content as text (non-string content is stringified)."""
        content = getattr(resp, "content", str(resp))
        return content if isinstance(content, str) else str(content)

    def _make_non_streaming_resolver(llm_):
        """
        Create a non-streaming, callback-free copy of the same LLM class for the resolver step.
        Works for ChatOpenAI-style classes that accept 'model' or 'model_name'.
        """
        model_name = getattr(llm_, "model_name", getattr(llm_, "model", None))
        kwargs = {}
        if hasattr(llm_, "temperature"):
            kwargs["temperature"] = getattr(llm_, "temperature")
        try:
            return llm_.__class__(model=model_name, streaming=False, callbacks=[], **kwargs)
        except TypeError:
            return llm_.__class__(model_name=model_name, streaming=False, callbacks=[], **kwargs)

    def _resolver(user_text: str, history_msgs: List[BaseMessage]) -> dict:
        """Ask the resolver model for {"use_history_only", "resolved_task"}."""
        resolver_llm = _make_non_streaming_resolver(llm)

        RESOLVER_SYS = (
            "You are a controller that decides if the next answer should rely ONLY on Chat History "
            "(ignore Retrieved Context completely) or may use Retrieved Context.\n"
            "Return STRICT JSON with keys:\n"
            '{ "use_history_only": true|false, "resolved_task": "<resolved user request>" }\n\n'
            "Rules:\n"
            "- Always set set use_history_only=false (especially if the query has meaningful concepts for retrieval, e.g., specific entities, topics, product names, technical terms, factual questions).\n"
            "- Except in rare cases, do NOT set use_history_only=true. Only do so if the query contains undefined pronouns (e.g., this, that, it, they, those, these, above, continue, previous, earlier, same...).\n"
            "Examples:\n"
            'User: "Where in the onboarding guide do we define the trial limits?"\n'
            '-> { "use_history_only": false, "resolved_task": "Find where the onboarding guide defines the trial limits and report the exact limits." }\n'
        )

        resolver_msgs: List[BaseMessage] = [SystemMessage(RESOLVER_SYS)]
        last_ai = _last_ai_text(history_msgs)
        if last_ai:
            resolver_msgs.append(AIMessage(content=f"[Last assistant answer]\n{last_ai}"))
        resolver_msgs.extend(history_msgs)
        resolver_msgs.append(HumanMessage(content=f"User message: {user_text}"))

        raw = resolver_llm.invoke(resolver_msgs).content
        try:
            data = _safe_json_loads(raw)
        except Exception:
            # Unparseable resolver output: default to allowing retrieval.
            data = {"use_history_only": False, "resolved_task": user_text}

        data.setdefault("use_history_only", False)
        data.setdefault("resolved_task", user_text)
        return data

    # ---------- Prepare history ----------
    history_messages: List[BaseMessage] = []
    if memory:
        # Convert the last LlamaIndex memory messages to LangChain message types.
        for msg in memory.get_all()[-8:]:
            text = msg.content or ""  # a None content would fail message construction
            if msg.role == "user":
                history_messages.append(HumanMessage(content=text))
            elif msg.role in ("ai", "assistant"):
                history_messages.append(AIMessage(content=text))
            # Other roles are ignored.

    # ---------- Stage 1: Resolve (non-streaming) ----------
    plan = _resolver(prompt_message, history_messages)

    use_history_only = _as_bool(plan.get("use_history_only", False))
    resolved_task = plan.get("resolved_task", prompt_message)

    if info:
        print("[Resolver]", plan)

    # ---------- Build retrieval context block ----------
    context_lines = []
    if not use_history_only:
        for i, chunk in enumerate(retrieved_chunks or []):
            source_filename = os.path.basename((chunk.get("source") or "unknown"))
            text = chunk.get("text") or ""
            context_lines.append(f"Source {i+1} ({source_filename}):\n{text}")

        if graphRAG and graph_context:
            context_lines.append("[Graph context]\n" + graph_context)

    context_for_llm = "\n\n".join(context_lines)
    has_context = (not use_history_only) and bool(context_for_llm.strip())

    # ---------- System prompt (first) ----------
    base_rules = (
        "You are an expert assistant. Answer in English. Use:\n"
        "- Chat History\n"
        "- Retrieved Context (reference-only facts; not user intent).\n\n"
        "Decision rubric before answering:\n"
        "- Important: you MUST ALWAYS cite a source, i.e., always use exactly the filename from the 'source' metadata (e.g., 'Source: sample.pdf.' in the same paragraph as the claim).\n"
        "- If the answer is not supported by Retrieved Context and not implied by history, say you cannot answer.\n\n"
        "Important: output should be very well-structured Markdown (always different headings, hierarchical structure, bullets, tables and code blocks when needed), with a few emojis for scannability."
    )
    turn_rule = (
        "\n\nTURN-SPECIFIC RULE: For THIS turn, you MUST NOT use any Retrieved Context. "
        "Base your answer ONLY on Chat History and the user's current request."
        if use_history_only else ""
    )

    prompt_parts: List[BaseMessage] = [SystemMessage(content=base_rules + turn_rule)]

    # ---------- Retrieved context as a system message (only if allowed) ----------
    if has_context:
        prompt_parts.append(
            SystemMessage(
                content="📚 Retrieved Context (reference-only; not user intent, Use info only from here and nothing else, if info not present, say you do not know. You are only allowed to base your answer on this info and not use your own):\n\n" + context_for_llm
            )
        )

    # ---------- History next (more recent than retrieval context) ----------
    if history_messages:
        prompt_parts.append(SystemMessage(content="🕘 Chat History (most recent last):"))
        prompt_parts.extend(history_messages)

    # ---------- Current user last (include BOTH original and resolved) ----------
    final_human = (
        "User request (original):\n"
        f"{prompt_message}\n\n"
        "Resolved task (use this when pronouns/references appear):\n"
        f"{resolved_task}"
    )
    prompt_parts.append(HumanMessage(content=final_human))

    # ---------- Stage 2: Answer (streaming via passed llm) ----------
    if info:
        print(f"[Info] The final prompt is the following: {prompt_parts}")
    response = llm.invoke(prompt_parts)
    if info:
        print(f"[Info] The final response is the following: {response}")

    # ---------- Citation check: expect a "Source:" pattern when retrieval was allowed ----------
    citation_found = re.search(
        r"\bSources?:\s*.+", _answer_text(response), flags=re.IGNORECASE
    )
    if not use_history_only and not citation_found:
        print("[Validation] Source structure check failed: "
              "Missing any 'Source:' structure in the answer.")

        # Retry answer generation once with stronger emphasis on sources.
        retry_prompt_parts = prompt_parts.copy()
        retry_prompt_parts.append(SystemMessage(
            content="⚠️ IMPORTANT: Your previous answer did not include any 'Source:' citation. "
            "Regenerate your answer and make sure to include at least one 'Source: ...' or 'Sources: ...' line "
            "that cites the relevant documents or context."
        ))
        response = llm.invoke(retry_prompt_parts)
        print("[Retry] Regenerated answer with source emphasis.")

    # ---------- Persist to memory ----------
    from llama_index.core.llms import ChatMessage

    if memory:
        # Add user message
        memory.put(ChatMessage(role="user", content=prompt_message))

        if has_context:
            # Keep the context this answer was based on; skipped when empty so the
            # token-limited memory is not filled with placeholder text.
            memory.put(ChatMessage(role="assistant", content=f"The context was: [start context] {context_for_llm} [end context]"))

        # Add final AI response
        memory.put(ChatMessage(role="assistant", content=getattr(response, "content", str(response))))

        if info:
            print("[Info] The following is the current memory:", memory.get_all())

    return response
logo_mono.png ADDED
prepare.py ADDED
@@ -0,0 +1,896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """prepare.py
2
+
3
+ Utilities to prepare documents and knowledge-graph artifacts for a RAG (Retrieval-Augmented
4
+ Generation) pipeline.
5
+
6
+ This module implements:
7
+ - safe file loading for text-like files (UTF-8 tolerant)
8
+ - dataset creation from a `context` directory using various loaders
9
+ - chunking, embedding and upserting to Pinecone via `prepare_RAG`
10
+ - building/updating a Knowledge Graph and generating hierarchical community summaries
11
+
12
+ Main public functions:
13
+ - create_dataset(directory_path: str) -> List[Document]
14
+ - prepare_RAG(pinecone_API, index_name, ...) -> (index, pc, llm, documents)
15
+ - build_knowledge_graph(documents, llm, pc, index, info=True) -> KnowledgeGraphIndex
16
+
17
+ Note: many helper functions are nested; this docstring highlights the high-level
18
+ purpose and responsibilities only.
19
+ """
20
+
21
+ import os
22
+ import pathlib
23
+ import time
24
+ import re
25
+ from pinecone import Pinecone
26
+
27
+ from langchain_mistralai import ChatMistralAI
28
+ from langchain_openai import ChatOpenAI
29
+ from langchain_core.messages import HumanMessage, SystemMessage
30
+ from langchain.schema import Document
31
+ from langchain_community.document_loaders import (
32
+ CSVLoader, PyPDFLoader, UnstructuredWordDocumentLoader,
33
+ UnstructuredPowerPointLoader, UnstructuredMarkdownLoader,
34
+ UnstructuredHTMLLoader, NotebookLoader
35
+ )
36
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
37
+
38
+ from llama_index.core.memory import Memory
39
+
40
+ import pickle
41
+
42
+ import json
43
+ from typing import List, Any
44
+ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage
45
+
46
+ from typing import List, Any
47
+ from pydantic import BaseModel, ValidationError
48
+
49
+
50
# Module-level LlamaIndex memory constructed with a 2048-token limit.
# NOTE(review): no code in the visible part of this module reads or writes it
# (generate.py creates its own instance) — confirm whether it is still needed here.
memory = Memory(token_limit=2048)
51
+
52
+ # -------------------------
53
+ # UTF-8 safe Text Loader
54
+ # -------------------------
55
class SafeTextLoader:
    """Read a text-like file into one Document, tolerating invalid UTF-8 bytes."""

    def __init__(self, file_path):
        self.file_path = file_path

    def load(self):
        """Read the file as bytes and wrap the decoded text in a single `Document`.

        Bytes that are not valid UTF-8 are dropped (``errors="ignore"``) instead of
        raising, so files with stray invalid sequences can still be loaded.

        Returns:
            List[Document]: one Document whose ``page_content`` is the decoded text and
            whose ``metadata['source']`` is the path as a string; an empty list if
            anything goes wrong (the error is printed).
        """
        try:
            with open(self.file_path, "rb") as handle:
                decoded = handle.read().decode("utf-8", errors="ignore")
            source = str(self.file_path)
            return [Document(page_content=decoded, metadata={"source": source})]
        except Exception as e:
            print(f"[Error] Failed to read {self.file_path}: {e}")
            return []
78
+
79
+ # -------------------------
80
+ # Loader mapping
81
+ # -------------------------
82
# Maps a lowercase file extension (including the leading dot) to the loader class that
# create_dataset instantiates for it; files whose extension is not listed are skipped there.
# Plain-text formats go through SafeTextLoader, the rest through LangChain loaders.
LOADER_MAPPING = {
    ".txt": SafeTextLoader,
    ".json": SafeTextLoader,
    ".md": UnstructuredMarkdownLoader,
    ".csv": CSVLoader,
    ".yaml": SafeTextLoader,
    ".yml": SafeTextLoader,
    ".pdf": PyPDFLoader,
    ".docx": UnstructuredWordDocumentLoader,
    ".pptx": UnstructuredPowerPointLoader,
    ".html": UnstructuredHTMLLoader,
    ".htm": UnstructuredHTMLLoader,
    ".ipynb": NotebookLoader,
    ".py": SafeTextLoader,
    ".js": SafeTextLoader,
    ".sql": SafeTextLoader,
}

# Path of the "context" directory located next to this file.
# NOTE(review): not referenced in the visible part of this module — create_dataset
# resolves its own `directory_path` argument instead; confirm whether this is still needed.
CONTEXT_ROOT = pathlib.Path(__file__).parent / "context"
101
+
102
def create_dataset(directory_path: str = "context"):
    """Walk `directory_path` recursively and load every supported file.

    The loader for each file is looked up in LOADER_MAPPING by lowercase extension.
    Unsupported files are reported and skipped; a loader failure is reported and does
    not stop the scan.

    Returns:
        The list of all loaded documents, or an empty list when the directory does not
        exist or is not a directory.
    """
    root = pathlib.Path(directory_path).resolve()
    if not (root.exists() and root.is_dir()):
        print(f"[Error] Target directory does not exist: {root}")
        return []

    loaded = []
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue

        loader_cls = LOADER_MAPPING.get(entry.suffix.lower())
        if loader_cls is None:
            print(f"[Skip] Unsupported file type: {entry}")
            continue

        try:
            docs = loader_cls(str(entry)).load()
            loaded.extend(docs)
            print(f"[Loaded] {entry} ({len(docs)} docs)")
        except Exception as e:
            print(f"[Error] Failed to load {entry}: {e}")

    print(f"[Done] Finished scanning {root}")
    print(f"Total documents loaded: {len(loaded)}")
    return loaded
130
+
131
+
132
+ from llama_index.core import KnowledgeGraphIndex
133
+ from llama_index.core import Document as LlamaDocument
134
+
135
+ import hashlib
136
+
137
+
138
def fetch_existing_ids(index, namespace, ids, batch_size=100):
    """Return the subset of `ids` that `index.fetch` reports as present in `namespace`.

    IDs are fetched in batches of `batch_size` to avoid "URI too large" errors.

    Args:
        index: Object exposing ``fetch(ids=..., namespace=...)``.
        namespace: Namespace passed through to ``fetch``.
        ids: Iterable of vector IDs to look up.
        batch_size: Maximum number of IDs per ``fetch`` call; must be >= 1.

    Returns:
        set: The IDs found in the fetch responses.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently report "nothing exists".
        raise ValueError("batch_size must be a positive integer")

    ids = list(ids)
    existing_ids = set()
    for start in range(0, len(ids), batch_size):
        batch_ids = ids[start:start + batch_size]
        result = index.fetch(ids=batch_ids, namespace=namespace)

        # Accept both attribute-style responses and plain mappings; otherwise a
        # mapping-shaped response would be read as "no vectors" and trigger re-embedding.
        vectors = getattr(result, "vectors", None)
        if vectors is None and isinstance(result, dict):
            vectors = result.get("vectors")
        if vectors:
            existing_ids.update(vectors.keys())
    return existing_ids
147
+
148
+
149
+
150
+ # -------------------------
151
+ # Prepare RAG
152
+ # -------------------------
153
+ import hashlib
154
+ import time
155
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
156
+ from llama_index.core import Document as LlamaDocument
157
+ from pinecone import Pinecone
158
+
159
+ import os
160
+ import re
161
+ import time
162
+ import hashlib
163
+
164
+ from langchain_openai import ChatOpenAI
165
+ from langchain_mistralai import ChatMistralAI
166
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
167
+ from pinecone import Pinecone
168
+
169
+ from llama_index.core import Document as LlamaDocument
170
+
171
+ # You are assumed to already have:
172
+ # - create_dataset(dir_name)
173
+ # - fetch_existing_ids(index, namespace, all_ids, batch_size)
174
+
175
+
176
+ # -------------------------
177
+ # Internal helper: build & upsert community summaries (incremental inside; same signature)
178
+ # -------------------------
179
def _build_and_index_community_summaries(
    kg_index,
    pc,
    index,
    llm,
    impacted_nodes=None,
    info=True,
):
    """
    Detect communities in the knowledge graph, summarize them bottom-up with `llm`,
    and upsert the summaries into the "community-summaries" namespace of `index`.

    1. COMMUNITY DETECTION:
       - Runs NetworkX's greedy_modularity_communities on the KG graph.
       - Keeps only communities with at least max(2, COMMUNITY_MIN_SIZE) nodes.

    2. HIERARCHY CONSTRUCTION:
       - Re-runs the same algorithm on each kept community, up to MAX_HIERARCHY_DEPTH
         levels, to create child communities.

    3. AFFECTED NODE TRACKING:
       - A community is "_affected" on the first run (`impacted_nodes is None`) or when
         it contains an impacted node; the flag is propagated to its ancestors.
       - Only affected communities are summarized and upserted.

    4. BOTTOM-UP SUMMARIZATION:
       - Leaf communities are summarized from a sample of their entities and triples.
       - Parent communities are summarized from their (affected) children's summaries.

    5. VECTOR STORAGE:
       - Vector ID = "comm_" + first 24 hex chars of SHA-256 over
         "community_id|level|first 20 sorted node names".
       - Summaries are embedded with `pc.inference.embed` (llama-text-embed-v2) in
         batches and upserted with their metadata.

    Args:
        kg_index: Object exposing ``get_networkx_graph()``.
        pc: Pinecone client used for embedding.
        index: Pinecone index receiving the upserts.
        llm: Chat model exposing ``invoke(prompt)`` whose result has ``.content``.
        impacted_nodes: Iterable of new/updated node names, or None for a first run.
        info: When true, print progress messages.

    Returns:
        None. Returns early when the graph cannot be extracted, is empty, community
        detection fails, or no affected community produced a summary.
    """
    import hashlib

    COMMUNITY_NAMESPACE = "community-summaries"
    COMMUNITY_MIN_SIZE = 3
    MAX_HIERARCHY_DEPTH = 2
    LIMIT_NODES_PER_SUMMARY = 60
    LIMIT_TRIPLES_PER_SUMMARY = 120
    NODE_SAMPLE_SIZE = 20    # nodes used for the vector ID and the metadata sample
    EMBED_BATCH_SIZE = 96

    min_size = max(2, COMMUNITY_MIN_SIZE)                 # smallest community kept
    split_threshold = max(2, COMMUNITY_MIN_SIZE * 2)      # smallest community split further

    try:
        nxg = kg_index.get_networkx_graph()
    except Exception as e:
        print(f"[Error] Unable to extract NetworkX graph from KG: {e}")
        return

    if nxg.number_of_nodes() == 0 or nxg.number_of_edges() == 0:
        if info:
            print("[Community] KG empty or trivial; skipping community summarization.")
        return

    # Imported only once it is actually needed, after the cheap early exits above.
    from networkx.algorithms.community import greedy_modularity_communities

    first_run = impacted_nodes is None
    impacted_nodes = set(impacted_nodes or [])

    if info:
        print(f"[Community] Starting summarization. First run: {first_run}")
        print(f"[Community] Impacted nodes: {len(impacted_nodes)}")

    # ---- community detection ----
    if info:
        print("[Community] Detecting top-level communities (greedy modularity)...")
    try:
        communities = list(greedy_modularity_communities(nxg))
    except Exception as e:
        print(f"[Error] Community detection failed: {e}")
        return

    large_communities = [c for c in communities if len(c) >= min_size]
    small_communities = [c for c in communities if len(c) < min_size]

    if info:
        print(f"[Community] Found {len(communities)} communities; "
              f"{len(large_communities)} large, {len(small_communities)} small.")

    def is_affected(node_ids):
        return first_run or bool(impacted_nodes & node_ids)

    # ---- build hierarchy and mark affected ----
    hierarchy = []
    for idx, comm in enumerate(large_communities):
        subgraph = nxg.subgraph(comm).copy()
        node_set = set(subgraph.nodes())
        node = {
            "id": f"C{idx}",
            "level": 0,
            "nodes": node_set,
            "children": [],
            "_affected": is_affected(node_set),
        }

        # simple frontier-based recursive splitting
        frontier = [(node, subgraph, 1)]
        while frontier:
            parent, g, depth = frontier.pop()
            if depth > MAX_HIERARCHY_DEPTH or g.number_of_nodes() < split_threshold:
                continue
            try:
                subs = list(greedy_modularity_communities(g))
            except Exception as e:
                # Best effort: a community that cannot be split simply stays a leaf.
                if info:
                    print(f"[Community] Sub-community detection failed for {parent['id']}: {e}")
                subs = []

            # Keep proper subsets of the parent that are large enough to matter.
            subs = [s for s in subs if 1 <= len(s) <= len(g) - 1 and len(s) >= min_size]

            for j, s in enumerate(subs):
                sg = g.subgraph(s).copy()
                child_nodes = set(s)
                child = {
                    "id": f"{parent['id']}.{j}",
                    "level": depth,
                    "nodes": child_nodes,
                    "children": [],
                    "_affected": is_affected(child_nodes),
                }
                parent["children"].append(child)
                if depth + 1 <= MAX_HIERARCHY_DEPTH and sg.number_of_nodes() >= split_threshold:
                    frontier.append((child, sg, depth + 1))

        hierarchy.append(node)

    # propagate affected upward
    def mark_ancestors(n):
        # Build the full list first: every child must be visited (no short-circuit).
        child_flags = [mark_ancestors(c) for c in n["children"]]
        if any(child_flags):
            n["_affected"] = True
        return n["_affected"]

    for root in hierarchy:
        mark_ancestors(root)

    # ---- summarization helpers ----
    def triples_within(node_ids, graph):
        res = []
        for (u, v, data) in graph.edges(data=True):
            if u in node_ids and v in node_ids:
                rel = data.get("label") or data.get("relationship") or "related_to"
                res.append((u, rel, v))
        return res

    def sample_for_prompt(nodes_set, triples_list, max_nodes=LIMIT_NODES_PER_SUMMARY, max_triples=LIMIT_TRIPLES_PER_SUMMARY):
        # Sort before truncating: raw set iteration order is not stable across runs,
        # which would make the sampled prompt (and thus the summary) non-reproducible.
        nodes_list = sorted(nodes_set, key=str)[:max_nodes]
        triples_list = triples_list[:max_triples]
        return nodes_list, triples_list

    def summarize_leaf(nodes_set, graph):
        nodes_list, tri_list = sample_for_prompt(
            nodes_set,
            triples_within(nodes_set, graph)
        )
        triple_strs = [f"{u} --[{r}]--> {v}" for (u, r, v) in tri_list]
        prompt = (
            "You are creating a concise community report from a knowledge graph.\n"
            "Given the following entity list and intra-community relationships, produce:\n"
            " - Title\n"
            " - Key Themes (bullet points)\n"
            " - Notable Entities\n"
            " - Important Relationships (summarize patterns rather than listing all)\n"
            " - Outliers or Cross-links (if any)\n"
            " - 3-5 Answerable Questions this community can address\n"
            "Keep it under ~250-300 words.\n\n"
            f"Entities (sample): {nodes_list}\n"
            f"Relationships (sample triples): {triple_strs}\n"
        )
        resp = llm.invoke(prompt)
        return resp.content.strip()

    def summarize_parent(child_summaries):
        join_text = "\n\n".join([f"[Child {i+1}]\n{txt}" for i, txt in enumerate(child_summaries)])
        prompt = (
            "You are creating a higher-level summary that unifies several community reports.\n"
            "Synthesize the following child community reports into a coherent parent-level summary:\n"
            " - Overarching Title\n"
            " - Cross-community Key Themes\n"
            " - How the sub-communities relate and differ\n"
            " - Cross-cutting entities/relationships\n"
            " - 3-5 high-level questions the parent community can answer\n"
            "Target length: 250-350 words.\n\n"
            f"{join_text}\n"
        )
        resp = llm.invoke(prompt)
        return resp.content.strip()

    # bottom-up, only affected subtrees
    def build_summaries(node, graph):
        if not node["_affected"]:
            return None
        if not node["children"]:
            node["summary"] = summarize_leaf(node["nodes"], graph)
            return node["summary"]
        child_summaries = []
        for ch in node["children"]:
            s = build_summaries(ch, graph)
            if s is not None:
                child_summaries.append(s)
        if child_summaries:
            node["summary"] = summarize_parent(child_summaries)
            return node["summary"]
        # Affected parent whose children are all unaffected: summarize it directly.
        node["summary"] = summarize_leaf(node["nodes"], graph)
        return node["summary"]

    for node in hierarchy:
        build_summaries(node, nxg)

    # flatten affected nodes w/ new summaries
    flat_nodes = []

    def flatten(n):
        if n.get("_affected") and "summary" in n:
            flat_nodes.append({
                "id": n["id"],
                "level": n["level"],
                "size": len(n["nodes"]),
                # Sorted once here so the ID sample and the stored node_sample agree.
                "nodes": sorted(n["nodes"], key=str),
                "summary": n["summary"]
            })
        for c in n["children"]:
            flatten(c)

    for n in hierarchy:
        flatten(n)

    if not flat_nodes:
        if info:
            print("[Community] No affected community summaries to upsert.")
        return

    if info:
        print(f"[Community] Upserting {len(flat_nodes)} community summaries to namespace: {COMMUNITY_NAMESPACE}")

    def summary_vec_id(node_rec):
        # str() keeps the join working for non-string node keys; for string nodes the
        # resulting ID is identical to hashing the plain sorted names.
        sample = ",".join(str(x) for x in node_rec["nodes"][:NODE_SAMPLE_SIZE])
        key = f"{node_rec['id']}|{node_rec['level']}|{sample}"
        return "comm_" + hashlib.sha256(key.encode("utf-8")).hexdigest()[:24]

    # batch embed + upsert
    texts = [rec["summary"] for rec in flat_nodes]
    ids = [summary_vec_id(rec) for rec in flat_nodes]
    metas = [{
        "type": "community_summary",
        "community_id": rec["id"],
        "level": rec["level"],
        "size": rec["size"],
        "node_sample": rec["nodes"][:NODE_SAMPLE_SIZE],
        "text": rec["summary"]
    } for rec in flat_nodes]

    for start in range(0, len(texts), EMBED_BATCH_SIZE):
        stop = start + EMBED_BATCH_SIZE
        emb = pc.inference.embed(
            model="llama-text-embed-v2",
            inputs=texts[start:stop],
            parameters={"input_type": "passage", "truncate": "END"}
        )
        vectors = [
            {"id": vid, "values": e["values"], "metadata": meta}
            for vid, e, meta in zip(ids[start:stop], emb, metas[start:stop])
        ]
        index.upsert(vectors=vectors, namespace=COMMUNITY_NAMESPACE)

    if info:
        print("[Community] Community summaries upsert completed.")
444
+
445
+ def prepare_RAG(
446
+ pinecone_API,
447
+ index_name,
448
+ chunk_size=400,
449
+ chunk_overlap=30,
450
+ llm_model="gpt-4.1-nano",
451
+ dir_name="context",
452
+ info=True
453
+ ):
454
+ """
455
+ Steps:
456
+ 1) Select LLM wrapper (OpenAI vs. Mistral) by `llm_model` string.
457
+ 2) Create dataset with `create_dataset(dir_name)`.
458
+ 3) Connect to Pinecone and obtain `index`.
459
+ 4) Split documents into chunks; normalize `metadata['source']` to be path-relative
460
+ to the `context` anchor (stable across machines).
461
+ 5) Compute stable vector IDs per chunk from source+content hashes.
462
+ 6) Use `fetch_existing_ids` to identify and skip already-indexed chunks.
463
+ 7) Embed only new chunks via `pc.inference.embed` (retry with backoff).
464
+ 8) Upsert embeddings and metadata into a fixed namespace (`example-namespace`).
465
+ """
466
+
467
+ import os, re, hashlib, time
468
+ from pinecone import Pinecone
469
+ from langchain_mistralai import ChatMistralAI
470
+ from langchain_openai import ChatOpenAI
471
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
472
+
473
+ if info:
474
+ print(f"Preparing RAG with LLM: {llm_model}, Index: {index_name}, Dir: {dir_name}")
475
+ llm = ChatOpenAI(model=llm_model, streaming=True) if "gpt" in llm_model else ChatMistralAI(model=llm_model, streaming=True)
476
+
477
+ documents = create_dataset(dir_name)
478
+ pc = Pinecone(api_key=pinecone_API)
479
+ index = pc.Index(index_name)
480
+
481
+ if not documents:
482
+ print(f"[Warning] No documents found. Using existing Pinecone index.")
483
+ return index, pc, llm, None
484
+
485
+ splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
486
+ all_splits = splitter.split_documents(documents)
487
+
488
+ def path_after_context(full_path: str, anchor: str = "context") -> str:
489
+ if not full_path:
490
+ return ""
491
+ parts = re.split(r"[\\/]+", str(full_path))
492
+ idx = None
493
+ for i, part in enumerate(parts):
494
+ if part.lower() == anchor.lower():
495
+ idx = i
496
+ if idx is not None and idx < len(parts) - 1:
497
+ return "/".join(parts[idx + 1 :])
498
+ return os.path.basename(str(full_path))
499
+
500
+ for chunk in all_splits:
501
+ if "source" in chunk.metadata and chunk.metadata["source"]:
502
+ chunk.metadata["source"] = path_after_context(chunk.metadata["source"], anchor="context")
503
+
504
+ if info:
505
+ print(f"Total chunks: {len(all_splits)}")
506
+
507
+ def chunk_id(chunk, prefix="vec"):
508
+ text_hash = hashlib.sha256(chunk.page_content.encode("utf-8")).hexdigest()[:16]
509
+ source = chunk.metadata.get("source", "unknown")
510
+ file_hash = hashlib.sha256(source.encode("utf-8")).hexdigest()[:8]
511
+ return f"{prefix}_{file_hash}_{text_hash}"
512
+
513
+
514
+ # -------------------------
515
+ # Vector ID & Namespace Architecture
516
+ # -------------------------
517
+ """
518
+ VECTOR ID GENERATION STRATEGY:
519
+
520
+ For Document Chunks:
521
+ Pattern: "vec_{file_hash}_{content_hash}"
522
+ - file_hash: SHA-256 of normalized source path (8 chars)
523
+ - content_hash: SHA-256 of chunk content (16 chars)
524
+ - Enables exact duplicate detection across runs
525
+ - Stable across different machine paths due to source normalization
526
+
527
+ For Community Summaries:
528
+ Pattern: "comm_{community_hash}"
529
+ - community_hash: SHA-256 of "community_id|level|sorted_node_sample"
530
+ - Ensures stable IDs for the same community composition
531
+ - Allows updates when community structure changes
532
+
533
+ NAMESPACE STRATEGY:
534
+ - "example-namespace": Stores document chunk embeddings
535
+ - "community-summaries": Stores hierarchical community summaries
536
+ - Separation enables independent update/query strategies
537
+ - Prevents interference between document and summary vectors
538
+
539
+ IDEMPOTENCY GUARANTEE:
540
+ - fetch_existing_ids() checks Pinecone before embedding
541
+ - Prevents duplicate embeddings for identical content
542
+ - Enables safe re-runs without data duplication
543
+ - Reduces embedding costs and storage usage
544
+ """
545
+
546
+ namespace = "example-namespace"
547
+ all_ids = [chunk_id(c) for c in all_splits]
548
+ existing = fetch_existing_ids(index, namespace, all_ids, batch_size=100)
549
+ new_chunks = [(c, i) for c, i in zip(all_splits, all_ids) if i not in existing]
550
+
551
+ if info:
552
+ print(f"Chunks already indexed: {len(all_splits) - len(new_chunks)}")
553
+ print(f"New chunks to embed: {len(new_chunks)}")
554
+
555
+ if not new_chunks:
556
+ print("[Info] Nothing new to index. Skipping embedding/upsert.")
557
+ else:
558
+ batch_size = 94
559
+
560
+ def retry_forever(func, *args, **kwargs):
561
+ attempt = 1
562
+ while True:
563
+ try:
564
+ return func(*args, **kwargs)
565
+ except Exception as e:
566
+ wait = min(60, 2 ** min(attempt, 6))
567
+ print(f"[Retry] {func.__name__} failed (attempt {attempt}): {e}. Sleeping {wait}s")
568
+ time.sleep(wait)
569
+ attempt += 1
570
+
571
+ for start_idx in range(0, len(new_chunks), batch_size):
572
+ print(f"[Info] Embedding and upserting batch {start_idx // batch_size + 1}...")
573
+ batch, ids = zip(*new_chunks[start_idx:start_idx + batch_size])
574
+ texts = [chunk.page_content for chunk in batch]
575
+ metas = [chunk.metadata or {} for chunk in batch]
576
+
577
+ embeddings = retry_forever(
578
+ pc.inference.embed,
579
+ model="llama-text-embed-v2",
580
+ inputs=texts,
581
+ parameters={"input_type": "passage", "truncate": "END"}
582
+ )
583
+
584
+ batch_records = [
585
+ {"id": i, "values": e['values'], "metadata": {"text": t, **m}}
586
+ for i, e, t, m in zip(ids, embeddings, texts, metas)
587
+ ]
588
+ retry_forever(index.upsert, vectors=batch_records, namespace=namespace)
589
+
590
+ if info:
591
+ print(f"Completed upsert of {len(new_chunks)} new vectors.")
592
+
593
+ return index, pc, llm, documents # Return documents for KG construction
594
+
595
+
596
def build_knowledge_graph(documents, llm, pc, index, info=True):
    """
    Build or incrementally update the Knowledge Graph (KG) and refresh community summaries.

    Args:
        documents: List of LangChain Documents from prepare_RAG (may be empty or None).
        llm: LangChain-compatible LLM, wrapped for LlamaIndex via LangChainLLM.
        pc: Pinecone client, forwarded to the community-summary helper.
        index: Pinecone index, forwarded to the community-summary helper.
        info: Enable verbose logging (also gates the merge diagnostics).

    Returns:
        KnowledgeGraphIndex | None: the loaded or freshly built index, or None when
        nothing could be loaded/built.

    Flow:
        1) Identify new/changed docs via source+content hashing (seen-file cache).
        2) Load the existing KG from pickle, or build a fresh one from all documents.
        3) If a KG already existed and there is a delta, build a delta KG and merge
           its nodes/edges into a NetworkX view of the base graph.
        4) Summarize impacted communities (first build or successful merge only).
        5) Export `knowledge_graph.json` and update the seen-file signatures.

    Files touched in the current working directory:
        kg_index.pkl, kg_seen_files.json, knowledge_graph.json
    """
    import hashlib
    import json
    import os
    import pickle
    import re
    from llama_index.core import Document, KnowledgeGraphIndex
    from llama_index.llms.langchain import LangChainLLM

    # ---- duplicate detection (same normalisation as prepare_RAG) ----
    def path_after_context(full_path: str, anchor: str = "context") -> str:
        """Return the path below the last `anchor` directory, else the basename."""
        if not full_path:
            return ""
        parts = re.split(r"[\\/]+", str(full_path))
        idx = None
        for i, part in enumerate(parts):
            if part.lower() == anchor.lower():
                idx = i  # no break: the LAST occurrence of the anchor wins
        if idx is not None and idx < len(parts) - 1:
            return "/".join(parts[idx + 1:])
        return os.path.basename(str(full_path))

    def file_sig(doc_like):
        """Return (sig_id, normalized_source) from source+content hashes."""
        meta = getattr(doc_like, "metadata", {}) or {}
        text = getattr(doc_like, "page_content", "") or getattr(doc_like, "text", "") or ""
        raw_src = meta.get("source", "unknown")
        # A present-but-None source used to crash on `.encode`; treat it like "".
        src = path_after_context(raw_src, anchor="context") if raw_src else ""
        src_hash = hashlib.sha256(src.encode("utf-8")).hexdigest()[:8]
        text_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
        return f"kg_{src_hash}_{text_hash}", src

    def load_seen_sigs(path="kg_seen_files.json"):
        """Load the set of already-processed signatures (empty set on any problem)."""
        try:
            if os.path.exists(path):
                with open(path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                return set(data if isinstance(data, list) else [])
        except Exception as e:
            print(f"[Warn] Failed to load seen file sigs: {e}")
        return set()

    def save_seen_sigs(sigs, path="kg_seen_files.json"):
        """Persist the signature set as a sorted JSON list (best effort)."""
        try:
            with open(path, "w", encoding="utf-8") as f:
                json.dump(sorted(sigs), f, indent=2)
        except Exception as e:
            print(f"[Warn] Failed to save seen file sigs: {e}")

    # ---- identify new/changed docs by signature ----
    seen_sigs = load_seen_sigs()
    all_docs = documents or []
    new_docs = []
    new_sigs = []
    for d in all_docs:
        sig, _ = file_sig(d)
        if sig not in seen_sigs:
            new_docs.append(d)
            new_sigs.append(sig)

    if info:
        print(f"[KG] Total input docs: {len(all_docs)} | New/changed docs detected: {len(new_docs)}")

    # ---- prepare LlamaIndex objects ----
    llama_docs_all = [Document(text=doc.page_content, metadata=doc.metadata) for doc in all_docs]
    llama_docs_delta = [Document(text=doc.page_content, metadata=doc.metadata) for doc in new_docs]
    llm_for_kg = LangChainLLM(llm)
    persist_file = os.path.abspath("./kg_index.pkl")

    def _extract_kg(docs, include_embeddings):
        """Run triplet extraction over `docs` and return a KnowledgeGraphIndex."""
        return KnowledgeGraphIndex.from_documents(
            documents=docs,
            max_triplets_per_chunk=20,
            extract_relations=True,
            include_embeddings=include_embeddings,
            llm=llm_for_kg,
        )

    def _build_and_persist(docs):
        """Build a full KG (with embeddings) and pickle it to `persist_file`."""
        kg = _extract_kg(docs, include_embeddings=True)
        with open(persist_file, "wb") as f:
            pickle.dump(kg, f)
        return kg

    def _load_existing():
        """Unpickle the persisted KG."""
        # SECURITY NOTE: pickle executes arbitrary code on load. This is only
        # acceptable while kg_index.pkl is written exclusively by this process.
        with open(persist_file, "rb") as f:
            return pickle.load(f)

    kg_index = None
    graph_exists = False

    try:
        if os.path.exists(persist_file):
            if info:
                print("[Info] Found persisted KG pickle file.")
            graph_exists = True
            kg_index = _load_existing()
            if info:
                print("[Info] Loaded Knowledge Graph from pickle.")
        elif llama_docs_all:
            if info:
                print("[Info] No persisted KG found. Building new KG from all documents...")
            kg_index = _build_and_persist(llama_docs_all)
            if info:
                print("[Info] Built and persisted Knowledge Graph via pickle.")
        elif info:
            print("[Info] No persisted KG found and no documents to build from.")
    except Exception as e:
        print(f"[Error] GraphRAG init/load failed: {e}")
        kg_index = None

    # ---- incremental merge of the delta into the existing graph ----
    inserted_any = False
    graph_override = None  # merged NetworkX graph, used for the JSON export
    new_nodes = set()
    new_edges = set()

    if kg_index is not None and graph_exists and llama_docs_delta:
        if info:
            print(f"[Info] Incrementally inserting {len(llama_docs_delta)} new/changed docs into KG...")
        try:
            kg_delta = _extract_kg(llama_docs_delta, include_embeddings=False)
            nxg_base = kg_index.get_networkx_graph()
            nxg_delta = kg_delta.get_networkx_graph()

            base_nodes_before = set(nxg_base.nodes())
            base_edges_before = set(nxg_base.edges())
            delta_nodes = set(nxg_delta.nodes())
            delta_edges = set(nxg_delta.edges())

            new_nodes = delta_nodes - base_nodes_before
            new_edges = delta_edges - base_edges_before

            if info:
                print(f"\n[Diagnostic] Base graph nodes before merge: {len(base_nodes_before)}")
                print(f"[Diagnostic] Base graph edges before merge: {len(base_edges_before)}")
                print(f"[Diagnostic] Delta graph nodes: {len(delta_nodes)}")
                print(f"[Diagnostic] Delta graph edges: {len(delta_edges)}")
                print(f"[Diagnostic] Delta nodes already in base: {len(delta_nodes & base_nodes_before)}")
                print(f"[Diagnostic] Delta edges already in base: {len(delta_edges & base_edges_before)}")
                print(f"[Diagnostic] Truly new nodes to add: {len(new_nodes)}")
                print(f"[Diagnostic] Truly new edges to add: {len(new_edges)}")

            # Merge delta into base, carrying node/edge attributes along.
            nxg_base.add_nodes_from(nxg_delta.nodes(data=True))
            nxg_base.add_edges_from(nxg_delta.edges(data=True))
            graph_override = nxg_base
            inserted_any = True

            if info:
                base_nodes_after = set(nxg_base.nodes())
                base_edges_after = set(nxg_base.edges())
                print(f"\n[Diagnostic] Base graph nodes after merge: {len(base_nodes_after)}")
                print(f"[Diagnostic] Base graph edges after merge: {len(base_edges_after)}")
                print(f"[Diagnostic] Nodes added: {len(base_nodes_after - base_nodes_before)}")
                print(f"[Diagnostic] Edges added: {len(base_edges_after - base_edges_before)}")

                print(f"\n[Delta Graph Summary]")
                print(f" - Total Nodes: {nxg_delta.number_of_nodes()}")
                print(f" - Total Edges: {nxg_delta.number_of_edges()}")

                print("\n[Delta Graph Nodes] (showing up to 10):")
                for i, (node, data) in enumerate(nxg_delta.nodes(data=True)):
                    if i >= 10:
                        print(" ...")
                        break
                    print(f" {i+1}. {node}: {data}")

                print("\n[Delta Graph Edges] (showing up to 10):")
                for i, (source, target, data) in enumerate(nxg_delta.edges(data=True)):
                    if i >= 10:
                        print(" ...")
                        break
                    print(f" {i+1}. {source} -> {target}: {data}")

            if not new_nodes and not new_edges:
                print("[Warning] All delta nodes/edges already existed in the base graph. No actual change.")

            if info:
                print("[Info] Merged delta KG into existing graph (override used for summaries).")
        except Exception as e:
            print(f"[Error] Fallback merge failed: {e}")

    # NOTE(review): the merge above only mutates the NetworkX graph returned by
    # get_networkx_graph(); kg_index itself and kg_index.pkl are not updated, so
    # merged triplets reach knowledge_graph.json for this run only. The previous
    # "persist if mutated via API" branch required `inserted_any and
    # graph_override is None`, which can never hold, and was removed as dead
    # code. Persisting the delta needs a write through the index API - confirm
    # the intended approach.

    # ---- community summaries: first build, or a successful merge ----
    if kg_index is not None and (not graph_exists or inserted_any):
        impacted_nodes = set(new_nodes)
        for u, v in new_edges:
            impacted_nodes.update((u, v))
        _build_and_index_community_summaries(
            kg_index=kg_index,
            pc=pc,
            index=index,
            llm=llm,
            impacted_nodes=impacted_nodes,
            info=info,
        )

    # ---- export adjacency JSON for visualisation ----
    if graph_override is not None or kg_index is not None:
        try:
            nxg = graph_override if graph_override is not None else kg_index.get_networkx_graph()
            graph_dict = {}
            for u, v, attrs in nxg.edges(data=True):
                rel = attrs.get("label") or attrs.get("relationship") or "related_to"
                graph_dict.setdefault(u, []).append([rel, v])
            output_file = "knowledge_graph.json"
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(graph_dict, f, indent=4, ensure_ascii=False)
            if info:
                print(f"[Info] Knowledge graph saved to {output_file}")
        except Exception as e:
            print(f"[Error] Failed to save knowledge graph: {e}")

    # ---- mark signatures as seen only when a graph actually absorbed them ----
    # Requiring kg_index guards against recording documents after a failed build.
    if kg_index is not None and ((not graph_exists and all_docs) or inserted_any):
        seen_sigs.update(new_sigs)
        save_seen_sigs(seen_sigs)

    return kg_index
895
+
896
+
rag_execute.py ADDED
@@ -0,0 +1,647 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main entry point for the RAG Gradio application.
3
+
4
+ Loads environment variables, sets up context directory and model parameters,
5
+ initializes retrieval and generation functions, and launches the interactive chat UI.
6
+ Handles file uploads, user queries, and streaming LLM responses.
7
+ """
8
+
9
+
10
+ import os
11
+ import queue
12
+ from threading import Thread
13
+ from dotenv import load_dotenv
14
+ load_dotenv()
15
+ # Construct the path to the .env file relative to this script's location
16
+ dotenv_path = os.path.join(os.path.dirname(__file__), '..', 'RAG-LangChain', '.env')
17
+ print(f"Start loading .env from {dotenv_path}")
18
+ load_dotenv(dotenv_path=dotenv_path)
19
+ print(f"Finish loading .env")
20
+ from langchain.callbacks.base import BaseCallbackHandler
21
+ print(f"Start importing from rag_func")
22
+ from prepare import prepare_RAG
23
+ from retrieve import retrieve_RAG
24
+ from generate import generate_RAG
25
+ from prepare import build_knowledge_graph
26
+
27
+
28
+ ######
29
+ # --- Graph viz imports (Plotly + NetworkX) ---
30
+ import json, math, random
31
+ import networkx as nx
32
+ import numpy as np
33
+ import plotly.graph_objects as go
34
+ import plotly.express as px
35
+ try:
36
+ from scipy.spatial import ConvexHull
37
+ SCIPY_AVAILABLE = True
38
+ except Exception:
39
+ SCIPY_AVAILABLE = False
40
+
41
+ ######
42
+
43
+
44
+ print(f"Finish importing from rag_func")
45
+ import gradio as gr
46
+
47
+ # -------------------- Context Setup --------------------
48
+
49
+ user_dir = "context"
50
+ #print default
51
+ print(f"[Info] Using context directory: {user_dir}")
52
+
53
+ pinecone_API = os.getenv("PINECONE_API")
54
+ index_name = os.getenv("INDEX_NAME")
55
+ llm_model = os.getenv("MODELNAME")
56
+
57
+ #index, pc, llm, kg_index = prepare_RAG(pinecone_API, index_name, llm_model=llm_model, dir_name=user_dir, graph_rag=(graph_rag=="True"))
58
+ index, pc, llm, kg_index = None, None, None, None
59
+
60
+ # -------------------- Chat Functions --------------------
61
def add_user_message(message, history):
    """
    Append the user's message to the chat history.

    A falsy ``history`` (None or empty) is replaced by a fresh list; otherwise
    the given list is extended in place. Returns ``("", history, history)``:
    an empty string followed by the same history list twice.
    """
    chat_log = history if history else []
    chat_log.append(dict(role="user", content=message))
    return "", chat_log, chat_log
72
+
73
+ import time
74
+
75
+ # -------------------- Streaming Handler --------------------
76
class StreamHandler(BaseCallbackHandler):
    """
    Callback handler that forwards streamed LLM tokens to a queue.

    Each new token is appended to ``buffer`` and pushed onto ``q`` for the UI
    consumer. The handler also records time-to-first-token (``ttft``) and total
    generation time (``total_time``), both measured from ``start_time``.

    ``start_time`` defaults to the construction time so the timing callbacks
    never subtract from ``None``; callers may overwrite it just before starting
    generation for a tighter measurement.
    """

    def __init__(self, q: queue.Queue):
        super().__init__()
        self.q = q
        self.first_token_received = False
        self.ttft = None  # time to first token, seconds
        self.total_time = None  # seconds until on_llm_end
        self.start_time = time.time()
        self.buffer = []  # every token seen so far, in order

    def _elapsed(self) -> float:
        """Seconds since ``start_time`` (0.0 if it was explicitly cleared)."""
        if self.start_time is None:
            return 0.0
        return time.time() - self.start_time

    def on_llm_new_token(self, token: str, **kwargs):
        if not self.first_token_received:
            self.ttft = self._elapsed()
            self.first_token_received = True
        self.buffer.append(token)
        self.q.put(token)

    def on_llm_end(self, *args, **kwargs):
        # IMPORTANT: do NOT end the consumer here.
        # The worker thread sends [[FINAL]] (if any) and then [[END]] itself.
        self.total_time = self._elapsed()
104
+
105
+ # -------------------- Chat Functions with timing --------------------
106
+
107
+
108
+
109
def generate_bot_response(history):
    """
    Stream the assistant's answer for the latest user message, with progress updates.

    Generator intended as a Gradio event handler. Every yield is a triple
    ``(history, history, status_html)``; the last history entry is the
    assistant message being filled in as tokens arrive.

    Stages: (1) lazily initialise index/client/LLM via prepare_RAG, (2) ask the
    LLM whether Graph RAG should be used and, if so, build/update the knowledge
    graph, (3) retrieve context, (4) generate in a worker thread and stream
    tokens from a queue.

    Queue protocol (plain strings): streamed tokens, an optional
    ``[[FINAL]]<text>`` replacing the streamed text, an optional
    ``[[ERROR]]<message>`` when generation failed, and a terminating ``[[END]]``.
    """
    global index, pc, llm, kg_index

    def status_box(text, background="#f5f5f5"):
        return f"<div style='background:{background};padding:10px;border-radius:8px;'>{text}</div>"

    if not history or history[-1]["role"] != "user":
        yield history, history, status_box("Ready")
        return

    user_msg = history[-1]["content"]
    documents = None

    # --- Stage 1: Initialize LLM / vector infra ---
    yield history, history, status_box("Initializing LLM and infrastructure...")
    if not index or not pc or not llm:
        # prepare_RAG returns the LLM itself, so no client is constructed here.
        index, pc, llm, documents = prepare_RAG(
            pinecone_API,
            index_name,
            llm_model=llm_model,
            dir_name=user_dir,
            info=True
        )

    # --- Stage 2: Decide Graph RAG usage ---
    yield history, history, status_box("Deciding Graph RAG usage...")

    def decide_graph_rag_usage(llm_, user_text: str) -> bool:
        prompt = (
            "Given the following user prompt, determine whether graph RAG should be used (True or False):\n"
            f"{user_text}\n"
            "Use 'False' only if the prompt is focused on retrieving a single fact.\n"
            "Use 'True' if the prompt suggests reasoning over a large portion or the entirety of a dataset or corpus."
        )
        resp = llm_.invoke(prompt)
        decision = (getattr(resp, "content", str(resp)) or "").strip()
        print("[Debug] Graph RAG decision response:", decision)
        # Tolerate quoting, markdown emphasis, case and trailing punctuation
        # ("true", "'True'", "**True**", "True.") instead of an exact match.
        return decision.strip("'\"`*. \n\t").lower().startswith("true")

    graph_rag_flag = decide_graph_rag_usage(llm, user_msg)
    print(f"[Info] Graph RAG usage decision: {graph_rag_flag}")

    if graph_rag_flag and not documents:
        # Infra was already initialised on an earlier call; reload the documents only.
        _, _, _, documents = prepare_RAG(
            pinecone_API,
            index_name,
            llm_model=llm_model,
            dir_name=user_dir,
            info=True
        )

    if graph_rag_flag:
        kg_index = build_knowledge_graph(documents, llm, pc, index, info=True)

    # --- Stage 3: Retrieve context ---
    yield history, history, status_box("Retrieving context...")
    retrieved_chunks, graph_context = retrieve_RAG(
        user_msg,
        pc,
        index,
        kg_index,
        top_k=5,
        use_query_reformulation=True,
        llm=llm,
        graphRAG=graph_rag_flag
    )

    # --- Stage 4: Generating response ---
    generating = status_box("Generating response...")
    yield history, history, generating
    FINAL_PREFIX = "[[FINAL]]"
    ERROR_PREFIX = "[[ERROR]]"
    END_TOKEN = "[[END]]"
    q = queue.Queue()
    handler = StreamHandler(q)
    handler.start_time = time.time()

    model_name = getattr(llm, "model_name", getattr(llm, "model", None))
    streaming_llm = llm.__class__(model=model_name, streaming=True, callbacks=[handler])

    def run_llm():
        try:
            resp = generate_RAG(
                user_msg,
                streaming_llm,
                retrieved_chunks,
                graph_context,
                graphRAG=graph_rag_flag
            )
            final_text = (getattr(resp, "content", str(resp)) or "").strip()
            if final_text:
                q.put(FINAL_PREFIX + final_text)
        except Exception as exc:
            # Report the failure to the consumer so the UI does not show "Completed!".
            print(f"[Error] Response generation failed: {exc}")
            q.put(ERROR_PREFIX + str(exc))
        finally:
            # Always release the consumer loop, whatever happened above.
            q.put(END_TOKEN)

    Thread(target=run_llm, daemon=True).start()

    def fmt_seconds(value):
        # ttft/total_time stay None when no token / end callback was received.
        return "n/a" if value is None else f"{value:.3f} s"

    partial = ""
    failed = False
    history.append({"role": "assistant", "content": ""})

    while True:
        token = q.get()
        if token == END_TOKEN:
            if failed:
                yield history, history, status_box("Generation failed.", background="#f8d7da")
            else:
                yield history, history, status_box("Completed!", background="#d4edda")
            print(f"[Timing] TTFT: {fmt_seconds(handler.ttft)}, Total: {fmt_seconds(handler.total_time)}")
            break

        if token.startswith(ERROR_PREFIX):
            failed = True
            if not partial:
                # Nothing was streamed: show the reason instead of an empty bubble.
                history[-1]["content"] = "Sorry, the response could not be generated: " + token[len(ERROR_PREFIX):]
            continue

        if token.startswith(FINAL_PREFIX):
            # The final text replaces whatever was streamed so far.
            partial = token[len(FINAL_PREFIX):]
        else:
            partial += token
        history[-1]["content"] = partial
        yield history, history, generating
226
+
227
+
228
+
229
+ # -------------------- External UI Assets (app.css + force-light-theme JS) --------------------
230
+ from pathlib import Path
231
+ import gradio as gr
232
+
233
+ # Load external assets
234
+ custom_css = Path("app.css").read_text(encoding="utf-8")
235
+ js_force_light = """ function refresh() {
236
+ const url = new URL(window.location);
237
+ if (url.searchParams.get('__theme') !== 'light') {
238
+ url.searchParams.set('__theme', 'light');
239
+ window.location.replace(url);
240
+ }
241
+ } """
242
+
243
+ # -------------------- Gradio App --------------------
244
+ import os
245
+ import shutil
246
MAX_TOTAL_SIZE_MB = 5
CONTEXT_DIR = "context"


def handle_file_upload(uploaded_files):
    """
    Validate uploaded files and copy them into the context directory.

    Args:
        uploaded_files: Iterable of uploads. Each item is either an object with
            a ``name`` attribute holding the temporary file path, or the path
            itself. ``None`` or an empty iterable is accepted.

    Returns:
        str: A status message. Validation is all-or-nothing: if any file has an
        unsupported extension, or the cumulative size exceeds
        ``MAX_TOTAL_SIZE_MB``, an error message is returned and no file is
        copied.
    """
    allowed_extensions = {".txt", ".json", ".md", ".csv", ".pdf", ".docx", ".pptx", ".py"}

    if not uploaded_files:
        return "⚠️ No files were provided."

    # Pass 1: validate the whole batch before touching the context directory,
    # so a rejected batch never leaves a partial upload behind.
    pending = []
    total_size_mb = 0
    for file_obj in uploaded_files:
        src_path = getattr(file_obj, "name", file_obj)
        ext = os.path.splitext(src_path)[1].lower()
        if ext not in allowed_extensions:
            return f"❌ Unsupported file type: {ext}. Allowed types are: {', '.join(sorted(allowed_extensions))}"
        total_size_mb += os.path.getsize(src_path) / (1024 * 1024)
        if total_size_mb > MAX_TOTAL_SIZE_MB:
            return f"❌ Total upload size exceeds the limit of {MAX_TOTAL_SIZE_MB}MB."
        pending.append(src_path)

    # Pass 2: copy. copyfile streams the data instead of loading it into memory.
    os.makedirs(CONTEXT_DIR, exist_ok=True)
    saved_files = []
    for src_path in pending:
        dest_path = os.path.join(CONTEXT_DIR, os.path.basename(src_path))
        # Copying a file onto itself would truncate it; skip that case.
        if os.path.abspath(src_path) != os.path.abspath(dest_path):
            shutil.copyfile(src_path, dest_path)
        saved_files.append(dest_path)

    return f"✅ Uploaded {len(saved_files)} file(s) to '{CONTEXT_DIR}': {', '.join(os.path.basename(f) for f in saved_files)}"
283
+
284
+
285
+
286
+ #########
287
+ # ---------- Graph viz core ----------
288
+ GRAPH_JSON_PATH = "knowledge_graph.json"
289
+ COMMUNITY_MIN_SIZE = 3
290
+ MERGE_SMALLS_POLICY = "bucket" # or 'attach'
291
+ LAYOUT_SEED = 42
292
+ LAYOUT_ITERS = 30
293
+
294
+ # Cached state (simple globals for now)
295
+ _g_G = None
296
+ _g_pos3d = None
297
+ _g_node2comm = None
298
+ _g_comm2nodes = None
299
+ _g_edges = None
300
+ _g_node_names = None
301
+
302
def load_graph_from_json(path=GRAPH_JSON_PATH):
    """
    Read ``{source: [[rel, target], ...], ...}`` from `path` and return a DiGraph.

    Each ``[rel, target]`` pair becomes an edge ``source -> target`` with the
    relation stored in the ``label`` attribute. A missing file yields a graph
    holding only the placeholder node ``"(empty)"``. Unreadable or malformed
    content is reported with a warning and the offending parts are skipped, so
    the viewer still gets a graph to draw.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            graph_dict = json.load(f)
    except FileNotFoundError:
        # Expected before the first knowledge-graph export.
        graph_dict = {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and decoding errors.
        print(f"[Warn] Could not read graph file '{path}': {exc}")
        graph_dict = {}

    if not isinstance(graph_dict, dict):
        print(f"[Warn] Graph file '{path}' does not contain a JSON object; ignoring it.")
        graph_dict = {}

    G = nx.DiGraph()
    for source, edges_list in graph_dict.items():
        if not isinstance(edges_list, list):
            continue
        for entry in edges_list:
            # Skip anything that is not a [relation, target] pair.
            if not isinstance(entry, (list, tuple)) or len(entry) != 2:
                continue
            relation, target = entry
            G.add_edge(source, target, label=relation)

    if G.number_of_nodes() == 0:
        G.add_node("(empty)")
    return G
316
+
317
def precompute_layout_and_communities(G: nx.DiGraph):
    """
    Compute a 3D spring layout and top-level modularity communities for `G`.

    Communities are detected on an undirected graph built from G's edges.
    Those with fewer than ``COMMUNITY_MIN_SIZE`` nodes are handled according to
    ``MERGE_SMALLS_POLICY``: ``"bucket"`` pools them into one ``C_other``
    community; ``"attach"`` merges each into the large community it shares the
    most edges with (first one on ties). Nodes that end up in no community
    (e.g. nodes without edges) are collected under ``C_isolated``.

    Returns:
        (pos3d, node2comm, comm2nodes, edges, node_names)
    """
    from networkx.algorithms.community import greedy_modularity_communities

    pos3d = nx.spring_layout(G, dim=3, seed=LAYOUT_SEED, iterations=LAYOUT_ITERS)
    node_names = list(G.nodes())
    edges = list(G.edges())

    # Greedy modularity communities (on undirected projection)
    UG = nx.Graph()
    UG.add_edges_from(G.to_undirected().edges())
    if UG.number_of_edges() > 0:
        communities = [set(c) for c in greedy_modularity_communities(UG)]
    else:
        # Edgeless graph (e.g. only the "(empty)" placeholder): modularity is
        # undefined and some NetworkX releases divide by zero here.
        communities = []

    large = [c for c in communities if len(c) >= COMMUNITY_MIN_SIZE]
    small = [c for c in communities if len(c) < COMMUNITY_MIN_SIZE]

    comm_ids = [f"C{i}" for i in range(len(large))]
    if MERGE_SMALLS_POLICY == "bucket" and small:
        large.append(set().union(*small))
        comm_ids.append("C_other")
    elif MERGE_SMALLS_POLICY == "attach" and small and large:
        for s in small:
            # Cross-edge count to every large community; the first maximum wins.
            link_counts = [
                sum(1 for u in s for v in members if UG.has_edge(u, v))
                for members in large
            ]
            large[link_counts.index(max(link_counts))].update(s)

    node2comm, comm2nodes = {}, {}
    for cid, nodeset in zip(comm_ids, large):
        comm2nodes[cid] = set(nodeset)
        for n in nodeset:
            node2comm[n] = cid
    for n in G.nodes():
        if n not in node2comm:
            node2comm[n] = "C_isolated"
            comm2nodes.setdefault("C_isolated", set()).add(n)

    return pos3d, node2comm, comm2nodes, edges, node_names
364
+
365
def _make_comm_colors(comm2nodes_dict):
    """
    Map every community id to a colour.

    Ids are taken in sorted order and assigned colours from the concatenation
    of Plotly's Alphabet, Set3, Bold, Dark24 and Light24 qualitative palettes,
    wrapping around when there are more communities than colours.
    """
    qualitative = px.colors.qualitative
    palette = []
    for swatch in (
        qualitative.Alphabet,
        qualitative.Set3,
        qualitative.Bold,
        qualitative.Dark24,
        qualitative.Light24,
    ):
        palette = palette + swatch

    colors = {}
    for position, cid in enumerate(sorted(comm2nodes_dict.keys())):
        colors[cid] = palette[position % len(palette)]
    return colors
373
+
374
def _community_hulls_traces(pos3d, comm2nodes, comm_colors, opacity=0.12):
    """
    Build one translucent convex-hull mesh per community.

    Args:
        pos3d: Mapping node -> 3D position.
        comm2nodes: Mapping community id -> set of nodes.
        comm_colors: Mapping community id -> colour. If None, the palette is
            derived once from `comm2nodes`.
        opacity: Mesh opacity.

    Returns:
        list of ``go.Mesh3d`` traces; empty when SciPy is unavailable.
        Communities with fewer than four positioned nodes, or whose hull
        cannot be computed, are skipped.
    """
    if not SCIPY_AVAILABLE:
        return []

    # Use the caller's colours so hulls match the node traces; previously the
    # argument was ignored and the palette rebuilt for every community.
    colors = comm_colors if comm_colors is not None else _make_comm_colors(comm2nodes)

    hull_traces = []
    for cid, nodeset in comm2nodes.items():
        pts = np.array([pos3d[n] for n in nodeset if n in pos3d])
        if pts.shape[0] < 4:
            continue
        try:
            hull = ConvexHull(pts)
            simplices = hull.simplices
            hull_traces.append(go.Mesh3d(
                x=pts[:, 0], y=pts[:, 1], z=pts[:, 2],
                i=simplices[:, 0], j=simplices[:, 1], k=simplices[:, 2],
                color=colors.get(cid, "#cccccc"),
                opacity=opacity, name=f"{cid} region",
                hoverinfo="skip", showlegend=False
            ))
        except Exception:
            # Hulls are decoration only: a failure for one community (for
            # instance a degenerate point set) must not break the figure.
            continue
    return hull_traces
395
+ #########
396
def build_plotly_figure(mode="community", highlight_node=None,
                        highlight_comm_id=None, dim_inter_edges=True,
                        show_hulls=False):
    """Assemble the 3D knowledge-graph figure from the cached graph data.

    The graph, layout and community tables are loaded into the module-level
    cache on the first call and reused afterwards.

    Args:
        mode: "neighbors" styles nodes relative to ``highlight_node``; any
            other value colours every node by its community.
        highlight_node: Node emphasised in "neighbors" mode.
        highlight_comm_id: Community whose markers are enlarged otherwise.
        dim_inter_edges: Draw inter-community edges semi-transparent.
        show_hulls: Add the community hull meshes.

    Returns:
        A ``go.Figure`` holding hull, edge and node traces, in that order.
    """
    global _g_G, _g_pos3d, _g_node2comm, _g_comm2nodes, _g_edges, _g_node_names

    # Fill the cache lazily on first use.
    if _g_G is None:
        _g_G = load_graph_from_json()
        (_g_pos3d, _g_node2comm, _g_comm2nodes,
         _g_edges, _g_node_names) = precompute_layout_and_communities(_g_G)

    graph = _g_G
    layout = _g_pos3d
    membership = _g_node2comm
    communities = _g_comm2nodes

    # Edge coordinates, kept apart for intra- and inter-community edges.
    # Each edge contributes (start, end, None) per axis.
    intra_xyz = ([], [], [])
    inter_xyz = ([], [], [])
    for (u, v) in _g_edges:
        x0, y0, z0 = layout[u]
        x1, y1, z1 = layout[v]
        same_community = membership.get(u) == membership.get(v)
        target = intra_xyz if same_community else inter_xyz
        target[0].extend([x0, x1, None])
        target[1].extend([y0, y1, None])
        target[2].extend([z0, z1, None])

    edge_traces = []
    if inter_xyz[0]:
        inter_color = "rgba(180,180,180,0.30)" if dim_inter_edges else "#BBBBBB"
        edge_traces.append(go.Scatter3d(
            x=inter_xyz[0], y=inter_xyz[1], z=inter_xyz[2],
            mode="lines",
            line=dict(width=1, color=inter_color),
            hoverinfo="none", showlegend=False, name="Inter-community",
        ))
    if intra_xyz[0]:
        edge_traces.append(go.Scatter3d(
            x=intra_xyz[0], y=intra_xyz[1], z=intra_xyz[2],
            mode="lines",
            line=dict(width=2, color="rgba(120,120,120,0.55)"),
            hoverinfo="none", showlegend=False, name="Intra-community",
        ))

    comm_colors = _make_comm_colors(communities)
    hull_traces = []
    if show_hulls:
        hull_traces = _community_hulls_traces(layout, communities, comm_colors)

    # Neighbour sets are only needed when a known node is highlighted.
    neighbors_mode = mode == "neighbors"
    nbr_succ, nbr_pred = set(), set()
    if neighbors_mode and highlight_node and highlight_node in graph:
        nbr_succ = set(graph.neighbors(highlight_node))
        nbr_pred = set(graph.predecessors(highlight_node))

    def _marker(node, tint):
        """Return the (colour, size) pair for one node marker."""
        if neighbors_mode:
            if highlight_node == node:
                return "red", 8.0
            if node in nbr_succ or node in nbr_pred:
                return "orange", 6.5
            if highlight_node and membership.get(node) == membership.get(highlight_node):
                return tint, 5.5
            return "lightblue", 5.0
        if highlight_comm_id and membership.get(node) == highlight_comm_id:
            return tint, 6.5
        return tint, 5.0

    # One marker trace per community, in sorted community-id order.
    node_traces = []
    for cid, members in sorted(communities.items(), key=lambda item: item[0]):
        tint = comm_colors.get(cid, "#66c2a5")
        xs, ys, zs, labels, colors, sizes = [], [], [], [], [], []
        for node in members:
            x, y, z = layout[node]
            color, size = _marker(node, tint)
            xs.append(x)
            ys.append(y)
            zs.append(z)
            labels.append(node)
            colors.append(color)
            sizes.append(size)
        if not xs:
            continue
        node_traces.append(go.Scatter3d(
            x=xs, y=ys, z=zs, mode="markers",
            hovertext=labels, hoverinfo="text",
            marker=dict(size=sizes, color=colors, opacity=0.95),
            name=cid, showlegend=True,
        ))

    fig = go.Figure(data=hull_traces + edge_traces + node_traces)
    fig.update_layout(
        title="3D Knowledge Graph — Communities & Neighbors",
        showlegend=(mode == "community"),
        height=800,
        margin=dict(l=0, r=0, t=40, b=0),
        scene=dict(
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            zaxis=dict(visible=False),
            aspectmode="data",
        ),
        scene_camera=dict(eye=dict(x=1.5, y=1.5, z=1.5)),
        uirevision=True,
    )
    return fig
492
+
493
def reload_graph_cache():
    """Reload the graph, rebuild the cached layout/communities, return the default figure.

    The module-level cache is overwritten with the result of
    ``load_graph_from_json()`` and ``precompute_layout_and_communities()``.
    """
    global _g_G, _g_pos3d, _g_node2comm, _g_comm2nodes, _g_edges, _g_node_names

    _g_G = load_graph_from_json()
    (_g_pos3d, _g_node2comm, _g_comm2nodes,
     _g_edges, _g_node_names) = precompute_layout_and_communities(_g_G)

    # Hand back the figure in its default (community-coloured) configuration.
    return build_plotly_figure(
        mode="community",
        highlight_comm_id=None,
        dim_inter_edges=True,
        show_hulls=False,
    )
501
+
502
+ #########
503
+
504
+
505
+
506
import base64
from pathlib import Path

# -------------------- Gradio UI --------------------
with gr.Blocks(css=custom_css, fill_height=True, js=js_force_light) as demo:
    with gr.Row():
        # LEFT SIDE: branding + upload
        with gr.Column(scale=1, elem_id="left-column"):
            # Branding row: logo and title side by side
            with gr.Row(elem_id="branding-row"):
                # The logo next to this file is inlined as a base64 data URI.
                HERE = Path(__file__).resolve().parent
                logo_path = HERE / "logo_mono.png"

                with open(logo_path, "rb") as f:
                    encoded = base64.b64encode(f.read()).decode()

                gr.HTML(f"""
                <div id="branding">
                    <img id="company-logo" src="data:image/png;base64,{encoded}" alt="Logo" />
                    <span id="brand-text">mosaiicRAG</span>
                </div>
                """)

            gr.Markdown(
                "<p>Daten verstehen. Wissen vernetzen. Entscheidungen stärken.</p>",
                elem_id="header2"
            )

            # Below branding: upload info
            gr.Markdown(
                """
                **Supported file formats:** .txt, .json, .md, .csv, .pdf, .docx, .pptx, .py

                **Maximum files' size:** 5 MB
                """,
                elem_id="upload-note"
            )

            file_upload = gr.File(
                label="Upload files for RAG context",
                file_count="multiple",
                elem_id="file-upload"
            )
            upload_btn = gr.Button("Upload")
            upload_output = gr.Textbox(label="Upload status", interactive=False)
            upload_btn.click(handle_file_upload, inputs=[file_upload], outputs=[upload_output])

        # RIGHT SIDE: chat and knowledge-graph tabs
        with gr.Column(scale=4, elem_id="right-column"):
            with gr.Tabs():
                # ------------------------- Chat tab -------------------------
                with gr.Tab("Chat"):
                    with gr.Column(elem_id="chat-area"):
                        progress_box = gr.HTML("<div style='background:#f5f5f5;padding:10px;border-radius:8px;margin-bottom:10px;'>Ready</div>")
                        chatbot = gr.Chatbot(type="messages", label="Conversation", elem_id="chatbot")
                        with gr.Row(elem_id="input-row"):
                            msg = gr.Textbox(placeholder="Type your question here...", lines=1)
                            send_btn = gr.Button("➤", elem_id="send-button", size="sm")
                        state = gr.State([])
                        # Enter and the send button trigger the same two-step chain:
                        # record the user message, then generate the answer.
                        msg.submit(add_user_message, inputs=[msg, state], outputs=[msg, chatbot, state])\
                            .then(generate_bot_response, inputs=[state], outputs=[chatbot, state, progress_box])
                        send_btn.click(add_user_message, inputs=[msg, state], outputs=[msg, chatbot, state])\
                            .then(generate_bot_response, inputs=[state], outputs=[chatbot, state, progress_box])

                # --------------------- Knowledge Graph tab ---------------------
                with gr.Tab("Knowledge Graph"):
                    with gr.Row():
                        color_mode = gr.Radio(
                            ["community"],
                            value="community",
                            label="Color mode"
                        )
                        community_select = gr.Dropdown(
                            label="Highlight community (optional)",
                            choices=[],
                            value=None
                        )
                        view_opts = gr.CheckboxGroup(
                            choices=[
                                "Dim inter-community edges",
                                f"Show 3D community hulls{' (requires scipy)' if not SCIPY_AVAILABLE else ''}"
                            ],
                            value=["Dim inter-community edges"],
                            label="View options"
                        )
                        reload_btn = gr.Button("Reload graph")

                    graph_plot = gr.Plot(label="3D Knowledge Graph")
                    node_info = gr.Markdown("")

                    # ---- callbacks bound to the controls above ----
                    def _init_graph():
                        """Rebuild the graph cache and return (figure, dropdown update, info text)."""
                        fig = reload_graph_cache()
                        cids = sorted(_g_comm2nodes) if _g_comm2nodes else []
                        info = "Select a community or click a node to highlight its community."
                        # gr.update replaces the dropdown choices without recreating it
                        return fig, gr.update(choices=cids, value=None), info

                    def _refresh(mode, selected_cid, opts):
                        """Redraw the figure for the current control values; return (figure, info text)."""
                        dim_edges = isinstance(opts, list) and ("Dim inter-community edges" in opts)
                        # Substring match because the hull label may carry a "(requires scipy)" suffix.
                        show_hulls = isinstance(opts, list) and any("Show 3D community hulls" in s for s in opts)

                        fig = build_plotly_figure(
                            mode="community" if mode == "community" else "neighbors",
                            highlight_comm_id=(selected_cid if mode == "community" else None),
                            dim_inter_edges=dim_edges,
                            show_hulls=(show_hulls if mode == "community" else False)
                        )
                        info = (
                            "Select a community or click a node to highlight its community."
                            if mode == "community"
                            else "Click a node to see its neighbors (community tint applied)."
                        )
                        return fig, info

                    def _reload(mode, selected_cid, opts):
                        """Reload the graph data and redraw with the current control values."""
                        reload_graph_cache()
                        cids = sorted(_g_comm2nodes) if _g_comm2nodes else []
                        # The reloaded graph may no longer contain the selected community;
                        # drop a stale selection so the dropdown value stays within its choices.
                        if selected_cid not in cids:
                            selected_cid = None
                        fig, info = _refresh(mode, selected_cid, opts)
                        return fig, gr.update(choices=cids, value=selected_cid), info

                    # wire controls
                    color_mode.change(_refresh, inputs=[color_mode, community_select, view_opts],
                                      outputs=[graph_plot, node_info])
                    community_select.change(_refresh, inputs=[color_mode, community_select, view_opts],
                                            outputs=[graph_plot, node_info])
                    view_opts.change(_refresh, inputs=[color_mode, community_select, view_opts],
                                     outputs=[graph_plot, node_info])

                    reload_btn.click(_reload, inputs=[color_mode, community_select, view_opts],
                                     outputs=[graph_plot, community_select, node_info])

    # Must stay inside the Blocks context: initialise the graph once when the app loads.
    demo.load(_init_graph, inputs=[], outputs=[graph_plot, community_select, node_info])

# -------------------- Launch App --------------------
if __name__ == "__main__":
    demo.launch(inbrowser=True)
rag_on_prem.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+ import gradio as gr
4
+
5
+ # LangChain imports
6
+ from langchain_community.document_loaders import (
7
+ CSVLoader, PyPDFLoader, UnstructuredWordDocumentLoader,
8
+ UnstructuredPowerPointLoader, UnstructuredMarkdownLoader,
9
+ UnstructuredHTMLLoader, NotebookLoader
10
+ )
11
+ from langchain_core.documents import Document
12
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
13
+ from langchain_ollama import ChatOllama, OllamaEmbeddings
14
+ from langchain_community.vectorstores import FAISS
15
+ from langchain_core.messages import HumanMessage, SystemMessage
16
+
17
+ # -------------------------
18
+ # UTF-8 safe Text Loader
19
+ # -------------------------
20
class SafeTextLoader:
    """Minimal loader that wraps a whole text file in a single Document.

    Bytes that are not valid UTF-8 are dropped while decoding. Any failure
    while reading is reported on stdout and results in an empty list.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        print(f"[Debug] Initialized SafeTextLoader with file_path: {file_path}")

    def load(self):
        """Return ``[Document]`` for the file, or ``[]`` if it cannot be read."""
        path = self.file_path
        try:
            print(f"[Debug] Attempting to load file: {path}")
            # Read raw bytes and decode manually so invalid sequences are skipped.
            with open(path, "rb") as handle:
                content = handle.read().decode("utf-8", errors="ignore")
            print(f"[Debug] Successfully loaded file: {path}")
            return [Document(page_content=content, metadata={"source": str(path)})]
        except Exception as exc:
            print(f"[Error] Failed to read {path}: {exc}")
            return []
37
+
38
+ # -------------------------
39
+ # Loader mapping
40
+ # -------------------------
41
# Suffixes whose files are read verbatim as text through SafeTextLoader.
_PLAIN_TEXT_SUFFIXES = (".txt", ".json", ".yaml", ".yml", ".py", ".js", ".sql")

# Lower-case file suffix (including the dot) -> loader class for that file type.
LOADER_MAPPING = {
    **dict.fromkeys(_PLAIN_TEXT_SUFFIXES, SafeTextLoader),

    # Markup / tabular text
    ".md": UnstructuredMarkdownLoader,
    ".csv": CSVLoader,

    # Documents
    ".pdf": PyPDFLoader,
    ".docx": UnstructuredWordDocumentLoader,
    ".pptx": UnstructuredPowerPointLoader,
    ".html": UnstructuredHTMLLoader,
    ".htm": UnstructuredHTMLLoader,

    # Notebooks
    ".ipynb": NotebookLoader,
}
63
+
64
+ # -------------------------
65
+ # Dataset creation
66
+ # -------------------------
67
def create_dataset(directory_path: str = "context"):
    """Recursively load every supported file below ``directory_path``.

    Files whose suffix has no entry in ``LOADER_MAPPING`` are skipped, and a
    file whose loader raises is reported and skipped as well.

    Returns:
        The list of loaded documents; empty when the directory is missing.
    """
    print(f"[Debug] Starting dataset creation for directory: {directory_path}")
    target_dir = pathlib.Path(directory_path).resolve()

    if not (target_dir.exists() and target_dir.is_dir()):
        print(f"[Error] Target directory does not exist: {target_dir}")
        return []

    documents = []
    for file_path in target_dir.rglob("*"):  # walks sub-directories too
        if not file_path.is_file():
            continue

        loader_cls = LOADER_MAPPING.get(file_path.suffix.lower())
        if loader_cls is None:
            print(f"[Skip] Unsupported file type: {file_path}")
            continue

        try:
            print(f"[Debug] Loading file: {file_path}")
            docs = loader_cls(str(file_path)).load()
            documents.extend(docs)
            print(f"[Loaded] {file_path} ({len(docs)} docs)")
        except Exception as e:
            # One unreadable file must not abort the whole scan.
            print(f"[Error] Failed to load {file_path}: {e}")

    print(f"[Done] Finished scanning {target_dir}")
    print(f"Total documents loaded: {len(documents)}")
    return documents
101
+
102
+ # -------------------------
103
+ # Prepare RAG (Ollama + FAISS)
104
+ # -------------------------
105
def prepare_RAG(dir_name="context", chunk_size=600, chunk_overlap=50,
                embedding_model="nomic-embed-text", llm_model="llama3"):
    """Load the context documents, index them in FAISS and create the chat model.

    Args:
        dir_name: Directory whose files are loaded via ``create_dataset``.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        embedding_model: Ollama model name used for the embeddings.
        llm_model: Ollama model name used for answer generation.

    Returns:
        A ``(vectorstore, llm)`` tuple.

    Raises:
        ValueError: If no documents could be loaded from ``dir_name``.
    """
    print(f"[Debug] Preparing RAG with Ollama + FAISS. Context dir={dir_name}")

    documents = create_dataset(dir_name)
    if not documents:
        raise ValueError("No documents loaded. Please add files to the context directory.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print(f"[Debug] Splitting documents into chunks with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}")
    all_splits = text_splitter.split_documents(documents)
    print(f"[Debug] Number of chunks created: {len(all_splits)}")

    # Ollama embeddings
    print("[Debug] Initializing Ollama embeddings")
    embeddings = OllamaEmbeddings(model=embedding_model)

    # FAISS vector store built from the chunks
    print("[Debug] Creating FAISS vector store")
    vectorstore = FAISS.from_documents(all_splits, embeddings)

    # Ollama chat model
    print("[Debug] Initializing Ollama LLM")
    llm = ChatOllama(model=llm_model)

    return vectorstore, llm
131
+
132
+ # -------------------------
133
+ # Retrieval
134
+ # -------------------------
135
def retrieve_RAG(query, vectorstore, top_k=5):
    """Return the ``top_k`` documents the vector store ranks closest to ``query``.

    Args:
        query: The user question to search for.
        vectorstore: Vector store exposing ``as_retriever``.
        top_k: Number of documents to request.

    Returns:
        The list of retrieved documents.
    """
    print(f"[Debug] Retrieving top {top_k} documents for query: {query}")
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
    # invoke() is the supported retriever entry point; get_relevant_documents()
    # is deprecated and absent from newer langchain-core releases.
    results = retriever.invoke(query)
    print(f"[Debug] Retrieved {len(results)} documents")
    return results
141
+
142
+ # -------------------------
143
+ # Generation
144
+ # -------------------------
145
def generate_RAG(prompt_message, llm, retrieved_docs):
    """Answer ``prompt_message`` with ``llm`` using the retrieved documents as context.

    The page contents of ``retrieved_docs`` are joined into one context string
    and sent with the question; the system message restricts the model to
    that context.

    Returns:
        The response object returned by ``llm.invoke``.
    """
    print(f"[Debug] Generating response for prompt: {prompt_message}")

    system_text = (
        "You are an expert assistant. Use ONLY the provided context documents "
        "to answer the question. If the context does not contain the answer, reply with 'I don’t know'."
    )
    context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
    print(f"[Debug] Context for generation: {context_text[:500]}... (truncated)")

    user_text = f"Context:\n{context_text}\n\nQuestion: {prompt_message}"
    messages = [SystemMessage(content=system_text), HumanMessage(content=user_text)]

    response = llm.invoke(messages)
    print(f"[Debug] Generated response: {response.content}")
    return response
163
+
164
+ # -------------------------
165
+ # Gradio Chatbot
166
+ # -------------------------
167
def run_chatbot(user_dir="context"):
    """Build the RAG pipeline for ``user_dir`` and launch the Gradio chat UI."""
    print(f"[Debug] Starting chatbot with user_dir: {user_dir}")
    vectorstore, llm = prepare_RAG(dir_name=user_dir)

    def add_user_message(message, history):
        """Step 1: append the user's message; returns (cleared textbox, chat, state)."""
        print(f"[Debug] Adding user message: {message}")
        if not history:
            history = []
        history.append({"role": "user", "content": message})
        return "", history, history

    def generate_bot_response(history):
        """Step 2: answer the latest user message; returns (chat, state)."""
        awaiting_reply = bool(history) and history[-1]["role"] == "user"
        if not awaiting_reply:
            print("[Debug] No user message to respond to.")
            return history, history

        question = history[-1]["content"]
        print(f"[Debug] Generating response for user message: {question}")
        docs = retrieve_RAG(question, vectorstore)
        answer = generate_RAG(question, llm, docs)

        history.append({"role": "assistant", "content": answer.content})
        return history, history

    with gr.Blocks() as demo:
        gr.Markdown("# 📚 On-Prem RAG Chatbot (Ollama + FAISS)")
        gr.Markdown("Ask questions about your local documents.")

        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(label="Your message")
        state = gr.State([])

        # Submitting records the message first, then generates the reply.
        submitted = msg.submit(add_user_message, inputs=[msg, state], outputs=[msg, chatbot, state])
        submitted.then(generate_bot_response, inputs=[state], outputs=[chatbot, state])

    demo.launch()
204
+
205
+ # -------------------------
206
+ # Main
207
+ # -------------------------
208
if __name__ == "__main__":
    # An empty answer selects the whole 'context' folder, otherwise a subfolder of it.
    user_input = input("Enter a subfolder inside 'context' (press Enter for full 'context'): ").strip()
    user_dir = os.path.join("context", user_input) if user_input else "context"

    print(f"[Info] Using context directory: {user_dir}")
    run_chatbot(user_dir)
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-core
3
+ langchain-community
4
+ langchain-openai
5
+ langchain-mistralai
6
+ langchain-text-splitters
7
+ llama-index==0.14.5
8
+ llama-index-llms-langchain==0.7.1
9
+ datasets==4.1.0
10
+ ragas==0.3.5
11
+ langgraph==0.6.7
12
+ gradio==5.46.0
13
+ python-dotenv==1.1.1
14
+ nbformat==5.10.4
15
+ nbconvert==7.16.6
16
+ pypdf==6.0.0
17
+ unstructured[docx,pptx,html,md]==0.18.15
18
+ pinecone==7.3.0
19
+ plotly
retrieve.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+ import time
4
+ import re
5
+ from pinecone import Pinecone
6
+
7
+ from langchain_mistralai import ChatMistralAI
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain_core.messages import HumanMessage, SystemMessage
10
+ from langchain.schema import Document
11
+ from langchain_community.document_loaders import (
12
+ CSVLoader, PyPDFLoader, UnstructuredWordDocumentLoader,
13
+ UnstructuredPowerPointLoader, UnstructuredMarkdownLoader,
14
+ UnstructuredHTMLLoader, NotebookLoader
15
+ )
16
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
17
+
18
+ from llama_index.core.memory import Memory
19
+
20
+ import pickle
21
+
22
+ import json
23
+ from typing import List, Any
24
+ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage
25
+
26
+ from typing import List, Any
27
+ from pydantic import BaseModel, ValidationError
28
+
29
+
30
+
31
+
32
+
33
def retrieve_RAG(
    prompt_message, pc, index, kg_index, top_k=5, info=True,
    use_query_reformulation=False, llm=None, graphRAG=False,
):
    """Retrieve document chunks and, optionally, community summaries for a prompt.

    When ``llm`` is given, the prompt is first split into sub-queries, and each
    sub-query may additionally be reformulated for vector search. Every
    sub-query is embedded through ``pc`` and looked up in ``index``.

    Args:
        prompt_message: The user's question.
        pc: Client whose ``inference.embed`` produces the query embedding.
        index: Index object queried for chunks and community summaries.
        kg_index: Kept for interface compatibility; not used for retrieval.
        top_k: Number of chunks requested per sub-query.
        info: Print debug output when true.
        use_query_reformulation: Reformulate each sub-query with ``llm``.
        llm: Optional chat model used for splitting and reformulation.
        graphRAG: Also query the "community-summaries" namespace.

    Returns:
        ``(chunks, graph_context)``: ``chunks`` is a list of dicts with the keys
        "text", "source" and "sub_query"; ``graph_context`` is one string with
        the community summaries of all sub-queries ("" when there are none).
    """
    import os
    import re

    CHUNK_NAMESPACE = "example-namespace"
    COMMUNITY_NAMESPACE = "community-summaries"
    TOP_K_SUMMARIES = 5

    if info:
        print("[Debug] Starting retrieval with prompt:", prompt_message)
        print("[Debug] Top K:", top_k)
        print("[Debug] Query Reformulation Enabled:", use_query_reformulation)

    # --- Step 1: Use the LLM to split the prompt into sub-queries ---
    sub_queries = [prompt_message]  # fallback: single query
    if llm is not None:
        try:
            split_prompt = (
                "Given the following user query, identify and list all distinct sub-queries or tasks it contains. "
                "Return ONLY a numbered list of sub-queries, each as a concise phrase.\n\n"
                f"User Query: {prompt_message}"
            )
            split_response = llm.invoke(split_prompt)
            # Pick up the text of every "N. ..." list item.
            sub_queries = re.findall(r"\d+\.\s*(.+)", split_response.content)
            if not sub_queries:
                sub_queries = [prompt_message]
            if info:
                print(f"[Debug] Identified sub-queries: {sub_queries}")
        except Exception as e:
            print(f"[Error] Sub-query splitting failed: {e}")

    all_retrieved_chunks = []
    all_graph_context_blocks = []

    # --- Step 2: Retrieve context for each sub-query ---
    for idx, sub_query in enumerate(sub_queries):
        task_prompt = sub_query.strip()

        # Optional query reformulation
        if use_query_reformulation and llm is not None:
            try:
                reformulation_prompt = (
                    "Reformulate the following query to focus only on the key concepts and remove any unnecessary details. "
                    "It should be suitable for vector search in RAG retrieval:\n\n"
                    f"Original Query: {task_prompt}"
                )
                reformulated_response = llm.invoke(reformulation_prompt)
                task_prompt = reformulated_response.content.strip()
                if info:
                    print(f"[Debug] Reformulated Query for sub-query {idx+1}: {task_prompt}")
            except Exception as e:
                print(f"[Error] Query reformulation failed for sub-query {idx+1}: {e}")

        # Embed the sub-query
        query_embedding = pc.inference.embed(
            model="llama-text-embed-v2",
            inputs=[task_prompt],
            parameters={"input_type": "query"}
        )
        if info:
            print(f"[Debug] Query embedding generated for sub-query {idx+1}.")
        qvec = query_embedding[0].values

        # --- Standard document chunks ---
        try:
            retrieved_chunks_raw = index.query(
                namespace=CHUNK_NAMESPACE,
                vector=qvec,
                top_k=top_k,
                include_values=False,
                include_metadata=True
            )
            retrieved_chunks = []
            for match in retrieved_chunks_raw.matches:
                meta = match.metadata or {}
                text = meta.get("text", "")
                source = meta.get("source", "Unknown source")
                retrieved_chunks.append({
                    "text": text,
                    "source": source,
                    "sub_query": sub_query
                })
                # Logged per match, so an empty result set logs nothing
                # instead of failing on unbound loop variables.
                if info:
                    print(f"[Debug] Match processed for sub-query {idx+1}: text='{text[:50]}...', source='{source}'")
            # Added only after the whole result set was processed.
            all_retrieved_chunks.extend(retrieved_chunks)
        except Exception as e:
            print(f"[Error] Standard retrieval failed for sub-query {idx+1}: {e}")

        # --- Community summaries (graph context) ---
        if graphRAG:
            try:
                comm_matches = index.query(
                    namespace=COMMUNITY_NAMESPACE,
                    vector=qvec,
                    top_k=TOP_K_SUMMARIES,
                    include_values=False,
                    include_metadata=True
                )
                blocks = []
                for m in comm_matches.matches:
                    meta = m.metadata or {}
                    txt = meta.get("text", "")
                    cid = meta.get("community_id", "NA")
                    level = meta.get("level", -1)
                    size = meta.get("size", 0)
                    blocks.append(f"[Community {cid} \n level={level} \n size={size}]\n{txt}")
                graph_context_str = ("\n\n---\n\n").join(blocks)
                all_graph_context_blocks.append((sub_query, graph_context_str))
                if info:
                    print(f"[Community] Retrieved {len(blocks)} community summaries for sub-query {idx+1}.")
            except Exception as e:
                print(f"[Error] Community summaries retrieval failed for sub-query {idx+1}: {e}")

    # --- Step 3: Aggregate results ---
    combined_graph_context = "\n\n====\n\n".join(
        f"Sub-query: {sub_query}\n{context}"
        for (sub_query, context) in all_graph_context_blocks if context
    )

    if info:
        sources = {os.path.basename(chunk['source']) for chunk in all_retrieved_chunks}
        print(f"[Debug] Final retrieval: {len(all_retrieved_chunks)} chunks from {len(sources)} sources, "
              f"graph context length {len(combined_graph_context)}.")

    return all_retrieved_chunks, combined_graph_context
179
+
180
+