Nadezhda Komarova committed on
Commit ·
4be6b01
1
Parent(s): af4d3bd
first commit
Browse files- .gitignore +213 -0
- README copy.md +227 -0
- app.css +485 -0
- generate.py +271 -0
- logo_mono.png +0 -0
- prepare.py +896 -0
- rag_execute.py +647 -0
- rag_on_prem.py +216 -0
- requirements.txt +19 -0
- retrieve.py +180 -0
.gitignore
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
|
| 209 |
+
# Project-specific ignores
|
| 210 |
+
#/context/
|
| 211 |
+
|
| 212 |
+
# Ignore gradio
|
| 213 |
+
.gradio/
|
README copy.md
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: RAG-LangChain-Gradio
|
| 3 |
+
app_file: rag_execute.py
|
| 4 |
+
sdk: gradio
|
| 5 |
+
sdk_version: 5.46.0
|
| 6 |
+
---
|
| 7 |
+
# Retrieval-Augmented Generation (RAG)
|
| 8 |
+
|
| 9 |
+
Dieses Projekt implementiert ein **Retrieval-Augmented Generation (RAG)**-System unter Verwendung von LangChain, Mistral/OpenAI LLMs und Pinecone für die Vektordatenbanksuche. Das System ermöglicht es, Dokumente abzufragen und kontextbewusste Antworten über eine Chat-ähnliche Schnittstelle zu generieren.
|
| 10 |
+
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## Inhaltsverzeichnis
|
| 14 |
+
|
| 15 |
+
1. [Übersicht](#übersicht)
|
| 16 |
+
2. [Anforderungen](#anforderungen)
|
| 17 |
+
3. [Einrichtung](#einrichtung)
|
| 18 |
+
4. [Vorbereitung der API-Schlüssel](#vorbereitung-der-api-schlüssel)
|
| 19 |
+
5. [Erstellung des Pinecone-Indexes](#erstellung-des-pinecone-indexes)
|
| 20 |
+
6. [Ausführung des RAG-Systems](#ausführung-des-rag-systems)
|
| 21 |
+
7. [Code-Struktur](#code-struktur)
|
| 22 |
+
8. [Prozessbeschreibung](#prozessbeschreibung)
|
| 23 |
+
9. [Evaluation](#evaluation)
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
## Übersicht
|
| 28 |
+
|
| 29 |
+
Dieses RAG-System arbeitet in drei Schritten:
|
| 30 |
+
|
| 31 |
+
1. **Dokumente einbetten**: Rohtextdateien in Chunks aufteilen und mit dem `llama-text-embed-v2` Embedding-Modell in Pinecone einbetten.
|
| 32 |
+
2. **Abruf (Retrieval)**: Semantische Suche in der Pinecone-Vektordatenbank durchführen, um die relevantesten Chunks für eine Anfrage abzurufen.
|
| 33 |
+
3. **Generierung**: Übergabe der Anfrage und des abgerufenen Kontexts an ein LLM (Mistral oder OpenAI), um die Antwort zu generieren.
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Anforderungen
|
| 38 |
+
|
| 39 |
+
Python 3.10+ empfohlen.
|
| 40 |
+
|
| 41 |
+
Enthaltene Abhängigkeiten:
|
| 42 |
+
|
| 43 |
+
* `langchain-community`
|
| 44 |
+
* `langchain-core`
|
| 45 |
+
* `langchain-text-splitters`
|
| 46 |
+
* `langchain-mistralai`
|
| 47 |
+
* `langchain-openai`
|
| 48 |
+
* `ragas`
|
| 49 |
+
* `datasets`
|
| 50 |
+
* `pinecone-client`
|
| 51 |
+
* `gradio`
|
| 52 |
+
* `python-dotenv`
|
| 53 |
+
* `pypdf`
|
| 54 |
+
* `pandas`
|
| 55 |
+
* `nbformat`
|
| 56 |
+
* `nbconvert`
|
| 57 |
+
* `unstructured` (mit Extras für `docx`, `pptx`, `html`, `md`)
|
| 58 |
+
|
| 59 |
+
Alle Abhängigkeiten sollten installiert werden, z. B. mit dem folgenden Befehl.
|
| 60 |
+
```
|
| 61 |
+
pip install langchain-mistralai langchain-community datasets ragas langchain-openai langchain-text-splitters langchain-core pinecone-client langgraph pypdf pandas gradio python-dotenv nbformat nbconvert "unstructured[docx,pptx,html,md]"
|
| 62 |
+
```
|
| 63 |
+
Da eine requirements.txt-Datei vorhanden ist, kann die Installation auch durch folgenden Befehl durchgeführt werden:
|
| 64 |
+
```
|
| 65 |
+
pip install -r requirements.txt
|
| 66 |
+
```
|
| 67 |
+
---
|
| 68 |
+
|
| 69 |
+
## Einrichtung
|
| 70 |
+
|
| 71 |
+
1. Repository klonen.
|
| 72 |
+
2. Erstellen Sie eine `.env`-Datei im Hauptverzeichnis mit den folgenden Schlüsseln:
|
| 73 |
+
|
| 74 |
+
```dotenv
|
| 75 |
+
MISTRAL_API_KEY=<Ihr-mistral-api-schlüssel>
|
| 76 |
+
OPENAI_API_KEY=<Ihr-openai-api-schlüssel>
|
| 77 |
+
PINECONE_API=<Ihr-pinecone-api-schlüssel>
|
| 78 |
+
INDEX_NAME=<Ihr-pinecone-index-name>
|
| 79 |
+
DIRNAME=<Pfad-zu-Kontextdokumenten>
|
| 80 |
+
MODELNAME=<LLM-Modellname>
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
* `MISTRAL_API_KEY` – Ihr API-Schlüssel für Mistral-Modelle.
|
| 84 |
+
* `OPENAI_API_KEY` – Ihr API-Schlüssel für OpenAI-Modelle. Es reicht aus, einen der OpenAI-/Mistral-Schlüssel (je nach ausgewähltem Modell) festzulegen.
|
| 85 |
+
* `PINECONE_API` – API-Schlüssel für Pinecone, um Vektoren zu speichern und abzufragen.
|
| 86 |
+
* `INDEX_NAME` – Name des Pinecone-Indexes, in dem die Dokumente abgelegt werden.
|
| 87 |
+
* `DIRNAME` – Unterordner innerhalb des festen `context/`-Verzeichnisses.
|
| 88 |
+
|
| 89 |
+
* Wenn leer (`DIRNAME=`), werden alle Dokumente in `context/` verarbeitet.
|
| 90 |
+
* Beispiel: `DIRNAME=llm_context` verarbeitet nur `context/llm_context/`.
|
| 91 |
+
* `MODELNAME` – LLM-Modell, z. B. `gpt-5-nano` für OpenAI oder `mistral-large-latest` für Mistral.
|
| 92 |
+
---
|
| 93 |
+
|
| 94 |
+
## Vorbereitung der API-Schlüssel
|
| 95 |
+
|
| 96 |
+
### Mistral
|
| 97 |
+
|
| 98 |
+
1. Registrieren bei [Mistral AI](https://www.mistral.ai).
|
| 99 |
+
2. API-Schlüssel erstellen.
|
| 100 |
+
3. In `.env` unter `MISTRAL_API_KEY` eintragen.
|
| 101 |
+
|
| 102 |
+
### OpenAI
|
| 103 |
+
|
| 104 |
+
1. Registrieren bei [OpenAI](https://platform.openai.com/).
|
| 105 |
+
2. API-Schlüssel erstellen.
|
| 106 |
+
3. In `.env` unter `OPENAI_API_KEY` eintragen.
|
| 107 |
+
|
| 108 |
+
### Pinecone
|
| 109 |
+
|
| 110 |
+
1. Registrieren bei [Pinecone](https://www.pinecone.io/).
|
| 111 |
+
2. API-Schlüssel erstellen.
|
| 112 |
+
3. In `.env` unter `PINECONE_API` eintragen.
|
| 113 |
+
4. Einen **Index** erstellen (z. B. `use-cases-index`) mit dem Embedding-Modell `llama-text-embed-v2`.
|
| 114 |
+
|
| 115 |
+
---
|
| 116 |
+
|
| 117 |
+
## Erstellung des Pinecone-Indexes
|
| 118 |
+
|
| 119 |
+
Das RAG-Vorbereitungsskript (`rag_func.py`) führt automatisch folgende Schritte aus:
|
| 120 |
+
|
| 121 |
+
1. Verbindung zu Pinecone über den API-Schlüssel herstellen.
|
| 122 |
+
2. Dokumente aus dem Verzeichnis `DIRNAME` laden.
|
| 123 |
+
3. Dokumente in Chunks aufteilen mit `RecursiveCharacterTextSplitter` (Standard: 1800 Tokens pro Chunk, 200 Tokens Überlappung).
|
| 124 |
+
4. Chunks mit `llama-text-embed-v2` einbetten.
|
| 125 |
+
5. Einbettungen in Batches in den Pinecone-Index hochladen.
|
| 126 |
+
|
| 127 |
+
> **Tipp:** Wenn alle Dokumente bereits hochgeladen wurden, kann `DIRNAME` leer bleiben, und das System überspringt die Dokumentenverarbeitung.
|
| 128 |
+
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
## Ausführung des RAG-Systems
|
| 132 |
+
|
| 133 |
+
Hauptskript ausführen:
|
| 134 |
+
|
| 135 |
+
```bash
|
| 136 |
+
python rag_execute.py
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
Dies bewirkt:
|
| 140 |
+
|
| 141 |
+
1. Laden der Dokumente (falls vorhanden) und Vorbereitung des RAG-Systems.
|
| 142 |
+
2. Starten einer **Chat-Schnittstelle** mit Gradio, in der Sie Fragen stellen können.
|
| 143 |
+
3. Abrufen relevanter Chunks aus Pinecone.
|
| 144 |
+
4. Generieren von Antworten mit dem ausgewählten LLM.
|
| 145 |
+
|
| 146 |
+
### Chat-Schnittstelle
|
| 147 |
+
|
| 148 |
+
* Geben Sie eine Anfrage in das Textfeld ein.
|
| 149 |
+
* Das System ruft den Kontext ab und erstellt eine Antwort.
|
| 150 |
+
* Antworten werden im Chat-Format angezeigt.
|
| 151 |
+
|
| 152 |
+
---
|
| 153 |
+
|
| 154 |
+
## Code-Struktur
|
| 155 |
+
|
| 156 |
+
```
|
| 157 |
+
├─ rag_execute.py # Hauptskript für RAG mit Gradio-Schnittstelle
|
| 158 |
+
├─ rag_func.py # Funktionen für RAG-Vorbereitung, Abruf und Generierung
|
| 159 |
+
├─ .env # Umgebungsvariablen (API-Schlüssel, Index, Modell, Verzeichnis)
|
| 160 |
+
├─ context/ # Ordner mit Rohdokumenten (kann leer sein, falls bereits hochgeladen)
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
## Prozessbeschreibung
|
| 166 |
+
|
| 167 |
+
1. **LLM auswählen**: Mistral oder OpenAI.
|
| 168 |
+
2. **Dokumente vorbereiten**:
|
| 169 |
+
|
| 170 |
+
* Text- oder JSON-Dokumente in den Ordner `DIRNAME` legen.
|
| 171 |
+
* Wenn alle Dokumente bereits hochgeladen sind, kann der Ordner leer bleiben.
|
| 172 |
+
3. **Pinecone-Index erstellen**:
|
| 173 |
+
|
| 174 |
+
* Index in `.env` benennen (`INDEX_NAME`).
|
| 175 |
+
* Embedding-Modell `llama-text-embed-v2` verwenden.
|
| 176 |
+
4. **Dokumente aufteilen**: Mit `RecursiveCharacterTextSplitter`.
|
| 177 |
+
5. **Chunks einbetten**: Zur semantischen Suche an Pinecone senden.
|
| 178 |
+
6. **Relevante Chunks abrufen**: Bei einer Anfrage.
|
| 179 |
+
7. **Antwort generieren**: Anfrage + Kontext an LLM übergeben.
|
| 180 |
+
8. **Ergebnis zurückgeben**: Im Chat-Interface anzeigen.
|
| 181 |
+
|
| 182 |
+
---
|
| 183 |
+
|
| 184 |
+
## Evaluation
|
| 185 |
+
|
| 186 |
+
Das System unterstützt Evaluation mit **RAGAS**:
|
| 187 |
+
|
| 188 |
+
1. `generate_dataset()` ruft Kontext ab und generiert Antworten.
|
| 189 |
+
2. `evaluate_RAG()` berechnet **Faithfulness** und andere Metriken.
|
| 190 |
+
3. Ergebnisse werden zur Analyse ausgegeben.
|
| 191 |
+
|
| 192 |
+
---
|
| 193 |
+
|
| 194 |
+
## Hinweise
|
| 195 |
+
|
| 196 |
+
* Retry-Logik für Pinecone-Operationen ist implementiert, um Netzwerkfehler abzufangen.
|
| 197 |
+
* Chunk-Größe und Überlappung können in `prepare_RAG()` für größere oder kleinere Kontextgranularität angepasst werden.
|
| 198 |
+
|
| 199 |
+
---
|
| 200 |
+
|
| 201 |
+
## Beispiel
|
| 202 |
+
|
| 203 |
+
```python
|
| 204 |
+
from rag_func import prepare_RAG, retrieve_RAG, generate_RAG
|
| 205 |
+
import os
|
| 206 |
+
|
| 207 |
+
index, pc, llm = prepare_RAG(
|
| 208 |
+
pinecone_API=os.getenv("PINECONE_API"),
|
| 209 |
+
index_name=os.getenv("INDEX_NAME"),
|
| 210 |
+
llm_model=os.getenv("MODELNAME"),
|
| 211 |
+
dir_name=os.getenv("DIRNAME")
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
query = "Liste typische Anwendungsfälle von GenAI im Telekommunikationsbereich auf."
|
| 215 |
+
retrieved_chunks = retrieve_RAG(query, pc, index)
|
| 216 |
+
response = generate_RAG(query, llm, retrieved_chunks)
|
| 217 |
+
print(response.content)
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
---
|
| 221 |
+
|
| 222 |
+
## Referenzen
|
| 223 |
+
|
| 224 |
+
* [LangChain RAG Tutorial](https://python.langchain.com/docs/tutorials/rag/)
|
| 225 |
+
* [Pinecone Dokumentation](https://docs.pinecone.io)
|
| 226 |
+
* [RAGAS Evaluation](https://docs.ragas.io/en/stable/getstarted/evals/)
|
| 227 |
+
|
app.css
ADDED
|
@@ -0,0 +1,485 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
:root {
|
| 2 |
+
--brand-blue: #17428f;
|
| 3 |
+
--brand-orange: #f39719;
|
| 4 |
+
--text-dark: #111827; /* very dark grey (near black) */
|
| 5 |
+
--text-gray: #4B5563; /* medium grey for messages */
|
| 6 |
+
color-scheme: only light;
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
+
/* Base */
|
| 10 |
+
body, .gradio-container {
|
| 11 |
+
/* Default Gradio font will be used */
|
| 12 |
+
background: linear-gradient(135deg, var(--brand-blue) 0%, var(--brand-orange) 100%);
|
| 13 |
+
min-height: 100vh;
|
| 14 |
+
color: var(--text-dark);
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
/* Logo size */
|
| 18 |
+
|
| 19 |
+
/* Logo size */
|
| 20 |
+
/*#company-logo img {
|
| 21 |
+
width: 40px !important;
|
| 22 |
+
min-width: 40px !important;
|
| 23 |
+
height: auto !important;
|
| 24 |
+
object-fit: contain !important;
|
| 25 |
+
display: block !important;
|
| 26 |
+
}*/
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
/* Top bar transparent */
|
| 30 |
+
#topbar { background: transparent !important; }
|
| 31 |
+
|
| 32 |
+
/* Header text over gradient */
|
| 33 |
+
#header h1, #header h2, #header h3, #header h4, #header h5, #header h6,
|
| 34 |
+
#header p {
|
| 35 |
+
color: #ffffff;
|
| 36 |
+
text-align: center;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
#header2 h1, #header2 h2, #header2 h3, #header2 h4, #header2 h5, #header2 h6,
|
| 40 |
+
#header2 p {
|
| 41 |
+
color: #ffffff;
|
| 42 |
+
text-align: center;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
/* Chatbox container */
|
| 46 |
+
#chatbot {
|
| 47 |
+
height: 100%;
|
| 48 |
+
border-radius: 14px;
|
| 49 |
+
border: 2px solid var(--brand-blue);
|
| 50 |
+
background-color: #ffffff;
|
| 51 |
+
padding: 8px;
|
| 52 |
+
overflow-y: auto;
|
| 53 |
+
|
| 54 |
+
/* Icon tint tokens for local use (chat area) */
|
| 55 |
+
--icon-light: #9CA3AF;
|
| 56 |
+
--icon-hover: #6B7280;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
/* ----------------------------- */
|
| 60 |
+
/* TEXT COLORING (SAFE FOR CODE) */
|
| 61 |
+
/* ----------------------------- */
|
| 62 |
+
|
| 63 |
+
/* Apply the gray text color only at the message container level.
|
| 64 |
+
Do NOT set color on descendants or code tokens. */
|
| 65 |
+
#chatbot .message {
|
| 66 |
+
color: var(--text-gray); /* no !important */
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
/* Bubble styling */
|
| 70 |
+
#chatbot .message.user {
|
| 71 |
+
background: #fff4e1;
|
| 72 |
+
border-radius: 10px;
|
| 73 |
+
padding: 6px 12px;
|
| 74 |
+
text-align: right;
|
| 75 |
+
}
|
| 76 |
+
#chatbot .message.bot {
|
| 77 |
+
background: #f0f0f0;
|
| 78 |
+
border-radius: 10px;
|
| 79 |
+
padding: 6px 12px;
|
| 80 |
+
text-align: left;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
/* Markdown horizontal rules inside chatbot */
|
| 84 |
+
#chatbot hr {
|
| 85 |
+
margin: 6px 0; /* reduce extra space */
|
| 86 |
+
border: none; /* remove default bevel */
|
| 87 |
+
border-top: 1px solid #d1d5db; /* subtle gray line */
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
/* Fallback selectors for other Gradio versions */
|
| 91 |
+
#chatbot [data-testid*="message"] {
|
| 92 |
+
border-radius: 10px;
|
| 93 |
+
padding: 6px 12px;
|
| 94 |
+
}
|
| 95 |
+
#chatbot [data-testid="user-message"] {
|
| 96 |
+
background: #fff4e1;
|
| 97 |
+
text-align: right;
|
| 98 |
+
}
|
| 99 |
+
#chatbot [data-testid="assistant-message"] {
|
| 100 |
+
background: #f0f0f0;
|
| 101 |
+
text-align: left;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
/* ----------------------------- */
|
| 105 |
+
/* CODE BLOCKS (DO NOT SET COLOR)*/
|
| 106 |
+
/* ----------------------------- */
|
| 107 |
+
|
| 108 |
+
/* Give code blocks a readable container without touching colors.
|
| 109 |
+
This preserves syntax highlighting from highlight.js or Prism. */
|
| 110 |
+
#chatbot pre,
|
| 111 |
+
#chatbot pre code,
|
| 112 |
+
#chatbot code[class*="language-"],
|
| 113 |
+
#chatbot pre[class*="language-"],
|
| 114 |
+
#chatbot code.hljs {
|
| 115 |
+
background: #f8fafc; /* light neutral background */
|
| 116 |
+
border-radius: 8px;
|
| 117 |
+
padding: 10px 12px;
|
| 118 |
+
display: block;
|
| 119 |
+
overflow-x: auto;
|
| 120 |
+
/* IMPORTANT: no 'color' declaration here */
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
/* Inline code (single backticks) */
|
| 124 |
+
#chatbot :not(pre) > code {
|
| 125 |
+
background: #f1f5f9;
|
| 126 |
+
padding: 0.15rem 0.35rem;
|
| 127 |
+
border-radius: 6px;
|
| 128 |
+
/* no 'color' here */
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
/* ---------------------------------- */
|
| 132 |
+
/* Inputs */
|
| 133 |
+
/* ---------------------------------- */
|
| 134 |
+
input[type="text"], textarea, .gr-text-input input, .gr-textbox textarea {
|
| 135 |
+
border-radius: 10px;
|
| 136 |
+
padding: 10px;
|
| 137 |
+
font-size: 16px;
|
| 138 |
+
border: 2px solid var(--brand-orange);
|
| 139 |
+
}
|
| 140 |
+
input:focus, textarea:focus, .gr-text-input input:focus, .gr-textbox textarea:focus {
|
| 141 |
+
border-color: var(--brand-blue);
|
| 142 |
+
outline: none;
|
| 143 |
+
box-shadow: 0 0 6px rgba(23, 66, 143, 0.5);
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
/* ---------------------------------- */
|
| 147 |
+
/* Buttons (global gradient) */
|
| 148 |
+
/* ---------------------------------- */
|
| 149 |
+
.gr-button, button {
|
| 150 |
+
border-radius: 10px;
|
| 151 |
+
font-weight: 600;
|
| 152 |
+
background: linear-gradient(90deg, var(--brand-blue), var(--brand-orange));
|
| 153 |
+
color: white;
|
| 154 |
+
border: none;
|
| 155 |
+
}
|
| 156 |
+
.gr-button:hover, button:hover {
|
| 157 |
+
transform: translateY(-2px);
|
| 158 |
+
box-shadow: 0 4px 8px rgba(0,0,0,0.2);
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
/* ---------------------------------- */
|
| 162 |
+
/* Chat area: icon-only buttons */
|
| 163 |
+
/* ---------------------------------- */
|
| 164 |
+
|
| 165 |
+
/* Tint SVG icons */
|
| 166 |
+
#chatbot button svg,
|
| 167 |
+
#chatbot [role="button"] svg,
|
| 168 |
+
#chatbot .icon svg,
|
| 169 |
+
#chatbot [class*="icon"] svg,
|
| 170 |
+
#chatbot [data-testid*="icon"] svg,
|
| 171 |
+
#chatbot [data-testid*="message"] .tools svg,
|
| 172 |
+
#chatbot .message-tools svg {
|
| 173 |
+
color: var(--icon-light) !important;
|
| 174 |
+
fill: var(--icon-light) !important;
|
| 175 |
+
stroke: var(--icon-light) !important;
|
| 176 |
+
opacity: 0.95;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
/* Remove gradient background only on small icon-only buttons */
|
| 180 |
+
#chatbot :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient) {
|
| 181 |
+
background: transparent !important;
|
| 182 |
+
background-image: none !important;
|
| 183 |
+
border: none !important;
|
| 184 |
+
box-shadow: none !important;
|
| 185 |
+
padding: 6px !important;
|
| 186 |
+
border-radius: 8px !important;
|
| 187 |
+
color: var(--icon-light) !important;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
/* Hover/focus/active states */
|
| 191 |
+
#chatbot :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):hover {
|
| 192 |
+
background-color: rgba(0,0,0,0.05) !important;
|
| 193 |
+
}
|
| 194 |
+
#chatbot :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):focus-visible {
|
| 195 |
+
outline: none !important;
|
| 196 |
+
box-shadow: 0 0 0 2px rgba(23, 66, 143, 0.35) !important;
|
| 197 |
+
background-color: rgba(0,0,0,0.06) !important;
|
| 198 |
+
}
|
| 199 |
+
#chatbot :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):active {
|
| 200 |
+
background-color: rgba(0,0,0,0.08) !important;
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
/* Optional 'danger' icons */
|
| 204 |
+
#chatbot .danger svg {
|
| 205 |
+
color: var(--icon-light) !important;
|
| 206 |
+
fill: var(--icon-light) !important;
|
| 207 |
+
stroke: var(--icon-light) !important;
|
| 208 |
+
}
|
| 209 |
+
#chatbot .danger:hover svg {
|
| 210 |
+
color: #ef4444 !important;
|
| 211 |
+
fill: #ef4444 !important;
|
| 212 |
+
stroke: #ef4444 !important;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
/* ---------------------------------- */
|
| 216 |
+
/* TOP BAR (logo block): icon-only */
|
| 217 |
+
/* ---------------------------------- */
|
| 218 |
+
#topbar { background: transparent !important; }
|
| 219 |
+
#topbar { --icon-light: #9CA3AF; --icon-hover: #6B7280; }
|
| 220 |
+
|
| 221 |
+
#topbar .gr-button.keep-gradient,
|
| 222 |
+
#topbar .gr-button:not(:has(svg)) {
|
| 223 |
+
background: linear-gradient(90deg, var(--brand-blue), var(--brand-orange)) !important;
|
| 224 |
+
color: #fff !important;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
/* Icon-only buttons in topbar: transparent */
|
| 228 |
+
#topbar :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient) {
|
| 229 |
+
background: transparent !important;
|
| 230 |
+
border: none !important;
|
| 231 |
+
box-shadow: none !important;
|
| 232 |
+
padding: 6px !important;
|
| 233 |
+
border-radius: 8px !important;
|
| 234 |
+
color: var(--icon-light) !important;
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
/* Tint SVGs in topbar */
|
| 238 |
+
#topbar :is(button,[role="button"]):has(> svg) > svg {
|
| 239 |
+
color: var(--icon-light) !important;
|
| 240 |
+
fill: var(--icon-light) !important;
|
| 241 |
+
stroke: var(--icon-light) !important;
|
| 242 |
+
opacity: 0.95;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
/* Hover/focus/active for topbar icons */
|
| 246 |
+
#topbar :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):hover {
|
| 247 |
+
background-color: rgba(0,0,0,0.05) !important;
|
| 248 |
+
}
|
| 249 |
+
/* Keyboard-focus ring for icon-only topbar buttons. */
#topbar :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):focus-visible {
  /* A transparent outline (rather than `none`) is invisible in normal rendering
     but is repainted in forced-colors / high-contrast mode, where the
     box-shadow ring below is not drawn at all. */
  outline: 2px solid transparent !important;
  box-shadow: 0 0 0 2px rgba(23, 66, 143, 0.35) !important;
  background-color: rgba(0,0,0,0.06) !important;
}
|
| 254 |
+
#topbar :is(button,[role="button"]):is([aria-label],[title], :has(> svg)):not(.keep-gradient):active {
|
| 255 |
+
background-color: rgba(0,0,0,0.08) !important;
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
/* Ensure the textbox wrapper is relative */
|
| 263 |
+
#message-box {
  width: 100%;
  border-radius: 9999px;
  border: 2px solid var(--brand-orange);
  font-size: 16px;
  outline: none;
  position: relative; /* needed for absolute button */
}

/* #message-box is the textbox wrapper, and a wrapper element never matches
   :focus itself. :focus-within fires when the inner input/textarea is focused;
   :focus is kept in case the id ends up on a focusable element. */
#message-box:focus,
#message-box:focus-within {
  border-color: var(--brand-blue);
  box-shadow: 0 0 6px rgba(23, 66, 143, 0.3);
}
|
| 276 |
+
|
| 277 |
+
/* Send button positioned inside textbox */
|
| 278 |
+
#send-button {
|
| 279 |
+
position: absolute;
|
| 280 |
+
|
| 281 |
+
right: 34px; /* move left by increasing this value */
|
| 282 |
+
top: 48px; /* move down by increasing this value */
|
| 283 |
+
|
| 284 |
+
transform: translateY(-16%);
|
| 285 |
+
width: 36px;
|
| 286 |
+
height: 36px;
|
| 287 |
+
min-width: 0 !important;
|
| 288 |
+
padding: 0 !important;
|
| 289 |
+
border-radius: 50%;
|
| 290 |
+
background: linear-gradient(90deg, var(--brand-blue), var(--brand-orange));
|
| 291 |
+
display: flex;
|
| 292 |
+
align-items: center;
|
| 293 |
+
justify-content: center;
|
| 294 |
+
border: none;
|
| 295 |
+
color: white;
|
| 296 |
+
font-size: 18px;
|
| 297 |
+
cursor: pointer;
|
| 298 |
+
z-index: 2;
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
#send-button:hover {
|
| 302 |
+
transform: translateY(-16%) scale(1.05);
|
| 303 |
+
box-shadow: 0 4px 12px rgba(0,0,0,0.2);
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
#send-button:active {
|
| 307 |
+
transform: translateY(-16%) scale(0.95);
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
/* Compact upload area - lighter and transparent */
|
| 318 |
+
#file-upload .upload-box,
|
| 319 |
+
#file-upload .file-wrap,
|
| 320 |
+
#file-upload .wrap {
|
| 321 |
+
min-height: 80px;
|
| 322 |
+
padding: 8px 12px;
|
| 323 |
+
color: #666 !important; /* Softer text color */
|
| 324 |
+
background: transparent !important; /* No white block, shows gradient */
|
| 325 |
+
border: none !important; /* Remove any border */
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
/* Inner placeholder (remove gray square) */
|
| 329 |
+
#file-upload .upload-box div,
|
| 330 |
+
#file-upload .upload-box span {
|
| 331 |
+
background: transparent !important; /* Remove gray background */
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
/* Text and icons slightly lighter */
|
| 335 |
+
#file-upload .upload-box,
|
| 336 |
+
#file-upload .file-wrap,
|
| 337 |
+
#file-upload .wrap {
|
| 338 |
+
color: #666 !important;
|
| 339 |
+
fill: #666 !important;
|
| 340 |
+
stroke: #666 !important;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
/* Buttons remain clean */
|
| 344 |
+
#file-upload button,
|
| 345 |
+
#file-upload [role="button"] {
|
| 346 |
+
background: transparent !important;
|
| 347 |
+
border: none !important;
|
| 348 |
+
box-shadow: none !important;
|
| 349 |
+
border-radius: 0 !important;
|
| 350 |
+
padding: 4px !important;
|
| 351 |
+
color: #666 !important;
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
/* SVG icons */
|
| 355 |
+
#file-upload button svg,
|
| 356 |
+
#file-upload [role="button"] svg {
|
| 357 |
+
color: #666 !important;
|
| 358 |
+
fill: #666 !important;
|
| 359 |
+
stroke: #666 !important;
|
| 360 |
+
background: none !important;
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
/* Hover effect */
|
| 364 |
+
#file-upload button:hover svg,
|
| 365 |
+
#file-upload [role="button"]:hover svg {
|
| 366 |
+
color: #2a5db0 !important;
|
| 367 |
+
fill: #2a5db0 !important;
|
| 368 |
+
stroke: #2a5db0 !important;
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
/* Hide Gradio's default control buttons in the header/topbar */
|
| 375 |
+
#topbar .gr-button,
|
| 376 |
+
#topbar [role="button"],
|
| 377 |
+
#header-container .gr-button,
|
| 378 |
+
#header-container [role="button"] {
|
| 379 |
+
display: none !important;
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
#upload-note, #upload-note * {
|
| 385 |
+
color: #ffffff !important;
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
.gradio-container [id="left-column"] {
|
| 392 |
+
min-width: 40px !important; /* Increased from 10px */
|
| 393 |
+
max-width: 320px !important; /* You can go up to 400px if you want even wider */
|
| 394 |
+
width: 180px !important; /* Increased from 100px */
|
| 395 |
+
flex: 0 0 220px !important; /* Increased from 180px */
|
| 396 |
+
padding-right: 12px !important; /* Slightly more padding for visual separation */
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
.gradio-container [id="right-column"] {
|
| 400 |
+
flex: 1 1 0 !important;
|
| 401 |
+
width: auto !important;
|
| 402 |
+
min-width: 0 !important;
|
| 403 |
+
padding-left: 0 !important;
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
/* Branding layout */
|
| 411 |
+
#branding {
|
| 412 |
+
display: flex;
|
| 413 |
+
align-items: center;
|
| 414 |
+
gap: 8px; /* space between text and logo */
|
| 415 |
+
justify-content: center;
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
/* Text style */
|
| 419 |
+
#brand-text {
|
| 420 |
+
font-size: 1.6rem;
|
| 421 |
+
font-weight: 700;
|
| 422 |
+
color: white;
|
| 423 |
+
line-height: 1;
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
/* Logo sizing */
|
| 427 |
+
#company-logo {
|
| 428 |
+
width: 40px; /* set exact width */
|
| 429 |
+
height: auto; /* preserve proportions */
|
| 430 |
+
display: block;
|
| 431 |
+
object-fit: contain;
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
/* ---- Viewport-safe base (handles mobile address bar) ---- */
|
| 441 |
+
html, body, .gradio-container {
  /* Each vh line is a fallback: browsers that do not understand dvh discard the
     dvh declaration and keep the vh one; browsers that do use the later line. */
  height: 100vh;
  height: 100dvh;      /* dynamic viewport height */
  min-height: 100vh;
  min-height: 100dvh;
}
|
| 445 |
+
|
| 446 |
+
/* ---- Make sure all flex ancestors allow their children to shrink ---- */
|
| 447 |
+
.gradio-container .gr-row,
|
| 448 |
+
.gradio-container .gr-row > .gr-column,
|
| 449 |
+
.gradio-container > .gr-row,
|
| 450 |
+
.gradio-container .gr-column,
|
| 451 |
+
#right-column,
|
| 452 |
+
#chat-area {
|
| 453 |
+
min-height: 0 !important; /* critical so inner scroll can happen */
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
/* ---- Chatbot outer box: cap height and scroll inside ---- */
|
| 457 |
+
#chatbot {
  box-sizing: border-box;
  flex: 1 1 auto;
  min-height: 0 !important;
  height: auto !important;
  /* vh fallback first: a browser without dvh support drops the second
     declaration entirely and would otherwise leave the chat uncapped. */
  max-height: calc(100vh - var(--topbar-offset, 0px) - 120px) !important;
  max-height: calc(100dvh - var(--topbar-offset, 0px) - 120px) !important;
  overflow-y: auto !important;
}
|
| 465 |
+
|
| 466 |
+
/* ---- Gradio’s inner wrappers sometimes need explicit constraints ---- */
|
| 467 |
+
/* Gradio v4: Chatbot renders inside .gr-chatbot -> .wrap / .message-wrap depending on version */
|
| 468 |
+
#chatbot .wrap,
|
| 469 |
+
#chatbot .message-wrap,
|
| 470 |
+
#chatbot [data-testid="chatbot"] {
|
| 471 |
+
max-height: 100%;
|
| 472 |
+
overflow-y: auto;
|
| 473 |
+
min-height: 0;
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
/* Optional: if you see the input row pushing the chat up/down on small screens,
|
| 477 |
+
let the input take only its content height. */
|
| 478 |
+
#input-row {
|
| 479 |
+
flex: 0 0 auto;
|
| 480 |
+
}
|
| 481 |
+
|
| 482 |
+
/* If your header/progress box is above the chat, ensure it doesn't consume flex growth */
|
| 483 |
+
#chat-area > *:not(#chatbot) {
|
| 484 |
+
flex: 0 0 auto;
|
| 485 |
+
}
|
generate.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pathlib
|
| 3 |
+
import time
|
| 4 |
+
import re
|
| 5 |
+
from pinecone import Pinecone
|
| 6 |
+
|
| 7 |
+
from langchain_mistralai import ChatMistralAI
|
| 8 |
+
from langchain_openai import ChatOpenAI
|
| 9 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
| 10 |
+
from langchain.schema import Document
|
| 11 |
+
from langchain_community.document_loaders import (
|
| 12 |
+
CSVLoader, PyPDFLoader, UnstructuredWordDocumentLoader,
|
| 13 |
+
UnstructuredPowerPointLoader, UnstructuredMarkdownLoader,
|
| 14 |
+
UnstructuredHTMLLoader, NotebookLoader
|
| 15 |
+
)
|
| 16 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 17 |
+
|
| 18 |
+
from llama_index.core.memory import Memory
|
| 19 |
+
|
| 20 |
+
import pickle
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
from typing import List, Any
|
| 24 |
+
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage
|
| 25 |
+
|
| 26 |
+
from typing import List, Any
|
| 27 |
+
from pydantic import BaseModel, ValidationError
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
memory = Memory(token_limit=2048)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def generate_RAG(
    prompt_message,
    llm,
    retrieved_chunks,
    graph_context="",
    graphRAG=False,
    info=True
):
    """
    Answer ``prompt_message`` with a two-stage flow.

    1) Resolver (non-streaming, no callbacks): a fresh copy of ``llm``'s class decides
       whether this turn should be answered from chat history only, and produces a
       ``resolved_task`` restating the user's request.
    2) Answer (via the passed ``llm``): retrieved context is included only when the
       resolver allows it; otherwise the system prompt forbids using it.

    Message order sent to the answering model:
        System rules -> (optional) System message with Retrieved Context
        -> Chat History -> Human (original request + resolved task)

    If retrieval was allowed and the answer contains no 'Source:' / 'Sources:' text,
    the answer is regenerated once with an extra instruction to cite sources. The
    regenerated answer is returned as-is (it is not validated again).

    Args:
        prompt_message: The user's current message.
        llm: Chat model exposing ``invoke(messages)``. Its class is re-instantiated
            for the resolver step using its ``model_name``/``model`` and
            ``temperature`` attributes.
        retrieved_chunks: Iterable of dict-like chunks read through
            ``.get("source")`` and ``.get("text")``; ``None`` or empty is allowed.
        graph_context: Extra text appended to the context when ``graphRAG`` is true.
        graphRAG: Whether ``graph_context`` should be appended to the context.
        info: When true, print diagnostics (inputs, resolver plan, final prompt,
            response, memory contents) to stdout. When false, nothing is printed.

    Returns:
        Whatever ``llm.invoke`` returned for the final answer.

    Side effects:
        Appends the user message, the context that was used (if any) and the answer
        to the module-level ``memory``.
    """

    if info:
        print("Generate RAG with", prompt_message, llm)

    # Same pattern the original citation check used: 'Source:' or 'Sources:'
    # followed by at least one character, case-insensitive.
    source_pattern = re.compile(r"\bSources?:\s*.+", flags=re.IGNORECASE)

    # ---------- Helpers ----------
    def _response_text(resp) -> str:
        """Return the answer text of an LLM response as a string."""
        text = getattr(resp, "content", str(resp))
        # Coerce so the regex check below cannot fail on non-string content.
        return text if isinstance(text, str) else str(text)

    def _last_ai_text(msgs: List[BaseMessage]) -> str:
        """Return the content of the most recent AIMessage, or '' if there is none."""
        for m in reversed(msgs):
            if isinstance(m, AIMessage):
                return m.content
        return ""

    def _safe_json_loads(raw: str) -> dict:
        """Parse JSON, falling back to the outermost '{...}' span inside ``raw``."""
        try:
            return json.loads(raw)
        except Exception:
            start, end = raw.find("{"), raw.rfind("}")
            if start != -1 and end != -1 and end > start:
                return json.loads(raw[start:end + 1])
            raise

    def _as_bool(value) -> bool:
        """Interpret a JSON-ish flag; the strings "false"/"no"/"0" must not be truthy."""
        if isinstance(value, str):
            return value.strip().lower() in ("true", "yes", "1")
        return bool(value)

    def _make_non_streaming_resolver(llm_):
        """
        Create a non-streaming, callback-free copy of the same LLM class for the resolver step.
        Works for ChatOpenAI-style classes that accept 'model' or 'model_name'.
        """
        model_name = getattr(llm_, "model_name", getattr(llm_, "model", None))
        kwargs = {}
        if hasattr(llm_, "temperature"):
            kwargs["temperature"] = getattr(llm_, "temperature")
        try:
            return llm_.__class__(model=model_name, streaming=False, callbacks=[], **kwargs)
        except TypeError:
            return llm_.__class__(model_name=model_name, streaming=False, callbacks=[], **kwargs)

    def _resolver(user_text: str, history_msgs: List[BaseMessage]) -> dict:
        """Ask the resolver model for a plan: ``use_history_only`` and ``resolved_task``."""
        resolver_llm = _make_non_streaming_resolver(llm)

        RESOLVER_SYS = (
            "You are a controller that decides if the next answer should rely ONLY on Chat History "
            "(ignore Retrieved Context completely) or may use Retrieved Context.\n"
            "Return STRICT JSON with keys:\n"
            '{ "use_history_only": true|false, "resolved_task": "<resolved user request>" }\n\n'
            "Rules:\n"
            "- Always set set use_history_only=false (especially if the query has meaningful concepts for retrieval, e.g., specific entities, topics, product names, technical terms, factual questions).\n"
            "- Except in rare cases, do NOT set use_history_only=true. Only do so if the query contains undefined pronouns (e.g., this, that, it, they, those, these, above, continue, previous, earlier, same...).\n"
            "Examples:\n"
            'User: "Where in the onboarding guide do we define the trial limits?"\n'
            '-> { "use_history_only": false, "resolved_task": "Find where the onboarding guide defines the trial limits and report the exact limits." }\n'
        )

        resolver_msgs: List[BaseMessage] = [SystemMessage(content=RESOLVER_SYS)]
        last_ai = _last_ai_text(history_msgs)
        if last_ai:
            resolver_msgs.append(AIMessage(content=f"[Last assistant answer]\n{last_ai}"))
        resolver_msgs.extend(history_msgs)
        resolver_msgs.append(HumanMessage(content=f"User message: {user_text}"))

        raw = resolver_llm.invoke(resolver_msgs).content
        try:
            data = _safe_json_loads(raw)
        except Exception:
            # Unparseable resolver output: fall back to the defaults below.
            data = None

        # Valid JSON that is not an object (e.g. a list or a bare string) is as
        # useless as unparseable output; treat it the same way.
        if not isinstance(data, dict):
            data = {}

        data.setdefault("use_history_only", False)
        # Also covers an explicit null / empty resolved_task from the model.
        if not data.get("resolved_task"):
            data["resolved_task"] = user_text
        return data

    # ---------- Prepare history ----------
    history_messages: List[BaseMessage] = []
    if memory:
        # Replay only the 8 most recent stored messages, converted from the
        # memory's message objects to LangChain message types.
        for msg in memory.get_all()[-8:]:
            if msg.role == "user":
                history_messages.append(HumanMessage(content=msg.content))
            elif msg.role in ("ai", "assistant"):
                history_messages.append(AIMessage(content=msg.content))
            # Messages with any other role are skipped.

    # ---------- Stage 1: Resolve (non-streaming) ----------
    plan = _resolver(prompt_message, history_messages)

    use_history_only = _as_bool(plan.get("use_history_only", False))
    resolved_task = plan.get("resolved_task", prompt_message)

    if info:
        print("[Resolver]", plan)

    # ---------- Build retrieval context block ----------
    context_lines = []
    if not use_history_only:
        for i, chunk in enumerate(retrieved_chunks or []):
            source_filename = os.path.basename((chunk.get("source") or "unknown"))
            text = chunk.get("text") or ""
            context_lines.append(f"Source {i+1} ({source_filename}):\n{text}")

        if graphRAG and graph_context:
            context_lines.append("[Graph context]\n" + graph_context)

    context_for_llm = "\n\n".join(context_lines)
    has_context = (not use_history_only) and bool(context_for_llm.strip())

    # ---------- System prompt (first) ----------
    base_rules = (
        "You are an expert assistant. Answer in English. Use:\n"
        "- Chat History\n"
        "- Retrieved Context (reference-only facts; not user intent).\n\n"
        "Decision rubric before answering:\n"
        "- Important: you MUST ALWAYS cite a source, i.e., always use exactly the filename from the 'source' metadata (e.g., 'Source: sample.pdf.' in the same paragraph as the claim).\n"
        "- If the answer is not supported by Retrieved Context and not implied by history, say you cannot answer.\n\n"
        "Important: output should be very well-structured Markdown (always different headings, hierarchical structure, bullets, tables and code blocks when needed), with a few emojis for scannability."
    )
    turn_rule = (
        "\n\nTURN-SPECIFIC RULE: For THIS turn, you MUST NOT use any Retrieved Context. "
        "Base your answer ONLY on Chat History and the user's current request."
        if use_history_only else ""
    )

    prompt_parts: List[BaseMessage] = [SystemMessage(content=base_rules + turn_rule)]

    # ---------- Retrieved context as a system message (only if allowed) ----------
    if has_context:
        prompt_parts.append(
            SystemMessage(
                content="📚 Retrieved Context (reference-only; not user intent, Use info only from here and nothing else, if info not present, say you do not know. You are only allowed to base your answer on this info and not use your own):\n\n" + context_for_llm
            )
        )

    # ---------- History next (more recent than retrieval context) ----------
    if history_messages:
        prompt_parts.append(SystemMessage(content="🕘 Chat History (most recent last):"))
        prompt_parts.extend(history_messages)

    # ---------- Current user last (include BOTH original and resolved) ----------
    final_human = (
        "User request (original):\n"
        f"{prompt_message}\n\n"
        "Resolved task (use this when pronouns/references appear):\n"
        f"{resolved_task}"
    )
    prompt_parts.append(HumanMessage(content=final_human))

    # ---------- Stage 2: Answer (via passed llm) ----------
    if info:
        print(f"[Info] The final prompt is the following: {prompt_parts}")
    response = llm.invoke(prompt_parts)
    if info:
        print(f"[Info] The final response is the following: {response}")

    # ---------- Citation check: ensure some "Source:" structure is present ----------
    # Only enforced when retrieval was allowed, i.e. when citations are expected.
    if not use_history_only and not source_pattern.search(_response_text(response)):
        if info:
            print("[Validation] Source structure check failed: "
                  "Missing any 'Source:' structure in the answer.")

        # Retry answer generation once with stronger emphasis on sources.
        retry_prompt_parts = prompt_parts.copy()
        retry_prompt_parts.append(SystemMessage(
            content="⚠️ IMPORTANT: Your previous answer did not include any 'Source:' citation. "
                    "Regenerate your answer and make sure to include at least one 'Source: ...' or 'Sources: ...' line "
                    "that cites the relevant documents or context."
        ))
        response = llm.invoke(retry_prompt_parts)
        if info:
            print("[Retry] Regenerated answer with source emphasis.")

    # ---------- Persist to memory ----------
    from llama_index.core.llms import ChatMessage

    if memory:
        # Add user message
        memory.put(ChatMessage(role="user", content=prompt_message))

        # Record the context that was actually shown to the model. Skipped when
        # there was none, so an empty placeholder does not use up memory budget.
        if has_context:
            memory.put(ChatMessage(role="assistant", content=f"The context was: [start context] {context_for_llm} [end context]"))

        # Add final AI response
        memory.put(ChatMessage(role="assistant", content=getattr(response, "content", str(response))))

        if info:
            print("[Info] The following is the current memory:", memory.get_all())

    return response
|
logo_mono.png
ADDED
|
prepare.py
ADDED
|
@@ -0,0 +1,896 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""prepare.py
|
| 2 |
+
|
| 3 |
+
Utilities to prepare documents and knowledge-graph artifacts for a RAG (Retrieval-Augmented
|
| 4 |
+
Generation) pipeline.
|
| 5 |
+
|
| 6 |
+
This module implements:
|
| 7 |
+
- safe file loading for text-like files (UTF-8 tolerant)
|
| 8 |
+
- dataset creation from a `context` directory using various loaders
|
| 9 |
+
- chunking, embedding and upserting to Pinecone via `prepare_RAG`
|
| 10 |
+
- building/updating a Knowledge Graph and generating hierarchical community summaries
|
| 11 |
+
|
| 12 |
+
Main public functions:
|
| 13 |
+
- create_dataset(directory_path: str) -> List[Document]
|
| 14 |
+
- prepare_RAG(pinecone_API, index_name, ...) -> (index, pc, llm, documents)
|
| 15 |
+
- build_knowledge_graph(documents, llm, pc, index, info=True) -> KnowledgeGraphIndex
|
| 16 |
+
|
| 17 |
+
Note: many helper functions are nested; this docstring highlights the high-level
|
| 18 |
+
purpose and responsibilities only.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import pathlib
|
| 23 |
+
import time
|
| 24 |
+
import re
|
| 25 |
+
from pinecone import Pinecone
|
| 26 |
+
|
| 27 |
+
from langchain_mistralai import ChatMistralAI
|
| 28 |
+
from langchain_openai import ChatOpenAI
|
| 29 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
| 30 |
+
from langchain.schema import Document
|
| 31 |
+
from langchain_community.document_loaders import (
|
| 32 |
+
CSVLoader, PyPDFLoader, UnstructuredWordDocumentLoader,
|
| 33 |
+
UnstructuredPowerPointLoader, UnstructuredMarkdownLoader,
|
| 34 |
+
UnstructuredHTMLLoader, NotebookLoader
|
| 35 |
+
)
|
| 36 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 37 |
+
|
| 38 |
+
from llama_index.core.memory import Memory
|
| 39 |
+
|
| 40 |
+
import pickle
|
| 41 |
+
|
| 42 |
+
import json
|
| 43 |
+
from typing import List, Any
|
| 44 |
+
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage
|
| 45 |
+
|
| 46 |
+
from typing import List, Any
|
| 47 |
+
from pydantic import BaseModel, ValidationError
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
memory = Memory(token_limit=2048)
|
| 51 |
+
|
| 52 |
+
# -------------------------
|
| 53 |
+
# UTF-8 safe Text Loader
|
| 54 |
+
# -------------------------
|
| 55 |
+
class SafeTextLoader:
    """Read one text-like file into a single Document, tolerating invalid UTF-8."""

    def __init__(self, file_path):
        # Stored as given; it is only stringified when written into metadata.
        self.file_path = file_path

    def load(self):
        """Read the file and wrap its text in a one-element list of `Document`.

        The file is opened in binary mode and decoded as UTF-8 with
        ``errors="ignore"``, so undecodable byte sequences are dropped instead
        of raising.

        Returns:
            List[Document]: ``[Document]`` whose ``page_content`` is the decoded
            text and whose ``metadata["source"]`` is ``str(file_path)``; an
            empty list if anything goes wrong (the error is printed).
        """
        try:
            with open(self.file_path, "rb") as handle:
                decoded = handle.read().decode("utf-8", errors="ignore")
            document = Document(
                page_content=decoded,
                metadata={"source": str(self.file_path)},
            )
        except Exception as e:
            print(f"[Error] Failed to read {self.file_path}: {e}")
            return []
        return [document]
|
| 78 |
+
|
| 79 |
+
# -------------------------
|
| 80 |
+
# Loader mapping
|
| 81 |
+
# -------------------------
|
| 82 |
+
LOADER_MAPPING = {
|
| 83 |
+
".txt": SafeTextLoader,
|
| 84 |
+
".json": SafeTextLoader,
|
| 85 |
+
".md": UnstructuredMarkdownLoader,
|
| 86 |
+
".csv": CSVLoader,
|
| 87 |
+
".yaml": SafeTextLoader,
|
| 88 |
+
".yml": SafeTextLoader,
|
| 89 |
+
".pdf": PyPDFLoader,
|
| 90 |
+
".docx": UnstructuredWordDocumentLoader,
|
| 91 |
+
".pptx": UnstructuredPowerPointLoader,
|
| 92 |
+
".html": UnstructuredHTMLLoader,
|
| 93 |
+
".htm": UnstructuredHTMLLoader,
|
| 94 |
+
".ipynb": NotebookLoader,
|
| 95 |
+
".py": SafeTextLoader,
|
| 96 |
+
".js": SafeTextLoader,
|
| 97 |
+
".sql": SafeTextLoader,
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
CONTEXT_ROOT = pathlib.Path(__file__).parent / "context"
|
| 101 |
+
|
| 102 |
+
def create_dataset(directory_path: str = "context", loader_mapping=None):
    """Recursively load files under `directory_path` using extension-specific loaders.

    Files are visited in sorted path order so that the returned list (and
    everything derived from it, such as chunk order) is reproducible across
    runs and filesystems.

    Args:
        directory_path: Directory to scan; resolved to an absolute path first.
        loader_mapping: Optional mapping of lower-cased suffix (e.g. ".txt") to a
            loader class accepting a path string and exposing ``load()``.
            Defaults to the module-level ``LOADER_MAPPING``.

    Returns:
        list: All documents produced by the loaders, in traversal order. An empty
        list if the directory does not exist or is not a directory. Files with an
        unmapped suffix are skipped; a loader that raises is reported and skipped.
    """
    mapping = LOADER_MAPPING if loader_mapping is None else loader_mapping

    target_dir = pathlib.Path(directory_path).resolve()
    # is_dir() is already False for a missing path, so one check covers both cases.
    if not target_dir.is_dir():
        print(f"[Error] Target directory does not exist: {target_dir}")
        return []

    documents = []
    # rglob() yields entries in filesystem order; sort for deterministic output.
    for file_path in sorted(target_dir.rglob("*")):
        if not file_path.is_file():
            continue
        loader_cls = mapping.get(file_path.suffix.lower())
        if loader_cls is None:
            print(f"[Skip] Unsupported file type: {file_path}")
            continue
        try:
            docs = loader_cls(str(file_path)).load()
            documents.extend(docs)
            print(f"[Loaded] {file_path} ({len(docs)} docs)")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"[Error] Failed to load {file_path}: {e}")

    print(f"[Done] Finished scanning {target_dir}")
    print(f"Total documents loaded: {len(documents)}")
    return documents
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
from llama_index.core import KnowledgeGraphIndex
|
| 133 |
+
from llama_index.core import Document as LlamaDocument
|
| 134 |
+
|
| 135 |
+
import hashlib
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def fetch_existing_ids(index, namespace, ids, batch_size=100):
    """Return the subset of `ids` that already exists in the index namespace.

    IDs are fetched in batches so that a single request never carries more than
    `batch_size` IDs (avoids "URI too large" errors).

    Args:
        index: Object exposing ``fetch(ids=..., namespace=...)``.
        namespace: Namespace to look the IDs up in.
        ids: Iterable of vector IDs; duplicates are fetched only once.
        batch_size: Maximum number of IDs per fetch call; must be >= 1.

    Returns:
        set: IDs reported under the response's ``vectors`` mapping.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # dict.fromkeys() de-duplicates while keeping first-seen order and lets the
    # caller pass any iterable, not only a sliceable list.
    unique_ids = list(dict.fromkeys(ids))

    existing_ids = set()
    for start in range(0, len(unique_ids), batch_size):
        batch_ids = unique_ids[start:start + batch_size]
        result = index.fetch(ids=batch_ids, namespace=namespace)
        # Accept both attribute-style responses and plain mappings; otherwise a
        # mapping response would silently look like "nothing exists".
        vectors = getattr(result, "vectors", None)
        if vectors is None and isinstance(result, dict):
            vectors = result.get("vectors")
        if vectors:
            existing_ids.update(vectors.keys())
    return existing_ids
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# -------------------------
|
| 151 |
+
# Prepare RAG
|
| 152 |
+
# -------------------------
|
| 153 |
+
import hashlib
|
| 154 |
+
import time
|
| 155 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 156 |
+
from llama_index.core import Document as LlamaDocument
|
| 157 |
+
from pinecone import Pinecone
|
| 158 |
+
|
| 159 |
+
import os
|
| 160 |
+
import re
|
| 161 |
+
import time
|
| 162 |
+
import hashlib
|
| 163 |
+
|
| 164 |
+
from langchain_openai import ChatOpenAI
|
| 165 |
+
from langchain_mistralai import ChatMistralAI
|
| 166 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 167 |
+
from pinecone import Pinecone
|
| 168 |
+
|
| 169 |
+
from llama_index.core import Document as LlamaDocument
|
| 170 |
+
|
| 171 |
+
# You are assumed to already have:
|
| 172 |
+
# - create_dataset(dir_name)
|
| 173 |
+
# - fetch_existing_ids(index, namespace, all_ids, batch_size)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# -------------------------
|
| 177 |
+
# Internal helper: build & upsert community summaries (incremental inside; same signature)
|
| 178 |
+
# -------------------------
|
| 179 |
+
def _build_and_index_community_summaries(
    kg_index,
    pc,
    index,
    llm,
    impacted_nodes=None,
    info=True,
):
    """
    Detect communities in the knowledge graph, summarise them with the LLM and
    upsert the summaries into Pinecone.

    1. COMMUNITY DETECTION:
    - Uses NetworkX's greedy_modularity_communities to cluster the KG
    - Communities smaller than COMMUNITY_MIN_SIZE are ignored

    2. HIERARCHY CONSTRUCTION:
    - Builds a tree per community (max depth = MAX_HIERARCHY_DEPTH)
    - Splits large communities again with the same modularity algorithm
    - Children are stored in their parent's "children" list

    3. AFFECTED NODE TRACKING:
    - A community is "_affected" if it contains any impacted node
    - Affected status is propagated upward to parent communities
    - Only affected communities are (re)summarised and upserted

    4. BOTTOM-UP SUMMARIZATION:
    - Leaf communities: report generated from entities and internal triples
    - Parent communities: synthesis of the affected children's summaries
    - Prompts use a sorted sample (LIMIT_NODES_PER_SUMMARY nodes,
      LIMIT_TRIPLES_PER_SUMMARY triples) so they are reproducible

    5. VECTOR STORAGE:
    - IDs are SHA-256 based: community id, level and the first 20 sorted nodes
    - Summaries are embedded with the "llama-text-embed-v2" model
    - Vectors go to the "community-summaries" namespace

    Args:
        kg_index: Object exposing ``get_networkx_graph()``.
        pc: Pinecone client; ``pc.inference.embed`` is used for embeddings.
        index: Pinecone index; ``index.upsert`` receives the summary vectors.
        llm: Chat model; ``llm.invoke(prompt).content`` is used as the summary.
        impacted_nodes: Iterable of graph nodes that changed. ``None`` means a
            full run (every community is summarised); an empty iterable means
            nothing is affected.
        info: Enable progress logging.

    Returns:
        None. Errors while extracting the graph or detecting top-level
        communities are printed and end the function early.
    """
    import hashlib

    COMMUNITY_NAMESPACE = "community-summaries"
    COMMUNITY_MIN_SIZE = 3
    MAX_HIERARCHY_DEPTH = 2
    LIMIT_NODES_PER_SUMMARY = 60
    LIMIT_TRIPLES_PER_SUMMARY = 120

    # Thresholds reused below: smallest community worth keeping, and smallest
    # graph worth splitting further.
    min_size = max(2, COMMUNITY_MIN_SIZE)
    split_threshold = max(2, COMMUNITY_MIN_SIZE * 2)

    try:
        nxg = kg_index.get_networkx_graph()
    except Exception as e:
        print(f"[Error] Unable to extract NetworkX graph from KG: {e}")
        return

    if nxg.number_of_nodes() == 0 or nxg.number_of_edges() == 0:
        if info:
            print("[Community] KG empty or trivial; skipping community summarization.")
        return

    # Imported only once there is a graph to cluster.
    from networkx.algorithms.community import greedy_modularity_communities

    first_run = impacted_nodes is None
    impacted_nodes = set(impacted_nodes or [])

    if info:
        print(f"[Community] Starting summarization. First run: {first_run}")
        print(f"[Community] Impacted nodes: {len(impacted_nodes)}")

    # ---- community detection ----
    if info:
        print("[Community] Detecting top-level communities (greedy modularity)...")
    try:
        communities = list(greedy_modularity_communities(nxg))
    except Exception as e:
        print(f"[Error] Community detection failed: {e}")
        return

    large_communities = [c for c in communities if len(c) >= min_size]
    small_communities = [c for c in communities if len(c) < min_size]

    if info:
        print(f"[Community] Found {len(communities)} communities; "
              f"{len(large_communities)} large, {len(small_communities)} small.")

    # ---- build hierarchy and mark affected ----
    hierarchy = []
    for idx, comm in enumerate(large_communities):
        subgraph = nxg.subgraph(comm).copy()
        node_set = set(subgraph.nodes())
        node = {
            "id": f"C{idx}",
            "level": 0,
            "nodes": node_set,
            "children": [],
            "_affected": first_run or bool(impacted_nodes & node_set),
        }

        # simple frontier-based recursive splitting
        frontier = [(node, subgraph, 1)]
        while frontier:
            parent, g, depth = frontier.pop()
            if depth > MAX_HIERARCHY_DEPTH or g.number_of_nodes() < split_threshold:
                continue
            try:
                subs = list(greedy_modularity_communities(g))
            except Exception as e:
                # Best effort: an unsplittable community simply stays a leaf.
                if info:
                    print(f"[Community] Could not split {parent['id']}: {e}")
                subs = []

            # Keep proper subsets only (a "split" returning the whole graph is
            # not a split) that are also large enough to be a community.
            subs = [s for s in subs if min_size <= len(s) <= len(g) - 1]

            for j, s in enumerate(subs):
                sg = g.subgraph(s).copy()
                members = set(s)
                child = {
                    "id": f"{parent['id']}.{j}",
                    "level": depth,
                    "nodes": members,
                    "children": [],
                    "_affected": first_run or bool(impacted_nodes & members),
                }
                parent["children"].append(child)
                if depth + 1 <= MAX_HIERARCHY_DEPTH and sg.number_of_nodes() >= split_threshold:
                    frontier.append((child, sg, depth + 1))

        hierarchy.append(node)

    # propagate affected upward
    def mark_ancestors(n):
        any_child = False
        for c in n["children"]:
            # Always recurse (no short-circuit) so every subtree gets marked.
            if mark_ancestors(c):
                any_child = True
        if any_child:
            n["_affected"] = True
        return n["_affected"]

    for root in hierarchy:
        mark_ancestors(root)

    # ---- summarization helpers ----
    def ordered(nodes):
        # Set iteration order is not stable across runs; sorting makes the
        # prompt sample, the stored node sample and the vector ID agree.
        return sorted(nodes, key=str)

    def triples_within(node_ids, graph):
        res = []
        for (u, v, data) in graph.edges(data=True):
            if u in node_ids and v in node_ids:
                rel = data.get("label") or data.get("relationship") or "related_to"
                res.append((u, rel, v))
        return res

    def sample_for_prompt(nodes_set, triples_list, max_nodes=LIMIT_NODES_PER_SUMMARY, max_triples=LIMIT_TRIPLES_PER_SUMMARY):
        nodes_list = ordered(nodes_set)[:max_nodes]
        triples_list = triples_list[:max_triples]
        return nodes_list, triples_list

    def summarize_leaf(nodes_set, graph):
        nodes_list, tri_list = sample_for_prompt(
            nodes_set,
            triples_within(nodes_set, graph)
        )
        prompt = (
            "You are creating a concise community report from a knowledge graph.\n"
            "Given the following entity list and intra-community relationships, produce:\n"
            " - Title\n"
            " - Key Themes (bullet points)\n"
            " - Notable Entities\n"
            " - Important Relationships (summarize patterns rather than listing all)\n"
            " - Outliers or Cross-links (if any)\n"
            " - 3-5 Answerable Questions this community can address\n"
            "Keep it under ~250-300 words.\n\n"
            f"Entities (sample): {nodes_list}\n"
            f"Relationships (sample triples): {[f'{u} --[{r}]--> {v}' for (u,r,v) in tri_list]}\n"
        )
        resp = llm.invoke(prompt)
        return resp.content.strip()

    def summarize_parent(child_summaries):
        join_text = "\n\n".join([f"[Child {i+1}]\n{txt}" for i, txt in enumerate(child_summaries)])
        prompt = (
            "You are creating a higher-level summary that unifies several community reports.\n"
            "Synthesize the following child community reports into a coherent parent-level summary:\n"
            " - Overarching Title\n"
            " - Cross-community Key Themes\n"
            " - How the sub-communities relate and differ\n"
            " - Cross-cutting entities/relationships\n"
            " - 3-5 high-level questions the parent community can answer\n"
            "Target length: 250-350 words.\n\n"
            f"{join_text}\n"
        )
        resp = llm.invoke(prompt)
        return resp.content.strip()

    # bottom-up, only affected subtrees
    def build_summaries(node, graph):
        if not node["_affected"]:
            return None
        if not node["children"]:
            node["summary"] = summarize_leaf(node["nodes"], graph)
            return node["summary"]
        child_summaries = []
        for ch in node["children"]:
            s = build_summaries(ch, graph)
            if s is not None:
                child_summaries.append(s)
        if child_summaries:
            # NOTE(review): only affected children contribute here; summaries of
            # unaffected siblings are not re-read, so the parent text reflects
            # the changed children only.
            node["summary"] = summarize_parent(child_summaries)
            return node["summary"]
        # Affected parent without affected children: summarise it directly.
        node["summary"] = summarize_leaf(node["nodes"], graph)
        return node["summary"]

    for node in hierarchy:
        build_summaries(node, nxg)

    # flatten affected nodes w/ new summaries
    flat_nodes = []

    def flatten(n):
        if n.get("_affected") and "summary" in n:
            flat_nodes.append({
                "id": n["id"],
                "level": n["level"],
                "size": len(n["nodes"]),
                "nodes": ordered(n["nodes"]),
                "summary": n["summary"]
            })
        for c in n["children"]:
            flatten(c)

    for n in hierarchy:
        flatten(n)

    if not flat_nodes:
        if info:
            print("[Community] No affected community summaries to upsert.")
        return

    if info:
        print(f"[Community] Upserting {len(flat_nodes)} community summaries to namespace: {COMMUNITY_NAMESPACE}")

    def summary_vec_id(node_rec):
        sample = ordered(node_rec["nodes"])[:20]
        key = f"{node_rec['id']}|{node_rec['level']}|{','.join(str(x) for x in sample)}"
        return "comm_" + hashlib.sha256(key.encode("utf-8")).hexdigest()[:24]

    # batch embed + upsert
    B = 96
    texts = [rec["summary"] for rec in flat_nodes]
    ids = [summary_vec_id(rec) for rec in flat_nodes]
    metas = [{
        "type": "community_summary",
        "community_id": rec["id"],
        "level": rec["level"],
        "size": rec["size"],
        # Same sorted prefix that feeds the vector ID.
        "node_sample": rec["nodes"][:20],
        "text": rec["summary"]
    } for rec in flat_nodes]

    for start in range(0, len(texts), B):
        batch_texts = texts[start:start+B]
        batch_ids = ids[start:start+B]
        batch_metas = metas[start:start+B]
        emb = pc.inference.embed(
            model="llama-text-embed-v2",
            inputs=batch_texts,
            parameters={"input_type": "passage", "truncate": "END"}
        )
        vectors = [
            {"id": vid, "values": e["values"], "metadata": meta}
            for vid, e, meta in zip(batch_ids, emb, batch_metas)
        ]
        index.upsert(vectors=vectors, namespace=COMMUNITY_NAMESPACE)

    if info:
        print("[Community] Community summaries upsert completed.")
|
| 444 |
+
|
| 445 |
+
def prepare_RAG(
    pinecone_API,
    index_name,
    chunk_size=400,
    chunk_overlap=30,
    llm_model="gpt-4.1-nano",
    dir_name="context",
    info=True
):
    """
    Load documents, chunk them and index the new chunks in Pinecone.

    Steps:
    1) Select LLM wrapper (OpenAI vs. Mistral) by `llm_model` string.
    2) Create dataset with `create_dataset(dir_name)`.
    3) Connect to Pinecone and obtain `index`.
    4) Split documents into chunks; normalize `metadata['source']` to be path-relative
       to the `context` anchor (stable across machines).
    5) Compute stable vector IDs per chunk from source+content hashes.
    6) Use `fetch_existing_ids` to identify and skip already-indexed chunks.
    7) Embed only new chunks via `pc.inference.embed` (retry with backoff).
    8) Upsert embeddings and metadata into a fixed namespace (`example-namespace`).

    Args:
        pinecone_API: Pinecone API key.
        index_name: Name of the Pinecone index to open.
        chunk_size: Chunk size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Chunk overlap passed to RecursiveCharacterTextSplitter.
        llm_model: Model name; names containing "gpt" use ChatOpenAI, all
            others ChatMistralAI.
        dir_name: Directory handed to `create_dataset`.
        info: Enable progress logging.

    Returns:
        tuple: ``(index, pc, llm, documents)``; `documents` is ``None`` when
        `create_dataset` returned nothing.
    """
    import os
    import re
    import hashlib
    import time
    from pinecone import Pinecone
    from langchain_mistralai import ChatMistralAI
    from langchain_openai import ChatOpenAI
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    if info:
        print(f"Preparing RAG with LLM: {llm_model}, Index: {index_name}, Dir: {dir_name}")
    llm = ChatOpenAI(model=llm_model, streaming=True) if "gpt" in llm_model else ChatMistralAI(model=llm_model, streaming=True)

    documents = create_dataset(dir_name)
    pc = Pinecone(api_key=pinecone_API)
    index = pc.Index(index_name)

    if not documents:
        print("[Warning] No documents found. Using existing Pinecone index.")
        return index, pc, llm, None

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    all_splits = splitter.split_documents(documents)

    def path_after_context(full_path: str, anchor: str = "context") -> str:
        """Return the part of `full_path` after the LAST `anchor` component, else the basename."""
        if not full_path:
            return ""
        parts = re.split(r"[\\/]+", str(full_path))
        idx = None
        for i, part in enumerate(parts):
            if part.lower() == anchor.lower():
                idx = i
        if idx is not None and idx < len(parts) - 1:
            return "/".join(parts[idx + 1 :])
        return os.path.basename(str(full_path))

    for chunk in all_splits:
        if "source" in chunk.metadata and chunk.metadata["source"]:
            chunk.metadata["source"] = path_after_context(chunk.metadata["source"], anchor="context")

    if info:
        print(f"Total chunks: {len(all_splits)}")

    def chunk_id(chunk, prefix="vec"):
        text_hash = hashlib.sha256(chunk.page_content.encode("utf-8")).hexdigest()[:16]
        # `or` also covers a source that is present but None/empty, which would
        # otherwise fail on .encode().
        source = chunk.metadata.get("source") or "unknown"
        file_hash = hashlib.sha256(str(source).encode("utf-8")).hexdigest()[:8]
        return f"{prefix}_{file_hash}_{text_hash}"

    # -------------------------
    # Vector ID & Namespace Architecture
    # -------------------------
    # VECTOR ID GENERATION STRATEGY:
    #
    # For Document Chunks:
    #   Pattern: "vec_{file_hash}_{content_hash}"
    #   - file_hash: SHA-256 of normalized source path (8 chars)
    #   - content_hash: SHA-256 of chunk content (16 chars)
    #   - Enables exact duplicate detection across runs
    #   - Stable across different machine paths due to source normalization
    #
    # For Community Summaries:
    #   Pattern: "comm_{community_hash}"
    #   - community_hash: SHA-256 of "community_id|level|sorted_node_sample"
    #
    # NAMESPACE STRATEGY:
    #   - "example-namespace": Stores document chunk embeddings
    #   - "community-summaries": Stores hierarchical community summaries
    #
    # IDEMPOTENCY:
    #   - fetch_existing_ids() checks Pinecone before embedding
    #   - Chunks whose ID is already indexed are neither embedded nor upserted

    namespace = "example-namespace"
    all_ids = [chunk_id(c) for c in all_splits]
    existing = fetch_existing_ids(index, namespace, all_ids, batch_size=100)

    # Identical source+content yields the same ID, so a repeated chunk within
    # this run would be embedded twice and overwrite itself on upsert; keep the
    # first occurrence only.
    new_chunks = []
    queued_ids = set()
    already_indexed = 0
    for c, i in zip(all_splits, all_ids):
        if i in existing:
            already_indexed += 1
            continue
        if i in queued_ids:
            continue
        queued_ids.add(i)
        new_chunks.append((c, i))
    duplicates_in_run = len(all_splits) - already_indexed - len(new_chunks)

    if info:
        print(f"Chunks already indexed: {already_indexed}")
        if duplicates_in_run:
            print(f"Duplicate chunks skipped in this run: {duplicates_in_run}")
        print(f"New chunks to embed: {len(new_chunks)}")

    if not new_chunks:
        print("[Info] Nothing new to index. Skipping embedding/upsert.")
    else:
        batch_size = 94

        def retry_forever(func, *args, **kwargs):
            # NOTE(review): never gives up, so a permanent failure (bad key,
            # rejected payload) loops indefinitely - confirm this is intended.
            attempt = 1
            while True:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    wait = min(60, 2 ** min(attempt, 6))
                    print(f"[Retry] {func.__name__} failed (attempt {attempt}): {e}. Sleeping {wait}s")
                    time.sleep(wait)
                    attempt += 1

        for start_idx in range(0, len(new_chunks), batch_size):
            print(f"[Info] Embedding and upserting batch {start_idx // batch_size + 1}...")
            batch, ids = zip(*new_chunks[start_idx:start_idx + batch_size])
            texts = [chunk.page_content for chunk in batch]
            # Pinecone does not accept null metadata values; since the upsert is
            # retried forever, one None value would stall the whole run.
            metas = [
                {k: v for k, v in (chunk.metadata or {}).items() if v is not None}
                for chunk in batch
            ]

            embeddings = retry_forever(
                pc.inference.embed,
                model="llama-text-embed-v2",
                inputs=texts,
                parameters={"input_type": "passage", "truncate": "END"}
            )

            batch_records = [
                {"id": i, "values": e['values'], "metadata": {"text": t, **m}}
                for i, e, t, m in zip(ids, embeddings, texts, metas)
            ]
            retry_forever(index.upsert, vectors=batch_records, namespace=namespace)

        if info:
            print(f"Completed upsert of {len(new_chunks)} new vectors.")

    return index, pc, llm, documents  # Return documents for KG construction
|
| 594 |
+
|
| 595 |
+
|
| 596 |
+
def build_knowledge_graph(documents, llm, pc, index, info=True):
    """
    Build/update the Knowledge Graph (KG) from documents, merge deltas, and
    (re)generate community summaries for changed regions.

    Args:
        documents: List of LangChain Documents from prepare_RAG (may be empty or None).
        llm: LangChain-compatible LLM used via LlamaIndex.
        pc: Pinecone client (for embeddings).
        index: Pinecone index to store community summary vectors.
        info: Enable verbose logging.

    Returns:
        KnowledgeGraphIndex | None

    Flow:
    1) Identify new/changed docs via source+content hashing (seen file cache
       in `kg_seen_files.json`).
    2) Load existing KG from `kg_index.pkl` or build a fresh one from all docs.
    3) If a KG was loaded and there is a delta, build a delta KG and merge its
       nodes/edges into a NetworkX copy of the base graph.
    4) Summarize impacted communities (all communities on a first build) and
       upsert summaries to Pinecone.
    5) Export `knowledge_graph.json` and update the seen-file signatures.
    """
    import os
    import pickle
    import json
    import hashlib
    import re
    from llama_index.core import Document, KnowledgeGraphIndex
    from llama_index.llms.langchain import LangChainLLM

    # ---- duplicate detection "like in prepare_RAG" (signature unchanged) ----
    def path_after_context(full_path: str, anchor: str = "context") -> str:
        """Return the part of `full_path` after the LAST `anchor` component, else the basename."""
        if not full_path:
            return ""
        parts = re.split(r"[\\/]+", str(full_path))
        idx = None
        for i, part in enumerate(parts):
            if part.lower() == anchor.lower():
                idx = i
        if idx is not None and idx < len(parts) - 1:
            return "/".join(parts[idx + 1 :])
        return os.path.basename(str(full_path))

    def file_sig(doc_like):
        """Return (sig_id, normalized_source) using source+content hashing similar to prepare_RAG."""
        meta = getattr(doc_like, "metadata", {}) or {}
        text = getattr(doc_like, "page_content", "") or getattr(doc_like, "text", "") or ""
        # `or` also covers a source that is present but None/empty, which would
        # otherwise fail on .encode().
        src = meta.get("source") or "unknown"
        src = path_after_context(src, anchor="context")
        src_hash = hashlib.sha256(src.encode("utf-8")).hexdigest()[:8]
        text_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
        return f"kg_{src_hash}_{text_hash}", src

    def load_seen_sigs(path="kg_seen_files.json"):
        try:
            if os.path.exists(path):
                with open(path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                return set(data if isinstance(data, list) else [])
        except Exception as e:
            print(f"[Warn] Failed to load seen file sigs: {e}")
        return set()

    def save_seen_sigs(sigs, path="kg_seen_files.json"):
        try:
            with open(path, "w", encoding="utf-8") as f:
                json.dump(sorted(sigs), f, indent=2)
        except Exception as e:
            print(f"[Warn] Failed to save seen file sigs: {e}")

    def edge_keys(graph, directed):
        """Return the graph's edges as a set comparable across graphs.

        An undirected graph may report the same edge as (u, v) in one graph and
        (v, u) in another, so undirected edges are keyed by frozenset.
        """
        if directed:
            return set(graph.edges())
        return {frozenset((u, v)) for u, v in graph.edges()}

    class _MergedGraphView:
        """Stand-in exposing only get_networkx_graph(), the one method the summary helper calls."""

        def __init__(self, graph):
            self._graph = graph

        def get_networkx_graph(self):
            return self._graph

    # -------------------------
    # Incremental KG Update Strategy
    # -------------------------
    # MERGE STRATEGY:
    #   1. Signature-based filtering identifies only new/changed documents
    #   2. Builds a "delta KG" from new documents only
    #   3. Performs set operations to find truly new nodes/edges:
    #      - new_nodes = delta_nodes - base_nodes
    #      - new_edges = delta_edges - base_edges
    #   4. Merges using NetworkX's native add_nodes_from/add_edges_from
    #   5. Node/edge attributes of the delta are carried over by the merge

    seen_sigs = load_seen_sigs()

    # Identify new/changed docs by signature
    all_docs = documents or []
    new_docs = []
    new_sigs = []
    for d in all_docs:
        sig, _ = file_sig(d)
        if sig not in seen_sigs:
            new_docs.append(d)
            new_sigs.append(sig)

    if info:
        print(f"[KG] Total input docs: {len(all_docs)} | New/changed docs detected: {len(new_docs)}")

    # ---- prepare LlamaIndex objects ----
    llama_docs_all = [Document(text=doc.page_content, metadata=doc.metadata) for doc in all_docs]
    llama_docs_delta = [Document(text=doc.page_content, metadata=doc.metadata) for doc in new_docs]
    llm_for_kg = LangChainLLM(llm)
    persist_file = os.path.abspath("./kg_index.pkl")

    def _build_and_persist(docs):
        kg = KnowledgeGraphIndex.from_documents(
            documents=docs,
            max_triplets_per_chunk=20,
            extract_relations=True,
            include_embeddings=True,
            llm=llm_for_kg
        )
        with open(persist_file, "wb") as f:
            pickle.dump(kg, f)
        return kg

    def _load_existing():
        # SECURITY: pickle.load executes code embedded in the file; this is only
        # acceptable because the file is one this program wrote itself. Never
        # point persist_file at data from an untrusted source.
        with open(persist_file, "rb") as f:
            return pickle.load(f)

    kg_index = None
    graph_exists = False

    try:
        if os.path.exists(persist_file):
            if info:
                print("[Info] Found persisted KG pickle file.")
            graph_exists = True
            kg_index = _load_existing()
            if info:
                print("[Info] Loaded Knowledge Graph from pickle.")
        elif llama_docs_all:
            if info:
                print("[Info] No persisted KG found. Building new KG from all documents...")
            kg_index = _build_and_persist(llama_docs_all)
            if info:
                print("[Info] Built and persisted Knowledge Graph via pickle.")
        else:
            if info:
                print("[Info] No persisted KG found and no documents to build from.")
    except Exception as e:
        print(f"[Error] GraphRAG init/load failed: {e}")
        kg_index = None

    # ---- incremental insertion (signature unchanged) ----
    inserted_any = False
    graph_override = None  # merged NetworkX graph, used for summaries and export

    new_nodes = set()
    new_edges = set()

    if kg_index and graph_exists and llama_docs_delta:
        if info:
            print(f"[Info] Incrementally inserting {len(llama_docs_delta)} new/changed docs into KG...")

        ######################################################################
        try:
            # Build delta KG from new/changed docs
            kg_delta = KnowledgeGraphIndex.from_documents(
                documents=llama_docs_delta,
                max_triplets_per_chunk=20,
                extract_relations=True,
                include_embeddings=False,
                llm=llm_for_kg
            )
            nxg_base = kg_index.get_networkx_graph()
            nxg_delta = kg_delta.get_networkx_graph()
            directed = nxg_base.is_directed() or nxg_delta.is_directed()

            # Diagnostic: Print node/edge sets before merge
            base_nodes_before = set(nxg_base.nodes())
            base_edges_before = edge_keys(nxg_base, directed)
            delta_nodes = set(nxg_delta.nodes())
            delta_edges = edge_keys(nxg_delta, directed)

            print(f"\n[Diagnostic] Base graph nodes before merge: {len(base_nodes_before)}")
            print(f"[Diagnostic] Base graph edges before merge: {len(base_edges_before)}")
            print(f"[Diagnostic] Delta graph nodes: {len(delta_nodes)}")
            print(f"[Diagnostic] Delta graph edges: {len(delta_edges)}")

            # Show intersection and difference
            new_nodes = delta_nodes - base_nodes_before
            new_edges = delta_edges - base_edges_before
            already_existing_nodes = delta_nodes & base_nodes_before
            already_existing_edges = delta_edges & base_edges_before

            print(f"[Diagnostic] Delta nodes already in base: {len(already_existing_nodes)}")
            print(f"[Diagnostic] Delta edges already in base: {len(already_existing_edges)}")
            print(f"[Diagnostic] Truly new nodes to add: {len(new_nodes)}")
            print(f"[Diagnostic] Truly new edges to add: {len(new_edges)}")

            # Merge delta into base
            nxg_base.add_nodes_from(nxg_delta.nodes(data=True))
            nxg_base.add_edges_from(nxg_delta.edges(data=True))
            graph_override = nxg_base
            inserted_any = True

            # Diagnostic: Print node/edge sets after merge
            base_nodes_after = set(nxg_base.nodes())
            base_edges_after = edge_keys(nxg_base, directed)
            print(f"\n[Diagnostic] Base graph nodes after merge: {len(base_nodes_after)}")
            print(f"[Diagnostic] Base graph edges after merge: {len(base_edges_after)}")
            print(f"[Diagnostic] Nodes added: {len(base_nodes_after - base_nodes_before)}")
            print(f"[Diagnostic] Edges added: {len(base_edges_after - base_edges_before)}")

            # Print delta graph summary
            num_nodes = nxg_delta.number_of_nodes()
            num_edges = nxg_delta.number_of_edges()

            print(f"\n[Delta Graph Summary]")
            print(f" - Total Nodes: {num_nodes}")
            print(f" - Total Edges: {num_edges}")

            # Print first 10 nodes
            print("\n[Delta Graph Nodes] (showing up to 10):")
            for i, (node, data) in enumerate(nxg_delta.nodes(data=True)):
                if i >= 10:
                    print(" ...")
                    break
                print(f" {i+1}. {node}: {data}")

            # Print first 10 edges
            print("\n[Delta Graph Edges] (showing up to 10):")
            for i, (source, target, data) in enumerate(nxg_delta.edges(data=True)):
                if i >= 10:
                    print(" ...")
                    break
                print(f" {i+1}. {source} -> {target}: {data}")

            # Warn if nothing new was actually added
            if len(new_nodes) == 0 and len(new_edges) == 0:
                print("[Warning] All delta nodes/edges already existed in the base graph. No actual change.")

            if info:
                print("[Info] Merged delta KG into existing graph (override used for summaries).")
        except Exception as e:
            print(f"[Error] Fallback merge failed: {e}")
        ######################################################################

    # NOTE(review): the merge above changes only the NetworkX graph held in
    # `graph_override`; nothing in this function writes the delta back into
    # `kg_index` or re-pickles it, yet the delta documents are recorded as seen
    # below. Presumably the delta is therefore missing from the KG loaded on the
    # next run - confirm, and consider inserting the delta documents through the
    # index's own insertion API and re-persisting.

    # First-time build already happened above (graph_exists==False and llama_docs_all not empty)

    # ---- Community summaries (incremental occurs inside the helper; same signature) ----
    if kg_index:
        # Only trigger summaries when: first build or we actually inserted/merged something
        if not graph_exists or inserted_any:
            if not graph_exists:
                # First build: None tells the helper to summarise every
                # community. An empty set would mean "nothing changed" and
                # produce no summaries at all.
                impacted = None
            else:
                # New nodes plus both endpoints of every new edge.
                impacted = new_nodes.union(*new_edges)
            # Summarise the merged graph when there is one; asking kg_index
            # again would not include the nodes/edges merged above.
            summary_source = kg_index if graph_override is None else _MergedGraphView(graph_override)
            _build_and_index_community_summaries(
                kg_index=summary_source,
                pc=pc,
                index=index,
                llm=llm,
                impacted_nodes=impacted,
                info=info
            )

    # Optional: save graph for visualization (post-update)
    if kg_index is not None or graph_override is not None:
        try:
            nxg = graph_override if graph_override is not None else kg_index.get_networkx_graph()
            graph_dict = {}
            for u, v, attrs in nxg.edges(data=True):
                rel = attrs.get("label") or attrs.get("relationship") or "related_to"
                graph_dict.setdefault(u, []).append([rel, v])
            output_file = "knowledge_graph.json"
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(graph_dict, f, indent=4, ensure_ascii=False)
            if info:
                print(f"[Info] Knowledge graph saved to {output_file}")
        except Exception as e:
            print(f"[Error] Failed to save knowledge graph: {e}")

    # ---- mark seen signatures only after successful insertion or first build ----
    first_build_succeeded = (not graph_exists) and bool(llama_docs_all) and kg_index is not None
    if first_build_succeeded or inserted_any:
        # Add only the new ones we processed this run
        seen_sigs.update(new_sigs)
        save_seen_sigs(seen_sigs)

    return kg_index
|
| 895 |
+
|
| 896 |
+
|
rag_execute.py
ADDED
|
@@ -0,0 +1,647 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main entry point for the RAG Gradio application.
|
| 3 |
+
|
| 4 |
+
Loads environment variables, sets up context directory and model parameters,
|
| 5 |
+
initializes retrieval and generation functions, and launches the interactive chat UI.
|
| 6 |
+
Handles file uploads, user queries, and streaming LLM responses.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import queue
|
| 12 |
+
from threading import Thread
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
load_dotenv()
|
| 15 |
+
# Construct the path to the .env file relative to this script's location
|
| 16 |
+
dotenv_path = os.path.join(os.path.dirname(__file__), '..', 'RAG-LangChain', '.env')
|
| 17 |
+
print(f"Start loading .env from {dotenv_path}")
|
| 18 |
+
load_dotenv(dotenv_path=dotenv_path)
|
| 19 |
+
print(f"Finish loading .env")
|
| 20 |
+
from langchain.callbacks.base import BaseCallbackHandler
|
| 21 |
+
print(f"Start importing from rag_func")
|
| 22 |
+
from prepare import prepare_RAG
|
| 23 |
+
from retrieve import retrieve_RAG
|
| 24 |
+
from generate import generate_RAG
|
| 25 |
+
from prepare import build_knowledge_graph
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
######
|
| 29 |
+
# --- Graph viz imports (Plotly + NetworkX) ---
|
| 30 |
+
import json, math, random
|
| 31 |
+
import networkx as nx
|
| 32 |
+
import numpy as np
|
| 33 |
+
import plotly.graph_objects as go
|
| 34 |
+
import plotly.express as px
|
| 35 |
+
try:
|
| 36 |
+
from scipy.spatial import ConvexHull
|
| 37 |
+
SCIPY_AVAILABLE = True
|
| 38 |
+
except Exception:
|
| 39 |
+
SCIPY_AVAILABLE = False
|
| 40 |
+
|
| 41 |
+
######
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
print(f"Finish importing from rag_func")
|
| 45 |
+
import gradio as gr
|
| 46 |
+
|
| 47 |
+
# -------------------- Context Setup --------------------
|
| 48 |
+
|
| 49 |
+
user_dir = "context"
|
| 50 |
+
#print default
|
| 51 |
+
print(f"[Info] Using context directory: {user_dir}")
|
| 52 |
+
|
| 53 |
+
pinecone_API = os.getenv("PINECONE_API")
|
| 54 |
+
index_name = os.getenv("INDEX_NAME")
|
| 55 |
+
llm_model = os.getenv("MODELNAME")
|
| 56 |
+
|
| 57 |
+
#index, pc, llm, kg_index = prepare_RAG(pinecone_API, index_name, llm_model=llm_model, dir_name=user_dir, graph_rag=(graph_rag=="True"))
|
| 58 |
+
index, pc, llm, kg_index = None, None, None, None
|
| 59 |
+
|
| 60 |
+
# -------------------- Chat Functions --------------------
|
| 61 |
+
def add_user_message(message, history):
    """
    Append the user's message to the chat history.

    A falsy ``history`` (``None`` or an empty list) is replaced by a fresh
    list before the message is appended as a ``{"role", "content"}`` entry.

    Returns a 3-tuple: an empty string, followed by the updated history
    twice (the same list object in both positions).
    """
    if not history:
        history = []

    entry = {"role": "user", "content": message}
    history.append(entry)

    return "", history, history
|
| 72 |
+
|
| 73 |
+
import time
|
| 74 |
+
|
| 75 |
+
# -------------------- Streaming Handler --------------------
|
| 76 |
+
class StreamHandler(BaseCallbackHandler):
    """
    Callback handler that forwards streamed LLM tokens to a queue.

    Every token is pushed onto ``q`` and also kept in ``buffer``. The handler
    records the time to first token (``ttft``) and the total generation time
    (``total_time``), both measured from ``start_time``, which the caller is
    expected to set before generation starts. When ``start_time`` was never
    set, the timings stay ``None`` instead of raising inside the callback.
    """

    def __init__(self, q: queue.Queue):
        self.q = q
        self.first_token_received = False
        self.ttft = None  # time to first token, in seconds
        self.total_time = None  # seconds from start_time to on_llm_end
        self.start_time = None  # set by the caller right before generation
        self.buffer = []  # every token received so far, in order

    def _elapsed(self):
        """Return seconds since ``start_time``, or None if it was never set."""
        if self.start_time is None:
            return None
        return time.time() - self.start_time

    def on_llm_new_token(self, token: str, **kwargs):
        """Record TTFT on the first token, then buffer and enqueue the token."""
        if not self.first_token_received:
            self.ttft = self._elapsed()
            self.first_token_received = True
        self.buffer.append(token)
        self.q.put(token)

    def on_llm_end(self, *args, **kwargs):
        """Record the total generation time.

        Deliberately does NOT enqueue an end marker: the worker thread that
        drives generation sends ``[[FINAL]]`` (if any) and then ``[[END]]``,
        so ending the consumer here would cut the final message off.
        """
        self.total_time = self._elapsed()
|
| 104 |
+
|
| 105 |
+
# -------------------- Chat Functions with timing --------------------
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def generate_bot_response(history):
    """
    Stream the assistant's answer for the latest user message.

    This is a generator. Each item is a ``(history, history, status_html)``
    triple: the chat history (twice, once per UI output) and an HTML snippet
    for the progress box. If ``history`` is empty or its last entry is not a
    user message, a single "Ready" status is yielded and the generator ends.

    Stages: lazily initialise the index/LLM through ``prepare_RAG``, ask the
    LLM whether Graph RAG should be used, retrieve context, then run
    ``generate_RAG`` on a worker thread while tokens are consumed from a
    queue and appended to the last history entry.
    """
    global index, pc, llm, kg_index

    def _status(text, background="#f5f5f5"):
        # Single source for the progress-box markup used at every stage.
        return f"<div style='background:{background};padding:10px;border-radius:8px;'>{text}</div>"

    if not history or history[-1]["role"] != "user":
        yield history, history, _status("Ready")
        return

    user_msg = history[-1]["content"]
    documents = None

    # --- Stage 1: Initialize LLM / vector infra ---
    yield history, history, _status("Initializing LLM and infrastructure...")
    if not index or not pc or not llm:
        from langchain_mistralai import ChatMistralAI
        from langchain_openai import ChatOpenAI

        # NOTE(review): this instance is immediately replaced by the llm that
        # prepare_RAG returns below — confirm whether it is still needed.
        llm = ChatOpenAI(model=llm_model) if "gpt" in llm_model else ChatMistralAI(model=llm_model)
        index, pc, llm, documents = prepare_RAG(
            pinecone_API,
            index_name,
            llm_model=llm_model,
            dir_name=user_dir,
            info=True
        )

    # --- Stage 2: Decide Graph RAG usage ---
    yield history, history, _status("Deciding Graph RAG usage...")

    def decide_graph_rag_usage(llm_, user_text: str) -> bool:
        prompt = (
            "Given the following user prompt, determine whether graph RAG should be used (True or False):\n"
            f"{user_text}\n"
            "Use 'False' only if the prompt is focused on retrieving a single fact.\n"
            "Use 'True' if the prompt suggests reasoning over a large portion or the entirety of a dataset or corpus."
        )
        resp = llm_.invoke(prompt)
        decision = (getattr(resp, "content", str(resp)) or "").strip()
        print("[Debug] Graph RAG decision response:", decision)
        # Models often answer "True.", "true" or "**True**"; tolerate the
        # decoration instead of requiring the exact string "True".
        return decision.strip("'\"`*. \n").lower().startswith("true")

    graph_rag_flag = decide_graph_rag_usage(llm, user_msg)
    print(f"[Info] Graph RAG usage decision: {graph_rag_flag}")

    if graph_rag_flag and not documents:
        # Infra was already initialised on an earlier call, so the documents
        # have to be fetched again for the graph build.
        _, _, _, documents = prepare_RAG(
            pinecone_API,
            index_name,
            llm_model=llm_model,
            dir_name=user_dir,
            info=True
        )

    if graph_rag_flag:
        kg_index = build_knowledge_graph(documents, llm, pc, index, info=True)

    # --- Stage 3: Retrieve context ---
    yield history, history, _status("Retrieving context...")
    retrieved_chunks, graph_context = retrieve_RAG(
        user_msg,
        pc,
        index,
        kg_index,
        top_k=5,
        use_query_reformulation=True,
        llm=llm,
        graphRAG=graph_rag_flag
    )

    # --- Stage 4: Generating response ---
    yield history, history, _status("Generating response...")
    FINAL_PREFIX = "[[FINAL]]"
    END_MARKER = "[[END]]"
    q = queue.Queue()
    handler = StreamHandler(q)
    handler.start_time = time.time()

    model_name = getattr(llm, "model_name", getattr(llm, "model", None))
    streaming_llm = llm.__class__(model=model_name, streaming=True, callbacks=[handler])

    worker_errors = []

    def run_llm():
        try:
            resp = generate_RAG(
                user_msg,
                streaming_llm,
                retrieved_chunks,
                graph_context,
                graphRAG=graph_rag_flag
            )
            final_text = (getattr(resp, "content", str(resp)) or "").strip()
            if final_text:
                q.put(FINAL_PREFIX + final_text)
        except Exception as exc:
            # Remember the failure so the UI does not report success.
            worker_errors.append(exc)
            print(f"[Error] Response generation failed: {exc}")
        finally:
            # Always release the consumer loop below, even on failure.
            q.put(END_MARKER)

    Thread(target=run_llm, daemon=True).start()

    partial = ""
    history.append({"role": "assistant", "content": ""})

    while True:
        token = q.get()
        if token == END_MARKER:
            if worker_errors:
                yield history, history, _status("Generation failed — see server log.", "#f8d7da")
            else:
                yield history, history, _status("Completed!", "#d4edda")
            # Either timing is None when no token was streamed or on_llm_end
            # never fired; formatting None with ':.3f' would raise here.
            ttft = "n/a" if handler.ttft is None else f"{handler.ttft:.3f} s"
            total = "n/a" if handler.total_time is None else f"{handler.total_time:.3f} s"
            print(f"[Timing] TTFT: {ttft}, Total: {total}")
            break

        if token.startswith(FINAL_PREFIX):
            # The final message replaces whatever was streamed so far.
            final = token[len(FINAL_PREFIX):]
            history[-1]["content"] = final
            yield history, history, _status("Generating response...")
            partial = final
            continue

        partial += token
        history[-1]["content"] = partial
        yield history, history, _status("Generating response...")
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
# -------------------- Simplified CSS for Default Gradio Font --------------------
|
| 230 |
+
from pathlib import Path
|
| 231 |
+
import gradio as gr
|
| 232 |
+
|
| 233 |
+
# Load external assets
|
| 234 |
+
custom_css = Path("app.css").read_text(encoding="utf-8")
|
| 235 |
+
js_force_light = """ function refresh() {
|
| 236 |
+
const url = new URL(window.location);
|
| 237 |
+
if (url.searchParams.get('__theme') !== 'light') {
|
| 238 |
+
url.searchParams.set('__theme', 'light');
|
| 239 |
+
window.location.replace(url);
|
| 240 |
+
}
|
| 241 |
+
} """
|
| 242 |
+
|
| 243 |
+
# -------------------- Gradio App --------------------
|
| 244 |
+
import os
|
| 245 |
+
import shutil
|
| 246 |
+
MAX_TOTAL_SIZE_MB = 5
|
| 247 |
+
CONTEXT_DIR = "context"
|
| 248 |
+
|
| 249 |
+
def handle_file_upload(uploaded_files):
    """
    Validate uploaded files and copy them into the context directory.

    Args:
        uploaded_files: iterable of uploads. Each item is either an object
            with a ``name`` attribute holding the source path or a plain
            path string. ``None`` or an empty list means nothing was chosen.

    Returns:
        A status message: a ❌ message for an empty selection, an
        unsupported extension or a batch whose cumulative size exceeds
        ``MAX_TOTAL_SIZE_MB``; otherwise a ✅ message listing the saved files.

    The whole batch is validated before anything is copied, so a rejected
    upload never leaves part of the batch behind in the context directory.
    """
    context_dir = "context"
    allowed_extensions = {".txt", ".json", ".md", ".csv", ".pdf", ".docx", ".pptx", ".py"}

    if not uploaded_files:
        return "❌ No files selected."

    # Pass 1: validate extension and cumulative size for every file.
    source_paths = []
    total_size_mb = 0
    for file_obj in uploaded_files:
        src_path = getattr(file_obj, "name", file_obj)
        ext = os.path.splitext(src_path)[1].lower()
        if ext not in allowed_extensions:
            return f"❌ Unsupported file type: {ext}. Allowed types are: {', '.join(sorted(allowed_extensions))}"
        total_size_mb += os.path.getsize(src_path) / (1024 * 1024)
        if total_size_mb > MAX_TOTAL_SIZE_MB:
            return f"❌ Total upload size exceeds the limit of {MAX_TOTAL_SIZE_MB}MB."
        source_paths.append(src_path)

    # Pass 2: copy. basename() discards any directory part of the source
    # path, so files always land directly inside context_dir.
    os.makedirs(context_dir, exist_ok=True)
    saved_files = []
    for src_path in source_paths:
        dest_path = os.path.join(context_dir, os.path.basename(src_path))
        shutil.copyfile(src_path, dest_path)
        saved_files.append(dest_path)

    return f"✅ Uploaded {len(saved_files)} file(s) to '{context_dir}': {', '.join(os.path.basename(f) for f in saved_files)}"
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
#########
|
| 287 |
+
# ---------- Graph viz core ----------
|
| 288 |
+
GRAPH_JSON_PATH = "knowledge_graph.json"
|
| 289 |
+
COMMUNITY_MIN_SIZE = 3
|
| 290 |
+
MERGE_SMALLS_POLICY = "bucket" # or 'attach'
|
| 291 |
+
LAYOUT_SEED = 42
|
| 292 |
+
LAYOUT_ITERS = 30
|
| 293 |
+
|
| 294 |
+
# Cached state (simple globals for now)
|
| 295 |
+
_g_G = None
|
| 296 |
+
_g_pos3d = None
|
| 297 |
+
_g_node2comm = None
|
| 298 |
+
_g_comm2nodes = None
|
| 299 |
+
_g_edges = None
|
| 300 |
+
_g_node_names = None
|
| 301 |
+
|
| 302 |
+
def load_graph_from_json(path=GRAPH_JSON_PATH):
    """Read ``{source: [[rel, target], ...], ...}`` from *path* and return a DiGraph.

    Each ``[rel, target]`` pair becomes an edge ``source -> target`` whose
    ``label`` attribute is ``rel``. Loading is best-effort: a missing file is
    treated as an empty graph without comment, and any other read or parse
    failure is reported with a warning and also yields an empty graph. An
    empty result gets a single placeholder node ``"(empty)"``, so the
    returned graph always has at least one node.
    """
    graph_dict = {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            graph_dict = json.load(f)
    except FileNotFoundError:
        # Nothing has been saved yet; not worth a warning.
        pass
    except Exception as e:
        # Still best-effort, but a corrupt or unreadable file should be visible.
        print(f"[Warn] Could not load knowledge graph from {path}: {e}")
        graph_dict = {}

    if not isinstance(graph_dict, dict):
        # Valid JSON of the wrong shape (list, string, ...) has no .items().
        print(f"[Warn] Unexpected knowledge graph format in {path}; expected a JSON object")
        graph_dict = {}

    G = nx.DiGraph()
    for source, edges_list in graph_dict.items():
        for relation, target in edges_list:
            G.add_edge(source, target, label=relation)

    if G.number_of_nodes() == 0:
        G.add_node("(empty)")
    return G
|
| 316 |
+
|
| 317 |
+
def precompute_layout_and_communities(G: nx.DiGraph):
    """Compute a 3D spring layout and modularity communities for *G*.

    Communities are detected with ``greedy_modularity_communities`` on an
    undirected graph built from G's edges only, so nodes without edges are
    not part of any detected community. Communities smaller than
    ``COMMUNITY_MIN_SIZE`` are handled according to ``MERGE_SMALLS_POLICY``:

    * ``"bucket"``: all small communities are merged into one ``"C_other"``.
    * ``"attach"``: each small community is merged into the large community
      it shares the most undirected edges with (first one wins on ties).
    * anything else: small communities are dropped from the numbered list.

    Every node of G that ends up without a community is assigned to
    ``"C_isolated"``.

    Returns:
        ``(pos3d, node2comm, comm2nodes, edges, node_names)`` where ``pos3d``
        is the ``nx.spring_layout`` result, ``node2comm`` maps node to
        community id, ``comm2nodes`` maps community id to a set of nodes,
        and ``edges`` / ``node_names`` are lists of G's edges and nodes.
    """
    from networkx.algorithms.community import greedy_modularity_communities

    pos3d = nx.spring_layout(G, dim=3, seed=LAYOUT_SEED, iterations=LAYOUT_ITERS)
    node_names = list(G.nodes())
    edges = list(G.edges())

    # Greedy modularity communities (on undirected projection)
    UG = nx.Graph()
    UG.add_edges_from(G.to_undirected().edges())
    if UG.number_of_edges() == 0:
        # Modularity is undefined without edges, and some networkx releases
        # divide by the edge count. UG is built from edges only, so it has
        # no nodes here either: there is nothing to detect.
        communities = []
    else:
        communities = list(greedy_modularity_communities(UG))

    large = [set(c) for c in communities if len(c) >= COMMUNITY_MIN_SIZE]
    small = [set(c) for c in communities if len(c) < COMMUNITY_MIN_SIZE]

    if MERGE_SMALLS_POLICY == "bucket" and small:
        comm_ids = [f"C{i}" for i in range(len(large))]
        other = set().union(*small)
        if other:
            large.append(other)
            comm_ids.append("C_other")
    elif MERGE_SMALLS_POLICY == "attach" and small and large:

        def _cross_links(members, target):
            # Count edges between the two node sets by walking adjacency
            # lists rather than testing every (u, v) pair.
            return sum(1 for u in members for v in UG[u] if v in target)

        for s in small:
            # max() returns the first maximum, so ties resolve to the
            # earliest large community.
            best_i = max(range(len(large)), key=lambda i, s=s: _cross_links(s, large[i]))
            large[best_i].update(s)
        comm_ids = [f"C{i}" for i in range(len(large))]
    else:
        comm_ids = [f"C{i}" for i in range(len(large))]

    node2comm, comm2nodes = {}, {}
    for cid, nodeset in zip(comm_ids, large):
        comm2nodes[cid] = set(nodeset)
        for n in nodeset:
            node2comm[n] = cid

    # Anything not covered above (e.g. nodes without edges) goes to C_isolated.
    for n in G.nodes():
        if n not in node2comm:
            node2comm[n] = "C_isolated"
            comm2nodes.setdefault("C_isolated", set()).add(n)

    return pos3d, node2comm, comm2nodes, edges, node_names
|
| 364 |
+
|
| 365 |
+
def _make_comm_colors(comm2nodes_dict):
    """Map each community id to a colour from Plotly's qualitative palettes.

    The ids are sorted and assigned colours in order from the concatenation
    of the Alphabet, Set3, Bold, Dark24 and Light24 palettes, wrapping around
    when there are more communities than colours.
    """
    qualitative = px.colors.qualitative
    palette = (
        qualitative.Alphabet
        + qualitative.Set3
        + qualitative.Bold
        + qualitative.Dark24
        + qualitative.Light24
    )
    palette_size = len(palette)

    colors = {}
    for position, cid in enumerate(sorted(comm2nodes_dict.keys())):
        colors[cid] = palette[position % palette_size]
    return colors
|
| 373 |
+
|
| 374 |
+
def _community_hulls_traces(pos3d, comm2nodes, comm_colors, opacity=0.12):
    """Build one translucent convex-hull mesh per community.

    Args:
        pos3d: mapping of node to its 3D position.
        comm2nodes: mapping of community id to the nodes it contains.
        comm_colors: mapping of community id to colour; ids missing from it
            fall back to ``"#cccccc"``. ``None`` means "derive the colours
            from ``comm2nodes``".
        opacity: opacity applied to every hull mesh.

    Returns:
        A list of ``go.Mesh3d`` traces. The list is empty when scipy is not
        available. Communities with fewer than four positioned nodes, or
        whose hull cannot be built, are skipped.
    """
    if not SCIPY_AVAILABLE:
        return []

    if comm_colors is None:
        # Compute the palette once instead of once per community.
        comm_colors = _make_comm_colors(comm2nodes)

    hull_traces = []
    for cid, nodeset in comm2nodes.items():
        pts = np.array([pos3d[n] for n in nodeset if n in pos3d])
        if pts.shape[0] < 4:
            continue
        try:
            hull = ConvexHull(pts)
            simplices = hull.simplices
            hull_traces.append(go.Mesh3d(
                x=pts[:, 0], y=pts[:, 1], z=pts[:, 2],
                i=simplices[:, 0], j=simplices[:, 1], k=simplices[:, 2],
                color=comm_colors.get(cid, "#cccccc"),
                opacity=opacity, name=f"{cid} region",
                hoverinfo="skip", showlegend=False
            ))
        except Exception:
            # Hulls are optional decoration: a community whose hull cannot be
            # built is skipped rather than failing the whole figure.
            continue
    return hull_traces
|
| 395 |
+
#########
|
| 396 |
+
def build_plotly_figure(mode="community", highlight_node=None,
                        highlight_comm_id=None, dim_inter_edges=True,
                        show_hulls=False):
    """Assemble the 3D Plotly figure for the cached knowledge graph.

    The graph, layout and communities are read from the module-level cache,
    which is filled on first use.

    Args:
        mode: ``"neighbors"`` colours the highlighted node red, its
            successors and predecessors orange, the rest of its community in
            the community colour and everything else light blue. Any other
            value colours nodes by community.
        highlight_node: node to emphasise in ``"neighbors"`` mode.
        highlight_comm_id: community whose nodes are drawn larger when not
            in ``"neighbors"`` mode.
        dim_inter_edges: draw edges between communities in a fainter colour.
        show_hulls: add the community hull meshes to the figure.

    Returns:
        A ``go.Figure`` containing hull, edge and node traces, in that order.
    """
    global _g_G, _g_pos3d, _g_node2comm, _g_comm2nodes, _g_edges, _g_node_names

    # Load & cache if not present
    if _g_G is None:
        _g_G = load_graph_from_json()
        cached = precompute_layout_and_communities(_g_G)
        _g_pos3d, _g_node2comm, _g_comm2nodes, _g_edges, _g_node_names = cached

    graph = _g_G
    layout = _g_pos3d
    membership = _g_node2comm
    groups = _g_comm2nodes
    neighbors_mode = mode == "neighbors"

    # Collect line coordinates separately for edges inside one community and
    # edges between communities; a None entry breaks the line after each edge.
    intra_xyz = ([], [], [])
    inter_xyz = ([], [], [])
    for src, dst in _g_edges:
        x0, y0, z0 = layout[src]
        x1, y1, z1 = layout[dst]
        same_community = membership.get(src) == membership.get(dst)
        xs, ys, zs = intra_xyz if same_community else inter_xyz
        xs.extend((x0, x1, None))
        ys.extend((y0, y1, None))
        zs.extend((z0, z1, None))

    edge_traces = []
    if inter_xyz[0]:
        inter_color = "rgba(180,180,180,0.30)" if dim_inter_edges else "#BBBBBB"
        edge_traces.append(go.Scatter3d(
            x=inter_xyz[0], y=inter_xyz[1], z=inter_xyz[2],
            mode="lines",
            line=dict(width=1, color=inter_color),
            hoverinfo="none", showlegend=False, name="Inter-community"
        ))
    if intra_xyz[0]:
        edge_traces.append(go.Scatter3d(
            x=intra_xyz[0], y=intra_xyz[1], z=intra_xyz[2],
            mode="lines",
            line=dict(width=2, color="rgba(120,120,120,0.55)"),
            hoverinfo="none", showlegend=False, name="Intra-community"
        ))

    comm_colors = _make_comm_colors(groups)
    if show_hulls:
        hull_traces = _community_hulls_traces(layout, groups, comm_colors)
    else:
        hull_traces = []

    # Successors and predecessors of the highlighted node (neighbors mode only).
    adjacent = set()
    if neighbors_mode and highlight_node and highlight_node in graph:
        adjacent = set(graph.neighbors(highlight_node)) | set(graph.predecessors(highlight_node))

    def marker_style(node, base_color):
        # Returns the (colour, size) pair for one node marker.
        if not neighbors_mode:
            emphasized = bool(highlight_comm_id) and membership.get(node) == highlight_comm_id
            return base_color, (6.5 if emphasized else 5.0)
        if highlight_node == node:
            return "red", 8.0
        if node in adjacent:
            return "orange", 6.5
        if highlight_node and membership.get(node) == membership.get(highlight_node):
            return base_color, 5.5
        return "lightblue", 5.0

    # One marker trace per community, in sorted community-id order.
    node_traces = []
    for cid in sorted(groups):
        base_color = comm_colors.get(cid, "#66c2a5")
        xs, ys, zs, labels, colors, sizes = [], [], [], [], [], []
        for node in groups[cid]:
            x, y, z = layout[node]
            xs.append(x)
            ys.append(y)
            zs.append(z)
            labels.append(node)
            color, size = marker_style(node, base_color)
            colors.append(color)
            sizes.append(size)
        if not xs:
            continue
        node_traces.append(go.Scatter3d(
            x=xs, y=ys, z=zs, mode="markers",
            hovertext=labels, hoverinfo="text",
            marker=dict(size=sizes, color=colors, opacity=0.95),
            name=cid, showlegend=True
        ))

    fig = go.Figure(data=hull_traces + edge_traces + node_traces)
    fig.update_layout(
        title="3D Knowledge Graph — Communities & Neighbors",
        showlegend=(mode == "community"),
        height=800,
        margin=dict(l=0, r=0, t=40, b=0),
        scene=dict(
            xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False),
            aspectmode="data"
        ),
        scene_camera=dict(eye=dict(x=1.5, y=1.5, z=1.5)),
        uirevision=True,
    )
    return fig
|
| 492 |
+
|
| 493 |
+
def reload_graph_cache():
    """Rebuild the cached graph state and return the default figure.

    Re-reads the graph through ``load_graph_from_json``, recomputes layout
    and communities, stores everything in the module-level cache, and then
    builds the figure in community mode with no highlighted community,
    dimmed inter-community edges and no hulls.
    """
    global _g_G, _g_pos3d, _g_node2comm, _g_comm2nodes, _g_edges, _g_node_names

    _g_G = load_graph_from_json()
    cached = precompute_layout_and_communities(_g_G)
    _g_pos3d, _g_node2comm, _g_comm2nodes, _g_edges, _g_node_names = cached

    default_view = dict(
        mode="community",
        highlight_comm_id=None,
        dim_inter_edges=True,
        show_hulls=False,
    )
    return build_plotly_figure(**default_view)
|
| 501 |
+
|
| 502 |
+
#########
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
with gr.Blocks(css=custom_css, fill_height=True, js=js_force_light) as demo:
|
| 507 |
+
with gr.Row():
|
| 508 |
+
# LEFT SIDE: Branding + Upload
|
| 509 |
+
with gr.Column(scale=1, elem_id="left-column"):
|
| 510 |
+
# Branding row: logo and title side by side
|
| 511 |
+
with gr.Row(elem_id="branding-row"):
|
| 512 |
+
import base64
|
| 513 |
+
from pathlib import Path
|
| 514 |
+
|
| 515 |
+
HERE = Path(__file__).resolve().parent
|
| 516 |
+
logo_path = HERE / "logo_mono.png"
|
| 517 |
+
|
| 518 |
+
with open(logo_path, "rb") as f:
|
| 519 |
+
encoded = base64.b64encode(f.read()).decode()
|
| 520 |
+
|
| 521 |
+
gr.HTML(f"""
|
| 522 |
+
<div id="branding">
|
| 523 |
+
<img id="company-logo" src="data:image/png;base64,{encoded}" alt="Logo" />
|
| 524 |
+
<span id="brand-text">mosaiicRAG</span>
|
| 525 |
+
</div>
|
| 526 |
+
""")
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
gr.Markdown(
|
| 531 |
+
"<p>Daten verstehen. Wissen vernetzen. Entscheidungen stärken.</p>",
|
| 532 |
+
elem_id="header2"
|
| 533 |
+
)
|
| 534 |
+
|
| 535 |
+
# Below branding: upload info
|
| 536 |
+
gr.Markdown(
|
| 537 |
+
"""
|
| 538 |
+
**Supported file formats:** .txt, .json, .md, .csv, .pdf, .docx, .pptx, .py
|
| 539 |
+
|
| 540 |
+
**Maximum files' size:** 5 MB
|
| 541 |
+
""",
|
| 542 |
+
elem_id="upload-note"
|
| 543 |
+
)
|
| 544 |
+
|
| 545 |
+
file_upload = gr.File(
|
| 546 |
+
label="Upload files for RAG context",
|
| 547 |
+
file_count="multiple",
|
| 548 |
+
elem_id="file-upload"
|
| 549 |
+
)
|
| 550 |
+
upload_btn = gr.Button("Upload")
|
| 551 |
+
upload_output = gr.Textbox(label="Upload status", interactive=False)
|
| 552 |
+
upload_btn.click(handle_file_upload, inputs=[file_upload], outputs=[upload_output])
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
with gr.Column(scale=4, elem_id="right-column"):
|
| 556 |
+
with gr.Tabs():
|
| 557 |
+
# ------------------------- Chat tab (unchanged) -------------------------
|
| 558 |
+
with gr.Tab("Chat"):
|
| 559 |
+
with gr.Column(elem_id="chat-area"):
|
| 560 |
+
progress_box = gr.HTML("<div style='background:#f5f5f5;padding:10px;border-radius:8px;margin-bottom:10px;'>Ready</div>")
|
| 561 |
+
chatbot = gr.Chatbot(type="messages", label="Conversation", elem_id="chatbot")
|
| 562 |
+
with gr.Row(elem_id="input-row"):
|
| 563 |
+
msg = gr.Textbox(placeholder="Type your question here...", lines=1)
|
| 564 |
+
send_btn = gr.Button("➤", elem_id="send-button", size="sm")
|
| 565 |
+
state = gr.State([])
|
| 566 |
+
msg.submit(add_user_message, inputs=[msg, state], outputs=[msg, chatbot, state])\
|
| 567 |
+
.then(generate_bot_response, inputs=[state], outputs=[chatbot, state, progress_box])
|
| 568 |
+
send_btn.click(add_user_message, inputs=[msg, state], outputs=[msg, chatbot, state])\
|
| 569 |
+
.then(generate_bot_response, inputs=[state], outputs=[chatbot, state, progress_box])
|
| 570 |
+
|
| 571 |
+
# --------------------- Knowledge Graph tab (updated) ---------------------
|
| 572 |
+
with gr.Tab("Knowledge Graph"):
|
| 573 |
+
with gr.Row():
|
| 574 |
+
color_mode = gr.Radio(
|
| 575 |
+
["community"],
|
| 576 |
+
value="community",
|
| 577 |
+
label="Color mode"
|
| 578 |
+
)
|
| 579 |
+
community_select = gr.Dropdown(
|
| 580 |
+
label="Highlight community (optional)",
|
| 581 |
+
choices=[],
|
| 582 |
+
value=None
|
| 583 |
+
)
|
| 584 |
+
view_opts = gr.CheckboxGroup(
|
| 585 |
+
choices=[
|
| 586 |
+
"Dim inter-community edges",
|
| 587 |
+
f"Show 3D community hulls{' (requires scipy)' if not SCIPY_AVAILABLE else ''}"
|
| 588 |
+
],
|
| 589 |
+
value=["Dim inter-community edges"],
|
| 590 |
+
label="View options"
|
| 591 |
+
)
|
| 592 |
+
reload_btn = gr.Button("Reload graph")
|
| 593 |
+
|
| 594 |
+
graph_plot = gr.Plot(label="3D Knowledge Graph")
|
| 595 |
+
node_info = gr.Markdown("")
|
| 596 |
+
|
| 597 |
+
# ---- functions bound to UI (defined above or inline) ----
|
| 598 |
+
def _init_graph():
|
| 599 |
+
# Rebuild cache from knowledge_graph.json and return default figure
|
| 600 |
+
fig = reload_graph_cache()
|
| 601 |
+
cids = sorted(list(_g_comm2nodes.keys())) if _g_comm2nodes else []
|
| 602 |
+
info = "Select a community or click a node to highlight its community."
|
| 603 |
+
# Use gr.update to set dropdown choices
|
| 604 |
+
return fig, gr.update(choices=cids, value=None), info
|
| 605 |
+
|
| 606 |
+
def _refresh(mode, selected_cid, opts):
|
| 607 |
+
dim_edges = isinstance(opts, list) and ("Dim inter-community edges" in opts)
|
| 608 |
+
show_hulls = isinstance(opts, list) and any("Show 3D community hulls" in s for s in opts)
|
| 609 |
+
|
| 610 |
+
fig = build_plotly_figure(
|
| 611 |
+
mode="community" if mode == "community" else "neighbors",
|
| 612 |
+
highlight_comm_id=(selected_cid if mode == "community" else None),
|
| 613 |
+
dim_inter_edges=dim_edges,
|
| 614 |
+
show_hulls=(show_hulls if mode == "community" else False)
|
| 615 |
+
)
|
| 616 |
+
info = (
|
| 617 |
+
"Select a community or click a node to highlight its community."
|
| 618 |
+
if mode == "community"
|
| 619 |
+
else "Click a node to see its neighbors (community tint applied)."
|
| 620 |
+
)
|
| 621 |
+
return fig, info
|
| 622 |
+
|
| 623 |
+
def _reload(mode, selected_cid, opts):
|
| 624 |
+
# Reload data and recompute communities/layout
|
| 625 |
+
_ = reload_graph_cache()
|
| 626 |
+
cids = sorted(list(_g_comm2nodes.keys())) if _g_comm2nodes else []
|
| 627 |
+
# Immediately apply current UI options on the new graph state
|
| 628 |
+
fig, info = _refresh(mode, selected_cid, opts)
|
| 629 |
+
return fig, gr.update(choices=cids, value=selected_cid), info
|
| 630 |
+
|
| 631 |
+
# wire controls
|
| 632 |
+
color_mode.change(_refresh, inputs=[color_mode, community_select, view_opts],
|
| 633 |
+
outputs=[graph_plot, node_info])
|
| 634 |
+
community_select.change(_refresh, inputs=[color_mode, community_select, view_opts],
|
| 635 |
+
outputs=[graph_plot, node_info])
|
| 636 |
+
view_opts.change(_refresh, inputs=[color_mode, community_select, view_opts],
|
| 637 |
+
outputs=[graph_plot, node_info])
|
| 638 |
+
|
| 639 |
+
reload_btn.click(_reload, inputs=[color_mode, community_select, view_opts],
|
| 640 |
+
outputs=[graph_plot, community_select, node_info])
|
| 641 |
+
|
| 642 |
+
# ------------------------ IMPORTANT: INSIDE THE BLOCKS ------------------------
|
| 643 |
+
# Initialize the graph once when the app loads (now inside the Blocks context)
|
| 644 |
+
demo.load(_init_graph, inputs=[], outputs=[graph_plot, community_select, node_info])
|
| 645 |
+
# -------------------- Launch App --------------------
|
| 646 |
+
if __name__ == "__main__":
|
| 647 |
+
demo.launch(inbrowser=True)
|
rag_on_prem.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pathlib
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
# LangChain imports
|
| 6 |
+
from langchain_community.document_loaders import (
|
| 7 |
+
CSVLoader, PyPDFLoader, UnstructuredWordDocumentLoader,
|
| 8 |
+
UnstructuredPowerPointLoader, UnstructuredMarkdownLoader,
|
| 9 |
+
UnstructuredHTMLLoader, NotebookLoader
|
| 10 |
+
)
|
| 11 |
+
from langchain_core.documents import Document
|
| 12 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 13 |
+
from langchain_ollama import ChatOllama, OllamaEmbeddings
|
| 14 |
+
from langchain_community.vectorstores import FAISS
|
| 15 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
| 16 |
+
|
| 17 |
+
# -------------------------
|
| 18 |
+
# UTF-8 safe Text Loader
|
| 19 |
+
# -------------------------
|
| 20 |
+
class SafeTextLoader:
    """Read a text file into a single Document, tolerating invalid UTF-8.

    Bytes that cannot be decoded as UTF-8 are dropped. Any failure while
    reading is reported on stdout and yields an empty list instead of raising.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        print(f"[Debug] Initialized SafeTextLoader with file_path: {file_path}")

    def load(self):
        """Return ``[Document]`` for the file, or ``[]`` if it cannot be read."""
        try:
            print(f"[Debug] Attempting to load file: {self.file_path}")
            # Binary read + lenient decode so a stray byte never aborts the load.
            with open(self.file_path, "rb") as handle:
                content = handle.read().decode("utf-8", errors="ignore")
            print(f"[Debug] Successfully loaded file: {self.file_path}")
            source_meta = {"source": str(self.file_path)}
            return [Document(page_content=content, metadata=source_meta)]
        except Exception as exc:
            print(f"[Error] Failed to read {self.file_path}: {exc}")
            return []
|
| 37 |
+
|
| 38 |
+
# -------------------------
|
| 39 |
+
# Loader mapping
|
| 40 |
+
# -------------------------
|
| 41 |
+
# Maps a lower-cased file suffix to the loader class used for that file type.
# Suffixes that are not listed here are skipped by create_dataset().
LOADER_MAPPING = {
    # Plain text and structured text
    ".txt": SafeTextLoader,
    ".json": SafeTextLoader,
    ".md": UnstructuredMarkdownLoader,
    ".csv": CSVLoader,
    ".yaml": SafeTextLoader,
    ".yml": SafeTextLoader,

    # Office / web documents
    ".pdf": PyPDFLoader,
    ".docx": UnstructuredWordDocumentLoader,
    ".pptx": UnstructuredPowerPointLoader,
    ".html": UnstructuredHTMLLoader,
    ".htm": UnstructuredHTMLLoader,

    # Source code and notebooks
    ".ipynb": NotebookLoader,
    ".py": SafeTextLoader,
    ".js": SafeTextLoader,
    ".sql": SafeTextLoader,
}
|
| 63 |
+
|
| 64 |
+
# -------------------------
|
| 65 |
+
# Dataset creation
|
| 66 |
+
# -------------------------
|
| 67 |
+
def create_dataset(directory_path: str = "context"):
    """Walk ``directory_path`` recursively and load every supported file.

    The loader for each file is looked up in ``LOADER_MAPPING`` by the
    file's lower-cased suffix; files without a mapped loader are skipped.
    A loader that raises is reported on stdout and the scan continues.

    Returns:
        A list with the documents produced by all loaders, or ``[]`` when
        the path is missing or is not a directory.
    """
    print(f"[Debug] Starting dataset creation for directory: {directory_path}")
    root = pathlib.Path(directory_path).resolve()

    if not (root.exists() and root.is_dir()):
        print(f"[Error] Target directory does not exist: {root}")
        return []

    collected = []

    for entry in root.rglob("*"):  # recursive
        if not entry.is_file():
            continue

        loader_factory = LOADER_MAPPING.get(entry.suffix.lower())
        if loader_factory is None:
            print(f"[Skip] Unsupported file type: {entry}")
            continue

        # Best effort: one unreadable file must not abort the whole scan.
        try:
            print(f"[Debug] Loading file: {entry}")
            loaded = loader_factory(str(entry)).load()
            collected.extend(loaded)
            print(f"[Loaded] {entry} ({len(loaded)} docs)")
        except Exception as exc:
            print(f"[Error] Failed to load {entry}: {exc}")

    print(f"[Done] Finished scanning {root}")
    print(f"Total documents loaded: {len(collected)}")
    return collected
|
| 101 |
+
|
| 102 |
+
# -------------------------
|
| 103 |
+
# Prepare RAG (Ollama + FAISS)
|
| 104 |
+
# -------------------------
|
| 105 |
+
def prepare_RAG(dir_name="context", chunk_size=600, chunk_overlap=50,
                embedding_model="nomic-embed-text", llm_model="llama3"):
    """Build the FAISS vector store and the Ollama chat model for the RAG app.

    Args:
        dir_name: Directory passed to ``create_dataset`` to load documents.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to the same splitter.
        embedding_model: Ollama model name used for ``OllamaEmbeddings``.
        llm_model: Ollama model name used for ``ChatOllama``.

    Returns:
        A ``(vectorstore, llm)`` tuple.

    Raises:
        ValueError: If ``create_dataset`` returned no documents.
    """
    print(f"[Debug] Preparing RAG with Ollama + FAISS. Context dir={dir_name}")

    documents = create_dataset(dir_name)
    if not documents:
        raise ValueError("No documents loaded. Please add files to the context directory.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print(f"[Debug] Splitting documents into chunks with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}")
    all_splits = text_splitter.split_documents(documents)

    print(f"[Debug] Number of chunks created: {len(all_splits)}")

    # Ollama embeddings (local)
    print("[Debug] Initializing Ollama embeddings")
    embeddings = OllamaEmbeddings(model=embedding_model)

    # FAISS vector store
    print("[Debug] Creating FAISS vector store")
    vectorstore = FAISS.from_documents(all_splits, embeddings)

    # Ollama LLM
    print("[Debug] Initializing Ollama LLM")
    llm = ChatOllama(model=llm_model)

    return vectorstore, llm
|
| 131 |
+
|
| 132 |
+
# -------------------------
|
| 133 |
+
# Retrieval
|
| 134 |
+
# -------------------------
|
| 135 |
+
def retrieve_RAG(query, vectorstore, top_k=5):
    """Return the ``top_k`` documents the vector store ranks closest to ``query``.

    Args:
        query: The user question to search for.
        vectorstore: Object exposing ``as_retriever(search_kwargs=...)``.
        top_k: Number of documents requested from the retriever.

    Returns:
        Whatever the retriever yields for the query (a list of documents).
    """
    print(f"[Debug] Retrieving top {top_k} documents for query: {query}")
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
    # ``invoke`` is the supported retriever entry point; the older
    # ``get_relevant_documents`` is deprecated in LangChain.
    results = retriever.invoke(query)
    print(f"[Debug] Retrieved {len(results)} documents")
    return results
|
| 141 |
+
|
| 142 |
+
# -------------------------
|
| 143 |
+
# Generation
|
| 144 |
+
# -------------------------
|
| 145 |
+
def generate_RAG(prompt_message, llm, retrieved_docs):
    """Ask the LLM to answer ``prompt_message`` from the retrieved documents.

    The ``page_content`` of every retrieved document is joined with blank
    lines and sent, together with the question, as the human message; the
    system message restricts the model to that context.

    Returns:
        The object returned by ``llm.invoke`` (its ``.content`` is logged).
    """
    print(f"[Debug] Generating response for prompt: {prompt_message}")
    system_instructions = (
        "You are an expert assistant. Use ONLY the provided context documents "
        "to answer the question. If the context does not contain the answer, reply with 'I don’t know'."
    )

    context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
    print(f"[Debug] Context for generation: {context_text[:500]}... (truncated)")

    question_block = f"Context:\n{context_text}\n\nQuestion: {prompt_message}"
    messages = [
        SystemMessage(content=system_instructions),
        HumanMessage(content=question_block),
    ]

    answer = llm.invoke(messages)
    print(f"[Debug] Generated response: {answer.content}")
    return answer
|
| 163 |
+
|
| 164 |
+
# -------------------------
|
| 165 |
+
# Gradio Chatbot
|
| 166 |
+
# -------------------------
|
| 167 |
+
def run_chatbot(user_dir="context"):
    """Prepare the RAG pipeline for ``user_dir`` and launch the Gradio chat UI.

    Args:
        user_dir: Directory whose documents are indexed via ``prepare_RAG``.

    Raises:
        ValueError: Propagated from ``prepare_RAG`` when no documents load.
    """
    print(f"[Debug] Starting chatbot with user_dir: {user_dir}")
    vectorstore, llm = prepare_RAG(dir_name=user_dir)

    # Step 1: Add user message
    def add_user_message(message, history):
        """Append the user's message to the history and clear the textbox."""
        history = history or []
        # A blank submission would otherwise be embedded and sent to the LLM
        # as an empty question, so leave the conversation untouched.
        if not (message or "").strip():
            print("[Debug] Ignoring empty user message.")
            return "", history, history
        print(f"[Debug] Adding user message: {message}")
        history.append({"role": "user", "content": message})
        return "", history, history

    # Step 2: Generate bot response
    def generate_bot_response(history):
        """Answer the last user message, if there is one awaiting a reply."""
        if not history or history[-1]["role"] != "user":
            print("[Debug] No user message to respond to.")
            return history, history

        user_msg = history[-1]["content"]
        print(f"[Debug] Generating response for user message: {user_msg}")
        retrieved_docs = retrieve_RAG(user_msg, vectorstore)
        response = generate_RAG(user_msg, llm, retrieved_docs)

        history.append({"role": "assistant", "content": response.content})
        return history, history

    with gr.Blocks() as demo:
        gr.Markdown("# 📚 On-Prem RAG Chatbot (Ollama + FAISS)")
        gr.Markdown("Ask questions about your local documents.")

        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(label="Your message")
        state = gr.State([])

        # Two-step chain: show the user's message first, then the answer.
        msg.submit(add_user_message, inputs=[msg, state], outputs=[msg, chatbot, state]) \
            .then(generate_bot_response, inputs=[state], outputs=[chatbot, state])

    demo.launch()
|
| 204 |
+
|
| 205 |
+
# -------------------------
|
| 206 |
+
# Main
|
| 207 |
+
# -------------------------
|
| 208 |
+
if __name__ == "__main__":
    # An empty answer selects the whole 'context' directory; anything else
    # is treated as a subfolder of it.
    subfolder = input("Enter a subfolder inside 'context' (press Enter for full 'context'): ").strip()
    user_dir = os.path.join("context", subfolder) if subfolder else "context"

    print(f"[Info] Using context directory: {user_dir}")
    run_chatbot(user_dir)
|
requirements.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
langchain
|
| 2 |
+
langchain-core
|
| 3 |
+
langchain-community
|
| 4 |
+
langchain-openai
|
| 5 |
+
langchain-mistralai
|
| 6 |
+
langchain-text-splitters
|
| 7 |
+
llama-index==0.14.5
|
| 8 |
+
llama-index-llms-langchain==0.7.1
|
| 9 |
+
datasets==4.1.0
|
| 10 |
+
ragas==0.3.5
|
| 11 |
+
langgraph==0.6.7
|
| 12 |
+
gradio==5.46.0
|
| 13 |
+
python-dotenv==1.1.1
|
| 14 |
+
nbformat==5.10.4
|
| 15 |
+
nbconvert==7.16.6
|
| 16 |
+
pypdf==6.0.0
|
| 17 |
+
unstructured[docx,pptx,html,md]==0.18.15
|
| 18 |
+
pinecone==7.3.0
|
| 19 |
+
plotly
|
retrieve.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pathlib
|
| 3 |
+
import time
|
| 4 |
+
import re
|
| 5 |
+
from pinecone import Pinecone
|
| 6 |
+
|
| 7 |
+
from langchain_mistralai import ChatMistralAI
|
| 8 |
+
from langchain_openai import ChatOpenAI
|
| 9 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
| 10 |
+
from langchain.schema import Document
|
| 11 |
+
from langchain_community.document_loaders import (
|
| 12 |
+
CSVLoader, PyPDFLoader, UnstructuredWordDocumentLoader,
|
| 13 |
+
UnstructuredPowerPointLoader, UnstructuredMarkdownLoader,
|
| 14 |
+
UnstructuredHTMLLoader, NotebookLoader
|
| 15 |
+
)
|
| 16 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 17 |
+
|
| 18 |
+
from llama_index.core.memory import Memory
|
| 19 |
+
|
| 20 |
+
import pickle
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
from typing import List, Any
|
| 24 |
+
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage
|
| 25 |
+
|
| 26 |
+
from typing import List, Any
|
| 27 |
+
from pydantic import BaseModel, ValidationError
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def retrieve_RAG(
    prompt_message, pc, index, kg_index, top_k=5, info=True,
    use_query_reformulation=False, llm=None, graphRAG=False,
):
    """
    Retrieve relevant document chunks and community summaries from Pinecone for a given prompt.

    - When ``llm`` is given, it is asked to split the prompt into sub-queries;
      otherwise (or if splitting fails) the whole prompt is the only sub-query.
    - Each sub-query is optionally reformulated by ``llm``, embedded through
      ``pc.inference.embed`` and used to query the "example-namespace" namespace.
    - When ``graphRAG`` is true, the "community-summaries" namespace is queried too.

    Args:
        prompt_message: The user prompt.
        pc: Client exposing ``inference.embed(...)``.
        index: Index exposing ``query(...)`` and ``describe_index_stats()``.
        kg_index: Only used for its truthiness in the graph-availability probe.
        top_k: Number of chunk matches requested per sub-query.
        info: Print debug output when true.
        use_query_reformulation: Reformulate each sub-query with ``llm`` first.
        llm: Optional chat model exposing ``invoke``.
        graphRAG: Also retrieve community summaries.

    Returns:
        ``(all_retrieved_chunks, combined_graph_context)``: a list of dicts
        with ``text``, ``source`` and ``sub_query`` keys, and one string
        combining the community summaries per sub-query ("" if none).
    """
    if info:
        print("[Debug] Starting retrieval with prompt:", prompt_message)
        print("[Debug] Top K:", top_k)
        print("[Debug] Query Reformulation Enabled:", use_query_reformulation)

    # --- Step 0: Decide context usage (standard, graph, both) ---
    def _graph_available():
        try:
            stats = index.describe_index_stats()
            namespaces = stats.get("namespaces", {}) or {}
            return "community-summaries" in namespaces
        except Exception as e:
            print(f"[Error] Failed to inspect index namespaces: {e}")
            return False

    # NOTE(review): graph_ok is computed but never consulted below; the
    # probe is kept so its observable behavior does not change. Confirm
    # whether it should gate the graphRAG branch.
    graph_ok = bool(kg_index) or _graph_available()

    # --- Step 1: Use LLM to split the prompt into sub-queries ---
    sub_queries = [prompt_message]  # fallback: single query
    if llm is not None:
        try:
            split_prompt = (
                "Given the following user query, identify and list all distinct sub-queries or tasks it contains. "
                "Return ONLY a numbered list of sub-queries, each as a concise phrase.\n\n"
                f"User Query: {prompt_message}"
            )
            split_response = llm.invoke(split_prompt)
            sub_queries = re.findall(r"\d+\.\s*(.+)", split_response.content)
            if not sub_queries:
                sub_queries = [prompt_message]
            if info:
                print(f"[Debug] Identified sub-queries: {sub_queries}")
        except Exception as e:
            print(f"[Error] Sub-query splitting failed: {e}")

    all_retrieved_chunks = []
    all_graph_context_blocks = []

    # --- Step 2: For each sub-query, retrieve context as decided ---
    for idx, sub_query in enumerate(sub_queries):
        task_prompt = sub_query.strip()

        # Optional Query Reformulation
        if use_query_reformulation and llm is not None:
            try:
                reformulation_prompt = (
                    "Reformulate the following query to focus only on the key concepts and remove any unnecessary details. "
                    "It should be suitable for vector search in RAG retrieval:\n\n"
                    f"Original Query: {task_prompt}"
                )
                reformulated_response = llm.invoke(reformulation_prompt)
                task_prompt = reformulated_response.content.strip()
                if info:
                    print(f"[Debug] Reformulated Query for sub-query {idx+1}: {task_prompt}")
            except Exception as e:
                print(f"[Error] Query reformulation failed for sub-query {idx+1}: {e}")

        # Embed the sub-query
        query_embedding = pc.inference.embed(
            model="llama-text-embed-v2",
            inputs=[task_prompt],
            parameters={"input_type": "query"}
        )
        if info:
            print(f"[Debug] Query embedding generated for sub-query {idx+1}.")
        qvec = query_embedding[0].values

        # --- Retrieve standard document chunks ---
        try:
            retrieved_chunks_raw = index.query(
                namespace="example-namespace",
                vector=qvec,
                top_k=top_k,
                include_values=False,
                include_metadata=True
            )
            retrieved_chunks = []
            for match in retrieved_chunks_raw.matches:
                # Tolerate matches without metadata, like the community branch.
                meta = match.metadata or {}
                retrieved_chunks.append({
                    "text": meta.get("text", ""),
                    "source": meta.get("source", "Unknown source"),
                    "sub_query": sub_query
                })
            all_retrieved_chunks.extend(retrieved_chunks)
            # Only log when there is a match to describe; with zero matches
            # there is no "last match" to print.
            if info and retrieved_chunks:
                text = retrieved_chunks[-1]["text"]
                source = retrieved_chunks[-1]["source"]
                print(f"[Debug] Match processed for sub-query {idx+1}: text='{text[:50]}...', source='{source}'")
        except Exception as e:
            print(f"[Error] Standard retrieval failed for sub-query {idx+1}: {e}")

        # --- Retrieve community summaries when graphRAG is enabled ---
        if graphRAG:
            COMMUNITY_NAMESPACE = "community-summaries"
            TOP_K_SUMMARIES = 5
            try:
                comm_matches = index.query(
                    namespace=COMMUNITY_NAMESPACE,
                    vector=qvec,
                    top_k=TOP_K_SUMMARIES,
                    include_values=False,
                    include_metadata=True
                )
                blocks = []
                for m in comm_matches.matches:
                    meta = m.metadata or {}
                    txt = meta.get("text", "")
                    cid = meta.get("community_id", "NA")
                    level = meta.get("level", -1)
                    size = meta.get("size", 0)
                    block = f"[Community {cid} \n level={level} \n size={size}]\n{txt}"
                    blocks.append(block)
                graph_context_str = ("\n\n---\n\n").join(blocks)
                all_graph_context_blocks.append((sub_query, graph_context_str))
                if info:
                    print(f"[Community] Retrieved {len(blocks)} community summaries for sub-query {idx+1}.")
            except Exception as e:
                print(f"[Error] Community summaries retrieval failed for sub-query {idx+1}: {e}")

    # --- Step 3: Aggregate results ---
    # Sub-queries whose community context is empty are left out.
    combined_graph_context = "\n\n====\n\n".join(
        f"Sub-query: {sub_query}\n{context}"
        for (sub_query, context) in all_graph_context_blocks if context
    )

    if info:
        sources = {os.path.basename(chunk['source']) for chunk in all_retrieved_chunks}
        print(f"[Debug] Final retrieval: {len(all_retrieved_chunks)} chunks from {len(sources)} sources, "
              f"graph context length {len(combined_graph_context)}.")

    # --- Return as before ---
    return all_retrieved_chunks, combined_graph_context
|
| 179 |
+
|
| 180 |
+
|