ArthurSrz Claude commited on
Commit
70ab3b6
·
1 Parent(s): d341c77

feat: Add complete nano-graphrag source code

Browse files

- Add all nano-graphrag source files to Space
- Remove submodule reference and add as regular files
- This ensures nano-graphrag can be installed locally with -e ./nano-graphrag

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. nano-graphrag +0 -1
  2. nano-graphrag/.coveragerc +11 -0
  3. nano-graphrag/.env.example.azure +7 -0
  4. nano-graphrag/.github/workflows/test.yml +58 -0
  5. nano-graphrag/.gitignore +183 -0
  6. nano-graphrag/LICENSE +21 -0
  7. nano-graphrag/MANIFEST.in +1 -0
  8. nano-graphrag/docs/CONTRIBUTING.md +19 -0
  9. nano-graphrag/docs/FAQ.md +41 -0
  10. nano-graphrag/docs/ROADMAP.md +25 -0
  11. nano-graphrag/docs/benchmark-dspy-entity-extraction.md +276 -0
  12. nano-graphrag/docs/benchmark-en.md +150 -0
  13. nano-graphrag/docs/benchmark-zh.md +91 -0
  14. nano-graphrag/docs/use_neo4j_for_graphrag.md +27 -0
  15. nano-graphrag/examples/benchmarks/dspy_entity.py +152 -0
  16. nano-graphrag/examples/benchmarks/eval_naive_graphrag_on_multi_hop.ipynb +432 -0
  17. nano-graphrag/examples/benchmarks/hnsw_vs_nano_vector_storage.py +78 -0
  18. nano-graphrag/examples/benchmarks/md5_vs_xxhash.py +54 -0
  19. nano-graphrag/examples/finetune_entity_relationship_dspy.ipynb +0 -0
  20. nano-graphrag/examples/generate_entity_relationship_dspy.ipynb +0 -0
  21. nano-graphrag/examples/graphml_visualize.py +282 -0
  22. nano-graphrag/examples/no_openai_key_at_all.py +111 -0
  23. nano-graphrag/examples/using_amazon_bedrock.py +19 -0
  24. nano-graphrag/examples/using_custom_chunking_method.py +43 -0
  25. nano-graphrag/examples/using_deepseek_api_as_llm+glm_api_as_embedding.py +136 -0
  26. nano-graphrag/examples/using_deepseek_as_llm.py +98 -0
  27. nano-graphrag/examples/using_dspy_entity_extraction.py +144 -0
  28. nano-graphrag/examples/using_faiss_as_vextorDB.py +97 -0
  29. nano-graphrag/examples/using_hnsw_as_vectorDB.py +129 -0
  30. nano-graphrag/examples/using_llm_api_as_llm+ollama_embedding.py +122 -0
  31. nano-graphrag/examples/using_local_embedding_model.py +38 -0
  32. nano-graphrag/examples/using_milvus_as_vectorDB.py +94 -0
  33. nano-graphrag/examples/using_ollama_as_llm.py +96 -0
  34. nano-graphrag/examples/using_ollama_as_llm_and_embedding.py +120 -0
  35. nano-graphrag/examples/using_qdrant_as_vectorDB.py +113 -0
  36. nano-graphrag/nano_graphrag/__init__.py +7 -0
  37. nano-graphrag/nano_graphrag/_llm.py +294 -0
  38. nano-graphrag/nano_graphrag/_op.py +1140 -0
  39. nano-graphrag/nano_graphrag/_splitter.py +94 -0
  40. nano-graphrag/nano_graphrag/_storage/__init__.py +5 -0
  41. nano-graphrag/nano_graphrag/_storage/gdb_neo4j.py +529 -0
  42. nano-graphrag/nano_graphrag/_storage/gdb_networkx.py +268 -0
  43. nano-graphrag/nano_graphrag/_storage/kv_json.py +46 -0
  44. nano-graphrag/nano_graphrag/_storage/vdb_hnswlib.py +141 -0
  45. nano-graphrag/nano_graphrag/_storage/vdb_nanovectordb.py +68 -0
  46. nano-graphrag/nano_graphrag/_utils.py +305 -0
  47. nano-graphrag/nano_graphrag/base.py +186 -0
  48. nano-graphrag/nano_graphrag/entity_extraction/__init__.py +0 -0
  49. nano-graphrag/nano_graphrag/entity_extraction/extract.py +171 -0
  50. nano-graphrag/nano_graphrag/entity_extraction/metric.py +62 -0
nano-graphrag DELETED
@@ -1 +0,0 @@
1
- Subproject commit 01f429e8c562e8f19b2449f90cec9a4a67d4f6ee
 
 
nano-graphrag/.coveragerc ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [report]
2
+ exclude_lines =
3
+ # Have to re-enable the standard pragma
4
+ pragma: no cover
5
+
6
+ # Don't complain if tests don't hit defensive assertion code:
7
+ raise NotImplementedError
8
+ logger.
9
+ omit =
10
+ # Don't have a nice github action for neo4j now, so skip this file:
11
+ nano_graphrag/_storage/gdb_neo4j.py
nano-graphrag/.env.example.azure ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ API_KEY_EMB="<your azure openai key for embedding>"
2
+ AZURE_ENDPOINT_EMB="<your azure openai endpoint for embedding>"
3
+ API_VERSION_EMB="<api version>"
4
+
5
+ AZURE_OPENAI_API_KEY="<your azure openai key for embedding>"
6
+ AZURE_OPENAI_ENDPOINT="<AZURE_OPENAI_ENDPOINT>"
7
+ OPENAI_API_VERSION="<OPENAI_API_VERSION>"
nano-graphrag/.github/workflows/test.yml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: test
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ - dev
8
+ paths-ignore:
9
+ - '**/*.md'
10
+ - '**/*.ipynb'
11
+ - 'examples/**'
12
+ pull_request:
13
+ branches:
14
+ - main
15
+ - dev
16
+ paths-ignore:
17
+ - '**/*.md'
18
+ - '**/*.ipynb'
19
+ - 'examples/**'
20
+
21
+ jobs:
22
+ test:
23
+ name: Tests on ${{ matrix.os }} for ${{ matrix.python-version }}
24
+ strategy:
25
+ matrix:
26
+ python-version: [3.9]
27
+ os: [ubuntu-latest]
28
+ runs-on: ${{ matrix.os }}
29
+ timeout-minutes: 10
30
+ steps:
31
+ - uses: actions/checkout@v4
32
+ - name: Set up Python ${{ matrix.python-version }}
33
+ uses: actions/setup-python@v3
34
+ with:
35
+ python-version: ${{ matrix.python-version }}
36
+ - name: Install dependencies
37
+ run: |
38
+ python -m pip install --upgrade pip
39
+ pip install -r requirements.txt
40
+ pip install -r requirements-dev.txt
41
+ - name: Lint with flake8
42
+ run: |
43
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
44
+ - name: Build and Test
45
+ env:
46
+ NANO_GRAPHRAG_TEST_IGNORE_NEO4J: true
47
+ run: |
48
+ python -m pytest -o log_cli=true -o log_cli_level="INFO" --cov=nano_graphrag --cov-report=xml -v ./
49
+ - name: Check codecov file
50
+ id: check_files
51
+ uses: andstor/file-existence-action@v1
52
+ with:
53
+ files: './coverage.xml'
54
+ - name: Upload coverage from test to Codecov
55
+ uses: codecov/codecov-action@v2
56
+ with:
57
+ file: ./coverage.xml
58
+ token: ${{ secrets.CODECOV_TOKEN }}
nano-graphrag/.gitignore ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
3
+ test_cache.json
4
+ run_test*.py
5
+ nano_graphrag_cache*/
6
+ *.txt
7
+ examples/benchmarks/fixtures/
8
+ tests/original_workflow.txt
9
+ ### Python ###
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+ .vscode
15
+ .DS_Store
16
+ # C extensions
17
+ *.so
18
+
19
+ # Distribution / packaging
20
+ .Python
21
+ build/
22
+ develop-eggs/
23
+ dist/
24
+ downloads/
25
+ eggs/
26
+ .eggs/
27
+ lib/
28
+ lib64/
29
+ parts/
30
+ sdist/
31
+ var/
32
+ wheels/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py,cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+ cover/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+ .webassets-cache
77
+
78
+ # Scrapy stuff:
79
+ .scrapy
80
+
81
+ # Sphinx documentation
82
+ docs/_build/
83
+
84
+ # PyBuilder
85
+ .pybuilder/
86
+ target/
87
+
88
+ # Jupyter Notebook
89
+ .ipynb_checkpoints
90
+
91
+ # IPython
92
+ profile_default/
93
+ ipython_config.py
94
+
95
+ # pyenv
96
+ # For a library or package, you might want to ignore these files since the code is
97
+ # intended to run in multiple environments; otherwise, check them in:
98
+ # .python-version
99
+
100
+ # pipenv
101
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
102
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
103
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
104
+ # install all needed dependencies.
105
+ #Pipfile.lock
106
+
107
+ # poetry
108
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
109
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
110
+ # commonly ignored for libraries.
111
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
112
+ #poetry.lock
113
+
114
+ # pdm
115
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
116
+ #pdm.lock
117
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
118
+ # in version control.
119
+ # https://pdm.fming.dev/#use-with-ide
120
+ .pdm.toml
121
+
122
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
123
+ __pypackages__/
124
+
125
+ # Celery stuff
126
+ celerybeat-schedule
127
+ celerybeat.pid
128
+
129
+ # SageMath parsed files
130
+ *.sage.py
131
+
132
+ # Environments
133
+ .env
134
+ .venv
135
+ env/
136
+ venv/
137
+ ENV/
138
+ env.bak/
139
+ venv.bak/
140
+
141
+ # Spyder project settings
142
+ .spyderproject
143
+ .spyproject
144
+
145
+ # Rope project settings
146
+ .ropeproject
147
+
148
+ # mkdocs documentation
149
+ /site
150
+
151
+ # mypy
152
+ .mypy_cache/
153
+ .dmypy.json
154
+ dmypy.json
155
+
156
+ # Pyre type checker
157
+ .pyre/
158
+
159
+ # pytype static type analyzer
160
+ .pytype/
161
+
162
+ # Cython debug symbols
163
+ cython_debug/
164
+
165
+ # PyCharm
166
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
167
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
168
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
169
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
170
+ #.idea/
171
+
172
+ ### Python Patch ###
173
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
174
+ poetry.toml
175
+
176
+ # ruff
177
+ .ruff_cache/
178
+
179
+ # LSP config files
180
+ pyrightconfig.json
181
+
182
+ # End of https://www.toptal.com/developers/gitignore/api/python
183
+
nano-graphrag/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Gustavo Ye
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
nano-graphrag/MANIFEST.in ADDED
@@ -0,0 +1 @@
 
 
1
+ include readme.md
nano-graphrag/docs/CONTRIBUTING.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to nano-graphrag
2
+
3
+ ### Submit your Contribution through PR
4
+
5
+ To make a contribution, follow these steps:
6
+
7
+ 1. Fork and clone this repository
8
+ 2. If you modified the core code (`./nano_graphrag`), please add tests for it
9
+ 3. **Include proper documentation / docstring or examples**
10
+ 4. Ensure that all tests pass by running `pytest`
11
+ 5. Submit a pull request
12
+
13
+ For more details about pull requests, please read [GitHub's guides](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).
14
+
15
+
16
+
17
+ ### Only add a dependency when we have to
18
+
19
+ `nano-graphrag` needs to be `nano` and `light`. If we want to add more features, we add them smartly. Don't introduce a huge dependency just for a simple function.
nano-graphrag/docs/FAQ.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### `Leiden.EmptyNetworkError:EmptyNetworkError`
2
+
3
+ This error occurs when `nano-graphrag` tries to compute communities on an empty network. In most cases, this is caused by the LLM model you're using: it fails to extract any entities or relations, so the graph is empty.
4
+
5
+ Try to use another bigger LLM, or here are some ideas to fix it:
6
+
7
+ - Check the response from the LLM, make sure the result fits the desired response format of the extracting entities prompt.
8
+
9
+ The desired response format is something like that:
10
+
11
+ ```text
12
+ ("entity"<|>"Cruz"<|>"person"<|>"Cruz is associated with a vision of control and order, influencing the dynamics among other characters.")
13
+ ```
14
+
15
+ - Some LLMs may not return the format like above, so one possible solution is to add a system instruction to the input prompt, such like:
16
+ ```json
17
+ {
18
+ "role": "system",
19
+ "content": "You are an intelligent assistant and will follow the instructions given to you to fulfill the goal. The answer should be in the format as in the given example."
20
+ }
21
+ ```
22
+ You can use this system_prompt as the default for your LLM calling function.
23
+
24
+
25
+ ### One possible reason of 'Processed 42 chunks,0 entities(duplicated),0 relations(duplicated)WARNING:nano-graphrag:Didn't extract any entities, maybe your LLM is not working WARNING:nano-graphrag:No new entities found'
26
+
27
+ The default num_ctx of ollama is 2048 which is too small for the input prompt of entity extraction. This causes the model to fail to respond correctly.
28
+
29
+ Solution:
30
+ Each model in Ollama has a configuration file. Here, you need to generate a new configuration file based on the original one, and then use this configuration file to generate a new model.
31
+ For example, for the qwen2 model, run the following command:
32
+
33
+ `ollama show --modelfile qwen2 > Modelfile`
34
+
35
+ Add a new line into this file below the 'FROM':
36
+
37
+ `PARAMETER num_ctx 32000`
38
+
39
+ `ollama create -f Modelfile qwen2:ctx32k`
40
+
41
+ Afterwards, you can use qwen2:ctx32k to replace qwen2.
nano-graphrag/docs/ROADMAP.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Next Version
2
+
3
+ - [ ] Add DSPy for prompt-tuning so that small models (Qwen2 7B, Llama 3.1 8B, ...) can extract entities. @NumberChiffre @gusye1234
4
+ - [ ] Optimize Algorithm: add `global_local` query method, globally rewrite query then perform local search.
5
+
6
+
7
+
8
+ ## In next few versions
9
+
10
+ - [ ] Add rate limiter: support token limit (tokens per second, per minute)
11
+
12
+ - [ ] Add other advanced RAG algorithms, candidates:
13
+
14
+ - [ ] [HybridRAG](https://arxiv.org/abs/2408.04948)
15
+ - [ ] [HippoRAG](https://arxiv.org/abs/2405.14831)
16
+
17
+
18
+
19
+
20
+
21
+
22
+ ## Interesting directions
23
+
24
+ - [ ] Add [Sciphi Triplex](https://huggingface.co/SciPhi/Triplex) as the entity extraction model.
25
+ - [ ] Add new components, see [issue](https://github.com/gusye1234/nano-graphrag/issues/2)
nano-graphrag/docs/benchmark-dspy-entity-extraction.md ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chain Of Thought Prompting with DSPy-AI (v2.4.16)
2
+ ## Main Takeaways
3
+ - Time difference: 156.99 seconds
4
+ - Execution time with DSPy-AI: 304.38 seconds
5
+ - Execution time without DSPy-AI: 147.39 seconds
6
+ - Entities extracted: 22 (without DSPy-AI) vs 37 (with DSPy-AI)
7
+ - Relationships extracted: 21 (without DSPy-AI) vs 36 (with DSPy-AI)
8
+
9
+
10
+ ## Results
11
+ ```markdown
12
+ > python examples/benchmarks/dspy_entity.py
13
+
14
+ Running benchmark with DSPy-AI:
15
+ INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
16
+ INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
17
+ INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
18
+ DEBUG:nano-graphrag:Entities: 14 | Missed Entities: 23 | Total Entities: 37
19
+ DEBUG:nano-graphrag:Relationships: 13 | Missed Relationships: 23 | Total Relationships: 36
20
+ DEBUG:nano-graphrag:Direct Relationships: 31 | Second-order: 5 | Third-order: 0 | Total Relationships: 36
21
+ ⠙ Processed 1 chunks, 37 entities(duplicated), 36 relations(duplicated)
22
+ Execution time with DSPy-AI: 304.38 seconds
23
+
24
+ Entities:
25
+ - 朱元璋 (PERSON):
26
+ 明朝开国皇帝,原名朱重八,后改名朱元璋。他出身贫农,经历了从放牛娃到皇帝的传奇人生。在元朝末年,他参加了红巾军起义,最终推翻元朝,建立了明朝。
27
+ - 朱五四 (PERSON):
28
+ 朱元璋的父亲,农民出身,家境贫寒。他在朱元璋幼年时去世,对朱元璋的成长和人生选择产生了深远影响。
29
+ - 陈氏 (PERSON):
30
+ 朱元璋的母亲,农民出身,家境贫寒。她在朱元璋幼年时去世,对朱元璋的成长和人生选择产生了深远影响。
31
+ - 汤和 (PERSON):
32
+ 朱元璋的幼年朋友,后来成为朱元璋起义军中的重要将领。他在朱元璋早期的发展中起到了关键作用。
33
+ - 郭子兴 (PERSON):
34
+ 红巾军起义的领导人之一,朱元璋的岳父。他在朱元璋早期的发展中起到了重要作用,但后来与朱元璋产生了矛盾。
35
+ - 马姑娘 (PERSON):
36
+ 郭子兴的义女,朱元璋的妻子。她在朱元璋最困难的时候给予了极大的支持,是朱元璋成功的重要因素之一。
37
+ - 元朝 (ORGANIZATION):
38
+ 中国历史上的一个朝代,由蒙古族建立。元朝末年,社会矛盾激化,最终导致了红巾军起义和明朝的建立。
39
+ - 红巾军 (ORGANIZATION):
40
+ 元朝末年起义军的一支,主要由农民组成。朱元璋最初加入的就是红巾军,并在其中逐渐崭露头角。
41
+ - 皇觉寺 (LOCATION):
42
+ 朱元璋早年出家的地方,位于安徽凤阳。他在寺庙中度过了几年的时光,这段经历对他的人生观和价值观产生了深远影响。
43
+ - 濠州 (LOCATION):
44
+ 朱元璋早期活动的重要地点,也是红巾军的重要据点之一。朱元璋在这里经历了许多重要事件,包括与郭子兴的矛盾和最终的离开。
45
+ - 1328年 (DATE):
46
+ 朱元璋出生的年份。这一年标志着明朝开国皇帝传奇人生的开始。
47
+ - 1344年 (DATE):
48
+ 朱元璋家庭遭遇重大变故的年份,他的父母在这一年相继去世。这一事件对朱元璋的人生选择产生了深远影响。
49
+ - 1352年 (DATE):
50
+ 朱元璋正式加入红巾军起义的年份。这一年标志着朱元璋从农民到起义军领袖的转变。
51
+ - 1368年 (DATE):
52
+ 朱元璋推翻元朝,建立明朝的年份。这一年标志着朱元璋从起义军领袖到皇帝的转变。
53
+ - 朱百六 (PERSON):
54
+ 朱元璋的高祖,名字具有元朝时期老百姓命名的特点,即以数字命名。
55
+ - 朱四九 (PERSON):
56
+ 朱元璋的曾祖,名字同样具有元朝时期老百姓命名的特点,即以数字命名。
57
+ - 朱初一 (PERSON):
58
+ 朱元璋的祖父,名字具有元朝时期老百姓命名的特点,即以数字命名。
59
+ - 刘德 (PERSON):
60
+ 朱元璋早年为其放牛的地主,对朱元璋的童年生活有重要影响。
61
+ - 韩山童 (PERSON):
62
+ 红巾军起义的早期领导人之一,与刘福通共同起义,对朱元璋的起义选择有间接影响。
63
+ - 刘福通 (PERSON):
64
+ 红巾军起义的早期领导人之一,与韩山童共同起义,对朱元璋的起义选择有间接影响。
65
+ - 脱脱 (PERSON):
66
+ 元朝末年的著名宰相,主张治理黄河,但他的政策间接导致了红巾军起义的爆发。
67
+ - 元顺帝 (PERSON):
68
+ 元朝末代皇帝,他在位期间元朝社会矛盾激化,最终导致了红巾军起义和明朝的建立。
69
+ - 孙德崖 (PERSON):
70
+ 红巾军起义的领导人之一,与郭子兴有矛盾,曾绑架郭子兴,对朱元璋的早期发展有重要影响。
71
+ - 周德兴 (PERSON):
72
+ 朱元璋的早期朋友,曾为朱元璋算卦,对朱元璋的人生选择有一定影响。
73
+ - 徐达 (PERSON):
74
+ 朱元璋早期的重要将领,后来成为明朝的开国功臣之一。
75
+ - 明教 (RELIGION):
76
+ 朱元璋在起义过程中接触到的宗教信仰,对他的思想和行动有一定影响。
77
+ - 弥勒佛 (RELIGION):
78
+ 明教中的重要神祇,朱元璋相信弥勒佛会降世,对他的信仰和行动有一定影响。
79
+ - 颖州 (LOCATION):
80
+ 朱元璋早年讨饭的地方,也是红巾军起义的重要地点之一。
81
+ - 定远 (LOCATION):
82
+ 朱元璋早期攻打的地点之一,是他军事生涯的起点。
83
+ - 怀远 (LOCATION):
84
+ 朱元璋早期攻打的地点之一,是他军事生涯的起点。
85
+ - 安奉 (LOCATION):
86
+ 朱元璋早期攻打的地点之一,是他军事生涯的起点。
87
+ - 含山 (LOCATION):
88
+ 朱元璋早期攻打的地点之一,是他军事生涯的起点。
89
+ - 虹县 (LOCATION):
90
+ 朱元璋早期攻打的地点之一,是他军事生涯的起点。
91
+ - 钟离 (LOCATION):
92
+ 朱元璋的家乡,他在此地召集了二十四位重要将领。
93
+ - 黄河 (LOCATION):
94
+ 元朝末年黄河泛滥,导致了严重的社会问题,间接引发了红巾军起义。
95
+ - 淮河 (LOCATION):
96
+ 元朝末年淮河沿岸遭遇严重瘟疫和旱灾,加剧了社会矛盾。
97
+ - 1351年 (DATE):
98
+ 红巾军起义爆发的年份,对朱元璋的人生选择产生了重要影响。
99
+
100
+ Relationships:
101
+ - 朱元璋 -> 朱五四:
102
+ 朱元璋是朱五四的儿子,朱五四的去世对朱元璋的成长和人生选择产生了深远影响。
103
+ - 朱元璋 -> 陈氏:
104
+ 朱元璋是陈氏的儿子,陈氏的去世对朱元璋的成长和人生选择产生了深远影响。
105
+ - 朱元璋 -> 汤和:
106
+ 汤和是朱元璋的幼年朋友,后来成为朱元璋起义军中的重要将领,对朱元璋早期的发展起到了关键作用。
107
+ - 朱元璋 -> 郭子兴:
108
+ 郭子兴是朱元璋的岳父,也是红巾军起义的领导人之一。他在朱元璋早期的发展中起到了重要作用,但后来与朱元璋产生了矛盾。
109
+ - 朱元璋 -> 马姑娘:
110
+ 马姑娘是朱元璋的妻子,她在朱元璋最困难的时候给予了极大的支持,是朱元璋成功的重要因素之一。
111
+ - 朱元璋 -> 元朝:
112
+ 朱元璋在元朝末年参加了红巾军起义,最终推翻了元朝,建立了明朝。
113
+ - 朱元璋 -> 红巾军:
114
+ 朱元璋最初加入的是红巾军,并在其中逐渐崭露头角,最终成为起义军的重要领导人。
115
+ - 朱元璋 -> 皇觉寺:
116
+ 朱元璋早年出家的地方是皇觉寺,这段经历对他的人生观和价值观产生了深远影响。
117
+ - 朱元璋 -> 濠州:
118
+ 濠州是朱元璋早期活动的重要地点,也是红巾军的重要据点之一。朱元璋在这里经历了许多重要事件,包括与郭子兴的矛盾和最终的离开。
119
+ - 朱元璋 -> 1328年:
120
+ 1328年是朱元璋出生的年份,这一年标志着明朝开国皇帝传奇人生的开始。
121
+ - 朱元璋 -> 1344年:
122
+ 1344年是朱元璋家庭遭遇重大变故的年份,他的父母在这一年相继去世,这一事件对朱元璋的人生选择产生了深远影响。
123
+ - 朱元璋 -> 1352年:
124
+ 1352年是朱元璋正式加入红巾军起义的年份,这一年标志着朱元璋从农民到起义军领袖的转变。
125
+ - 朱元璋 -> 1368年:
126
+ 1368年是朱元璋推翻元朝,建立明朝的年份,这一年标志着朱元璋从起义军领袖到皇帝的转变。
127
+ - 朱元璋 -> 朱百六:
128
+ 朱百六是朱元璋的高祖,对朱元璋的家族背景有重要影响。
129
+ - 朱元璋 -> 朱四九:
130
+ 朱四九是朱元璋的曾祖,对朱元璋的家族背景有重要影响。
131
+ - 朱元璋 -> 朱初一:
132
+ 朱初一是朱元璋的祖父,对朱元璋的家族背景有重要影响。
133
+ - 朱元璋 -> 刘德:
134
+ 刘德是朱元璋早年为其放牛的地主,对朱元璋的童年生活有重要影响。
135
+ - 朱元璋 -> 韩山童:
136
+ 韩山童是红巾军起义的早期领导人之一,对朱元璋的起义选择有间接影响。
137
+ - 朱元璋 -> 刘福通:
138
+ 刘福通是红巾军起义的早期领导人之一,对朱元璋的起义选择有间接影响。
139
+ - 朱元璋 -> 脱脱:
140
+ 脱脱是元朝末年的著名宰相,他的政策间接导致了红巾军起义的爆发,对朱元璋的起义选择有间接影响。
141
+ - 朱元璋 -> 元顺帝:
142
+ 元顺帝是元朝末代皇帝,他在位期间社会矛盾激化,最终导致了红巾军起义和明朝的建立,对朱元璋的起义选择有重要影响。
143
+ - 朱元璋 -> 孙德崖:
144
+ 孙德崖是红巾军起义的领导人之一,与郭子兴有矛盾,曾绑架郭子兴,对朱元璋的早期发展有重要影响。
145
+ - 朱元璋 -> 周德兴:
146
+ 周德兴是朱元璋的早期朋友,曾为朱元璋算卦,对朱元璋的人生选择有一定影响。
147
+ - 朱元璋 -> 徐达:
148
+ 徐达是朱元璋早期的重要将领,后来成为明朝的开国功臣之一,对朱元璋的军事生涯有重要影响。
149
+ - 朱元璋 -> 明教:
150
+ 朱元璋在起义过程中接触到的宗教信仰,对他的思想和行动有一定影响。
151
+ - 朱元璋 -> 弥勒佛:
152
+ 朱元璋相信弥勒佛会降世,对他的信仰和行动有一定影响。
153
+ - 朱元璋 -> 颖州:
154
+ 颖州是朱元璋早年讨饭的地方,也是红巾军起义的重要地点之一,对朱元璋的早期生活有重要影响。
155
+ - 朱元璋 -> 定远:
156
+ 定远是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。
157
+ - 朱元璋 -> 怀远:
158
+ 怀远是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。
159
+ - 朱元璋 -> 安奉:
160
+ 安奉是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。
161
+ - 朱元璋 -> 含山:
162
+ 含山是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。
163
+ - 朱元璋 -> 虹县:
164
+ 虹县是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。
165
+ - 朱元璋 -> 钟离:
166
+ 钟离是朱元璋的家乡,他在此地召集了二十四位重要将领,对朱元璋的军事发展有重要影响。
167
+ - 朱元璋 -> 黄河:
168
+ 元朝末年黄河泛滥,导致了严重的社会问题,间接引发了红巾军起义,对朱元璋的起义选择有重要影响。
169
+ - 朱元璋 -> 淮河:
170
+ 元朝末年淮河沿岸遭遇严重瘟疫和旱灾,加剧了社会矛盾,对朱元璋的起义选择有重要影响。
171
+ - 朱元璋 -> 1351年:
172
+ 1351年是红巾军起义爆发的年份,对朱元璋的人生选择产生了重要影响。
173
+ Running benchmark without DSPy-AI:
174
+ INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
175
+ INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
176
+ ⠙ Processed 1 chunks, 22 entities(duplicated), 21 relations(duplicated)
177
+ Execution time without DSPy-AI: 147.39 seconds
178
+
179
+ Entities:
180
+ - "朱元璋" ("PERSON"):
181
+ "朱元璋,原名朱重八,后改名朱元璋,是明朝的开国皇帝。他出身贫农,经历了从放牛娃到和尚,再到起义军领袖,最终成为皇帝的传奇人生。"
182
+ - "朱五四" ("PERSON"):
183
+ "朱五四,朱元璋的父亲,是一个农民,为地主种地,家境贫寒。"
184
+ - "陈氏" ("PERSON"):
185
+ "陈氏,朱元璋的母亲,是一个农民,与丈夫朱五四一起辛勤劳作,家境贫寒。"
186
+ - "汤和" ("PERSON"):
187
+ "汤和,朱元璋的幼年朋友,后来成为朱元璋的战友,在朱元璋的崛起过程中起到了重要作用。"
188
+ - "郭子兴" ("PERSON"):
189
+ "郭子兴,濠州城的守卫者,是朱元璋的岳父,也是朱元璋早期的重要支持者。"
190
+ - "韩山童" ("PERSON"):
191
+ "韩山童,与刘福通一起起义反抗元朝统治,是元末农民起义的重要领袖之一。"<SEP>"韩山童,元末农民起义的领袖之一,自称宋朝皇室后裔,与刘福通一起起义。"
192
+ - "刘福通" ("PERSON"):
193
+ "刘福通,与韩山童一起起义反抗元朝统治,是元末农民起义的重要领袖之一。"<SEP>"刘福通,元末农民起义的领袖之一,自称刘光世大将的后人,与韩山童一起起义。"
194
+ - "元朝" ("ORGANIZATION"):
195
+ "元朝,由蒙古族建立的王朝,统治中国时期实行了严格的等级制度,导致社会矛盾激化,最终被朱元璋领导的起义军推翻。"
196
+ - "皇觉寺" ("ORGANIZATION"):
197
+ "皇觉寺,朱元璋曾经在此当和尚,从事杂役工作,后来因饥荒严重,和尚们都被派出去化缘。"
198
+ - "白莲教" ("ORGANIZATION"):
199
+ "白莲教,元末农民起义中的一种宗教组织,韩山童和刘福通起义时利用了这一宗教信仰。"
200
+ - "濠州城" ("GEO"):
201
+ "濠州城,位于今安徽省,是朱元璋早期活动的重要地点,也是郭子兴的驻地。"
202
+ - "定远" ("GEO"):
203
+ "定远,朱元璋奉命攻击的地方,成功攻克后在元军回援前撤出,显示了其军事才能。"
204
+ - "钟离" ("GEO"):
205
+ "钟离,朱元璋的家乡,他在此招收了二十四名壮丁,这些人后来成为明朝的高级干部。"
206
+ - "元末农民起义" ("EVENT"):
207
+ "元末农民起义,是元朝末年由韩山童、刘福通等人领导的反抗元朝统治的大规模起义,最终导致了元朝的灭亡。"
208
+ - "马姑娘" ("PERSON"):
209
+ "马姑娘,郭子兴的义女,后来成为朱元璋的妻子,在朱元璋被关押时,她冒着危险送饭给朱元璋,表现出深厚的感情。"
210
+ - "孙德崖" ("PERSON"):
211
+ "孙德崖,与郭子兴有矛盾的起义军领袖之一,曾参与绑架郭子兴。"
212
+ - "徐达" ("PERSON"):
213
+ "徐达,朱元璋的二十四名亲信之一,后来成为明朝的重要将领。"
214
+ - "周德兴" ("PERSON"):
215
+ "周德兴,朱元璋的二十四名亲信之一,曾为朱元璋算过命。"
216
+ - "脱脱" ("PERSON"):
217
+ "脱脱,元朝的著名宰相,主张治理黄河,但他的政策间接导致了元朝的灭亡。"
218
+ - "元顺帝" ("PERSON"):
219
+ "元顺帝,元朝的最后一位皇帝,统治时期元朝社会矛盾激化,最终导致了元朝的灭亡。"
220
+ - "刘德" ("PERSON"):
221
+ "刘德,地主,朱元璋早年为其放牛。"
222
+ - "吴老太" ("PERSON"):
223
+ "吴老太,村口的媒人,朱元璋曾希望托她找一个媳妇。"
224
+
225
+ Relationships:
226
+ - "朱元璋" -> "朱五四":
227
+ "朱元璋的父亲,对他的成长和早期生活有重要影响。"
228
+ - "朱元璋" -> "陈氏":
229
+ "朱元璋的母亲,对他的成长和早期生活有重要影响。"
230
+ - "朱元璋" -> "汤和":
231
+ "朱元璋的幼年朋友,后来成为他的战友,在朱元璋的崛起过程中起到了重要作用。"
232
+ - "朱元璋" -> "郭子兴":
233
+ "朱元璋的岳父,是他在起义军中的重要支持者。"
234
+ - "朱元璋" -> "韩山童":
235
+ "朱元璋在起义过程中与韩山童有间接联系,韩山童的起义对朱元璋的崛起有重要影响。"
236
+ - "朱元璋" -> "刘福通":
237
+ "朱元璋在起义过程中与刘福通有间接联系,刘福通的起义对朱元璋的崛起有重要影响。"
238
+ - "朱元璋" -> "元朝":
239
+ "朱元璋最终推翻了元朝的统治,建立了明朝。"
240
+ - "朱元璋" -> "皇觉寺":
241
+ "朱元璋曾经在此当和尚,这段经历对他的成长有重要影响。"
242
+ - "朱元璋" -> "白莲教":
243
+ "朱元璋在起义过程中接触到了白莲教,虽然他本人可能并不信仰,但白莲教的起义对他有重要影响。"
244
+ - "朱元璋" -> "濠州城":
245
+ "朱元璋在濠州城的活动对其早期军事和政治生涯有重要影响。"
246
+ - "朱元璋" -> "定远":
247
+ "朱元璋成功攻克定远,显示了其军事才能。"
248
+ - "朱元璋" -> "钟离":
249
+ "朱元璋的家乡,他在此招收了二十四名壮丁,这些人后来成为明朝的高级干部。"
250
+ - "朱元璋" -> "元末农民起义":
251
+ "朱元璋参与并最终领导了元末农民起义,推翻了元朝的统治。"
252
+ - "朱元璋" -> "马姑娘":
253
+ "朱元璋的妻子,在朱元璋被关押时,她冒着危险送饭给朱元璋,表现出深厚的感情。"
254
+ - "朱元璋" -> "孙德崖":
255
+ "朱元璋在孙德崖与郭子兴的矛盾中起到了调解作用,显示了其政治智慧。"
256
+ - "朱元璋" -> "徐达":
257
+ "朱元璋的二十四名亲信之一,后来成为明朝的重要将领。"
258
+ - "朱元璋" -> "周德兴":
259
+ "朱元璋的二十四名亲信之一,曾为朱元璋算过命。"
260
+ - "朱元璋" -> "脱脱":
261
+ "朱元璋在起义过程中间接受到脱脱政策的影响,脱脱的政策间接导致了元朝的灭亡。"
262
+ - "朱元璋" -> "元顺帝":
263
+ "朱元璋最终推翻了元顺帝的统治,建立了明朝。"
264
+ - "朱元璋" -> "刘德":
265
+ "朱元璋早年为刘德放牛,这段经历对他的成长有重要影响。"
266
+ - "朱元璋" -> "吴老太":
267
+ "朱元璋曾希望托吴老太找一个媳妇,显示了他对家庭的渴望。"
268
+ ```
269
+
270
+ # Self-Refine with DSPy-AI (v2.5.6)
271
+ ## Main Takeaways
272
+ - Time difference: 66.24 seconds
273
+ - Execution time with DSPy-AI: 211.04 seconds
274
+ - Execution time without DSPy-AI: 144.80 seconds
275
+ - Entities extracted: 38 (without DSPy-AI) vs 16 (with DSPy-AI)
276
+ - Relationships extracted: 38 (without DSPy-AI) vs 16 (with DSPy-AI)
nano-graphrag/docs/benchmark-en.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - We use [A Christmas Carol](https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt) by Dickens as the benchmark corpus.
2
+ - We use `61b5eea34783c58074b3c53f1689ad8a5ba6b6ee` commit of [Official GraphRAG implementation](https://github.com/microsoft/graphrag/tree/main)
3
+ - Both GraphRAG and `nano-graphrag` use OpenAI Embedding and `gpt-4o`.
4
+ - No cache for either. Both ran on the same device and network connection.
5
+ - GraphRAG Max Async API request: 25
6
+ - `nano-graphrag` Max Async API request: 16
7
+
8
+ ## Index Benchmark
9
+
10
+ **GraphRAG index time**: more than 5 minutes
11
+
12
+ **`nano-graphrag` index time**: less than 4 minutes
13
+
14
+ ## Local Search Results
15
+
16
+ #### GraphRAG
17
+
18
+ "What are the top themes in this story?"
19
+
20
+ ```markdown
21
+ # Top Themes in the Story
22
+
23
+ The story revolves around several central themes that are intricately woven into the narrative, each contributing to the overall message and character development. Here are the top themes:
24
+
25
+ ## Family and Relationships
26
+
27
+ Family and relationships play a crucial role in the story, particularly through the interactions between Scrooge and his nephew and niece. Scrooge's nephew is depicted as a warm-hearted and persistent character who continually tries to engage Scrooge, especially during the Christmas season. Despite Scrooge's initial disdain for Christmas, his nephew's efforts highlight themes of compassion and familial bonds [Data: Scrooge's Family and Christmas (20); Relationships (149, 152, 154)]. Scrooge's niece also contributes to this theme with her strong opinions and musical talent, which help soften Scrooge's attitude and remind him of his past [Data: Scrooge's Family and Christmas (20); Relationships (154)].
28
+
29
+ ## Transformation and Redemption
30
+
31
+ The theme of transformation and redemption is central to the story, primarily illustrated through Scrooge's character arc. Initially, Scrooge is cynical and dismissive of Christmas and its associated values. However, the recurring event of Christmas serves as a catalyst for his transformation. Through various emotional experiences and reflections prompted by the holiday, Scrooge undergoes a significant change, ultimately embracing generosity and kindness [Data: Scrooge's Family and Christmas (20); Relationships (96, 156, 153, 154)].
32
+
33
+ ## The Spirit of Christmas
34
+
35
+ Christmas itself is a recurring and significant event in the story, symbolizing themes of goodwill, generosity, and reflection. The holiday serves as the backdrop for many of the story's pivotal moments and emotional responses. It is during Christmas that Scrooge's transformation occurs, highlighting the holiday's themes of kindness and community. The discussions and celebrations around Christmas emphasize its importance in fostering a sense of togetherness and compassion [Data: Scrooge's Family and Christmas (20); Relationships (96, 149, 153, 154, 156)].
36
+
37
+ ## Social Commentary
38
+
39
+ The story also provides a social commentary on the issues of poverty and social inequality. Characters like Ignorance and Want symbolize the social issues of the time, revealed by the Ghost of Christmas Present. Scrooge's initial attitude towards the poor and his interactions with the gentlemen seeking charity highlight the harsh realities of social neglect and the need for compassion and support for the less fortunate [Data: Entities (40); Sources (4)].
40
+
41
+ ## The Role of Memory and Reflection
42
+
43
+ Memory and reflection are significant themes that contribute to Scrooge's transformation. The story often delves into Scrooge's past, evoking memories that help him reflect on his life choices and their impact on his current state. These reflections are crucial in prompting Scrooge to change his ways and embrace a more generous and compassionate outlook [Data: Scrooge's Family and Christmas (20); Relationships (96, 156)].
44
+
45
+ ## Conclusion
46
+
47
+ In summary, the top themes in the story include family and relationships, transformation and redemption, the spirit of Christmas, social commentary, and the role of memory and reflection. Each of these themes is intricately connected to the characters and events, contributing to the story's rich narrative and moral lessons.
48
+ ```
49
+
50
+ ### `nano-graphrag`
51
+
52
+ "What are the top themes in this story? "
53
+
54
+ ```markdown
55
+ Sure, let’s explore the top themes in "A Christmas Carol" based on the data provided.
56
+
57
+ ### Transformation and Redemption
58
+
59
+ The central theme of "A Christmas Carol" revolves around the profound transformation and redemption of Ebenezer Scrooge. Initially depicted as a miserly and solitary man who disdains Christmas and human connection, Scrooge undergoes a radical change after a series of supernatural visits. These visitations force him to reflect on his past, recognize the suffering of others in the present, and confront the grim future that awaits him if he remains unchanged [Data: Reports (0); Entities (1); Relationships (0, 3, 5, 14, 45, +more)].
60
+
61
+ ### Influence of Supernatural Entities
62
+
63
+ Supernatural elements are crucial in facilitating Scrooge's transformation. The Ghosts of Christmas Past, Present, and Yet to Come each play a significant role by exposing Scrooge to various scenes, prompting deep introspection and emotional responses. Jacob Marley's ghost, in particular, acts as the initial catalyst, warning Scrooge of the consequences of his miserly ways and setting the stage for the more impactful supernatural encounters that follow [Data: Reports (0); Entities (17, 18, 46); Relationships (29, 30, 46, 55, 114, +more)].
64
+
65
+ ### Social Critique
66
+
67
+ Dickens uses the story to critique the social conditions of his time, emphasizing themes of poverty and social neglect. Characters like Bob Cratchit and his family highlight the struggles of the working class, while the conceptual children, Ignorance and Want, presented by the Ghost of Christmas Present, illustrate the dire consequences of societal indifference. This critique encourages readers to reflect on their own social responsibilities and the broader societal need for compassion and generosity [Data: Reports (0); Entities (57, 107, 141); Relationships (45, 102, 159, +more)].
68
+
69
+ ### Family and Community
70
+
71
+ Another prominent theme is the importance of family and community. Scrooge's interactions with Fred, his cheerful and persistent nephew, highlight the warmth and joy that family connections can bring. Fred's continued efforts to include Scrooge in family celebrations play a pivotal role in rekindling their relationship. Likewise, the Cratchit family, despite their poverty, exemplifies love, togetherness, and the true spirit of Christmas, all of which contribute to Scrooge's awakening and ultimate redemption [Data: Reports (0); Entities (14, 44); Relationships (28, 39, 40, 86, 153, +more)].
72
+
73
+ ### Generosity and Charity
74
+
75
+ The theme of generosity and charity is woven throughout the narrative. Scrooge's eventual acts of kindness, such as his improved treatment of Bob Cratchit and his charitable gestures towards the end of the story, symbolize the transformative power of giving. This newly embraced generosity contrasts sharply with his former frugality and serves as a testament to the positive changes that have taken place within him [Data: Reports (0); Entities (6, 43); Relationships (7, 8, 84, 127, +more)].
76
+
77
+ ### Conclusion
78
+
79
+ In summary, "A Christmas Carol" is rich with themes that explore personal transformation, social responsibility, family, and the spirit of generosity. These themes are intricately linked through the narrative structure and character interactions, making the story a timeless reflection on the human condition and the potential for redemption and change [Data: Reports (0); Entities (1, 17, 18); Relationships (0, 14, 31, 45, 48, +more)].
80
+
81
+ Feel free to ask if you need more detailed information on any specific aspect of these themes!
82
+ ```
83
+
84
+
85
+
86
+ ## Global Search Results
87
+
88
+ ### GraphRAG
89
+
90
+ "What are the top themes in this story? "
91
+
92
+ ```markdown
93
+ #### Transformation
94
+ The central theme of the story is the transformation of Ebenezer Scrooge. Initially depicted as a miserly and unsympathetic individual, Scrooge undergoes a profound change to become a generous and caring person. This transformation is driven by his interactions with various supernatural entities, including Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come [Data: Reports (18, 17, 27, 21, 6, 26)].
95
+
96
+ #### Familial Love and Support
97
+ The theme of familial love and support is prominently depicted through the Cratchit Family. Despite their financial struggles, the Cratchit family remains loving and supportive of each other. Key relationships include Bob Cratchit's deep affection for his son Tiny Tim and the family's collective grief over Tiny Tim's potential death [Data: Reports (20, 19)].
98
+
99
+ #### Generosity and Compassion
100
+ Generosity and compassion are illustrated through characters like Fezziwig and the transformed Scrooge. Fezziwig's joyful Christmas parties and Scrooge's eventual acts of kindness towards the Cratchit family and others highlight the positive impact of generosity on the community [Data: Reports (23, 19, 26)].
101
+
102
+ #### Social Responsibility
103
+ The theme of social responsibility is underscored by the allegorical figures of Ignorance and Want, as well as the broader societal issues highlighted by the Ghost of Christmas Present. These elements emphasize the importance of addressing poverty and the consequences of neglecting the less fortunate [Data: Reports (17, 21)].
104
+
105
+ #### The Christmas Spirit
106
+ The Christmas spirit, encompassing joy, celebration, and togetherness, is a recurring motif throughout the story. Various scenes, such as the Cratchit family's Christmas dinner, Fred's Christmas party, and the celebrations shown by the Ghost of Christmas Present, reinforce the importance of embracing the festive spirit and its associated values [Data: Reports (20, 18, 23)].
107
+
108
+ #### Impact of Past Relationships
109
+ Scrooge's past relationships, particularly with 'The Girl' (later 'The Matron') and Belle, highlight themes of lost opportunities and the contrast between his lonely existence and the warmth of family life [Data: Reports (2)].
110
+
111
+ #### Role of Supernatural Elements
112
+ Supernatural elements, including Marley's Ghost and the three spirits, play a crucial role in Scrooge's transformation. These entities guide Scrooge through various significant scenes, influencing his reflections and realizations [Data: Reports (26)].
113
+
114
+ #### Family and Love
115
+ The relationships within the community, such as those between Mr. and Mrs. Fezziwig and Belle's family, underscore the importance of family and love. These relationships serve as a reflection on what Scrooge has missed out on due to his choices [Data: Reports (2, 25)].
116
+
117
+ These themes collectively highlight the story's emphasis on personal growth, the importance of community and family, and the enduring values associated with the Christmas season.
118
+ ```
119
+
120
+
121
+
122
+ ### `nano-graphrag`
123
+
124
+ "What are the top themes in this story? "
125
+
126
+ ```markdown
127
+ ### Key Themes in the Story
128
+
129
+ #### 1. Transformation and Redemption
130
+ The central theme of the story revolves around Ebenezer Scrooge's profound transformation from a miserly, cold-hearted man to a generous and kind individual. This journey of change emphasizes the possibility of redemption and the capacity for personal change. The transformation is predominantly driven by Scrooge's interactions with various supernatural entities such as the Ghosts of Christmas Past, Present, and Yet to Come [Data: Reports (0, 1, 2, 3, 4, 5, 6, 12, 23, 14, +more)].
131
+
132
+ #### 2. The Role of Supernatural Entities
133
+ Supernatural spirits play critical roles in guiding Scrooge toward his transformation. The Ghosts of Christmas Past, Present, and Yet to Come evoke reflection, regret, and fear in Scrooge, highlighting the impact of spiritual guidance on personal redemption. These encounters force him to confront and reflect on his past actions and their consequences [Data: Reports (1, 2, 3, 12, 0)].
134
+
135
+ #### 3. Generosity and Community Support
136
+ Another significant theme is the spirit of generosity and community support. Initially depicted as a miser, Scrooge's eventual acts of generosity and the festive spirit among various characters underscore the importance of kindness and community. The Cratchit family's modest yet joyous Christmas celebration poignantly highlights this theme [Data: Reports (26, 6, 2, 14)].
137
+
138
+ #### 4. Family and Emotional Bonds
139
+ The story places significant emphasis on family and emotional connections. This theme is illustrated through the relationships between Scrooge and his family, such as the close sibling bond with Fan and the dynamics within the Cratchit family. These relationships and their development underscore the importance of familial love and support [Data: Reports (0, 2, 4, 17, 5, 15)].
140
+
141
+ #### 5. Social Justice and Economic Disparity
142
+ The narrative also addresses themes of social justice and economic disparity. Scrooge's initial indifference to others' struggles and the contrasting lifestyles of characters like Bob Cratchit highlight the societal inequalities of the time. This critique is further exemplified through the characters’ interactions and the broader social critique in the story [Data: Reports (0, 1, 24, 9, 25, +more)].
143
+
144
+ #### 6. Role of Memory and the Past
145
+ Closely tied to Scrooge’s transformation is the theme of memory and the past. The Ghosts of Christmas Past and Marley’s Ghost prompt Scrooge to reflect on his own life, evoking memories that significantly impact his journey toward becoming a better man. These reflections play a crucial role in his redemption [Data: Reports (3, 8, 13, 17, 6, +more)].
146
+
147
+ ### Conclusion
148
+ These themes collectively create a rich and multifaceted narrative that conveys powerful messages about personal growth, the impact of community, the value of generosity, and the critique of social inequities. The journey of Ebenezer Scrooge serves as a timeless reminder of the potential for change and the importance of compassion and empathy in society.
149
+ ```
150
+
nano-graphrag/docs/benchmark-zh.md ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - We use [三国演义](https://github.com/tennessine/corpus/blob/master/%E4%B8%89%E5%9B%BD%E6%BC%94%E4%B9%89.txt) by 罗贯中 as the benchmark corpus.
2
+ - We use `61b5eea34783c58074b3c53f1689ad8a5ba6b6ee` commit of [Official GraphRAG implementation](https://github.com/microsoft/graphrag/tree/main)
3
+ - Both GraphRAG and `nano-graphrag` use OpenAI Embedding and `gpt-4o`.
4
+ - Not Cache for both.
5
+ - On the same device and network connection.
6
+ - GraphRAG Max Async API request: 25
7
+ - `nano-graphrag` Max Async API request: 16
8
+
9
+ ## Index Benchmark
10
+
11
+ Both are slow, taking more than 10 minutes.
12
+
13
+ ## Local Search Results
14
+
15
+ ### GraphRAG
16
+
17
+ "大乔和曹操的关系"
18
+
19
+ ```markdown
20
+ 大乔与曹操之间的关系在历史和文学中被描绘为复杂且多维的,尽管直接的互动可能并不频繁。大乔,作为东吴孙策的妻子,以及其美貌和与三国时期重要人物的关联而闻名。曹操,作为三国时期的关键政治和军事领导者,其与大乔的关系主要通过他与东吴的关系以及对大乔及其家族的兴趣来体现。
21
+
22
+ 首先,大乔的美貌在三国时期广为人知,甚至引起了曹操的注意。据记载,曹操曾有意俘获大乔以供己用,这一点体现了曹操对大乔的个人兴趣,尽管这种兴趣可能更多地是基于她的美貌和象征意义,而非两人之间有实质性的个人关系 [Data: Entities (266)]。
23
+
24
+ 其次,大乔作为孙策的妻子,自然成为了孙家与东吴势力中的一员。孙策和孙权是曹操在政治和军事上的对手,这使得大乔与曹操之间存在着间接的敌对关系。曹操对东吴的军事行动,包括对孙权的攻击,间接地影响到了大乔及其家族的安全和利益 [Data: Entities (266)]。
25
+
26
+ 最后,尽管直接的文献记录可能不多,大乔的地位和她所处的时代背景使她与曹操之间的关系充满了历史和文化上的意义。她的故事和曹操的野心相交织,展现了三国时期女性角色与政治军事活动之间的复杂联系。
27
+
28
+ 综上所述,大乔与曹操之间的关系主要是通过她的家庭背景、曹操的政治军事活动,以及文化记忆中对她美貌的追捧来定义的。这种关系更多地体现在历史和文学的叙述中,而不是基于两人之间有直接的个人互动。
29
+ ```
30
+
31
+ ### `nano-graphrag`
32
+
33
+ "大乔和曹操的关系"
34
+
35
+ ```markdown
36
+ # 大乔和曹操的关系
37
+ ## 大乔的背景
38
+ 大乔(Da Qiao)是江东乔公的长女,以其极美的容貌闻名于世。她最为著名的身份是孙策(Sun Ce)的妻子,这一身份在三国时期的政治和军事格局中也占有重要地位。周瑜特别提到她,显示她在周瑜与曹操局势中的重要性 [Data: Entities (1)]。
39
+ ## 曹操的欲望
40
+ 曹操(Cao Cao)对大乔的渴望主要源自其好色本性。他希望得到大乔和她的妹妹小乔,以满足个人欲望,并且认为获得她们可以帮助他提高政治地位和享乐。曹操曾发誓希望能得到江东二乔(大乔和小乔),并将她们置于铜雀台上,以乐晚年 [Data: Relationships (6, 7); Entities (1, 2)]。
41
+ ## 谋略与政治
42
+ 在三国时期的权谋斗争中,诸葛亮曾提议使用美人计,将大乔和小乔送给曹操,以此来使曹操班师回朝,减少对东吴的威胁。这一提议显示出大乔在权谋中的重要角色以及她可能对大局产生的影响 [Data: Sources (0); Relationships (12)]。
43
+ ## 曹操的策略和行动
44
+ 尽管曹操对大乔有很大的兴趣,但实际行动中的证据有限。周瑜得知曹操对大乔的意图后,表达了强烈的反对,并与诸葛亮商议对策,以确保她和小乔的安全 [Data: Sources (2)]。周瑜明确表示大乔是孙策的正妻,而小乔是他的妻子,拒绝将她们送给曹操。
45
+ ## 总结
46
+ 总体而言,曹操和大乔没有直接的互动记录,但曹操对大乔的欲望及其可能带来的政治影响却是不容忽视的。大乔作为江东乔公的长女,其美貌和身份使她成为多方势力争夺的对象,但她与曹操的关系主要体现在曹操的欲望和诸葛亮、周瑜等人的权谋计划中。
47
+ 曹操对大乔的欲望和权谋体现了三国时期复杂的政治动态,同时也揭示了美人计在当时战略中的重要性 [Data: Relationships (6, 12, 13); Entities (1); Sources (0, 2)].
48
+ ```
49
+
50
+
51
+
52
+ ## Global Search Results
53
+
54
+ ### GraphRAG
55
+
56
+ "大乔与曹操的关系"
57
+
58
+ ```markdown
59
+ ### 大乔与曹操的关系
60
+
61
+ 大乔(Da Qiao)是乔公的女儿,以其非凡的美貌而闻名。这一特质使她在社区中成为一个重要人物,尤其是在曹操(Cao Cao)对她的渴望背景下 [Data: Reports (41)]。
62
+
63
+ #### 曹操的计划
64
+
65
+ 曹操计划在统一江南后,将大乔安置在铜雀台(Tongque Tai)。铜雀台是曹操在叶县张河上建造的一座宏伟平台,象征着好运和庆祝。曹操打算将包括大乔在内的美丽女子安置在铜雀台中 [Data: Reports (269)]。这一计划不仅反映了曹操的野心和愿景,也显示了他对大乔的特别关注。
66
+
67
+ #### 乔公与曹操的关系
68
+
69
+ 乔公是大乔的父亲,同时也是曹操的朋友。这层关系不仅突显了乔公的影响力,还暗示了由于曹操对大乔的渴望,可能会产生潜在的杠杆作用或冲突 [Data: Reports (41)]。
70
+
71
+ ### 结论
72
+
73
+ 综上所述,大乔与曹操的关系主要体现在曹操对她的渴望以及他计划将她安置在铜雀台的意图上。此外,乔公与曹操的友谊也为这一关系增添了复杂性,可能会在未来引发更多的动态变化 [Data: Reports (41, 269)]。
74
+ ```
75
+
76
+
77
+
78
+ ### `nano-graphrag`
79
+
80
+ "大乔与曹操的关系"
81
+
82
+ ```markdown
83
+ ## 曹操与大乔的关系
84
+
85
+ 曹操对大乔展现了显著的政治和战略兴趣,这部分是由于大乔的美貌以及她在战略上的重要性。在那个时期,美丽并非单纯的外貌特征,还往往与政治资源和战略权力相关联。曹操认为拥有大乔会对他的政治野心有所帮助,并为其权势扩大增添筹码 [Data: Reports (6)]。
86
+
87
+ 大乔作为孙策的妻子,再加上她的地位和美貌,使得她在权力动态中成为一个关键的目标。曹操对大乔的重视,不仅因为她个人的吸引力,更因为她所代表的权力象征。如果能将大乔纳入自己的势力范围,曹操将可能通过这种方式削弱敌对势力,同时增强自己的影响力 [Data: Reports (6)]。
88
+
89
+ 总之,曹操和大乔的关系不仅仅停留在个人层面,更深层次的是政治策略和权力博弈的体现。曹操对她的兴趣,反映了他在权力斗争中的精明与野心。
90
+ ```
91
+
nano-graphrag/docs/use_neo4j_for_graphrag.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1. Install [Neo4j](https://neo4j.com/docs/operations-manual/current/installation/) (please use 5.x version)
2
+ 2. Install Neo4j GDS (graph data science) [plugin](https://neo4j.com/docs/graph-data-science/current/installation/neo4j-server/)
3
+ 3. Start neo4j server
4
+ 4. Get the `NEO4J_URL`, `NEO4J_USER` and `NEO4J_PASSWORD`
5
+ - By default, `NEO4J_URL` is `neo4j://localhost:7687` , `NEO4J_USER` is `neo4j` and `NEO4J_PASSWORD` is `neo4j`
6
+
7
+ Pass your neo4j instance to `GraphRAG`:
8
+
9
+ ```python
10
+ from nano_graphrag import GraphRAG
11
+ from nano_graphrag._storage import Neo4jStorage
12
+
13
+ neo4j_config = {
14
+ "neo4j_url": os.environ.get("NEO4J_URL", "neo4j://localhost:7687"),
15
+ "neo4j_auth": (
16
+ os.environ.get("NEO4J_USER", "neo4j"),
17
+ os.environ.get("NEO4J_PASSWORD", "neo4j"),
18
+ )
19
+ }
20
+ GraphRAG(
21
+ graph_storage_cls=Neo4jStorage,
22
+ addon_params=neo4j_config,
23
+ )
24
+ ```
25
+
26
+
27
+
nano-graphrag/examples/benchmarks/dspy_entity.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dspy
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from openai import AsyncOpenAI
5
+ import logging
6
+ import asyncio
7
+ import time
8
+ import shutil
9
+ from nano_graphrag.entity_extraction.extract import extract_entities_dspy
10
+ from nano_graphrag.base import BaseKVStorage
11
+ from nano_graphrag._storage import NetworkXStorage
12
+ from nano_graphrag._utils import compute_mdhash_id, compute_args_hash
13
+ from nano_graphrag._op import extract_entities
14
+
15
# Cache directory for benchmark artifacts (one sub-directory per use_dspy setting).
WORKING_DIR = "./nano_graphrag_cache_dspy_entity"

# Load environment variables (e.g. DEEPSEEK_API_KEY, DEEPSEEK_BASE_URL) from a local .env file.
load_dotenv()

# Enable verbose library logging so extraction progress is visible during the benchmark.
logger = logging.getLogger("nano-graphrag")
logger.setLevel(logging.DEBUG)
21
+
22
+
23
async def deepseepk_model_if_cache(
    prompt: str,
    model: str = "deepseek-chat",
    system_prompt: str = None,
    history_messages: list = None,
    **kwargs,
) -> str:
    """Query the DeepSeek chat API, consulting an optional KV cache first.

    Args:
        prompt: The user prompt for this turn.
        model: DeepSeek model name.
        system_prompt: Optional system message prepended to the conversation.
        history_messages: Prior conversation turns (list of message dicts).
        **kwargs: Extra options forwarded to the OpenAI client; the special
            key ``hashing_kv`` (a ``BaseKVStorage``) is popped off and used
            as a response cache.

    Returns:
        The assistant's reply text.
    """
    # Fix: the original used a mutable default (``history_messages=[]``),
    # which is shared across calls; use a None sentinel instead.
    history_messages = history_messages or []

    openai_async_client = AsyncOpenAI(
        api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com"
    )
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})

    # Return the cached response if this exact request was seen before.
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
    if hashing_kv is not None:
        args_hash = compute_args_hash(model, messages)
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]

    response = await openai_async_client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )

    # Store the fresh response so identical future requests hit the cache.
    if hashing_kv is not None:
        await hashing_kv.upsert(
            {args_hash: {"return": response.choices[0].message.content, "model": model}}
        )
    return response.choices[0].message.content
55
+
56
+
57
async def benchmark_entity_extraction(text: str, system_prompt: str, use_dspy: bool = False):
    """Time a single entity-extraction run over *text*.

    Uses the DSPy-based extractor when *use_dspy* is True, otherwise the
    prompt-based extractor. Returns ``(graph_storage, elapsed_seconds)``.
    """
    run_dir = os.path.join(WORKING_DIR, f"use_dspy={use_dspy}")
    if os.path.exists(run_dir):
        shutil.rmtree(run_dir)

    started = time.time()

    def _model_func(*args, **kwargs):
        # Both the "cheap" and "best" model slots use the same DeepSeek caller,
        # with the benchmark's system prompt bound in.
        return deepseepk_model_if_cache(*args, system_prompt=system_prompt, **kwargs)

    storage = NetworkXStorage(
        namespace="test",
        global_config={
            "working_dir": run_dir,
            "entity_summary_to_max_tokens": 500,
            "cheap_model_func": _model_func,
            "best_model_func": _model_func,
            "cheap_model_max_token_size": 4096,
            "best_model_max_token_size": 4096,
            "tiktoken_model_name": "gpt-4o",
            "hashing_kv": BaseKVStorage(namespace="test", global_config={"working_dir": run_dir}),
            "entity_extract_max_gleaning": 1,
            "entity_extract_max_tokens": 4096,
            "entity_extract_max_entities": 100,
            "entity_extract_max_relationships": 100,
        },
    )
    chunks = {compute_mdhash_id(text, prefix="chunk-"): {"content": text}}

    extractor = extract_entities_dspy if use_dspy else extract_entities
    storage = await extractor(chunks, storage, None, storage.global_config)

    return storage, time.time() - started
88
+
89
+
90
def print_extraction_results(graph_storage: NetworkXStorage):
    """Pretty-print every extracted entity (node) and relationship (edge)."""
    print("\nEntities:")
    entity_lines = [
        f"- {name} ({attrs.get('entity_type', 'Unknown')}):\n {attrs.get('description', 'No description')}"
        for name, attrs in graph_storage._graph.nodes(data=True)
    ]
    print("\n".join(entity_lines))

    print("\nRelationships:")
    edge_lines = [
        f"- {src} -> {dst}:\n {attrs.get('description', 'No description')}"
        for src, dst, attrs in graph_storage._graph.edges(data=True)
    ]
    print("\n".join(edge_lines))
105
+
106
+
107
async def run_benchmark(text: str):
    """Run entity extraction with and without DSPy and report a comparison."""
    print("\nRunning benchmark with DSPy-AI:")
    base_prompt = """
    You are an expert system specialized in entity and relationship extraction from complex texts.
    Your task is to thoroughly analyze the given text and extract all relevant entities and their relationships with utmost precision and completeness.
    """
    # A timestamp is appended so every run produces a distinct prompt and
    # therefore bypasses any response cache keyed on the prompt.
    dspy_prompt = f"{base_prompt} Time: {time.time()}."
    deepseek_lm = dspy.LM(
        model="deepseek/deepseek-chat",
        model_type="chat",
        api_provider="openai",
        api_key=os.environ["DEEPSEEK_API_KEY"],
        base_url=os.environ["DEEPSEEK_BASE_URL"],
        system_prompt=base_prompt,
        temperature=1.0,
        max_tokens=8192,
    )
    dspy.settings.configure(lm=deepseek_lm, experimental=True)
    dspy_storage, dspy_seconds = await benchmark_entity_extraction(text, dspy_prompt, use_dspy=True)
    print(f"Execution time with DSPy-AI: {dspy_seconds:.2f} seconds")
    print_extraction_results(dspy_storage)

    print("Running benchmark without DSPy-AI:")
    plain_prompt = f"{base_prompt} Time: {time.time()}."
    plain_storage, plain_seconds = await benchmark_entity_extraction(text, plain_prompt, use_dspy=False)
    print(f"Execution time without DSPy-AI: {plain_seconds:.2f} seconds")
    print_extraction_results(plain_storage)

    print("\nComparison:")
    print(f"Time difference: {abs(dspy_seconds - plain_seconds):.2f} seconds")
    print(f"DSPy-AI is {'faster' if dspy_seconds < plain_seconds else 'slower'}")

    plain_graph = plain_storage._graph
    dspy_graph = dspy_storage._graph
    print(
        f"Entities extracted: {len(plain_graph.nodes())} (without DSPy-AI) "
        f"vs {len(dspy_graph.nodes())} (with DSPy-AI)"
    )
    print(
        f"Relationships extracted: {len(plain_graph.edges())} (without DSPy-AI) "
        f"vs {len(dspy_graph.edges())} (with DSPy-AI)"
    )
146
+
147
+
148
+ if __name__ == "__main__":
149
+ with open("./tests/zhuyuanzhang.txt", encoding="utf-8-sig") as f:
150
+ text = f.read()
151
+
152
+ asyncio.run(run_benchmark(text=text))
nano-graphrag/examples/benchmarks/eval_naive_graphrag_on_multi_hop.ipynb ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "In this tutorial, we are going to evaluate the performance of the naive RAG and the GraphRAG algorithm on a [multi-hop RAG task](https://github.com/yixuantt/MultiHop-RAG)."
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "## Setup\n",
15
+ "Make sure you install the necessary dependencies by running the following commands:"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": null,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "!pip install ragas nest_asyncio datasets"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "markdown",
29
+ "metadata": {},
30
+ "source": [
31
+ "Import the necessary libraries, and set up your openai api key if needed:"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 21,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "import os\n",
41
+ "# os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY\"\n",
42
+ "import json\n",
43
+ "import sys\n",
44
+ "sys.path.append(\"../..\")\n",
45
+ "\n",
46
+ "import nest_asyncio\n",
47
+ "nest_asyncio.apply()\n",
48
+ "import logging\n",
49
+ "\n",
50
+ "logging.basicConfig(level=logging.WARNING)\n",
51
+ "logging.getLogger(\"nano-graphrag\").setLevel(logging.INFO)\n",
52
+ "from nano_graphrag import GraphRAG, QueryParam\n",
53
+ "from datasets import Dataset \n",
54
+ "from ragas import evaluate\n",
55
+ "from ragas.metrics import (\n",
56
+ " answer_correctness,\n",
57
+ " answer_similarity,\n",
58
+ ")"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "markdown",
63
+ "metadata": {},
64
+ "source": [
65
+ "Download the dataset from [Github Repo](https://github.com/yixuantt/MultiHop-RAG/tree/main/dataset). \n",
66
+ "If should contain two files:\n",
67
+ "- `MultiHopRAG.json`\n",
68
+ "- `corpus.json`\n",
69
+ "\n",
70
+ "After downloading the dataset, replace the below paths to the paths on your machine."
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 3,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "\n",
80
+ "multi_hop_rag_file = \"./fixtures/MultiHopRAG.json\"\n",
81
+ "multi_hop_corpus_file = \"./fixtures/corpus.json\""
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "metadata": {},
87
+ "source": [
88
+ "## Preprocess"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 4,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "\n",
98
+ "with open(multi_hop_rag_file) as f:\n",
99
+ " multi_hop_rag_dataset = json.load(f)\n",
100
+ "with open(multi_hop_corpus_file) as f:\n",
101
+ " multi_hop_corpus = json.load(f)\n",
102
+ "\n",
103
+ "corups_url_refernces = {}\n",
104
+ "for cor in multi_hop_corpus:\n",
105
+ " corups_url_refernces[cor['url']] = cor"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "markdown",
110
+ "metadata": {},
111
+ "source": [
112
+ "We only use the top-100 queries for evaluation."
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 5,
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "Queries have types: {'inference_query', 'comparison_query', 'null_query', 'temporal_query'}\n",
125
+ "We will need 139 articles:\n",
126
+ "## ASX set to drop as Wall Street’s September slump deepens\n",
127
+ "Author: Stan Choe, The Sydney Morning Herald\n",
128
+ "Category: business\n",
129
+ "Publised: 2023-09-26T19:11:30+00:00\n",
130
+ "ETF provider Betashares, which manages $ ...\n"
131
+ ]
132
+ }
133
+ ],
134
+ "source": [
135
+ "multi_hop_rag_dataset = multi_hop_rag_dataset[:100]\n",
136
+ "print(\"Queries have types:\", set([q['question_type'] for q in multi_hop_rag_dataset]))\n",
137
+ "total_urls = set()\n",
138
+ "for q in multi_hop_rag_dataset:\n",
139
+ " total_urls.update([up['url'] for up in q['evidence_list']])\n",
140
+ "corups_url_refernces = {k:v for k, v in corups_url_refernces.items() if k in total_urls}\n",
141
+ "\n",
142
+ "total_corups = [f\"## {cor['title']}\\nAuthor: {cor['author']}, {cor['source']}\\nCategory: {cor['category']}\\nPublised: {cor['published_at']}\\n{cor['body']}\" for cor in corups_url_refernces.values()]\n",
143
+ "\n",
144
+ "print(f\"We will need {len(total_corups)} articles:\")\n",
145
+ "print(total_corups[0][:200], \"...\")"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "markdown",
150
+ "metadata": {},
151
+ "source": [
152
+ "Add index for the `total_corups` using naive RAG and GraphRAG"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 6,
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "name": "stderr",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "INFO:nano-graphrag:Load KV full_docs with 139 data\n",
165
+ "INFO:nano-graphrag:Load KV text_chunks with 408 data\n",
166
+ "INFO:nano-graphrag:Load KV llm_response_cache with 1634 data\n",
167
+ "INFO:nano-graphrag:Load KV community_reports with 794 data\n",
168
+ "INFO:nano-graphrag:Loaded graph from nano_graphrag_cache_multi_hop_rag_test/graph_chunk_entity_relation.graphml with 6181 nodes, 5423 edges\n",
169
+ "WARNING:nano-graphrag:All docs are already in the storage\n",
170
+ "INFO:nano-graphrag:Writing graph with 6181 nodes, 5423 edges\n"
171
+ ]
172
+ }
173
+ ],
174
+ "source": [
175
+ "# First time indexing will cost many time, roughly 15~20 minutes\n",
176
+ "graphrag_func = GraphRAG(working_dir=\"nano_graphrag_cache_multi_hop_rag_test\", enable_naive_rag=True,\n",
177
+ " embedding_func_max_async=4)\n",
178
+ "graphrag_func.insert(total_corups)"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "markdown",
183
+ "metadata": {},
184
+ "source": [
185
+ "Look at the response of different RAG methods on the first query:"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 24,
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "response_formate = \"Single phrase or sentence, concise and no redundant explanation needed. If you don't have the answer in context, Just response 'Insufficient information'\"\n",
195
+ "naive_rag_query_param = QueryParam(mode='naive', response_type=response_formate)\n",
196
+ "naive_rag_query_only_context_param = QueryParam(mode='naive', only_need_context=True)\n",
197
+ "local_graphrag_query_param = QueryParam(mode='local', response_type=response_formate)\n",
198
+ "local_graphrag_only_context__param = QueryParam(mode='local', only_need_context=True)"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 8,
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "name": "stdout",
208
+ "output_type": "stream",
209
+ "text": [
210
+ "Question: Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?\n",
211
+ "GroundTruth Answer: Sam Bankman-Fried\n"
212
+ ]
213
+ }
214
+ ],
215
+ "source": [
216
+ "query = multi_hop_rag_dataset[0]\n",
217
+ "print(\"Question:\", query['query'])\n",
218
+ "print(\"GroundTruth Answer:\", query['answer'])"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 9,
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "name": "stderr",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "INFO:nano-graphrag:Truncate 20 to 12 chunks\n"
231
+ ]
232
+ },
233
+ {
234
+ "name": "stdout",
235
+ "output_type": "stream",
236
+ "text": [
237
+ "NaiveRAG Answer: Sam Bankman-Fried\n"
238
+ ]
239
+ }
240
+ ],
241
+ "source": [
242
+ "print(\"NaiveRAG Answer:\", graphrag_func.query(query['query'], param=naive_rag_query_param))"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 10,
248
+ "metadata": {},
249
+ "outputs": [
250
+ {
251
+ "name": "stderr",
252
+ "output_type": "stream",
253
+ "text": [
254
+ "INFO:nano-graphrag:Using 20 entites, 3 communities, 124 relations, 3 text units\n"
255
+ ]
256
+ },
257
+ {
258
+ "name": "stdout",
259
+ "output_type": "stream",
260
+ "text": [
261
+ "Local GraphRAG Answer: Sam Bankman-Fried\n"
262
+ ]
263
+ }
264
+ ],
265
+ "source": [
266
+ "print(\"Local GraphRAG Answer:\", graphrag_func.query(query['query'], param=local_graphrag_query_param))"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "markdown",
271
+ "metadata": {},
272
+ "source": [
273
+ "Great! Now we're ready to evaluate more detailed metrics. We will use [ragas](https://docs.ragas.io/en/stable/) to evalue the answers' quality."
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 11,
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "questions = [q['query'] for q in multi_hop_rag_dataset]\n",
283
+ "labels = [q['answer'] for q in multi_hop_rag_dataset]"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": 12,
289
+ "metadata": {},
290
+ "outputs": [
291
+ {
292
+ "name": "stderr",
293
+ "output_type": "stream",
294
+ "text": [
295
+ " 0%| | 0/100 [00:00<?, ?it/s]"
296
+ ]
297
+ },
298
+ {
299
+ "name": "stderr",
300
+ "output_type": "stream",
301
+ "text": [
302
+ "100%|██████████| 100/100 [03:53<00:00, 2.33s/it]\n"
303
+ ]
304
+ }
305
+ ],
306
+ "source": [
307
+ "from tqdm import tqdm\n",
308
+ "logging.getLogger(\"nano-graphrag\").setLevel(logging.WARNING)\n",
309
+ "\n",
310
+ "naive_rag_answers = [\n",
311
+ " graphrag_func.query(q, param=naive_rag_query_param) for q in tqdm(questions)\n",
312
+ "]"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 14,
318
+ "metadata": {},
319
+ "outputs": [
320
+ {
321
+ "name": "stderr",
322
+ "output_type": "stream",
323
+ "text": [
324
+ "100%|██████████| 100/100 [09:10<00:00, 5.50s/it]\n"
325
+ ]
326
+ }
327
+ ],
328
+ "source": [
329
+ "local_graphrag_answers = [\n",
330
+ " graphrag_func.query(q, param=local_graphrag_query_param) for q in tqdm(questions)\n",
331
+ "]"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": 34,
337
+ "metadata": {},
338
+ "outputs": [
339
+ {
340
+ "name": "stderr",
341
+ "output_type": "stream",
342
+ "text": [
343
+ " 70%|███████ | 70/100 [04:25<01:53, 3.79s/it]8, 6.38it/s]\n",
344
+ "Evaluating: 100%|██████████| 200/200 [00:32<00:00, 6.19it/s]\n"
345
+ ]
346
+ }
347
+ ],
348
+ "source": [
349
+ "naive_results = evaluate(\n",
350
+ " Dataset.from_dict({\n",
351
+ " \"question\": questions,\n",
352
+ " \"ground_truth\": labels,\n",
353
+ " \"answer\": naive_rag_answers,\n",
354
+ " }),\n",
355
+ " metrics=[\n",
356
+ " # answer_relevancy,\n",
357
+ " answer_correctness,\n",
358
+ " answer_similarity,\n",
359
+ " ],\n",
360
+ ")"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 36,
366
+ "metadata": {},
367
+ "outputs": [
368
+ {
369
+ "name": "stderr",
370
+ "output_type": "stream",
371
+ "text": [
372
+ "Evaluating: 100%|██████████| 200/200 [00:23<00:00, 8.59it/s]\n"
373
+ ]
374
+ }
375
+ ],
376
+ "source": [
377
+ "local_graphrag_results = evaluate(\n",
378
+ " Dataset.from_dict({\n",
379
+ " \"question\": questions,\n",
380
+ " \"ground_truth\": labels,\n",
381
+ " \"answer\": local_graphrag_answers,\n",
382
+ " }),\n",
383
+ " metrics=[\n",
384
+ " # answer_relevancy,\n",
385
+ " answer_correctness,\n",
386
+ " answer_similarity,\n",
387
+ " ],\n",
388
+ ")"
389
+ ]
390
+ },
391
+ {
392
+ "cell_type": "code",
393
+ "execution_count": 39,
394
+ "metadata": {},
395
+ "outputs": [
396
+ {
397
+ "name": "stdout",
398
+ "output_type": "stream",
399
+ "text": [
400
+ "Naive RAG results {'answer_correctness': 0.5896, 'answer_similarity': 0.8935}\n",
401
+ "Local GraphRAG results {'answer_correctness': 0.7380, 'answer_similarity': 0.8619}\n"
402
+ ]
403
+ }
404
+ ],
405
+ "source": [
406
+ "print(\"Naive RAG results\", naive_results)\n",
407
+ "print(\"Local GraphRAG results\", local_graphrag_results)"
408
+ ]
409
+ }
410
+ ],
411
+ "metadata": {
412
+ "kernelspec": {
413
+ "display_name": "baai",
414
+ "language": "python",
415
+ "name": "python3"
416
+ },
417
+ "language_info": {
418
+ "codemirror_mode": {
419
+ "name": "ipython",
420
+ "version": 3
421
+ },
422
+ "file_extension": ".py",
423
+ "mimetype": "text/x-python",
424
+ "name": "python",
425
+ "nbconvert_exporter": "python",
426
+ "pygments_lexer": "ipython3",
427
+ "version": "3.9.19"
428
+ }
429
+ },
430
+ "nbformat": 4,
431
+ "nbformat_minor": 2
432
+ }
nano-graphrag/examples/benchmarks/hnsw_vs_nano_vector_storage.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import time
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ from nano_graphrag import GraphRAG
6
+ from nano_graphrag._storage import NanoVectorDBStorage, HNSWVectorStorage
7
+ from nano_graphrag._utils import wrap_embedding_func_with_attrs
8
+
9
+
10
+ WORKING_DIR = "./nano_graphrag_cache_benchmark_hnsw_vs_nano_vector_storage"
11
+ DATA_LEN = 100_000
12
+ FAKE_DIM = 1024
13
+ BATCH_SIZE = 100000
14
+
15
+
16
+ @wrap_embedding_func_with_attrs(embedding_dim=FAKE_DIM, max_token_size=8192)
17
+ async def sample_embedding(texts: list[str]) -> np.ndarray:
18
+ return np.float32(np.random.rand(len(texts), FAKE_DIM))
19
+
20
+
21
def generate_test_data():
    """Build DATA_LEN dummy records keyed by their stringified index."""
    records = {}
    for index in range(DATA_LEN):
        records[str(index)] = {"content": f"Test content {index}"}
    return records
23
+
24
+
25
async def benchmark_storage(storage_class, name):
    """Benchmark one vector-storage backend: bulk insert, persist, then query.

    Returns a (insert_seconds, save_seconds, avg_query_seconds) tuple and
    prints a one-line summary. NOTE(review): the embeddings are random (see
    sample_embedding), so only timing — not retrieval quality — is measured.
    """
    rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=sample_embedding)
    storage = storage_class(
        namespace=f"benchmark_{name}",
        global_config=rag.__dict__,
        embedding_func=sample_embedding,
        meta_fields={"content"},
    )

    test_data = generate_test_data()

    print(f"Benchmarking {name}...")
    with tqdm(total=DATA_LEN, desc=f"{name} Benchmark") as pbar:
        insert_started = time.time()
        ordered_keys = list(test_data.keys())
        for offset in range(0, len(test_data), BATCH_SIZE):
            batch = {
                key: test_data[key]
                for key in ordered_keys[offset : offset + BATCH_SIZE]
            }
            await storage.upsert(batch)
            pbar.update(min(BATCH_SIZE, DATA_LEN - offset))

        insert_time = time.time() - insert_started

        save_started = time.time()
        await storage.index_done_callback()
        save_time = time.time() - save_started
        pbar.update(1)

        # Time 100 top-10 lookups against a random query vector.
        query_vector = np.random.rand(FAKE_DIM)
        query_times = []
        for _ in range(100):
            query_started = time.time()
            await storage.query(query_vector, top_k=10)
            query_times.append(time.time() - query_started)
        pbar.update(1)

    avg_query_time = sum(query_times) / len(query_times)

    print(f"{name} - Insert: {insert_time:.2f}s, Save: {save_time:.2f}s, Avg Query: {avg_query_time:.4f}s")
    return insert_time, save_time, avg_query_time
63
+
64
+
65
async def run_benchmarks():
    """Run the NanoVectorDB and HNSW benchmarks back to back and print a summary."""
    print("Running NanoVectorDB benchmark...")
    nano_insert_time, nano_save_time, nano_query_time = await benchmark_storage(NanoVectorDBStorage, "nano")

    print("\nRunning HNSWVectorStorage benchmark...")
    hnsw_insert_time, hnsw_save_time, hnsw_query_time = await benchmark_storage(HNSWVectorStorage, "hnsw")

    print("\nBenchmark Results:")
    print(f"NanoVectorDB - Insert: {nano_insert_time:.2f}s, Save: {nano_save_time:.2f}s, Avg Query: {nano_query_time:.4f}s")
    print(f"HNSWVectorStorage - Insert: {hnsw_insert_time:.2f}s, Save: {hnsw_save_time:.2f}s, Avg Query: {hnsw_query_time:.4f}s")
75
+
76
+
77
+ if __name__ == "__main__":
78
+ asyncio.run(run_benchmarks())
nano-graphrag/examples/benchmarks/md5_vs_xxhash.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import xxhash
3
+ from hashlib import md5
4
+ from tqdm import tqdm
5
+ import numpy as np
6
+
7
+
8
def xxhash_ids(data: list[str]) -> np.ndarray:
    """Map each string to a 32-bit id via xxHash32, returned as a uint32 array."""
    digests = (xxhash.xxh32_intdigest(item.encode()) for item in data)
    return np.fromiter(digests, dtype=np.uint32, count=len(data))
14
+
15
+
16
def md5_ids(data: list[str]) -> np.ndarray:
    """Map each string to a 32-bit id: the low 32 bits of its MD5 digest."""
    truncated = (
        int(md5(item.encode()).hexdigest(), 16) & 0xFFFFFFFF for item in data
    )
    return np.fromiter(truncated, dtype=np.uint32, count=len(data))
22
+
23
+
24
+ if __name__ == "__main__":
25
+ num_ids = 1000000
26
+ num_iterations = 100
27
+ xxhash_times = []
28
+ md5_times = []
29
+
30
+ for i in tqdm(range(num_iterations)):
31
+ test_data = [f"{i}_{j}" for j in range(num_ids)]
32
+
33
+ start_time = time.time()
34
+ xxhash_result = xxhash_ids(test_data)
35
+ xxhash_times.append(time.time() - start_time)
36
+
37
+ start_time = time.time()
38
+ md5_result = md5_ids(test_data)
39
+ md5_times.append(time.time() - start_time)
40
+
41
+ assert len(xxhash_result) == len(md5_result) == num_ids
42
+ assert not np.array_equal(xxhash_result, md5_result)
43
+
44
+ avg_xxhash_time = np.mean(xxhash_times)
45
+ avg_md5_time = np.mean(md5_times)
46
+ std_xxhash_time = np.std(xxhash_times)
47
+ std_md5_time = np.std(md5_times)
48
+
49
+ print(f"num_ids: {num_ids} | num_iterations: {num_iterations}")
50
+ print(f"\nAverage xxhash time: {avg_xxhash_time:.4f} seconds")
51
+ print(f"Average MD5 time: {avg_md5_time:.4f} seconds")
52
+ print(f"xxhash is {avg_md5_time / avg_xxhash_time:.2f}x faster than MD5 on average")
53
+ print(f"\nxxhash time standard deviation: {std_xxhash_time:.4f} seconds")
54
+ print(f"MD5 time standard deviation: {std_md5_time:.4f} seconds")
nano-graphrag/examples/finetune_entity_relationship_dspy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
nano-graphrag/examples/generate_entity_relationship_dspy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
nano-graphrag/examples/graphml_visualize.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import networkx as nx
2
+ import json
3
+ import os
4
+ import webbrowser
5
+ import http.server
6
+ import socketserver
7
+ import threading
8
+
9
def graphml_to_json(graphml_file):
    """Load a GraphML file and return its node-link representation as a JSON string."""
    graph = nx.read_graphml(graphml_file)
    node_link = nx.node_link_data(graph)
    return json.dumps(node_link)
14
+
15
+
16
+ # create HTML file
17
+ def create_html(html_path):
18
+ html_content = '''
19
+ <!DOCTYPE html>
20
+ <html lang="en">
21
+ <head>
22
+ <meta charset="UTF-8">
23
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
24
+ <title>Graph Visualization</title>
25
+ <script src="https://d3js.org/d3.v7.min.js"></script>
26
+ <style>
27
+ body, html {
28
+ margin: 0;
29
+ padding: 0;
30
+ width: 100%;
31
+ height: 100%;
32
+ overflow: hidden;
33
+ }
34
+ svg {
35
+ width: 100%;
36
+ height: 100%;
37
+ }
38
+ .links line {
39
+ stroke: #999;
40
+ stroke-opacity: 0.6;
41
+ }
42
+ .nodes circle {
43
+ stroke: #fff;
44
+ stroke-width: 1.5px;
45
+ }
46
+ .node-label {
47
+ font-size: 12px;
48
+ pointer-events: none;
49
+ }
50
+ .link-label {
51
+ font-size: 10px;
52
+ fill: #666;
53
+ pointer-events: none;
54
+ opacity: 0;
55
+ transition: opacity 0.3s;
56
+ }
57
+ .link:hover .link-label {
58
+ opacity: 1;
59
+ }
60
+ .tooltip {
61
+ position: absolute;
62
+ text-align: left;
63
+ padding: 10px;
64
+ font: 12px sans-serif;
65
+ background: lightsteelblue;
66
+ border: 0px;
67
+ border-radius: 8px;
68
+ pointer-events: none;
69
+ opacity: 0;
70
+ transition: opacity 0.3s;
71
+ max-width: 300px;
72
+ }
73
+ .legend {
74
+ position: absolute;
75
+ top: 10px;
76
+ right: 10px;
77
+ background-color: rgba(255, 255, 255, 0.8);
78
+ padding: 10px;
79
+ border-radius: 5px;
80
+ }
81
+ .legend-item {
82
+ margin: 5px 0;
83
+ }
84
+ .legend-color {
85
+ display: inline-block;
86
+ width: 20px;
87
+ height: 20px;
88
+ margin-right: 5px;
89
+ vertical-align: middle;
90
+ }
91
+ </style>
92
+ </head>
93
+ <body>
94
+ <svg></svg>
95
+ <div class="tooltip"></div>
96
+ <div class="legend"></div>
97
+ <script type="text/javascript" src="./graph_json.js"></script>
98
+ <script>
99
+ const graphData = graphJson;
100
+
101
+ const svg = d3.select("svg"),
102
+ width = window.innerWidth,
103
+ height = window.innerHeight;
104
+
105
+ svg.attr("viewBox", [0, 0, width, height]);
106
+
107
+ const g = svg.append("g");
108
+
109
+ const entityTypes = [...new Set(graphData.nodes.map(d => d.entity_type))];
110
+ const color = d3.scaleOrdinal(d3.schemeCategory10).domain(entityTypes);
111
+
112
+ const simulation = d3.forceSimulation(graphData.nodes)
113
+ .force("link", d3.forceLink(graphData.links).id(d => d.id).distance(150))
114
+ .force("charge", d3.forceManyBody().strength(-300))
115
+ .force("center", d3.forceCenter(width / 2, height / 2))
116
+ .force("collide", d3.forceCollide().radius(30));
117
+
118
+ const linkGroup = g.append("g")
119
+ .attr("class", "links")
120
+ .selectAll("g")
121
+ .data(graphData.links)
122
+ .enter().append("g")
123
+ .attr("class", "link");
124
+
125
+ const link = linkGroup.append("line")
126
+ .attr("stroke-width", d => Math.sqrt(d.value));
127
+
128
+ const linkLabel = linkGroup.append("text")
129
+ .attr("class", "link-label")
130
+ .text(d => d.description || "");
131
+
132
+ const node = g.append("g")
133
+ .attr("class", "nodes")
134
+ .selectAll("circle")
135
+ .data(graphData.nodes)
136
+ .enter().append("circle")
137
+ .attr("r", 5)
138
+ .attr("fill", d => color(d.entity_type))
139
+ .call(d3.drag()
140
+ .on("start", dragstarted)
141
+ .on("drag", dragged)
142
+ .on("end", dragended));
143
+
144
+ const nodeLabel = g.append("g")
145
+ .attr("class", "node-labels")
146
+ .selectAll("text")
147
+ .data(graphData.nodes)
148
+ .enter().append("text")
149
+ .attr("class", "node-label")
150
+ .text(d => d.id);
151
+
152
+ const tooltip = d3.select(".tooltip");
153
+
154
+ node.on("mouseover", function(event, d) {
155
+ tooltip.transition()
156
+ .duration(200)
157
+ .style("opacity", .9);
158
+ tooltip.html(`<strong>${d.id}</strong><br>Entity Type: ${d.entity_type}<br>Description: ${d.description || "N/A"}`)
159
+ .style("left", (event.pageX + 10) + "px")
160
+ .style("top", (event.pageY - 28) + "px");
161
+ })
162
+ .on("mouseout", function(d) {
163
+ tooltip.transition()
164
+ .duration(500)
165
+ .style("opacity", 0);
166
+ });
167
+
168
+ const legend = d3.select(".legend");
169
+ entityTypes.forEach(type => {
170
+ legend.append("div")
171
+ .attr("class", "legend-item")
172
+ .html(`<span class="legend-color" style="background-color: ${color(type)}"></span>${type}`);
173
+ });
174
+
175
+ simulation
176
+ .nodes(graphData.nodes)
177
+ .on("tick", ticked);
178
+
179
+ simulation.force("link")
180
+ .links(graphData.links);
181
+
182
+ function ticked() {
183
+ link
184
+ .attr("x1", d => d.source.x)
185
+ .attr("y1", d => d.source.y)
186
+ .attr("x2", d => d.target.x)
187
+ .attr("y2", d => d.target.y);
188
+
189
+ linkLabel
190
+ .attr("x", d => (d.source.x + d.target.x) / 2)
191
+ .attr("y", d => (d.source.y + d.target.y) / 2)
192
+ .attr("text-anchor", "middle")
193
+ .attr("dominant-baseline", "middle");
194
+
195
+ node
196
+ .attr("cx", d => d.x)
197
+ .attr("cy", d => d.y);
198
+
199
+ nodeLabel
200
+ .attr("x", d => d.x + 8)
201
+ .attr("y", d => d.y + 3);
202
+ }
203
+
204
+ function dragstarted(event) {
205
+ if (!event.active) simulation.alphaTarget(0.3).restart();
206
+ event.subject.fx = event.subject.x;
207
+ event.subject.fy = event.subject.y;
208
+ }
209
+
210
+ function dragged(event) {
211
+ event.subject.fx = event.x;
212
+ event.subject.fy = event.y;
213
+ }
214
+
215
+ function dragended(event) {
216
+ if (!event.active) simulation.alphaTarget(0);
217
+ event.subject.fx = null;
218
+ event.subject.fy = null;
219
+ }
220
+
221
+ const zoom = d3.zoom()
222
+ .scaleExtent([0.1, 10])
223
+ .on("zoom", zoomed);
224
+
225
+ svg.call(zoom);
226
+
227
+ function zoomed(event) {
228
+ g.attr("transform", event.transform);
229
+ }
230
+
231
+ </script>
232
+ </body>
233
+ </html>
234
+ '''
235
+
236
+ with open(html_path, 'w', encoding='utf-8') as f:
237
+ f.write(html_content)
238
+
239
+
240
def create_json(json_data, json_path):
    """Write the graph JSON as a JS global (``var graphJson = ...``) so the
    viewer HTML can load it via a plain <script src> tag.

    The payload is flattened to a single line; escaped double quotes are
    stripped and apostrophes are backslash-escaped, matching what the
    generated page expects.
    """
    cleaned = json_data.replace('\\"', '').replace("'", "\\'").replace("\n", "")
    with open(json_path, 'w', encoding='utf-8') as out_file:
        out_file.write("var graphJson = " + cleaned)
244
+
245
+
246
def start_server(port):
    """Serve the current working directory over HTTP on *port*.

    Blocks forever (serve_forever); intended to run on a daemon thread.
    """
    with socketserver.TCPServer(
        ("", port), http.server.SimpleHTTPRequestHandler
    ) as httpd:
        print(f"Server started at http://localhost:{port}")
        httpd.serve_forever()
252
+
253
# main function
def visualize_graphml(graphml_file, html_path, port=8000):
    """Convert a GraphML file to JSON, emit the D3 viewer page, and serve it.

    Writes ``graph_json.js`` next to *html_path*, starts a background HTTP
    server on *port*, opens the default browser on the page, then idles until
    Ctrl+C.
    """
    import time  # local import: only needed for the idle loop below

    json_data = graphml_to_json(graphml_file)
    html_dir = os.path.dirname(html_path)
    # BUG FIX: when html_path has no directory part, dirname() is "" and the
    # original os.makedirs("") raised FileNotFoundError. Only create real dirs.
    if html_dir and not os.path.exists(html_dir):
        os.makedirs(html_dir)
    json_path = os.path.join(html_dir, 'graph_json.js')
    create_json(json_data, json_path)
    create_html(html_path)

    # BUG FIX: the original passed ``target=start_server(port)``, which CALLS
    # start_server immediately (blocking this thread forever) instead of
    # handing the callable to the thread. Pass function and args separately.
    server_thread = threading.Thread(target=start_server, args=(port,))
    server_thread.daemon = True
    server_thread.start()

    # open default browser
    webbrowser.open(f'http://localhost:{port}/{html_path}')

    print("Visualization is ready. Press Ctrl+C to exit.")
    try:
        # keep main thread alive without the original's 100%-CPU busy-wait
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Shutting down...")
277
+
278
+ # usage
279
+ if __name__ == "__main__":
280
+ graphml_file = r"nano_graphrag_cache_azure_openai_TEST\graph_chunk_entity_relation.graphml" # replace with your GraphML file path
281
+ html_path = "graph_visualization.html"
282
+ visualize_graphml(graphml_file, html_path, 11236)
nano-graphrag/examples/no_openai_key_at_all.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import ollama
4
+ import numpy as np
5
+ from nano_graphrag import GraphRAG, QueryParam
6
+ from nano_graphrag import GraphRAG, QueryParam
7
+ from nano_graphrag.base import BaseKVStorage
8
+ from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs
9
+ from sentence_transformers import SentenceTransformer
10
+
11
+ logging.basicConfig(level=logging.WARNING)
12
+ logging.getLogger("nano-graphrag").setLevel(logging.INFO)
13
+
14
+ # !!! qwen2-7B maybe produce unparsable results and cause the extraction of graph to fail.
15
+ WORKING_DIR = "./nano_graphrag_cache_ollama_TEST"
16
+ MODEL = "qwen2"
17
+
18
+ EMBED_MODEL = SentenceTransformer(
19
+ "sentence-transformers/all-MiniLM-L6-v2", cache_folder=WORKING_DIR, device="cpu"
20
+ )
21
+
22
+
23
+ # We're using Sentence Transformers to generate embeddings for the BGE model
24
+ @wrap_embedding_func_with_attrs(
25
+ embedding_dim=EMBED_MODEL.get_sentence_embedding_dimension(),
26
+ max_token_size=EMBED_MODEL.max_seq_length,
27
+ )
28
+ async def local_embedding(texts: list[str]) -> np.ndarray:
29
+ return EMBED_MODEL.encode(texts, normalize_embeddings=True)
30
+
31
+
32
+ async def ollama_model_if_cache(
33
+ prompt, system_prompt=None, history_messages=[], **kwargs
34
+ ) -> str:
35
+ # remove kwargs that are not supported by ollama
36
+ kwargs.pop("max_tokens", None)
37
+ kwargs.pop("response_format", None)
38
+
39
+ ollama_client = ollama.AsyncClient()
40
+ messages = []
41
+ if system_prompt:
42
+ messages.append({"role": "system", "content": system_prompt})
43
+
44
+ # Get the cached response if having-------------------
45
+ hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
46
+ messages.extend(history_messages)
47
+ messages.append({"role": "user", "content": prompt})
48
+ if hashing_kv is not None:
49
+ args_hash = compute_args_hash(MODEL, messages)
50
+ if_cache_return = await hashing_kv.get_by_id(args_hash)
51
+ if if_cache_return is not None:
52
+ return if_cache_return["return"]
53
+ # -----------------------------------------------------
54
+ response = await ollama_client.chat(model=MODEL, messages=messages, **kwargs)
55
+
56
+ result = response["message"]["content"]
57
+ # Cache the response if having-------------------
58
+ if hashing_kv is not None:
59
+ await hashing_kv.upsert({args_hash: {"return": result, "model": MODEL}})
60
+ # -----------------------------------------------------
61
+ return result
62
+
63
+
64
+ def remove_if_exist(file):
65
+ if os.path.exists(file):
66
+ os.remove(file)
67
+
68
+
69
+ def query():
70
+ rag = GraphRAG(
71
+ working_dir=WORKING_DIR,
72
+ best_model_func=ollama_model_if_cache,
73
+ cheap_model_func=ollama_model_if_cache,
74
+ embedding_func=local_embedding,
75
+ )
76
+ print(
77
+ rag.query(
78
+ "What are the top themes in this story?", param=QueryParam(mode="global")
79
+ )
80
+ )
81
+
82
+
83
+ def insert():
84
+ from time import time
85
+
86
+ with open("./tests/mock_data.txt", encoding="utf-8-sig") as f:
87
+ FAKE_TEXT = f.read()
88
+
89
+ remove_if_exist(f"{WORKING_DIR}/vdb_entities.json")
90
+ remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json")
91
+ remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json")
92
+ remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json")
93
+ remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml")
94
+
95
+ rag = GraphRAG(
96
+ working_dir=WORKING_DIR,
97
+ enable_llm_cache=True,
98
+ best_model_func=ollama_model_if_cache,
99
+ cheap_model_func=ollama_model_if_cache,
100
+ embedding_func=local_embedding,
101
+ )
102
+ start = time()
103
+ rag.insert(FAKE_TEXT)
104
+ print("indexing time:", time() - start)
105
+ # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True)
106
+ # rag.insert(FAKE_TEXT[half_len:])
107
+
108
+
109
+ if __name__ == "__main__":
110
+ insert()
111
+ query()
nano-graphrag/examples/using_amazon_bedrock.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nano_graphrag import GraphRAG, QueryParam
2
+
3
+ graph_func = GraphRAG(
4
+ working_dir="../bedrock_example",
5
+ using_amazon_bedrock=True,
6
+ best_model_id="us.anthropic.claude-3-sonnet-20240229-v1:0",
7
+ cheap_model_id="us.anthropic.claude-3-haiku-20240307-v1:0",
8
+ )
9
+
10
+ with open("../tests/mock_data.txt") as f:
11
+ graph_func.insert(f.read())
12
+
13
+ prompt = "What are the top themes in this story?"
14
+
15
+ # Perform global graphrag search
16
+ print(graph_func.query(prompt, param=QueryParam(mode="global")))
17
+
18
+ # Perform local graphrag search (I think is better and more scalable one)
19
+ print(graph_func.query(prompt, param=QueryParam(mode="local")))
nano-graphrag/examples/using_custom_chunking_method.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nano_graphrag._utils import encode_string_by_tiktoken
2
+ from nano_graphrag.base import QueryParam
3
+ from nano_graphrag.graphrag import GraphRAG
4
+ from nano_graphrag._op import chunking_by_seperators
5
+
6
+
7
def chunking_by_token_size(
    tokens_list: list[list[int]],  # nano-graphrag may pass a batch of docs' tokens
    doc_keys: list[str],  # nano-graphrag may pass a batch of docs' key ids
    tiktoken_model,  # a tiktoken model used only for decode_batch
    overlap_token_size=128,
    max_token_size=1024,
):
    """Split each document's token list into overlapping fixed-size chunks.

    Windows of ``max_token_size`` tokens advance by
    ``max_token_size - overlap_token_size`` so consecutive chunks share an
    overlap. Returns one dict per chunk with its token count, stripped text,
    order index within its document, and the originating doc key.
    """
    results = []
    stride = max_token_size - overlap_token_size
    for doc_index, tokens in enumerate(tokens_list):
        window_starts = range(0, len(tokens), stride)
        token_windows = [tokens[s : s + max_token_size] for s in window_starts]
        window_lengths = [
            min(max_token_size, len(tokens) - s) for s in window_starts
        ]

        decoded_chunks = tiktoken_model.decode_batch(token_windows)
        for chunk_index, chunk_text in enumerate(decoded_chunks):
            results.append(
                {
                    "tokens": window_lengths[chunk_index],
                    "content": chunk_text.strip(),
                    "chunk_order_index": chunk_index,
                    "full_doc_id": doc_keys[doc_index],
                }
            )

    return results
37
+
38
+
39
+ WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST"
40
+ rag = GraphRAG(
41
+ working_dir=WORKING_DIR,
42
+ chunk_func=chunking_by_seperators,
43
+ )
nano-graphrag/examples/using_deepseek_api_as_llm+glm_api_as_embedding.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import numpy as np
4
+ from openai import AsyncOpenAI, OpenAI
5
+ from dataclasses import dataclass
6
+ from nano_graphrag import GraphRAG, QueryParam
7
+ from nano_graphrag.base import BaseKVStorage
8
+ from nano_graphrag._utils import compute_args_hash
9
+
10
+ logging.basicConfig(level=logging.WARNING)
11
+ logging.getLogger("nano-graphrag").setLevel(logging.INFO)
12
+
13
+ GLM_API_KEY = "XXXX"
14
+ DEEPSEEK_API_KEY = "sk-XXXX"
15
+
16
+ MODEL = "deepseek-chat"
17
+
18
+
19
+ async def deepseepk_model_if_cache(
20
+ prompt, system_prompt=None, history_messages=[], **kwargs
21
+ ) -> str:
22
+ openai_async_client = AsyncOpenAI(
23
+ api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com"
24
+ )
25
+ messages = []
26
+ if system_prompt:
27
+ messages.append({"role": "system", "content": system_prompt})
28
+
29
+ # Get the cached response if having-------------------
30
+ hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
31
+ messages.extend(history_messages)
32
+ messages.append({"role": "user", "content": prompt})
33
+ if hashing_kv is not None:
34
+ args_hash = compute_args_hash(MODEL, messages)
35
+ if_cache_return = await hashing_kv.get_by_id(args_hash)
36
+ if if_cache_return is not None:
37
+ return if_cache_return["return"]
38
+ # -----------------------------------------------------
39
+
40
+ response = await openai_async_client.chat.completions.create(
41
+ model=MODEL, messages=messages, **kwargs
42
+ )
43
+
44
+ # Cache the response if having-------------------
45
+ if hashing_kv is not None:
46
+ await hashing_kv.upsert(
47
+ {args_hash: {"return": response.choices[0].message.content, "model": MODEL}}
48
+ )
49
+ # -----------------------------------------------------
50
+ return response.choices[0].message.content
51
+
52
+
53
def remove_if_exist(file):
    """Delete *file* from disk; a missing file is silently ignored."""
    try:
        os.remove(file)
    except FileNotFoundError:
        pass
56
+
57
+
58
@dataclass
class EmbeddingFunc:
    """Awaitable wrapper pairing an async embedding function with its metadata.

    Calling the instance forwards all arguments to the wrapped coroutine
    function and awaits its result.
    """

    embedding_dim: int  # dimensionality of the vectors the function returns
    max_token_size: int  # maximum input length the embedding model accepts
    func: callable  # the wrapped async embedding function

    async def __call__(self, *args, **kwargs) -> np.ndarray:
        # Pure delegation: no caching or batching happens here.
        result = await self.func(*args, **kwargs)
        return result
66
+
67
def wrap_embedding_func_with_attrs(**kwargs):
    """Decorator factory: package an async embedding function into an
    EmbeddingFunc carrying the given metadata attributes
    (e.g. embedding_dim, max_token_size)."""

    def decorate(func) -> EmbeddingFunc:
        return EmbeddingFunc(func=func, **kwargs)

    return decorate
75
+
76
+ @wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192)
77
+ async def GLM_embedding(texts: list[str]) -> np.ndarray:
78
+ model_name = "embedding-2"
79
+ client = OpenAI(
80
+ api_key=GLM_API_KEY,
81
+ base_url="https://open.bigmodel.cn/api/paas/v4/"
82
+ )
83
+ embedding = client.embeddings.create(
84
+ input=texts,
85
+ model=model_name,
86
+ )
87
+ final_embedding = [d.embedding for d in embedding.data]
88
+ return np.array(final_embedding)
89
+
90
+
91
+
92
+ WORKING_DIR = "./nano_graphrag_cache_deepseek_TEST"
93
+
94
+ def query():
95
+ rag = GraphRAG(
96
+ working_dir=WORKING_DIR,
97
+ best_model_func=deepseepk_model_if_cache,
98
+ cheap_model_func=deepseepk_model_if_cache,
99
+ embedding_func=GLM_embedding,
100
+ )
101
+ print(
102
+ rag.query(
103
+ "What are the top themes in this story?", param=QueryParam(mode="global")
104
+ )
105
+ )
106
+
107
+
108
+ def insert():
109
+ from time import time
110
+
111
+ with open("./tests/mock_data.txt", encoding="utf-8-sig") as f:
112
+ FAKE_TEXT = f.read()
113
+
114
+ remove_if_exist(f"{WORKING_DIR}/vdb_entities.json")
115
+ remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json")
116
+ remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json")
117
+ remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json")
118
+ remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml")
119
+
120
+ rag = GraphRAG(
121
+ working_dir=WORKING_DIR,
122
+ enable_llm_cache=True,
123
+ best_model_func=deepseepk_model_if_cache,
124
+ cheap_model_func=deepseepk_model_if_cache,
125
+ embedding_func=GLM_embedding,
126
+ )
127
+ start = time()
128
+ rag.insert(FAKE_TEXT)
129
+ print("indexing time:", time() - start)
130
+ # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True)
131
+ # rag.insert(FAKE_TEXT[half_len:])
132
+
133
+
134
+ if __name__ == "__main__":
135
+ insert()
136
+ # query()
nano-graphrag/examples/using_deepseek_as_llm.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from openai import AsyncOpenAI
4
+ from nano_graphrag import GraphRAG, QueryParam
5
+ from nano_graphrag import GraphRAG, QueryParam
6
+ from nano_graphrag.base import BaseKVStorage
7
+ from nano_graphrag._utils import compute_args_hash
8
+
9
+ logging.basicConfig(level=logging.WARNING)
10
+ logging.getLogger("nano-graphrag").setLevel(logging.INFO)
11
+
12
+ DEEPSEEK_API_KEY = "sk-XXXX"
13
+ MODEL = "deepseek-chat"
14
+
15
+
16
+ async def deepseepk_model_if_cache(
17
+ prompt, system_prompt=None, history_messages=[], **kwargs
18
+ ) -> str:
19
+ openai_async_client = AsyncOpenAI(
20
+ api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com"
21
+ )
22
+ messages = []
23
+ if system_prompt:
24
+ messages.append({"role": "system", "content": system_prompt})
25
+
26
+ # Get the cached response if having-------------------
27
+ hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
28
+ messages.extend(history_messages)
29
+ messages.append({"role": "user", "content": prompt})
30
+ if hashing_kv is not None:
31
+ args_hash = compute_args_hash(MODEL, messages)
32
+ if_cache_return = await hashing_kv.get_by_id(args_hash)
33
+ if if_cache_return is not None:
34
+ return if_cache_return["return"]
35
+ # -----------------------------------------------------
36
+
37
+ response = await openai_async_client.chat.completions.create(
38
+ model=MODEL, messages=messages, **kwargs
39
+ )
40
+
41
+ # Cache the response if having-------------------
42
+ if hashing_kv is not None:
43
+ await hashing_kv.upsert(
44
+ {args_hash: {"return": response.choices[0].message.content, "model": MODEL}}
45
+ )
46
+ # -----------------------------------------------------
47
+ return response.choices[0].message.content
48
+
49
+
50
def remove_if_exist(file):
    """Best-effort delete: remove *file* when present, ignore when absent."""
    try:
        os.remove(file)
    except FileNotFoundError:
        pass  # already gone — nothing to clean up
53
+
54
+
55
+ WORKING_DIR = "./nano_graphrag_cache_deepseek_TEST"
56
+
57
+
58
+ def query():
59
+ rag = GraphRAG(
60
+ working_dir=WORKING_DIR,
61
+ best_model_func=deepseepk_model_if_cache,
62
+ cheap_model_func=deepseepk_model_if_cache,
63
+ )
64
+ print(
65
+ rag.query(
66
+ "What are the top themes in this story?", param=QueryParam(mode="global")
67
+ )
68
+ )
69
+
70
+
71
+ def insert():
72
+ from time import time
73
+
74
+ with open("./tests/mock_data.txt", encoding="utf-8-sig") as f:
75
+ FAKE_TEXT = f.read()
76
+
77
+ remove_if_exist(f"{WORKING_DIR}/vdb_entities.json")
78
+ remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json")
79
+ remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json")
80
+ remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json")
81
+ remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml")
82
+
83
+ rag = GraphRAG(
84
+ working_dir=WORKING_DIR,
85
+ enable_llm_cache=True,
86
+ best_model_func=deepseepk_model_if_cache,
87
+ cheap_model_func=deepseepk_model_if_cache,
88
+ )
89
+ start = time()
90
+ rag.insert(FAKE_TEXT)
91
+ print("indexing time:", time() - start)
92
+ # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True)
93
+ # rag.insert(FAKE_TEXT[half_len:])
94
+
95
+
96
+ if __name__ == "__main__":
97
+ insert()
98
+ # query()
nano-graphrag/examples/using_dspy_entity_extraction.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import AsyncOpenAI
3
+ from dotenv import load_dotenv
4
+ import logging
5
+ import numpy as np
6
+ import dspy
7
+ from sentence_transformers import SentenceTransformer
8
+ from nano_graphrag import GraphRAG, QueryParam
9
+ from nano_graphrag._llm import gpt_4o_mini_complete
10
+ from nano_graphrag._storage import HNSWVectorStorage
11
+ from nano_graphrag.base import BaseKVStorage
12
+ from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs
13
+ from nano_graphrag.entity_extraction.extract import extract_entities_dspy
14
+
15
+ logging.basicConfig(level=logging.WARNING)
16
+ logging.getLogger("nano-graphrag").setLevel(logging.DEBUG)
17
+
18
+ WORKING_DIR = "./nano_graphrag_cache_using_dspy_entity_extraction"
19
+
20
+ load_dotenv()
21
+
22
+
23
+ EMBED_MODEL = SentenceTransformer(
24
+ "sentence-transformers/all-MiniLM-L6-v2", cache_folder=WORKING_DIR, device="cpu"
25
+ )
26
+
27
+
28
+ @wrap_embedding_func_with_attrs(
29
+ embedding_dim=EMBED_MODEL.get_sentence_embedding_dimension(),
30
+ max_token_size=EMBED_MODEL.max_seq_length,
31
+ )
32
+ async def local_embedding(texts: list[str]) -> np.ndarray:
33
+ return EMBED_MODEL.encode(texts, normalize_embeddings=True)
34
+
35
+
36
+ async def deepseepk_model_if_cache(
37
+ prompt, model: str = "deepseek-chat", system_prompt=None, history_messages=[], **kwargs
38
+ ) -> str:
39
+ openai_async_client = AsyncOpenAI(
40
+ api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com"
41
+ )
42
+ messages = []
43
+ if system_prompt:
44
+ messages.append({"role": "system", "content": system_prompt})
45
+
46
+ # Get the cached response if having-------------------
47
+ hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
48
+ messages.extend(history_messages)
49
+ messages.append({"role": "user", "content": prompt})
50
+ if hashing_kv is not None:
51
+ args_hash = compute_args_hash(model, messages)
52
+ if_cache_return = await hashing_kv.get_by_id(args_hash)
53
+ if if_cache_return is not None:
54
+ return if_cache_return["return"]
55
+ # -----------------------------------------------------
56
+
57
+ response = await openai_async_client.chat.completions.create(
58
+ model=model, messages=messages, **kwargs
59
+ )
60
+
61
+ # Cache the response if having-------------------
62
+ if hashing_kv is not None:
63
+ await hashing_kv.upsert(
64
+ {args_hash: {"return": response.choices[0].message.content, "model": model}}
65
+ )
66
+ # -----------------------------------------------------
67
+ return response.choices[0].message.content
68
+
69
+
70
+
71
+ def remove_if_exist(file):
72
+ if os.path.exists(file):
73
+ os.remove(file)
74
+
75
+
76
+ def insert():
77
+ from time import time
78
+
79
+ with open("./tests/mock_data.txt", encoding="utf-8-sig") as f:
80
+ FAKE_TEXT = f.read()
81
+
82
+ remove_if_exist(f"{WORKING_DIR}/vdb_entities.json")
83
+ remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json")
84
+ remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json")
85
+ remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json")
86
+ remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml")
87
+ rag = GraphRAG(
88
+ working_dir=WORKING_DIR,
89
+ enable_llm_cache=True,
90
+ vector_db_storage_cls=HNSWVectorStorage,
91
+ vector_db_storage_cls_kwargs={"max_elements": 1000000, "ef_search": 200, "M": 50},
92
+ best_model_max_async=10,
93
+ cheap_model_max_async=10,
94
+ best_model_func=deepseepk_model_if_cache,
95
+ cheap_model_func=deepseepk_model_if_cache,
96
+ embedding_func=local_embedding,
97
+ entity_extraction_func=extract_entities_dspy
98
+ )
99
+ start = time()
100
+ rag.insert(FAKE_TEXT)
101
+ print("indexing time:", time() - start)
102
+
103
+
104
+ def query():
105
+ rag = GraphRAG(
106
+ working_dir=WORKING_DIR,
107
+ enable_llm_cache=True,
108
+ vector_db_storage_cls=HNSWVectorStorage,
109
+ vector_db_storage_cls_kwargs={"max_elements": 1000000, "ef_search": 200, "M": 50},
110
+ best_model_max_token_size=8196,
111
+ cheap_model_max_token_size=8196,
112
+ best_model_max_async=4,
113
+ cheap_model_max_async=4,
114
+ best_model_func=gpt_4o_mini_complete,
115
+ cheap_model_func=gpt_4o_mini_complete,
116
+ embedding_func=local_embedding,
117
+ entity_extraction_func=extract_entities_dspy
118
+
119
+ )
120
+ print(
121
+ rag.query(
122
+ "What are the top themes in this story?", param=QueryParam(mode="global")
123
+ )
124
+ )
125
+ print(
126
+ rag.query(
127
+ "What are the top themes in this story?", param=QueryParam(mode="local")
128
+ )
129
+ )
130
+
131
+
132
+ if __name__ == "__main__":
133
+ lm = dspy.LM(
134
+ model="deepseek/deepseek-chat",
135
+ model_type="chat",
136
+ api_provider="openai",
137
+ api_key=os.environ["DEEPSEEK_API_KEY"],
138
+ base_url=os.environ["DEEPSEEK_BASE_URL"],
139
+ temperature=1.0,
140
+ max_tokens=8192
141
+ )
142
+ dspy.settings.configure(lm=lm, experimental=True)
143
+ insert()
144
+ query()
nano-graphrag/examples/using_faiss_as_vextorDB.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import numpy as np
4
+ from nano_graphrag.graphrag import GraphRAG, QueryParam
5
+ from nano_graphrag._utils import logger
6
+ from nano_graphrag.base import BaseVectorStorage
7
+ from dataclasses import dataclass
8
+ import faiss
9
+ import pickle
10
+ import logging
11
+ import xxhash
12
+ logging.getLogger('msal').setLevel(logging.WARNING)
13
+ logging.getLogger('azure').setLevel(logging.WARNING)
14
+ logging.getLogger("httpx").setLevel(logging.WARNING)
15
+
16
+ WORKING_DIR = "./nano_graphrag_cache_faiss_TEST"
17
+
18
@dataclass
class FAISSStorage(BaseVectorStorage):
    """Vector storage backed by a local FAISS ``IndexIDMap(IndexFlatIP)``.

    Vectors are persisted per-namespace in ``<ns>_faiss.index``; payload
    metadata (keyed by a 32-bit xxhash of the original string id) is kept in
    a companion ``<ns>_metadata.pkl`` pickle.
    """

    def __post_init__(self):
        # Per-namespace persistence paths under the GraphRAG working dir.
        self._index_file_name = os.path.join(
            self.global_config["working_dir"], f"{self.namespace}_faiss.index"
        )
        self._metadata_file_name = os.path.join(
            self.global_config["working_dir"], f"{self.namespace}_metadata.pkl"
        )
        self._max_batch_size = self.global_config["embedding_batch_num"]

        if os.path.exists(self._index_file_name) and os.path.exists(self._metadata_file_name):
            # Resume from a previous run: both files must exist together.
            self._index = faiss.read_index(self._index_file_name)
            with open(self._metadata_file_name, 'rb') as f:
                self._metadata = pickle.load(f)
        else:
            # Inner-product index; "1 - distance" in query() treats scores as
            # cosine similarity, which assumes L2-normalized embeddings —
            # TODO confirm embedding_func normalizes.
            self._index = faiss.IndexIDMap(faiss.IndexFlatIP(self.embedding_func.embedding_dim))
            self._metadata = {}

    async def upsert(self, data: dict[str, dict]):
        """Embed and add ``{string_id: {"content": ..., ...}}`` entries.

        Returns the number of entries processed.
        NOTE(review): faiss ``add_with_ids`` appends rather than replaces, so
        re-upserting an existing key duplicates its vector — confirm intended.
        """
        logger.info(f"Inserting {len(data)} vectors to {self.namespace}")

        # Embed contents in bounded batches, concurrently.
        contents = [v["content"] for v in data.values()]
        batches = [
            contents[i : i + self._max_batch_size]
            for i in range(0, len(contents), self._max_batch_size)
        ]
        embeddings_list = await asyncio.gather(
            *[self.embedding_func(batch) for batch in batches]
        )
        embeddings = np.concatenate(embeddings_list)

        ids = []
        for k, v in data.items():
            # FAISS ids must be ints: hash the string key; keep the original
            # key (plus selected meta fields) in the side-car metadata dict.
            id = xxhash.xxh32_intdigest(k.encode())
            metadata = {k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}
            metadata['id'] = k
            self._metadata[id] = metadata
            ids.append(id)

        ids = np.array(ids, dtype=np.int64)
        self._index.add_with_ids(embeddings, ids)


        return len(data)

    async def query(self, query, top_k=5):
        """Return up to ``top_k`` metadata dicts, each with a ``distance`` field."""
        embedding = await self.embedding_func([query])
        distances, indices = self._index.search(embedding, top_k)

        results = []
        for _, (distance, id) in enumerate(zip(distances[0], indices[0])):
            if id != -1:  # FAISS returns -1 for empty slots
                if id in self._metadata:
                    metadata = self._metadata[id]
                    results.append({**metadata, "distance": 1 - distance})  # Convert to cosine distance

        return results

    async def index_done_callback(self):
        # Persist the vectors and the id->metadata side-car together so the
        # two files stay in sync for the next run's __post_init__.
        faiss.write_index(self._index, self._index_file_name)
        with open(self._metadata_file_name, 'wb') as f:
            pickle.dump(self._metadata, f)
82
+
83
if __name__ == "__main__":

    # Demo driver: index the first 30k chars of the mock corpus with the
    # FAISS-backed vector storage, then run a default (global) query.
    graph_func = GraphRAG(
        working_dir=WORKING_DIR,
        enable_llm_cache=True,
        vector_db_storage_cls=FAISSStorage,
    )

    with open(r"tests/mock_data.txt", encoding='utf-8') as f:
        graph_func.insert(f.read()[:30000])

    # Perform global graphrag search
    print(graph_func.query("What are the top themes in this story?"))
96
+
97
+
nano-graphrag/examples/using_hnsw_as_vectorDB.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import AsyncOpenAI
3
+ from dotenv import load_dotenv
4
+ import logging
5
+ import numpy as np
6
+ from sentence_transformers import SentenceTransformer
7
+ from nano_graphrag import GraphRAG, QueryParam
8
+ from nano_graphrag._llm import gpt_4o_mini_complete
9
+ from nano_graphrag._storage import HNSWVectorStorage
10
+ from nano_graphrag.base import BaseKVStorage
11
+ from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs
12
+
13
+ logging.basicConfig(level=logging.WARNING)
14
+ logging.getLogger("nano-graphrag").setLevel(logging.DEBUG)
15
+
16
+ WORKING_DIR = "./nano_graphrag_cache_using_hnsw_as_vectorDB"
17
+
18
+ load_dotenv()
19
+
20
+
21
+ EMBED_MODEL = SentenceTransformer(
22
+ "sentence-transformers/all-MiniLM-L6-v2", cache_folder=WORKING_DIR, device="cpu"
23
+ )
24
+
25
+
26
@wrap_embedding_func_with_attrs(
    embedding_dim=EMBED_MODEL.get_sentence_embedding_dimension(),
    max_token_size=EMBED_MODEL.max_seq_length,
)
async def local_embedding(texts: list[str]) -> np.ndarray:
    """Embed *texts* with the local SentenceTransformer (normalized vectors)."""
    return EMBED_MODEL.encode(texts, normalize_embeddings=True)
32
+
33
+
34
async def deepseepk_model_if_cache(
    prompt, model: str = "deepseek-chat", system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Call DeepSeek's OpenAI-compatible chat API with read-through caching.

    When GraphRAG passes a ``hashing_kv`` KV store in kwargs, a hash of
    (model, messages) is used to return a cached completion and to store
    new ones. ``history_messages`` is read-only here, so the mutable
    default is benign.
    """
    openai_async_client = AsyncOpenAI(
        api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com"
    )
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Get the cached response if having-------------------
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
    messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})
    if hashing_kv is not None:
        args_hash = compute_args_hash(model, messages)
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]
    # -----------------------------------------------------

    response = await openai_async_client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )

    # Cache the response if having-------------------
    if hashing_kv is not None:
        await hashing_kv.upsert(
            {args_hash: {"return": response.choices[0].message.content, "model": model}}
        )
    # -----------------------------------------------------
    return response.choices[0].message.content
66
+
67
+
68
+
69
def remove_if_exist(file):
    """Delete *file* if it is present; silently do nothing when missing."""
    # EAFP: attempt the removal and ignore a missing path, avoiding the
    # exists()/remove() check-then-act race of the original form.
    try:
        os.remove(file)
    except FileNotFoundError:
        pass
72
+
73
+
74
def insert():
    """Rebuild the index from scratch over the mock corpus and time it."""
    from time import time

    with open("./tests/mock_data.txt", encoding="utf-8-sig") as f:
        FAKE_TEXT = f.read()

    # Drop previous artifacts so this run re-indexes from a clean state
    # (the LLM response cache is intentionally kept).
    remove_if_exist(f"{WORKING_DIR}/vdb_entities.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json")
    remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml")
    rag = GraphRAG(
        working_dir=WORKING_DIR,
        enable_llm_cache=True,
        vector_db_storage_cls=HNSWVectorStorage,
        vector_db_storage_cls_kwargs={"max_elements": 1000000, "ef_search": 200, "M": 50},
        best_model_max_async=10,
        cheap_model_max_async=10,
        best_model_func=deepseepk_model_if_cache,
        cheap_model_func=deepseepk_model_if_cache,
        embedding_func=local_embedding
    )
    start = time()
    rag.insert(FAKE_TEXT)
    print("indexing time:", time() - start)
99
+
100
+
101
def query():
    """Answer a demo question in both 'global' and 'local' modes.

    Reuses the storage written by insert() but answers with gpt-4o-mini.
    NOTE(review): 8196 looks like a typo for 8192 — confirm intended.
    """
    rag = GraphRAG(
        working_dir=WORKING_DIR,
        enable_llm_cache=True,
        vector_db_storage_cls=HNSWVectorStorage,
        vector_db_storage_cls_kwargs={"max_elements": 1000000, "ef_search": 200, "M": 50},
        best_model_max_token_size=8196,
        cheap_model_max_token_size=8196,
        best_model_max_async=4,
        cheap_model_max_async=4,
        best_model_func=gpt_4o_mini_complete,
        cheap_model_func=gpt_4o_mini_complete,
        embedding_func=local_embedding
    )
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="global")
        )
    )
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="local")
        )
    )
125
+
126
+
127
if __name__ == "__main__":
    # Build the index first, then query it.
    insert()
    query()
nano-graphrag/examples/using_llm_api_as_llm+ollama_embedding.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import ollama
4
+ import numpy as np
5
+ from openai import AsyncOpenAI
6
+ from nano_graphrag import GraphRAG, QueryParam
7
+ from nano_graphrag import GraphRAG, QueryParam
8
+ from nano_graphrag.base import BaseKVStorage
9
+ from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs
10
+
11
+ logging.basicConfig(level=logging.WARNING)
12
+ logging.getLogger("nano-graphrag").setLevel(logging.INFO)
13
+
14
+ # Assumed llm model settings
15
+ LLM_BASE_URL = "https://your.api.url"
16
+ LLM_API_KEY = "your_api_key"
17
+ MODEL = "your_model_name"
18
+
19
+ # Assumed embedding model settings
20
+ EMBEDDING_MODEL = "nomic-embed-text"
21
+ EMBEDDING_MODEL_DIM = 768
22
+ EMBEDDING_MODEL_MAX_TOKENS = 8192
23
+
24
+
25
async def llm_model_if_cache(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Call an OpenAI-compatible endpoint (LLM_BASE_URL/MODEL) with caching.

    When GraphRAG passes a ``hashing_kv`` KV store in kwargs, a hash of
    (MODEL, messages) is used to return a cached completion and to store
    new ones. ``history_messages`` is read-only here, so the mutable
    default is benign.
    """
    openai_async_client = AsyncOpenAI(
        api_key=LLM_API_KEY, base_url=LLM_BASE_URL
    )
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Get the cached response if having-------------------
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
    messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})
    if hashing_kv is not None:
        args_hash = compute_args_hash(MODEL, messages)
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]
    # -----------------------------------------------------

    response = await openai_async_client.chat.completions.create(
        model=MODEL, messages=messages, **kwargs
    )

    # Cache the response if having-------------------
    if hashing_kv is not None:
        await hashing_kv.upsert(
            {args_hash: {"return": response.choices[0].message.content, "model": MODEL}}
        )
    # -----------------------------------------------------
    return response.choices[0].message.content
+ return response.choices[0].message.content
57
+
58
+
59
+ def remove_if_exist(file):
60
+ if os.path.exists(file):
61
+ os.remove(file)
62
+
63
+
64
+ WORKING_DIR = "./nano_graphrag_cache_llm_TEST"
65
+
66
+
67
+ def query():
68
+ rag = GraphRAG(
69
+ working_dir=WORKING_DIR,
70
+ best_model_func=llm_model_if_cache,
71
+ cheap_model_func=llm_model_if_cache,
72
+ embedding_func=ollama_embedding,
73
+ )
74
+ print(
75
+ rag.query(
76
+ "What are the top themes in this story?", param=QueryParam(mode="global")
77
+ )
78
+ )
79
+
80
+
81
+ def insert():
82
+ from time import time
83
+
84
+ with open("./tests/mock_data.txt", encoding="utf-8-sig") as f:
85
+ FAKE_TEXT = f.read()
86
+
87
+ remove_if_exist(f"{WORKING_DIR}/vdb_entities.json")
88
+ remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json")
89
+ remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json")
90
+ remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json")
91
+ remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml")
92
+
93
+ rag = GraphRAG(
94
+ working_dir=WORKING_DIR,
95
+ enable_llm_cache=True,
96
+ best_model_func=llm_model_if_cache,
97
+ cheap_model_func=llm_model_if_cache,
98
+ embedding_func=ollama_embedding,
99
+ )
100
+ start = time()
101
+ rag.insert(FAKE_TEXT)
102
+ print("indexing time:", time() - start)
103
+ # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True)
104
+ # rag.insert(FAKE_TEXT[half_len:])
105
+
106
+ # We're using Ollama to generate embeddings for the BGE model
107
+ @wrap_embedding_func_with_attrs(
108
+ embedding_dim= EMBEDDING_MODEL_DIM,
109
+ max_token_size= EMBEDDING_MODEL_MAX_TOKENS,
110
+ )
111
+
112
+ async def ollama_embedding(texts :list[str]) -> np.ndarray:
113
+ embed_text = []
114
+ for text in texts:
115
+ data = ollama.embeddings(model=EMBEDDING_MODEL, prompt=text)
116
+ embed_text.append(data["embedding"])
117
+
118
+ return embed_text
119
+
120
if __name__ == "__main__":
    # Build the index first, then query it.
    insert()
    query()
nano-graphrag/examples/using_local_embedding_model.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ sys.path.append("..")
4
+ import logging
5
+ import numpy as np
6
+ from nano_graphrag import GraphRAG, QueryParam
7
+ from nano_graphrag._utils import wrap_embedding_func_with_attrs
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ logging.basicConfig(level=logging.WARNING)
11
+ logging.getLogger("nano-graphrag").setLevel(logging.INFO)
12
+
13
+ WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST"
14
+
15
+ EMBED_MODEL = SentenceTransformer(
16
+ "sentence-transformers/all-MiniLM-L6-v2", cache_folder=WORKING_DIR, device="cpu"
17
+ )
18
+
19
+
20
+ # We're using Sentence Transformers to generate embeddings for the BGE model
21
@wrap_embedding_func_with_attrs(
    embedding_dim=EMBED_MODEL.get_sentence_embedding_dimension(),
    max_token_size=EMBED_MODEL.max_seq_length,
)
async def local_embedding(texts: list[str]) -> np.ndarray:
    """Embed *texts* with the local SentenceTransformer (normalized vectors)."""
    return EMBED_MODEL.encode(texts, normalize_embeddings=True)
27
+
28
+
29
# Demo script body: build a GraphRAG over the mock corpus using only the
# local embedding model (default OpenAI models answer the query).
rag = GraphRAG(
    working_dir=WORKING_DIR,
    embedding_func=local_embedding,
)

with open("../tests/mock_data.txt", encoding="utf-8-sig") as f:
    FAKE_TEXT = f.read()

# Indexing is commented out: this run assumes the working dir already
# contains an index from a previous execution.
# rag.insert(FAKE_TEXT)
print(rag.query("What the main theme of this story?", param=QueryParam(mode="local")))
nano-graphrag/examples/using_milvus_as_vectorDB.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import numpy as np
4
+ from nano_graphrag import GraphRAG, QueryParam
5
+ from nano_graphrag._utils import logger
6
+ from nano_graphrag.base import BaseVectorStorage
7
+ from dataclasses import dataclass
8
+
9
+
10
@dataclass
class MilvusLiteStorge(BaseVectorStorage):
    """Vector storage backed by a local Milvus Lite database file.

    One Milvus collection per namespace, stored in ``milvus_lite.db`` under
    the GraphRAG working dir. (Class name typo "Storge" kept for
    compatibility with existing callers.)
    """

    @staticmethod
    def create_collection_if_not_exist(client, collection_name: str, **kwargs):
        """Create *collection_name* with string ids unless it already exists."""
        if client.has_collection(collection_name):
            return
        # TODO add constants for ID max length to 32
        client.create_collection(
            collection_name, max_length=32, id_type="string", **kwargs
        )

    def __post_init__(self):
        # Imported lazily so the example only needs pymilvus when used.
        from pymilvus import MilvusClient

        self._client_file_name = os.path.join(
            self.global_config["working_dir"], "milvus_lite.db"
        )
        self._client = MilvusClient(self._client_file_name)
        self._max_batch_size = self.global_config["embedding_batch_num"]
        MilvusLiteStorge.create_collection_if_not_exist(
            self._client,
            self.namespace,
            dimension=self.embedding_func.embedding_dim,
        )

    async def upsert(self, data: dict[str, dict]):
        """Embed and upsert ``{id: {"content": ..., ...}}`` entries.

        Returns the raw Milvus upsert result.
        """
        logger.info(f"Inserting {len(data)} vectors to {self.namespace}")
        # Keep only the configured meta fields alongside each string id.
        list_data = [
            {
                "id": k,
                **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
            }
            for k, v in data.items()
        ]
        # Embed contents in bounded batches, concurrently.
        contents = [v["content"] for v in data.values()]
        batches = [
            contents[i : i + self._max_batch_size]
            for i in range(0, len(contents), self._max_batch_size)
        ]
        embeddings_list = await asyncio.gather(
            *[self.embedding_func(batch) for batch in batches]
        )
        embeddings = np.concatenate(embeddings_list)
        for i, d in enumerate(list_data):
            d["vector"] = embeddings[i]
        results = self._client.upsert(collection_name=self.namespace, data=list_data)
        return results

    async def query(self, query, top_k=5):
        """Cosine-search the namespace; returns meta dicts with id/distance."""
        embedding = await self.embedding_func([query])
        # radius=0.2 filters out low-similarity matches server-side.
        results = self._client.search(
            collection_name=self.namespace,
            data=embedding,
            limit=top_k,
            output_fields=list(self.meta_fields),
            search_params={"metric_type": "COSINE", "params": {"radius": 0.2}},
        )
        return [
            {**dp["entity"], "id": dp["id"], "distance": dp["distance"]}
            for dp in results[0]
        ]
+ ]
72
+
73
+
74
def insert():
    """Index placeholder documents using the Milvus Lite vector storage."""
    data = ["YOUR TEXT DATA HERE", "YOUR TEXT DATA HERE"]
    rag = GraphRAG(
        working_dir="./nano_graphrag_cache_milvus_TEST",
        enable_llm_cache=True,
        vector_db_storage_cls=MilvusLiteStorge,
    )
    rag.insert(data)
+ rag.insert(data)
82
+
83
+
84
def query():
    """Run a local-mode query against the Milvus-backed index."""
    rag = GraphRAG(
        working_dir="./nano_graphrag_cache_milvus_TEST",
        enable_llm_cache=True,
        vector_db_storage_cls=MilvusLiteStorge,
    )
    print(rag.query("YOUR QUERY HERE", param=QueryParam(mode="local")))
+ print(rag.query("YOUR QUERY HERE", param=QueryParam(mode="local")))
91
+
92
+
93
+ insert()
94
+ query()
nano-graphrag/examples/using_ollama_as_llm.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import ollama
4
+ from nano_graphrag import GraphRAG, QueryParam
5
+ from nano_graphrag import GraphRAG, QueryParam
6
+ from nano_graphrag.base import BaseKVStorage
7
+ from nano_graphrag._utils import compute_args_hash
8
+
9
+ logging.basicConfig(level=logging.WARNING)
10
+ logging.getLogger("nano-graphrag").setLevel(logging.INFO)
11
+
12
+ # !!! qwen2-7B maybe produce unparsable results and cause the extraction of graph to fail.
13
+ MODEL = "qwen2"
14
+
15
+
16
async def ollama_model_if_cache(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Chat with a local Ollama model (MODEL) with read-through caching.

    When GraphRAG passes a ``hashing_kv`` KV store in kwargs, a hash of
    (MODEL, messages) is used to return a cached completion and to store
    new ones. ``history_messages`` is read-only here, so the mutable
    default is benign.
    """
    # remove kwargs that are not supported by ollama
    kwargs.pop("max_tokens", None)
    kwargs.pop("response_format", None)

    ollama_client = ollama.AsyncClient()
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Get the cached response if having-------------------
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
    messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})
    if hashing_kv is not None:
        args_hash = compute_args_hash(MODEL, messages)
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]
    # -----------------------------------------------------
    response = await ollama_client.chat(model=MODEL, messages=messages, **kwargs)

    result = response["message"]["content"]
    # Cache the response if having-------------------
    if hashing_kv is not None:
        await hashing_kv.upsert({args_hash: {"return": result, "model": MODEL}})
    # -----------------------------------------------------
    return result
46
+
47
+
48
def remove_if_exist(file):
    """Delete *file* if it is present; silently do nothing when missing."""
    # EAFP: attempt the removal and ignore a missing path, avoiding the
    # exists()/remove() check-then-act race of the original form.
    try:
        os.remove(file)
    except FileNotFoundError:
        pass
51
+
52
+
53
+ WORKING_DIR = "./nano_graphrag_cache_ollama_TEST"
54
+
55
+
56
def query():
    """Run one 'global' mode query using the local Ollama model."""
    rag = GraphRAG(
        working_dir=WORKING_DIR,
        best_model_func=ollama_model_if_cache,
        cheap_model_func=ollama_model_if_cache,
    )
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="global")
        )
    )
67
+
68
+
69
def insert():
    """Rebuild the index from scratch over the mock corpus and time it."""
    from time import time

    with open("./tests/mock_data.txt", encoding="utf-8-sig") as f:
        FAKE_TEXT = f.read()

    # Drop previous artifacts so this run re-indexes from a clean state
    # (the LLM response cache is intentionally kept).
    remove_if_exist(f"{WORKING_DIR}/vdb_entities.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json")
    remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml")

    rag = GraphRAG(
        working_dir=WORKING_DIR,
        enable_llm_cache=True,
        best_model_func=ollama_model_if_cache,
        cheap_model_func=ollama_model_if_cache,
    )
    start = time()
    rag.insert(FAKE_TEXT)
    print("indexing time:", time() - start)
    # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True)
    # rag.insert(FAKE_TEXT[half_len:])
92
+
93
+
94
if __name__ == "__main__":
    # Build the index first, then query it.
    insert()
    query()
nano-graphrag/examples/using_ollama_as_llm_and_embedding.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append("..")
5
+ import logging
6
+ import ollama
7
+ import numpy as np
8
+ from nano_graphrag import GraphRAG, QueryParam
9
+ from nano_graphrag.base import BaseKVStorage
10
+ from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs
11
+
12
+ logging.basicConfig(level=logging.WARNING)
13
+ logging.getLogger("nano-graphrag").setLevel(logging.INFO)
14
+
15
+ # Assumed llm model settings
16
+ MODEL = "your_model_name"
17
+
18
+ # Assumed embedding model settings
19
+ EMBEDDING_MODEL = "nomic-embed-text"
20
+ EMBEDDING_MODEL_DIM = 768
21
+ EMBEDDING_MODEL_MAX_TOKENS = 8192
22
+
23
+
24
async def ollama_model_if_cache(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Chat with a local Ollama model (MODEL) with read-through caching.

    When GraphRAG passes a ``hashing_kv`` KV store in kwargs, a hash of
    (MODEL, messages) is used to return a cached completion and to store
    new ones. ``history_messages`` is read-only here, so the mutable
    default is benign.
    """
    # remove kwargs that are not supported by ollama
    kwargs.pop("max_tokens", None)
    kwargs.pop("response_format", None)

    ollama_client = ollama.AsyncClient()
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Get the cached response if having-------------------
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
    messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})
    if hashing_kv is not None:
        args_hash = compute_args_hash(MODEL, messages)
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]
    # -----------------------------------------------------
    response = await ollama_client.chat(model=MODEL, messages=messages, **kwargs)

    result = response["message"]["content"]
    # Cache the response if having-------------------
    if hashing_kv is not None:
        await hashing_kv.upsert({args_hash: {"return": result, "model": MODEL}})
    # -----------------------------------------------------
    return result
+ return result
54
+
55
+
56
+ def remove_if_exist(file):
57
+ if os.path.exists(file):
58
+ os.remove(file)
59
+
60
+
61
+ WORKING_DIR = "./nano_graphrag_cache_ollama_TEST"
62
+
63
+
64
def query():
    """Run one 'global' mode query using Ollama for both LLM and embeddings."""
    rag = GraphRAG(
        working_dir=WORKING_DIR,
        best_model_func=ollama_model_if_cache,
        cheap_model_func=ollama_model_if_cache,
        embedding_func=ollama_embedding,
    )
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="global")
        )
    )
+
77
+
78
def insert():
    """Rebuild the index from scratch over the mock corpus and time it."""
    from time import time

    with open("./tests/mock_data.txt", encoding="utf-8-sig") as f:
        FAKE_TEXT = f.read()

    # Drop previous artifacts so this run re-indexes from a clean state
    # (the LLM response cache is intentionally kept).
    remove_if_exist(f"{WORKING_DIR}/vdb_entities.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json")
    remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json")
    remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml")

    rag = GraphRAG(
        working_dir=WORKING_DIR,
        enable_llm_cache=True,
        best_model_func=ollama_model_if_cache,
        cheap_model_func=ollama_model_if_cache,
        embedding_func=ollama_embedding,
    )
    start = time()
    rag.insert(FAKE_TEXT)
    print("indexing time:", time() - start)
    # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True)
    # rag.insert(FAKE_TEXT[half_len:])
+ # rag.insert(FAKE_TEXT[half_len:])
102
+
103
+
104
+ # We're using Ollama to generate embeddings for the BGE model
105
@wrap_embedding_func_with_attrs(
    embedding_dim=EMBEDDING_MODEL_DIM,
    max_token_size=EMBEDDING_MODEL_MAX_TOKENS,
)
async def ollama_embedding(texts: list[str]) -> np.ndarray:
    """Embed each text with the local Ollama embedding model.

    Args:
        texts: batch of strings to embed.

    Returns:
        Float array of shape (len(texts), EMBEDDING_MODEL_DIM).
    """
    embed_text = []
    for text in texts:
        # Ollama embeds one prompt per call; loop over the batch.
        data = ollama.embeddings(model=EMBEDDING_MODEL, prompt=text)
        embed_text.append(data["embedding"])

    # Fix: return an ndarray as annotated — the original returned a plain
    # list, which breaks callers that np.concatenate/index the batches.
    return np.array(embed_text)
116
+
117
+
118
if __name__ == "__main__":
    # Build the index first, then query it.
    insert()
    query()
nano-graphrag/examples/using_qdrant_as_vectorDB.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import uuid
4
+ import numpy as np
5
+ from nano_graphrag import GraphRAG, QueryParam
6
+ from nano_graphrag._utils import logger
7
+ from nano_graphrag.base import BaseVectorStorage
8
+ from dataclasses import dataclass
9
+
10
+ try:
11
+ from qdrant_client import QdrantClient
12
+ from qdrant_client.models import VectorParams, Distance, PointStruct, SearchParams
13
+ except ImportError as original_error:
14
+ raise ImportError(
15
+ "Qdrant client is not installed. Install it using: pip install qdrant-client\n"
16
+ ) from original_error
17
+
18
+
19
@dataclass
class QdrantStorage(BaseVectorStorage):
    """Vector storage backed by a local file-based Qdrant collection.

    One cosine-distance collection per namespace, stored under
    ``<working_dir>/qdrant_storage``.
    """

    def __post_init__(self):

        # Use a local file-based Qdrant storage
        # Useful for prototyping and CI.
        # For production, refer to:
        # https://qdrant.tech/documentation/guides/installation/
        self._client_file_path = os.path.join(
            self.global_config["working_dir"], "qdrant_storage"
        )

        self._client = QdrantClient(path=self._client_file_path)

        self._max_batch_size = self.global_config["embedding_batch_num"]

        if not self._client.collection_exists(collection_name=self.namespace):
            self._client.create_collection(
                collection_name=self.namespace,
                vectors_config=VectorParams(
                    size=self.embedding_func.embedding_dim, distance=Distance.COSINE
                ),
            )

    async def upsert(self, data: dict[str, dict]):
        """Embed and upsert ``{id: {"content": ..., ...}}`` entries.

        NOTE(review): point ids are fresh ``uuid4`` values rather than a
        hash of the key, so re-upserting the same key adds a duplicate
        point instead of replacing it — confirm intended.
        """
        logger.info(f"Inserting {len(data)} vectors to {self.namespace}")

        # Keep only the configured meta fields alongside each string id.
        list_data = [
            {
                "id": k,
                **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
            }
            for k, v in data.items()
        ]

        # Embed contents in bounded batches, concurrently.
        contents = [v["content"] for v in data.values()]
        batches = [
            contents[i : i + self._max_batch_size]
            for i in range(0, len(contents), self._max_batch_size)
        ]

        embeddings_list = await asyncio.gather(
            *[self.embedding_func(batch) for batch in batches]
        )
        embeddings = np.concatenate(embeddings_list)

        points = [
            PointStruct(
                id=uuid.uuid4().hex,
                vector=embeddings[i].tolist(),
                payload=data,
            )
            for i, data in enumerate(list_data)
        ]

        results = self._client.upsert(collection_name=self.namespace, points=points)
        return results

    async def query(self, query, top_k=5):
        """Return up to ``top_k`` payload dicts, each with a ``score`` field."""
        embedding = await self.embedding_func([query])

        results = self._client.query_points(
            collection_name=self.namespace,
            query=embedding[0].tolist(),
            limit=top_k,
        ).points

        return [
            {**result.payload, "score": result.score}
            for result in results
        ]
+ ]
90
+
91
+
92
def insert():
    """Index placeholder documents using the Qdrant vector storage."""
    data = ["YOUR TEXT DATA HERE", "YOUR TEXT DATA HERE"]
    rag = GraphRAG(
        working_dir="./nano_graphrag_cache_qdrant_TEST",
        enable_llm_cache=True,
        vector_db_storage_cls=QdrantStorage,
    )
    rag.insert(data)
100
+
101
+
102
def query():
    """Run a local-mode query against the Qdrant-backed index."""
    rag = GraphRAG(
        working_dir="./nano_graphrag_cache_qdrant_TEST",
        enable_llm_cache=True,
        vector_db_storage_cls=QdrantStorage,
    )
    print(rag.query("YOUR QUERY HERE", param=QueryParam(mode="local")))
109
+
110
+
111
if __name__ == "__main__":
    # Build the index first, then query it.
    insert()
    query()
nano-graphrag/nano_graphrag/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .graphrag import GraphRAG, QueryParam
2
+
3
+ __version__ = "0.0.8.2"
4
+ __author__ = "Jianbai Ye"
5
+ __url__ = "https://github.com/gusye1234/nano-graphrag"
6
+
7
+ # dp stands for data pack
nano-graphrag/nano_graphrag/_llm.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ from typing import Optional, List, Any, Callable
4
+
5
+ import aioboto3
6
+ from openai import AsyncOpenAI, AsyncAzureOpenAI, APIConnectionError, RateLimitError
7
+
8
+ from tenacity import (
9
+ retry,
10
+ stop_after_attempt,
11
+ wait_exponential,
12
+ retry_if_exception_type,
13
+ )
14
+ import os
15
+
16
+ from ._utils import compute_args_hash, wrap_embedding_func_with_attrs
17
+ from .base import BaseKVStorage
18
+
19
+ global_openai_async_client = None
20
+ global_azure_openai_async_client = None
21
+ global_amazon_bedrock_async_client = None
22
+
23
+
24
def get_openai_async_client_instance():
    """Return the module-wide AsyncOpenAI client, creating it on first use.

    Lazy singleton so importing this module never requires OPENAI_API_KEY;
    the client is only constructed when an OpenAI call is actually made.
    """
    global global_openai_async_client
    if global_openai_async_client is None:
        global_openai_async_client = AsyncOpenAI()
    return global_openai_async_client
29
+
30
+
31
def get_azure_openai_async_client_instance():
    """Return the module-wide AsyncAzureOpenAI client, creating it on first use.

    Lazy singleton: Azure credentials are only needed when this is called.
    """
    global global_azure_openai_async_client
    if global_azure_openai_async_client is None:
        global_azure_openai_async_client = AsyncAzureOpenAI()
    return global_azure_openai_async_client
36
+
37
+
38
def get_amazon_bedrock_async_client_instance():
    """Return the module-wide aioboto3 Session, creating it on first use.

    Note this returns a Session, not a service client; callers open a
    'bedrock-runtime' client from it per request.
    """
    global global_amazon_bedrock_async_client
    if global_amazon_bedrock_async_client is None:
        global_amazon_bedrock_async_client = aioboto3.Session()
    return global_amazon_bedrock_async_client
43
+
44
+
45
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
)
async def openai_complete_if_cache(
    model, prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Chat-complete via OpenAI with optional read-through caching.

    If kwargs contains a ``hashing_kv`` KV storage, a hash of
    (model, messages) keys a cache lookup before calling the API and a
    cache write (plus index_done_callback flush) after. Retries up to 5
    times with exponential backoff on rate-limit/connection errors.
    ``history_messages`` is only read, so the mutable default is benign.
    """
    openai_async_client = get_openai_async_client_instance()
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})
    if hashing_kv is not None:
        args_hash = compute_args_hash(model, messages)
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]

    response = await openai_async_client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )

    if hashing_kv is not None:
        await hashing_kv.upsert(
            {args_hash: {"return": response.choices[0].message.content, "model": model}}
        )
        # Flush the cache immediately so a crash doesn't lose the entry.
        await hashing_kv.index_done_callback()
    return response.choices[0].message.content
76
+
77
+
78
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
)
async def amazon_bedrock_complete_if_cache(
    model, prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Chat-complete via Amazon Bedrock Converse with optional caching.

    Same caching contract as openai_complete_if_cache: a ``hashing_kv``
    in kwargs enables a (model, messages)-hash cache lookup/write.
    Uses temperature 0 and ``max_tokens`` (default 4096) as maxTokens.
    NOTE(review): retries trigger on the OpenAI SDK's RateLimitError /
    APIConnectionError, which boto3 does not raise — Bedrock throttling
    is likely not retried here; confirm.
    """
    amazon_bedrock_async_client = get_amazon_bedrock_async_client_instance()
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
    messages = []
    messages.extend(history_messages)
    # Bedrock Converse uses content blocks, not plain strings.
    messages.append({"role": "user", "content": [{"text": prompt}]})
    if hashing_kv is not None:
        args_hash = compute_args_hash(model, messages)
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]

    inference_config = {
        "temperature": 0,
        "maxTokens": 4096 if "max_tokens" not in kwargs else kwargs["max_tokens"],
    }

    async with amazon_bedrock_async_client.client(
        "bedrock-runtime",
        region_name=os.getenv("AWS_REGION", "us-east-1")
    ) as bedrock_runtime:
        # Converse takes the system prompt as a separate top-level field.
        if system_prompt:
            response = await bedrock_runtime.converse(
                modelId=model, messages=messages, inferenceConfig=inference_config,
                system=[{"text": system_prompt}]
            )
        else:
            response = await bedrock_runtime.converse(
                modelId=model, messages=messages, inferenceConfig=inference_config,
            )

    if hashing_kv is not None:
        await hashing_kv.upsert(
            {args_hash: {"return": response["output"]["message"]["content"][0]["text"], "model": model}}
        )
        # Flush the cache immediately so a crash doesn't lose the entry.
        await hashing_kv.index_done_callback()
    return response["output"]["message"]["content"][0]["text"]
122
+
123
+
124
def create_amazon_bedrock_complete_function(model_id: str) -> Callable:
    """
    Factory that builds an async completion function bound to one Bedrock model.

    Args:
        model_id (str): Amazon Bedrock model identifier (e.g., "us.anthropic.claude-3-sonnet-20240229-v1:0")

    Returns:
        Callable: an async completion function that delegates to
        ``amazon_bedrock_complete_if_cache`` with *model_id* pre-bound.
    """
    async def bedrock_complete(
        prompt: str,
        system_prompt: Optional[str] = None,
        history_messages: List[Any] = [],
        **kwargs
    ) -> str:
        return await amazon_bedrock_complete_if_cache(
            model_id, prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            **kwargs,
        )

    # Name the closure after the model so logs and tracebacks identify it.
    bedrock_complete.__name__ = f"{model_id}_complete"
    return bedrock_complete
152
+
153
+
154
async def gpt_4o_complete(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Complete *prompt* with the "gpt-4o" model via the cache-aware helper."""
    return await openai_complete_if_cache(
        "gpt-4o", prompt, system_prompt=system_prompt,
        history_messages=history_messages, **kwargs,
    )
164
+
165
+
166
async def gpt_4o_mini_complete(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Complete *prompt* with the "gpt-4o-mini" model via the cache-aware helper."""
    return await openai_complete_if_cache(
        "gpt-4o-mini", prompt, system_prompt=system_prompt,
        history_messages=history_messages, **kwargs,
    )
176
+
177
+
178
@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192)
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
)
async def amazon_bedrock_embedding(texts: list[str]) -> np.ndarray:
    """Embed *texts* with Amazon Titan ("amazon.titan-embed-text-v2:0", 1024-d).

    Each text is embedded with a separate invoke_model request; returns an
    array of shape (len(texts), 1024).
    """
    amazon_bedrock_async_client = get_amazon_bedrock_async_client_instance()

    async with amazon_bedrock_async_client.client(
        "bedrock-runtime",
        region_name=os.getenv("AWS_REGION", "us-east-1")
    ) as bedrock_runtime:
        embeddings = []
        # NOTE(review): requests are sequential, one per text — batching or
        # asyncio.gather could speed this up; confirm API limits first.
        for text in texts:
            body = json.dumps(
                {
                    "inputText": text,
                    "dimensions": 1024,
                }
            )
            response = await bedrock_runtime.invoke_model(
                modelId="amazon.titan-embed-text-v2:0", body=body,
            )
            response_body = await response.get("body").read()
            embeddings.append(json.loads(response_body))
    return np.array([dp["embedding"] for dp in embeddings])
205
+
206
+
207
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192)
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
)
async def openai_embedding(texts: list[str]) -> np.ndarray:
    """Embed *texts* with OpenAI's "text-embedding-3-small" model (1536-d)."""
    client = get_openai_async_client_instance()
    resp = await client.embeddings.create(
        model="text-embedding-3-small", input=texts, encoding_format="float"
    )
    vectors = [item.embedding for item in resp.data]
    return np.array(vectors)
219
+
220
+
221
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
)
async def azure_openai_complete_if_cache(
    deployment_name, prompt, system_prompt=None, history_messages=None, **kwargs
) -> str:
    """Chat-complete *prompt* against an Azure OpenAI deployment, with caching.

    Args:
        deployment_name: Azure deployment name passed as the ``model`` field.
        prompt: The new user message content.
        system_prompt: Optional system message prepended to the conversation.
        history_messages: Prior chat messages inserted before the new prompt.
        **kwargs: Forwarded to ``chat.completions.create``; the special key
            ``hashing_kv`` (a ``BaseKVStorage``) enables response caching.

    Returns:
        The assistant message content as a string.
    """
    azure_openai_client = get_azure_openai_async_client_instance()
    # Pop the cache handle so it is not forwarded to the Azure client.
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # NOTE: default changed from a mutable `[]` to None (same behavior for
    # callers); None means "no history".
    if history_messages:
        messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})
    if hashing_kv is not None:
        # Cache key covers the deployment and the full message list.
        args_hash = compute_args_hash(deployment_name, messages)
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]

    response = await azure_openai_client.chat.completions.create(
        model=deployment_name, messages=messages, **kwargs
    )

    if hashing_kv is not None:
        await hashing_kv.upsert(
            {
                args_hash: {
                    "return": response.choices[0].message.content,
                    "model": deployment_name,
                }
            }
        )
        await hashing_kv.index_done_callback()
    return response.choices[0].message.content
257
+
258
+
259
async def azure_gpt_4o_complete(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Complete *prompt* with the "gpt-4o" Azure deployment via the cache-aware helper."""
    return await azure_openai_complete_if_cache(
        "gpt-4o", prompt, system_prompt=system_prompt,
        history_messages=history_messages, **kwargs,
    )
269
+
270
+
271
async def azure_gpt_4o_mini_complete(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    """Complete *prompt* with the "gpt-4o-mini" Azure deployment via the cache-aware helper."""
    return await azure_openai_complete_if_cache(
        "gpt-4o-mini", prompt, system_prompt=system_prompt,
        history_messages=history_messages, **kwargs,
    )
281
+
282
+
283
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192)
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
)
async def azure_openai_embedding(texts: list[str]) -> np.ndarray:
    """Embed *texts* through Azure OpenAI's "text-embedding-3-small" deployment (1536-d)."""
    client = get_azure_openai_async_client_instance()
    resp = await client.embeddings.create(
        model="text-embedding-3-small", input=texts, encoding_format="float"
    )
    return np.array([item.embedding for item in resp.data])
nano-graphrag/nano_graphrag/_op.py ADDED
@@ -0,0 +1,1140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import asyncio
4
+ from typing import Union
5
+ from collections import Counter, defaultdict
6
+ from ._splitter import SeparatorSplitter
7
+ from ._utils import (
8
+ logger,
9
+ clean_str,
10
+ compute_mdhash_id,
11
+ is_float_regex,
12
+ list_of_list_to_csv,
13
+ pack_user_ass_to_openai_messages,
14
+ split_string_by_multi_markers,
15
+ truncate_list_by_token_size,
16
+
17
+ TokenizerWrapper
18
+ )
19
+ from .base import (
20
+ BaseGraphStorage,
21
+ BaseKVStorage,
22
+ BaseVectorStorage,
23
+ SingleCommunitySchema,
24
+ CommunitySchema,
25
+ TextChunkSchema,
26
+ QueryParam,
27
+ )
28
+ from .prompt import GRAPH_FIELD_SEP, PROMPTS
29
+
30
+
31
def chunking_by_token_size(
    tokens_list: list[list[int]],
    doc_keys,
    tokenizer_wrapper: TokenizerWrapper,
    overlap_token_size=128,
    max_token_size=1024,
):
    """Split each token sequence into overlapping windows.

    Windows hold at most ``max_token_size`` tokens and advance by
    ``max_token_size - overlap_token_size``.  Returns one dict per chunk with
    its token count, decoded (stripped) text, per-document chunk index, and
    the originating doc key.
    """
    results = []
    step = max_token_size - overlap_token_size
    for doc_index, tokens in enumerate(tokens_list):
        starts = range(0, len(tokens), step)
        windows = [tokens[s : s + max_token_size] for s in starts]
        sizes = [min(max_token_size, len(tokens) - s) for s in starts]

        for order, text in enumerate(tokenizer_wrapper.decode_batch(windows)):
            results.append(
                {
                    "tokens": sizes[order],
                    "content": text.strip(),
                    "chunk_order_index": order,
                    "full_doc_id": doc_keys[doc_index],
                }
            )
    return results
59
+
60
+
61
def chunking_by_seperators(
    tokens_list: list[list[int]],
    doc_keys,
    tokenizer_wrapper: TokenizerWrapper,
    overlap_token_size=128,
    max_token_size=1024,
):
    """Chunk token sequences with SeparatorSplitter on the default separators.

    Separators come from PROMPTS["default_text_separator"]; chunk size and
    overlap are bounded by *max_token_size* / *overlap_token_size*.  Output
    dicts match ``chunking_by_token_size``.
    """
    from .prompt import PROMPTS
    # Encode separators via the wrapper directly rather than reaching into the
    # underlying tokenizer.
    separators = [tokenizer_wrapper.encode(s) for s in PROMPTS["default_text_separator"]]
    splitter = SeparatorSplitter(
        separators=separators,
        chunk_size=max_token_size,
        chunk_overlap=overlap_token_size,
    )
    results = []
    for index, tokens in enumerate(tokens_list):
        chunk_tokens = splitter.split_tokens(tokens)
        lengths = [len(c) for c in chunk_tokens]

        decoded_chunks = tokenizer_wrapper.decode_batch(chunk_tokens)
        for i, chunk in enumerate(decoded_chunks):
            results.append(
                {
                    "tokens": lengths[i],
                    "content": chunk.strip(),
                    "chunk_order_index": i,
                    "full_doc_id": doc_keys[index],
                }
            )
    return results
92
+
93
+
94
def get_chunks(new_docs, chunk_func=chunking_by_token_size, tokenizer_wrapper: TokenizerWrapper = None, **chunk_func_params):
    """Tokenize every document in *new_docs* and chunk it with *chunk_func*.

    Only ``overlap_token_size`` and ``max_token_size`` are read from
    *chunk_func_params* (defaulting to 128 / 1024).  Returns a mapping from
    content-hash chunk ids ("chunk-...") to chunk dicts.
    """
    doc_keys = list(new_docs.keys())
    contents = [new_docs[key]["content"] for key in doc_keys]

    token_lists = [tokenizer_wrapper.encode(text) for text in contents]
    chunks = chunk_func(
        token_lists,
        doc_keys=doc_keys,
        tokenizer_wrapper=tokenizer_wrapper,
        overlap_token_size=chunk_func_params.get("overlap_token_size", 128),
        max_token_size=chunk_func_params.get("max_token_size", 1024),
    )
    return {
        compute_mdhash_id(chunk["content"], prefix="chunk-"): chunk
        for chunk in chunks
    }
109
+
110
+
111
async def _handle_entity_relation_summary(
    entity_or_relation_name: str,
    description: str,
    global_config: dict,
    tokenizer_wrapper: TokenizerWrapper,
) -> str:
    """Summarize *description* with the cheap LLM when it is too long.

    Descriptions shorter than ``entity_summary_to_max_tokens`` tokens are
    returned unchanged; longer ones are truncated to the cheap model's input
    limit and summarized.
    """
    use_llm_func: callable = global_config["cheap_model_func"]
    llm_max_tokens = global_config["cheap_model_max_token_size"]
    summary_max_tokens = global_config["entity_summary_to_max_tokens"]

    tokens = tokenizer_wrapper.encode(description)
    if len(tokens) < summary_max_tokens:
        # already short enough — skip the LLM call
        return description
    prompt_template = PROMPTS["summarize_entity_descriptions"]
    # truncate to fit the cheap model's context window
    use_description = tokenizer_wrapper.decode(tokens[:llm_max_tokens])
    context_base = dict(
        entity_name=entity_or_relation_name,
        description_list=use_description.split(GRAPH_FIELD_SEP),
    )
    use_prompt = prompt_template.format(**context_base)
    logger.debug(f"Trigger summary: {entity_or_relation_name}")
    summary = await use_llm_func(use_prompt, max_tokens=summary_max_tokens)
    return summary
136
+
137
+
138
async def _handle_single_entity_extraction(
    record_attributes: list[str],
    chunk_key: str,
):
    """Turn one parsed record into an entity dict, or None when it is not a
    well-formed ``"entity"`` record (name, type, description required)."""
    is_entity_record = (
        len(record_attributes) >= 4 and record_attributes[0] == '"entity"'
    )
    if not is_entity_record:
        return None
    # add this record as a node in the G
    entity_name = clean_str(record_attributes[1].upper())
    if not entity_name.strip():
        # a name that is empty after cleaning cannot become a graph node
        return None
    return dict(
        entity_name=entity_name,
        entity_type=clean_str(record_attributes[2].upper()),
        description=clean_str(record_attributes[3]),
        source_id=chunk_key,
    )
157
+
158
+
159
async def _handle_single_relationship_extraction(
    record_attributes: list[str],
    chunk_key: str,
):
    """Turn one parsed record into a relationship dict, or None when it is not
    a well-formed ``"relationship"`` record (src, tgt, description, weight)."""
    if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
        return None
    # add this record as edge
    source = clean_str(record_attributes[1].upper())
    target = clean_str(record_attributes[2].upper())
    edge_description = clean_str(record_attributes[3])
    raw_weight = record_attributes[-1]
    # fall back to weight 1.0 when the model emitted a non-numeric weight
    weight = float(raw_weight) if is_float_regex(raw_weight) else 1.0
    return dict(
        src_id=source,
        tgt_id=target,
        weight=weight,
        description=edge_description,
        source_id=chunk_key,
    )
180
+
181
+
182
async def _merge_nodes_then_upsert(
    entity_name: str,
    nodes_data: list[dict],
    knwoledge_graph_inst: BaseGraphStorage,
    global_config: dict,
    tokenizer_wrapper,
):
    """Merge freshly-extracted node records for *entity_name* with any existing
    graph node, then upsert the combined node.

    Merge rules: the most frequent entity type wins; descriptions are
    de-duplicated, sorted, joined with GRAPH_FIELD_SEP and possibly summarized
    by the cheap LLM; source chunk ids are unioned.  Returns the upserted node
    data with "entity_name" added.
    """
    already_entitiy_types = []
    already_source_ids = []
    already_description = []

    already_node = await knwoledge_graph_inst.get_node(entity_name)
    if already_node is not None:
        # fold the existing node's fields into the merge inputs
        already_entitiy_types.append(already_node["entity_type"])
        already_source_ids.extend(
            split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP])
        )
        already_description.append(already_node["description"])

    # majority vote over entity types (most common wins)
    entity_type = sorted(
        Counter(
            [dp["entity_type"] for dp in nodes_data] + already_entitiy_types
        ).items(),
        key=lambda x: x[1],
        reverse=True,
    )[0][0]
    description = GRAPH_FIELD_SEP.join(
        sorted(set([dp["description"] for dp in nodes_data] + already_description))
    )
    source_id = GRAPH_FIELD_SEP.join(
        set([dp["source_id"] for dp in nodes_data] + already_source_ids)
    )
    # compress an over-long combined description with the cheap model
    description = await _handle_entity_relation_summary(
        entity_name, description, global_config, tokenizer_wrapper
    )
    node_data = dict(
        entity_type=entity_type,
        description=description,
        source_id=source_id,
    )
    await knwoledge_graph_inst.upsert_node(
        entity_name,
        node_data=node_data,
    )
    node_data["entity_name"] = entity_name
    return node_data
228
+
229
+
230
async def _merge_edges_then_upsert(
    src_id: str,
    tgt_id: str,
    edges_data: list[dict],
    knwoledge_graph_inst: BaseGraphStorage,
    global_config: dict,
    tokenizer_wrapper,
):
    """Merge freshly-extracted edge records for (src_id, tgt_id) with any
    existing edge, then upsert the combined edge.

    Merge rules: weights are summed; descriptions are de-duplicated, sorted,
    joined (and possibly summarized); source ids are unioned; `order` takes
    the minimum.  Endpoint nodes missing from the graph are created with an
    '"UNKNOWN"' entity type.
    """
    already_weights = []
    already_source_ids = []
    already_description = []
    already_order = []
    if await knwoledge_graph_inst.has_edge(src_id, tgt_id):
        already_edge = await knwoledge_graph_inst.get_edge(src_id, tgt_id)
        already_weights.append(already_edge["weight"])
        already_source_ids.extend(
            split_string_by_multi_markers(already_edge["source_id"], [GRAPH_FIELD_SEP])
        )
        already_description.append(already_edge["description"])
        already_order.append(already_edge.get("order", 1))

    # [numberchiffre]: `Relationship.order` is only returned from DSPy's predictions
    order = min([dp.get("order", 1) for dp in edges_data] + already_order)
    weight = sum([dp["weight"] for dp in edges_data] + already_weights)
    description = GRAPH_FIELD_SEP.join(
        sorted(set([dp["description"] for dp in edges_data] + already_description))
    )
    source_id = GRAPH_FIELD_SEP.join(
        set([dp["source_id"] for dp in edges_data] + already_source_ids)
    )
    # make sure both endpoints exist before inserting the edge
    for need_insert_id in [src_id, tgt_id]:
        if not (await knwoledge_graph_inst.has_node(need_insert_id)):
            await knwoledge_graph_inst.upsert_node(
                need_insert_id,
                node_data={
                    "source_id": source_id,
                    "description": description,
                    "entity_type": '"UNKNOWN"',
                },
            )
    description = await _handle_entity_relation_summary(
        (src_id, tgt_id), description, global_config, tokenizer_wrapper
    )
    await knwoledge_graph_inst.upsert_edge(
        src_id,
        tgt_id,
        edge_data=dict(
            weight=weight, description=description, source_id=source_id, order=order
        ),
    )
280
+
281
+
282
async def extract_entities(
    chunks: dict[str, TextChunkSchema],
    knwoledge_graph_inst: BaseGraphStorage,
    entity_vdb: BaseVectorStorage,
    tokenizer_wrapper,
    global_config: dict,
    using_amazon_bedrock: bool=False,
) -> Union[BaseGraphStorage, None]:
    """Extract entities/relationships from all chunks with the best LLM and
    merge them into the knowledge graph (and the entity vector DB, if given).

    Every chunk is processed concurrently: the extraction prompt is sent, then
    up to ``entity_extract_max_gleaning`` follow-up "glean" rounds collect
    records the model missed.  Parsed records are merged across chunks and
    upserted.  Returns the graph storage, or None when nothing was extracted.
    """
    use_llm_func: callable = global_config["best_model_func"]
    entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"]

    ordered_chunks = list(chunks.items())

    entity_extract_prompt = PROMPTS["entity_extraction"]
    context_base = dict(
        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
        completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
        entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
    )
    continue_prompt = PROMPTS["entiti_continue_extraction"]
    if_loop_prompt = PROMPTS["entiti_if_loop_extraction"]

    # progress counters shared (via nonlocal) by the per-chunk tasks
    already_processed = 0
    already_entities = 0
    already_relations = 0

    async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
        # Extract nodes/edges from one chunk; returns (maybe_nodes, maybe_edges).
        nonlocal already_processed, already_entities, already_relations
        chunk_key = chunk_key_dp[0]
        chunk_dp = chunk_key_dp[1]
        content = chunk_dp["content"]
        hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
        final_result = await use_llm_func(hint_prompt)
        # Bedrock responses arrive as a list of content blocks; unwrap the text.
        if isinstance(final_result, list):
            final_result = final_result[0]["text"]

        history = pack_user_ass_to_openai_messages(hint_prompt, final_result, using_amazon_bedrock)
        for now_glean_index in range(entity_extract_max_gleaning):
            glean_result = await use_llm_func(continue_prompt, history_messages=history)

            history += pack_user_ass_to_openai_messages(continue_prompt, glean_result, using_amazon_bedrock)
            final_result += glean_result
            if now_glean_index == entity_extract_max_gleaning - 1:
                break

            # ask the model whether more records remain; continue only on "yes"
            if_loop_result: str = await use_llm_func(
                if_loop_prompt, history_messages=history
            )
            if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
            if if_loop_result != "yes":
                break

        records = split_string_by_multi_markers(
            final_result,
            [context_base["record_delimiter"], context_base["completion_delimiter"]],
        )

        maybe_nodes = defaultdict(list)
        maybe_edges = defaultdict(list)
        for record in records:
            # each record is parenthesized; keep only the inside
            record = re.search(r"\((.*)\)", record)
            if record is None:
                continue
            record = record.group(1)
            record_attributes = split_string_by_multi_markers(
                record, [context_base["tuple_delimiter"]]
            )
            if_entities = await _handle_single_entity_extraction(
                record_attributes, chunk_key
            )
            if if_entities is not None:
                maybe_nodes[if_entities["entity_name"]].append(if_entities)
                continue

            if_relation = await _handle_single_relationship_extraction(
                record_attributes, chunk_key
            )
            if if_relation is not None:
                maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
                    if_relation
                )
        already_processed += 1
        already_entities += len(maybe_nodes)
        already_relations += len(maybe_edges)
        now_ticks = PROMPTS["process_tickers"][
            already_processed % len(PROMPTS["process_tickers"])
        ]
        print(
            f"{now_ticks} Processed {already_processed}({already_processed*100//len(ordered_chunks)}%) chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
            end="",
            flush=True,
        )
        return dict(maybe_nodes), dict(maybe_edges)

    # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings
    results = await asyncio.gather(
        *[_process_single_content(c) for c in ordered_chunks]
    )
    print()  # clear the progress bar
    maybe_nodes = defaultdict(list)
    maybe_edges = defaultdict(list)
    for m_nodes, m_edges in results:
        for k, v in m_nodes.items():
            maybe_nodes[k].extend(v)
        for k, v in m_edges.items():
            # it's undirected graph
            maybe_edges[tuple(sorted(k))].extend(v)
    all_entities_data = await asyncio.gather(
        *[
            _merge_nodes_then_upsert(k, v, knwoledge_graph_inst, global_config, tokenizer_wrapper)
            for k, v in maybe_nodes.items()
        ]
    )
    await asyncio.gather(
        *[
            _merge_edges_then_upsert(k[0], k[1], v, knwoledge_graph_inst, global_config, tokenizer_wrapper)
            for k, v in maybe_edges.items()
        ]
    )
    if not len(all_entities_data):
        logger.warning("Didn't extract any entities, maybe your LLM is not working")
        return None
    if entity_vdb is not None:
        # index entities by a hash id; content = name + description for recall
        data_for_vdb = {
            compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
                "content": dp["entity_name"] + dp["description"],
                "entity_name": dp["entity_name"],
            }
            for dp in all_entities_data
        }
        await entity_vdb.upsert(data_for_vdb)
    return knwoledge_graph_inst
415
+
416
+
417
def _pack_single_community_by_sub_communities(
    community: SingleCommunitySchema,
    max_token_size: int,
    already_reports: dict[str, CommunitySchema],
    tokenizer_wrapper: TokenizerWrapper,
) -> tuple[str, int, set, set]:
    """Describe *community* via its sub-communities' already-generated reports.

    Sub-communities that have reports are sorted by occurrence, truncated to
    *max_token_size* tokens, and rendered as a CSV table.

    Returns:
        (csv_description, token_count_of_description,
         nodes_covered_by_the_kept_sub_reports,
         edges_covered_by_the_kept_sub_reports)
    """
    all_sub_communities = [
        already_reports[k] for k in community["sub_communities"] if k in already_reports
    ]
    all_sub_communities = sorted(
        all_sub_communities, key=lambda x: x["occurrence"], reverse=True
    )

    may_trun_all_sub_communities = truncate_list_by_token_size(
        all_sub_communities,
        key=lambda x: x["report_string"],
        max_token_size=max_token_size,
        tokenizer_wrapper=tokenizer_wrapper,
    )
    sub_fields = ["id", "report", "rating", "importance"]
    sub_communities_describe = list_of_list_to_csv(
        [sub_fields]
        + [
            [
                i,
                c["report_string"],
                c["report_json"].get("rating", -1),
                c["occurrence"],
            ]
            for i, c in enumerate(may_trun_all_sub_communities)
        ]
    )
    # collect the nodes/edges the kept sub-reports already describe so the
    # caller can avoid listing them again
    already_nodes = []
    already_edges = []
    for c in may_trun_all_sub_communities:
        already_nodes.extend(c["nodes"])
        already_edges.extend([tuple(e) for e in c["edges"]])

    return (
        sub_communities_describe,
        len(tokenizer_wrapper.encode(sub_communities_describe)),
        set(already_nodes),
        set(already_edges),
    )
462
+
463
+
464
async def _pack_single_community_describe(
    knwoledge_graph_inst: BaseGraphStorage,
    community: SingleCommunitySchema,
    tokenizer_wrapper: "TokenizerWrapper",
    max_token_size: int = 12000,
    already_reports: dict[str, CommunitySchema] = {},
    global_config: dict = {},
) -> str:
    """Render one community as a token-budgeted text block with three CSV
    sections: sub-community reports, entities, and relationships.

    Large communities fall back to their sub-communities' existing reports;
    nodes/edges those reports already cover are filtered out and the remaining
    rows are truncated to fit *max_token_size* tokens.
    """
    # 1. Gather the community's raw node/edge data
    nodes_in_order = sorted(community["nodes"])
    edges_in_order = sorted(community["edges"], key=lambda x: x[0] + x[1])

    nodes_data = await asyncio.gather(
        *[knwoledge_graph_inst.get_node(n) for n in nodes_in_order]
    )
    edges_data = await asyncio.gather(
        *[knwoledge_graph_inst.get_edge(src, tgt) for src, tgt in edges_in_order]
    )

    # 2. Define the output template and measure its fixed token overhead
    final_template = """-----Reports-----
```csv
{reports}
```
-----Entities-----
```csv
{entities}
```
-----Relationships-----
```csv
{relationships}
```"""
    base_template_tokens = len(tokenizer_wrapper.encode(
        final_template.format(reports="", entities="", relationships="")
    ))
    remaining_budget = max_token_size - base_template_tokens

    # 3. Fold in sub-community reports when needed
    report_describe = ""
    contain_nodes = set()
    contain_edges = set()

    # heuristic: communities this large are unlikely to fit as raw rows
    truncated = len(nodes_in_order) > 100 or len(edges_in_order) > 100

    need_to_use_sub_communities = (
        truncated and
        community["sub_communities"] and
        already_reports
    )
    force_to_use_sub_communities = global_config["addon_params"].get(
        "force_to_use_sub_communities", False
    )

    if need_to_use_sub_communities or force_to_use_sub_communities:
        logger.debug(f"Community {community['title']} using sub-communities")
        # sub-community reports plus the nodes/edges they already cover
        result = _pack_single_community_by_sub_communities(
            community, remaining_budget, already_reports, tokenizer_wrapper
        )
        report_describe, report_size, contain_nodes, contain_edges = result
        remaining_budget = max(0, remaining_budget - report_size)

    # 4. Build node/edge rows, skipping anything a sub-report already covers
    def format_row(row: list) -> str:
        # CSV-escape a row; used as the token-measuring key during truncation
        return ','.join('"{}"'.format(str(item).replace('"', '""')) for item in row)

    node_fields = ["id", "entity", "type", "description", "degree"]
    edge_fields = ["id", "source", "target", "description", "rank"]

    # batch-fetch degrees, used below as importance ranks
    node_degrees = await knwoledge_graph_inst.node_degrees_batch(nodes_in_order)
    edge_degrees = await knwoledge_graph_inst.edge_degrees_batch(edges_in_order)

    nodes_list_data = [
        [i, name, data.get("entity_type", "UNKNOWN"),
         data.get("description", "UNKNOWN"), node_degrees[i]]
        for i, (name, data) in enumerate(zip(nodes_in_order, nodes_data))
        if name not in contain_nodes  # skip nodes already in sub-reports
    ]

    edges_list_data = [
        [i, edge[0], edge[1], data.get("description", "UNKNOWN"), edge_degrees[i]]
        for i, (edge, data) in enumerate(zip(edges_in_order, edges_data))
        if (edge[0], edge[1]) not in contain_edges  # skip edges already in sub-reports
    ]

    # highest degree/rank first, so truncation keeps the most important rows
    nodes_list_data.sort(key=lambda x: x[-1], reverse=True)
    edges_list_data.sort(key=lambda x: x[-1], reverse=True)

    # 5. Split the remaining token budget between nodes and edges
    # header rows cost tokens too
    header_tokens = len(tokenizer_wrapper.encode(
        list_of_list_to_csv([node_fields]) + "\n" + list_of_list_to_csv([edge_fields])
    ))

    data_budget = max(0, remaining_budget - header_tokens)
    total_items = len(nodes_list_data) + len(edges_list_data)
    # budget is apportioned by each side's share of the row count
    node_ratio = len(nodes_list_data) / max(1, total_items)
    edge_ratio = 1 - node_ratio

    # truncate each list to its proportional share of the budget
    nodes_final = truncate_list_by_token_size(
        nodes_list_data, key=format_row,
        max_token_size=int(data_budget * node_ratio),
        tokenizer_wrapper=tokenizer_wrapper
    )
    edges_final = truncate_list_by_token_size(
        edges_list_data, key=format_row,
        max_token_size= int(data_budget * edge_ratio),
        tokenizer_wrapper=tokenizer_wrapper
    )

    # 6. Assemble the final description
    nodes_describe = list_of_list_to_csv([node_fields] + nodes_final)
    edges_describe = list_of_list_to_csv([edge_fields] + edges_final)

    final_output = final_template.format(
        reports=report_describe,
        entities=nodes_describe,
        relationships=edges_describe
    )

    return final_output
601
+
602
+
603
+ def _community_report_json_to_str(parsed_output: dict) -> str:
604
+ """refer official graphrag: index/graph/extractors/community_reports"""
605
+ title = parsed_output.get("title", "Report")
606
+ summary = parsed_output.get("summary", "")
607
+ findings = parsed_output.get("findings", [])
608
+
609
+ def finding_summary(finding: dict):
610
+ if isinstance(finding, str):
611
+ return finding
612
+ return finding.get("summary")
613
+
614
+ def finding_explanation(finding: dict):
615
+ if isinstance(finding, str):
616
+ return ""
617
+ return finding.get("explanation")
618
+
619
+ report_sections = "\n\n".join(
620
+ f"## {finding_summary(f)}\n\n{finding_explanation(f)}" for f in findings
621
+ )
622
+ return f"# {title}\n\n{summary}\n\n{report_sections}"
623
+
624
+
625
async def generate_community_report(
    community_report_kv: BaseKVStorage[CommunitySchema],
    knwoledge_graph_inst: BaseGraphStorage,
    tokenizer_wrapper: TokenizerWrapper,
    global_config: dict,
):
    """Generate an LLM report for every graph community and store them.

    Communities are processed level by level, deepest first, so a parent
    community can reuse its sub-communities' already-generated reports when
    its own description exceeds the token budget.  Results (report string,
    parsed JSON, and community metadata) are upserted into
    *community_report_kv*.
    """
    llm_extra_kwargs = global_config["special_community_report_llm_kwargs"]
    use_llm_func: callable = global_config["best_model_func"]
    use_string_json_convert_func: callable = global_config["convert_response_to_json_func"]

    communities_schema = await knwoledge_graph_inst.community_schema()
    community_keys, community_values = list(communities_schema.keys()), list(communities_schema.values())
    already_processed = 0

    prompt_template = PROMPTS["community_report"]
    # token cost of the prompt with an empty payload, used to size the payload
    prompt_overhead = len(tokenizer_wrapper.encode(prompt_template.format(input_text="")))

    async def _form_single_community_report(
        community: SingleCommunitySchema, already_reports: dict[str, CommunitySchema]
    ):
        # Describe one community, ask the LLM for a report, parse it to JSON.
        nonlocal already_processed
        describe = await _pack_single_community_describe(
            knwoledge_graph_inst,
            community,
            tokenizer_wrapper=tokenizer_wrapper,
            max_token_size=global_config["best_model_max_token_size"] - prompt_overhead -200, # extra token for chat template and prompt template
            already_reports=already_reports,
            global_config=global_config,
        )
        prompt = prompt_template.format(input_text=describe)

        response = await use_llm_func(prompt, **llm_extra_kwargs)
        data = use_string_json_convert_func(response)
        already_processed += 1
        now_ticks = PROMPTS["process_tickers"][already_processed % len(PROMPTS["process_tickers"])]
        print(f"{now_ticks} Processed {already_processed} communities\r", end="", flush=True)
        return data

    # deepest levels first, so parents can reference their children's reports
    levels = sorted(set([c["level"] for c in community_values]), reverse=True)
    logger.info(f"Generating by levels: {levels}")
    community_datas = {}
    for level in levels:
        this_level_community_keys, this_level_community_values = zip(
            *[
                (k, v)
                for k, v in zip(community_keys, community_values)
                if v["level"] == level
            ]
        )
        this_level_communities_reports = await asyncio.gather(
            *[
                _form_single_community_report(c, community_datas)
                for c in this_level_community_values
            ]
        )
        community_datas.update(
            {
                k: {
                    "report_string": _community_report_json_to_str(r),
                    "report_json": r,
                    **v,
                }
                for k, r, v in zip(
                    this_level_community_keys,
                    this_level_communities_reports,
                    this_level_community_values,
                )
            }
        )
    print()  # clear the progress bar
    await community_report_kv.upsert(community_datas)
+
699
+
700
async def _find_most_related_community_from_entities(
    node_datas: list[dict],
    query_param: QueryParam,
    community_reports: BaseKVStorage[CommunitySchema],
    tokenizer_wrapper,
):
    """Pick the community reports most relevant to the given entity nodes.

    Each entity node carries a JSON-encoded "clusters" field listing the
    communities it belongs to per level. Communities (up to
    ``query_param.level``) are ranked by how many of the query entities they
    contain (primary) and by the report's rating (secondary), then truncated
    to the configured token budget.

    Returns:
        A list of community report dicts (only the top one when
        ``query_param.local_community_single_one`` is set).
    """
    related_communities = []
    for node_d in node_datas:
        if "clusters" not in node_d:
            continue
        related_communities.extend(json.loads(node_d["clusters"]))
    related_community_dup_keys = [
        str(dp["cluster"])
        for dp in related_communities
        if dp["level"] <= query_param.level
    ]
    # How many query entities reference each community.
    related_community_keys_counts = dict(Counter(related_community_dup_keys))
    _related_community_datas = await asyncio.gather(
        *[community_reports.get_by_id(k) for k in related_community_keys_counts.keys()]
    )
    related_community_datas = {
        k: v
        for k, v in zip(related_community_keys_counts.keys(), _related_community_datas)
        if v is not None
    }
    # Bug fix: sort only communities whose report was actually found.
    # Iterating all counted keys raised KeyError on
    # ``related_community_datas[k]`` whenever a report was missing from
    # storage (reports with ``get_by_id(k) is None`` were filtered out above).
    related_community_keys = sorted(
        related_community_datas.keys(),
        key=lambda k: (
            related_community_keys_counts[k],
            related_community_datas[k]["report_json"].get("rating", -1),
        ),
        reverse=True,
    )
    sorted_community_datas = [
        related_community_datas[k] for k in related_community_keys
    ]

    use_community_reports = truncate_list_by_token_size(
        sorted_community_datas,
        key=lambda x: x["report_string"],
        max_token_size=query_param.local_max_token_for_community_report,
        tokenizer_wrapper=tokenizer_wrapper,
    )
    if query_param.local_community_single_one:
        use_community_reports = use_community_reports[:1]
    return use_community_reports
746
+
747
+
748
async def _find_most_related_text_unit_from_entities(
    node_datas: list[dict],
    query_param: QueryParam,
    text_chunks_db: BaseKVStorage[TextChunkSchema],
    knowledge_graph_inst: BaseGraphStorage,
    tokenizer_wrapper,
):
    """Collect the text chunks most relevant to the given entity nodes.

    Chunks are ordered by the rank of the entity that referenced them and,
    within that, by how many one-hop neighbours also reference the chunk
    ("relation_counts"), then truncated to the local text-unit token budget.

    Returns:
        A list of TextChunkSchema dicts.
    """
    text_units = [
        split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
        for dp in node_datas
    ]
    edges = await knowledge_graph_inst.get_nodes_edges_batch([dp["entity_name"] for dp in node_datas])
    all_one_hop_nodes = set()
    for this_edges in edges:
        if not this_edges:
            continue
        all_one_hop_nodes.update([e[1] for e in this_edges])
    all_one_hop_nodes = list(all_one_hop_nodes)
    all_one_hop_nodes_data = await knowledge_graph_inst.get_nodes_batch(all_one_hop_nodes)
    # Map one-hop neighbour -> set of chunk ids it was extracted from.
    all_one_hop_text_units_lookup = {
        k: set(split_string_by_multi_markers(v["source_id"], [GRAPH_FIELD_SEP]))
        for k, v in zip(all_one_hop_nodes, all_one_hop_nodes_data)
        if v is not None
    }
    all_text_units_lookup = {}
    for index, (this_text_units, this_edges) in enumerate(zip(text_units, edges)):
        for c_id in this_text_units:
            if c_id in all_text_units_lookup:
                continue
            relation_counts = 0
            for e in this_edges:
                if (
                    e[1] in all_one_hop_text_units_lookup
                    and c_id in all_one_hop_text_units_lookup[e[1]]
                ):
                    relation_counts += 1
            all_text_units_lookup[c_id] = {
                "data": await text_chunks_db.get_by_id(c_id),
                "order": index,
                "relation_counts": relation_counts,
            }
    # Bug fix: lookup values are always dicts, so the previous ``v is None``
    # checks never fired; a chunk missing from storage shows up as
    # ``v["data"] is None`` and previously crashed later at
    # ``x["data"]["content"]`` inside truncate_list_by_token_size.
    if any([v["data"] is None for v in all_text_units_lookup.values()]):
        logger.warning("Text chunks are missing, maybe the storage is damaged")
    all_text_units = [
        {"id": k, **v} for k, v in all_text_units_lookup.items() if v["data"] is not None
    ]
    all_text_units = sorted(
        all_text_units, key=lambda x: (x["order"], -x["relation_counts"])
    )
    all_text_units = truncate_list_by_token_size(
        all_text_units,
        key=lambda x: x["data"]["content"],
        max_token_size=query_param.local_max_token_for_text_unit,
        tokenizer_wrapper=tokenizer_wrapper,
    )
    all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units]
    return all_text_units
805
+
806
+
807
async def _find_most_related_edges_from_entities(
    node_datas: list[dict],
    query_param: QueryParam,
    knowledge_graph_inst: BaseGraphStorage,
    tokenizer_wrapper,
):
    """Gather, rank and truncate the relations attached to the given entities.

    Edges are deduplicated order-insensitively (as sorted endpoint tuples,
    first-seen order preserved), annotated with their degree-based rank,
    sorted by (rank, weight) descending, and truncated to the local-context
    token budget.
    """
    all_related_edges = await knowledge_graph_inst.get_nodes_edges_batch(
        [dp["entity_name"] for dp in node_datas]
    )

    # Deduplicate edges regardless of direction, keeping first-seen order.
    seen = set()
    all_edges = []
    for edge_group in all_related_edges:
        for edge in edge_group:
            key = tuple(sorted(edge))
            if key in seen:
                continue
            seen.add(key)
            all_edges.append(key)

    all_edges_pack = await knowledge_graph_inst.get_edges_batch(all_edges)
    all_edges_degree = await knowledge_graph_inst.edge_degrees_batch(all_edges)
    all_edges_data = [
        {"src_tgt": pair, "rank": degree, **payload}
        for pair, payload, degree in zip(all_edges, all_edges_pack, all_edges_degree)
        if payload is not None
    ]
    all_edges_data.sort(key=lambda x: (x["rank"], x["weight"]), reverse=True)
    return truncate_list_by_token_size(
        all_edges_data,
        key=lambda x: x["description"],
        max_token_size=query_param.local_max_token_for_local_context,
        tokenizer_wrapper=tokenizer_wrapper,
    )
842
+
843
+
844
async def _build_local_query_context(
    query,
    knowledge_graph_inst: BaseGraphStorage,
    entities_vdb: BaseVectorStorage,
    community_reports: BaseKVStorage[CommunitySchema],
    text_chunks_db: BaseKVStorage[TextChunkSchema],
    query_param: QueryParam,
    tokenizer_wrapper,
):
    """Assemble the CSV-formatted context block for a local query.

    Looks up the top-k entities nearest to ``query`` in the vector store,
    expands them into related communities, text chunks and relations, and
    renders everything as four fenced CSV sections (Reports / Entities /
    Relationships / Sources). Returns None when the vector search yields
    no entities.
    """
    results = await entities_vdb.query(query, top_k=query_param.top_k)
    if not len(results):
        return None
    node_datas = await knowledge_graph_inst.get_nodes_batch([r["entity_name"] for r in results])
    if not all([n is not None for n in node_datas]):
        logger.warning("Some nodes are missing, maybe the storage is damaged")
    node_degrees = await knowledge_graph_inst.node_degrees_batch([r["entity_name"] for r in results])
    # Attach the vdb entity name and the degree-based rank; drop missing nodes.
    node_datas = [
        {**n, "entity_name": k["entity_name"], "rank": d}
        for k, n, d in zip(results, node_datas, node_degrees)
        if n is not None
    ]
    use_communities = await _find_most_related_community_from_entities(
        node_datas, query_param, community_reports, tokenizer_wrapper
    )
    use_text_units = await _find_most_related_text_unit_from_entities(
        node_datas, query_param, text_chunks_db, knowledge_graph_inst, tokenizer_wrapper
    )
    use_relations = await _find_most_related_edges_from_entities(
        node_datas, query_param, knowledge_graph_inst, tokenizer_wrapper
    )
    logger.info(
        f"Using {len(node_datas)} entites, {len(use_communities)} communities, {len(use_relations)} relations, {len(use_text_units)} text units"
    )
    # Render each section as a list-of-lists with a header row, then to CSV.
    entites_section_list = [["id", "entity", "type", "description", "rank"]]
    for i, n in enumerate(node_datas):
        entites_section_list.append(
            [
                i,
                n["entity_name"],
                n.get("entity_type", "UNKNOWN"),
                n.get("description", "UNKNOWN"),
                n["rank"],
            ]
        )
    entities_context = list_of_list_to_csv(entites_section_list)

    relations_section_list = [
        ["id", "source", "target", "description", "weight", "rank"]
    ]
    for i, e in enumerate(use_relations):
        relations_section_list.append(
            [
                i,
                e["src_tgt"][0],
                e["src_tgt"][1],
                e["description"],
                e["weight"],
                e["rank"],
            ]
        )
    relations_context = list_of_list_to_csv(relations_section_list)

    communities_section_list = [["id", "content"]]
    for i, c in enumerate(use_communities):
        communities_section_list.append([i, c["report_string"]])
    communities_context = list_of_list_to_csv(communities_section_list)

    text_units_section_list = [["id", "content"]]
    for i, t in enumerate(use_text_units):
        text_units_section_list.append([i, t["content"]])
    text_units_context = list_of_list_to_csv(text_units_section_list)
    return f"""
-----Reports-----
```csv
{communities_context}
```
-----Entities-----
```csv
{entities_context}
```
-----Relationships-----
```csv
{relations_context}
```
-----Sources-----
```csv
{text_units_context}
```
"""
933
+
934
+
935
async def local_query(
    query,
    knowledge_graph_inst: BaseGraphStorage,
    entities_vdb: BaseVectorStorage,
    community_reports: BaseKVStorage[CommunitySchema],
    text_chunks_db: BaseKVStorage[TextChunkSchema],
    query_param: QueryParam,
    tokenizer_wrapper,
    global_config: dict,
) -> str:
    """Answer ``query`` with the local (entity-centred) GraphRAG strategy.

    Builds a CSV context from the entities nearest to the query, then asks
    the configured best model. Returns the raw context when
    ``query_param.only_need_context`` is set, and the canned fail response
    when no context could be built.
    """
    model_func = global_config["best_model_func"]
    context = await _build_local_query_context(
        query,
        knowledge_graph_inst,
        entities_vdb,
        community_reports,
        text_chunks_db,
        query_param,
        tokenizer_wrapper,
    )
    # Note: the raw context is returned even when it is None.
    if query_param.only_need_context:
        return context
    if context is None:
        return PROMPTS["fail_response"]
    system_prompt = PROMPTS["local_rag_response"].format(
        context_data=context, response_type=query_param.response_type
    )
    return await model_func(
        query,
        system_prompt=system_prompt,
    )
968
+
969
+
970
async def _map_global_communities(
    query: str,
    communities_data: list[CommunitySchema],
    query_param: QueryParam,
    global_config: dict,
    tokenizer_wrapper,
):
    """Map step of the global query: score answer points per community group.

    The community reports are greedily packed into token-budgeted groups;
    each group is sent to the LLM with the "global_map_rag_points" prompt
    concurrently, and the parsed "points" lists are returned (one list per
    group, in group order).
    """
    use_string_json_convert_func = global_config["convert_response_to_json_func"]
    use_model_func = global_config["best_model_func"]
    community_groups = []
    # Greedily slice the community list into groups that fit the token budget.
    while len(communities_data):
        this_group = truncate_list_by_token_size(
            communities_data,
            key=lambda x: x["report_string"],
            max_token_size=query_param.global_max_token_for_community_report,
            tokenizer_wrapper=tokenizer_wrapper,  # tokenizer used for budgeting
        )
        community_groups.append(this_group)
        communities_data = communities_data[len(this_group) :]

    async def _process(community_truncated_datas: list[CommunitySchema]) -> dict:
        # One LLM map call over a single group of community reports.
        communities_section_list = [["id", "content", "rating", "importance"]]
        for i, c in enumerate(community_truncated_datas):
            communities_section_list.append(
                [
                    i,
                    c["report_string"],
                    c["report_json"].get("rating", 0),
                    c["occurrence"],
                ]
            )
        community_context = list_of_list_to_csv(communities_section_list)
        sys_prompt_temp = PROMPTS["global_map_rag_points"]
        sys_prompt = sys_prompt_temp.format(context_data=community_context)
        response = await use_model_func(
            query,
            system_prompt=sys_prompt,
            **query_param.global_special_community_map_llm_kwargs,
        )
        data = use_string_json_convert_func(response)
        return data.get("points", [])

    logger.info(f"Grouping to {len(community_groups)} groups for global search")
    responses = await asyncio.gather(*[_process(c) for c in community_groups])
    return responses
1015
+
1016
+
1017
async def global_query(
    query,
    knowledge_graph_inst: BaseGraphStorage,
    entities_vdb: BaseVectorStorage,
    community_reports: BaseKVStorage[CommunitySchema],
    text_chunks_db: BaseKVStorage[TextChunkSchema],
    query_param: QueryParam,
    tokenizer_wrapper,
    global_config: dict,
) -> str:
    """Answer ``query`` with the global (community map/reduce) GraphRAG strategy.

    Selects the highest-occurrence communities up to ``query_param.level``,
    runs the map step (one LLM call per community group), then reduces the
    scored answer points into a final response. Returns the canned fail
    response when no usable communities or answer points are found, and the
    raw points context when ``query_param.only_need_context`` is set.
    """
    community_schema = await knowledge_graph_inst.community_schema()
    community_schema = {
        k: v for k, v in community_schema.items() if v["level"] <= query_param.level
    }
    if not len(community_schema):
        return PROMPTS["fail_response"]
    use_model_func = global_config["best_model_func"]

    # Consider only the most frequently occurring communities.
    sorted_community_schemas = sorted(
        community_schema.items(),
        key=lambda x: x[1]["occurrence"],
        reverse=True,
    )
    sorted_community_schemas = sorted_community_schemas[
        : query_param.global_max_consider_community
    ]
    community_datas = await community_reports.get_by_ids(
        [k[0] for k in sorted_community_schemas]
    )
    community_datas = [c for c in community_datas if c is not None]
    community_datas = [
        c
        for c in community_datas
        if c["report_json"].get("rating", 0) >= query_param.global_min_community_rating
    ]
    community_datas = sorted(
        community_datas,
        key=lambda x: (x["occurrence"], x["report_json"].get("rating", 0)),
        reverse=True,
    )
    # Bug fix: corrected log message typo ("Revtrieved" -> "Retrieved").
    logger.info(f"Retrieved {len(community_datas)} communities")

    map_communities_points = await _map_global_communities(
        query, community_datas, query_param, global_config, tokenizer_wrapper
    )
    # Flatten the per-analyst point lists, keeping only scored answers.
    final_support_points = []
    for i, mc in enumerate(map_communities_points):
        for point in mc:
            if "description" not in point:
                continue
            final_support_points.append(
                {
                    "analyst": i,
                    "answer": point["description"],
                    "score": point.get("score", 1),
                }
            )
    final_support_points = [p for p in final_support_points if p["score"] > 0]
    if not len(final_support_points):
        return PROMPTS["fail_response"]
    final_support_points = sorted(
        final_support_points, key=lambda x: x["score"], reverse=True
    )
    final_support_points = truncate_list_by_token_size(
        final_support_points,
        key=lambda x: x["answer"],
        max_token_size=query_param.global_max_token_for_community_report,
        tokenizer_wrapper=tokenizer_wrapper,
    )
    points_context = []
    for dp in final_support_points:
        points_context.append(
            f"""----Analyst {dp['analyst']}----
Importance Score: {dp['score']}
{dp['answer']}
"""
        )
    points_context = "\n".join(points_context)
    if query_param.only_need_context:
        return points_context
    sys_prompt_temp = PROMPTS["global_reduce_rag_response"]
    response = await use_model_func(
        query,
        sys_prompt_temp.format(
            report_data=points_context, response_type=query_param.response_type
        ),
    )
    return response
1105
+
1106
+
1107
async def naive_query(
    query,
    chunks_vdb: BaseVectorStorage,
    text_chunks_db: BaseKVStorage[TextChunkSchema],
    query_param: QueryParam,
    tokenizer_wrapper,
    global_config: dict,
):
    """Answer ``query`` with plain chunk-retrieval RAG (no graph involved).

    Retrieves the top-k chunks from the vector store, truncates them to the
    naive token budget, and asks the configured best model over the joined
    chunks. Returns the canned fail response when retrieval yields nothing,
    or the raw context when ``query_param.only_need_context`` is set.
    """
    model_func = global_config["best_model_func"]
    hits = await chunks_vdb.query(query, top_k=query_param.top_k)
    if not len(hits):
        return PROMPTS["fail_response"]
    chunks = await text_chunks_db.get_by_ids([hit["id"] for hit in hits])

    maybe_trun_chunks = truncate_list_by_token_size(
        chunks,
        key=lambda x: x["content"],
        max_token_size=query_param.naive_max_token_for_text_unit,
        tokenizer_wrapper=tokenizer_wrapper,
    )
    logger.info(f"Truncate {len(chunks)} to {len(maybe_trun_chunks)} chunks")
    section = "--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
    if query_param.only_need_context:
        return section
    system_prompt = PROMPTS["naive_rag_response"].format(
        content_data=section, response_type=query_param.response_type
    )
    return await model_func(
        query,
        system_prompt=system_prompt,
    )
nano-graphrag/nano_graphrag/_splitter.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Union, Literal
2
+
3
class SeparatorSplitter:
    """Split token sequences on token-level separators and re-chunk them.

    Tokens are first cut at any of the configured separator sequences
    (optionally keeping the separator at the start or end of a piece), then
    greedily merged into chunks of at most ``chunk_size`` tokens with
    ``chunk_overlap`` tokens of overlap between consecutive chunks.
    """

    def __init__(
        self,
        separators: Optional[List[List[int]]] = None,
        keep_separator: Union[bool, Literal["start", "end"]] = "end",
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        length_function: callable = len,
    ):
        self._separators = separators or []
        self._keep_separator = keep_separator
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._length_function = length_function

    def split_tokens(self, tokens: List[int]) -> List[List[int]]:
        """Split ``tokens`` on separators, then merge the pieces into chunks."""
        return self._merge_splits(self._split_tokens_with_separators(tokens))

    def _split_tokens_with_separators(self, tokens: List[int]) -> List[List[int]]:
        # Cut the token stream at every occurrence of any separator sequence.
        pieces: List[List[int]] = []
        buffer: List[int] = []
        pos = 0
        while pos < len(tokens):
            matched = None
            for sep in self._separators:
                if tokens[pos:pos + len(sep)] == sep:
                    matched = sep
                    break
            if matched is None:
                buffer.append(tokens[pos])
                pos += 1
                continue
            # keep_separator True behaves like "end" (appended to the piece
            # that precedes the cut).
            if self._keep_separator in (True, "end"):
                buffer.extend(matched)
            if buffer:
                pieces.append(buffer)
            buffer = []
            if self._keep_separator == "start":
                buffer.extend(matched)
            pos += len(matched)
        if buffer:
            pieces.append(buffer)
        return [p for p in pieces if p]

    def _merge_splits(self, splits: List[List[int]]) -> List[List[int]]:
        # Greedily pack consecutive pieces into chunks of <= chunk_size tokens.
        if not splits:
            return []

        chunks: List[List[int]] = []
        current: List[int] = []
        for piece in splits:
            if not current:
                current = piece
            elif self._length_function(current) + self._length_function(piece) <= self._chunk_size:
                current.extend(piece)
            else:
                chunks.append(current)
                current = piece
        if current:
            chunks.append(current)

        # A single oversized chunk (no separator ever matched) is split by a
        # sliding window instead.
        if len(chunks) == 1 and self._length_function(chunks[0]) > self._chunk_size:
            return self._split_chunk(chunks[0])

        if self._chunk_overlap > 0:
            return self._enforce_overlap(chunks)
        return chunks

    def _split_chunk(self, chunk: List[int]) -> List[List[int]]:
        # Sliding window of chunk_size advancing by (chunk_size - overlap).
        step = self._chunk_size - self._chunk_overlap
        windows: List[List[int]] = []
        for start in range(0, len(chunk), step):
            window = chunk[start:start + self._chunk_size]
            # Drop a trailing window that would consist purely of overlap.
            if len(window) > self._chunk_overlap:
                windows.append(window)
        return windows

    def _enforce_overlap(self, chunks: List[List[int]]) -> List[List[int]]:
        # Prefix each chunk (after the first) with the tail of its
        # predecessor, truncating back to chunk_size if needed.
        overlapped: List[List[int]] = []
        for idx, chunk in enumerate(chunks):
            if idx == 0:
                overlapped.append(chunk)
                continue
            merged = chunks[idx - 1][-self._chunk_overlap:] + chunk
            if self._length_function(merged) > self._chunk_size:
                merged = merged[:self._chunk_size]
            overlapped.append(merged)
        return overlapped
94
+
nano-graphrag/nano_graphrag/_storage/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .gdb_networkx import NetworkXStorage
2
+ from .gdb_neo4j import Neo4jStorage
3
+ from .vdb_hnswlib import HNSWVectorStorage
4
+ from .vdb_nanovectordb import NanoVectorDBStorage
5
+ from .kv_json import JsonKVStorage
nano-graphrag/nano_graphrag/_storage/gdb_neo4j.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import asyncio
3
+ from collections import defaultdict
4
+ from typing import List
5
+ from neo4j import AsyncGraphDatabase
6
+ from dataclasses import dataclass
7
+ from typing import Union
8
+ from ..base import BaseGraphStorage, SingleCommunitySchema
9
+ from .._utils import logger
10
+ from ..prompt import GRAPH_FIELD_SEP
11
+
12
+ neo4j_lock = asyncio.Lock()
13
+
14
+
15
def make_path_idable(path):
    """Sanitize a filesystem path into a string usable as a Neo4j label.

    Single characters are mapped to underscores: ".", "-" and ":" become
    "_", while "/" and "\\" become "__".
    """
    return path.translate(str.maketrans({".": "_", "/": "__", "-": "_", ":": "_", "\\": "__"}))
17
+
18
+
19
+ @dataclass
20
+ class Neo4jStorage(BaseGraphStorage):
21
    def __post_init__(self):
        """Read Neo4j settings from ``addon_params`` and open an async driver.

        The node label for this storage is derived from the working directory
        plus the namespace, so multiple working dirs can share one Neo4j
        instance without colliding.

        Raises:
            ValueError: when neo4j_url or neo4j_auth is missing.
        """
        self.neo4j_url = self.global_config["addon_params"].get("neo4j_url", None)
        self.neo4j_auth = self.global_config["addon_params"].get("neo4j_auth", None)
        self.namespace = (
            f"{make_path_idable(self.global_config['working_dir'])}__{self.namespace}"
        )
        logger.info(f"Using the label {self.namespace} for Neo4j as identifier")
        if self.neo4j_url is None or self.neo4j_auth is None:
            raise ValueError("Missing neo4j_url or neo4j_auth in addon_params")
        self.async_driver = AsyncGraphDatabase.driver(
            self.neo4j_url, auth=self.neo4j_auth, max_connection_pool_size=50,
        )
33
+
34
+ # async def create_database(self):
35
+ # async with self.async_driver.session() as session:
36
+ # try:
37
+ # constraints = await session.run("SHOW CONSTRAINTS")
38
+ # # TODO I don't know why CREATE CONSTRAINT IF NOT EXISTS still trigger error
39
+ # # so have to check if the constrain exists
40
+ # constrain_exists = False
41
+
42
+ # async for record in constraints:
43
+ # if (
44
+ # self.namespace in record["labelsOrTypes"]
45
+ # and "id" in record["properties"]
46
+ # and record["type"] == "UNIQUENESS"
47
+ # ):
48
+ # constrain_exists = True
49
+ # break
50
+ # if not constrain_exists:
51
+ # await session.run(
52
+ # f"CREATE CONSTRAINT FOR (n:{self.namespace}) REQUIRE n.id IS UNIQUE"
53
+ # )
54
+ # logger.info(f"Add constraint for namespace: {self.namespace}")
55
+
56
+ # except Exception as e:
57
+ # logger.error(f"Error accessing or setting up the database: {str(e)}")
58
+ # raise
59
+
60
    async def _init_workspace(self):
        """Fail fast if the credentials are wrong or the server is unreachable."""
        await self.async_driver.verify_authentication()
        await self.async_driver.verify_connectivity()
        # TODOLater: create database if not exists always cause an error when async
        # await self.create_database()
65
+
66
    async def index_start_callback(self):
        """Verify connectivity and create the per-namespace lookup indexes.

        Raises:
            Exception: re-raised when index creation fails.
        """
        logger.info("Init Neo4j workspace")
        await self._init_workspace()

        # create index for faster searching
        try:
            async with self.async_driver.session() as session:
                await session.run(
                    f"CREATE INDEX IF NOT EXISTS FOR (n:`{self.namespace}`) ON (n.id)"
                )

                await session.run(
                    f"CREATE INDEX IF NOT EXISTS FOR (n:`{self.namespace}`) ON (n.entity_type)"
                )

                await session.run(
                    f"CREATE INDEX IF NOT EXISTS FOR (n:`{self.namespace}`) ON (n.communityIds)"
                )

                await session.run(
                    f"CREATE INDEX IF NOT EXISTS FOR (n:`{self.namespace}`) ON (n.source_id)"
                )
                logger.info("Neo4j indexes created successfully")
        except Exception as e:
            logger.error(f"Failed to create indexes: {e}")
            raise e
92
+
93
+ async def has_node(self, node_id: str) -> bool:
94
+ async with self.async_driver.session() as session:
95
+ result = await session.run(
96
+ f"MATCH (n:`{self.namespace}`) WHERE n.id = $node_id RETURN COUNT(n) > 0 AS exists",
97
+ node_id=node_id,
98
+ )
99
+ record = await result.single()
100
+ return record["exists"] if record else False
101
+
102
    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
        """Return True when any directed edge source -> target exists."""
        async with self.async_driver.session() as session:
            result = await session.run(
                f"""
                MATCH (s:`{self.namespace}`)
                WHERE s.id = $source_id
                MATCH (t:`{self.namespace}`)
                WHERE t.id = $target_id
                RETURN EXISTS((s)-[]->(t)) AS exists
                """,
                source_id=source_node_id,
                target_id=target_node_id,
            )

            record = await result.single()
            return record["exists"] if record else False
118
+
119
    async def node_degree(self, node_id: str) -> int:
        """Degree (neighbour count) of one node; 0 when it is absent."""
        results = await self.node_degrees_batch([node_id])
        return results[0] if results else 0
122
+
123
+ async def node_degrees_batch(self, node_ids: List[str]) -> List[str]:
124
+ if not node_ids:
125
+ return {}
126
+
127
+ result_dict = {node_id: 0 for node_id in node_ids}
128
+ async with self.async_driver.session() as session:
129
+ result = await session.run(
130
+ f"""
131
+ UNWIND $node_ids AS node_id
132
+ MATCH (n:`{self.namespace}`)
133
+ WHERE n.id = node_id
134
+ OPTIONAL MATCH (n)-[]-(m:`{self.namespace}`)
135
+ RETURN node_id, COUNT(m) AS degree
136
+ """,
137
+ node_ids=node_ids
138
+ )
139
+
140
+ async for record in result:
141
+ result_dict[record["node_id"]] = record["degree"]
142
+
143
+ return [result_dict[node_id] for node_id in node_ids]
144
+
145
    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
        """Combined degree (src degree + tgt degree) of one edge's endpoints."""
        results = await self.edge_degrees_batch([(src_id, tgt_id)])
        return results[0] if results else 0
148
+
149
    async def edge_degrees_batch(self, edge_pairs: list[tuple[str, str]]) -> list[int]:
        """Return src_degree + tgt_degree for each (src, tgt) pair, in order.

        Pairs whose endpoints are missing keep a degree of 0; on query
        failure a list of zeros is returned instead of raising.
        """
        if not edge_pairs:
            return []

        result_dict = {tuple(edge_pair): 0 for edge_pair in edge_pairs}

        edges_params = [{"src_id": src, "tgt_id": tgt} for src, tgt in edge_pairs]

        try:
            async with self.async_driver.session() as session:
                result = await session.run(
                    f"""
                    UNWIND $edges AS edge

                    MATCH (s:`{self.namespace}`)
                    WHERE s.id = edge.src_id
                    WITH edge, s
                    OPTIONAL MATCH (s)-[]-(n1:`{self.namespace}`)
                    WITH edge, COUNT(n1) AS src_degree

                    MATCH (t:`{self.namespace}`)
                    WHERE t.id = edge.tgt_id
                    WITH edge, src_degree, t
                    OPTIONAL MATCH (t)-[]-(n2:`{self.namespace}`)
                    WITH edge.src_id AS src_id, edge.tgt_id AS tgt_id, src_degree, COUNT(n2) AS tgt_degree

                    RETURN src_id, tgt_id, src_degree + tgt_degree AS degree
                    """,
                    edges=edges_params
                )

                async for record in result:
                    src_id = record["src_id"]
                    tgt_id = record["tgt_id"]
                    degree = record["degree"]

                    # update the result dict
                    edge_pair = (src_id, tgt_id)
                    result_dict[edge_pair] = degree

            return [result_dict[tuple(edge_pair)] for edge_pair in edge_pairs]
        except Exception as e:
            logger.error(f"Error in batch edge degree calculation: {e}")
            return [0] * len(edge_pairs)
193
+
194
+
195
+
196
    async def get_node(self, node_id: str) -> Union[dict, None]:
        """Property dict of one node, or None when it is absent."""
        result = await self.get_nodes_batch([node_id])
        return result[0] if result else None
199
+
200
+ async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, Union[dict, None]]:
201
+ if not node_ids:
202
+ return {}
203
+
204
+ result_dict = {node_id: None for node_id in node_ids}
205
+
206
+ try:
207
+ async with self.async_driver.session() as session:
208
+ result = await session.run(
209
+ f"""
210
+ UNWIND $node_ids AS node_id
211
+ MATCH (n:`{self.namespace}`)
212
+ WHERE n.id = node_id
213
+ RETURN node_id, properties(n) AS node_data
214
+ """,
215
+ node_ids=node_ids
216
+ )
217
+
218
+ async for record in result:
219
+ node_id = record["node_id"]
220
+ raw_node_data = record["node_data"]
221
+
222
+ if raw_node_data:
223
+ raw_node_data["clusters"] = json.dumps(
224
+ [
225
+ {
226
+ "level": index,
227
+ "cluster": cluster_id,
228
+ }
229
+ for index, cluster_id in enumerate(
230
+ raw_node_data.get("communityIds", [])
231
+ )
232
+ ]
233
+ )
234
+ result_dict[node_id] = raw_node_data
235
+ return [result_dict[node_id] for node_id in node_ids]
236
+ except Exception as e:
237
+ logger.error(f"Error in batch node retrieval: {e}")
238
+ raise e
239
+
240
    async def get_edge(
        self, source_node_id: str, target_node_id: str
    ) -> Union[dict, None]:
        """Property dict of the edge source -> target, or None when absent."""
        results = await self.get_edges_batch([(source_node_id, target_node_id)])
        return results[0] if results else None
245
+
246
    async def get_edges_batch(
        self, edge_pairs: list[tuple[str, str]]
    ) -> list[Union[dict, None]]:
        """Fetch edge property dicts for each (source, target) pair, in order.

        Missing edges yield None; on query failure a list of None is returned
        instead of raising.
        """
        if not edge_pairs:
            return []

        result_dict = {tuple(edge_pair): None for edge_pair in edge_pairs}

        edges_params = [{"source_id": src, "target_id": tgt} for src, tgt in edge_pairs]

        try:
            async with self.async_driver.session() as session:
                result = await session.run(
                    f"""
                    UNWIND $edges AS edge
                    MATCH (s:`{self.namespace}`)-[r]->(t:`{self.namespace}`)
                    WHERE s.id = edge.source_id AND t.id = edge.target_id
                    RETURN edge.source_id AS source_id, edge.target_id AS target_id, properties(r) AS edge_data
                    """,
                    edges=edges_params
                )

                async for record in result:
                    source_id = record["source_id"]
                    target_id = record["target_id"]
                    edge_data = record["edge_data"]

                    edge_pair = (source_id, target_id)
                    result_dict[edge_pair] = edge_data

            return [result_dict[tuple(edge_pair)] for edge_pair in edge_pairs]
        except Exception as e:
            logger.error(f"Error in batch edge retrieval: {e}")
            return [None] * len(edge_pairs)
280
+
281
    async def get_node_edges(
        self, source_node_id: str
    ) -> list[tuple[str, str]]:
        """Outgoing edges of one node as (source_id, target_id) tuples."""
        results = await self.get_nodes_edges_batch([source_node_id])
        return results[0] if results else []
286
+
287
    async def get_nodes_edges_batch(
        self, node_ids: list[str]
    ) -> list[list[tuple[str, str]]]:
        """Outgoing edges for each node id, in input order.

        Nodes without edges (or missing nodes) yield an empty list; on query
        failure all entries are empty lists instead of raising.
        """
        if not node_ids:
            return []

        result_dict = {node_id: [] for node_id in node_ids}

        try:
            async with self.async_driver.session() as session:
                result = await session.run(
                    f"""
                    UNWIND $node_ids AS node_id
                    MATCH (s:`{self.namespace}`)-[r]->(t:`{self.namespace}`)
                    WHERE s.id = node_id
                    RETURN s.id AS source_id, t.id AS target_id
                    """,
                    node_ids=node_ids
                )

                async for record in result:
                    source_id = record["source_id"]
                    target_id = record["target_id"]

                    if source_id in result_dict:
                        result_dict[source_id].append((source_id, target_id))

            return [result_dict[node_id] for node_id in node_ids]
        except Exception as e:
            logger.error(f"Error in batch node edges retrieval: {e}")
            return [[] for _ in node_ids]
318
+
319
+ async def upsert_node(self, node_id: str, node_data: dict[str, str]):
320
+ await self.upsert_nodes_batch([(node_id, node_data)])
321
+
322
+ async def upsert_nodes_batch(self, nodes_data: list[tuple[str, dict[str, str]]]):
323
+ if not nodes_data:
324
+ return []
325
+
326
+ nodes_by_type = {}
327
+ for node_id, node_data in nodes_data:
328
+ node_type = node_data.get("entity_type", "UNKNOWN").strip('"')
329
+ if node_type not in nodes_by_type:
330
+ nodes_by_type[node_type] = []
331
+ nodes_by_type[node_type].append((node_id, node_data))
332
+
333
+ async with self.async_driver.session() as session:
334
+ for node_type, type_nodes in nodes_by_type.items():
335
+ params = [{"id": node_id, "data": node_data} for node_id, node_data in type_nodes]
336
+
337
+ await session.run(
338
+ f"""
339
+ UNWIND $nodes AS node
340
+ MERGE (n:`{self.namespace}`:`{node_type}` {{id: node.id}})
341
+ SET n += node.data
342
+ """,
343
+ nodes=params
344
+ )
345
+
346
+ async def upsert_edge(
347
+ self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
348
+ ):
349
+ await self.upsert_edges_batch([(source_node_id, target_node_id, edge_data)])
350
+
351
+
352
+ async def upsert_edges_batch(
353
+ self, edges_data: list[tuple[str, str, dict[str, str]]]
354
+ ):
355
+ if not edges_data:
356
+ return
357
+
358
+ edges_params = []
359
+ for source_id, target_id, edge_data in edges_data:
360
+ edge_data_copy = edge_data.copy()
361
+ edge_data_copy.setdefault("weight", 0.0)
362
+
363
+ edges_params.append({
364
+ "source_id": source_id,
365
+ "target_id": target_id,
366
+ "edge_data": edge_data_copy
367
+ })
368
+
369
+ async with self.async_driver.session() as session:
370
+ await session.run(
371
+ f"""
372
+ UNWIND $edges AS edge
373
+ MATCH (s:`{self.namespace}`)
374
+ WHERE s.id = edge.source_id
375
+ WITH edge, s
376
+ MATCH (t:`{self.namespace}`)
377
+ WHERE t.id = edge.target_id
378
+ MERGE (s)-[r:RELATED]->(t)
379
+ SET r += edge.edge_data
380
+ """,
381
+ edges=edges_params
382
+ )
383
+
384
+
385
+
386
+
387
    async def clustering(self, algorithm: str):
        """Run Leiden community detection via the Neo4j GDS plugin.

        Projects the namespace subgraph (undirected, carrying the 'weight'
        relationship property), runs ``gds.leiden.write`` to store per-node
        'communityIds', and always drops the projection afterwards.

        Raises:
            ValueError: if ``algorithm`` is anything other than "leiden".
        """
        if algorithm != "leiden":
            raise ValueError(
                f"Clustering algorithm {algorithm} not supported in Neo4j implementation"
            )

        random_seed = self.global_config["graph_cluster_seed"]
        # NOTE(review): this reads max_graph_cluster_size but feeds it to
        # GDS's maxLevels (hierarchy depth), not a cluster-size cap — confirm
        # this is intentional.
        max_level = self.global_config["max_graph_cluster_size"]
        async with self.async_driver.session() as session:
            try:
                # Project the graph with undirected relationships
                await session.run(
                    f"""
                    CALL gds.graph.project(
                        'graph_{self.namespace}',
                        ['{self.namespace}'],
                        {{
                            RELATED: {{
                                orientation: 'UNDIRECTED',
                                properties: ['weight']
                            }}
                        }}
                    )
                    """
                )

                # Run Leiden algorithm
                result = await session.run(
                    f"""
                    CALL gds.leiden.write(
                        'graph_{self.namespace}',
                        {{
                            writeProperty: 'communityIds',
                            includeIntermediateCommunities: True,
                            relationshipWeightProperty: "weight",
                            maxLevels: {max_level},
                            tolerance: 0.0001,
                            gamma: 1.0,
                            theta: 0.01,
                            randomSeed: {random_seed}
                        }}
                    )
                    YIELD communityCount, modularities;
                    """
                )
                result = await result.single()
                community_count: int = result["communityCount"]
                modularities = result["modularities"]
                logger.info(
                    f"Performed graph clustering with {community_count} communities and modularities {modularities}"
                )
            finally:
                # Drop the projected graph
                await session.run(f"CALL gds.graph.drop('graph_{self.namespace}')")
441
+
442
    async def community_schema(self) -> dict[str, SingleCommunitySchema]:
        """Build the community schema from the 'communityIds' written by clustering().

        For every node, each position in its communityIds list is treated as
        one (level, cluster) membership; nodes, undirected edge pairs and
        source chunk ids are accumulated per cluster. 'occurrence' is the
        cluster's chunk count normalized by the largest cluster's chunk count.
        """
        results = defaultdict(
            lambda: dict(
                level=None,
                title=None,
                edges=set(),
                nodes=set(),
                chunk_ids=set(),
                occurrence=0.0,
                sub_communities=[],
            )
        )

        async with self.async_driver.session() as session:
            # Fetch community data
            result = await session.run(
                f"""
                MATCH (n:`{self.namespace}`)
                WITH n, n.communityIds AS communityIds, [(n)-[]-(m:`{self.namespace}`) | m.id] AS connected_nodes
                RETURN n.id AS node_id, n.source_id AS source_id,
                       communityIds AS cluster_key,
                       connected_nodes
                """
            )

            # records = await result.fetch()

            max_num_ids = 0
            async for record in result:
                # Index within communityIds encodes the hierarchy level.
                # NOTE(review): assumes GDS cluster ids are unique across
                # levels, since `cluster_key` alone keys `results` — confirm.
                for index, c_id in enumerate(record["cluster_key"]):
                    node_id = str(record["node_id"])
                    source_id = record["source_id"]
                    level = index
                    cluster_key = str(c_id)
                    connected_nodes = record["connected_nodes"]

                    results[cluster_key]["level"] = level
                    results[cluster_key]["title"] = f"Cluster {cluster_key}"
                    results[cluster_key]["nodes"].add(node_id)
                    # Sorted pairs make undirected edges deduplicate.
                    results[cluster_key]["edges"].update(
                        [
                            tuple(sorted([node_id, str(connected)]))
                            for connected in connected_nodes
                            if connected != node_id
                        ]
                    )
                    chunk_ids = source_id.split(GRAPH_FIELD_SEP)
                    results[cluster_key]["chunk_ids"].update(chunk_ids)
                    max_num_ids = max(
                        max_num_ids, len(results[cluster_key]["chunk_ids"])
                    )

            # Process results: convert sets to lists and normalize occurrence.
            for k, v in results.items():
                v["edges"] = [list(e) for e in v["edges"]]
                v["nodes"] = list(v["nodes"])
                v["chunk_ids"] = list(v["chunk_ids"])
                v["occurrence"] = len(v["chunk_ids"]) / max_num_ids

            # Compute sub-communities (this is a simplified approach)
            for cluster in results.values():
                cluster["sub_communities"] = [
                    sub_key
                    for sub_key, sub_cluster in results.items()
                    if sub_cluster["level"] > cluster["level"]
                    and set(sub_cluster["nodes"]).issubset(set(cluster["nodes"]))
                ]

        return dict(results)
511
+
512
    async def index_done_callback(self):
        # NOTE(review): this closes the shared async driver when indexing
        # finishes; any later query through this storage instance would need
        # a new driver — confirm this lifecycle is intended.
        await self.async_driver.close()
514
+
515
    async def _debug_delete_all_node_edges(self):
        """Debug helper: delete every relationship and node in this namespace.

        Relationships are removed first because Neo4j refuses to DELETE a
        node that still has attached relationships. Errors are logged and
        re-raised.
        """
        async with self.async_driver.session() as session:
            try:
                # Delete all relationships in the namespace
                await session.run(f"MATCH (n:`{self.namespace}`)-[r]-() DELETE r")

                # Delete all nodes in the namespace
                await session.run(f"MATCH (n:`{self.namespace}`) DELETE n")

                logger.info(
                    f"All nodes and edges in namespace '{self.namespace}' have been deleted."
                )
            except Exception as e:
                logger.error(f"Error deleting nodes and edges: {str(e)}")
                raise
nano-graphrag/nano_graphrag/_storage/gdb_networkx.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html
2
+ import json
3
+ import os
4
+ from collections import defaultdict
5
+ from dataclasses import dataclass
6
+ from typing import Any, Union, cast, List
7
+ import networkx as nx
8
+ import numpy as np
9
+ import asyncio
10
+
11
+ from .._utils import logger
12
+ from ..base import (
13
+ BaseGraphStorage,
14
+ SingleCommunitySchema,
15
+ )
16
+ from ..prompt import GRAPH_FIELD_SEP
17
+
18
+
19
@dataclass
class NetworkXStorage(BaseGraphStorage):
    """Graph storage backed by an in-memory networkx graph persisted as GraphML.

    All batch methods simply fan out to the single-item methods via
    asyncio.gather; there is no I/O until index_done_callback() writes the
    GraphML file.
    """

    @staticmethod
    def load_nx_graph(file_name) -> Union[nx.Graph, None]:
        """Load a GraphML file if it exists; return None otherwise."""
        if os.path.exists(file_name):
            return nx.read_graphml(file_name)
        return None

    @staticmethod
    def write_nx_graph(graph: nx.Graph, file_name):
        """Persist the graph to ``file_name`` in GraphML format."""
        logger.info(
            f"Writing graph with {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges"
        )
        nx.write_graphml(graph, file_name)

    @staticmethod
    def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph:
        """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
        Return the largest connected component of the graph, with nodes and edges sorted in a stable way.
        """
        from graspologic.utils import largest_connected_component

        graph = graph.copy()
        graph = cast(nx.Graph, largest_connected_component(graph))
        # Normalize node labels (uppercase, HTML-unescaped) so repeated runs
        # over equivalent data produce identical graphs.
        node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()}  # type: ignore
        graph = nx.relabel_nodes(graph, node_mapping)
        return NetworkXStorage._stabilize_graph(graph)

    @staticmethod
    def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
        """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
        Ensure an undirected graph with the same relationships will always be read the same way.
        """
        fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph()

        sorted_nodes = graph.nodes(data=True)
        sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0])

        fixed_graph.add_nodes_from(sorted_nodes)
        edges = list(graph.edges(data=True))

        if not graph.is_directed():

            # For undirected graphs, canonicalize each edge's endpoint order
            # before sorting so (a, b) and (b, a) compare equal.
            def _sort_source_target(edge):
                source, target, edge_data = edge
                if source > target:
                    temp = source
                    source = target
                    target = temp
                return source, target, edge_data

            edges = [_sort_source_target(edge) for edge in edges]

        def _get_edge_key(source: Any, target: Any) -> str:
            return f"{source} -> {target}"

        edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1]))

        fixed_graph.add_edges_from(edges)
        return fixed_graph

    def __post_init__(self):
        # GraphML file path derived from the working dir and namespace.
        self._graphml_xml_file = os.path.join(
            self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
        )
        preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
        if preloaded_graph is not None:
            logger.info(
                f"Loaded graph from {self._graphml_xml_file} with {preloaded_graph.number_of_nodes()} nodes, {preloaded_graph.number_of_edges()} edges"
            )
        self._graph = preloaded_graph or nx.Graph()
        self._clustering_algorithms = {
            "leiden": self._leiden_clustering,
        }
        self._node_embed_algorithms = {
            "node2vec": self._node2vec_embed,
        }

    async def index_done_callback(self):
        NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file)

    async def has_node(self, node_id: str) -> bool:
        return self._graph.has_node(node_id)

    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
        return self._graph.has_edge(source_node_id, target_node_id)

    async def get_node(self, node_id: str) -> Union[dict, None]:
        return self._graph.nodes.get(node_id)

    # Annotation corrected: asyncio.gather returns a list aligned with
    # node_ids, not a dict.
    async def get_nodes_batch(self, node_ids: list[str]) -> list[Union[dict, None]]:
        return await asyncio.gather(*[self.get_node(node_id) for node_id in node_ids])

    async def node_degree(self, node_id: str) -> int:
        # [numberchiffre]: node_id not part of graph returns `DegreeView({})` instead of 0
        return self._graph.degree(node_id) if self._graph.has_node(node_id) else 0

    # Annotation corrected: degrees are ints.
    async def node_degrees_batch(self, node_ids: List[str]) -> List[int]:
        return await asyncio.gather(*[self.node_degree(node_id) for node_id in node_ids])

    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
        # Edge degree = sum of the endpoint degrees (missing nodes count 0).
        return (self._graph.degree(src_id) if self._graph.has_node(src_id) else 0) + (
            self._graph.degree(tgt_id) if self._graph.has_node(tgt_id) else 0
        )

    async def edge_degrees_batch(self, edge_pairs: list[tuple[str, str]]) -> list[int]:
        return await asyncio.gather(*[self.edge_degree(src_id, tgt_id) for src_id, tgt_id in edge_pairs])

    async def get_edge(
        self, source_node_id: str, target_node_id: str
    ) -> Union[dict, None]:
        return self._graph.edges.get((source_node_id, target_node_id))

    async def get_edges_batch(
        self, edge_pairs: list[tuple[str, str]]
    ) -> list[Union[dict, None]]:
        return await asyncio.gather(*[self.get_edge(source_node_id, target_node_id) for source_node_id, target_node_id in edge_pairs])

    async def get_node_edges(self, source_node_id: str):
        # Returns None (not []) for an unknown node.
        if self._graph.has_node(source_node_id):
            return list(self._graph.edges(source_node_id))
        return None

    # NOTE: entries may be None for unknown nodes (see get_node_edges).
    async def get_nodes_edges_batch(
        self, node_ids: list[str]
    ) -> list[list[tuple[str, str]]]:
        return await asyncio.gather(*[self.get_node_edges(node_id) for node_id
                                      in node_ids])

    async def upsert_node(self, node_id: str, node_data: dict[str, str]):
        self._graph.add_node(node_id, **node_data)

    async def upsert_nodes_batch(self, nodes_data: list[tuple[str, dict[str, str]]]):
        await asyncio.gather(*[self.upsert_node(node_id, node_data) for node_id, node_data in nodes_data])

    async def upsert_edge(
        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
    ):
        self._graph.add_edge(source_node_id, target_node_id, **edge_data)

    async def upsert_edges_batch(
        self, edges_data: list[tuple[str, str, dict[str, str]]]
    ):
        await asyncio.gather(*[self.upsert_edge(source_node_id, target_node_id, edge_data)
                               for source_node_id, target_node_id, edge_data in edges_data])

    async def clustering(self, algorithm: str):
        """Dispatch to a registered clustering algorithm (currently only leiden)."""
        if algorithm not in self._clustering_algorithms:
            raise ValueError(f"Clustering algorithm {algorithm} not supported")
        await self._clustering_algorithms[algorithm]()

    async def community_schema(self) -> dict[str, SingleCommunitySchema]:
        """Build the community schema from the per-node 'clusters' JSON written by clustering."""
        results = defaultdict(
            lambda: dict(
                level=None,
                title=None,
                edges=set(),
                nodes=set(),
                chunk_ids=set(),
                occurrence=0.0,
                sub_communities=[],
            )
        )
        max_num_ids = 0
        levels = defaultdict(set)
        for node_id, node_data in self._graph.nodes(data=True):
            # Nodes untouched by clustering carry no 'clusters' attribute.
            if "clusters" not in node_data:
                continue
            clusters = json.loads(node_data["clusters"])
            this_node_edges = self._graph.edges(node_id)

            for cluster in clusters:
                level = cluster["level"]
                cluster_key = str(cluster["cluster"])
                levels[level].add(cluster_key)
                results[cluster_key]["level"] = level
                results[cluster_key]["title"] = f"Cluster {cluster_key}"
                results[cluster_key]["nodes"].add(node_id)
                # Sorted tuples deduplicate undirected edges.
                results[cluster_key]["edges"].update(
                    [tuple(sorted(e)) for e in this_node_edges]
                )
                results[cluster_key]["chunk_ids"].update(
                    node_data["source_id"].split(GRAPH_FIELD_SEP)
                )
                max_num_ids = max(max_num_ids, len(results[cluster_key]["chunk_ids"]))

        ordered_levels = sorted(levels.keys())
        for i, curr_level in enumerate(ordered_levels[:-1]):
            next_level = ordered_levels[i + 1]
            this_level_comms = levels[curr_level]
            next_level_comms = levels[next_level]
            # compute the sub-communities by nodes intersection
            for comm in this_level_comms:
                results[comm]["sub_communities"] = [
                    c
                    for c in next_level_comms
                    if results[c]["nodes"].issubset(results[comm]["nodes"])
                ]

        # Convert sets to lists and normalize occurrence by the largest
        # cluster's chunk count.
        for k, v in results.items():
            v["edges"] = list(v["edges"])
            v["edges"] = [list(e) for e in v["edges"]]
            v["nodes"] = list(v["nodes"])
            v["chunk_ids"] = list(v["chunk_ids"])
            v["occurrence"] = len(v["chunk_ids"]) / max_num_ids
        return dict(results)

    def _cluster_data_to_subgraphs(self, cluster_data: dict[str, list[dict[str, str]]]):
        # Store each node's cluster memberships as a JSON string attribute so
        # they survive the GraphML round-trip.
        for node_id, clusters in cluster_data.items():
            self._graph.nodes[node_id]["clusters"] = json.dumps(clusters)

    async def _leiden_clustering(self):
        """Run hierarchical Leiden on the stabilized largest connected component."""
        from graspologic.partition import hierarchical_leiden

        graph = NetworkXStorage.stable_largest_connected_component(self._graph)
        community_mapping = hierarchical_leiden(
            graph,
            max_cluster_size=self.global_config["max_graph_cluster_size"],
            random_seed=self.global_config["graph_cluster_seed"],
        )

        node_communities: dict[str, list[dict[str, str]]] = defaultdict(list)
        __levels = defaultdict(set)
        for partition in community_mapping:
            level_key = partition.level
            cluster_id = partition.cluster
            node_communities[partition.node].append(
                {"level": level_key, "cluster": cluster_id}
            )
            __levels[level_key].add(cluster_id)
        node_communities = dict(node_communities)
        __levels = {k: len(v) for k, v in __levels.items()}
        logger.info(f"Each level has communities: {dict(__levels)}")
        self._cluster_data_to_subgraphs(node_communities)

    async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]:
        """Dispatch to a registered node-embedding algorithm (currently only node2vec)."""
        if algorithm not in self._node_embed_algorithms:
            raise ValueError(f"Node embedding algorithm {algorithm} not supported")
        return await self._node_embed_algorithms[algorithm]()

    async def _node2vec_embed(self):
        from graspologic import embed

        embeddings, nodes = embed.node2vec_embed(
            self._graph,
            **self.global_config["node2vec_params"],
        )

        # NOTE(review): assumes every node carries an "id" attribute — raises
        # KeyError otherwise; confirm upstream always sets it.
        nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes]
        return embeddings, nodes_ids
+ return embeddings, nodes_ids
nano-graphrag/nano_graphrag/_storage/kv_json.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass
3
+
4
+ from .._utils import load_json, logger, write_json
5
+ from ..base import (
6
+ BaseKVStorage,
7
+ )
8
+
9
+
10
@dataclass
class JsonKVStorage(BaseKVStorage):
    """Key-value storage kept fully in memory and persisted to one JSON file.

    The file lives in the configured working directory as
    ``kv_store_<namespace>.json``; mutations only hit disk when
    :meth:`index_done_callback` runs.
    """

    def __post_init__(self):
        work_dir = self.global_config["working_dir"]
        self._file_name = os.path.join(work_dir, f"kv_store_{self.namespace}.json")
        self._data = load_json(self._file_name) or {}
        logger.info(f"Load KV {self.namespace} with {len(self._data)} data")

    async def all_keys(self) -> list[str]:
        """Return every stored key."""
        return [key for key in self._data]

    async def index_done_callback(self):
        """Flush the in-memory store to its JSON file."""
        write_json(self._data, self._file_name)

    async def get_by_id(self, id):
        """Return the record for ``id``, or None when absent."""
        return self._data.get(id, None)

    async def get_by_ids(self, ids, fields=None):
        """Return records for ``ids`` in order; missing ids yield None.

        When ``fields`` is given, each found record is reduced to those keys.
        """
        if fields is None:
            return [self._data.get(id, None) for id in ids]
        picked = []
        for id in ids:
            record = self._data.get(id, None)
            if record:
                picked.append({k: v for k, v in record.items() if k in fields})
            else:
                picked.append(None)
        return picked

    async def filter_keys(self, data: list[str]) -> set[str]:
        """Return the subset of ``data`` not yet present in the store."""
        return {s for s in data if s not in self._data}

    async def upsert(self, data: dict[str, dict]):
        """Merge ``data`` into the store, overwriting existing keys."""
        for key, value in data.items():
            self._data[key] = value

    async def drop(self):
        """Discard all in-memory data (the file is untouched until the next flush)."""
        self._data = {}
nano-graphrag/nano_graphrag/_storage/vdb_hnswlib.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+ import pickle
6
+ import hnswlib
7
+ import numpy as np
8
+ import xxhash
9
+
10
+ from .._utils import logger
11
+ from ..base import BaseVectorStorage
12
+
13
+
14
@dataclass
class HNSWVectorStorage(BaseVectorStorage):
    """Vector storage backed by an hnswlib cosine index persisted to disk.

    Element ids are 32-bit xxhash digests of the string keys; per-element
    metadata is kept in a dict pickled next to the index file.
    """

    # Index tunables; overridable via global_config["vector_db_storage_cls_kwargs"].
    ef_construction: int = 100
    M: int = 16
    max_elements: int = 1000000
    ef_search: int = 50
    num_threads: int = -1
    _index: Any = field(init=False)
    # Keyed by the xxhash integer id (despite the str annotation inherited here).
    _metadata: dict[str, dict] = field(default_factory=dict)
    _current_elements: int = 0

    def __post_init__(self):
        self._index_file_name = os.path.join(
            self.global_config["working_dir"], f"{self.namespace}_hnsw.index"
        )
        self._metadata_file_name = os.path.join(
            self.global_config["working_dir"], f"{self.namespace}_hnsw_metadata.pkl"
        )
        self._embedding_batch_num = self.global_config.get("embedding_batch_num", 100)

        hnsw_params = self.global_config.get("vector_db_storage_cls_kwargs", {})
        self.ef_construction = hnsw_params.get("ef_construction", self.ef_construction)
        self.M = hnsw_params.get("M", self.M)
        self.max_elements = hnsw_params.get("max_elements", self.max_elements)
        self.ef_search = hnsw_params.get("ef_search", self.ef_search)
        self.num_threads = hnsw_params.get("num_threads", self.num_threads)
        self._index = hnswlib.Index(
            space="cosine", dim=self.embedding_func.embedding_dim
        )

        # Resume from disk only when both the index and its metadata exist;
        # otherwise start a fresh index.
        if os.path.exists(self._index_file_name) and os.path.exists(
            self._metadata_file_name
        ):
            self._index.load_index(
                self._index_file_name, max_elements=self.max_elements
            )
            with open(self._metadata_file_name, "rb") as f:
                self._metadata, self._current_elements = pickle.load(f)
            logger.info(
                f"Loaded existing index for {self.namespace} with {self._current_elements} elements"
            )
        else:
            self._index.init_index(
                max_elements=self.max_elements,
                ef_construction=self.ef_construction,
                M=self.M,
            )
            self._index.set_ef(self.ef_search)
            self._metadata = {}
            self._current_elements = 0
            logger.info(f"Created new index for {self.namespace}")

    async def upsert(self, data: dict[str, dict]) -> np.ndarray:
        """Embed and insert records keyed by string id.

        Returns the array of integer ids added (an empty list for empty input,
        despite the ndarray annotation).

        Raises:
            ValueError: if the insert would exceed max_elements.
        """
        logger.info(f"Inserting {len(data)} vectors to {self.namespace}")
        if not data:
            logger.warning("You insert an empty data to vector DB")
            return []

        if self._current_elements + len(data) > self.max_elements:
            raise ValueError(
                f"Cannot insert {len(data)} elements. Current: {self._current_elements}, Max: {self.max_elements}"
            )

        # Keep only declared meta fields alongside each record's id.
        list_data = [
            {
                "id": k,
                **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
            }
            for k, v in data.items()
        ]
        contents = [v["content"] for v in data.values()]
        batch_size = min(self._embedding_batch_num, len(contents))
        # Embed batches concurrently and stack into one matrix.
        embeddings = np.concatenate(
            await asyncio.gather(
                *[
                    self.embedding_func(contents[i : i + batch_size])
                    for i in range(0, len(contents), batch_size)
                ]
            )
        )

        # hnswlib needs integer labels: hash the string keys to uint32.
        # NOTE(review): 32-bit hashes can collide, silently overwriting an
        # unrelated element — confirm this risk is acceptable at this scale.
        ids = np.fromiter(
            (xxhash.xxh32_intdigest(d["id"].encode()) for d in list_data),
            dtype=np.uint32,
            count=len(list_data),
        )
        self._metadata.update(
            {
                id_int: {
                    k: v for k, v in d.items() if k in self.meta_fields or k == "id"
                }
                for id_int, d in zip(ids, list_data)
            }
        )
        self._index.add_items(data=embeddings, ids=ids, num_threads=self.num_threads)
        self._current_elements = self._index.get_current_count()
        return ids

    async def query(self, query: str, top_k: int = 5) -> list[dict]:
        """Embed the query and return up to top_k nearest records with metadata.

        Each hit carries 'distance' (cosine distance) and 'similarity' (1 - distance).
        """
        if self._current_elements == 0:
            return []

        top_k = min(top_k, self._current_elements)

        # ef must be >= k for hnswlib to return k results.
        # NOTE(review): ef is raised here but never lowered back to ef_search.
        if top_k > self.ef_search:
            logger.warning(
                f"Setting ef_search to {top_k} because top_k is larger than ef_search"
            )
            self._index.set_ef(top_k)

        embedding = await self.embedding_func([query])
        labels, distances = self._index.knn_query(
            data=embedding[0], k=top_k, num_threads=self.num_threads
        )

        return [
            {
                **self._metadata.get(label, {}),
                "distance": distance,
                "similarity": 1 - distance,
            }
            for label, distance in zip(labels[0], distances[0])
        ]

    async def index_done_callback(self):
        # Persist both the index and its metadata/count together so the two
        # files stay in sync for the next load.
        self._index.save_index(self._index_file_name)
        with open(self._metadata_file_name, "wb") as f:
            pickle.dump((self._metadata, self._current_elements), f)
nano-graphrag/nano_graphrag/_storage/vdb_nanovectordb.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from dataclasses import dataclass
4
+ import numpy as np
5
+ from nano_vectordb import NanoVectorDB
6
+
7
+ from .._utils import logger
8
+ from ..base import BaseVectorStorage
9
+
10
+
11
+ @dataclass
12
+ class NanoVectorDBStorage(BaseVectorStorage):
13
+ cosine_better_than_threshold: float = 0.2
14
+
15
+ def __post_init__(self):
16
+
17
+ self._client_file_name = os.path.join(
18
+ self.global_config["working_dir"], f"vdb_{self.namespace}.json"
19
+ )
20
+ self._max_batch_size = self.global_config["embedding_batch_num"]
21
+ self._client = NanoVectorDB(
22
+ self.embedding_func.embedding_dim, storage_file=self._client_file_name
23
+ )
24
+ self.cosine_better_than_threshold = self.global_config.get(
25
+ "query_better_than_threshold", self.cosine_better_than_threshold
26
+ )
27
+
28
+ async def upsert(self, data: dict[str, dict]):
29
+ logger.info(f"Inserting {len(data)} vectors to {self.namespace}")
30
+ if not len(data):
31
+ logger.warning("You insert an empty data to vector DB")
32
+ return []
33
+ list_data = [
34
+ {
35
+ "__id__": k,
36
+ **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
37
+ }
38
+ for k, v in data.items()
39
+ ]
40
+ contents = [v["content"] for v in data.values()]
41
+ batches = [
42
+ contents[i : i + self._max_batch_size]
43
+ for i in range(0, len(contents), self._max_batch_size)
44
+ ]
45
+ embeddings_list = await asyncio.gather(
46
+ *[self.embedding_func(batch) for batch in batches]
47
+ )
48
+ embeddings = np.concatenate(embeddings_list)
49
+ for i, d in enumerate(list_data):
50
+ d["__vector__"] = embeddings[i]
51
+ results = self._client.upsert(datas=list_data)
52
+ return results
53
+
54
+ async def query(self, query: str, top_k=5):
55
+ embedding = await self.embedding_func([query])
56
+ embedding = embedding[0]
57
+ results = self._client.query(
58
+ query=embedding,
59
+ top_k=top_k,
60
+ better_than_threshold=self.cosine_better_than_threshold,
61
+ )
62
+ results = [
63
+ {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
64
+ ]
65
+ return results
66
+
67
+ async def index_done_callback(self):
68
+ self._client.save()
nano-graphrag/nano_graphrag/_utils.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import html
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ import numbers
8
+ from dataclasses import dataclass
9
+ from functools import wraps
10
+ from hashlib import md5
11
+ from typing import Any, Union, Literal
12
+
13
+ import numpy as np
14
+ import tiktoken
15
+
16
+
17
+ from transformers import AutoTokenizer
18
+
19
+ logger = logging.getLogger("nano-graphrag")
20
+ logging.getLogger("neo4j").setLevel(logging.ERROR)
21
+
22
def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
    """Return the current event loop, creating and installing one when absent.

    Falls back to a fresh loop when get_event_loop() raises (e.g. in a
    sub-thread with no loop set).
    """
    try:
        return asyncio.get_event_loop()
    except RuntimeError:
        # No loop for this thread: create one and register it.
        logger.info("Creating a new event loop in a sub-thread.")
        fresh_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(fresh_loop)
        return fresh_loop
32
+
33
+
34
def extract_first_complete_json(s: str):
    """Extract the first complete JSON object from the string using a stack to track braces.

    Returns the parsed dict, or None when no balanced ``{...}`` exists or the
    first balanced candidate fails to parse (it does not scan further).
    """
    stack = []
    first_json_start = None

    for i, char in enumerate(s):
        if char == '{':
            stack.append(i)
            if first_json_start is None:
                first_json_start = i
        elif char == '}':
            if stack:
                start = stack.pop()
                if not stack:
                    # Stack emptied: s[first_json_start:i+1] is a balanced
                    # top-level object candidate.
                    first_json_str = s[first_json_start:i+1]
                    try:
                        # Attempt to parse the JSON string
                        # NOTE(review): stripping "\n" also removes newlines
                        # inside string values — confirm that is acceptable.
                        return json.loads(first_json_str.replace("\n", ""))
                    except json.JSONDecodeError as e:
                        logger.error(f"JSON decoding failed: {e}. Attempted string: {first_json_str[:50]}...")
                        return None
                    finally:
                        first_json_start = None
    logger.warning("No complete JSON object found in the input string.")
    return None
59
+
60
def parse_value(value: str):
    """Convert a string value to its appropriate type (int, float, bool, None, or keep as string). Work as a more broad 'eval()'"""
    text = value.strip()

    # JSON-style literals first.
    if text == "null":
        return None
    if text == "true":
        return True
    if text == "false":
        return False

    # Numeric conversion: a dot signals float, otherwise try int.
    try:
        return float(text) if '.' in text else int(text)
    except ValueError:
        # Not numeric: treat as a string, dropping surrounding quotes.
        return text.strip('"')
80
+
81
def extract_values_from_json(json_string, keys=["reasoning", "answer", "data"], allow_no_quotes=False):
    """Extract key values from a non-standard or malformed JSON string, handling nested objects.

    NOTE(review): the `keys` and `allow_no_quotes` parameters are accepted but
    never used by the body — every key the regex finds is extracted
    regardless; confirm whether filtering was intended. (`keys` is also a
    mutable default argument; harmless while unused, but worth fixing.)
    """
    extracted_values = {}

    # Enhanced pattern to match both quoted and unquoted values, as well as nested objects
    regex_pattern = r'(?P<key>"?\w+"?)\s*:\s*(?P<value>{[^}]*}|".*?"|[^,}]+)'

    for match in re.finditer(regex_pattern, json_string, re.DOTALL):
        key = match.group('key').strip('"')  # Strip quotes from key
        value = match.group('value').strip()

        # If the value is another nested JSON (starts with '{' and ends with '}'), recursively parse it
        if value.startswith('{') and value.endswith('}'):
            extracted_values[key] = extract_values_from_json(value)
        else:
            # Parse the value into the appropriate type (int, float, bool, etc.)
            extracted_values[key] = parse_value(value)

    if not extracted_values:
        logger.warning("No values could be extracted from the string.")

    return extracted_values
103
+
104
+
105
def convert_response_to_json(response: str) -> dict:
    """Convert response string to JSON, with error handling and fallback to non-standard JSON extraction."""
    # First try to locate and parse a well-formed JSON object.
    prediction_json = extract_first_complete_json(response)

    if prediction_json is None:
        logger.info("Attempting to extract values from a non-standard JSON string...")
        # Fallback: regex-based salvage of key/value pairs from malformed JSON.
        prediction_json = extract_values_from_json(response, allow_no_quotes=True)

    # May still be empty ({}) when nothing could be salvaged; callers should
    # treat a falsy result as a failed extraction.
    if not prediction_json:
        logger.error("Unable to extract meaningful data from the response.")
    else:
        logger.info("JSON data successfully extracted.")

    return prediction_json
119
+
120
+
121
+
122
+
123
class TokenizerWrapper:
    """Uniform encode/decode facade over tiktoken and HuggingFace tokenizers."""

    def __init__(self, tokenizer_type: Literal["tiktoken", "huggingface"] = "tiktoken", model_name: str = "gpt-4o"):
        self.tokenizer_type = tokenizer_type
        self.model_name = model_name
        self._tokenizer = None
        # Loads eagerly despite the "lazy" name; later calls are no-ops.
        self._lazy_load_tokenizer()

    def _lazy_load_tokenizer(self):
        """Instantiate the underlying tokenizer once; subsequent calls return immediately."""
        if self._tokenizer is not None:
            return
        logger.info(f"Loading tokenizer: type='{self.tokenizer_type}', name='{self.model_name}'")
        if self.tokenizer_type == "tiktoken":
            self._tokenizer = tiktoken.encoding_for_model(self.model_name)
        elif self.tokenizer_type == "huggingface":
            # NOTE(review): AutoTokenizer is imported unconditionally at the
            # top of this module, so this guard can never fire as written —
            # it suggests the import was meant to be optional; confirm.
            if AutoTokenizer is None:
                raise ImportError("`transformers` is not installed. Please install it via `pip install transformers` to use HuggingFace tokenizers.")
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
        else:
            raise ValueError(f"Unknown tokenizer_type: {self.tokenizer_type}")

    def get_tokenizer(self):
        """Provide access to the underlying tokenizer object for special cases (e.g. decode_batch)."""
        self._lazy_load_tokenizer()
        return self._tokenizer

    def encode(self, text: str) -> list[int]:
        """Encode text into a list of token ids."""
        self._lazy_load_tokenizer()
        return self._tokenizer.encode(text)

    def decode(self, tokens: list[int]) -> str:
        """Decode a list of token ids back into text."""
        self._lazy_load_tokenizer()
        return self._tokenizer.decode(tokens)

    # Batch decoding helper, added for efficiency and interface consistency.
    def decode_batch(self, tokens_list: list[list[int]]) -> list[str]:
        self._lazy_load_tokenizer()
        # HuggingFace tokenizers provide native batch decoding; tiktoken does
        # not, so emulate it with a list comprehension.
        if self.tokenizer_type == "tiktoken":
            return [self._tokenizer.decode(tokens) for tokens in tokens_list]
        elif self.tokenizer_type == "huggingface":
            return self._tokenizer.batch_decode(tokens_list, skip_special_tokens=True)
        else:
            raise ValueError(f"Unknown tokenizer_type: {self.tokenizer_type}")
+ raise ValueError(f"Unknown tokenizer_type: {self.tokenizer_type}")
166
+
167
+
168
+
169
def truncate_list_by_token_size(
    list_data: list,
    key: callable,
    max_token_size: int,
    tokenizer_wrapper: TokenizerWrapper,
):
    """Return the longest prefix of *list_data* whose total token count stays
    within *max_token_size*.

    Each item contributes ``len(encode(key(item))) + 1`` tokens; the extra
    token defensively models joining the items with a newline separator.
    """
    if max_token_size <= 0:
        return []
    used = 0
    for idx, item in enumerate(list_data):
        used += len(tokenizer_wrapper.encode(key(item))) + 1
        if used > max_token_size:
            # Exclude the item that pushed us over the budget.
            return list_data[:idx]
    return list_data
184
+
185
+
186
def compute_mdhash_id(content, prefix: str = ""):
    """Return a deterministic id for *content*: optional prefix + MD5 hex digest."""
    digest = md5(content.encode()).hexdigest()
    return f"{prefix}{digest}"
188
+
189
+
190
def write_json(json_obj, file_name):
    """Serialize *json_obj* to *file_name* as pretty-printed UTF-8 JSON."""
    with open(file_name, "w", encoding="utf-8") as fp:
        json.dump(json_obj, fp, indent=2, ensure_ascii=False)
193
+
194
+
195
def load_json(file_name):
    """Read a UTF-8 JSON file and return its contents, or None if the file is missing."""
    if not os.path.exists(file_name):
        return None
    with open(file_name, encoding="utf-8") as fp:
        return json.load(fp)
200
+
201
+
202
+ # it's dirty to type, so it's a good way to have fun
203
def pack_user_ass_to_openai_messages(prompt: str, generated_content: str, using_amazon_bedrock: bool):
    """Build a two-message (user, assistant) history in the target API's shape.

    Amazon Bedrock expects each message content to be a list of text parts,
    while the OpenAI chat API takes plain strings.
    """
    if using_amazon_bedrock:
        user_content = [{"text": prompt}]
        assistant_content = [{"text": generated_content}]
    else:
        user_content = prompt
        assistant_content = generated_content
    return [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content},
    ]
214
+
215
+
216
def is_float_regex(value):
    """Return True when *value* looks like a (signed) decimal number string."""
    # Keep `re.match` with `$` (not fullmatch) to preserve the original
    # tolerance for a trailing newline.
    return re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value) is not None
218
+
219
+
220
def compute_args_hash(*args):
    """Return the MD5 hex digest of the stringified positional-argument tuple."""
    serialized = str(args).encode()
    return md5(serialized).hexdigest()
222
+
223
+
224
def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]:
    """Split *content* on any of the given markers, dropping empty pieces."""
    if not markers:
        return [content]
    pattern = "|".join(re.escape(marker) for marker in markers)
    stripped = (piece.strip() for piece in re.split(pattern, content))
    return [piece for piece in stripped if piece]
230
+
231
+
232
def enclose_string_with_quotes(content: Any) -> str:
    """Render *content* for CSV-style output: numbers bare, everything else double-quoted."""
    if isinstance(content, numbers.Number):
        return str(content)
    # Drop surrounding whitespace and any pre-existing quotes before re-quoting.
    text = str(content).strip().strip("'").strip('"')
    return f'"{text}"'
239
+
240
+
241
def list_of_list_to_csv(data: list[list]):
    """Render rows of values as a pseudo-CSV string with ',\\t' field separators."""
    rows = [
        ",\t".join(enclose_string_with_quotes(cell) for cell in row)
        for row in data
    ]
    return "\n".join(rows)
248
+
249
+
250
+ # -----------------------------------------------------------------------------------
251
+ # Refer the utils functions of the official GraphRAG implementation:
252
+ # https://github.com/microsoft/graphrag
253
# -----------------------------------------------------------------------------------
# Refer the utils functions of the official GraphRAG implementation:
# https://github.com/microsoft/graphrag
def clean_str(input: Any) -> str:
    """Clean a string: unescape HTML entities and drop control characters.

    Non-string inputs are returned unchanged.
    """
    if not isinstance(input, str):
        return input

    unescaped = html.unescape(input.strip())
    # Strip ASCII control characters plus DEL and the C1 control range.
    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python
    return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", unescaped)
262
+
263
+
264
+ # Utils types -----------------------------------------------------------------------
265
@dataclass
class EmbeddingFunc:
    """An awaitable embedding function bundled with its model limits.

    Instances are called like the wrapped async function and return whatever
    it produces (expected to be an ``np.ndarray`` of embeddings).
    """

    # Dimensionality of the vectors produced by `func`.
    embedding_dim: int
    # Maximum number of tokens `func` accepts per input.
    max_token_size: int
    # The wrapped async callable that actually computes embeddings.
    func: callable

    async def __call__(self, *args, **kwargs) -> np.ndarray:
        """Delegate directly to the wrapped async embedding function."""
        return await self.func(*args, **kwargs)
273
+
274
+
275
+ # Decorators ------------------------------------------------------------------------
276
# Decorators ------------------------------------------------------------------------
def limit_async_func_call(max_size: int, waitting_time: float = 0.0001):
    """Decorator factory that caps the number of concurrent calls to an async func.

    Args:
        max_size: Maximum number of in-flight calls allowed at once.
        waitting_time: Poll interval (seconds) while waiting for a free slot.
            (Parameter name kept misspelled for backward compatibility.)
    """

    def final_decro(func):
        """Busy-wait on a plain counter instead of asyncio.Semaphore to avoid nest-asyncio."""
        __current_size = 0

        @wraps(func)
        async def wait_func(*args, **kwargs):
            nonlocal __current_size
            while __current_size >= max_size:
                await asyncio.sleep(waitting_time)
            __current_size += 1
            try:
                return await func(*args, **kwargs)
            finally:
                # BUGFIX: always release the slot, even when func raises;
                # previously a failing call never decremented the counter,
                # permanently consuming capacity and eventually deadlocking.
                __current_size -= 1

        return wait_func

    return final_decro
296
+
297
+
298
def wrap_embedding_func_with_attrs(**kwargs):
    """Decorator factory that wraps an async function into an EmbeddingFunc.

    The keyword arguments (e.g. ``embedding_dim``, ``max_token_size``) become
    the resulting EmbeddingFunc's attributes.
    """

    def final_decro(func) -> EmbeddingFunc:
        return EmbeddingFunc(func=func, **kwargs)

    return final_decro
nano-graphrag/nano_graphrag/base.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from typing import TypedDict, Union, Literal, Generic, TypeVar, List
3
+
4
+ import numpy as np
5
+
6
+ from ._utils import EmbeddingFunc
7
+
8
+
9
@dataclass
class QueryParam:
    """Tuning knobs for a GraphRAG query.

    The token budgets roughly split a 12k context window between text units,
    local graph context and community reports (see the inline ratios).
    """

    mode: Literal["local", "global", "naive"] = "global"
    # Return only the assembled context instead of asking the LLM.
    only_need_context: bool = False
    response_type: str = "Multiple Paragraphs"
    # Community hierarchy level to consider.
    level: int = 2
    top_k: int = 20
    # naive search
    # BUGFIX: annotation added so this is a real dataclass field; it was a
    # plain class attribute before (unannotated), so it could not be set via
    # the constructor like its siblings.
    naive_max_token_for_text_unit: int = 12000
    # local search
    local_max_token_for_text_unit: int = 4000  # 12000 * 0.33
    local_max_token_for_local_context: int = 4800  # 12000 * 0.4
    local_max_token_for_community_report: int = 3200  # 12000 * 0.27
    local_community_single_one: bool = False
    # global search
    global_min_community_rating: float = 0
    # NOTE(review): used as a count but annotated float; kept as-is for
    # backward compatibility.
    global_max_consider_community: float = 512
    global_max_token_for_community_report: int = 16384
    global_special_community_map_llm_kwargs: dict = field(
        default_factory=lambda: {"response_format": {"type": "json_object"}}
    )
30
+
31
+
32
class TextChunkSchema(TypedDict):
    """A chunk of a source document plus its bookkeeping metadata."""

    tokens: int
    content: str
    full_doc_id: str
    chunk_order_index: int
36
+
37
# A community detected in the entity graph, before report generation.
SingleCommunitySchema = TypedDict(
    "SingleCommunitySchema",
    {
        "level": int,
        "title": str,
        # Each edge is a [source, target] node-id pair.
        # BUGFIX: was `list[list[str, str]]` — builtin `list` takes a single
        # type parameter; multiple arguments are invalid typing.
        "edges": list[list[str]],
        "nodes": list[str],
        "chunk_ids": list[str],
        "occurrence": float,
        "sub_communities": list[str],
    },
)
49
+
50
+
51
class CommunitySchema(SingleCommunitySchema):
    """A community enriched with its generated report."""

    # Rendered text form of the community report.
    report_string: str
    # Structured (JSON) form of the community report.
    report_json: dict
54
+
55
+
56
# Generic value type stored by BaseKVStorage.
T = TypeVar("T")
57
+
58
+
59
@dataclass
class StorageNameSpace:
    """Base class for all storage backends: a namespace plus the global config dict."""

    # Logical name of this storage instance.
    namespace: str
    # The GraphRAG-wide configuration dictionary.
    global_config: dict

    async def index_start_callback(self):
        """Hook invoked when indexing starts (no-op by default)."""
        pass

    async def index_done_callback(self):
        """commit the storage operations after indexing"""
        pass

    async def query_done_callback(self):
        """commit the storage operations after querying"""
        pass
75
+
76
+
77
@dataclass
class BaseVectorStorage(StorageNameSpace):
    """Abstract interface for vector storages used for similarity search."""

    # Async embedding function used to vectorize texts.
    embedding_func: EmbeddingFunc
    # Names of extra metadata fields implementations should persist
    # alongside each entry (presumably copied from the upserted values —
    # TODO confirm against concrete implementations).
    meta_fields: set = field(default_factory=set)

    async def query(self, query: str, top_k: int) -> list[dict]:
        """Return the top_k most similar stored entries for *query*."""
        raise NotImplementedError

    async def upsert(self, data: dict[str, dict]):
        """Use 'content' field from value for embedding, use key as id.
        If embedding_func is None, use 'embedding' field from value
        """
        raise NotImplementedError
90
+
91
+
92
@dataclass
class BaseKVStorage(Generic[T], StorageNameSpace):
    """Abstract key-value storage holding values of type T."""

    async def all_keys(self) -> list[str]:
        """Return every key currently stored."""
        raise NotImplementedError

    async def get_by_id(self, id: str) -> Union[T, None]:
        """Return the value stored under *id*, or None when absent."""
        raise NotImplementedError

    async def get_by_ids(
        self, ids: list[str], fields: Union[set[str], None] = None
    ) -> list[Union[T, None]]:
        """Return the values for *ids* in order (None for misses),
        optionally restricted to the given *fields*."""
        raise NotImplementedError

    async def filter_keys(self, data: list[str]) -> set[str]:
        """Return the subset of *data* keys that do NOT yet exist in storage."""
        raise NotImplementedError

    async def upsert(self, data: dict[str, T]):
        """Insert or update the given key/value pairs."""
        raise NotImplementedError

    async def drop(self):
        """Delete all data held by this storage namespace."""
        raise NotImplementedError
114
+
115
+
116
@dataclass
class BaseGraphStorage(StorageNameSpace):
    """Abstract interface for the knowledge-graph storage backend.

    Implementations provide node/edge CRUD (single and batch), clustering
    into communities, and (optionally) node embedding.
    """

    async def has_node(self, node_id: str) -> bool:
        raise NotImplementedError

    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
        raise NotImplementedError

    async def node_degree(self, node_id: str) -> int:
        raise NotImplementedError

    # BUGFIX: return annotation was `List[str]`; degrees are integers, as
    # established by `node_degree` and `edge_degrees_batch`.
    async def node_degrees_batch(self, node_ids: List[str]) -> List[int]:
        raise NotImplementedError

    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
        raise NotImplementedError

    async def edge_degrees_batch(self, edge_pairs: list[tuple[str, str]]) -> list[int]:
        raise NotImplementedError

    async def get_node(self, node_id: str) -> Union[dict, None]:
        raise NotImplementedError

    async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, Union[dict, None]]:
        raise NotImplementedError

    async def get_edge(
        self, source_node_id: str, target_node_id: str
    ) -> Union[dict, None]:
        raise NotImplementedError

    async def get_edges_batch(
        self, edge_pairs: list[tuple[str, str]]
    ) -> list[Union[dict, None]]:
        raise NotImplementedError

    async def get_node_edges(
        self, source_node_id: str
    ) -> Union[list[tuple[str, str]], None]:
        raise NotImplementedError

    async def get_nodes_edges_batch(
        self, node_ids: list[str]
    ) -> list[list[tuple[str, str]]]:
        raise NotImplementedError

    async def upsert_node(self, node_id: str, node_data: dict[str, str]):
        raise NotImplementedError

    async def upsert_nodes_batch(self, nodes_data: list[tuple[str, dict[str, str]]]):
        raise NotImplementedError

    async def upsert_edge(
        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
    ):
        raise NotImplementedError

    async def upsert_edges_batch(
        self, edges_data: list[tuple[str, str, dict[str, str]]]
    ):
        raise NotImplementedError

    async def clustering(self, algorithm: str):
        """Run the named community-clustering algorithm over the graph."""
        raise NotImplementedError

    async def community_schema(self) -> dict[str, SingleCommunitySchema]:
        """Return the community representation with report and nodes"""
        raise NotImplementedError

    async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]:
        raise NotImplementedError("Node embedding is not used in nano-graphrag.")
nano-graphrag/nano_graphrag/entity_extraction/__init__.py ADDED
File without changes
nano-graphrag/nano_graphrag/entity_extraction/extract.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+ import pickle
3
+ import asyncio
4
+ from openai import BadRequestError
5
+ from collections import defaultdict
6
+ import dspy
7
+ from nano_graphrag.base import (
8
+ BaseGraphStorage,
9
+ BaseVectorStorage,
10
+ TextChunkSchema,
11
+ )
12
+ from nano_graphrag.prompt import PROMPTS
13
+ from nano_graphrag._utils import logger, compute_mdhash_id
14
+ from nano_graphrag.entity_extraction.module import TypedEntityRelationshipExtractor
15
+ from nano_graphrag._op import _merge_edges_then_upsert, _merge_nodes_then_upsert
16
+
17
+
18
async def generate_dataset(
    chunks: dict[str, TextChunkSchema],
    filepath: str,
    save_dataset: bool = True,
    global_config: dict = None,
) -> list[dspy.Example]:
    """Run dspy entity/relationship extraction over *chunks* and build a dataset.

    Args:
        chunks: Mapping of chunk id -> chunk payload (the 'content' field is used).
        filepath: Where to pickle the dataset when *save_dataset* is True.
        save_dataset: Persist the filtered examples to *filepath*.
        global_config: Optional config; may point at a compiled dspy module.

    Returns:
        Examples that produced at least one entity AND one relationship.
    """
    # BUGFIX: the default used to be a shared mutable dict (`{}`).
    if global_config is None:
        global_config = {}

    entity_extractor = TypedEntityRelationshipExtractor(num_refine_turns=1, self_refine=True)

    if global_config.get("use_compiled_dspy_entity_relationship", False):
        entity_extractor.load(global_config["entity_relationship_module_path"])

    ordered_chunks = list(chunks.items())
    # Progress counters shared by the per-chunk workers below.
    already_processed = 0
    already_entities = 0
    already_relations = 0

    async def _process_single_content(
        chunk_key_dp: tuple[str, TextChunkSchema]
    ) -> dspy.Example:
        nonlocal already_processed, already_entities, already_relations
        chunk_dp = chunk_key_dp[1]
        content = chunk_dp["content"]
        try:
            # The extractor is synchronous; run it off the event loop thread.
            prediction = await asyncio.to_thread(entity_extractor, input_text=content)
            entities, relationships = prediction.entities, prediction.relationships
        except BadRequestError as e:
            logger.error(f"Error in TypedEntityRelationshipExtractor: {e}")
            entities, relationships = [], []
        example = dspy.Example(
            input_text=content, entities=entities, relationships=relationships
        ).with_inputs("input_text")
        already_entities += len(entities)
        already_relations += len(relationships)
        already_processed += 1
        now_ticks = PROMPTS["process_tickers"][
            already_processed % len(PROMPTS["process_tickers"])
        ]
        print(
            f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
            end="",
            flush=True,
        )
        return example

    examples = await asyncio.gather(
        *[_process_single_content(c) for c in ordered_chunks]
    )
    # Drop chunks where extraction produced nothing usable.
    filtered_examples = [
        example
        for example in examples
        if len(example.entities) > 0 and len(example.relationships) > 0
    ]
    num_filtered_examples = len(examples) - len(filtered_examples)
    if save_dataset:
        with open(filepath, "wb") as f:
            pickle.dump(filtered_examples, f)
        if filtered_examples:
            logger.info(
                f"Saved {len(filtered_examples)} examples with keys: {filtered_examples[0].keys()}, filtered {num_filtered_examples} examples"
            )
        else:
            # BUGFIX: the log line above indexed filtered_examples[0] and
            # raised IndexError whenever every example was filtered out.
            logger.warning(
                f"Saved 0 examples, filtered {num_filtered_examples} examples"
            )

    return filtered_examples
79
+
80
+
81
async def extract_entities_dspy(
    chunks: dict[str, TextChunkSchema],
    knwoledge_graph_inst: BaseGraphStorage,
    entity_vdb: BaseVectorStorage,
    global_config: dict,
) -> Union[BaseGraphStorage, None]:
    """Extract entities/relationships from chunks via dspy and upsert them.

    Args:
        chunks: Mapping of chunk id -> chunk payload ('content' is extracted from).
        knwoledge_graph_inst: Graph storage that nodes/edges are merged into.
            (NOTE(review): parameter name is misspelled but kept — it is the
            public interface.)
        entity_vdb: Vector storage for entity embeddings; skipped when None.
        global_config: May reference a compiled dspy entity-relationship module.

    Returns:
        The graph storage on success, or None when no entities were extracted.
    """
    entity_extractor = TypedEntityRelationshipExtractor(num_refine_turns=1, self_refine=True)

    if global_config.get("use_compiled_dspy_entity_relationship", False):
        entity_extractor.load(global_config["entity_relationship_module_path"])

    ordered_chunks = list(chunks.items())
    # Progress counters shared by the per-chunk workers below.
    already_processed = 0
    already_entities = 0
    already_relations = 0

    async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
        nonlocal already_processed, already_entities, already_relations
        chunk_key = chunk_key_dp[0]
        chunk_dp = chunk_key_dp[1]
        content = chunk_dp["content"]
        try:
            # The dspy extractor is synchronous; run it in a worker thread.
            prediction = await asyncio.to_thread(entity_extractor, input_text=content)
            entities, relationships = prediction.entities, prediction.relationships
        except BadRequestError as e:
            logger.error(f"Error in TypedEntityRelationshipExtractor: {e}")
            entities, relationships = [], []

        # Group candidates by node name / (src, tgt) pair; duplicates from
        # different chunks are merged later.
        maybe_nodes = defaultdict(list)
        maybe_edges = defaultdict(list)

        for entity in entities:
            entity["source_id"] = chunk_key  # remember which chunk produced it
            maybe_nodes[entity["entity_name"]].append(entity)
            already_entities += 1

        for relationship in relationships:
            relationship["source_id"] = chunk_key
            maybe_edges[(relationship["src_id"], relationship["tgt_id"])].append(
                relationship
            )
            already_relations += 1

        already_processed += 1
        now_ticks = PROMPTS["process_tickers"][
            already_processed % len(PROMPTS["process_tickers"])
        ]
        print(
            f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
            end="",
            flush=True,
        )
        return dict(maybe_nodes), dict(maybe_edges)

    results = await asyncio.gather(
        *[_process_single_content(c) for c in ordered_chunks]
    )
    print()  # newline after the in-place '\r' progress ticker
    # Merge the per-chunk candidate maps into global ones.
    maybe_nodes = defaultdict(list)
    maybe_edges = defaultdict(list)
    for m_nodes, m_edges in results:
        for k, v in m_nodes.items():
            maybe_nodes[k].extend(v)
        for k, v in m_edges.items():
            maybe_edges[k].extend(v)
    # Merge duplicate nodes/edges and persist them in the graph storage.
    all_entities_data = await asyncio.gather(
        *[
            _merge_nodes_then_upsert(k, v, knwoledge_graph_inst, global_config)
            for k, v in maybe_nodes.items()
        ]
    )
    await asyncio.gather(
        *[
            _merge_edges_then_upsert(k[0], k[1], v, knwoledge_graph_inst, global_config)
            for k, v in maybe_edges.items()
        ]
    )
    if not len(all_entities_data):
        logger.warning("Didn't extract any entities, maybe your LLM is not working")
        return None
    if entity_vdb is not None:
        # Index each entity in the vector store; the embedded text is the
        # entity name concatenated with its merged description.
        data_for_vdb = {
            compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
                "content": dp["entity_name"] + dp["description"],
                "entity_name": dp["entity_name"],
            }
            for dp in all_entities_data
        }
        await entity_vdb.upsert(data_for_vdb)

    return knwoledge_graph_inst
nano-graphrag/nano_graphrag/entity_extraction/metric.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dspy
2
+ from nano_graphrag.entity_extraction.module import Relationship
3
+
4
+
5
# NOTE: in dspy, a Signature's docstring is the instruction prompt sent to the
# LLM, and the field `desc` strings are part of the prompt too — do not edit
# them casually; changing them changes runtime behavior.
class AssessRelationships(dspy.Signature):
    """
    Assess the similarity between gold and predicted relationships:
    1. Match relationships based on src_id and tgt_id pairs, allowing for slight variations in entity names.
    2. For matched pairs, compare:
       a) Description similarity (semantic meaning)
       b) Weight similarity
       c) Order similarity
    3. Consider unmatched relationships as penalties.
    4. Aggregate scores, accounting for precision and recall.
    5. Return a final similarity score between 0 (no similarity) and 1 (perfect match).

    Key considerations:
    - Prioritize matching based on entity pairs over exact string matches.
    - Use semantic similarity for descriptions rather than exact matches.
    - Weight the importance of different aspects (e.g., entity matching, description, weight, order).
    - Balance the impact of matched and unmatched relationships in the final score.
    """

    gold_relationships: list[Relationship] = dspy.InputField(
        desc="The gold-standard relationships to compare against."
    )
    predicted_relationships: list[Relationship] = dspy.InputField(
        desc="The predicted relationships to compare against the gold-standard relationships."
    )
    similarity_score: float = dspy.OutputField(
        desc="Similarity score between 0 and 1, with 1 being the highest similarity."
    )
33
+
34
+
35
def relationships_similarity_metric(
    gold: dspy.Example, pred: dspy.Prediction, trace=None
) -> float:
    """LLM-judged similarity (0..1) between gold and predicted relationships.

    The `trace` parameter is unused but required by the dspy metric protocol.
    """
    judge = dspy.ChainOfThought(AssessRelationships)
    gold_rels = [Relationship(**item) for item in gold["relationships"]]
    pred_rels = [Relationship(**item) for item in pred["relationships"]]
    verdict = judge(
        gold_relationships=gold_rels,
        predicted_relationships=pred_rels,
    )
    return float(verdict.similarity_score)
48
+
49
+
50
def entity_recall_metric(
    gold: dspy.Example, pred: dspy.Prediction, trace=None
) -> float:
    """Recall of predicted entity names against the gold entity names.

    The `trace` parameter is unused but required by the dspy metric protocol.
    """
    gold_names = {item["entity_name"] for item in gold["entities"]}
    pred_names = {item["entity_name"] for item in pred["entities"]}
    hits = len(gold_names & pred_names)
    misses = len(gold_names - pred_names)
    total = hits + misses
    # An empty gold set yields 0 rather than dividing by zero.
    return hits / total if total > 0 else 0