diff --git a/nano-graphrag b/nano-graphrag deleted file mode 160000 index 01f429e8c562e8f19b2449f90cec9a4a67d4f6ee..0000000000000000000000000000000000000000 --- a/nano-graphrag +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 01f429e8c562e8f19b2449f90cec9a4a67d4f6ee diff --git a/nano-graphrag/.coveragerc b/nano-graphrag/.coveragerc new file mode 100644 index 0000000000000000000000000000000000000000..177da7eb2bb8eff8fe0f1bd7bc075b444003bb30 --- /dev/null +++ b/nano-graphrag/.coveragerc @@ -0,0 +1,11 @@ +[report] +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain if tests don't hit defensive assertion code: + raise NotImplementedError + logger. +omit = + # Don't have a nice github action for neo4j now, so skip this file: + nano_graphrag/_storage/gdb_neo4j.py \ No newline at end of file diff --git a/nano-graphrag/.env.example.azure b/nano-graphrag/.env.example.azure new file mode 100644 index 0000000000000000000000000000000000000000..c5b0269f6baa12bc2c8afd7c9e80201a875cdf34 --- /dev/null +++ b/nano-graphrag/.env.example.azure @@ -0,0 +1,7 @@ +API_KEY_EMB="" +AZURE_ENDPOINT_EMB="" +API_VERSION_EMB="" + +AZURE_OPENAI_API_KEY="" +AZURE_OPENAI_ENDPOINT="" +OPENAI_API_VERSION="" \ No newline at end of file diff --git a/nano-graphrag/.github/workflows/test.yml b/nano-graphrag/.github/workflows/test.yml new file mode 100644 index 0000000000000000000000000000000000000000..e975b44029449c8fef7203b3327888f2446e25ad --- /dev/null +++ b/nano-graphrag/.github/workflows/test.yml @@ -0,0 +1,58 @@ +name: test + +on: + push: + branches: + - main + - dev + paths-ignore: + - '**/*.md' + - '**/*.ipynb' + - 'examples/**' + pull_request: + branches: + - main + - dev + paths-ignore: + - '**/*.md' + - '**/*.ipynb' + - 'examples/**' + +jobs: + test: + name: Tests on ${{ matrix.os }} for ${{ matrix.python-version }} + strategy: + matrix: + python-version: [3.9] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + timeout-minutes: 10 + steps: + - 
uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-dev.txt + - name: Lint with flake8 + run: | + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + - name: Build and Test + env: + NANO_GRAPHRAG_TEST_IGNORE_NEO4J: true + run: | + python -m pytest -o log_cli=true -o log_cli_level="INFO" --cov=nano_graphrag --cov-report=xml -v ./ + - name: Check codecov file + id: check_files + uses: andstor/file-existence-action@v1 + with: + files: './coverage.xml' + - name: Upload coverage from test to Codecov + uses: codecov/codecov-action@v2 + with: + file: ./coverage.xml + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/nano-graphrag/.gitignore b/nano-graphrag/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..801875a5b59261e1fcfcdb9077fb87b673b1144a --- /dev/null +++ b/nano-graphrag/.gitignore @@ -0,0 +1,183 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python +# Edit at https://www.toptal.com/developers/gitignore?templates=python +test_cache.json +run_test*.py +nano_graphrag_cache*/ +*.txt +examples/benchmarks/fixtures/ +tests/original_workflow.txt +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +.vscode +.DS_Store +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +# End of https://www.toptal.com/developers/gitignore/api/python + diff --git a/nano-graphrag/LICENSE b/nano-graphrag/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..61fc28bec13fc4385cde55639bc8ae7fea2c098b --- /dev/null +++ b/nano-graphrag/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Gustavo Ye + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/nano-graphrag/MANIFEST.in b/nano-graphrag/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..1a69f4c1edc94aeafc47d0dc9aecccd07fa2f341 --- /dev/null +++ b/nano-graphrag/MANIFEST.in @@ -0,0 +1 @@ +include readme.md diff --git a/nano-graphrag/docs/CONTRIBUTING.md b/nano-graphrag/docs/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..e43906cefcdde31d0eb48bf6e81f9ae8afe79cff --- /dev/null +++ b/nano-graphrag/docs/CONTRIBUTING.md @@ -0,0 +1,19 @@ +# Contributing to nano-graphrag + +### Submit your Contribution through PR + +To make a contribution, follow these steps: + +1. Fork and clone this repository +3. If you modified the core code (`./nano_graphrag`), please add tests for it +4. **Include proper documentation / docstring or examples** +5. Ensure that all tests pass by running `pytest` +6. Submit a pull request + +For more details about pull requests, please read [GitHub's guides](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request). + + + +### Only add a dependency when we have to + +`nano-graphrag` needs to be `nano` and `light`. If we want to add more features, we add them smartly. Don't introduce a huge dependency just for a simple function. 
\ No newline at end of file diff --git a/nano-graphrag/docs/FAQ.md b/nano-graphrag/docs/FAQ.md new file mode 100644 index 0000000000000000000000000000000000000000..d570f8c401b49947bf073ce2ef45f03ce1359ffe --- /dev/null +++ b/nano-graphrag/docs/FAQ.md @@ -0,0 +1,41 @@ +### `Leiden.EmptyNetworkError:EmptyNetworkError` + +This error is caused by `nano-graphrag` tries to compute communities on an empty network. In most cases, this is caused by the LLM model you're using, it fails to extract any entities or relations, so the graph is empty. + +Try to use another bigger LLM, or here are some ideas to fix it: + + - Check the response from the LLM, make sure the result fits the desired response format of the extracting entities prompt. + + The desired response format is something like that: + + ```text + ("entity"<|>"Cruz"<|>"person"<|>"Cruz is associated with a vision of control and order, influencing the dynamics among other characters.") + ``` + + - Some LLMs may not return the format like above, so one possible solution is to add a system instruction to the input prompt, such like: + ```json + { + "role": "system", + "content": "You are an intelligent assistant and will follow the instructions given to you to fulfill the goal. The answer should be in the format as in the given example." + } + ``` + You can use this system_prompt as default for your LLM calling funcation + + +### One possible reason of 'Processed 42 chunks,0 entities(duplicated),0 relations(duplicated)WARNING:nano-graphrag:Didn't extract any entities, maybe your LLM is not working WARNING:nano-graphrag:No new entities found' + +The default num_ctx of ollama is 2048 which is too small for the input prompt of entity extraction. This causes the model to fail to respond correctly. + +Solution: + Each model in Ollama has a configuration file. Here, you need to generate a new configuration file based on the original one, and then use this configuration file to generate a new model. 
+ For example the qwen2, run the following command: + + `ollama show --modelfile qwen2 > Modelfile` + + Add a new line into this file below the 'FROM': + + `PARAMETER num_ctx 32000` + + `ollama create -f Modelfile qwen2:ctx32k` + + Afterwards, you can use qwen2:ctx32k to replace qwen2. diff --git a/nano-graphrag/docs/ROADMAP.md b/nano-graphrag/docs/ROADMAP.md new file mode 100644 index 0000000000000000000000000000000000000000..9455d32ac74f8f77a8af088167963e9755cdb267 --- /dev/null +++ b/nano-graphrag/docs/ROADMAP.md @@ -0,0 +1,25 @@ +## Next Version + +- [ ] Add DSpy for prompt-tuning to make small models(Qwen2 7B, Llama 3.1 8B...) can extract entities. @NumberChiffre @gusye1234 +- [ ] Optimize Algorithm: add `global_local` query method, globally rewrite query then perform local search. + + + +## In next few versions + +- [ ] Add rate limiter: support token limit (tokens per second, per minute) + +- [ ] Add other advanced RAG algorithms, candidates: + + - [ ] [HybridRAG](https://arxiv.org/abs/2408.04948) + - [ ] [HippoRAG](https://arxiv.org/abs/2405.14831) + + + + + + +## Interesting directions + +- [ ] Add [Sciphi Triplex](https://huggingface.co/SciPhi/Triplex) as the entity extraction model. 
+- [ ] Add new components, see [issue](https://github.com/gusye1234/nano-graphrag/issues/2) \ No newline at end of file diff --git a/nano-graphrag/docs/benchmark-dspy-entity-extraction.md b/nano-graphrag/docs/benchmark-dspy-entity-extraction.md new file mode 100644 index 0000000000000000000000000000000000000000..7c2aa51e9328efc8416d3c4bb840b77913f17c55 --- /dev/null +++ b/nano-graphrag/docs/benchmark-dspy-entity-extraction.md @@ -0,0 +1,276 @@ +# Chain Of Thought Prompting with DSPy-AI (v2.4.16) +## Main Takeaways +- Time difference: 156.99 seconds +- Execution time with DSPy-AI: 304.38 seconds +- Execution time without DSPy-AI: 147.39 seconds +- Entities extracted: 22 (without DSPy-AI) vs 37 (with DSPy-AI) +- Relationships extracted: 21 (without DSPy-AI) vs 36 (with DSPy-AI) + + +## Results +```markdown +> python examples/benchmarks/dspy_entity.py + +Running benchmark with DSPy-AI: +INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK" +INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK" +INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK" +DEBUG:nano-graphrag:Entities: 14 | Missed Entities: 23 | Total Entities: 37 +DEBUG:nano-graphrag:Relationships: 13 | Missed Relationships: 23 | Total Relationships: 36 +DEBUG:nano-graphrag:Direct Relationships: 31 | Second-order: 5 | Third-order: 0 | Total Relationships: 36 +⠙ Processed 1 chunks, 37 entities(duplicated), 36 relations(duplicated) +Execution time with DSPy-AI: 304.38 seconds + +Entities: +- 朱元璋 (PERSON): + 明朝开国皇帝,原名朱重八,后改名朱元璋。他出身贫农,经历了从放牛娃到皇帝的传奇人生。在元朝末年,他参加了红巾军起义,最终推翻元朝,建立了明朝。 +- 朱五四 (PERSON): + 朱元璋的父亲,农民出身,家境贫寒。他在朱元璋幼年时去世,对朱元璋的成长和人生选择产生了深远影响。 +- 陈氏 (PERSON): + 朱元璋的母亲,农民出身,家境贫寒。她在朱元璋幼年时去世,对朱元璋的成长和人生选择产生了深远影响。 +- 汤和 (PERSON): + 朱元璋的幼年朋友,后来成为朱元璋起义军中的重要将领。他在朱元璋早期的发展中起到了关键作用。 +- 郭子兴 (PERSON): + 红巾军起义的领导人之一,朱元璋的岳父。他在朱元璋早期的发展中起到了重要作用,但后来与朱元璋产生了矛盾。 +- 马姑娘 (PERSON): + 
郭子兴的义女,朱元璋的妻子。她在朱元璋最困难的时候给予了极大的支持,是朱元璋成功的重要因素之一。 +- 元朝 (ORGANIZATION): + 中国历史上的一个朝代,由蒙古族建立。元朝末年,社会矛盾激化,最终导致了红巾军起义和明朝的建立。 +- 红巾军 (ORGANIZATION): + 元朝末年起义军的一支,主要由农民组成。朱元璋最初加入的就是红巾军,并在其中逐渐崭露头角。 +- 皇觉寺 (LOCATION): + 朱元璋早年出家的地方,位于安徽凤阳。他在寺庙中度过了几年的时光,这段经历对他的人生观和价值观产生了深远影响。 +- 濠州 (LOCATION): + 朱元璋早期活动的重要地点,也是红巾军的重要据点之一。朱元璋在这里经历了许多重要事件,包括与郭子兴的矛盾和最终的离开。 +- 1328年 (DATE): + 朱元璋出生的年份。这一年标志着明朝开国皇帝传奇人生的开始。 +- 1344年 (DATE): + 朱元璋家庭遭遇重大变故的年份,他的父母在这一年相继去世。这一事件对朱元璋的人生选择产生了深远影响。 +- 1352年 (DATE): + 朱元璋正式加入红巾军起义的年份。这一年标志着朱元璋从农民到起义军领袖的转变。 +- 1368年 (DATE): + 朱元璋推翻元朝,建立明朝的年份。这一年标志着朱元璋从起义军领袖到皇帝的转变。 +- 朱百六 (PERSON): + 朱元璋的高祖,名字具有元朝时期老百姓命名的特点,即以数字命名。 +- 朱四九 (PERSON): + 朱元璋的曾祖,名字同样具有元朝时期老百姓命名的特点,即以数字命名。 +- 朱初一 (PERSON): + 朱元璋的祖父,名字具有元朝时期老百姓命名的特点,即以数字命名。 +- 刘德 (PERSON): + 朱元璋早年为其放牛的地主,对朱元璋的童年生活有重要影响。 +- 韩山童 (PERSON): + 红巾军起义的早期领导人之一,与刘福通共同起义,对朱元璋的起义选择有间接影响。 +- 刘福通 (PERSON): + 红巾军起义的早期领导人之一,与韩山童共同起义,对朱元璋的起义选择有间接影响。 +- 脱脱 (PERSON): + 元朝末年的著名宰相,主张治理黄河,但他的政策间接导致了红巾军起义的爆发。 +- 元顺帝 (PERSON): + 元朝末代皇帝,他在位期间元朝社会矛盾激化,最终导致了红巾军起义和明朝的建立。 +- 孙德崖 (PERSON): + 红巾军起义的领导人之一,与郭子兴有矛盾,曾绑架郭子兴,对朱元璋的早期发展有重要影响。 +- 周德兴 (PERSON): + 朱元璋的早期朋友,曾为朱元璋算卦,对朱元璋的人生选择有一定影响。 +- 徐达 (PERSON): + 朱元璋早期的重要将领,后来成为明朝的开国功臣之一。 +- 明教 (RELIGION): + 朱元璋在起义过程中接触到的宗教信仰,对他的思想和行动有一定影响。 +- 弥勒佛 (RELIGION): + 明教中的重要神祇,朱元璋相信弥勒佛会降世,对他的信仰和行动有一定影响。 +- 颖州 (LOCATION): + 朱元璋早年讨饭的地方,也是红巾军起义的重要地点之一。 +- 定远 (LOCATION): + 朱元璋早期攻打的地点之一,是他军事生涯的起点。 +- 怀远 (LOCATION): + 朱元璋早期攻打的地点之一,是他军事生涯的起点。 +- 安奉 (LOCATION): + 朱元璋早期攻打的地点之一,是他军事生涯的起点。 +- 含山 (LOCATION): + 朱元璋早期攻打的地点之一,是他军事生涯的起点。 +- 虹县 (LOCATION): + 朱元璋早期攻打的地点之一,是他军事生涯的起点。 +- 钟离 (LOCATION): + 朱元璋的家乡,他在此地召集了二十四位重要将领。 +- 黄河 (LOCATION): + 元朝末年黄河泛滥,导致了严重的社会问题,间接引发了红巾军起义。 +- 淮河 (LOCATION): + 元朝末年淮河沿岸遭遇严重瘟疫和旱灾,加剧了社会矛盾。 +- 1351年 (DATE): + 红巾军起义爆发的年份,对朱元璋的人生选择产生了重要影响。 + +Relationships: +- 朱元璋 -> 朱五四: + 朱元璋是朱五四的儿子,朱五四的去世对朱元璋的成长和人生选择产生了深远影响。 +- 朱元璋 -> 陈氏: + 朱元璋是陈氏的儿子,陈氏的去世对朱元璋的成长和人生选择产生了深远影响。 +- 朱元璋 -> 汤和: + 汤和是朱元璋的幼年朋友,后来成为朱元璋起义军中的重要将领,对朱元璋早期的发展起到了关键作用。 +- 朱元璋 -> 郭子兴: + 郭子兴是朱元璋的岳父,也是红巾军起义的领导人之一。他在朱元璋早期的发展中起到了重要作用,但后来与朱元璋产生了矛盾。 
+- 朱元璋 -> 马姑娘: + 马姑娘是朱元璋的妻子,她在朱元璋最困难的时候给予了极大的支持,是朱元璋成功的重要因素之一。 +- 朱元璋 -> 元朝: + 朱元璋在元朝末年参加了红巾军起义,最终推翻了元朝,建立了明朝。 +- 朱元璋 -> 红巾军: + 朱元璋最初加入的是红巾军,并在其中逐渐崭露头角,最终成为起义军的重要领导人。 +- 朱元璋 -> 皇觉寺: + 朱元璋早年出家的地方是皇觉寺,这段经历对他的人生观和价值观产生了深远影响。 +- 朱元璋 -> 濠州: + 濠州是朱元璋早期活动的重要地点,也是红巾军的重要据点之一。朱元璋在这里经历了许多重要事件,包括与郭子兴的矛盾和最终的离开。 +- 朱元璋 -> 1328年: + 1328年是朱元璋出生的年份,这一年标志着明朝开国皇帝传奇人生的开始。 +- 朱元璋 -> 1344年: + 1344年是朱元璋家庭遭遇重大变故的年份,他的父母在这一年相继去世,这一事件对朱元璋的人生选择产生了深远影响。 +- 朱元璋 -> 1352年: + 1352年是朱元璋正式加入红巾军起义的年份,这一年标志着朱元璋从农民到起义军领袖的转变。 +- 朱元璋 -> 1368年: + 1368年是朱元璋推翻元朝,建立明朝的年份,这一年标志着朱元璋从起义军领袖到皇帝的转变。 +- 朱元璋 -> 朱百六: + 朱百六是朱元璋的高祖,对朱元璋的家族背景有重要影响。 +- 朱元璋 -> 朱四九: + 朱四九是朱元璋的曾祖,对朱元璋的家族背景有重要影响。 +- 朱元璋 -> 朱初一: + 朱初一是朱元璋的祖父,对朱元璋的家族背景有重要影响。 +- 朱元璋 -> 刘德: + 刘德是朱元璋早年为其放牛的地主,对朱元璋的童年生活有重要影响。 +- 朱元璋 -> 韩山童: + 韩山童是红巾军起义的早期领导人之一,对朱元璋的起义选择有间接影响。 +- 朱元璋 -> 刘福通: + 刘福通是红巾军起义的早期领导人之一,对朱元璋的起义选择有间接影响。 +- 朱元璋 -> 脱脱: + 脱脱是元朝末年的著名宰相,他的政策间接导致了红巾军起义的爆发,对朱元璋的起义选择有间接影响。 +- 朱元璋 -> 元顺帝: + 元顺帝是元朝末代皇帝,他在位期间社会矛盾激化,最终导致了红巾军起义和明朝的建立,对朱元璋的起义选择有重要影响。 +- 朱元璋 -> 孙德崖: + 孙德崖是红巾军起义的领导人之一,与郭子兴有矛盾,曾绑架郭子兴,对朱元璋的早期发展有重要影响。 +- 朱元璋 -> 周德兴: + 周德兴是朱元璋的早期朋友,曾为朱元璋算卦,对朱元璋的人生选择有一定影响。 +- 朱元璋 -> 徐达: + 徐达是朱元璋早期的重要将领,后来成为明朝的开国功臣之一,对朱元璋的军事生涯有重要影响。 +- 朱元璋 -> 明教: + 朱元璋在起义过程中接触到的宗教信仰,对他的思想和行动有一定影响。 +- 朱元璋 -> 弥勒佛: + 朱元璋相信弥勒佛会降世,对他的信仰和行动有一定影响。 +- 朱元璋 -> 颖州: + 颖州是朱元璋早年讨饭的地方,也是红巾军起义的重要地点之一,对朱元璋的早期生活有重要影响。 +- 朱元璋 -> 定远: + 定远是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。 +- 朱元璋 -> 怀远: + 怀远是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。 +- 朱元璋 -> 安奉: + 安奉是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。 +- 朱元璋 -> 含山: + 含山是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。 +- 朱元璋 -> 虹县: + 虹县是朱元璋早期攻打的地点之一,是他军事生涯的起点,对朱元璋的军事发展有重要影响。 +- 朱元璋 -> 钟离: + 钟离是朱元璋的家乡,他在此地召集了二十四位重要将领,对朱元璋的军事发展有重要影响。 +- 朱元璋 -> 黄河: + 元朝末年黄河泛滥,导致了严重的社会问题,间接引发了红巾军起义,对朱元璋的起义选择有重要影响。 +- 朱元璋 -> 淮河: + 元朝末年淮河沿岸遭遇严重瘟疫和旱灾,加剧了社会矛盾,对朱元璋的起义选择有重要影响。 +- 朱元璋 -> 1351年: + 1351年是红巾军起义爆发的年份,对朱元璋的人生选择产生了重要影响。 +Running benchmark without DSPy-AI: +INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK" +INFO:httpx:HTTP Request: POST 
https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK" +⠙ Processed 1 chunks, 22 entities(duplicated), 21 relations(duplicated) +Execution time without DSPy-AI: 147.39 seconds + +Entities: +- "朱元璋" ("PERSON"): + "朱元璋,原名朱重八,后改名朱元璋,是明朝的开国皇帝。他出身贫农,经历了从放牛娃到和尚,再到起义军领袖,最终成为皇帝的传奇人生。" +- "朱五四" ("PERSON"): + "朱五四,朱元璋的父亲,是一个农民,为地主种地,家境贫寒。" +- "陈氏" ("PERSON"): + "陈氏,朱元璋的母亲,是一个农民,与丈夫朱五四一起辛勤劳作,家境贫寒。" +- "汤和" ("PERSON"): + "汤和,朱元璋的幼年朋友,后来成为朱元璋的战友,在朱元璋的崛起过程中起到了重要作用。" +- "郭子兴" ("PERSON"): + "郭子兴,濠州城的守卫者,是朱元璋的岳父,也是朱元璋早期的重要支持者。" +- "韩山童" ("PERSON"): + "韩山童,与刘福通一起起义反抗元朝统治,是元末农民起义的重要领袖之一。""韩山童,元末农民起义的领袖之一,自称宋朝皇室后裔,与刘福通一起起义。" +- "刘福通" ("PERSON"): + "刘福通,与韩山童一起起义反抗元朝统治,是元末农民起义的重要领袖之一。""刘福通,元末农民起义的领袖之一,自称刘光世大将的后人,与韩山童一起起义。" +- "元朝" ("ORGANIZATION"): + "元朝,由蒙古族建立的王朝,统治中国时期实行了严格的等级制度,导致社会矛盾激化,最终被朱元璋领导的起义军推翻。" +- "皇觉寺" ("ORGANIZATION"): + "皇觉寺,朱元璋曾经在此当和尚,从事杂役工作,后来因饥荒严重,和尚们都被派出去化缘。" +- "白莲教" ("ORGANIZATION"): + "白莲教,元末农民起义中的一种宗教组织,韩山童和刘福通起义时利用了这一宗教信仰。" +- "濠州城" ("GEO"): + "濠州城,位于今安徽省,是朱元璋早期活动的重要地点,也是郭子兴的驻地。" +- "定远" ("GEO"): + "定远,朱元璋奉命攻击的地方,成功攻克后在元军回援前撤出,显示了其军事才能。" +- "钟离" ("GEO"): + "钟离,朱元璋的家乡,他在此招收了二十四名壮丁,这些人后来成为明朝的高级干部。" +- "元末农民起义" ("EVENT"): + "元末农民起义,是元朝末年由韩山童、刘福通等人领导的反抗元朝统治的大规模起义,最终导致了元朝的灭亡。" +- "马姑娘" ("PERSON"): + "马姑娘,郭子兴的义女,后来成为朱元璋的妻子,在朱元璋被关押时,她冒着危险送饭给朱元璋,表现出深厚的感情。" +- "孙德崖" ("PERSON"): + "孙德崖,与郭子兴有矛盾的起义军领袖之一,曾参与绑架郭子兴。" +- "徐达" ("PERSON"): + "徐达,朱元璋的二十四名亲信之一,后来成为明朝的重要将领。" +- "周德兴" ("PERSON"): + "周德兴,朱元璋的二十四名亲信之一,曾为朱元璋算过命。" +- "脱脱" ("PERSON"): + "脱脱,元朝的著名宰相,主张治理黄河,但他的政策间接导致了元朝的灭亡。" +- "元顺帝" ("PERSON"): + "元顺帝,元朝的最后一位皇帝,统治时期元朝社会矛盾激化,最终导致了元朝的灭亡。" +- "刘德" ("PERSON"): + "刘德,地主,朱元璋早年为其放牛。" +- "吴老太" ("PERSON"): + "吴老太,村口的媒人,朱元璋曾希望托她找一个媳妇。" + +Relationships: +- "朱元璋" -> "朱五四": + "朱元璋的父亲,对他的成长和早期生活有重要影响。" +- "朱元璋" -> "陈氏": + "朱元璋的母亲,对他的成长和早期生活有重要影响。" +- "朱元璋" -> "汤和": + "朱元璋的幼年朋友,后来成为他的战友,在朱元璋的崛起过程中起到了重要作用。" +- "朱元璋" -> "郭子兴": + "朱元璋的岳父,是他在起义军中的重要支持者。" +- "朱元璋" -> "韩山童": + "朱元璋在起义过程中与韩山童有间接联系,韩山童的起义对朱元璋的崛起有重要影响。" +- "朱元璋" -> "刘福通": + "朱元璋在起义过程中与刘福通有间接联系,刘福通的起义对朱元璋的崛起有重要影响。" +- "朱元璋" 
-> "元朝": + "朱元璋最终推翻了元朝的统治,建立了明朝。" +- "朱元璋" -> "皇觉寺": + "朱元璋曾经在此当和尚,这段经历对他的成长有重要影响。" +- "朱元璋" -> "白莲教": + "朱元璋在起义过程中接触到了白莲教,虽然他本人可能并不信仰,但白莲教的起义对他有重要影响。" +- "朱元璋" -> "濠州城": + "朱元璋在濠州城的活动对其早期军事和政治生涯有重要影响。" +- "朱元璋" -> "定远": + "朱元璋成功攻克定远,显示了其军事才能。" +- "朱元璋" -> "钟离": + "朱元璋的家乡,他在此招收了二十四名壮丁,这些人后来成为明朝的高级干部。" +- "朱元璋" -> "元末农民起义": + "朱元璋参与并最终领导了元末农民起义,推翻了元朝的统治。" +- "朱元璋" -> "马姑娘": + "朱元璋的妻子,在朱元璋被关押时,她冒着危险送饭给朱元璋,表现出深厚的感情。" +- "朱元璋" -> "孙德崖": + "朱元璋在孙德崖与郭子兴的矛盾中起到了调解作用,显示了其政治智慧。" +- "朱元璋" -> "徐达": + "朱元璋的二十四名亲信之一,后来成为明朝的重要将领。" +- "朱元璋" -> "周德兴": + "朱元璋的二十四名亲信之一,曾为朱元璋算过命。" +- "朱元璋" -> "脱脱": + "朱元璋在起义过程中间接受到脱脱政策的影响,脱脱的政策间接导致了元朝的灭亡。" +- "朱元璋" -> "元顺帝": + "朱元璋最终推翻了元顺帝的统治,建立了明朝。" +- "朱元璋" -> "刘德": + "朱元璋早年为刘德放牛,这段经历对他的成长有重要影响。" +- "朱元璋" -> "吴老太": + "朱元璋曾希望托吴老太找一个媳妇,显示了他对家庭的渴望。" +``` + +# Self-Refine with DSPy-AI (v2.5.6) +## Main Takeaways +- Time difference: 66.24 seconds +- Execution time with DSPy-AI: 211.04 seconds +- Execution time without DSPy-AI: 144.80 seconds +- Entities extracted: 38 (without DSPy-AI) vs 16 (with DSPy-AI) +- Relationships extracted: 38 (without DSPy-AI) vs 16 (with DSPy-AI) diff --git a/nano-graphrag/docs/benchmark-en.md b/nano-graphrag/docs/benchmark-en.md new file mode 100644 index 0000000000000000000000000000000000000000..b1d4b6aebbbf305bb6aaefe32892875633b1a4b1 --- /dev/null +++ b/nano-graphrag/docs/benchmark-en.md @@ -0,0 +1,150 @@ +- We use [A Christmas Carol](https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt) by Dickens as the benchmark corpus. +- We use `61b5eea34783c58074b3c53f1689ad8a5ba6b6ee` commit of [Official GraphRAG implementation](https://github.com/microsoft/graphrag/tree/main) +- Both GraphRAG and `nano-graphrag` use OpenAI Embedding and `gpt-4o`. +- Not Cache for both. On the same device and network connection. 
+- GrapgRAG Max Async API request: 25 +- `nano-graphrag` Max Async API request: 16 + +## Index Benchmark + +**GraphRAG index time**: more than 5 minutes + +**`nano-graphrag` index time**: less than 4 minutes + +## Local Search Results + +#### GraphRAG + +"What are the top themes in this story?" + +```markdown +# Top Themes in the Story + +The story revolves around several central themes that are intricately woven into the narrative, each contributing to the overall message and character development. Here are the top themes: + +## Family and Relationships + +Family and relationships play a crucial role in the story, particularly through the interactions between Scrooge and his nephew and niece. Scrooge's nephew is depicted as a warm-hearted and persistent character who continually tries to engage Scrooge, especially during the Christmas season. Despite Scrooge's initial disdain for Christmas, his nephew's efforts highlight themes of compassion and familial bonds [Data: Scrooge's Family and Christmas (20); Relationships (149, 152, 154)]. Scrooge's niece also contributes to this theme with her strong opinions and musical talent, which help soften Scrooge's attitude and remind him of his past [Data: Scrooge's Family and Christmas (20); Relationships (154)]. + +## Transformation and Redemption + +The theme of transformation and redemption is central to the story, primarily illustrated through Scrooge's character arc. Initially, Scrooge is cynical and dismissive of Christmas and its associated values. However, the recurring event of Christmas serves as a catalyst for his transformation. Through various emotional experiences and reflections prompted by the holiday, Scrooge undergoes a significant change, ultimately embracing generosity and kindness [Data: Scrooge's Family and Christmas (20); Relationships (96, 156, 153, 154)]. 
+ +## The Spirit of Christmas + +Christmas itself is a recurring and significant event in the story, symbolizing themes of goodwill, generosity, and reflection. The holiday serves as the backdrop for many of the story's pivotal moments and emotional responses. It is during Christmas that Scrooge's transformation occurs, highlighting the holiday's themes of kindness and community. The discussions and celebrations around Christmas emphasize its importance in fostering a sense of togetherness and compassion [Data: Scrooge's Family and Christmas (20); Relationships (96, 149, 153, 154, 156)]. + +## Social Commentary + +The story also provides a social commentary on the issues of poverty and social inequality. Characters like Ignorance and Want symbolize the social issues of the time, revealed by the Ghost of Christmas Present. Scrooge's initial attitude towards the poor and his interactions with the gentlemen seeking charity highlight the harsh realities of social neglect and the need for compassion and support for the less fortunate [Data: Entities (40); Sources (4)]. + +## The Role of Memory and Reflection + +Memory and reflection are significant themes that contribute to Scrooge's transformation. The story often delves into Scrooge's past, evoking memories that help him reflect on his life choices and their impact on his current state. These reflections are crucial in prompting Scrooge to change his ways and embrace a more generous and compassionate outlook [Data: Scrooge's Family and Christmas (20); Relationships (96, 156)]. + +## Conclusion + +In summary, the top themes in the story include family and relationships, transformation and redemption, the spirit of Christmas, social commentary, and the role of memory and reflection. Each of these themes is intricately connected to the characters and events, contributing to the story's rich narrative and moral lessons. +``` + +### `nano-graphrag` + +"What are the top themes in this story? 
" + +```markdown +Sure, let’s explore the top themes in "A Christmas Carol" based on the data provided. + +### Transformation and Redemption + +The central theme of "A Christmas Carol" revolves around the profound transformation and redemption of Ebenezer Scrooge. Initially depicted as a miserly and solitary man who disdains Christmas and human connection, Scrooge undergoes a radical change after a series of supernatural visits. These visitations force him to reflect on his past, recognize the suffering of others in the present, and confront the grim future that awaits him if he remains unchanged [Data: Reports (0); Entities (1); Relationships (0, 3, 5, 14, 45, +more)]. + +### Influence of Supernatural Entities + +Supernatural elements are crucial in facilitating Scrooge's transformation. The Ghosts of Christmas Past, Present, and Yet to Come each play a significant role by exposing Scrooge to various scenes, prompting deep introspection and emotional responses. Jacob Marley's ghost, in particular, acts as the initial catalyst, warning Scrooge of the consequences of his miserly ways and setting the stage for the more impactful supernatural encounters that follow [Data: Reports (0); Entities (17, 18, 46); Relationships (29, 30, 46, 55, 114, +more)]. + +### Social Critique + +Dickens uses the story to critique the social conditions of his time, emphasizing themes of poverty and social neglect. Characters like Bob Cratchit and his family highlight the struggles of the working class, while the conceptual children, Ignorance and Want, presented by the Ghost of Christmas Present, illustrate the dire consequences of societal indifference. This critique encourages readers to reflect on their own social responsibilities and the broader societal need for compassion and generosity [Data: Reports (0); Entities (57, 107, 141); Relationships (45, 102, 159, +more)]. + +### Family and Community + +Another prominent theme is the importance of family and community. 
Scrooge's interactions with Fred, his cheerful and persistent nephew, highlight the warmth and joy that family connections can bring. Fred's continued efforts to include Scrooge in family celebrations play a pivotal role in rekindling their relationship. Likewise, the Cratchit family, despite their poverty, exemplifies love, togetherness, and the true spirit of Christmas, all of which contribute to Scrooge's awakening and ultimate redemption [Data: Reports (0); Entities (14, 44); Relationships (28, 39, 40, 86, 153, +more)]. + +### Generosity and Charity + +The theme of generosity and charity is woven throughout the narrative. Scrooge's eventual acts of kindness, such as his improved treatment of Bob Cratchit and his charitable gestures towards the end of the story, symbolize the transformative power of giving. This newly embraced generosity contrasts sharply with his former frugality and serves as a testament to the positive changes that have taken place within him [Data: Reports (0); Entities (6, 43); Relationships (7, 8, 84, 127, +more)]. + +### Conclusion + +In summary, "A Christmas Carol" is rich with themes that explore personal transformation, social responsibility, family, and the spirit of generosity. These themes are intricately linked through the narrative structure and character interactions, making the story a timeless reflection on the human condition and the potential for redemption and change [Data: Reports (0); Entities (1, 17, 18); Relationships (0, 14, 31, 45, 48, +more)]. + +Feel free to ask if you need more detailed information on any specific aspect of these themes! +``` + + + +## Global Search Results + +### GraphRAG + +"What are the top themes in this story? " + +```markdown +#### Transformation +The central theme of the story is the transformation of Ebenezer Scrooge. Initially depicted as a miserly and unsympathetic individual, Scrooge undergoes a profound change to become a generous and caring person. 
This transformation is driven by his interactions with various supernatural entities, including Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come [Data: Reports (18, 17, 27, 21, 6, 26)]. + +#### Familial Love and Support +The theme of familial love and support is prominently depicted through the Cratchit Family. Despite their financial struggles, the Cratchit family remains loving and supportive of each other. Key relationships include Bob Cratchit's deep affection for his son Tiny Tim and the family's collective grief over Tiny Tim's potential death [Data: Reports (20, 19)]. + +#### Generosity and Compassion +Generosity and compassion are illustrated through characters like Fezziwig and the transformed Scrooge. Fezziwig's joyful Christmas parties and Scrooge's eventual acts of kindness towards the Cratchit family and others highlight the positive impact of generosity on the community [Data: Reports (23, 19, 26)]. + +#### Social Responsibility +The theme of social responsibility is underscored by the allegorical figures of Ignorance and Want, as well as the broader societal issues highlighted by the Ghost of Christmas Present. These elements emphasize the importance of addressing poverty and the consequences of neglecting the less fortunate [Data: Reports (17, 21)]. + +#### The Christmas Spirit +The Christmas spirit, encompassing joy, celebration, and togetherness, is a recurring motif throughout the story. Various scenes, such as the Cratchit family's Christmas dinner, Fred's Christmas party, and the celebrations shown by the Ghost of Christmas Present, reinforce the importance of embracing the festive spirit and its associated values [Data: Reports (20, 18, 23)]. + +#### Impact of Past Relationships +Scrooge's past relationships, particularly with 'The Girl' (later 'The Matron') and Belle, highlight themes of lost opportunities and the contrast between his lonely existence and the warmth of family life [Data: Reports (2)]. 
+ 

#### Role of Supernatural Elements
Supernatural elements, including Marley's Ghost and the three spirits, play a crucial role in Scrooge's transformation. These entities guide Scrooge through various significant scenes, influencing his reflections and realizations [Data: Reports (26)]. 

#### Family and Love
The relationships within the community, such as those between Mr. and Mrs. Fezziwig and Belle's family, underscore the importance of family and love. These relationships serve as a reflection on what Scrooge has missed out on due to his choices [Data: Reports (2, 25)]. 

These themes collectively highlight the story's emphasis on personal growth, the importance of community and family, and the enduring values associated with the Christmas season. 
``` 

 

### `nano-graphrag` 

"What are the top themes in this story? " 

```markdown 
### Key Themes in the Story 

#### 1. Transformation and Redemption 
The central theme of the story revolves around Ebenezer Scrooge's profound transformation from a miserly, cold-hearted man to a generous and kind individual. This journey of change emphasizes the possibility of redemption and the capacity for personal change. The transformation is predominantly driven by Scrooge's interactions with various supernatural entities such as the Ghosts of Christmas Past, Present, and Yet to Come [Data: Reports (0, 1, 2, 3, 4, 5, 6, 12, 23, 14, +more)]. 

#### 2. The Role of Supernatural Entities 
Supernatural spirits play critical roles in guiding Scrooge toward his transformation. The Ghosts of Christmas Past, Present, and Yet to Come evoke reflection, regret, and fear in Scrooge, highlighting the impact of spiritual guidance on personal redemption. These encounters force him to confront and reflect on his past actions and their consequences [Data: Reports (1, 2, 3, 12, 0)]. 

#### 3. Generosity and Community Support 
Another significant theme is the spirit of generosity and community support. 
Initially depicted as a miser, Scrooge's eventual acts of generosity and the festive spirit among various characters underscore the importance of kindness and community. The Cratchit family's modest yet joyous Christmas celebration poignantly highlights this theme [Data: Reports (26, 6, 2, 14)]. + +#### 4. Family and Emotional Bonds +The story places significant emphasis on family and emotional connections. This theme is illustrated through the relationships between Scrooge and his family, such as the close sibling bond with Fan and the dynamics within the Cratchit family. These relationships and their development underscore the importance of familial love and support [Data: Reports (0, 2, 4, 17, 5, 15)]. + +#### 5. Social Justice and Economic Disparity +The narrative also addresses themes of social justice and economic disparity. Scrooge's initial indifference to others' struggles and the contrasting lifestyles of characters like Bob Cratchit highlight the societal inequalities of the time. This critique is further exemplified through the characters’ interactions and the broader social critique in the story [Data: Reports (0, 1, 24, 9, 25, +more)]. + +#### 6. Role of Memory and the Past +Closely tied to Scrooge’s transformation is the theme of memory and the past. The Ghosts of Christmas Past and Marley’s Ghost prompt Scrooge to reflect on his own life, evoking memories that significantly impact his journey toward becoming a better man. These reflections play a crucial role in his redemption [Data: Reports (3, 8, 13, 17, 6, +more)]. + +### Conclusion +These themes collectively create a rich and multifaceted narrative that conveys powerful messages about personal growth, the impact of community, the value of generosity, and the critique of social inequities. The journey of Ebenezer Scrooge serves as a timeless reminder of the potential for change and the importance of compassion and empathy in society. 
+ 
``` 

 diff --git a/nano-graphrag/docs/benchmark-zh.md b/nano-graphrag/docs/benchmark-zh.md new file mode 100644 index 0000000000000000000000000000000000000000..0318852391b48ea847afa4f2d3c8c139bc8015be --- /dev/null +++ b/nano-graphrag/docs/benchmark-zh.md @@ -0,0 +1,91 @@ +- We use [三国演义](https://github.com/tennessine/corpus/blob/master/%E4%B8%89%E5%9B%BD%E6%BC%94%E4%B9%89.txt) by 罗贯中 as the benchmark corpus. +- We use `61b5eea34783c58074b3c53f1689ad8a5ba6b6ee` commit of [Official GraphRAG implementation](https://github.com/microsoft/graphrag/tree/main) +- Both GraphRAG and `nano-graphrag` use OpenAI Embedding and `gpt-4o`. +- No cache for both. +- On the same device and network connection. +- GraphRAG Max Async API request: 25 +- `nano-graphrag` Max Async API request: 16 + +## Index Benchmark + +Both are slow, taking more than 10 minutes. + +## Local Search Results + +### GraphRAG + +"大乔和曹操的关系" + +```markdown +大乔与曹操之间的关系在历史和文学中被描绘为复杂且多维的,尽管直接的互动可能并不频繁。大乔,作为东吴孙策的妻子,以及其美貌和与三国时期重要人物的关联而闻名。曹操,作为三国时期的关键政治和军事领导者,其与大乔的关系主要通过他与东吴的关系以及对大乔及其家族的兴趣来体现。 + +首先,大乔的美貌在三国时期广为人知,甚至引起了曹操的注意。据记载,曹操曾有意俘获大乔以供己用,这一点体现了曹操对大乔的个人兴趣,尽管这种兴趣可能更多地是基于她的美貌和象征意义,而非两人之间有实质性的个人关系 [Data: Entities (266)]。 + +其次,大乔作为孙策的妻子,自然成为了孙家与东吴势力中的一员。孙策和孙权是曹操在政治和军事上的对手,这使得大乔与曹操之间存在着间接的敌对关系。曹操对东吴的军事行动,包括对孙权的攻击,间接地影响到了大乔及其家族的安全和利益 [Data: Entities (266)]。 + +最后,尽管直接的文献记录可能不多,大乔的地位和她所处的时代背景使她与曹操之间的关系充满了历史和文化上的意义。她的故事和曹操的野心相交织,展现了三国时期女性角色与政治军事活动之间的复杂联系。 + +综上所述,大乔与曹操之间的关系主要是通过她的家庭背景、曹操的政治军事活动,以及文化记忆中对她美貌的追捧来定义的。这种关系更多地体现在历史和文学的叙述中,而不是基于两人之间有直接的个人互动。 +``` + +### `nano-graphrag` + +"大乔和曹操的关系" + +```markdown +# 大乔和曹操的关系 +## 大乔的背景 +大乔(Da Qiao)是江东乔公的长女,以其极美的容貌闻名于世。她最为著名的身份是孙策(Sun Ce)的妻子,这一身份在三国时期的政治和军事格局中也占有重要地位。周瑜特别提到她,显示她在周瑜与曹操局势中的重要性 [Data: Entities (1)]。 +## 曹操的欲望 +曹操(Cao Cao)对大乔的渴望主要源自其好色本性。他希望得到大乔和她的妹妹小乔,以满足个人欲望,并且认为获得她们可以帮助他提高政治地位和享乐。曹操曾发誓希望能得到江东二乔(大乔和小乔),并将她们置于铜雀台上,以乐晚年 [Data: Relationships (6, 7); Entities (1, 2)]。 +## 谋略与政治 +在三国时期的权谋斗争中,诸葛亮曾提议使用美人计,将大乔和小乔送给曹操,以此来使曹操班师回朝,减少对东吴的威胁。这一提议显示出大乔在权谋中的重要角色以及她可能对大局产生的影响 [Data: 
Sources (0); Relationships (12)]。 
## 曹操的策略和行动 
尽管曹操对大乔有很大的兴趣,但实际行动中的证据有限。周瑜得知曹操对大乔的意图后,表达了强烈的反对,并与诸葛亮商议对策,以确保她和小乔的安全 [Data: Sources (2)]。周瑜明确表示大乔是孙策的正妻,而小乔是他的妻子,拒绝将她们送给曹操。 
## 总结 
总体而言,曹操和大乔没有直接的互动记录,但曹操对大乔的欲望及其可能带来的政治影响却是不容忽视的。大乔作为江东乔公的长女,其美貌和身份使她成为多方势力争夺的对象,但她与曹操的关系主要体现在曹操的欲望和诸葛亮、周瑜等人的权谋计划中。 
曹操对大乔的欲望和权谋体现了三国时期复杂的政治动态,同时也揭示了美人计在当时战略中的重要性 [Data: Relationships (6, 12, 13); Entities (1); Sources (0, 2)]. 
``` 

 

## Global Search Results 

### GraphRAG 

"大乔与曹操的关系" 

```markdown 
### 大乔与曹操的关系 

大乔(Da Qiao)是乔公的女儿,以其非凡的美貌而闻名。这一特质使她在社区中成为一个重要人物,尤其是在曹操(Cao Cao)对她的渴望背景下 [Data: Reports (41)]。 

#### 曹操的计划 

曹操计划在统一江南后,将大乔安置在铜雀台(Tongque Tai)。铜雀台是曹操在叶县张河上建造的一座宏伟平台,象征着好运和庆祝。曹操打算将包括大乔在内的美丽女子安置在铜雀台中 [Data: Reports (269)]。这一计划不仅反映了曹操的野心和愿景,也显示了他对大乔的特别关注。 

#### 乔公与曹操的关系 

乔公是大乔的父亲,同时也是曹操的朋友。这层关系不仅突显了乔公的影响力,还暗示了由于曹操对大乔的渴望,可能会产生潜在的杠杆作用或冲突 [Data: Reports (41)]。 

### 结论 

综上所述,大乔与曹操的关系主要体现在曹操对她的渴望以及他计划将她安置在铜雀台的意图上。此外,乔公与曹操的友谊也为这一关系增添了复杂性,可能会在未来引发更多的动态变化 [Data: Reports (41, 269)]。 
``` 

 

### `nano-graphrag` 

"大乔与曹操的关系" 

```markdown 
## 曹操与大乔的关系 

曹操对大乔展现了显著的政治和战略兴趣,这部分是由于大乔的美貌以及她在战略上的重要性。在那个时期,美丽并非单纯的外貌特征,还往往与政治资源和战略权力相关联。曹操认为拥有大乔会对他的政治野心有所帮助,并为其权势扩大增添筹码 [Data: Reports (6)]。 

大乔作为孙策的妻子,再加上她的地位和美貌,使得她在权力动态中成为一个关键的目标。曹操对大乔的重视,不仅因为她个人的吸引力,更因为她所代表的权力象征。如果能将大乔纳入自己的势力范围,曹操将可能通过这种方式削弱敌对势力,同时增强自己的影响力 [Data: Reports (6)]。 

总之,曹操和大乔的关系不仅仅停留在个人层面,更深层次的是政治策略和权力博弈的体现。曹操对她的兴趣,反映了他在权力斗争中的精明与野心。 
``` 

 diff --git a/nano-graphrag/docs/use_neo4j_for_graphrag.md b/nano-graphrag/docs/use_neo4j_for_graphrag.md new file mode 100644 index 0000000000000000000000000000000000000000..abc3f353c7d9dce1af79b34d2d60d4fb0a1fb9e5 --- /dev/null +++ b/nano-graphrag/docs/use_neo4j_for_graphrag.md @@ -0,0 +1,27 @@ +1. Install [Neo4j](https://neo4j.com/docs/operations-manual/current/installation/) (please use 5.x version) +2. Install Neo4j GDS (graph data science) [plugin](https://neo4j.com/docs/graph-data-science/current/installation/neo4j-server/) +3. Start neo4j server +4. 
Get the `NEO4J_URL`, `NEO4J_USER` and `NEO4J_PASSWORD` + - By default, `NEO4J_URL` is `neo4j://localhost:7687` , `NEO4J_USER` is `neo4j` and `NEO4J_PASSWORD` is `neo4j` + +Pass your neo4j instance to `GraphRAG`: + +```python +from nano_graphrag import GraphRAG +from nano_graphrag._storage import Neo4jStorage + +neo4j_config = { + "neo4j_url": os.environ.get("NEO4J_URL", "neo4j://localhost:7687"), + "neo4j_auth": ( + os.environ.get("NEO4J_USER", "neo4j"), + os.environ.get("NEO4J_PASSWORD", "neo4j"), + ) +} +GraphRAG( + graph_storage_cls=Neo4jStorage, + addon_params=neo4j_config, +) +``` + + + diff --git a/nano-graphrag/examples/benchmarks/dspy_entity.py b/nano-graphrag/examples/benchmarks/dspy_entity.py new file mode 100644 index 0000000000000000000000000000000000000000..449fb4e09fc6550bc8a6b221dd4b8f47499ba3a6 --- /dev/null +++ b/nano-graphrag/examples/benchmarks/dspy_entity.py @@ -0,0 +1,152 @@ +import dspy +import os +from dotenv import load_dotenv +from openai import AsyncOpenAI +import logging +import asyncio +import time +import shutil +from nano_graphrag.entity_extraction.extract import extract_entities_dspy +from nano_graphrag.base import BaseKVStorage +from nano_graphrag._storage import NetworkXStorage +from nano_graphrag._utils import compute_mdhash_id, compute_args_hash +from nano_graphrag._op import extract_entities + +WORKING_DIR = "./nano_graphrag_cache_dspy_entity" + +load_dotenv() + +logger = logging.getLogger("nano-graphrag") +logger.setLevel(logging.DEBUG) + + +async def deepseepk_model_if_cache( + prompt: str, model: str = "deepseek-chat", system_prompt : str = None, history_messages: list = [], **kwargs +) -> str: + openai_async_client = AsyncOpenAI( + api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com" + ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + # Get the cached response if having------------------- + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", 
None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + # ----------------------------------------------------- + + response = await openai_async_client.chat.completions.create( + model=model, messages=messages, **kwargs + ) + + # Cache the response if having------------------- + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response.choices[0].message.content, "model": model}} + ) + # ----------------------------------------------------- + return response.choices[0].message.content + + +async def benchmark_entity_extraction(text: str, system_prompt: str, use_dspy: bool = False): + working_dir = os.path.join(WORKING_DIR, f"use_dspy={use_dspy}") + if os.path.exists(working_dir): + shutil.rmtree(working_dir) + + start_time = time.time() + graph_storage = NetworkXStorage(namespace="test", global_config={ + "working_dir": working_dir, + "entity_summary_to_max_tokens": 500, + "cheap_model_func": lambda *args, **kwargs: deepseepk_model_if_cache(*args, system_prompt=system_prompt, **kwargs), + "best_model_func": lambda *args, **kwargs: deepseepk_model_if_cache(*args, system_prompt=system_prompt, **kwargs), + "cheap_model_max_token_size": 4096, + "best_model_max_token_size": 4096, + "tiktoken_model_name": "gpt-4o", + "hashing_kv": BaseKVStorage(namespace="test", global_config={"working_dir": working_dir}), + "entity_extract_max_gleaning": 1, + "entity_extract_max_tokens": 4096, + "entity_extract_max_entities": 100, + "entity_extract_max_relationships": 100, + }) + chunks = {compute_mdhash_id(text, prefix="chunk-"): {"content": text}} + + if use_dspy: + graph_storage = await extract_entities_dspy(chunks, graph_storage, None, graph_storage.global_config) + else: + graph_storage = await 
extract_entities(chunks, graph_storage, None, graph_storage.global_config) + + end_time = time.time() + execution_time = end_time - start_time + + return graph_storage, execution_time + + +def print_extraction_results(graph_storage: NetworkXStorage): + print("\nEntities:") + entities = [] + for node, data in graph_storage._graph.nodes(data=True): + entity_type = data.get('entity_type', 'Unknown') + description = data.get('description', 'No description') + entities.append(f"- {node} ({entity_type}):\n {description}") + print("\n".join(entities)) + + print("\nRelationships:") + relationships = [] + for source, target, data in graph_storage._graph.edges(data=True): + description = data.get('description', 'No description') + relationships.append(f"- {source} -> {target}:\n {description}") + print("\n".join(relationships)) + + +async def run_benchmark(text: str): + print("\nRunning benchmark with DSPy-AI:") + system_prompt = """ + You are an expert system specialized in entity and relationship extraction from complex texts. + Your task is to thoroughly analyze the given text and extract all relevant entities and their relationships with utmost precision and completeness. + """ + system_prompt_dspy = f"{system_prompt} Time: {time.time()}." + lm = dspy.LM( + model="deepseek/deepseek-chat", + model_type="chat", + api_provider="openai", + api_key=os.environ["DEEPSEEK_API_KEY"], + base_url=os.environ["DEEPSEEK_BASE_URL"], + system_prompt=system_prompt, + temperature=1.0, + max_tokens=8192 + ) + dspy.settings.configure(lm=lm, experimental=True) + graph_storage_with_dspy, time_with_dspy = await benchmark_entity_extraction(text, system_prompt_dspy, use_dspy=True) + print(f"Execution time with DSPy-AI: {time_with_dspy:.2f} seconds") + print_extraction_results(graph_storage_with_dspy) + + print("Running benchmark without DSPy-AI:") + system_prompt_no_dspy = f"{system_prompt} Time: {time.time()}." 
+ graph_storage_without_dspy, time_without_dspy = await benchmark_entity_extraction(text, system_prompt_no_dspy, use_dspy=False) + print(f"Execution time without DSPy-AI: {time_without_dspy:.2f} seconds") + print_extraction_results(graph_storage_without_dspy) + + print("\nComparison:") + print(f"Time difference: {abs(time_with_dspy - time_without_dspy):.2f} seconds") + print(f"DSPy-AI is {'faster' if time_with_dspy < time_without_dspy else 'slower'}") + + entities_without_dspy = len(graph_storage_without_dspy._graph.nodes()) + entities_with_dspy = len(graph_storage_with_dspy._graph.nodes()) + relationships_without_dspy = len(graph_storage_without_dspy._graph.edges()) + relationships_with_dspy = len(graph_storage_with_dspy._graph.edges()) + + print(f"Entities extracted: {entities_without_dspy} (without DSPy-AI) vs {entities_with_dspy} (with DSPy-AI)") + print(f"Relationships extracted: {relationships_without_dspy} (without DSPy-AI) vs {relationships_with_dspy} (with DSPy-AI)") + + +if __name__ == "__main__": + with open("./tests/zhuyuanzhang.txt", encoding="utf-8-sig") as f: + text = f.read() + + asyncio.run(run_benchmark(text=text)) diff --git a/nano-graphrag/examples/benchmarks/eval_naive_graphrag_on_multi_hop.ipynb b/nano-graphrag/examples/benchmarks/eval_naive_graphrag_on_multi_hop.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..870ba524419e0ab72a8a76955dd01d6eba82f9c4 --- /dev/null +++ b/nano-graphrag/examples/benchmarks/eval_naive_graphrag_on_multi_hop.ipynb @@ -0,0 +1,432 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we are going to evaluate the performance of the naive RAG and the GraphRAG algorithm on a [multi-hop RAG task](https://github.com/yixuantt/MultiHop-RAG)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "Make sure you install the necessary dependencies by running the following commands:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install ragas nest_asyncio datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import the necessary libraries, and set up your openai api key if needed:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY\"\n", + "import json\n", + "import sys\n", + "sys.path.append(\"../..\")\n", + "\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "import logging\n", + "\n", + "logging.basicConfig(level=logging.WARNING)\n", + "logging.getLogger(\"nano-graphrag\").setLevel(logging.INFO)\n", + "from nano_graphrag import GraphRAG, QueryParam\n", + "from datasets import Dataset \n", + "from ragas import evaluate\n", + "from ragas.metrics import (\n", + " answer_correctness,\n", + " answer_similarity,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the dataset from [Github Repo](https://github.com/yixuantt/MultiHop-RAG/tree/main/dataset). \n", + "If should contain two files:\n", + "- `MultiHopRAG.json`\n", + "- `corpus.json`\n", + "\n", + "After downloading the dataset, replace the below paths to the paths on your machine." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "multi_hop_rag_file = \"./fixtures/MultiHopRAG.json\"\n", + "multi_hop_corpus_file = \"./fixtures/corpus.json\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "with open(multi_hop_rag_file) as f:\n", + " multi_hop_rag_dataset = json.load(f)\n", + "with open(multi_hop_corpus_file) as f:\n", + " multi_hop_corpus = json.load(f)\n", + "\n", + "corups_url_refernces = {}\n", + "for cor in multi_hop_corpus:\n", + " corups_url_refernces[cor['url']] = cor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We only use the top-100 queries for evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Queries have types: {'inference_query', 'comparison_query', 'null_query', 'temporal_query'}\n", + "We will need 139 articles:\n", + "## ASX set to drop as Wall Street’s September slump deepens\n", + "Author: Stan Choe, The Sydney Morning Herald\n", + "Category: business\n", + "Publised: 2023-09-26T19:11:30+00:00\n", + "ETF provider Betashares, which manages $ ...\n" + ] + } + ], + "source": [ + "multi_hop_rag_dataset = multi_hop_rag_dataset[:100]\n", + "print(\"Queries have types:\", set([q['question_type'] for q in multi_hop_rag_dataset]))\n", + "total_urls = set()\n", + "for q in multi_hop_rag_dataset:\n", + " total_urls.update([up['url'] for up in q['evidence_list']])\n", + "corups_url_refernces = {k:v for k, v in corups_url_refernces.items() if k in total_urls}\n", + "\n", + "total_corups = [f\"## {cor['title']}\\nAuthor: {cor['author']}, {cor['source']}\\nCategory: {cor['category']}\\nPublised: {cor['published_at']}\\n{cor['body']}\" for cor in 
corups_url_refernces.values()]\n", + "\n", + "print(f\"We will need {len(total_corups)} articles:\")\n", + "print(total_corups[0][:200], \"...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add index for the `total_corups` using naive RAG and GraphRAG" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:nano-graphrag:Load KV full_docs with 139 data\n", + "INFO:nano-graphrag:Load KV text_chunks with 408 data\n", + "INFO:nano-graphrag:Load KV llm_response_cache with 1634 data\n", + "INFO:nano-graphrag:Load KV community_reports with 794 data\n", + "INFO:nano-graphrag:Loaded graph from nano_graphrag_cache_multi_hop_rag_test/graph_chunk_entity_relation.graphml with 6181 nodes, 5423 edges\n", + "WARNING:nano-graphrag:All docs are already in the storage\n", + "INFO:nano-graphrag:Writing graph with 6181 nodes, 5423 edges\n" + ] + } + ], + "source": [ + "# First time indexing will cost many time, roughly 15~20 minutes\n", + "graphrag_func = GraphRAG(working_dir=\"nano_graphrag_cache_multi_hop_rag_test\", enable_naive_rag=True,\n", + " embedding_func_max_async=4)\n", + "graphrag_func.insert(total_corups)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look at the response of different RAG methods on the first query:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "response_formate = \"Single phrase or sentence, concise and no redundant explanation needed. 
If you don't have the answer in context, Just response 'Insufficient information'\"\n", + "naive_rag_query_param = QueryParam(mode='naive', response_type=response_formate)\n", + "naive_rag_query_only_context_param = QueryParam(mode='naive', only_need_context=True)\n", + "local_graphrag_query_param = QueryParam(mode='local', response_type=response_formate)\n", + "local_graphrag_only_context__param = QueryParam(mode='local', only_need_context=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?\n", + "GroundTruth Answer: Sam Bankman-Fried\n" + ] + } + ], + "source": [ + "query = multi_hop_rag_dataset[0]\n", + "print(\"Question:\", query['query'])\n", + "print(\"GroundTruth Answer:\", query['answer'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:nano-graphrag:Truncate 20 to 12 chunks\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NaiveRAG Answer: Sam Bankman-Fried\n" + ] + } + ], + "source": [ + "print(\"NaiveRAG Answer:\", graphrag_func.query(query['query'], param=naive_rag_query_param))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:nano-graphrag:Using 20 entites, 3 communities, 124 relations, 3 text units\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Local GraphRAG Answer: Sam Bankman-Fried\n" + ] + } + ], + "source": [ + "print(\"Local GraphRAG Answer:\", graphrag_func.query(query['query'], 
param=local_graphrag_query_param))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great! Now we're ready to evaluate more detailed metrics. We will use [ragas](https://docs.ragas.io/en/stable/) to evalue the answers' quality." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "questions = [q['query'] for q in multi_hop_rag_dataset]\n", + "labels = [q['answer'] for q in multi_hop_rag_dataset]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/100 [00:00 np.ndarray: + return np.float32(np.random.rand(len(texts), FAKE_DIM)) + + +def generate_test_data(): + return {str(i): {"content": f"Test content {i}"} for i in range(DATA_LEN)} + + +async def benchmark_storage(storage_class, name): + rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=sample_embedding) + storage = storage_class( + namespace=f"benchmark_{name}", + global_config=rag.__dict__, + embedding_func=sample_embedding, + meta_fields={"content"}, + ) + + test_data = generate_test_data() + + print(f"Benchmarking {name}...") + with tqdm(total=DATA_LEN, desc=f"{name} Benchmark") as pbar: + start_time = time.time() + for i in range(0, len(test_data), BATCH_SIZE): + batch = {k: test_data[k] for k in list(test_data.keys())[i:i+BATCH_SIZE]} + await storage.upsert(batch) + pbar.update(min(BATCH_SIZE, DATA_LEN - i)) + + insert_time = time.time() - start_time + + save_start_time = time.time() + await storage.index_done_callback() + save_time = time.time() - save_start_time + pbar.update(1) + + query_vector = np.random.rand(FAKE_DIM) + query_times = [] + for _ in range(100): + query_start = time.time() + await storage.query(query_vector, top_k=10) + query_times.append(time.time() - query_start) + pbar.update(1) + + avg_query_time = sum(query_times) / len(query_times) + + print(f"{name} - Insert: 
{insert_time:.2f}s, Save: {save_time:.2f}s, Avg Query: {avg_query_time:.4f}s") + return insert_time, save_time, avg_query_time + + +async def run_benchmarks(): + print("Running NanoVectorDB benchmark...") + nano_insert_time, nano_save_time, nano_query_time = await benchmark_storage(NanoVectorDBStorage, "nano") + + print("\nRunning HNSWVectorStorage benchmark...") + hnsw_insert_time, hnsw_save_time, hnsw_query_time = await benchmark_storage(HNSWVectorStorage, "hnsw") + + print("\nBenchmark Results:") + print(f"NanoVectorDB - Insert: {nano_insert_time:.2f}s, Save: {nano_save_time:.2f}s, Avg Query: {nano_query_time:.4f}s") + print(f"HNSWVectorStorage - Insert: {hnsw_insert_time:.2f}s, Save: {hnsw_save_time:.2f}s, Avg Query: {hnsw_query_time:.4f}s") + + +if __name__ == "__main__": + asyncio.run(run_benchmarks()) \ No newline at end of file diff --git a/nano-graphrag/examples/benchmarks/md5_vs_xxhash.py b/nano-graphrag/examples/benchmarks/md5_vs_xxhash.py new file mode 100644 index 0000000000000000000000000000000000000000..ce24dbcc0a39610d94cabc095371f0635ac13e7a --- /dev/null +++ b/nano-graphrag/examples/benchmarks/md5_vs_xxhash.py @@ -0,0 +1,54 @@ +import time +import xxhash +from hashlib import md5 +from tqdm import tqdm +import numpy as np + + +def xxhash_ids(data: list[str]) -> np.ndarray: + return np.fromiter( + (xxhash.xxh32_intdigest(d.encode()) for d in data), + dtype=np.uint32, + count=len(data) + ) + + +def md5_ids(data: list[str]) -> np.ndarray: + return np.fromiter( + (int(md5(d.encode()).hexdigest(), 16) & 0xFFFFFFFF for d in data), + dtype=np.uint32, + count=len(data) + ) + + +if __name__ == "__main__": + num_ids = 1000000 + num_iterations = 100 + xxhash_times = [] + md5_times = [] + + for i in tqdm(range(num_iterations)): + test_data = [f"{i}_{j}" for j in range(num_ids)] + + start_time = time.time() + xxhash_result = xxhash_ids(test_data) + xxhash_times.append(time.time() - start_time) + + start_time = time.time() + md5_result = md5_ids(test_data) + 
md5_times.append(time.time() - start_time) + + assert len(xxhash_result) == len(md5_result) == num_ids + assert not np.array_equal(xxhash_result, md5_result) + + avg_xxhash_time = np.mean(xxhash_times) + avg_md5_time = np.mean(md5_times) + std_xxhash_time = np.std(xxhash_times) + std_md5_time = np.std(md5_times) + + print(f"num_ids: {num_ids} | num_iterations: {num_iterations}") + print(f"\nAverage xxhash time: {avg_xxhash_time:.4f} seconds") + print(f"Average MD5 time: {avg_md5_time:.4f} seconds") + print(f"xxhash is {avg_md5_time / avg_xxhash_time:.2f}x faster than MD5 on average") + print(f"\nxxhash time standard deviation: {std_xxhash_time:.4f} seconds") + print(f"MD5 time standard deviation: {std_md5_time:.4f} seconds") \ No newline at end of file diff --git a/nano-graphrag/examples/finetune_entity_relationship_dspy.ipynb b/nano-graphrag/examples/finetune_entity_relationship_dspy.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7503ae13e5b58e6a19ce453c6b8d2c165ef843a1 --- /dev/null +++ b/nano-graphrag/examples/finetune_entity_relationship_dspy.ipynb @@ -0,0 +1,14478 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluating Entity Relationship Extraction with DSPy & Fine-Tune Prompt Instructions\n", + "\n", + "## Steps\n", + "- Load DSPy examples (separated into train, val, dev) that are saved locally.\n", + "- Evaluate the extraction module with the dev examples to determine the baseline scores, i.e: the fine-tuned extraction module should score higher.\n", + "- Run bootstrapping with random search with train examples, evaluate its compiled extraction module on the same dev examples to compare against baseline scores.\n", + "- Run MIPROv2 with train and dev examples, evaluate its compiled extraction module on the same dev examples to compare against baseline and bootstrapping with random search scores.\n", + "\n", + "## Why Use MIPROv2?\n", + "MIPROv2 is an optimizer that generates candidate 
few-shot examples and instructions for each prompt in the extraction module, and then optimizes over the fewshot examples and instructions as hyperparameters for a number of batches. For each batch, the optimizer evaluates different combinations of prompts on a subset of training inputs and learns the combinations that maximizes performance. This optimizer is quite useful when dealing with a large number of training examples. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniconda/base/envs/nano-graphrag/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import dspy\n", + "from dspy.teleprompt import BootstrapFewShotWithRandomSearch, MIPROv2\n", + "from dspy.evaluate import Evaluate\n", + "import os\n", + "import numpy as np\n", + "from dotenv import load_dotenv\n", + "import logging\n", + "import pickle\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from nano_graphrag.entity_extraction.module import TypedEntityRelationshipExtractor\n", + "from nano_graphrag.entity_extraction.metric import relationships_similarity_metric, entity_recall_metric" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "WORKING_DIR = \"./nano_graphrag_cache_finetune_entity_relationship_dspy\"\n", + "EXAMPLES_DIR = \"./nano_graphrag_cache_generate_dspy_examples\"\n", + "load_dotenv()\n", + "\n", + "logging.basicConfig(level=logging.WARNING)\n", + "logging.getLogger(\"nano-graphrag\").setLevel(logging.DEBUG)\n", + 
"\n", + "np.random.seed(1337)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = \"\"\"\n", + " You are a world-class AI system, capable of complex reasoning and reflection. \n", + " Reason through the query, and then provide your final response. \n", + " If you detect that you made a mistake in your reasoning at any point, correct yourself.\n", + " Think carefully.\n", + "\"\"\"\n", + "deepseek = dspy.OpenAI(\n", + " model=\"deepseek-chat\", \n", + " model_type=\"chat\",\n", + " api_key=os.environ[\"DEEPSEEK_API_KEY\"], \n", + " base_url=os.environ[\"DEEPSEEK_BASE_URL\"], \n", + " system_prompt=system_prompt, \n", + " temperature=1.0,\n", + " max_tokens=8192\n", + ")\n", + "qwen2 = dspy.OllamaLocal(\n", + " model=\"qwen2\", \n", + " system=system_prompt,\n", + " temperature=1.0,\n", + " max_tokens=4096,\n", + " num_ctx=32000,\n", + " format=\"json\",\n", + " timeout_s=240,\n", + ")\n", + "dspy.settings.configure(lm=deepseek, experimental=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(WORKING_DIR, exist_ok=True)\n", + "entity_relationship_rs_path = os.path.join(WORKING_DIR, \"entity_relationship_extraction_news_rs.json\")\n", + "entity_relationship_miprov2_path = os.path.join(WORKING_DIR, \"entity_relationship_extraction_news_miprov2.json\")\n", + "entity_relationship_trainset_path = os.path.join(EXAMPLES_DIR, \"entity_relationship_extraction_news_trainset.pkl\")\n", + "entity_relationship_valset_path = os.path.join(EXAMPLES_DIR, \"entity_relationship_extraction_news_valset.pkl\")\n", + "entity_relationship_devset_path = os.path.join(EXAMPLES_DIR, \"entity_relationship_extraction_news_devset.pkl\")\n", + "\n", + "trainset = pickle.load(open(entity_relationship_trainset_path, \"rb\"))\n", + "valset = pickle.load(open(entity_relationship_valset_path, \"rb\"))\n", + "devset = 
pickle.load(open(entity_relationship_devset_path, \"rb\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for example in trainset:\n", + " for relationship in example.relationships:\n", + " if relationship['order'] == 2:\n", + " print(relationship)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for example in trainset:\n", + " for relationship in example.relationships:\n", + " if relationship['order'] == 3:\n", + " print(relationship)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainset[0].relationships[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "valset[0].relationships[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for example in valset:\n", + " for relationship in example.relationships:\n", + " if relationship['order'] == 2:\n", + " print(relationship)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for example in valset:\n", + " for relationship in example.relationships:\n", + " if relationship['order'] == 3:\n", + " print(relationship)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "devset[0].relationships[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for example in devset:\n", + " for relationship in example.relationships:\n", + " if relationship['order'] == 2:\n", + " print(relationship)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for example in devset:\n", + " for relationship in example.relationships:\n", + " if 
relationship['order'] == 3:\n", + " print(relationship)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "extractor.predictor.predictor = Predict(StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given a text document that is potentially relevant to this activity and a list of entity types, \\nidentify all entities of those types from the text and all relationships among the identified entities.\\n\\nEntity Guidelines:\\n1. Each entity name should be an actual atomic word from the input text. \\n2. Avoid duplicates and generic terms.\\n3. Make sure descriptions are detailed and comprehensive. Use multiple complete sentences for each point below:\\n a). The entity\\'s role or significance in the context\\n b). Key attributes or characteristics\\n c). Relationships to other entities (if applicable)\\n d). Historical or cultural relevance (if applicable)\\n e). Any notable actions or events associated with the entity\\n4. All entity types from the text must be included. \\n5. IMPORTANT: Only use entity types from the provided \\'entity_types\\' list. Do not introduce new entity types.\\n\\nRelationship Guidelines:\\n1. Make sure relationship descriptions are detailed and comprehensive. Use multiple complete sentences for each point below:\\n a). The nature of the relationship (e.g., familial, professional, causal)\\n b). The impact or significance of the relationship on both entities\\n c). Any historical or contextual information relevant to the relationship\\n d). How the relationship evolved over time (if applicable)\\n e). Any notable events or actions that resulted from this relationship\\n2. Include direct relationships (order 1) as well as higher-order relationships (order 2 and 3):\\n a). Direct relationships: Immediate connections between entities.\\n b). 
Second-order relationships: Indirect effects or connections that result from direct relationships.\\n c). Third-order relationships: Further indirect effects that result from second-order relationships.\\n3. The \"src_id\" and \"tgt_id\" fields must exactly match entity names from the extracted entities list.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + "))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = TypedEntityRelationshipExtractor()\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/20 [00:00\n", + "#T_18824 th {\n", + " text-align: left;\n", + "}\n", + "#T_18824 td {\n", + " text-align: left;\n", + "}\n", + "#T_18824_row0_col0, #T_18824_row0_col1, #T_18824_row0_col2, #T_18824_row0_col3, #T_18824_row0_col4, #T_18824_row0_col5, #T_18824_row1_col0, #T_18824_row1_col1, #T_18824_row1_col2, #T_18824_row1_col3, #T_18824_row1_col4, #T_18824_row1_col5, #T_18824_row2_col0, #T_18824_row2_col1, #T_18824_row2_col2, #T_18824_row2_col3, 
#T_18824_row2_col4, #T_18824_row2_col5, #T_18824_row3_col0, #T_18824_row3_col1, #T_18824_row3_col2, #T_18824_row3_col3, #T_18824_row3_col4, #T_18824_row3_col5, #T_18824_row4_col0, #T_18824_row4_col1, #T_18824_row4_col2, #T_18824_row4_col3, #T_18824_row4_col4, #T_18824_row4_col5 {\n", + " text-align: left;\n", + " white-space: pre-wrap;\n", + " word-wrap: break-word;\n", + " max-width: 400px;\n", + "}\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 input_textexample_entitiesexample_relationshipspred_entitiespred_relationshipsentity_recall_metric
0As students from Marjory Stoneman Douglas High School confront lawmakers with demands to restrict sales of assault rifles, there were warnings by the president of...[{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where students confronted lawmakers about restricting sales of assault rifles.', 'importance_score':...[{'src_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'tgt_id': 'FLORIDA', 'description': 'Students from Marjory Stoneman Douglas High School are located in the state of Florida.', 'weight': 0.9,...[{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where students confronted lawmakers about assault rifle sales.', 'importance_score': 0.9}, {'entity_name':...[{'src_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'tgt_id': 'FLORIDA', 'description': 'Marjory Stoneman Douglas High School is located in Florida.', 'weight': 0.9, 'order': 1}, {'src_id': 'LAURENZO PRADO',...✔️ [0.6666666666666666]
1From ferrying people to and from their place of work to transporting nuclear waste and coal, railways are not only an integral part of 21st...[{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels whose clients include Deutsche Bahn and French rail operator SNCF.', 'importance_score': 0.9}, {'entity_name': 'DEUTSCHE...[{'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description': 'Railnova provides technology solutions to Deutsche Bahn.', 'weight': 0.9, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'SNCF', 'description': 'Railnova provides technology...[{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels whose clients include Deutsche Bahn and French rail operator SNCF.', 'importance_score': 0.9}, {'entity_name': 'DEUTSCHE...[{'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description': 'Railnova provides technology solutions to Deutsche Bahn.', 'weight': 0.8, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'SNCF', 'description': 'Railnova provides technology...✔️ [0.8888888888888888]
2Jan 22 (Reuters) - Shanghai Stock Exchange Filing * SHOWS BLOCK TRADE OF YONGHUI SUPERSTORES Co LTd's 166.3 MILLION SHARES INVOLVING 1.63 BILLION YUAN ($254.68...[{'entity_name': 'YONGHUI SUPERSTORES CO LTD', 'entity_type': 'ORGANIZATION', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'importance_score': 1.0}, {'entity_name':...[{'src_id': 'YONGHUI SUPERSTORES CO LTD', 'tgt_id': '166.3 MILLION SHARES', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'weight':...[{'entity_name': 'SHANGHAI STOCK EXCHANGE', 'entity_type': 'ORGANIZATION', 'description': 'The Shanghai Stock Exchange is a stock exchange located in Shanghai, China.', 'importance_score': 0.9}, {'entity_name': 'YONGHUI SUPERSTORES CO...[{'src_id': 'SHANGHAI STOCK EXCHANGE', 'tgt_id': 'YONGHUI SUPERSTORES CO LTD', 'description': 'The Shanghai Stock Exchange filed a block trade involving YONGHUI SUPERSTORES Co Ltd.', 'weight': 0.9,...✔️ [0.8]
3LONDON (Reuters) - Britain’s economy was weaker than previously thought in 2017, official data showed on Thursday, leaving the country lagging further behind the global...[{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy was weaker than previously thought in 2017.', 'importance_score': 0.9}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The...[{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.9, 'order': 1}, {'src_id': 'BANK OF ENGLAND', 'tgt_id': 'INTEREST RATES',...[{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy is discussed in the text.', 'importance_score': 1.0}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The political and...[{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.9, 'order': 1}, {'src_id': 'BRITAIN', 'tgt_id': 'BANK OF ENGLAND', 'description':...✔️ [1.0]
4Trump taps White House doctor as new VA secretary 2 Hours Ago CNBC's Kayla Tausche reports President Trump has tapped White House physician Rear Admiral...[{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'President Trump who tapped Ronny Jackson as new VA secretary.', 'importance_score': 1.0}, {'entity_name': 'WHITE HOUSE', 'entity_type': 'ORGANIZATION', 'description': 'The White...[{'src_id': 'TRUMP', 'tgt_id': 'RONNY JACKSON', 'description': 'President Trump taps Ronny Jackson as new VA secretary.', 'weight': 1.0, 'order': 1}, {'src_id': 'RONNY JACKSON', 'tgt_id': 'VA', 'description':...[{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'President Trump who tapped Ronny Jackson.', 'importance_score': 0.9}, {'entity_name': 'WHITE HOUSE', 'entity_type': 'ORGANIZATION', 'description': 'The White House where Ronny Jackson...[{'src_id': 'TRUMP', 'tgt_id': 'RONNY JACKSON', 'description': 'Trump taps Ronny Jackson.', 'weight': 0.9, 'order': 1}, {'src_id': 'RONNY JACKSON', 'tgt_id': 'DEPARTMENT OF VETERANS AFFAIRS', 'description': 'Ronny Jackson...✔️ [0.8571428571428571]
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " ... 15 more rows not displayed ...\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/20 [00:00\n", + "#T_7104e th {\n", + " text-align: left;\n", + "}\n", + "#T_7104e td {\n", + " text-align: left;\n", + "}\n", + "#T_7104e_row0_col0, #T_7104e_row0_col1, #T_7104e_row0_col2, #T_7104e_row0_col3, #T_7104e_row0_col4, #T_7104e_row0_col5, #T_7104e_row1_col0, #T_7104e_row1_col1, #T_7104e_row1_col2, #T_7104e_row1_col3, #T_7104e_row1_col4, #T_7104e_row1_col5, #T_7104e_row2_col0, #T_7104e_row2_col1, #T_7104e_row2_col2, #T_7104e_row2_col3, #T_7104e_row2_col4, #T_7104e_row2_col5, #T_7104e_row3_col0, #T_7104e_row3_col1, #T_7104e_row3_col2, #T_7104e_row3_col3, #T_7104e_row3_col4, #T_7104e_row3_col5, #T_7104e_row4_col0, #T_7104e_row4_col1, #T_7104e_row4_col2, #T_7104e_row4_col3, #T_7104e_row4_col4, #T_7104e_row4_col5 {\n", + " text-align: left;\n", + " white-space: pre-wrap;\n", + " word-wrap: break-word;\n", + " max-width: 400px;\n", + "}\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 input_textexample_entitiesexample_relationshipspred_entitiespred_relationshipsrelationships_similarity_metric
0As students from Marjory Stoneman Douglas High School confront lawmakers with demands to restrict sales of assault rifles, there were warnings by the president of...[{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where students confronted lawmakers about restricting sales of assault rifles.', 'importance_score':...[{'src_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'tgt_id': 'FLORIDA', 'description': 'Students from Marjory Stoneman Douglas High School are located in the state of Florida.', 'weight': 0.9,...[{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where students confronted lawmakers about assault rifle sales.', 'importance_score': 0.9}, {'entity_name':...[{'src_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'tgt_id': 'FLORIDA', 'description': 'Marjory Stoneman Douglas High School is located in Florida.', 'weight': 0.9, 'order': 1}, {'src_id': 'LAURENZO PRADO',...✔️ [0.45]
1From ferrying people to and from their place of work to transporting nuclear waste and coal, railways are not only an integral part of 21st...[{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels whose clients include Deutsche Bahn and French rail operator SNCF.', 'importance_score': 0.9}, {'entity_name': 'DEUTSCHE...[{'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description': 'Railnova provides technology solutions to Deutsche Bahn.', 'weight': 0.9, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'SNCF', 'description': 'Railnova provides technology...[{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels whose clients include Deutsche Bahn and French rail operator SNCF.', 'importance_score': 0.9}, {'entity_name': 'DEUTSCHE...[{'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description': 'Railnova provides technology solutions to Deutsche Bahn.', 'weight': 0.8, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'SNCF', 'description': 'Railnova provides technology...✔️ [0.75]
2Jan 22 (Reuters) - Shanghai Stock Exchange Filing * SHOWS BLOCK TRADE OF YONGHUI SUPERSTORES Co LTd's 166.3 MILLION SHARES INVOLVING 1.63 BILLION YUAN ($254.68...[{'entity_name': 'YONGHUI SUPERSTORES CO LTD', 'entity_type': 'ORGANIZATION', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'importance_score': 1.0}, {'entity_name':...[{'src_id': 'YONGHUI SUPERSTORES CO LTD', 'tgt_id': '166.3 MILLION SHARES', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'weight':...[{'entity_name': 'SHANGHAI STOCK EXCHANGE', 'entity_type': 'ORGANIZATION', 'description': 'The Shanghai Stock Exchange is a stock exchange located in Shanghai, China.', 'importance_score': 0.9}, {'entity_name': 'YONGHUI SUPERSTORES CO...[{'src_id': 'SHANGHAI STOCK EXCHANGE', 'tgt_id': 'YONGHUI SUPERSTORES CO LTD', 'description': 'The Shanghai Stock Exchange filed a block trade involving YONGHUI SUPERSTORES Co Ltd.', 'weight': 0.9,...✔️ [0.8]
3LONDON (Reuters) - Britain’s economy was weaker than previously thought in 2017, official data showed on Thursday, leaving the country lagging further behind the global...[{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy was weaker than previously thought in 2017.', 'importance_score': 0.9}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The...[{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.9, 'order': 1}, {'src_id': 'BANK OF ENGLAND', 'tgt_id': 'INTEREST RATES',...[{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy is discussed in the text.', 'importance_score': 1.0}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The political and...[{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.9, 'order': 1}, {'src_id': 'BRITAIN', 'tgt_id': 'BANK OF ENGLAND', 'description':...✔️ [0.2]
4Trump taps White House doctor as new VA secretary 2 Hours Ago CNBC's Kayla Tausche reports President Trump has tapped White House physician Rear Admiral...[{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'President Trump who tapped Ronny Jackson as new VA secretary.', 'importance_score': 1.0}, {'entity_name': 'WHITE HOUSE', 'entity_type': 'ORGANIZATION', 'description': 'The White...[{'src_id': 'TRUMP', 'tgt_id': 'RONNY JACKSON', 'description': 'President Trump taps Ronny Jackson as new VA secretary.', 'weight': 1.0, 'order': 1}, {'src_id': 'RONNY JACKSON', 'tgt_id': 'VA', 'description':...[{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'President Trump who tapped Ronny Jackson.', 'importance_score': 0.9}, {'entity_name': 'WHITE HOUSE', 'entity_type': 'ORGANIZATION', 'description': 'The White House where Ronny Jackson...[{'src_id': 'TRUMP', 'tgt_id': 'RONNY JACKSON', 'description': 'Trump taps Ronny Jackson.', 'weight': 0.9, 'order': 1}, {'src_id': 'RONNY JACKSON', 'tgt_id': 'DEPARTMENT OF VETERANS AFFAIRS', 'description': 'Ronny Jackson...✔️ [0.65]
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " ... 15 more rows not displayed ...\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "metrics = [entity_recall_metric, relationships_similarity_metric]\n", + "baseline_scores = []\n", + "for metric in metrics:\n", + " evaluate = Evaluate(\n", + " devset=devset[:20], \n", + " metric=metric, \n", + " num_threads=os.cpu_count(), \n", + " display_progress=True,\n", + " display_table=5,\n", + " )\n", + " baseline_scores.append(evaluate(model))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Assess the similarity between gold and predicted relationships:\n", + "1. Match relationships based on src_id and tgt_id pairs, allowing for slight variations in entity names.\n", + "2. For matched pairs, compare:\n", + " a) Description similarity (semantic meaning)\n", + " b) Weight similarity\n", + " c) Order similarity\n", + "3. Consider unmatched relationships as penalties.\n", + "4. Aggregate scores, accounting for precision and recall.\n", + "5. Return a final similarity score between 0 (no similarity) and 1 (perfect match).\n", + "\n", + "Key considerations:\n", + "- Prioritize matching based on entity pairs over exact string matches.\n", + "- Use semantic similarity for descriptions rather than exact matches.\n", + "- Weight the importance of different aspects (e.g., entity matching, description, weight, order).\n", + "- Balance the impact of matched and unmatched relationships in the final score.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "Gold Relationships: The gold-standard relationships to compare against.\n", + "\n", + "Predicted Relationships: The predicted relationships to compare against the gold-standard relationships.\n", + "\n", + "Reasoning: Let's think step by step in order to ${produce the similarity_score}. 
We ...\n", + "\n", + "Similarity Score: Similarity score between 0 and 1, with 1 being the highest similarity. (Respond with a single float value)\n", + "\n", + "---\n", + "\n", + "Gold Relationships: [{\"src_id\":\"INSTITUTE FOR FISCAL STUDIES\",\"tgt_id\":\"EUROPEAN UNION\",\"description\":\"The Institute for Fiscal Studies said cutting trade tariffs completely when leaving the European Union would reduce prices in British shops by only 1.2 percent.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"KLÉPIERRE SA\",\"tgt_id\":\"HAMMERSON PLC\",\"description\":\"Klépierre SA made a 4.9 billion pound takeover approach to Hammerson Plc.\",\"weight\":0.9,\"order\":1},{\"src_id\":\"SNAP INC\",\"tgt_id\":\"TWITTER\",\"description\":\"Snap Inc's Snapchat is so popular in Britain that its advertising revenue will overtake Twitter's UK revenue in 2019.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"PERSIMMON PLC\",\"tgt_id\":\"JEFF FAIRBURN\",\"description\":\"Persimmon Plc paid its Chief Executive, Jeff Fairburn, and two other executives a combined 104 million pounds last year.\",\"weight\":0.9,\"order\":1},{\"src_id\":\"SHERBORNE INVESTORS\",\"tgt_id\":\"BARCLAYS PLC\",\"description\":\"Sherborne Investors, led by British-born Edward Bramson, has taken a 5 percent stake in Barclays Plc.\",\"weight\":0.9,\"order\":1},{\"src_id\":\"ROYAL BANK OF SCOTLAND\",\"tgt_id\":\"MONZO\",\"description\":\"The Royal Bank of Scotland is working on secret plans to create a standalone digital bank to compete with emerging British fintech champions including Monzo.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"ROYAL BANK OF SCOTLAND\",\"tgt_id\":\"REVOLUT\",\"description\":\"The Royal Bank of Scotland is working on secret plans to create a standalone digital bank to compete with emerging British fintech champions including Revolut.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"HERSHEY CO\",\"tgt_id\":\"TYRRELLS\",\"description\":\"Hershey Co, the U.S. 
confectionery giant, is at the early stages of exploring options for the British crisps brand Tyrrells.\",\"weight\":0.7,\"order\":1},{\"src_id\":\"FINANCIAL REPORTING COUNCIL\",\"tgt_id\":\"CARILLION PLC\",\"description\":\"The Financial Reporting Council, UK's accountancy watchdog has launched an investigation into two former finance directors of collapsed construction firm Carillion Plc.\",\"weight\":0.9,\"order\":1},{\"src_id\":\"RAC FOUNDATION\",\"tgt_id\":\"CAR-OWNING HOUSEHOLDS\",\"description\":\"The RAC Foundation reported that running a car has become more than a third more expensive in one year for the poorest families.\",\"weight\":0.7,\"order\":1}]\n", + "\n", + "Predicted Relationships: [{\"src_id\":\"HAMMERSON PLC\",\"tgt_id\":\"KLÉPIERRE SA\",\"description\":\"Hammerson Plc received a takeover approach from Klépierre SA.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"SNAP INC\",\"tgt_id\":\"TWITTER\",\"description\":\"Snap Inc's Snapchat is expected to overtake Twitter's UK revenue.\",\"weight\":0.7,\"order\":1},{\"src_id\":\"PERSIMMON PLC\",\"tgt_id\":\"JEFF FAIRBURN\",\"description\":\"Persimmon Plc paid its Chief Executive, Jeff Fairburn, a large sum.\",\"weight\":0.7,\"order\":1},{\"src_id\":\"SHERBORNE INVESTORS\",\"tgt_id\":\"BARCLAYS PLC\",\"description\":\"Sherborne Investors took a 5 percent stake in Barclays Plc.\",\"weight\":0.7,\"order\":1},{\"src_id\":\"ROYAL BANK OF SCOTLAND\",\"tgt_id\":\"MONZO\",\"description\":\"Royal Bank of Scotland is competing with Monzo in the digital banking space.\",\"weight\":0.6,\"order\":1},{\"src_id\":\"ROYAL BANK OF SCOTLAND\",\"tgt_id\":\"REVOLUT\",\"description\":\"Royal Bank of Scotland is competing with Revolut in the digital banking space.\",\"weight\":0.6,\"order\":1},{\"src_id\":\"HERSHEY CO\",\"tgt_id\":\"TYRRELLS\",\"description\":\"Hershey Co is exploring options for the British crisps brand Tyrrells.\",\"weight\":0.6,\"order\":1},{\"src_id\":\"FINANCIAL REPORTING COUNCIL\",\"tgt_id\":\"CARILLION 
PLC\",\"description\":\"The Financial Reporting Council is investigating former finance directors of Carillion Plc.\",\"weight\":0.6,\"order\":1}]\n", + "\n", + "Please provide the output fields Reasoning then Similarity Score. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with the field Reasoning.\n", + "\n", + "\u001b[32mReasoning: Let's think step by step in order to produce the similarity_score. We start by matching relationships based on src_id and tgt_id pairs, allowing for slight variations in entity names. We then compare the matched pairs on description similarity (semantic meaning), weight similarity, and order similarity. Unmatched relationships are considered penalties. Finally, we aggregate scores, accounting for precision and recall, to return a final similarity score between 0 (no similarity) and 1 (perfect match).\n", + "\n", + "1. **Entity Pair Matching:**\n", + " - Matched pairs: \n", + " - (HAMMERSON PLC, KLÉPIERRE SA)\n", + " - (SNAP INC, TWITTER)\n", + " - (PERSIMMON PLC, JEFF FAIRBURN)\n", + " - (SHERBORNE INVESTORS, BARCLAYS PLC)\n", + " - (ROYAL BANK OF SCOTLAND, MONZO)\n", + " - (ROYAL BANK OF SCOTLAND, REVOLUT)\n", + " - (HERSHEY CO, TYRRELLS)\n", + " - (FINANCIAL REPORTING COUNCIL, CARILLION PLC)\n", + " - Unmatched pairs in gold: \n", + " - (INSTITUTE FOR FISCAL STUDIES, EUROPEAN UNION)\n", + " - (RAC FOUNDATION, CAR-OWNING HOUSEHOLDS)\n", + "\n", + "2. 
**Comparison of Matched Pairs:**\n", + " - **Description Similarity:**\n", + " - (HAMMERSON PLC, KLÉPIERRE SA): High similarity.\n", + " - (SNAP INC, TWITTER): High similarity.\n", + " - (PERSIMMON PLC, JEFF FAIRBURN): Moderate similarity.\n", + " - (SHERBORNE INVESTORS, BARCLAYS PLC): High similarity.\n", + " - (ROYAL BANK OF SCOTLAND, MONZO): Moderate similarity.\n", + " - (ROYAL BANK OF SCOTLAND, REVOLUT): Moderate similarity.\n", + " - (HERSHEY CO, TYRRELLS): High similarity.\n", + " - (FINANCIAL REPORTING COUNCIL, CARILLION PLC): High similarity.\n", + " - **Weight Similarity:**\n", + " - (HAMMERSON PLC, KLÉPIERRE SA): 0.9 (gold) vs 0.8 (predicted)\n", + " - (SNAP INC, TWITTER): 0.8 (gold) vs 0.7 (predicted)\n", + " - (PERSIMMON PLC, JEFF FAIRBURN): 0.9 (gold) vs 0.7 (predicted)\n", + " - (SHERBORNE INVESTORS, BARCLAYS PLC): 0.9 (gold) vs 0.7 (predicted)\n", + " - (ROYAL BANK OF SCOTLAND, MONZO): 0.8 (gold) vs 0.6 (predicted)\n", + " - (ROYAL BANK OF SCOTLAND, REVOLUT): 0.8 (gold) vs 0.6 (predicted)\n", + " - (HERSHEY CO, TYRRELLS): 0.7 (gold) vs 0.6 (predicted)\n", + " - (FINANCIAL REPORTING COUNCIL, CARILLION PLC): 0.9 (gold) vs 0.6 (predicted)\n", + " - **Order Similarity:**\n", + " - All matched pairs have the same order (1).\n", + "\n", + "3. **Penalties for Unmatched Relationships:**\n", + " - Two unmatched pairs in gold: (INSTITUTE FOR FISCAL STUDIES, EUROPEAN UNION) and (RAC FOUNDATION, CAR-OWNING HOUSEHOLDS).\n", + "\n", + "4. **Aggregation of Scores:**\n", + " - **Precision and Recall:**\n", + " - Precision: 8/8 (all predicted relationships are matched).\n", + " - Recall: 8/10 (8 out of 10 gold relationships are matched).\n", + " - **Weighted Score:**\n", + " - Description similarity: High for most pairs, moderate for some.\n", + " - Weight similarity: Varied, with some discrepancies.\n", + " - Order similarity: Perfect match.\n", + "\n", + "5. 
**Final Similarity Score:**\n", + " - Considering the high description similarity, moderate weight similarity, perfect order similarity, and the penalties for unmatched relationships, the final similarity score is calculated as follows:\n", + " - Precision: 1.0\n", + " - Recall: 0.8\n", + " - Weighted average considering the importance of each aspect: 0.85\n", + "\n", + "Similarity Score: 0.85\u001b[0m\n", + "\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "'\\n\\n\\nAssess the similarity between gold and predicted relationships:\\n1. Match relationships based on src_id and tgt_id pairs, allowing for slight variations in entity names.\\n2. For matched pairs, compare:\\n a) Description similarity (semantic meaning)\\n b) Weight similarity\\n c) Order similarity\\n3. Consider unmatched relationships as penalties.\\n4. Aggregate scores, accounting for precision and recall.\\n5. Return a final similarity score between 0 (no similarity) and 1 (perfect match).\\n\\nKey considerations:\\n- Prioritize matching based on entity pairs over exact string matches.\\n- Use semantic similarity for descriptions rather than exact matches.\\n- Weight the importance of different aspects (e.g., entity matching, description, weight, order).\\n- Balance the impact of matched and unmatched relationships in the final score.\\n\\n---\\n\\nFollow the following format.\\n\\nGold Relationships: The gold-standard relationships to compare against.\\n\\nPredicted Relationships: The predicted relationships to compare against the gold-standard relationships.\\n\\nReasoning: Let\\'s think step by step in order to ${produce the similarity_score}. We ...\\n\\nSimilarity Score: Similarity score between 0 and 1, with 1 being the highest similarity. 
(Respond with a single float value)\\n\\n---\\n\\nGold Relationships: [{\"src_id\":\"INSTITUTE FOR FISCAL STUDIES\",\"tgt_id\":\"EUROPEAN UNION\",\"description\":\"The Institute for Fiscal Studies said cutting trade tariffs completely when leaving the European Union would reduce prices in British shops by only 1.2 percent.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"KLÉPIERRE SA\",\"tgt_id\":\"HAMMERSON PLC\",\"description\":\"Klépierre SA made a 4.9 billion pound takeover approach to Hammerson Plc.\",\"weight\":0.9,\"order\":1},{\"src_id\":\"SNAP INC\",\"tgt_id\":\"TWITTER\",\"description\":\"Snap Inc\\'s Snapchat is so popular in Britain that its advertising revenue will overtake Twitter\\'s UK revenue in 2019.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"PERSIMMON PLC\",\"tgt_id\":\"JEFF FAIRBURN\",\"description\":\"Persimmon Plc paid its Chief Executive, Jeff Fairburn, and two other executives a combined 104 million pounds last year.\",\"weight\":0.9,\"order\":1},{\"src_id\":\"SHERBORNE INVESTORS\",\"tgt_id\":\"BARCLAYS PLC\",\"description\":\"Sherborne Investors, led by British-born Edward Bramson, has taken a 5 percent stake in Barclays Plc.\",\"weight\":0.9,\"order\":1},{\"src_id\":\"ROYAL BANK OF SCOTLAND\",\"tgt_id\":\"MONZO\",\"description\":\"The Royal Bank of Scotland is working on secret plans to create a standalone digital bank to compete with emerging British fintech champions including Monzo.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"ROYAL BANK OF SCOTLAND\",\"tgt_id\":\"REVOLUT\",\"description\":\"The Royal Bank of Scotland is working on secret plans to create a standalone digital bank to compete with emerging British fintech champions including Revolut.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"HERSHEY CO\",\"tgt_id\":\"TYRRELLS\",\"description\":\"Hershey Co, the U.S. 
confectionery giant, is at the early stages of exploring options for the British crisps brand Tyrrells.\",\"weight\":0.7,\"order\":1},{\"src_id\":\"FINANCIAL REPORTING COUNCIL\",\"tgt_id\":\"CARILLION PLC\",\"description\":\"The Financial Reporting Council, UK\\'s accountancy watchdog has launched an investigation into two former finance directors of collapsed construction firm Carillion Plc.\",\"weight\":0.9,\"order\":1},{\"src_id\":\"RAC FOUNDATION\",\"tgt_id\":\"CAR-OWNING HOUSEHOLDS\",\"description\":\"The RAC Foundation reported that running a car has become more than a third more expensive in one year for the poorest families.\",\"weight\":0.7,\"order\":1}]\\n\\nPredicted Relationships: [{\"src_id\":\"HAMMERSON PLC\",\"tgt_id\":\"KLÉPIERRE SA\",\"description\":\"Hammerson Plc received a takeover approach from Klépierre SA.\",\"weight\":0.8,\"order\":1},{\"src_id\":\"SNAP INC\",\"tgt_id\":\"TWITTER\",\"description\":\"Snap Inc\\'s Snapchat is expected to overtake Twitter\\'s UK revenue.\",\"weight\":0.7,\"order\":1},{\"src_id\":\"PERSIMMON PLC\",\"tgt_id\":\"JEFF FAIRBURN\",\"description\":\"Persimmon Plc paid its Chief Executive, Jeff Fairburn, a large sum.\",\"weight\":0.7,\"order\":1},{\"src_id\":\"SHERBORNE INVESTORS\",\"tgt_id\":\"BARCLAYS PLC\",\"description\":\"Sherborne Investors took a 5 percent stake in Barclays Plc.\",\"weight\":0.7,\"order\":1},{\"src_id\":\"ROYAL BANK OF SCOTLAND\",\"tgt_id\":\"MONZO\",\"description\":\"Royal Bank of Scotland is competing with Monzo in the digital banking space.\",\"weight\":0.6,\"order\":1},{\"src_id\":\"ROYAL BANK OF SCOTLAND\",\"tgt_id\":\"REVOLUT\",\"description\":\"Royal Bank of Scotland is competing with Revolut in the digital banking space.\",\"weight\":0.6,\"order\":1},{\"src_id\":\"HERSHEY CO\",\"tgt_id\":\"TYRRELLS\",\"description\":\"Hershey Co is exploring options for the British crisps brand Tyrrells.\",\"weight\":0.6,\"order\":1},{\"src_id\":\"FINANCIAL REPORTING COUNCIL\",\"tgt_id\":\"CARILLION 
PLC\",\"description\":\"The Financial Reporting Council is investigating former finance directors of Carillion Plc.\",\"weight\":0.6,\"order\":1}]\\n\\nPlease provide the output fields Reasoning then Similarity Score. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with the field Reasoning.\\n\\n\\x1b[32mReasoning: Let\\'s think step by step in order to produce the similarity_score. We start by matching relationships based on src_id and tgt_id pairs, allowing for slight variations in entity names. We then compare the matched pairs on description similarity (semantic meaning), weight similarity, and order similarity. Unmatched relationships are considered penalties. Finally, we aggregate scores, accounting for precision and recall, to return a final similarity score between 0 (no similarity) and 1 (perfect match).\\n\\n1. **Entity Pair Matching:**\\n - Matched pairs: \\n - (HAMMERSON PLC, KLÉPIERRE SA)\\n - (SNAP INC, TWITTER)\\n - (PERSIMMON PLC, JEFF FAIRBURN)\\n - (SHERBORNE INVESTORS, BARCLAYS PLC)\\n - (ROYAL BANK OF SCOTLAND, MONZO)\\n - (ROYAL BANK OF SCOTLAND, REVOLUT)\\n - (HERSHEY CO, TYRRELLS)\\n - (FINANCIAL REPORTING COUNCIL, CARILLION PLC)\\n - Unmatched pairs in gold: \\n - (INSTITUTE FOR FISCAL STUDIES, EUROPEAN UNION)\\n - (RAC FOUNDATION, CAR-OWNING HOUSEHOLDS)\\n\\n2. 
**Comparison of Matched Pairs:**\\n - **Description Similarity:**\\n - (HAMMERSON PLC, KLÉPIERRE SA): High similarity.\\n - (SNAP INC, TWITTER): High similarity.\\n - (PERSIMMON PLC, JEFF FAIRBURN): Moderate similarity.\\n - (SHERBORNE INVESTORS, BARCLAYS PLC): High similarity.\\n - (ROYAL BANK OF SCOTLAND, MONZO): Moderate similarity.\\n - (ROYAL BANK OF SCOTLAND, REVOLUT): Moderate similarity.\\n - (HERSHEY CO, TYRRELLS): High similarity.\\n - (FINANCIAL REPORTING COUNCIL, CARILLION PLC): High similarity.\\n - **Weight Similarity:**\\n - (HAMMERSON PLC, KLÉPIERRE SA): 0.9 (gold) vs 0.8 (predicted)\\n - (SNAP INC, TWITTER): 0.8 (gold) vs 0.7 (predicted)\\n - (PERSIMMON PLC, JEFF FAIRBURN): 0.9 (gold) vs 0.7 (predicted)\\n - (SHERBORNE INVESTORS, BARCLAYS PLC): 0.9 (gold) vs 0.7 (predicted)\\n - (ROYAL BANK OF SCOTLAND, MONZO): 0.8 (gold) vs 0.6 (predicted)\\n - (ROYAL BANK OF SCOTLAND, REVOLUT): 0.8 (gold) vs 0.6 (predicted)\\n - (HERSHEY CO, TYRRELLS): 0.7 (gold) vs 0.6 (predicted)\\n - (FINANCIAL REPORTING COUNCIL, CARILLION PLC): 0.9 (gold) vs 0.6 (predicted)\\n - **Order Similarity:**\\n - All matched pairs have the same order (1).\\n\\n3. **Penalties for Unmatched Relationships:**\\n - Two unmatched pairs in gold: (INSTITUTE FOR FISCAL STUDIES, EUROPEAN UNION) and (RAC FOUNDATION, CAR-OWNING HOUSEHOLDS).\\n\\n4. **Aggregation of Scores:**\\n - **Precision and Recall:**\\n - Precision: 8/8 (all predicted relationships are matched).\\n - Recall: 8/10 (8 out of 10 gold relationships are matched).\\n - **Weighted Score:**\\n - Description similarity: High for most pairs, moderate for some.\\n - Weight similarity: Varied, with some discrepancies.\\n - Order similarity: Perfect match.\\n\\n5. 
**Final Similarity Score:**\\n - Considering the high description similarity, moderate weight similarity, perfect order similarity, and the penalties for unmatched relationships, the final similarity score is calculated as follows:\\n - Precision: 1.0\\n - Recall: 0.8\\n - Weighted average considering the importance of each aspect: 0.85\\n\\nSimilarity Score: 0.85\\x1b[0m\\n\\n\\n'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "deepseek.inspect_history(n=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = BootstrapFewShotWithRandomSearch(\n", + " metric=relationships_similarity_metric, \n", + " num_threads=os.cpu_count(),\n", + " num_candidate_programs=10,\n", + " max_labeled_demos=5,\n", + " max_bootstrapped_demos=2,\n", + ")\n", + "rs_model = optimizer.compile(model, trainset=trainset, valset=valset)\n", + "rs_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = [entity_recall_metric, relationships_similarity_metric]\n", + "for metric in metrics:\n", + " evaluate = Evaluate(\n", + " devset=devset, \n", + " metric=metric, \n", + " num_threads=os.cpu_count(), \n", + " display_progress=True,\n", + " display_table=5,\n", + " )\n", + " evaluate(rs_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rs_model.save(entity_relationship_rs_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MIPROv2\n", + "- Let's take a small LLM to complete the entity relationship extraction, while making a larger LLM create the prompt instructions and few-shot examples.\n", + "- Feel free to generate sufficient amount of train dataset to avoid overfitting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[93m\u001b[1mWARNING: Projected Language Model (LM) Calls\u001b[0m\n", + "\n", + "Please be advised that based on the parameters you have set, the maximum number of LM calls is projected as follows:\n", + "\n", + "\n", + "\u001b[93m- Prompt Model: \u001b[94m\u001b[1m10\u001b[0m\u001b[93m data summarizer calls + \u001b[94m\u001b[1m10\u001b[0m\u001b[93m * \u001b[94m\u001b[1m1\u001b[0m\u001b[93m lm calls in program + (\u001b[94m\u001b[1m2\u001b[0m\u001b[93m) lm calls in program aware proposer = \u001b[94m\u001b[1m22\u001b[0m\u001b[93m prompt model calls\u001b[0m\n", + "\u001b[93m- Task Model: \u001b[94m\u001b[1m25\u001b[0m\u001b[93m examples in minibatch * \u001b[94m\u001b[1m20\u001b[0m\u001b[93m batches + \u001b[94m\u001b[1m50\u001b[0m\u001b[93m examples in train set * \u001b[94m\u001b[1m2\u001b[0m\u001b[93m full evals = \u001b[94m\u001b[1m600\u001b[0m\u001b[93m task model calls\u001b[0m\n", + "\n", + "\u001b[93m\u001b[1mEstimated Cost Calculation:\u001b[0m\n", + "\n", + "\u001b[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) \n", + " + (Number of calls to prompt model * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).\u001b[0m\n", + "\n", + "For a preliminary estimate of potential costs, we recommend you perform your own calculations based on the task\n", + "and prompt models you intend to use. 
If the projected costs exceed your budget or expectations, you may consider:\n", + "\n", + "\u001b[93m- Reducing the number of trials (`num_batches`), the size of the trainset, or the number of LM calls in your program.\u001b[0m\n", + "\u001b[93m- Using a cheaper task model to optimize the prompt.\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b: 10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b: 20\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b: 30\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b: 40\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "summary: Prediction(\n", + " summary='The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. 
Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.'\n", + ")\n", + "DATA SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/50 [00:00 reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. 
We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... ```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = 
dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, 
model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda 
x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, 
inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " 
signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + 
field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", + " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. 
\" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships 
if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and relationships from a given text input. It uses a pipeline that involves a language model to perform this extraction. The program works by defining a signature that specifies the input fields (input_text and entity_types) and the output fields (reasoning and entities_relationships). The TypedPredictor class ensures that the type annotations in the signature are enforced, and it handles retries and error explanations if the output is invalid. The TypedEntityRelationshipExtractor class integrates this predictor to extract entities and relationships from the input text, converting the results into a structured format. The program uses a combination of type checking, JSON schema validation, and language model predictions to achieve its task.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S): \n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. 
Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, meticulously extract and identify all entities and their relationships within the text. Ensure that each entity is accurately classified according to the provided entity types, and clearly define the relationships between entities, including their descriptions, weights, and orders. Provide a step-by-step reasoning process to justify the extraction and classification of each entity and relationship.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, meticulously extract and identify all entities and their relationships within the text. Ensure that each entity is accurately classified according to the provided entity types, and clearly define the relationships between entities, including their descriptions, weights, and orders. Provide a step-by-step reasoning process to justify the extraction and classification of each entity and relationship.\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and relationships from a given text input. It uses a pipeline that involves a language model to perform this extraction. The program works by defining a signature that specifies the input fields (input_text and entity_types) and the output fields (reasoning and entities_relationships). The TypedPredictor class ensures that the type annotations in the signature are enforced, and it handles retries and error explanations if the output is invalid. The TypedEntityRelationshipExtractor class integrates this predictor to extract entities and relationships from the input text, converting the results into a structured format. 
The program uses a combination of type checking, JSON schema validation, and language model predictions to achieve its task.\n", + "task_demos \n", + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + "MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. 
Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... 
```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ 
that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. 
For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda 
x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. 
Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to 
the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", 
+ " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. \" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": 
clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and relationships from a given text input. It uses a pipeline that involves a language model to perform this extraction. The program works by defining a signature that specifies the input fields (input_text and entity_types) and the output fields (reasoning and entities_relationships). The TypedPredictor class ensures that the type annotations in the signature are enforced, and it handles retries and error explanations if the output is invalid. The TypedEntityRelationshipExtractor class integrates this predictor to extract entities and relationships from the input text, converting the results into a structured format. 
The program uses a combination of type checking, JSON schema validation, and language model predictions to achieve its task.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S): \n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, meticulously extract and identify all entities and their relationships within the text. Ensure that each entity is accurately classified according to the provided entity types, and clearly define the relationships between entities, including their descriptions, weights, and orders. Provide a step-by-step reasoning process to justify the extraction and classification of each entity and relationship.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, meticulously extract and identify all entities and their relationships within the text. Ensure that each entity is accurately classified according to the provided entity types, and clearly define the relationships between entities, including their descriptions, weights, and orders. 
Provide a step-by-step reasoning process to justify the extraction and classification of each entity and relationship.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and relationships from input text. It works by first identifying entities in the text based on provided entity types, then determining the relationships between these entities considering their interactions and dependencies within the context of the text. Finally, it formats the entities and relationships according to a specified JSON schema. The program uses a pipeline that includes a language model to perform these tasks, ensuring type annotations and handling retries and errors in the extraction process.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "task_demos Input Text: Melbourne, Australia (CNN)After spending part of the off-season training with Mike Tyson, Serena Williams is hoping to deliver a knockout punch at the Australian Open. Follow @cnnsport\n", + "\n", + "For Williams that would mean winning a record-tying 24th grand slam title, which has so far proved elusive despite getting close four times since returning to the tour after giving birth to daughter Alexis Olympia. Her preparation for the year's first major couldn't have gone much better, suggesting the mini grand slam drought for arguably tennis' greatest ever player is about to cease. 
Williams let rip into a punching bag in December -- drawing a compliment from former heavyweight boxing champion Tyson, whose daughter happens to be a budding tennis star -- and then won a buildup tournament in Auckland last week to incredibly land a title in a fourth straight decade. She also mingled with one of tennis' youngest sensations, Coco Gauff, as part of the training camp in Florida organized by her coach, Patrick Mouratoglou. JUST WATCHEDMouratoglou on Serena comeback and McEnroeReplayMore Videos ...MUST WATCHMouratoglou on Serena comeback and McEnroe 02:19Read MoreWilliams overcame singles opponents with differing styles, reached the doubles final with one of her best friends -- the soon-to-be retired Caroline Wozniacki -- and most crucially, ended a five-match losing streak in finals with her daughter and husband Alexis Ohanian looking on. No wonder the 38-year-old said following her straight-set victory over fellow American Jessica Pegula: \"It's pretty satisfying just to get a win in the final. That was really important for me. And I just want to build on it,\" added Williams, who donated her prize money check of $43,000 to bush fire relief efforts in Australia. \"It's just a step towards the next goal.\"Indeed. READ: Can Rafael Nadal match Roger Federer's all-time grand slam record?READ: Player brands Australian Open email a 'slap in the face'Eyes on bigger prizeYes, as nice as it was to be holding the winners' trophy in Auckland -- where Williams once hit 88 unforced errors in a loss to Madison Brengle -- she didn't make the long trip to simply prosper in New Zealand. The much bigger prize is the Australian Open, where Williams triumphed while in the early stages of pregnancy in 2017. If Williams makes the final in Melbourne -- and she might have to defeat the likes of twice grand slam winner Naomi Osaka and current world No. 
1 Ashleigh Barty along the way -- she will probably have to then defeat someone with a heftier reputation than the 66th-ranked Pegula. Helping Williams, however, is that one of the main contenders, Bianca Andreescu, isn't in Melbourne because of another injury, this time to a knee. But winning any final -- after losses in grand slam finals to Osaka, Andreescu, Angelique Kerber and Simona Halep and retiring against Andreescu in the Rogers Cup finale last August -- could potentially be turning point as Williams attempts to draw level with the grand slam haul of Australia's Margaret Court. JUST WATCHEDSerena Williams falls short in major title chaseReplayMore Videos ...MUST WATCHSerena Williams falls short in major title chase 01:00\"Serena, she certainly looks hungry, and I think she's got a little momentum going into the Australian Open,\" Chris Evert, the 18-time grand slam champion, told reporters in an ESPN conference call last week. \"And it would probably be the least pressure, this grand slam, to win for her. \"I think every other tournament, the French Open, the clay isn't her best surface. Wimbledon is a lot of pressure, US Open is a lot of pressure. \"This one, the first one of the year, it's a 'Happy Slam,'\" referring to the Australian Open's nickname. \"I think if she just takes a little bit of pressure off herself and she can just play her brand of tennis, I think she's got a good shot at winning it.\"She's better at grand slams than any other player when you look at the last two years.\"The way Wozniacki put it, Williams has a \"big chance\" to match Court. pic.twitter.com/skoZilynH8— Serena Williams (@serenawilliams) January 12, 2020 Other high-profile players, such as Halep and former world No. 
1 Karolina Pliskova -- who saved four match points last year in Melbourne against Williams and rallied from 5-1 down in the third set after the American rolled her ankle -- aren't discounting her chances, either, despite just falling short recently at grand slams. \"I'm very impressed about her, that she keeps playing at this level, with being a mother and also being a little bit older than us,\" said Halep, who made a mere three unforced errors in downing Williams in the 2019 Wimbledon final. \"It's impressive what she does.\"Challenges aheadBut Evert also underscored the stiffer challenges facing Williams. \"There are some darned good players out there that I have a lot of respect for, and the way that Osaka, Barty, Pliskova, Halep ... there's not one or two threats to Serena, there's probably about eight threats, eight players that can probably do some damage and that can compete against her.\"And one more thing is -- I always felt this way -- the older you get, I think the more bad days you may have, days when you feel burned out, days when you don't want to get out of bed, days when you don't have incentive. You don't want to have that day during a grand slam, but sometimes you can't help it.\"JUST WATCHEDSimona Halep on winning her first Wimbledon titleReplayMore Videos ...MUST WATCHSimona Halep on winning her first Wimbledon title 01:56Visit our tennis page for more news and videosWilliams figures to still be around towards the end of the Australian Open, but whether it is as the last person standing continues to be the question. 
\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Melbourne\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A city in Australia where the Australian Open is held.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Australia\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The country where Melbourne is located and where the Australian Open takes place.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Serena Williams\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player aiming to win the Australian Open.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + 
"\"entity_name\": \"Mike Tyson\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A former heavyweight boxing champion who trained with Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Australian Open\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A major tennis tournament held annually in Melbourne, Australia.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Alexis Olympia\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The daughter of Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Caroline Wozniacki\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player and friend of Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Coco Gauff\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A young tennis sensation who trained with Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Patrick Mouratoglou\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The coach of Serena Williams who organized the training camp.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Naomi Osaka\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player and grand slam winner who could face Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ashleigh Barty\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + 
"\"description\": \"The current world No. 1 tennis player who could face Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Bianca Andreescu\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player who is a main contender but is not in Melbourne due to injury.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Simona Halep\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player who defeated Serena Williams in the 2019 Wimbledon final.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Chris Evert\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"An 18-time grand slam champion who commented on Serena Williams' chances at the Australian Open.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Margaret Court\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"An Australian tennis player with a record-tying 24 grand slam titles.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Jessica Pegula\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A fellow American tennis player who Serena Williams defeated in a match.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Alexis Ohanian\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The husband of Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Karolina Pliskova\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + 
"\"description\": \"A former world No. 1 tennis player who saved match points against Serena Williams in Melbourne.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Mike Tyson\",\n", + "\n", + "\"description\": \"Serena Williams trained with Mike Tyson during the off-season.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Australian Open\",\n", + "\n", + "\"description\": \"Serena Williams is aiming to win the Australian Open.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Alexis Olympia\",\n", + "\n", + "\"description\": \"Serena Williams is the mother of Alexis Olympia.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Caroline Wozniacki\",\n", + "\n", + "\"description\": \"Serena Williams reached the doubles final with Caroline Wozniacki.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Coco Gauff\",\n", + "\n", + "\"description\": \"Serena Williams trained with Coco Gauff.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Patrick Mouratoglou\",\n", + "\n", + "\"description\": \"Patrick Mouratoglou is the coach of Serena Williams.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Naomi Osaka\",\n", + "\n", + 
"\"description\": \"Serena Williams might have to defeat Naomi Osaka to reach the final of the Australian Open.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Ashleigh Barty\",\n", + "\n", + "\"description\": \"Serena Williams might have to defeat Ashleigh Barty to reach the final of the Australian Open.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Bianca Andreescu\",\n", + "\n", + "\"description\": \"Bianca Andreescu is a main contender but is not in Melbourne due to injury.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Simona Halep\",\n", + "\n", + "\"description\": \"Simona Halep defeated Serena Williams in the 2019 Wimbledon final.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Chris Evert\",\n", + "\n", + "\"description\": \"Chris Evert commented on Serena Williams' chances at the Australian Open.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Margaret Court\",\n", + "\n", + "\"description\": \"Serena Williams is attempting to match Margaret Court's record of 24 grand slam titles.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Jessica Pegula\",\n", + "\n", + "\"description\": \"Serena Williams defeated Jessica Pegula in a match.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + 
"},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Alexis Ohanian\",\n", + "\n", + "\"description\": \"Alexis Ohanian is the husband of Serena Williams.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Karolina Pliskova\",\n", + "\n", + "\"description\": \"Karolina Pliskova saved match points against Serena Williams in Melbourne.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)Neymar surpassed Ronaldo and moved to second on Brazil's all-time goalscoring list with a hat-trick in a 4-2 win against Peru in World Cup qualifying.Two penalties and a last-minute tap-in took the Paris Saint-Germain forward's tally to 64 for his country, two ahead of Ronaldo and 13 behind Pele's all-time record of 77.Everton's Richarlison scored Brazil's other goal as the five-time world champion twice had to come from behind to secure the win.Brazil head coach Tite told reporters after the match that it was \"unfair\" to compare Neymar and Ronaldo.\"What I can say is Neymar has this unpredictability,\" he said. \"He is the bow and the arrow, he's a player who both makes and takes chances. And he gets better and better, and more mature.\"Read MoreNeymar celebrates after completing his hat-trick against Peru.The match had a number of controversial VAR moments, particularly the decision to award Neymar a second penalty late in the game with the score tied at 2-2, a decision which surprised even the Brazilian players.Peru's anger was compounded in stoppage time when Carlos Zambrano was shown a red card for an elbow on Richarlison. 
The Everton forward had escaped punishment for a similar incident earlier in the match that left Peru's Miguel Trauco with a bloody wound above his eye.The latest incidents added to a growing list of decisions that have left South American fans scratching their heads during this international break, the first time VAR has been used in CONMEBOL World Cup qualifiersBrazil's victory ensures it begins the grueling 18-month campaign with a 100% record to top the standings on six points along with Argentina, which secured an impressive 2-1 win in the altitude of La Paz, Bolivia earlier in the day.The top four teams will qualify automatically for Qatar 2022, with the fifth-placed team competing in a two-legged playoff against a country from another continent.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and the context provided in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Neymar\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Brazilian footballer who surpassed Ronaldo and moved to second on Brazil's all-time goalscoring list with a hat-trick in a 4-2 win against Peru in World Cup qualifying.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ronaldo\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A former Brazilian footballer who was previously second on Brazil's all-time goalscoring list.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Pele\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A legendary Brazilian footballer who holds the all-time record for Brazil with 77 goals.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Brazil\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A country where the football match took place and where Neymar and Ronaldo are from.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Peru\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The opposing country in the football match against Brazil.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"World Cup\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"An international football competition where the qualifying match between Brazil and Peru took place.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + 
"{\n", + "\n", + "\"src_id\": \"Neymar\",\n", + "\n", + "\"tgt_id\": \"Ronaldo\",\n", + "\n", + "\"description\": \"Neymar surpassed Ronaldo on Brazil's all-time goalscoring list.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Neymar\",\n", + "\n", + "\"tgt_id\": \"Pele\",\n", + "\n", + "\"description\": \"Neymar is 13 goals behind Pele's all-time record for Brazil.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Brazil\",\n", + "\n", + "\"tgt_id\": \"Peru\",\n", + "\n", + "\"description\": \"Brazil won a football match against Peru in World Cup qualifying.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)The New York City Marathon returns on Sunday following a two-year hiatus, and at the front of the field, legendary distance runner Kenenisa Bekele is primed to \"make more history\" in his storied career. Bekele, a three-time Olympic gold medalist over 5,000 and 10,000 meters, is the headline name competing in the men's elite field in New York. With his only previous marathon in the United States a fourth-place finish in Chicago seven years ago, the Ethiopian is hungry for success as he prepares to take to the streets of the Big Apple.\"Really, I wanted to have a good result in the USA, that's why I chose the New York marathon,\" Bekele told reporters this week.\"The New York marathon is big publicity and a really big marathon race. To achieve a good result in this marathon would be perfect, and for me, it's also really good to make more history in sports.\"Read MoreBekele has a point to prove after a disappointing outing six weeks ago at the Berlin Marathon -- the race at which he came within two seconds of Eliud Kipchoge's world record time two years ago. 
Bekele catches his breath after this year's Berlin Marathon. Despite expectations that he could challenge Kipchoge's record of two hours, one minute and 39 seconds in Berlin this year, Bekele says he fell foul to a poor night's sleep as he finished third, more than a minute behind winner Guye Adola.\"I was not ready for that race,\" Bekele explained. \"A few weeks before the race, I was not really confident. It's a little bit tough for me because the day before also I couldn't sleep well, I really had bad luck the day before in the night. That also made me very tired.\"I've recovered well ... At this moment, I'm really strong. I hope I'll achieve a good result on Sunday.\"For race organizers, who called off last year's New York City Marathon amid the pandemic, it is a boon to have a figure like Bekele on the start line on Sunday.On top of his three Olympic gold medals -- including a double in the 5,000 and 10,000 meters at Beijing 2008 -- and one silver medal, he also claimed five world athletics championship titles between 2003 to 2009. Bekele's 5,000m world record stood for 16 years before it was broken by Uganda's Joshua Cheptegei last year, while his 10,000m record stood for 15 before also being broken last year by Cheptegei. Eliud Kipchoge: Marathon world record holder has 'the qualities of an ascetic monk'Many consider Bekele to be the greatest male distance runner of all time, such has been his supremacy across multiple distances over the past two decades; others point towards Kenya's Kipchoge, who has been so dominant over the 26.2 miles of a marathon and is the first man to break the distance's two-hour barrier, albeit in unofficial conditions. \"I still feel that I am the best and better than anyone,\" Bekele told Sports Illustrated in August after it was announced that he would make his debut in New York. 
\"I think every athlete and others should think like that.\"Having struggled with injuries and form in recent years, Bekele now has a second opportunity to win a major marathon in the US. There is little chance that he will be able to come close to his best marathon time in New York; unlike Berlin, the course, which undulates through the city's five boroughs, rarely produces fast performances.\"I know the course is tough and there are also no pacemakers in the race. It's more about concentration and a tactical race,\" said Bekele. He will be up against the Netherlands' Adbi Nageeye, who won the silver medal at the Olympics in Sapporo three months ago, and the half marathon world record holder Kibiwott Kandie of Kenya, who is making his marathon debut. The women's race is headlined by Kenya's Peres Jepchirchir, the gold medalist at the Olympics. She will be joined by the USA's Olympic bronze medalist Molly Seidel. Seidel poses with her bronze medal from the Tokyo Olympics. Visit CNN.com/sport for more news, videos and featuresAnother athlete to keep an eye on is US star Shalane Flanagan as she attempts to run six marathons in 42 days. Having already completed marathons in Berlin, London, Chicago, Boston and Portland -- all between times of 2:35:14 and 2:46:39 -- Flanagan will aim to complete her challenge in New York. And as for Bekele, like so many other runners who will take to the start line on Sunday, he's grateful to be able to race in New York following the pandemic and last year's cancellation.\"To see the people of New York participate in this race again -- to see this for me is fantastic,\" he said. 
\"I want to wish all participants good luck.\"\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Kenenisa Bekele\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Legendary distance runner and three-time Olympic gold medalist over 5,000 and 10,000 meters.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"New York City Marathon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Annual marathon in New York City, returning after a two-year hiatus due to the pandemic.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Eliud Kipchoge\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Marathon world record holder and dominant runner over the 26.2 miles of a marathon.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Berlin Marathon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Annual marathon in Berlin, where Kenenisa Bekele had a disappointing outing six weeks ago.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Guye Adola\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Winner of the Berlin Marathon, finishing more than a minute ahead of Kenenisa Bekele.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Adbi Nageeye\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Dutch runner who won the silver medal at the Olympics in Sapporo and will compete in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + 
"{\n", + "\n", + "\"entity_name\": \"Kibiwott Kandie\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Half marathon world record holder from Kenya, making his marathon debut in New York.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Peres Jepchirchir\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Kenyan runner and gold medalist at the Olympics, headlining the women's race in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Molly Seidel\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"American runner and Olympic bronze medalist, competing in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Shalane Flanagan\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"US star attempting to run six marathons in 42 days, including the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"New York City Marathon\",\n", + "\n", + "\"description\": \"Kenenisa Bekele is primed to compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Eliud Kipchoge\",\n", + "\n", + "\"description\": \"Kenenisa Bekele aims to make more history in his storied career, comparing himself to Eliud Kipchoge.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Berlin Marathon\",\n", + "\n", + "\"description\": \"Kenenisa Bekele had a 
disappointing outing at the Berlin Marathon six weeks ago.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Guye Adola\",\n", + "\n", + "\"description\": \"Kenenisa Bekele finished third in the Berlin Marathon, more than a minute behind winner Guye Adola.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Adbi Nageeye\",\n", + "\n", + "\"description\": \"Adbi Nageeye will compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Kibiwott Kandie\",\n", + "\n", + "\"description\": \"Kibiwott Kandie will make his marathon debut in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Peres Jepchirchir\",\n", + "\n", + "\"description\": \"Peres Jepchirchir will headline the women's race in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Molly Seidel\",\n", + "\n", + "\"description\": \"Molly Seidel will compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Shalane Flanagan\",\n", + "\n", + "\"description\": \"Shalane Flanagan will aim to complete her challenge of running six marathons in 42 days in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + 
"\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + "MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. 
Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... 
```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ 
that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. 
For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda 
x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. 
Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to 
the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", 
+ " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. \" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": 
clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and relationships from input text. It works by first identifying entities in the text based on provided entity types, then determining the relationships between these entities considering their interactions and dependencies within the context of the text. Finally, it formats the entities and relationships according to a specified JSON schema. The program uses a pipeline that includes a language model to perform these tasks, ensuring type annotations and handling retries and errors in the extraction process.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S):\n", + "Input Text: Melbourne, Australia (CNN)After spending part of the off-season training with Mike Tyson, Serena Williams is hoping to deliver a knockout punch at the Australian Open. 
Follow @cnnsport\n", + "\n", + "For Williams that would mean winning a record-tying 24th grand slam title, which has so far proved elusive despite getting close four times since returning to the tour after giving birth to daughter Alexis Olympia. Her preparation for the year's first major couldn't have gone much better, suggesting the mini grand slam drought for arguably tennis' greatest ever player is about to cease. Williams let rip into a punching bag in December -- drawing a compliment from former heavyweight boxing champion Tyson, whose daughter happens to be a budding tennis star -- and then won a buildup tournament in Auckland last week to incredibly land a title in a fourth straight decade. She also mingled with one of tennis' youngest sensations, Coco Gauff, as part of the training camp in Florida organized by her coach, Patrick Mouratoglou. JUST WATCHEDMouratoglou on Serena comeback and McEnroeReplayMore Videos ...MUST WATCHMouratoglou on Serena comeback and McEnroe 02:19Read MoreWilliams overcame singles opponents with differing styles, reached the doubles final with one of her best friends -- the soon-to-be retired Caroline Wozniacki -- and most crucially, ended a five-match losing streak in finals with her daughter and husband Alexis Ohanian looking on. No wonder the 38-year-old said following her straight-set victory over fellow American Jessica Pegula: \"It's pretty satisfying just to get a win in the final. That was really important for me. And I just want to build on it,\" added Williams, who donated her prize money check of $43,000 to bush fire relief efforts in Australia. \"It's just a step towards the next goal.\"Indeed. 
READ: Can Rafael Nadal match Roger Federer's all-time grand slam record?READ: Player brands Australian Open email a 'slap in the face'Eyes on bigger prizeYes, as nice as it was to be holding the winners' trophy in Auckland -- where Williams once hit 88 unforced errors in a loss to Madison Brengle -- she didn't make the long trip to simply prosper in New Zealand. The much bigger prize is the Australian Open, where Williams triumphed while in the early stages of pregnancy in 2017. If Williams makes the final in Melbourne -- and she might have to defeat the likes of twice grand slam winner Naomi Osaka and current world No. 1 Ashleigh Barty along the way -- she will probably have to then defeat someone with a heftier reputation than the 66th-ranked Pegula. Helping Williams, however, is that one of the main contenders, Bianca Andreescu, isn't in Melbourne because of another injury, this time to a knee. But winning any final -- after losses in grand slam finals to Osaka, Andreescu, Angelique Kerber and Simona Halep and retiring against Andreescu in the Rogers Cup finale last August -- could potentially be turning point as Williams attempts to draw level with the grand slam haul of Australia's Margaret Court. JUST WATCHEDSerena Williams falls short in major title chaseReplayMore Videos ...MUST WATCHSerena Williams falls short in major title chase 01:00\"Serena, she certainly looks hungry, and I think she's got a little momentum going into the Australian Open,\" Chris Evert, the 18-time grand slam champion, told reporters in an ESPN conference call last week. \"And it would probably be the least pressure, this grand slam, to win for her. \"I think every other tournament, the French Open, the clay isn't her best surface. Wimbledon is a lot of pressure, US Open is a lot of pressure. \"This one, the first one of the year, it's a 'Happy Slam,'\" referring to the Australian Open's nickname. 
\"I think if she just takes a little bit of pressure off herself and she can just play her brand of tennis, I think she's got a good shot at winning it.\"She's better at grand slams than any other player when you look at the last two years.\"The way Wozniacki put it, Williams has a \"big chance\" to match Court. pic.twitter.com/skoZilynH8— Serena Williams (@serenawilliams) January 12, 2020 Other high-profile players, such as Halep and former world No. 1 Karolina Pliskova -- who saved four match points last year in Melbourne against Williams and rallied from 5-1 down in the third set after the American rolled her ankle -- aren't discounting her chances, either, despite just falling short recently at grand slams. \"I'm very impressed about her, that she keeps playing at this level, with being a mother and also being a little bit older than us,\" said Halep, who made a mere three unforced errors in downing Williams in the 2019 Wimbledon final. \"It's impressive what she does.\"Challenges aheadBut Evert also underscored the stiffer challenges facing Williams. \"There are some darned good players out there that I have a lot of respect for, and the way that Osaka, Barty, Pliskova, Halep ... there's not one or two threats to Serena, there's probably about eight threats, eight players that can probably do some damage and that can compete against her.\"And one more thing is -- I always felt this way -- the older you get, I think the more bad days you may have, days when you feel burned out, days when you don't want to get out of bed, days when you don't have incentive. 
You don't want to have that day during a grand slam, but sometimes you can't help it.\"JUST WATCHEDSimona Halep on winning her first Wimbledon titleReplayMore Videos ...MUST WATCHSimona Halep on winning her first Wimbledon title 01:56Visit our tennis page for more news and videosWilliams figures to still be around towards the end of the Australian Open, but whether it is as the last person standing continues to be the question. \n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Melbourne\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A city in Australia where the Australian Open is held.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Australia\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The country where Melbourne is located and where the Australian Open takes place.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Serena Williams\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player aiming to win the Australian Open.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Mike Tyson\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A former heavyweight boxing champion who trained with Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Australian Open\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A major tennis tournament held annually in Melbourne, Australia.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Alexis Olympia\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The daughter of Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Caroline Wozniacki\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player and friend of Serena 
Williams.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Coco Gauff\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A young tennis sensation who trained with Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Patrick Mouratoglou\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The coach of Serena Williams who organized the training camp.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Naomi Osaka\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player and grand slam winner who could face Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ashleigh Barty\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The current world No. 
1 tennis player who could face Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Bianca Andreescu\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player who is a main contender but is not in Melbourne due to injury.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Simona Halep\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player who defeated Serena Williams in the 2019 Wimbledon final.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Chris Evert\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"An 18-time grand slam champion who commented on Serena Williams' chances at the Australian Open.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Margaret Court\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"An Australian tennis player with a record-tying 24 grand slam titles.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Jessica Pegula\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A fellow American tennis player who Serena Williams defeated in a match.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Alexis Ohanian\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The husband of Serena Williams.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Karolina Pliskova\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A former world No. 
1 tennis player who saved match points against Serena Williams in Melbourne.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Mike Tyson\",\n", + "\n", + "\"description\": \"Serena Williams trained with Mike Tyson during the off-season.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Australian Open\",\n", + "\n", + "\"description\": \"Serena Williams is aiming to win the Australian Open.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Alexis Olympia\",\n", + "\n", + "\"description\": \"Serena Williams is the mother of Alexis Olympia.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Caroline Wozniacki\",\n", + "\n", + "\"description\": \"Serena Williams reached the doubles final with Caroline Wozniacki.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Coco Gauff\",\n", + "\n", + "\"description\": \"Serena Williams trained with Coco Gauff.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Patrick Mouratoglou\",\n", + "\n", + "\"description\": \"Patrick Mouratoglou is the coach of Serena Williams.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Naomi Osaka\",\n", + "\n", + "\"description\": \"Serena Williams might have 
to defeat Naomi Osaka to reach the final of the Australian Open.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Ashleigh Barty\",\n", + "\n", + "\"description\": \"Serena Williams might have to defeat Ashleigh Barty to reach the final of the Australian Open.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Bianca Andreescu\",\n", + "\n", + "\"description\": \"Bianca Andreescu is a main contender but is not in Melbourne due to injury.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Simona Halep\",\n", + "\n", + "\"description\": \"Simona Halep defeated Serena Williams in the 2019 Wimbledon final.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Chris Evert\",\n", + "\n", + "\"description\": \"Chris Evert commented on Serena Williams' chances at the Australian Open.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Margaret Court\",\n", + "\n", + "\"description\": \"Serena Williams is attempting to match Margaret Court's record of 24 grand slam titles.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Jessica Pegula\",\n", + "\n", + "\"description\": \"Serena Williams defeated Jessica Pegula in a match.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + 
"\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Alexis Ohanian\",\n", + "\n", + "\"description\": \"Alexis Ohanian is the husband of Serena Williams.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Serena Williams\",\n", + "\n", + "\"tgt_id\": \"Karolina Pliskova\",\n", + "\n", + "\"description\": \"Karolina Pliskova saved match points against Serena Williams in Melbourne.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)Neymar surpassed Ronaldo and moved to second on Brazil's all-time goalscoring list with a hat-trick in a 4-2 win against Peru in World Cup qualifying.Two penalties and a last-minute tap-in took the Paris Saint-Germain forward's tally to 64 for his country, two ahead of Ronaldo and 13 behind Pele's all-time record of 77.Everton's Richarlison scored Brazil's other goal as the five-time world champion twice had to come from behind to secure the win.Brazil head coach Tite told reporters after the match that it was \"unfair\" to compare Neymar and Ronaldo.\"What I can say is Neymar has this unpredictability,\" he said. \"He is the bow and the arrow, he's a player who both makes and takes chances. And he gets better and better, and more mature.\"Read MoreNeymar celebrates after completing his hat-trick against Peru.The match had a number of controversial VAR moments, particularly the decision to award Neymar a second penalty late in the game with the score tied at 2-2, a decision which surprised even the Brazilian players.Peru's anger was compounded in stoppage time when Carlos Zambrano was shown a red card for an elbow on Richarlison. 
The Everton forward had escaped punishment for a similar incident earlier in the match that left Peru's Miguel Trauco with a bloody wound above his eye.The latest incidents added to a growing list of decisions that have left South American fans scratching their heads during this international break, the first time VAR has been used in CONMEBOL World Cup qualifiersBrazil's victory ensures it begins the grueling 18-month campaign with a 100% record to top the standings on six points along with Argentina, which secured an impressive 2-1 win in the altitude of La Paz, Bolivia earlier in the day.The top four teams will qualify automatically for Qatar 2022, with the fifth-placed team competing in a two-legged playoff against a country from another continent.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and the context provided in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Neymar\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Brazilian footballer who surpassed Ronaldo and moved to second on Brazil's all-time goalscoring list with a hat-trick in a 4-2 win against Peru in World Cup qualifying.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ronaldo\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A former Brazilian footballer who was previously second on Brazil's all-time goalscoring list.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Pele\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A legendary Brazilian footballer who holds the all-time record for Brazil with 77 goals.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Brazil\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A country where the football match took place and where Neymar and Ronaldo are from.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Peru\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The opposing country in the football match against Brazil.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"World Cup\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"An international football competition where the qualifying match between Brazil and Peru took place.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + 
"{\n", + "\n", + "\"src_id\": \"Neymar\",\n", + "\n", + "\"tgt_id\": \"Ronaldo\",\n", + "\n", + "\"description\": \"Neymar surpassed Ronaldo on Brazil's all-time goalscoring list.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Neymar\",\n", + "\n", + "\"tgt_id\": \"Pele\",\n", + "\n", + "\"description\": \"Neymar is 13 goals behind Pele's all-time record for Brazil.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Brazil\",\n", + "\n", + "\"tgt_id\": \"Peru\",\n", + "\n", + "\"description\": \"Brazil won a football match against Peru in World Cup qualifying.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)The New York City Marathon returns on Sunday following a two-year hiatus, and at the front of the field, legendary distance runner Kenenisa Bekele is primed to \"make more history\" in his storied career. Bekele, a three-time Olympic gold medalist over 5,000 and 10,000 meters, is the headline name competing in the men's elite field in New York. With his only previous marathon in the United States a fourth-place finish in Chicago seven years ago, the Ethiopian is hungry for success as he prepares to take to the streets of the Big Apple.\"Really, I wanted to have a good result in the USA, that's why I chose the New York marathon,\" Bekele told reporters this week.\"The New York marathon is big publicity and a really big marathon race. To achieve a good result in this marathon would be perfect, and for me, it's also really good to make more history in sports.\"Read MoreBekele has a point to prove after a disappointing outing six weeks ago at the Berlin Marathon -- the race at which he came within two seconds of Eliud Kipchoge's world record time two years ago. 
Bekele catches his breath after this year's Berlin Marathon. Despite expectations that he could challenge Kipchoge's record of two hours, one minute and 39 seconds in Berlin this year, Bekele says he fell foul to a poor night's sleep as he finished third, more than a minute behind winner Guye Adola.\"I was not ready for that race,\" Bekele explained. \"A few weeks before the race, I was not really confident. It's a little bit tough for me because the day before also I couldn't sleep well, I really had bad luck the day before in the night. That also made me very tired.\"I've recovered well ... At this moment, I'm really strong. I hope I'll achieve a good result on Sunday.\"For race organizers, who called off last year's New York City Marathon amid the pandemic, it is a boon to have a figure like Bekele on the start line on Sunday.On top of his three Olympic gold medals -- including a double in the 5,000 and 10,000 meters at Beijing 2008 -- and one silver medal, he also claimed five world athletics championship titles between 2003 to 2009. Bekele's 5,000m world record stood for 16 years before it was broken by Uganda's Joshua Cheptegei last year, while his 10,000m record stood for 15 before also being broken last year by Cheptegei. Eliud Kipchoge: Marathon world record holder has 'the qualities of an ascetic monk'Many consider Bekele to be the greatest male distance runner of all time, such has been his supremacy across multiple distances over the past two decades; others point towards Kenya's Kipchoge, who has been so dominant over the 26.2 miles of a marathon and is the first man to break the distance's two-hour barrier, albeit in unofficial conditions. \"I still feel that I am the best and better than anyone,\" Bekele told Sports Illustrated in August after it was announced that he would make his debut in New York. 
\"I think every athlete and others should think like that.\"Having struggled with injuries and form in recent years, Bekele now has a second opportunity to win a major marathon in the US. There is little chance that he will be able to come close to his best marathon time in New York; unlike Berlin, the course, which undulates through the city's five boroughs, rarely produces fast performances.\"I know the course is tough and there are also no pacemakers in the race. It's more about concentration and a tactical race,\" said Bekele. He will be up against the Netherlands' Adbi Nageeye, who won the silver medal at the Olympics in Sapporo three months ago, and the half marathon world record holder Kibiwott Kandie of Kenya, who is making his marathon debut. The women's race is headlined by Kenya's Peres Jepchirchir, the gold medalist at the Olympics. She will be joined by the USA's Olympic bronze medalist Molly Seidel. Seidel poses with her bronze medal from the Tokyo Olympics. Visit CNN.com/sport for more news, videos and featuresAnother athlete to keep an eye on is US star Shalane Flanagan as she attempts to run six marathons in 42 days. Having already completed marathons in Berlin, London, Chicago, Boston and Portland -- all between times of 2:35:14 and 2:46:39 -- Flanagan will aim to complete her challenge in New York. And as for Bekele, like so many other runners who will take to the start line on Sunday, he's grateful to be able to race in New York following the pandemic and last year's cancellation.\"To see the people of New York participate in this race again -- to see this for me is fantastic,\" he said. 
\"I want to wish all participants good luck.\"\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Kenenisa Bekele\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Legendary distance runner and three-time Olympic gold medalist over 5,000 and 10,000 meters.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"New York City Marathon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Annual marathon in New York City, returning after a two-year hiatus due to the pandemic.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Eliud Kipchoge\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Marathon world record holder and dominant runner over the 26.2 miles of a marathon.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Berlin Marathon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Annual marathon in Berlin, where Kenenisa Bekele had a disappointing outing six weeks ago.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Guye Adola\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Winner of the Berlin Marathon, finishing more than a minute ahead of Kenenisa Bekele.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Adbi Nageeye\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Dutch runner who won the silver medal at the Olympics in Sapporo and will compete in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + 
"{\n", + "\n", + "\"entity_name\": \"Kibiwott Kandie\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Half marathon world record holder from Kenya, making his marathon debut in New York.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Peres Jepchirchir\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Kenyan runner and gold medalist at the Olympics, headlining the women's race in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Molly Seidel\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"American runner and Olympic bronze medalist, competing in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Shalane Flanagan\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"US star attempting to run six marathons in 42 days, including the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"New York City Marathon\",\n", + "\n", + "\"description\": \"Kenenisa Bekele is primed to compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Eliud Kipchoge\",\n", + "\n", + "\"description\": \"Kenenisa Bekele aims to make more history in his storied career, comparing himself to Eliud Kipchoge.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Berlin Marathon\",\n", + "\n", + "\"description\": \"Kenenisa Bekele had a 
disappointing outing at the Berlin Marathon six weeks ago.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Guye Adola\",\n", + "\n", + "\"description\": \"Kenenisa Bekele finished third in the Berlin Marathon, more than a minute behind winner Guye Adola.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Adbi Nageeye\",\n", + "\n", + "\"description\": \"Adbi Nageeye will compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Kibiwott Kandie\",\n", + "\n", + "\"description\": \"Kibiwott Kandie will make his marathon debut in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Peres Jepchirchir\",\n", + "\n", + "\"description\": \"Peres Jepchirchir will headline the women's race in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Molly Seidel\",\n", + "\n", + "\"description\": \"Molly Seidel will compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Shalane Flanagan\",\n", + "\n", + "\"description\": \"Shalane Flanagan will aim to complete her challenge of running six marathons in 42 days in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + 
"\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, identify and extract all entities within the text that match the specified entity types. After identifying the entities, determine the relationships between these entities based on their interactions and dependencies within the context of the text. Ensure that the extracted entities and relationships are formatted according to the specified JSON schema, including the fields `entity_name`, `entity_type`, `description`, `importance_score` for entities, and `src_id`, `tgt_id`, `description`, `weight`, and `order` for relationships. Provide a detailed step-by-step reasoning process to justify the extraction and relationship determination.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, identify and extract all entities within the text that match the specified entity types. After identifying the entities, determine the relationships between these entities based on their interactions and dependencies within the context of the text. Ensure that the extracted entities and relationships are formatted according to the specified JSON schema, including the fields `entity_name`, `entity_type`, `description`, `importance_score` for entities, and `src_id`, `tgt_id`, `description`, `weight`, and `order` for relationships. 
Provide a detailed step-by-step reasoning process to justify the extraction and relationship determination.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and their relationships from a given text. It works by first identifying entities in the text based on predefined entity types, then determining the relationships between these entities considering their interactions and the context provided in the text. The program ensures that each entity and relationship is described comprehensively and assigned appropriate importance scores and weights. The output is formatted according to a specified JSON schema, listing entities and their relationships in a structured manner.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "task_demos Input Text: (CNN)Former Bosnian Serb army leader Ratko Mladic was sentenced to life in prison Wednesday after being found guilty of genocide for atrocities committed during the Bosnian war from 1992 to 1995.Verdict proceedings had been interrupted earlier when the 74-year-old's legal team claimed that his blood pressure was too high to continue.After outbursts from Mladic, Judge Alphons Orie, who was delivering a summation of the case, ordered the removal of the ex-general, telling him he could monitor proceedings by audio and video. \"I am very distraught,\" Mladic shouted inside the courtroom. \"Everything that you have said is pure lies. Shame on you. 
It's all lies.\"Mladic's legal team had asked for proceedings to be halted or for the summation of the case to be skipped, which the Judge refused. Read MoreMladic was charged with two counts of genocide and nine crimes against humanity and war crimes for his role in the conflict in the former Yugoslavia from 1992 to 1995, during which 100,000 people were killed and another 2.2 million displaced. He was found not guilty on one charge of genocide, but received a guilty verdict on each of the other 10 counts. Mladic's lawyer, Dragan Ivetic, said it was \"certain\" Mladic would appeal.\"Butcher of Bosnia\" Ratko Mladic has been found guilty of the highest crimes against international law, says CNN's @camanpour, who covered the Bosnian war https://t.co/4A7DtVjHn8 pic.twitter.com/HdPKU3tqRD— CNN International (@cnni) November 22, 2017The trial, which opened in 2012, took place at the International Criminal Tribunal for the former Yugoslavia in The Hague, Netherlands. The ad hoc court was established to prosecute crimes committed during the Balkans conflict. Mladic was accused of orchestrating a campaign of ethnic cleansing, including the slaughter of thousands of Muslim men and boys at Srebrenica in July 1995. It is the worst massacre to have taken place in Europe since the Second World War.Mladic judgment brings back stench of Bosnian genocideProsecutor Serge Brammertz told reporters that Mladic will be remembered by history \"for the many communities and lives he destroyed.\"\"Today's judgment is a milestone in the tribunal's history and for international justice,\" he added.The trial of Mladic, who was arrested in 2011, has lasted 530 days and included more than 500 witnesses and nearly 10,000 exhibits. Before the case was adjourned last December, prosecutors recommended a life sentence. 
Mladic had previously referred to the court as \"satanic\" and labeled the charges against him as \"obnoxious.\"ReactionAt a center for the association of women victims of war in Sarajevo, there was an outpouring of emotion during the judge's summation.There was particular frustration that Mladic was acquitted on one charge of genocide in Bosnian municipalities outside of Srebrenica.Amela Meduseljac (L) and Meliha Mrdzic were unhappy that Mladic was acquitted on one charge.Meliha Mrdzic, who said her father and brother were killed and thrown into the Drina River in Visegrad, told CNN she was left humiliated by the decision.\"The international community made me a victim a second time,\" she said. \"They make it seem like we killed ourselves, raped ourselves, slaughtered ourselves. I feel so hurt, I can't describe it.\"Amela Meduseljac, who said she was raped by Mladic's soldiers at Visegrad, said that victims will struggle to get over the judgment.\"Our mission as a rape survivor association was to stop victims from feeling like victims,\" she said. 
\"But it's getting worse from year to year and it will get especially worse after this verdict.\"People in Srebrenica celebrate as they watch a live TV broadcast of the trial Wednesday.UN High Commissioner for Human Rights Zeid Ra'ad Al Hussein called Mladic the \"epitome of evil\" and labeled his conviction a \"momentous victory for justice.\" \"Mladic presided over some of the darkest crimes to occur in Europe since World War II, bringing terror, death and destruction to thousands of victims, and sorrow, tragedy and trauma to countless more,\" Zeid said in a statement.\"His conviction is a testament to the courage and determination of those victims and witnesses who never gave up hope that they would see him brought to justice.\"A woman writes in a book inside a traveling monument called \"Prijedor 92\" outside the tribunal in The Hague on Wednesday.In Serbia, the country's president, Aleksandar Vučić, urged his people to look forward to the future.\"Today is not a day for joy, nor for sorrow, but to see what kind of future we want,\" he told reporters. \"We all knew that the judgment would be like that. There is no one who did not know it in advance. My call to all citizens of Serbia is to start looking at the future today. \"Let's think about where and how our children will live. How and in what way will we preserve peace and stability in the region\"In a separate development, Serbia's Minister for Justice Nela Kuburović urged that Mladic be released to undergo medical treatment.Who is Ratko Mladic?The ex-general -- accused of being \"the Butcher of Bosnia\" -- was in command of the Bosnian Serb army that entered the town of Srebrenica in July 1995. 
In the days that followed, 8,000 Muslim men and boys were systematically slaughtered by troops under his leadership.JUST WATCHEDAmanpour meets the 'Butcher of Bosnia'ReplayMore Videos ...MUST WATCHAmanpour meets the 'Butcher of Bosnia' 04:05The late Bosnia peace negotiator Richard Holbrooke once described Mladic as \"one of those lethal combinations that history thrusts up occasionally -- a charismatic murderer.\"Read: Bosnia's future is tied to justiceMladic faced charges over his actions during the siege of Sarajevo, where his heavily armed forces cut the city off from the outside world. Serb forces pounded the city from higher ground each day, trapping Sarajevo's residents in the valley below. More than 10,000 people, mostly civilians, perished.JUST WATCHEDMarking 20 years since Srebrenica: A survivor's storyReplayMore Videos ...MUST WATCHMarking 20 years since Srebrenica: A survivor's story 04:26After the war ended in 1995, Mladic went on the run before being found 16 years later when police burst into the garden of a small house in northern Serbia.Though he was carrying two handguns, he surrendered without a fight. He was extradited for trial in the Netherlands.Srebrenica massacre: Two decades on, wounds still raw, graves still openIn 2011, a tribunal judge entered pleas of not guilty for Mladic after he refused to cooperate and was forcibly removed from the courtroom at the judge's order.Mladic's judgment day comes more than a year after Bosnian Serb political leader Radovan Karadzic was sentenced to 40 years in prison for his role in the 1990s conflict. Former Serbian President Slobodan Milosevic was arrested in 2001 but died before his trial could be completed.At a news conference following the verdict, Mladic's son Darko said that his father felt sorry for every victim of the conflict. \"General Mladic cannot accept responsibility for things he did not do,\" Darko Mladic said. 
He argued that what had happened in Srebrenica was \"legitimate.\"Melina Borcak and Lindsay Isaac in Sarajevo contributed to this report.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and the context provided in the text. 
We ensure that each entity and relationship is described comprehensively and assigned appropriate importance scores and weights.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ratko Mladic\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Former Bosnian Serb army leader found guilty of genocide and other crimes during the Bosnian war.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Bosnian war\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Conflict in the former Yugoslavia from 1992 to 1995 involving atrocities and genocide.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Srebrenica\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"Site of the worst massacre in Europe since World War II, where thousands of Muslim men and boys were slaughtered.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"International Criminal Tribunal for the former Yugoslavia\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"Ad hoc court established to prosecute crimes committed during the Balkans conflict.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Sarajevo\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"Capital of Bosnia and Herzegovina, heavily affected by the Bosnian war.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Drina River\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"River in Bosnia and Herzegovina where victims were thrown.\",\n", + "\n", + "\"importance_score\": 
0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Aleksandar Vučić\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"President of Serbia during the trial of Ratko Mladic.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Radovan Karadzic\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Bosnian Serb political leader sentenced for his role in the 1990s conflict.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Slobodan Milosevic\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Former Serbian President who died before his trial could be completed.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Bosnian war\",\n", + "\n", + "\"description\": \"Ratko Mladic was a key figure in the Bosnian war, leading the Bosnian Serb army.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Bosnian war\",\n", + "\n", + "\"tgt_id\": \"Srebrenica\",\n", + "\n", + "\"description\": \"The Bosnian war included the massacre at Srebrenica.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"International Criminal Tribunal for the former Yugoslavia\",\n", + "\n", + "\"description\": \"Ratko Mladic's trial took place at the International Criminal Tribunal for the former Yugoslavia.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Sarajevo\",\n", + "\n", + "\"description\": \"Ratko Mladic's forces besieged Sarajevo during the Bosnian 
war.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Drina River\",\n", + "\n", + "\"description\": \"Victims of Ratko Mladic's forces were thrown into the Drina River.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Aleksandar Vučić\",\n", + "\n", + "\"tgt_id\": \"Ratko Mladic\",\n", + "\n", + "\"description\": \"Aleksandar Vučić, as the President of Serbia, commented on the trial of Ratko Mladic.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Radovan Karadzic\",\n", + "\n", + "\"description\": \"Ratko Mladic and Radovan Karadzic were both key figures in the Bosnian Serb leadership during the war.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Slobodan Milosevic\",\n", + "\n", + "\"description\": \"Ratko Mladic and Slobodan Milosevic were both involved in the conflicts in the former Yugoslavia.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)Neymar surpassed Ronaldo and moved to second on Brazil's all-time goalscoring list with a hat-trick in a 4-2 win against Peru in World Cup qualifying.Two penalties and a last-minute tap-in took the Paris Saint-Germain forward's tally to 64 for his country, two ahead of Ronaldo and 13 behind Pele's all-time record of 77.Everton's Richarlison scored Brazil's other goal as the five-time world champion twice had to come from behind to secure the win.Brazil head coach Tite told reporters after the match that it was \"unfair\" to compare Neymar and Ronaldo.\"What I can say is 
Neymar has this unpredictability,\" he said. \"He is the bow and the arrow, he's a player who both makes and takes chances. And he gets better and better, and more mature.\"Read MoreNeymar celebrates after completing his hat-trick against Peru.The match had a number of controversial VAR moments, particularly the decision to award Neymar a second penalty late in the game with the score tied at 2-2, a decision which surprised even the Brazilian players.Peru's anger was compounded in stoppage time when Carlos Zambrano was shown a red card for an elbow on Richarlison. The Everton forward had escaped punishment for a similar incident earlier in the match that left Peru's Miguel Trauco with a bloody wound above his eye.The latest incidents added to a growing list of decisions that have left South American fans scratching their heads during this international break, the first time VAR has been used in CONMEBOL World Cup qualifiersBrazil's victory ensures it begins the grueling 18-month campaign with a 100% record to top the standings on six points along with Argentina, which secured an impressive 2-1 win in the altitude of La Paz, Bolivia earlier in the day.The top four teams will qualify automatically for Qatar 2022, with the fifth-placed team competing in a two-legged playoff against a country from another continent.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: 
Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and the context provided in the text. Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Neymar\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Brazilian footballer who surpassed Ronaldo and moved to second on Brazil's all-time goalscoring list with a hat-trick in a 4-2 win against Peru in World Cup qualifying.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ronaldo\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A former Brazilian footballer who was previously second on Brazil's all-time goalscoring list.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Pele\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A legendary Brazilian footballer who holds the all-time record for Brazil with 77 goals.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Brazil\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A country where the football match took place and where Neymar and Ronaldo are from.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Peru\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The opposing country in the football match against Brazil.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + 
"\"entity_name\": \"World Cup\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"An international football competition where the qualifying match between Brazil and Peru took place.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Neymar\",\n", + "\n", + "\"tgt_id\": \"Ronaldo\",\n", + "\n", + "\"description\": \"Neymar surpassed Ronaldo on Brazil's all-time goalscoring list.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Neymar\",\n", + "\n", + "\"tgt_id\": \"Pele\",\n", + "\n", + "\"description\": \"Neymar is 13 goals behind Pele's all-time record for Brazil.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Brazil\",\n", + "\n", + "\"tgt_id\": \"Peru\",\n", + "\n", + "\"description\": \"Brazil won a football match against Peru in World Cup qualifying.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + 
"MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. 
We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... ```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = 
dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, 
model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda 
x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, 
inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " 
signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + 
field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", + " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. 
\" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships 
if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and their relationships from a given text. It works by first identifying entities in the text based on predefined entity types, then determining the relationships between these entities considering their interactions and the context provided in the text. The program ensures that each entity and relationship is described comprehensively and assigned appropriate importance scores and weights. The output is formatted according to a specified JSON schema, listing entities and their relationships in a structured manner.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S):\n", + "Input Text: (CNN)Former Bosnian Serb army leader Ratko Mladic was sentenced to life in prison Wednesday after being found guilty of genocide for atrocities committed during the Bosnian war from 1992 to 1995.Verdict proceedings had been interrupted earlier when the 74-year-old's legal team claimed that his blood pressure was too high to continue.After outbursts from Mladic, Judge Alphons Orie, who was delivering a summation of the case, ordered the removal of the ex-general, telling him he could monitor proceedings by audio and video. \"I am very distraught,\" Mladic shouted inside the courtroom. 
\"Everything that you have said is pure lies. Shame on you. It's all lies.\"Mladic's legal team had asked for proceedings to be halted or for the summation of the case to be skipped, which the Judge refused. Read MoreMladic was charged with two counts of genocide and nine crimes against humanity and war crimes for his role in the conflict in the former Yugoslavia from 1992 to 1995, during which 100,000 people were killed and another 2.2 million displaced. He was found not guilty on one charge of genocide, but received a guilty verdict on each of the other 10 counts. Mladic's lawyer, Dragan Ivetic, said it was \"certain\" Mladic would appeal.\"Butcher of Bosnia\" Ratko Mladic has been found guilty of the highest crimes against international law, says CNN's @camanpour, who covered the Bosnian war https://t.co/4A7DtVjHn8 pic.twitter.com/HdPKU3tqRD— CNN International (@cnni) November 22, 2017The trial, which opened in 2012, took place at the International Criminal Tribunal for the former Yugoslavia in The Hague, Netherlands. The ad hoc court was established to prosecute crimes committed during the Balkans conflict. Mladic was accused of orchestrating a campaign of ethnic cleansing, including the slaughter of thousands of Muslim men and boys at Srebrenica in July 1995. It is the worst massacre to have taken place in Europe since the Second World War.Mladic judgment brings back stench of Bosnian genocideProsecutor Serge Brammertz told reporters that Mladic will be remembered by history \"for the many communities and lives he destroyed.\"\"Today's judgment is a milestone in the tribunal's history and for international justice,\" he added.The trial of Mladic, who was arrested in 2011, has lasted 530 days and included more than 500 witnesses and nearly 10,000 exhibits. Before the case was adjourned last December, prosecutors recommended a life sentence. 
Mladic had previously referred to the court as \"satanic\" and labeled the charges against him as \"obnoxious.\"ReactionAt a center for the association of women victims of war in Sarajevo, there was an outpouring of emotion during the judge's summation.There was particular frustration that Mladic was acquitted on one charge of genocide in Bosnian municipalities outside of Srebrenica.Amela Meduseljac (L) and Meliha Mrdzic were unhappy that Mladic was acquitted on one charge.Meliha Mrdzic, who said her father and brother were killed and thrown into the Drina River in Visegrad, told CNN she was left humiliated by the decision.\"The international community made me a victim a second time,\" she said. \"They make it seem like we killed ourselves, raped ourselves, slaughtered ourselves. I feel so hurt, I can't describe it.\"Amela Meduseljac, who said she was raped by Mladic's soldiers at Visegrad, said that victims will struggle to get over the judgment.\"Our mission as a rape survivor association was to stop victims from feeling like victims,\" she said. 
\"But it's getting worse from year to year and it will get especially worse after this verdict.\"People in Srebrenica celebrate as they watch a live TV broadcast of the trial Wednesday.UN High Commissioner for Human Rights Zeid Ra'ad Al Hussein called Mladic the \"epitome of evil\" and labeled his conviction a \"momentous victory for justice.\" \"Mladic presided over some of the darkest crimes to occur in Europe since World War II, bringing terror, death and destruction to thousands of victims, and sorrow, tragedy and trauma to countless more,\" Zeid said in a statement.\"His conviction is a testament to the courage and determination of those victims and witnesses who never gave up hope that they would see him brought to justice.\"A woman writes in a book inside a traveling monument called \"Prijedor 92\" outside the tribunal in The Hague on Wednesday.In Serbia, the country's president, Aleksandar Vučić, urged his people to look forward to the future.\"Today is not a day for joy, nor for sorrow, but to see what kind of future we want,\" he told reporters. \"We all knew that the judgment would be like that. There is no one who did not know it in advance. My call to all citizens of Serbia is to start looking at the future today. \"Let's think about where and how our children will live. How and in what way will we preserve peace and stability in the region\"In a separate development, Serbia's Minister for Justice Nela Kuburović urged that Mladic be released to undergo medical treatment.Who is Ratko Mladic?The ex-general -- accused of being \"the Butcher of Bosnia\" -- was in command of the Bosnian Serb army that entered the town of Srebrenica in July 1995. 
In the days that followed, 8,000 Muslim men and boys were systematically slaughtered by troops under his leadership.JUST WATCHEDAmanpour meets the 'Butcher of Bosnia'ReplayMore Videos ...MUST WATCHAmanpour meets the 'Butcher of Bosnia' 04:05The late Bosnia peace negotiator Richard Holbrooke once described Mladic as \"one of those lethal combinations that history thrusts up occasionally -- a charismatic murderer.\"Read: Bosnia's future is tied to justiceMladic faced charges over his actions during the siege of Sarajevo, where his heavily armed forces cut the city off from the outside world. Serb forces pounded the city from higher ground each day, trapping Sarajevo's residents in the valley below. More than 10,000 people, mostly civilians, perished.JUST WATCHEDMarking 20 years since Srebrenica: A survivor's storyReplayMore Videos ...MUST WATCHMarking 20 years since Srebrenica: A survivor's story 04:26After the war ended in 1995, Mladic went on the run before being found 16 years later when police burst into the garden of a small house in northern Serbia.Though he was carrying two handguns, he surrendered without a fight. He was extradited for trial in the Netherlands.Srebrenica massacre: Two decades on, wounds still raw, graves still openIn 2011, a tribunal judge entered pleas of not guilty for Mladic after he refused to cooperate and was forcibly removed from the courtroom at the judge's order.Mladic's judgment day comes more than a year after Bosnian Serb political leader Radovan Karadzic was sentenced to 40 years in prison for his role in the 1990s conflict. Former Serbian President Slobodan Milosevic was arrested in 2001 but died before his trial could be completed.At a news conference following the verdict, Mladic's son Darko said that his father felt sorry for every victim of the conflict. \"General Mladic cannot accept responsibility for things he did not do,\" Darko Mladic said. 
He argued that what had happened in Srebrenica was \"legitimate.\"Melina Borcak and Lindsay Isaac in Sarajevo contributed to this report.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and the context provided in the text. 
We ensure that each entity and relationship is described comprehensively and assigned appropriate importance scores and weights.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ratko Mladic\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Former Bosnian Serb army leader found guilty of genocide and other crimes during the Bosnian war.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Bosnian war\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Conflict in the former Yugoslavia from 1992 to 1995 involving atrocities and genocide.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Srebrenica\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"Site of the worst massacre in Europe since World War II, where thousands of Muslim men and boys were slaughtered.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"International Criminal Tribunal for the former Yugoslavia\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"Ad hoc court established to prosecute crimes committed during the Balkans conflict.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Sarajevo\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"Capital of Bosnia and Herzegovina, heavily affected by the Bosnian war.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Drina River\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"River in Bosnia and Herzegovina where victims were thrown.\",\n", + "\n", + "\"importance_score\": 
0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Aleksandar Vučić\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"President of Serbia during the trial of Ratko Mladic.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Radovan Karadzic\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Bosnian Serb political leader sentenced for his role in the 1990s conflict.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Slobodan Milosevic\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Former Serbian President who died before his trial could be completed.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Bosnian war\",\n", + "\n", + "\"description\": \"Ratko Mladic was a key figure in the Bosnian war, leading the Bosnian Serb army.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Bosnian war\",\n", + "\n", + "\"tgt_id\": \"Srebrenica\",\n", + "\n", + "\"description\": \"The Bosnian war included the massacre at Srebrenica.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"International Criminal Tribunal for the former Yugoslavia\",\n", + "\n", + "\"description\": \"Ratko Mladic's trial took place at the International Criminal Tribunal for the former Yugoslavia.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Sarajevo\",\n", + "\n", + "\"description\": \"Ratko Mladic's forces besieged Sarajevo during the Bosnian 
war.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Drina River\",\n", + "\n", + "\"description\": \"Victims of Ratko Mladic's forces were thrown into the Drina River.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Aleksandar Vučić\",\n", + "\n", + "\"tgt_id\": \"Ratko Mladic\",\n", + "\n", + "\"description\": \"Aleksandar Vučić, as the President of Serbia, commented on the trial of Ratko Mladic.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Radovan Karadzic\",\n", + "\n", + "\"description\": \"Ratko Mladic and Radovan Karadzic were both key figures in the Bosnian Serb leadership during the war.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Ratko Mladic\",\n", + "\n", + "\"tgt_id\": \"Slobodan Milosevic\",\n", + "\n", + "\"description\": \"Ratko Mladic and Slobodan Milosevic were both involved in the conflicts in the former Yugoslavia.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)Neymar surpassed Ronaldo and moved to second on Brazil's all-time goalscoring list with a hat-trick in a 4-2 win against Peru in World Cup qualifying.Two penalties and a last-minute tap-in took the Paris Saint-Germain forward's tally to 64 for his country, two ahead of Ronaldo and 13 behind Pele's all-time record of 77.Everton's Richarlison scored Brazil's other goal as the five-time world champion twice had to come from behind to secure the win.Brazil head coach Tite told reporters after the match that it was \"unfair\" to compare Neymar and Ronaldo.\"What I can say is 
Neymar has this unpredictability,\" he said. \"He is the bow and the arrow, he's a player who both makes and takes chances. And he gets better and better, and more mature.\"Read MoreNeymar celebrates after completing his hat-trick against Peru.The match had a number of controversial VAR moments, particularly the decision to award Neymar a second penalty late in the game with the score tied at 2-2, a decision which surprised even the Brazilian players.Peru's anger was compounded in stoppage time when Carlos Zambrano was shown a red card for an elbow on Richarlison. The Everton forward had escaped punishment for a similar incident earlier in the match that left Peru's Miguel Trauco with a bloody wound above his eye.The latest incidents added to a growing list of decisions that have left South American fans scratching their heads during this international break, the first time VAR has been used in CONMEBOL World Cup qualifiersBrazil's victory ensures it begins the grueling 18-month campaign with a 100% record to top the standings on six points along with Argentina, which secured an impressive 2-1 win in the altitude of La Paz, Bolivia earlier in the day.The top four teams will qualify automatically for Qatar 2022, with the fifth-placed team competing in a two-legged playoff against a country from another continent.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: 
Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and the context provided in the text. Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Neymar\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Brazilian footballer who surpassed Ronaldo and moved to second on Brazil's all-time goalscoring list with a hat-trick in a 4-2 win against Peru in World Cup qualifying.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ronaldo\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A former Brazilian footballer who was previously second on Brazil's all-time goalscoring list.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Pele\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A legendary Brazilian footballer who holds the all-time record for Brazil with 77 goals.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Brazil\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A country where the football match took place and where Neymar and Ronaldo are from.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Peru\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The opposing country in the football match against Brazil.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + 
"\"entity_name\": \"World Cup\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"An international football competition where the qualifying match between Brazil and Peru took place.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Neymar\",\n", + "\n", + "\"tgt_id\": \"Ronaldo\",\n", + "\n", + "\"description\": \"Neymar surpassed Ronaldo on Brazil's all-time goalscoring list.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Neymar\",\n", + "\n", + "\"tgt_id\": \"Pele\",\n", + "\n", + "\"description\": \"Neymar is 13 goals behind Pele's all-time record for Brazil.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Brazil\",\n", + "\n", + "\"tgt_id\": \"Peru\",\n", + "\n", + "\"description\": \"Brazil won a football match against Peru in World Cup qualifying.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and `entity_types`, meticulously identify and extract entities from the text. For each identified entity, provide a comprehensive description and assign an importance score. Subsequently, determine the relationships between these entities, considering their interactions and the context provided in the text. 
Each relationship should include a description and appropriate weight. Ensure that the output is formatted according to the specified JSON schema, listing entities and their relationships in a structured manner.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and `entity_types`, meticulously identify and extract entities from the text. For each identified entity, provide a comprehensive description and assign an importance score. Subsequently, determine the relationships between these entities, considering their interactions and the context provided in the text. Each relationship should include a description and appropriate weight. Ensure that the output is formatted according to the specified JSON schema, listing entities and their relationships in a structured manner.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and their relationships from a given text input. It works by first identifying entities in the text based on specified entity types. Then, it determines the relationships between these entities, considering their interactions and dependencies within the context of the text. Finally, it formats the entities and relationships according to a specified JSON schema. 
The program uses a language model to perform these tasks and includes mechanisms for error handling and retries to ensure accurate results.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "task_demos Input Text: (CNN)The New York City Marathon returns on Sunday following a two-year hiatus, and at the front of the field, legendary distance runner Kenenisa Bekele is primed to \"make more history\" in his storied career. Bekele, a three-time Olympic gold medalist over 5,000 and 10,000 meters, is the headline name competing in the men's elite field in New York. With his only previous marathon in the United States a fourth-place finish in Chicago seven years ago, the Ethiopian is hungry for success as he prepares to take to the streets of the Big Apple.\"Really, I wanted to have a good result in the USA, that's why I chose the New York marathon,\" Bekele told reporters this week.\"The New York marathon is big publicity and a really big marathon race. To achieve a good result in this marathon would be perfect, and for me, it's also really good to make more history in sports.\"Read MoreBekele has a point to prove after a disappointing outing six weeks ago at the Berlin Marathon -- the race at which he came within two seconds of Eliud Kipchoge's world record time two years ago. Bekele catches his breath after this year's Berlin Marathon. Despite expectations that he could challenge Kipchoge's record of two hours, one minute and 39 seconds in Berlin this year, Bekele says he fell foul to a poor night's sleep as he finished third, more than a minute behind winner Guye Adola.\"I was not ready for that race,\" Bekele explained. \"A few weeks before the race, I was not really confident. 
It's a little bit tough for me because the day before also I couldn't sleep well, I really had bad luck the day before in the night. That also made me very tired.\"I've recovered well ... At this moment, I'm really strong. I hope I'll achieve a good result on Sunday.\"For race organizers, who called off last year's New York City Marathon amid the pandemic, it is a boon to have a figure like Bekele on the start line on Sunday.On top of his three Olympic gold medals -- including a double in the 5,000 and 10,000 meters at Beijing 2008 -- and one silver medal, he also claimed five world athletics championship titles between 2003 to 2009. Bekele's 5,000m world record stood for 16 years before it was broken by Uganda's Joshua Cheptegei last year, while his 10,000m record stood for 15 before also being broken last year by Cheptegei. Eliud Kipchoge: Marathon world record holder has 'the qualities of an ascetic monk'Many consider Bekele to be the greatest male distance runner of all time, such has been his supremacy across multiple distances over the past two decades; others point towards Kenya's Kipchoge, who has been so dominant over the 26.2 miles of a marathon and is the first man to break the distance's two-hour barrier, albeit in unofficial conditions. \"I still feel that I am the best and better than anyone,\" Bekele told Sports Illustrated in August after it was announced that he would make his debut in New York. \"I think every athlete and others should think like that.\"Having struggled with injuries and form in recent years, Bekele now has a second opportunity to win a major marathon in the US. There is little chance that he will be able to come close to his best marathon time in New York; unlike Berlin, the course, which undulates through the city's five boroughs, rarely produces fast performances.\"I know the course is tough and there are also no pacemakers in the race. It's more about concentration and a tactical race,\" said Bekele. 
He will be up against the Netherlands' Adbi Nageeye, who won the silver medal at the Olympics in Sapporo three months ago, and the half marathon world record holder Kibiwott Kandie of Kenya, who is making his marathon debut. The women's race is headlined by Kenya's Peres Jepchirchir, the gold medalist at the Olympics. She will be joined by the USA's Olympic bronze medalist Molly Seidel. Seidel poses with her bronze medal from the Tokyo Olympics. Visit CNN.com/sport for more news, videos and featuresAnother athlete to keep an eye on is US star Shalane Flanagan as she attempts to run six marathons in 42 days. Having already completed marathons in Berlin, London, Chicago, Boston and Portland -- all between times of 2:35:14 and 2:46:39 -- Flanagan will aim to complete her challenge in New York. And as for Bekele, like so many other runners who will take to the start line on Sunday, he's grateful to be able to race in New York following the pandemic and last year's cancellation.\"To see the people of New York participate in this race again -- to see this for me is fantastic,\" he said. \"I want to wish all participants good luck.\"\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. 
Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Kenenisa Bekele\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Legendary distance runner and three-time Olympic gold medalist over 5,000 and 10,000 meters.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"New York City Marathon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Annual marathon in New York City, returning after a two-year hiatus due to the pandemic.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Eliud Kipchoge\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Marathon world record holder and dominant runner over the 26.2 miles of a marathon.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Berlin Marathon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Annual marathon in Berlin, where Kenenisa Bekele had a disappointing outing six weeks ago.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Guye Adola\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Winner of the Berlin Marathon, finishing more than a minute ahead of Kenenisa Bekele.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Adbi Nageeye\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Dutch runner who won the silver medal at the 
Olympics in Sapporo and will compete in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Kibiwott Kandie\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Half marathon world record holder from Kenya, making his marathon debut in New York.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Peres Jepchirchir\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Kenyan runner and gold medalist at the Olympics, headlining the women's race in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Molly Seidel\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"American runner and Olympic bronze medalist, competing in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Shalane Flanagan\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"US star attempting to run six marathons in 42 days, including the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"New York City Marathon\",\n", + "\n", + "\"description\": \"Kenenisa Bekele is primed to compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Eliud Kipchoge\",\n", + "\n", + "\"description\": \"Kenenisa Bekele aims to make more history in his storied career, comparing himself to Eliud Kipchoge.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", 
+ "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Berlin Marathon\",\n", + "\n", + "\"description\": \"Kenenisa Bekele had a disappointing outing at the Berlin Marathon six weeks ago.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Guye Adola\",\n", + "\n", + "\"description\": \"Kenenisa Bekele finished third in the Berlin Marathon, more than a minute behind winner Guye Adola.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Adbi Nageeye\",\n", + "\n", + "\"description\": \"Adbi Nageeye will compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Kibiwott Kandie\",\n", + "\n", + "\"description\": \"Kibiwott Kandie will make his marathon debut in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Peres Jepchirchir\",\n", + "\n", + "\"description\": \"Peres Jepchirchir will headline the women's race in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Molly Seidel\",\n", + "\n", + "\"description\": \"Molly Seidel will compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Shalane Flanagan\",\n", + "\n", + "\"description\": \"Shalane Flanagan will aim to 
complete her challenge of running six marathons in 42 days in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + "MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. 
Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... 
```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ 
that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. 
For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda 
x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. 
Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to 
the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", 
+ " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. \" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": 
clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and their relationships from a given text input. It works by first identifying entities in the text based on specified entity types. Then, it determines the relationships between these entities, considering their interactions and dependencies within the context of the text. Finally, it formats the entities and relationships according to a specified JSON schema. The program uses a language model to perform these tasks and includes mechanisms for error handling and retries to ensure accurate results.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S):\n", + "Input Text: (CNN)The New York City Marathon returns on Sunday following a two-year hiatus, and at the front of the field, legendary distance runner Kenenisa Bekele is primed to \"make more history\" in his storied career. 
Bekele, a three-time Olympic gold medalist over 5,000 and 10,000 meters, is the headline name competing in the men's elite field in New York. With his only previous marathon in the United States a fourth-place finish in Chicago seven years ago, the Ethiopian is hungry for success as he prepares to take to the streets of the Big Apple.\"Really, I wanted to have a good result in the USA, that's why I chose the New York marathon,\" Bekele told reporters this week.\"The New York marathon is big publicity and a really big marathon race. To achieve a good result in this marathon would be perfect, and for me, it's also really good to make more history in sports.\"Read MoreBekele has a point to prove after a disappointing outing six weeks ago at the Berlin Marathon -- the race at which he came within two seconds of Eliud Kipchoge's world record time two years ago. Bekele catches his breath after this year's Berlin Marathon. Despite expectations that he could challenge Kipchoge's record of two hours, one minute and 39 seconds in Berlin this year, Bekele says he fell foul to a poor night's sleep as he finished third, more than a minute behind winner Guye Adola.\"I was not ready for that race,\" Bekele explained. \"A few weeks before the race, I was not really confident. It's a little bit tough for me because the day before also I couldn't sleep well, I really had bad luck the day before in the night. That also made me very tired.\"I've recovered well ... At this moment, I'm really strong. I hope I'll achieve a good result on Sunday.\"For race organizers, who called off last year's New York City Marathon amid the pandemic, it is a boon to have a figure like Bekele on the start line on Sunday.On top of his three Olympic gold medals -- including a double in the 5,000 and 10,000 meters at Beijing 2008 -- and one silver medal, he also claimed five world athletics championship titles between 2003 to 2009. 
Bekele's 5,000m world record stood for 16 years before it was broken by Uganda's Joshua Cheptegei last year, while his 10,000m record stood for 15 before also being broken last year by Cheptegei. Eliud Kipchoge: Marathon world record holder has 'the qualities of an ascetic monk'Many consider Bekele to be the greatest male distance runner of all time, such has been his supremacy across multiple distances over the past two decades; others point towards Kenya's Kipchoge, who has been so dominant over the 26.2 miles of a marathon and is the first man to break the distance's two-hour barrier, albeit in unofficial conditions. \"I still feel that I am the best and better than anyone,\" Bekele told Sports Illustrated in August after it was announced that he would make his debut in New York. \"I think every athlete and others should think like that.\"Having struggled with injuries and form in recent years, Bekele now has a second opportunity to win a major marathon in the US. There is little chance that he will be able to come close to his best marathon time in New York; unlike Berlin, the course, which undulates through the city's five boroughs, rarely produces fast performances.\"I know the course is tough and there are also no pacemakers in the race. It's more about concentration and a tactical race,\" said Bekele. He will be up against the Netherlands' Adbi Nageeye, who won the silver medal at the Olympics in Sapporo three months ago, and the half marathon world record holder Kibiwott Kandie of Kenya, who is making his marathon debut. The women's race is headlined by Kenya's Peres Jepchirchir, the gold medalist at the Olympics. She will be joined by the USA's Olympic bronze medalist Molly Seidel. Seidel poses with her bronze medal from the Tokyo Olympics. Visit CNN.com/sport for more news, videos and featuresAnother athlete to keep an eye on is US star Shalane Flanagan as she attempts to run six marathons in 42 days. 
Having already completed marathons in Berlin, London, Chicago, Boston and Portland -- all between times of 2:35:14 and 2:46:39 -- Flanagan will aim to complete her challenge in New York. And as for Bekele, like so many other runners who will take to the start line on Sunday, he's grateful to be able to race in New York following the pandemic and last year's cancellation.\"To see the people of New York participate in this race again -- to see this for me is fantastic,\" he said. \"I want to wish all participants good luck.\"\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Kenenisa Bekele\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Legendary distance runner and three-time Olympic gold medalist over 5,000 and 10,000 meters.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"New York City Marathon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Annual marathon in New York City, returning after a two-year hiatus due to the pandemic.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Eliud Kipchoge\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Marathon world record holder and dominant runner over the 26.2 miles of a marathon.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Berlin Marathon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"Annual marathon in Berlin, where Kenenisa Bekele had a disappointing outing six weeks ago.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Guye Adola\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Winner of the Berlin Marathon, finishing more than a minute ahead of Kenenisa Bekele.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Adbi Nageeye\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Dutch runner who won the silver medal at the Olympics in Sapporo and will compete in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + 
"{\n", + "\n", + "\"entity_name\": \"Kibiwott Kandie\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Half marathon world record holder from Kenya, making his marathon debut in New York.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Peres Jepchirchir\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Kenyan runner and gold medalist at the Olympics, headlining the women's race in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Molly Seidel\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"American runner and Olympic bronze medalist, competing in the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Shalane Flanagan\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"US star attempting to run six marathons in 42 days, including the New York City Marathon.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"New York City Marathon\",\n", + "\n", + "\"description\": \"Kenenisa Bekele is primed to compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Eliud Kipchoge\",\n", + "\n", + "\"description\": \"Kenenisa Bekele aims to make more history in his storied career, comparing himself to Eliud Kipchoge.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Berlin Marathon\",\n", + "\n", + "\"description\": \"Kenenisa Bekele had a 
disappointing outing at the Berlin Marathon six weeks ago.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Kenenisa Bekele\",\n", + "\n", + "\"tgt_id\": \"Guye Adola\",\n", + "\n", + "\"description\": \"Kenenisa Bekele finished third in the Berlin Marathon, more than a minute behind winner Guye Adola.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Adbi Nageeye\",\n", + "\n", + "\"description\": \"Adbi Nageeye will compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Kibiwott Kandie\",\n", + "\n", + "\"description\": \"Kibiwott Kandie will make his marathon debut in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Peres Jepchirchir\",\n", + "\n", + "\"description\": \"Peres Jepchirchir will headline the women's race in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Molly Seidel\",\n", + "\n", + "\"description\": \"Molly Seidel will compete in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"New York City Marathon\",\n", + "\n", + "\"tgt_id\": \"Shalane Flanagan\",\n", + "\n", + "\"description\": \"Shalane Flanagan will aim to complete her challenge of running six marathons in 42 days in the New York City Marathon.\",\n", + "\n", + "\"weight\": 0.4,\n", + "\n", + 
"\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, meticulously extract entities and their relationships from the text. First, identify all entities in the text that match the provided entity types, ensuring each entity includes its name, type, a brief description, and an importance score. Next, determine the relationships between these entities by analyzing their interactions and dependencies within the context of the text. Each relationship should include the source entity ID, target entity ID, a description of the relationship, a weight indicating the strength of the relationship, and an order indicating the sequence of the relationship. Finally, format the entities and relationships according to the specified JSON schema, ensuring all fields are correctly populated and validated.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, meticulously extract entities and their relationships from the text. First, identify all entities in the text that match the provided entity types, ensuring each entity includes its name, type, a brief description, and an importance score. Next, determine the relationships between these entities by analyzing their interactions and dependencies within the context of the text. 
Each relationship should include the source entity ID, target entity ID, a description of the relationship, a weight indicating the strength of the relationship, and an order indicating the sequence of the relationship. Finally, format the entities and relationships according to the specified JSON schema, ensuring all fields are correctly populated and validated.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and their relationships from a given text. It uses a language model to identify entities based on specified types and then determines the relationships between these entities. The program works by first defining a signature that includes input fields for the text and entity types, and output fields for reasoning and the extracted entities and relationships. It then uses a TypedPredictor to enforce type annotations and handle potential errors, ensuring that the output matches the expected format. The program iteratively attempts to extract and validate the entities and relationships, providing explanations for errors and suggesting corrections if necessary. Finally, it returns the extracted entities and their relationships in a structured format.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "task_demos Input Text: (CNN)Roger Federer thinks the professional tennis circuit won't return for a while due to the coronavirus pandemic but, when the time does come, the Swiss superstar said he would find it difficult to play without fans. 
While European football's Bundesliga resurfaced last week behind closed doors and Spain's La Liga is set to resume the middle of next month, the last official word from tennis authorities essentially saw all action suspended through July. The hiatus began in March, with Federer already sidelined since he was recuperating from knee surgery. Sunday would have marked the first day of the French Open in its usual spot on the tennis calendar -- in March, though, it was rescheduled for September -- and another grand slam, Wimbledon in July, was called off. \"I'm not training at the moment because I don't see a reason for that to be honest,\" Federer told three-time French Open champion Gustavo Kuerten -- who is raising funds for coronavirus relief efforts in his native Brazil -- in a video interview reported by Tennis.com.Read More\"I am happy with my body now and I still believe that the return of the tour is a long way off,\" continued the 38-year-old. \"And I think it's important mentally to enjoy this break, having played so much tennis. \"When I'm getting towards returning and have a goal to train for, I think I will be super motivated.\"We should be sliding into @rolandgarros right now, thinking of our mates in Paris 👋 pic.twitter.com/0PLKryyIjj— #AusOpen (@AustralianOpen) May 24, 2020 Federer is arguably tennis' best supported player ever, and the prospect of competing without spectators doesn't appeal to him. \"Most of the time when we are training, there is no one,\" said the men's record 20-time grand slam champion. \"For us, of course, it is possible to play if there are no fans. But on the other hand, I really hope that the circuit can return as it normally is. \"And hold off till the time is appropriate, minimum a third of the stadium or half full. 
But, for me, completely empty when playing big tournaments is very difficult.\"Federer has been active on social media during the lockdown, sparking a public discussion on the merging of the men's and women's tours with a tweet last month and embarking on a funny Instagram Live with tennis rival Rafael Nadal.Nadal, unlike Federer, has started practicing, though only very recently. The Spaniard would have been favored to win a 20th major and tie Federer had the French Open been played as usual given he has collected a record 12 titles at Roland Garros. Here I am, the first pictures I am posting for you on court. This is my practice earlier today at @rnadalacademy #BackOnCourt #BabolatFamily 🎾👍🏻💪🏻😉 pic.twitter.com/x7tzgLj9pc— Rafa Nadal (@RafaelNadal) May 22, 2020 The next grand slam is scheduled to be late August's US Open in New York, with organizers expected to announce in June if it will go ahead. \n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We identify the entities in the text based on the provided entity types and then determine the relationships between these entities. We focus on key entities such as Roger Federer, Rafael Nadal, French Open, and Wimbledon, and their interactions and associations within the context of the text. 
We also consider the relationships between these entities and other relevant entities like the coronavirus pandemic and the tennis circuit.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Roger Federer\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Swiss professional tennis player, considered one of the greatest of all time.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Rafael Nadal\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Spanish professional tennis player, known for his dominance on clay courts.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"French Open\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A major tennis tournament held over two weeks between late May and early June.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Wimbledon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A major tennis tournament held at the All England Club in Wimbledon, London, every year.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"coronavirus pandemic\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A global outbreak of a novel coronavirus that has led to significant disruptions worldwide.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Roger Federer\",\n", + "\n", + "\"tgt_id\": \"French Open\",\n", + "\n", + "\"description\": \"Roger Federer discusses the impact of the coronavirus pandemic on the French Open's schedule.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", 
+ "\n", + "{\n", + "\n", + "\"src_id\": \"Roger Federer\",\n", + "\n", + "\"tgt_id\": \"Wimbledon\",\n", + "\n", + "\"description\": \"Roger Federer mentions the cancellation of Wimbledon due to the coronavirus pandemic.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Roger Federer\",\n", + "\n", + "\"tgt_id\": \"Rafael Nadal\",\n", + "\n", + "\"description\": \"Roger Federer and Rafael Nadal engage in social media interactions during the lockdown.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Rafael Nadal\",\n", + "\n", + "\"tgt_id\": \"French Open\",\n", + "\n", + "\"description\": \"Rafael Nadal is expected to compete in the French Open, a tournament he has won multiple times.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"coronavirus pandemic\",\n", + "\n", + "\"tgt_id\": \"French Open\",\n", + "\n", + "\"description\": \"The coronavirus pandemic has caused the French Open to be rescheduled.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"coronavirus pandemic\",\n", + "\n", + "\"tgt_id\": \"Wimbledon\",\n", + "\n", + "\"description\": \"The coronavirus pandemic has led to the cancellation of Wimbledon.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that 
will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + "MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. 
Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... 
```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ 
that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. 
For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda 
x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. 
Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to 
the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", 
+ " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. \" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": 
clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and their relationships from a given text. It uses a language model to identify entities based on specified types and then determines the relationships between these entities. The program works by first defining a signature that includes input fields for the text and entity types, and output fields for reasoning and the extracted entities and relationships. It then uses a TypedPredictor to enforce type annotations and handle potential errors, ensuring that the output matches the expected format. The program iteratively attempts to extract and validate the entities and relationships, providing explanations for errors and suggesting corrections if necessary. 
Finally, it returns the extracted entities and their relationships in a structured format.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S):\n", + "Input Text: (CNN)Roger Federer thinks the professional tennis circuit won't return for a while due to the coronavirus pandemic but, when the time does come, the Swiss superstar said he would find it difficult to play without fans. While European football's Bundesliga resurfaced last week behind closed doors and Spain's La Liga is set to resume the middle of next month, the last official word from tennis authorities essentially saw all action suspended through July. The hiatus began in March, with Federer already sidelined since he was recuperating from knee surgery. Sunday would have marked the first day of the French Open in its usual spot on the tennis calendar -- in March, though, it was rescheduled for September -- and another grand slam, Wimbledon in July, was called off. \"I'm not training at the moment because I don't see a reason for that to be honest,\" Federer told three-time French Open champion Gustavo Kuerten -- who is raising funds for coronavirus relief efforts in his native Brazil -- in a video interview reported by Tennis.com.Read More\"I am happy with my body now and I still believe that the return of the tour is a long way off,\" continued the 38-year-old. \"And I think it's important mentally to enjoy this break, having played so much tennis. \"When I'm getting towards returning and have a goal to train for, I think I will be super motivated.\"We should be sliding into @rolandgarros right now, thinking of our mates in Paris 👋 pic.twitter.com/0PLKryyIjj— #AusOpen (@AustralianOpen) May 24, 2020 Federer is arguably tennis' best supported player ever, and the prospect of competing without spectators doesn't appeal to him. \"Most of the time when we are training, there is no one,\" said the men's record 20-time grand slam champion. 
\"For us, of course, it is possible to play if there are no fans. But on the other hand, I really hope that the circuit can return as it normally is. \"And hold off till the time is appropriate, minimum a third of the stadium or half full. But, for me, completely empty when playing big tournaments is very difficult.\"Federer has been active on social media during the lockdown, sparking a public discussion on the merging of the men's and women's tours with a tweet last month and embarking on a funny Instagram Live with tennis rival Rafael Nadal.Nadal, unlike Federer, has started practicing, though only very recently. The Spaniard would have been favored to win a 20th major and tie Federer had the French Open been played as usual given he has collected a record 12 titles at Roland Garros. Here I am, the first pictures I am posting for you on court. This is my practice earlier today at @rnadalacademy #BackOnCourt #BabolatFamily 🎾👍🏻💪🏻😉 pic.twitter.com/x7tzgLj9pc— Rafa Nadal (@RafaelNadal) May 22, 2020 The next grand slam is scheduled to be late August's US Open in New York, with organizers expected to announce in June if it will go ahead. \n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. 
We identify the entities in the text based on the provided entity types and then determine the relationships between these entities. We focus on key entities such as Roger Federer, Rafael Nadal, French Open, and Wimbledon, and their interactions and associations within the context of the text. We also consider the relationships between these entities and other relevant entities like the coronavirus pandemic and the tennis circuit.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Roger Federer\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Swiss professional tennis player, considered one of the greatest of all time.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Rafael Nadal\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Spanish professional tennis player, known for his dominance on clay courts.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"French Open\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A major tennis tournament held over two weeks between late May and early June.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Wimbledon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A major tennis tournament held at the All England Club in Wimbledon, London, every year.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"coronavirus pandemic\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A global outbreak of a novel coronavirus that has led to significant disruptions worldwide.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + 
"\n", + "\"src_id\": \"Roger Federer\",\n", + "\n", + "\"tgt_id\": \"French Open\",\n", + "\n", + "\"description\": \"Roger Federer discusses the impact of the coronavirus pandemic on the French Open's schedule.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Roger Federer\",\n", + "\n", + "\"tgt_id\": \"Wimbledon\",\n", + "\n", + "\"description\": \"Roger Federer mentions the cancellation of Wimbledon due to the coronavirus pandemic.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Roger Federer\",\n", + "\n", + "\"tgt_id\": \"Rafael Nadal\",\n", + "\n", + "\"description\": \"Roger Federer and Rafael Nadal engage in social media interactions during the lockdown.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Rafael Nadal\",\n", + "\n", + "\"tgt_id\": \"French Open\",\n", + "\n", + "\"description\": \"Rafael Nadal is expected to compete in the French Open, a tournament he has won multiple times.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"coronavirus pandemic\",\n", + "\n", + "\"tgt_id\": \"French Open\",\n", + "\n", + "\"description\": \"The coronavirus pandemic has caused the French Open to be rescheduled.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"coronavirus pandemic\",\n", + "\n", + "\"tgt_id\": \"Wimbledon\",\n", + "\n", + "\"description\": \"The coronavirus pandemic has led to the cancellation of Wimbledon.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields 
`entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and `entity_types`, meticulously identify and extract entities from the text. For each identified entity, provide its `entity_name`, `entity_type`, a concise `description`, and an `importance_score`. Subsequently, determine and list the relationships between these entities, specifying the `src_id` (source entity), `tgt_id` (target entity), a `description` of the relationship, a `weight` indicating the strength of the relationship, and an `order` to indicate the sequence of relationships. Ensure that the relationships are coherent and directly derived from the context provided in the `input_text`.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and `entity_types`, meticulously identify and extract entities from the text. For each identified entity, provide its `entity_name`, `entity_type`, a concise `description`, and an `importance_score`. Subsequently, determine and list the relationships between these entities, specifying the `src_id` (source entity), `tgt_id` (target entity), a `description` of the relationship, a `weight` indicating the strength of the relationship, and an `order` to indicate the sequence of relationships. 
Ensure that the relationships are coherent and directly derived from the context provided in the `input_text`.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROGRAM DESCRIPTION: The program is designed to solve the task of entity and relationship extraction from a given text. It works by taking an input text and a list of entity types, then using a language model to identify entities within the text that match the provided types. After identifying the entities, the program determines the relationships between these entities based on their interactions and dependencies within the context of the text. The process involves generating reasoning steps to justify the extraction, and the final output is a list of entities with their types, descriptions, and importance scores, along with relationships between these entities, including descriptions, weights, and order. The program ensures type safety and validation through the use of Pydantic models and retries the extraction process up to a specified number of times if errors are encountered.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "task_demos Input Text: Story highlightsThe Man v Fat soccer league is exclusively for men with a body mass index of 30 or moreThe league helps players lose weight, but it also gives them a built-in support group (CNN)After reviewing restaurants for years, Andrew Shanahan became obese and was looking for a way to lose weight.But it wasn't easy to find a program tailored to men. Every time he tried joining a weight-loss group, he was the only man in the room. 
\"I wondered how many other men felt that there was something missing and who weren't getting the support that you desperately need when you're trying to lose weight,\" he said.So in 2014, with the help of a crowdfunding campaign, Shanahan launched his Man v Fat initiative to reach out to other men like him. The idea drew attention from celebrities such as chef Jamie Oliver, and Shanahan was encouraged to write a book on it the next year. Being based in England, a soccer league seemed like a natural outreach program. And Shanahan received nearly 1,000 applications for 80 spots in the inaugural league, which was created exclusively for men with a body mass index of 30 or more.Read MorePhotographer Simone PerolariMany of the men used to play soccer -- or as it's called in most of the world, football -- but their weight had become an obstacle in pursuing the sport they love.\"I saw people who want to have fun and slim down and believe in what they do,\" said Simone Perolari, who photographed the league and whose images can be seen in the gallery above. \"There is not a space for them in classic football, but some of them are really good players.\"Rob Bird, 40, read about the league in the Birmingham Mail. He played the sport until he was 25, and he calls the 15 years after that the \"fat years\" when he didn't feel comfortable playing. \"I've always loved football, but playing against 'normal-size' guys left me feeling like I couldn't compete as they were faster,\" Bird said. \"(This league) is a level playing field for bigger guys.\" Shanahan isn't sure what he was expecting to result from the league's first season, but the competitiveness of the players surprised him. \"For a long time they haven't had a level playing field to compete on, so they are all determined to give it everything they have,\" he said. 
\"Fortunately, even those teams who lose can see that the benefits of losing weight and getting healthy more than make up for not topping the league.\"Social mediaFollow @CNNPhotos on Twitter to join the conversation about photography.Shanahan watched bonds form between the men as they enjoyed their favorite sport and supported one another in their goals to get fit and healthy. After 14 weeks, the 80 team members collectively lost 1,727 pounds, with the average weight loss close to 22 pounds per player. Marcus Farnsworth lost more than 70 pounds and is enjoying the energy boost he's received in return. He hopes it will help with his knees and back as he continues to lose weight. But he also enjoyed the togetherness and support of the guys around him. \"It gives you good motivation if one person is feeling hungry or thinking about snacking,\" Farnsworth said. \"We can give advice and communicate through our WhatsApp group for support.\"Bird has dropped nearly 60 pounds and is now down to 210. His gout and leg ulcer have improved \"immeasurably.\"Being around other guys who struggled with weight fostered a great spirit amongst us of being in it together and sharing our knowledge and experience without any judging or snide comments.Rob Bird, a player in the Man v Fat soccer league\"I feel a million times better,\" Bird said. \"This has given me the confidence that if you are making the right choices, you will feel better for it. Being around other guys who struggled with weight fostered a great spirit amongst us of being in it together and sharing our knowledge and experience without any judging or snide comments.\"Shanahan is expanding the league across the rest of the United Kingdom beginning in July, and he hopes to start similar leagues in the United States and other countries. \"We saw a lot of health improvements, whether it was guys curing their sleep apnea, reversing their diabetes or lowering their blood pressure,\" Shanahan said. 
\"The other really notable thing was how much many of the players grew in confidence. \"I think it's a great thing to undertake a project with other people who desperately want the same thing. It gives you energy and impetus. Working with the guys has been one of the greatest privileges of my career. To share in their excitement and to witness their achievements has been amazing.\"Simone Perolari is an Italian photographer based in Paris. You can follow him on Facebook and Instagram.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. 
We then determine the relationships between these entities, considering their interactions and dependencies within the context of the text.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Man v Fat soccer league\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A soccer league exclusively for men with a body mass index of 30 or more, designed to help them lose weight and provide a support group.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Andrew Shanahan\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A man who became obese after reviewing restaurants for years and later founded the Man v Fat initiative.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Simone Perolari\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"An Italian photographer based in Paris who photographed the Man v Fat soccer league.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Rob Bird\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A 40-year-old man who played soccer until he was 25 and later joined the Man v Fat soccer league.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Marcus Farnsworth\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A participant in the Man v Fat soccer league who lost more than 70 pounds.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Jamie Oliver\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A celebrity chef who supported the Man v Fat initiative.\",\n", + "\n", 
+ "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"England\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The country where the Man v Fat soccer league was initially launched.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"CNN\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A news organization that reported on the Man v Fat soccer league.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Andrew Shanahan\",\n", + "\n", + "\"tgt_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"description\": \"Andrew Shanahan founded the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"Simone Perolari\",\n", + "\n", + "\"description\": \"Simone Perolari photographed the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"Rob Bird\",\n", + "\n", + "\"description\": \"Rob Bird joined the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"Marcus Farnsworth\",\n", + "\n", + "\"description\": \"Marcus Farnsworth participated in the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Andrew Shanahan\",\n", + "\n", + "\"tgt_id\": \"Jamie Oliver\",\n", + "\n", + "\"description\": \"Jamie Oliver supported the Man v Fat initiative founded by Andrew Shanahan.\",\n", + "\n", + "\"weight\": 
0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"England\",\n", + "\n", + "\"description\": \"The Man v Fat soccer league was initially launched in England.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"CNN\",\n", + "\n", + "\"description\": \"CNN reported on the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + "MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, 
covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... 
```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ 
that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. 
For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda 
x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. 
Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to 
the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", 
+ " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. \" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": 
clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of entity and relationship extraction from a given text. It works by taking an input text and a list of entity types, then using a language model to identify entities within the text that match the provided types. After identifying the entities, the program determines the relationships between these entities based on their interactions and dependencies within the context of the text. The process involves generating reasoning steps to justify the extraction, and the final output is a list of entities with their types, descriptions, and importance scores, along with relationships between these entities, including descriptions, weights, and order. 
The program ensures type safety and validation through the use of Pydantic models and retries the extraction process up to a specified number of times if errors are encountered.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S):\n", + "Input Text: Story highlightsThe Man v Fat soccer league is exclusively for men with a body mass index of 30 or moreThe league helps players lose weight, but it also gives them a built-in support group (CNN)After reviewing restaurants for years, Andrew Shanahan became obese and was looking for a way to lose weight.But it wasn't easy to find a program tailored to men. Every time he tried joining a weight-loss group, he was the only man in the room. \"I wondered how many other men felt that there was something missing and who weren't getting the support that you desperately need when you're trying to lose weight,\" he said.So in 2014, with the help of a crowdfunding campaign, Shanahan launched his Man v Fat initiative to reach out to other men like him. The idea drew attention from celebrities such as chef Jamie Oliver, and Shanahan was encouraged to write a book on it the next year. Being based in England, a soccer league seemed like a natural outreach program. And Shanahan received nearly 1,000 applications for 80 spots in the inaugural league, which was created exclusively for men with a body mass index of 30 or more.Read MorePhotographer Simone PerolariMany of the men used to play soccer -- or as it's called in most of the world, football -- but their weight had become an obstacle in pursuing the sport they love.\"I saw people who want to have fun and slim down and believe in what they do,\" said Simone Perolari, who photographed the league and whose images can be seen in the gallery above. \"There is not a space for them in classic football, but some of them are really good players.\"Rob Bird, 40, read about the league in the Birmingham Mail. 
He played the sport until he was 25, and he calls the 15 years after that the \"fat years\" when he didn't feel comfortable playing. \"I've always loved football, but playing against 'normal-size' guys left me feeling like I couldn't compete as they were faster,\" Bird said. \"(This league) is a level playing field for bigger guys.\" Shanahan isn't sure what he was expecting to result from the league's first season, but the competitiveness of the players surprised him. \"For a long time they haven't had a level playing field to compete on, so they are all determined to give it everything they have,\" he said. \"Fortunately, even those teams who lose can see that the benefits of losing weight and getting healthy more than make up for not topping the league.\"Social mediaFollow @CNNPhotos on Twitter to join the conversation about photography.Shanahan watched bonds form between the men as they enjoyed their favorite sport and supported one another in their goals to get fit and healthy. After 14 weeks, the 80 team members collectively lost 1,727 pounds, with the average weight loss close to 22 pounds per player. Marcus Farnsworth lost more than 70 pounds and is enjoying the energy boost he's received in return. He hopes it will help with his knees and back as he continues to lose weight. But he also enjoyed the togetherness and support of the guys around him. \"It gives you good motivation if one person is feeling hungry or thinking about snacking,\" Farnsworth said. \"We can give advice and communicate through our WhatsApp group for support.\"Bird has dropped nearly 60 pounds and is now down to 210. His gout and leg ulcer have improved \"immeasurably.\"Being around other guys who struggled with weight fostered a great spirit amongst us of being in it together and sharing our knowledge and experience without any judging or snide comments.Rob Bird, a player in the Man v Fat soccer league\"I feel a million times better,\" Bird said. 
\"This has given me the confidence that if you are making the right choices, you will feel better for it. Being around other guys who struggled with weight fostered a great spirit amongst us of being in it together and sharing our knowledge and experience without any judging or snide comments.\"Shanahan is expanding the league across the rest of the United Kingdom beginning in July, and he hopes to start similar leagues in the United States and other countries. \"We saw a lot of health improvements, whether it was guys curing their sleep apnea, reversing their diabetes or lowering their blood pressure,\" Shanahan said. \"The other really notable thing was how much many of the players grew in confidence. \"I think it's a great thing to undertake a project with other people who desperately want the same thing. It gives you energy and impetus. Working with the guys has been one of the greatest privileges of my career. To share in their excitement and to witness their achievements has been amazing.\"Simone Perolari is an Italian photographer based in Paris. You can follow him on Facebook and Instagram.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. 
We then determine the relationships between these entities, considering their interactions and dependencies within the context of the text.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Man v Fat soccer league\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A soccer league exclusively for men with a body mass index of 30 or more, designed to help them lose weight and provide a support group.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Andrew Shanahan\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A man who became obese after reviewing restaurants for years and later founded the Man v Fat initiative.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Simone Perolari\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"An Italian photographer based in Paris who photographed the Man v Fat soccer league.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Rob Bird\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A 40-year-old man who played soccer until he was 25 and later joined the Man v Fat soccer league.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Marcus Farnsworth\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A participant in the Man v Fat soccer league who lost more than 70 pounds.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Jamie Oliver\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A celebrity chef who supported the Man v Fat initiative.\",\n", + "\n", 
+ "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"England\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The country where the Man v Fat soccer league was initially launched.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"CNN\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A news organization that reported on the Man v Fat soccer league.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Andrew Shanahan\",\n", + "\n", + "\"tgt_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"description\": \"Andrew Shanahan founded the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"Simone Perolari\",\n", + "\n", + "\"description\": \"Simone Perolari photographed the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"Rob Bird\",\n", + "\n", + "\"description\": \"Rob Bird joined the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"Marcus Farnsworth\",\n", + "\n", + "\"description\": \"Marcus Farnsworth participated in the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Andrew Shanahan\",\n", + "\n", + "\"tgt_id\": \"Jamie Oliver\",\n", + "\n", + "\"description\": \"Jamie Oliver supported the Man v Fat initiative founded by Andrew Shanahan.\",\n", + "\n", + "\"weight\": 
0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"England\",\n", + "\n", + "\"description\": \"The Man v Fat soccer league was initially launched in England.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Man v Fat soccer league\",\n", + "\n", + "\"tgt_id\": \"CNN\",\n", + "\n", + "\"description\": \"CNN reported on the Man v Fat soccer league.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text`, identify all entities that match the provided `entity_types`. For each identified entity, provide its `entity_name`, `entity_type`, `description`, and `importance_score`. Then, determine the relationships between these entities, including the `src_id` (source entity), `tgt_id` (target entity), `description` of the relationship, `weight` of the relationship, and `order` of the relationship. Ensure that the relationships are based on interactions and dependencies within the context of the text.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text`, identify all entities that match the provided `entity_types`. For each identified entity, provide its `entity_name`, `entity_type`, `description`, and `importance_score`. 
Then, determine the relationships between these entities, including the `src_id` (source entity), `tgt_id` (target entity), `description` of the relationship, `weight` of the relationship, and `order` of the relationship. Ensure that the relationships are based on interactions and dependencies within the context of the text.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROGRAM DESCRIPTION: The program is designed to extract entities and relationships from a given text based on specified entity types. It uses a language model to perform this task, ensuring that the output adheres to strict type annotations and JSON schemas. The program follows a structured approach: it first identifies entities in the text according to the provided entity types, then determines the relationships between these entities based on their interactions and dependencies within the context of the text, and finally formats the entities and relationships according to a predefined JSON schema. The program includes error handling and retry mechanisms to ensure the output meets the required specifications.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "task_demos Input Text: London (CNN)Prince Philip, the husband of Queen Elizabeth II, is being treated for an infection and is not expected to leave the hospital for several days, according to an update from Buckingham Palace on Tuesday.\"The Duke of Edinburgh remains at King Edward VII's Hospital where he is receiving medical attention for an infection. 
He is comfortable and responding to treatment but is not expected to leave hospital for several days,\" the palace said in a statement.Prince Philip was taken to the hospital last Tuesday after \"feeling unwell,\" Buckingham Palace said.On Monday, his grandson, Prince William, said the 99-year-old was doing \"OK\" and hospital staff were \"keeping an eye on him.\"Queen Elizabeth and Duke of Edinburgh receive Covid-19 vaccinePrince William made the remarks during a visit to a vaccination center in Norfolk, England, on Monday.Read MoreOn Tuesday, Prince Philip's youngest son, Prince Edward, told the UK's Sky News that he recently spoke to his father, who is doing \"a lot better\" and is looking forward to returning home.Prince Edward said the family was still keeping their \"fingers crossed\" for his father.\"We've had some brilliant and lovely messages from all sorts of people. And we really appreciate that and so does he. I've been passing them on,\" said Prince Edward.Prince Philip, who turns 100 in June, stepped back from public life in 2017 and has been taken to the hospital several times in recent years. In December 2019, he received hospital treatment for a pre-existing condition.His latest admission was not an emergency, according to a royal source, who said he traveled to King Edward VII Hospital by car and walked in unaided. The source added that the illness was not Covid-19 related.Prince Philip and Queen Elizabeth have both received at least one dose of a Covid-19 vaccine.The couple spent most of the past year at Windsor Castle, having moved away from Buckingham Palace during the first wave of the Covid-19 pandemic in spring 2020. 
CNN's Katharina Krebs contributed to this report.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Prince Philip\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The husband of Queen Elizabeth II, currently being treated for an infection at King Edward VII's Hospital.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Queen Elizabeth II\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The wife of Prince Philip, who has received at least one dose of a Covid-19 vaccine.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"King Edward VII's Hospital\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The hospital where Prince Philip is receiving treatment for an infection.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Buckingham Palace\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The official residence of the British monarch, where updates on Prince Philip's condition are provided.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Prince William\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The grandson of Prince Philip, who visited a vaccination center in Norfolk and provided an update on his grandfather's condition.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Prince Edward\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The youngest son of Prince Philip, who provided an update on his father's condition to the UK's Sky 
News.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Covid-19\",\n", + "\n", + "\"entity_type\": \"DISEASE\",\n", + "\n", + "\"description\": \"The disease that both Prince Philip and Queen Elizabeth II have received vaccines for.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Windsor Castle\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The residence where Prince Philip and Queen Elizabeth II spent most of the past year during the Covid-19 pandemic.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince Philip\",\n", + "\n", + "\"tgt_id\": \"Queen Elizabeth II\",\n", + "\n", + "\"description\": \"Prince Philip is the husband of Queen Elizabeth II.\",\n", + "\n", + "\"weight\": 1.0,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince Philip\",\n", + "\n", + "\"tgt_id\": \"King Edward VII's Hospital\",\n", + "\n", + "\"description\": \"Prince Philip is being treated for an infection at King Edward VII's Hospital.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince Philip\",\n", + "\n", + "\"tgt_id\": \"Covid-19\",\n", + "\n", + "\"description\": \"Prince Philip has received at least one dose of a Covid-19 vaccine.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Queen Elizabeth II\",\n", + "\n", + "\"tgt_id\": \"Covid-19\",\n", + "\n", + "\"description\": \"Queen Elizabeth II has received at least one dose of a Covid-19 vaccine.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince William\",\n", + "\n", + "\"tgt_id\": \"Prince Philip\",\n", + "\n", + 
"\"description\": \"Prince William is the grandson of Prince Philip and provided an update on his condition.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 2\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince Edward\",\n", + "\n", + "\"tgt_id\": \"Prince Philip\",\n", + "\n", + "\"description\": \"Prince Edward is the youngest son of Prince Philip and provided an update on his condition.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 2\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + "MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed 
bodies. Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... 
```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ 
that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. 
For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda 
x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. 
Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to 
the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", 
+ " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. \" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": 
clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to extract entities and relationships from a given text based on specified entity types. It uses a language model to perform this task, ensuring that the output adheres to strict type annotations and JSON schemas. The program follows a structured approach: it first identifies entities in the text according to the provided entity types, then determines the relationships between these entities based on their interactions and dependencies within the context of the text, and finally formats the entities and relationships according to a predefined JSON schema. 
The program includes error handling and retry mechanisms to ensure the output meets the required specifications.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S):\n", + "Input Text: London (CNN)Prince Philip, the husband of Queen Elizabeth II, is being treated for an infection and is not expected to leave the hospital for several days, according to an update from Buckingham Palace on Tuesday.\"The Duke of Edinburgh remains at King Edward VII's Hospital where he is receiving medical attention for an infection. He is comfortable and responding to treatment but is not expected to leave hospital for several days,\" the palace said in a statement.Prince Philip was taken to the hospital last Tuesday after \"feeling unwell,\" Buckingham Palace said.On Monday, his grandson, Prince William, said the 99-year-old was doing \"OK\" and hospital staff were \"keeping an eye on him.\"Queen Elizabeth and Duke of Edinburgh receive Covid-19 vaccinePrince William made the remarks during a visit to a vaccination center in Norfolk, England, on Monday.Read MoreOn Tuesday, Prince Philip's youngest son, Prince Edward, told the UK's Sky News that he recently spoke to his father, who is doing \"a lot better\" and is looking forward to returning home.Prince Edward said the family was still keeping their \"fingers crossed\" for his father.\"We've had some brilliant and lovely messages from all sorts of people. And we really appreciate that and so does he. I've been passing them on,\" said Prince Edward.Prince Philip, who turns 100 in June, stepped back from public life in 2017 and has been taken to the hospital several times in recent years. In December 2019, he received hospital treatment for a pre-existing condition.His latest admission was not an emergency, according to a royal source, who said he traveled to King Edward VII Hospital by car and walked in unaided. 
The source added that the illness was not Covid-19 related.Prince Philip and Queen Elizabeth have both received at least one dose of a Covid-19 vaccine.The couple spent most of the past year at Windsor Castle, having moved away from Buckingham Palace during the first wave of the Covid-19 pandemic in spring 2020. CNN's Katharina Krebs contributed to this report.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Prince Philip\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The husband of Queen Elizabeth II, currently being treated for an infection at King Edward VII's Hospital.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Queen Elizabeth II\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The wife of Prince Philip, who has received at least one dose of a Covid-19 vaccine.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"King Edward VII's Hospital\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The hospital where Prince Philip is receiving treatment for an infection.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Buckingham Palace\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The official residence of the British monarch, where updates on Prince Philip's condition are provided.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Prince William\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The grandson of Prince Philip, who visited a vaccination center in Norfolk and provided an update on his grandfather's condition.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Prince Edward\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The youngest son of Prince Philip, who provided an update on his father's condition to the UK's Sky 
News.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Covid-19\",\n", + "\n", + "\"entity_type\": \"DISEASE\",\n", + "\n", + "\"description\": \"The disease that both Prince Philip and Queen Elizabeth II have received vaccines for.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Windsor Castle\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The residence where Prince Philip and Queen Elizabeth II spent most of the past year during the Covid-19 pandemic.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince Philip\",\n", + "\n", + "\"tgt_id\": \"Queen Elizabeth II\",\n", + "\n", + "\"description\": \"Prince Philip is the husband of Queen Elizabeth II.\",\n", + "\n", + "\"weight\": 1.0,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince Philip\",\n", + "\n", + "\"tgt_id\": \"King Edward VII's Hospital\",\n", + "\n", + "\"description\": \"Prince Philip is being treated for an infection at King Edward VII's Hospital.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince Philip\",\n", + "\n", + "\"tgt_id\": \"Covid-19\",\n", + "\n", + "\"description\": \"Prince Philip has received at least one dose of a Covid-19 vaccine.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Queen Elizabeth II\",\n", + "\n", + "\"tgt_id\": \"Covid-19\",\n", + "\n", + "\"description\": \"Queen Elizabeth II has received at least one dose of a Covid-19 vaccine.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince William\",\n", + "\n", + "\"tgt_id\": \"Prince Philip\",\n", + "\n", + 
"\"description\": \"Prince William is the grandson of Prince Philip and provided an update on his condition.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 2\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Prince Edward\",\n", + "\n", + "\"tgt_id\": \"Prince Philip\",\n", + "\n", + "\"description\": \"Prince Edward is the youngest son of Prince Philip and provided an update on his condition.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 2\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and `entity_types`, perform the following steps to produce the `entities_relationships`: 1) Identify all entities within the `input_text` that match the specified `entity_types`. 2) For each identified entity, provide a detailed description and an importance score based on its relevance to the text. 3) Determine the relationships between the identified entities, considering their interactions and dependencies within the context of the `input_text`. 4) For each relationship, provide a description, a weight indicating the strength of the relationship, and an order indicating the sequence of interactions. 
5) Format the entities and relationships according to the specified JSON schema, ensuring all fields adhere to the required types and structures.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and `entity_types`, perform the following steps to produce the `entities_relationships`: 1) Identify all entities within the `input_text` that match the specified `entity_types`. 2) For each identified entity, provide a detailed description and an importance score based on its relevance to the text. 3) Determine the relationships between the identified entities, considering their interactions and dependencies within the context of the `input_text`. 4) For each relationship, provide a description, a weight indicating the strength of the relationship, and an order indicating the sequence of interactions. 5) Format the entities and relationships according to the specified JSON schema, ensuring all fields adhere to the required types and structures.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROGRAM DESCRIPTION: The program is designed to solve the task of entity and relationship extraction from text. It works by first defining a signature that specifies the input fields (input_text and entity_types) and the output fields (reasoning and entities_relationships). The program then uses a TypedPredictor class to enforce type annotations in the signature and handle retries and error explanations if the output is invalid. The TypedEntityRelationshipExtractor class further processes the input text to extract entities and relationships, ensuring that the output is formatted according to the specified JSON schema. 
The program handles retries and errors by providing detailed explanations and examples to guide the language model in producing the correct output format.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "task_demos Input Text: (CNN)This weekend either Lewis Hamilton or Max Verstappen will be crowned the Formula One world champion. Either way, the sport won big in 2021. This year's success starts with a boom in the sport's fanbase that can be pinned to Netflix's \"Drive to Survive\" docuseries. The show is a crash course on the 10 teams, the 20 drivers, the rivalries between them and the race competition structure. A word-of-mouth campaign for the series along with short, easy viewing episodes made it a perfect pandemic-time binge watch for millions around the world.The series' intimate interviews combined with the best action from races and the high level drama is a strong pitch for the real-life spectacle of Formula One.The multifaceted storytelling lets fans focus on the aspects of the sport that most appeal to them. And most importantly, \"Drive to Survive\" primed fans for an easy transition to the track when the 2021 season started.Read MoreThe popularity of the Netflix program, notably in the US, has paid dividends. ESPN says its race viewership in 2021 is up 56% over 2020. There was record attendance at the US Grand Prix in Austin this October -- drivers also credit the docuseries for the sport's growth, including Hamilton himself. READ: Title rivals wary of being on collision course in Abu Dhabi showdownMax Verstappen and Lewis Hamilton have created a thrilling title race. Title race enthralls New fans are important but getting them to stick with the sport is critical. 
This year's constant action on the track anchored by the Hamilton-Verstappen title fight has satisfied the second part of that equation. Eyes across the globe will be on all 58 laps in Abu Dhabi on Sunday, the final race to determine who takes home the title. Up until the last lap, every second of the season has been and will be a nail biter.All the variables that brought the two drivers to tally the same exact score heading into the finale is what kept fans' attention throughout the season. Think Verstappen's car perched on top of Hamilton's after the crash in Monza or the smoke rising forebodingly from Hamilton's brakes seconds before the restart in Baku. Verstappen's car ended up on top of Hamilton's at the Italian Grand Prix in September.The best racing is when the two are at it, wheel-to-wheel, like they were through the opening laps of the British Grand Prix.Sprint Qualifying, a format introduced to three weekends this season was seen across the community as a controversial addition to the sport. Despite that, it provided something new for fans who welcomed the extra racing.F1 still has improvements to make on several fronts that would make the sport and viewing experience more inclusive.There is still an obvious need for greater diversity behind the wheel and in the institution itself. The league's first full-time driver from China, Guanyu Zhou, will be a much-welcomed addition to the track when he makes his debut next season. However, it is hard to ignore just how White and male dominated the sport is.JUST WATCHEDValtteri Bottas: F1 title race will go down to the wireReplayMore Videos ...MUST WATCHValtteri Bottas: F1 title race will go down to the wire 04:17The FIA, the governing body of the sport, also has some work to do to make rules more transparent and easier to follow. 
Commentators do their best to make sense of the calls being made by the stewards, but sometimes, even they are left scratching their heads -- driver and team penalties often seem totally arbitrary or ill-explained.Fan accessibility will remain a challenge. Part of that is built-in because F1 is a global sport with just over 20 events a year. Those constraints make attending a race weekend expensive. For American fans eager to make it to a race, there is some good news: a race in Miami next season means two races in the US for the first time since 1984.After the exciting culmination of this season comes the anticipation for the next, with new car designs and some changes in the driver lineup. Season four of \"Drive to Survive\" is imminent too. All of that adds up to a feeling that there is so much more to come from the sport. \n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the given entity types. We then determine the relationships between these entities, considering the context and interactions described in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Lewis Hamilton\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Formula One driver who is competing for the world championship.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Max Verstappen\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Formula One driver who is competing for the world championship.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Formula One\",\n", + "\n", + "\"entity_type\": \"SPORT\",\n", + "\n", + "\"description\": \"A motorsport that is the focus of the article, featuring drivers like Lewis Hamilton and Max Verstappen.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Netflix\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A streaming service that produced the 'Drive to Survive' docuseries, which has boosted the popularity of Formula One.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Drive to Survive\",\n", + "\n", + "\"entity_type\": \"TV_SHOW\",\n", + "\n", + "\"description\": \"A docuseries on Netflix that has increased the fanbase of Formula One.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"ESPN\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A sports network that reported a 56% increase in race viewership in 2021.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"US Grand Prix\",\n", + 
"\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A Formula One race event held in Austin, Texas, with record attendance in October.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Abu Dhabi\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The location of the final race in the Formula One season, which will determine the world champion.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Guanyu Zhou\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Formula One driver from China who will make his debut next season.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"FIA\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The governing body of Formula One, which needs to improve transparency in its rules.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Lewis Hamilton\",\n", + "\n", + "\"tgt_id\": \"Max Verstappen\",\n", + "\n", + "\"description\": \"Competing against each other for the Formula One world championship.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Drive to Survive\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"A TV show that has increased the popularity of Formula One.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Netflix\",\n", + "\n", + "\"tgt_id\": \"Drive to Survive\",\n", + "\n", + "\"description\": \"A streaming service that produced the 'Drive to Survive' docuseries.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": 
\"ESPN\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"A sports network that reported increased viewership of Formula One races.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"US Grand Prix\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"A Formula One race event with record attendance.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Abu Dhabi\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"The location of the final race in the Formula One season.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Guanyu Zhou\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"A new driver who will join Formula One next season.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"FIA\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"The governing body of Formula One that needs to improve its rules' transparency.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)In a five word tweet, Andy Murray made clear he had no sympathy for a fellow tennis player who had just been banned from tennis for doping.Follow @cnnsport\n", + "\n", + "After 29-year-old American Wayne Odesnik was handed a 15-year ban after testing positive for a range of performance enhancing drugs including anabolic steroids, the 2013 Wimbledon champion Murray tweeted: \"Bye bye Wayne... Good riddance.\"Bye bye Wayne... 
Good riddance— Andy Murray (@andy_murray) March 18, 2015\n", + "This isn't the first time Odesnik has been caught cheating -- he was initially given a two-year ban in 2010 when Australian customs officials discovered eight vials of human growth hormone in his luggage. As this is his second offense the International Tennis Federation increased his punishment to 15 years with Odesnik, who is ranked 267 in the world, subsequently announcing his retirement.Odesnik, who has made over $1m in prize money over his career, says the positive test results weren't his fault.Read MoreJUST WATCHEDThe power of tennis statisticsReplayMore Videos ...MUST WATCHThe power of tennis statistics 03:46\"In December 2014, I unknowingly ingested a contaminated over-the-counter supplement,\" Odesnik said in a statement.\"Upon learning of my positive test results I was immediately heartbroken as words could not describe my shock and disappointment,\" added the former world No. 77.\"Being the most tested American player on tour, I would never knowingly have taken any chance of consuming a banned substance.\"Fellow American tennis player Andy Roddick was also in no mood to forgive Odesnik, tweeting: \"I hate that he has a U.S. flag next to his name when he's cheating. Good riddance.\"According to the United States Anti-Doping Agency (USADA), Odesnik provided out-of-competition urine samples in December 2014 and January 2015 which tested positive.\"[It] was his second such violation, having been sanctioned in 2010 for the possession of human growth hormone. It was, therefore, determined that he is suspended from participation for a period of 15 years\" USADA said in a statement.\"It was also determined that Mr Odesnik's results at the Happy Valley Challenger event, Australian Open and Maui Challenger event should be disqualified, with forfeiture of the ranking points and prize money.\"Odesnik was born in Johannesburg but moved to the U.S. when he was three, before turning pro in 2004. 
Read: Thatch point! Roger Federer ribs Andy Murray's hairstyle on TwitterRead: The tournament that 'disgraced America'\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. We then determine the relationships between these entities, considering their interactions and the context provided in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Andy Murray\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player and the 2013 Wimbledon champion.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Wayne Odesnik\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A 29-year-old American tennis player who was banned for 15 years for doping.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"International Tennis Federation\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The governing body that increased Odesnik's punishment to 15 years.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"United States Anti-Doping Agency\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The agency that tested Odesnik's samples and confirmed his second doping violation.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Wimbledon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A prestigious tennis tournament where Murray won in 2013.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Andy Murray\",\n", + "\n", + "\"tgt_id\": \"Wayne Odesnik\",\n", + "\n", + "\"description\": \"Andy Murray expressed no sympathy for Wayne Odesnik after his doping ban.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Wayne Odesnik\",\n", + "\n", + "\"tgt_id\": 
\"International Tennis Federation\",\n", + "\n", + "\"description\": \"Wayne Odesnik was banned for 15 years by the International Tennis Federation.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Wayne Odesnik\",\n", + "\n", + "\"tgt_id\": \"United States Anti-Doping Agency\",\n", + "\n", + "\"description\": \"Wayne Odesnik's doping violation was confirmed by the United States Anti-Doping Agency.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: Zurich Switzerland has chosen Lockheed Martin's F-35A Lightning II as its next-generation fighter jet, the government said on Wednesday, angering opponents who have pledged a new referendum to overturn what they dubbed an unnecessary \"Ferrari\" option.The $5.5 billion deal adds a 15th nation to the world's largest weapons project -- a family of interconnected, single-engine jets to be used by the United States and its allies and partners.It came alongside a $2.1 billion agreement to buy the Patriot surface-to-air missile system from US group Raytheon, with European competitors losing out on both deals.The F-35 has faced multiple budget overruns, delays and technical setbacks, but is building export momentum. 
Critics say the project, valued at trillions of dollars over its lifetime, has seen costs soar while failing to meet goals on capability.Neutral Switzerland will buy 36 F-35As after an evaluation found it had \"the highest overall benefit at the lowest overall cost,\" the government said.Read MoreBiden admin intends to move forward with $23B UAE weapons salesThe aircraft beat bids from Boeing's F/A-18 Super Hornet, the Rafale from France's Dassault and the four-nation Eurofighter built by Germany- and Spain-backed Airbus, Italy's Leonardo and Britain's BAE Systems.The decision drew immediate criticism from anti-armaments campaigners and left-wing parties who will now launch a campaign for a referendum on the issue, the third Swiss vote on buying fighter jets.Voters seven years ago rejected the purchase of Gripen jets from Sweden's Saab, while the 6 billion Swiss franc ($6.5 billion) funding, which led to the decision to buy the F-35As, was only narrowly approved last year.Opponents say Switzerland doesn't need cutting-edge warplanes to defend its Alpine territory, which a supersonic jet can cross in 10 minutes.US warplanes fly first combat missions off foreign aircraft carrier since World War II\"The decision is simply incomprehensible,\" said Priska Seiler Graf, a member of Parliament for the left-leaning Social Democrats (SP), who has raised concerns about the cost.\"It's not just about buying them, but the upkeep and operating costs,\" she added. \"We should seek a European solution ... 
we don't want to be dependent on the United States.\"The government picked the Patriot missile system over Franco-Italian group Eurosam.Defense Minister Viola Amherd said the F-35As were chosen after being the most impressive performer in an evaluation and offered best value for money.Total costs of 15.5 billion francs ($16.7 billion) came in 2 billion francs cheaper than the next lowest bidder, the government said, based on buying and operating the aircraft over 30 years.\"We would not have bought a Ferrari if a VW would do and the Ferrari would be three times more expensive,\" Amherd told a news conference.The Swiss Parliament now has to approve the funding for the purchase, with the debate scheduled for early next year. It can debate costs and terms but not revisit the model selection.Dozens of US Air Force F-35 fighters taxi on the runway in preparation for a combat power exercise on Nov. 19, 2018, at Hill Air Force Base, Utah. The fighter decision was closely watched as the first of three face-offs ahead of Finland and Canada.Lockheed's stealthy fifth-generation fighter recently added Poland to its list of European customers which includes Belgium, Denmark, Italy, the Netherlands, Norway, and Britain.US President Joe Biden had lobbied for American companies when meeting his Swiss counterpart while in Geneva for his summit with Russian President Vladimir Putin this month.JUST WATCHEDWatch US F-35 jet fighters arrive in Europe (2017)ReplayMore Videos ...MUST WATCHWatch US F-35 jet fighters arrive in Europe (2017) 00:55Analysts said the decision to snub both the European fighter jet candidates and surface-to-air missile offering could be seen as a Swiss rebuff to the European Union in a time of strained relations between Bern and Brussels after the collapse of talks over a new agreement governing trade and other matters.By doubling down on US suppliers the government could antagonize the 49.8% of voters who opposed funding last year.South Korea rolls out the 
KF-21, joining elite group of global supersonic fighter jet makersAnti-arms campaigners say Switzerland, which last fought a foreign war more than 200 years ago and has no discernable enemies, does not need cutting-edge fighters.But supporters have said Switzerland needs to be able to protect itself without relying on others.Jonas Kampus, political secretary of the Group for a Switzerland without an Army, said he was confident of winning a referendum against the F-35As.The government \"can expect a heavy defeat in the vote. The follow-up polls in September (2020) showed a clear rejection of the F-35 among the voting population,\" he said.Marionna Schlatter, a lawmaker with the Greens Party said the September poll was too close to ignore opposition concerns.\"The people don't want a Ferrari in the air,\" she said.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies described in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Zurich Switzerland\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A city and country where the decision to buy F-35A Lightning II was made.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Lockheed Martin\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A company that manufactures the F-35A Lightning II.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"F-35A Lightning II\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A next-generation fighter jet chosen by Zurich Switzerland.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Patriot surface-to-air missile system\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A missile system bought alongside the F-35A Lightning II.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Raytheon\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A US group that manufactures the Patriot missile system.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Boeing\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A company that manufactures the F/A-18 Super Hornet.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"F/A-18 Super Hornet\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A fighter jet bid by 
Boeing.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Dassault\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A French company that manufactures the Rafale.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Rafale\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A fighter jet bid by Dassault.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Eurofighter\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A fighter jet built by Germany- and Spain-backed Airbus, Italy's Leonardo, and Britain's BAE Systems.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Airbus\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A company that is part of the consortium building the Eurofighter.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Leonardo\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"An Italian company that is part of the consortium building the Eurofighter.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"BAE Systems\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A British company that is part of the consortium building the Eurofighter.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Swiss Parliament\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The legislative body of Switzerland that will approve the funding for the F-35A purchase.\",\n", + "\n", + "\"importance_score\": 
0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Priska Seiler Graf\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A member of Parliament for the left-leaning Social Democrats who raised concerns about the cost of the F-35A.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Viola Amherd\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The Defense Minister of Switzerland who justified the choice of the F-35A.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Joe Biden\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The President of the United States who lobbied for American companies.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Vladimir Putin\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The President of Russia with whom Joe Biden had a summit.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Jonas Kampus\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The political secretary of the Group for a Switzerland without an Army who is confident of winning a referendum against the F-35A.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Marionna Schlatter\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A lawmaker with the Greens Party who expressed opposition to the F-35A.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"F-35A Lightning II\",\n", + "\n", + "\"description\": \"Zurich Switzerland has chosen the F-35A Lightning II 
as its next-generation fighter jet.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Patriot surface-to-air missile system\",\n", + "\n", + "\"description\": \"Zurich Switzerland has agreed to buy the Patriot surface-to-air missile system.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"F-35A Lightning II\",\n", + "\n", + "\"tgt_id\": \"Lockheed Martin\",\n", + "\n", + "\"description\": \"The F-35A Lightning II is manufactured by Lockheed Martin.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Patriot surface-to-air missile system\",\n", + "\n", + "\"tgt_id\": \"Raytheon\",\n", + "\n", + "\"description\": \"The Patriot surface-to-air missile system is manufactured by Raytheon.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Boeing\",\n", + "\n", + "\"description\": \"Zurich Switzerland considered the F/A-18 Super Hornet bid by Boeing.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Dassault\",\n", + "\n", + "\"description\": \"Zurich Switzerland considered the Rafale bid by Dassault.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Eurofighter\",\n", + "\n", + "\"description\": \"Zurich Switzerland considered the Eurofighter bid by Airbus, Leonardo, and BAE Systems.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich 
Switzerland\",\n", + "\n", + "\"tgt_id\": \"Swiss Parliament\",\n", + "\n", + "\"description\": \"The Swiss Parliament will approve the funding for the F-35A purchase.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Priska Seiler Graf\",\n", + "\n", + "\"description\": \"Priska Seiler Graf, a member of Parliament, raised concerns about the cost of the F-35A.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Viola Amherd\",\n", + "\n", + "\"description\": \"Viola Amherd, the Defense Minister, justified the choice of the F-35A.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Joe Biden\",\n", + "\n", + "\"description\": \"Joe Biden lobbied for American companies during his meeting with the Swiss counterpart.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Vladimir Putin\",\n", + "\n", + "\"description\": \"Joe Biden met with Vladimir Putin during his summit in Geneva.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Jonas Kampus\",\n", + "\n", + "\"description\": \"Jonas Kampus is confident of winning a referendum against the F-35A.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Marionna Schlatter\",\n", + "\n", + "\"description\": \"Marionna Schlatter expressed opposition to the F-35A.\",\n", + "\n", + 
"\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + "MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. 
Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... 
```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ 
that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. 
For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda 
x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. 
Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to 
the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", 
+ " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. \" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": 
clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of entity and relationship extraction from text. It works by first defining a signature that specifies the input fields (input_text and entity_types) and the output fields (reasoning and entities_relationships). The program then uses a TypedPredictor class to enforce type annotations in the signature and handle retries and error explanations if the output is invalid. The TypedEntityRelationshipExtractor class further processes the input text to extract entities and relationships, ensuring that the output is formatted according to the specified JSON schema. The program handles retries and errors by providing detailed explanations and examples to guide the language model in producing the correct output format.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S):\n", + "Input Text: (CNN)This weekend either Lewis Hamilton or Max Verstappen will be crowned the Formula One world champion. 
Either way, the sport won big in 2021. This year's success starts with a boom in the sport's fanbase that can be pinned to Netflix's \"Drive to Survive\" docuseries. The show is a crash course on the 10 teams, the 20 drivers, the rivalries between them and the race competition structure. A word-of-mouth campaign for the series along with short, easy viewing episodes made it a perfect pandemic-time binge watch for millions around the world.The series' intimate interviews combined with the best action from races and the high level drama is a strong pitch for the real-life spectacle of Formula One.The multifaceted storytelling lets fans focus on the aspects of the sport that most appeal to them. And most importantly, \"Drive to Survive\" primed fans for an easy transition to the track when the 2021 season started.Read MoreThe popularity of the Netflix program, notably in the US, has paid dividends. ESPN says its race viewership in 2021 is up 56% over 2020. There was record attendance at the US Grand Prix in Austin this October -- drivers also credit the docuseries for the sport's growth, including Hamilton himself. READ: Title rivals wary of being on collision course in Abu Dhabi showdownMax Verstappen and Lewis Hamilton have created a thrilling title race. Title race enthralls New fans are important but getting them to stick with the sport is critical. This year's constant action on the track anchored by the Hamilton-Verstappen title fight has satisfied the second part of that equation. Eyes across the globe will be on all 58 laps in Abu Dhabi on Sunday, the final race to determine who takes home the title. Up until the last lap, every second of the season has been and will be a nail biter.All the variables that brought the two drivers to tally the same exact score heading into the finale is what kept fans' attention throughout the season. 
Think Verstappen's car perched on top of Hamilton's after the crash in Monza or the smoke rising forebodingly from Hamilton's brakes seconds before the restart in Baku. Verstappen's car ended up on top of Hamilton's at the Italian Grand Prix in September.The best racing is when the two are at it, wheel-to-wheel, like they were through the opening laps of the British Grand Prix.Sprint Qualifying, a format introduced to three weekends this season was seen across the community as a controversial addition to the sport. Despite that, it provided something new for fans who welcomed the extra racing.F1 still has improvements to make on several fronts that would make the sport and viewing experience more inclusive.There is still an obvious need for greater diversity behind the wheel and in the institution itself. The league's first full-time driver from China, Guanyu Zhou, will be a much-welcomed addition to the track when he makes his debut next season. However, it is hard to ignore just how White and male dominated the sport is.JUST WATCHEDValtteri Bottas: F1 title race will go down to the wireReplayMore Videos ...MUST WATCHValtteri Bottas: F1 title race will go down to the wire 04:17The FIA, the governing body of the sport, also has some work to do to make rules more transparent and easier to follow. Commentators do their best to make sense of the calls being made by the stewards, but sometimes, even they are left scratching their heads -- driver and team penalties often seem totally arbitrary or ill-explained.Fan accessibility will remain a challenge. Part of that is built-in because F1 is a global sport with just over 20 events a year. Those constraints make attending a race weekend expensive. 
For American fans eager to make it to a race, there is some good news: a race in Miami next season means two races in the US for the first time since 1984.After the exciting culmination of this season comes the anticipation for the next, with new car designs and some changes in the driver lineup. Season four of \"Drive to Survive\" is imminent too. All of that adds up to a feeling that there is so much more to come from the sport. \n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the given entity types. We then determine the relationships between these entities, considering the context and interactions described in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Lewis Hamilton\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Formula One driver who is competing for the world championship.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Max Verstappen\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Formula One driver who is competing for the world championship.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Formula One\",\n", + "\n", + "\"entity_type\": \"SPORT\",\n", + "\n", + "\"description\": \"A motorsport that is the focus of the article, featuring drivers like Lewis Hamilton and Max Verstappen.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Netflix\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A streaming service that produced the 'Drive to Survive' docuseries, which has boosted the popularity of Formula One.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Drive to Survive\",\n", + "\n", + "\"entity_type\": \"TV_SHOW\",\n", + "\n", + "\"description\": \"A docuseries on Netflix that has increased the fanbase of Formula One.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"ESPN\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A sports network that reported a 56% increase in race viewership in 2021.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"US Grand Prix\",\n", + 
"\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A Formula One race event held in Austin, Texas, with record attendance in October.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Abu Dhabi\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The location of the final race in the Formula One season, which will determine the world champion.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Guanyu Zhou\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A Formula One driver from China who will make his debut next season.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"FIA\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The governing body of Formula One, which needs to improve transparency in its rules.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Lewis Hamilton\",\n", + "\n", + "\"tgt_id\": \"Max Verstappen\",\n", + "\n", + "\"description\": \"Competing against each other for the Formula One world championship.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Drive to Survive\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"A TV show that has increased the popularity of Formula One.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Netflix\",\n", + "\n", + "\"tgt_id\": \"Drive to Survive\",\n", + "\n", + "\"description\": \"A streaming service that produced the 'Drive to Survive' docuseries.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": 
\"ESPN\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"A sports network that reported increased viewership of Formula One races.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"US Grand Prix\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"A Formula One race event with record attendance.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Abu Dhabi\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"The location of the final race in the Formula One season.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Guanyu Zhou\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"A new driver who will join Formula One next season.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"FIA\",\n", + "\n", + "\"tgt_id\": \"Formula One\",\n", + "\n", + "\"description\": \"The governing body of Formula One that needs to improve its rules' transparency.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)In a five word tweet, Andy Murray made clear he had no sympathy for a fellow tennis player who had just been banned from tennis for doping.Follow @cnnsport\n", + "\n", + "After 29-year-old American Wayne Odesnik was handed a 15-year ban after testing positive for a range of performance enhancing drugs including anabolic steroids, the 2013 Wimbledon champion Murray tweeted: \"Bye bye Wayne... Good riddance.\"Bye bye Wayne... 
Good riddance— Andy Murray (@andy_murray) March 18, 2015\n", + "This isn't the first time Odesnik has been caught cheating -- he was initially given a two-year ban in 2010 when Australian customs officials discovered eight vials of human growth hormone in his luggage. As this is his second offense the International Tennis Federation increased his punishment to 15 years with Odesnik, who is ranked 267 in the world, subsequently announcing his retirement.Odesnik, who has made over $1m in prize money over his career, says the positive test results weren't his fault.Read MoreJUST WATCHEDThe power of tennis statisticsReplayMore Videos ...MUST WATCHThe power of tennis statistics 03:46\"In December 2014, I unknowingly ingested a contaminated over-the-counter supplement,\" Odesnik said in a statement.\"Upon learning of my positive test results I was immediately heartbroken as words could not describe my shock and disappointment,\" added the former world No. 77.\"Being the most tested American player on tour, I would never knowingly have taken any chance of consuming a banned substance.\"Fellow American tennis player Andy Roddick was also in no mood to forgive Odesnik, tweeting: \"I hate that he has a U.S. flag next to his name when he's cheating. Good riddance.\"According to the United States Anti-Doping Agency (USADA), Odesnik provided out-of-competition urine samples in December 2014 and January 2015 which tested positive.\"[It] was his second such violation, having been sanctioned in 2010 for the possession of human growth hormone. It was, therefore, determined that he is suspended from participation for a period of 15 years\" USADA said in a statement.\"It was also determined that Mr Odesnik's results at the Happy Valley Challenger event, Australian Open and Maui Challenger event should be disqualified, with forfeiture of the ranking points and prize money.\"Odesnik was born in Johannesburg but moved to the U.S. when he was three, before turning pro in 2004. 
Read: Thatch point! Roger Federer ribs Andy Murray's hairstyle on TwitterRead: The tournament that 'disgraced America'\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. We then determine the relationships between these entities, considering their interactions and the context provided in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Andy Murray\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professional tennis player and the 2013 Wimbledon champion.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Wayne Odesnik\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A 29-year-old American tennis player who was banned for 15 years for doping.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"International Tennis Federation\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The governing body that increased Odesnik's punishment to 15 years.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"United States Anti-Doping Agency\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The agency that tested Odesnik's samples and confirmed his second doping violation.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Wimbledon\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"A prestigious tennis tournament where Murray won in 2013.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Andy Murray\",\n", + "\n", + "\"tgt_id\": \"Wayne Odesnik\",\n", + "\n", + "\"description\": \"Andy Murray expressed no sympathy for Wayne Odesnik after his doping ban.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Wayne Odesnik\",\n", + "\n", + "\"tgt_id\": 
\"International Tennis Federation\",\n", + "\n", + "\"description\": \"Wayne Odesnik was banned for 15 years by the International Tennis Federation.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Wayne Odesnik\",\n", + "\n", + "\"tgt_id\": \"United States Anti-Doping Agency\",\n", + "\n", + "\"description\": \"Wayne Odesnik's doping violation was confirmed by the United States Anti-Doping Agency.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: Zurich Switzerland has chosen Lockheed Martin's F-35A Lightning II as its next-generation fighter jet, the government said on Wednesday, angering opponents who have pledged a new referendum to overturn what they dubbed an unnecessary \"Ferrari\" option.The $5.5 billion deal adds a 15th nation to the world's largest weapons project -- a family of interconnected, single-engine jets to be used by the United States and its allies and partners.It came alongside a $2.1 billion agreement to buy the Patriot surface-to-air missile system from US group Raytheon, with European competitors losing out on both deals.The F-35 has faced multiple budget overruns, delays and technical setbacks, but is building export momentum. 
Critics say the project, valued at trillions of dollars over its lifetime, has seen costs soar while failing to meet goals on capability.Neutral Switzerland will buy 36 F-35As after an evaluation found it had \"the highest overall benefit at the lowest overall cost,\" the government said.Read MoreBiden admin intends to move forward with $23B UAE weapons salesThe aircraft beat bids from Boeing's F/A-18 Super Hornet, the Rafale from France's Dassault and the four-nation Eurofighter built by Germany- and Spain-backed Airbus, Italy's Leonardo and Britain's BAE Systems.The decision drew immediate criticism from anti-armaments campaigners and left-wing parties who will now launch a campaign for a referendum on the issue, the third Swiss vote on buying fighter jets.Voters seven years ago rejected the purchase of Gripen jets from Sweden's Saab, while the 6 billion Swiss franc ($6.5 billion) funding, which led to the decision to buy the F-35As, was only narrowly approved last year.Opponents say Switzerland doesn't need cutting-edge warplanes to defend its Alpine territory, which a supersonic jet can cross in 10 minutes.US warplanes fly first combat missions off foreign aircraft carrier since World War II\"The decision is simply incomprehensible,\" said Priska Seiler Graf, a member of Parliament for the left-leaning Social Democrats (SP), who has raised concerns about the cost.\"It's not just about buying them, but the upkeep and operating costs,\" she added. \"We should seek a European solution ... 
we don't want to be dependent on the United States.\"The government picked the Patriot missile system over Franco-Italian group Eurosam.Defense Minister Viola Amherd said the F-35As were chosen after being the most impressive performer in an evaluation and offered best value for money.Total costs of 15.5 billion francs ($16.7 billion) came in 2 billion francs cheaper than the next lowest bidder, the government said, based on buying and operating the aircraft over 30 years.\"We would not have bought a Ferrari if a VW would do and the Ferrari would be three times more expensive,\" Amherd told a news conference.The Swiss Parliament now has to approve the funding for the purchase, with the debate scheduled for early next year. It can debate costs and terms but not revisit the model selection.Dozens of US Air Force F-35 fighters taxi on the runway in preparation for a combat power exercise on Nov. 19, 2018, at Hill Air Force Base, Utah. The fighter decision was closely watched as the first of three face-offs ahead of Finland and Canada.Lockheed's stealthy fifth-generation fighter recently added Poland to its list of European customers which includes Belgium, Denmark, Italy, the Netherlands, Norway, and Britain.US President Joe Biden had lobbied for American companies when meeting his Swiss counterpart while in Geneva for his summit with Russian President Vladimir Putin this month.JUST WATCHEDWatch US F-35 jet fighters arrive in Europe (2017)ReplayMore Videos ...MUST WATCHWatch US F-35 jet fighters arrive in Europe (2017) 00:55Analysts said the decision to snub both the European fighter jet candidates and surface-to-air missile offering could be seen as a Swiss rebuff to the European Union in a time of strained relations between Bern and Brussels after the collapse of talks over a new agreement governing trade and other matters.By doubling down on US suppliers the government could antagonize the 49.8% of voters who opposed funding last year.South Korea rolls out the 
KF-21, joining elite group of global supersonic fighter jet makersAnti-arms campaigners say Switzerland, which last fought a foreign war more than 200 years ago and has no discernable enemies, does not need cutting-edge fighters.But supporters have said Switzerland needs to be able to protect itself without relying on others.Jonas Kampus, political secretary of the Group for a Switzerland without an Army, said he was confident of winning a referendum against the F-35As.The government \"can expect a heavy defeat in the vote. The follow-up polls in September (2020) showed a clear rejection of the F-35 among the voting population,\" he said.Marionna Schlatter, a lawmaker with the Greens Party said the September poll was too close to ignore opposition concerns.\"The people don't want a Ferrari in the air,\" she said.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies described in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Zurich Switzerland\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A city and country where the decision to buy F-35A Lightning II was made.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Lockheed Martin\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A company that manufactures the F-35A Lightning II.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"F-35A Lightning II\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A next-generation fighter jet chosen by Zurich Switzerland.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Patriot surface-to-air missile system\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A missile system bought alongside the F-35A Lightning II.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Raytheon\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A US group that manufactures the Patriot missile system.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Boeing\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A company that manufactures the F/A-18 Super Hornet.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"F/A-18 Super Hornet\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A fighter jet bid by 
Boeing.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Dassault\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A French company that manufactures the Rafale.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Rafale\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A fighter jet bid by Dassault.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Eurofighter\",\n", + "\n", + "\"entity_type\": \"PRODUCT\",\n", + "\n", + "\"description\": \"A fighter jet built by Germany- and Spain-backed Airbus, Italy's Leonardo, and Britain's BAE Systems.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Airbus\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A company that is part of the consortium building the Eurofighter.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Leonardo\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"An Italian company that is part of the consortium building the Eurofighter.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"BAE Systems\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A British company that is part of the consortium building the Eurofighter.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Swiss Parliament\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The legislative body of Switzerland that will approve the funding for the F-35A purchase.\",\n", + "\n", + "\"importance_score\": 
0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Priska Seiler Graf\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A member of Parliament for the left-leaning Social Democrats who raised concerns about the cost of the F-35A.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Viola Amherd\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The Defense Minister of Switzerland who justified the choice of the F-35A.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Joe Biden\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The President of the United States who lobbied for American companies.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Vladimir Putin\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The President of Russia with whom Joe Biden had a summit.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Jonas Kampus\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The political secretary of the Group for a Switzerland without an Army who is confident of winning a referendum against the F-35A.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Marionna Schlatter\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A lawmaker with the Greens Party who expressed opposition to the F-35A.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"F-35A Lightning II\",\n", + "\n", + "\"description\": \"Zurich Switzerland has chosen the F-35A Lightning II 
as its next-generation fighter jet.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Patriot surface-to-air missile system\",\n", + "\n", + "\"description\": \"Zurich Switzerland has agreed to buy the Patriot surface-to-air missile system.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"F-35A Lightning II\",\n", + "\n", + "\"tgt_id\": \"Lockheed Martin\",\n", + "\n", + "\"description\": \"The F-35A Lightning II is manufactured by Lockheed Martin.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Patriot surface-to-air missile system\",\n", + "\n", + "\"tgt_id\": \"Raytheon\",\n", + "\n", + "\"description\": \"The Patriot surface-to-air missile system is manufactured by Raytheon.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Boeing\",\n", + "\n", + "\"description\": \"Zurich Switzerland considered the F/A-18 Super Hornet bid by Boeing.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Dassault\",\n", + "\n", + "\"description\": \"Zurich Switzerland considered the Rafale bid by Dassault.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Eurofighter\",\n", + "\n", + "\"description\": \"Zurich Switzerland considered the Eurofighter bid by Airbus, Leonardo, and BAE Systems.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich 
Switzerland\",\n", + "\n", + "\"tgt_id\": \"Swiss Parliament\",\n", + "\n", + "\"description\": \"The Swiss Parliament will approve the funding for the F-35A purchase.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Priska Seiler Graf\",\n", + "\n", + "\"description\": \"Priska Seiler Graf, a member of Parliament, raised concerns about the cost of the F-35A.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Viola Amherd\",\n", + "\n", + "\"description\": \"Viola Amherd, the Defense Minister, justified the choice of the F-35A.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Joe Biden\",\n", + "\n", + "\"description\": \"Joe Biden lobbied for American companies during his meeting with the Swiss counterpart.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Vladimir Putin\",\n", + "\n", + "\"description\": \"Joe Biden met with Vladimir Putin during his summit in Geneva.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Jonas Kampus\",\n", + "\n", + "\"description\": \"Jonas Kampus is confident of winning a referendum against the F-35A.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Zurich Switzerland\",\n", + "\n", + "\"tgt_id\": \"Marionna Schlatter\",\n", + "\n", + "\"description\": \"Marionna Schlatter expressed opposition to the F-35A.\",\n", + "\n", + 
"\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "\n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, meticulously identify and extract all entities present in the text that match the provided entity types. Subsequently, determine the relationships between these entities based on their interactions and the context provided. Ensure that the entities and relationships are formatted according to the specified JSON schema, including providing detailed descriptions and importance scores for each entity, as well as defining the source and target IDs, description, weight, and order for each relationship.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, meticulously identify and extract all entities present in the text that match the provided entity types. Subsequently, determine the relationships between these entities based on their interactions and the context provided. 
Ensure that the entities and relationships are formatted according to the specified JSON schema, including providing detailed descriptions and importance scores for each entity, as well as defining the source and target IDs, description, weight, and order for each relationship.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and relationships from text, specifically focusing on identifying entities based on predefined types and determining the relationships between these entities. It works by using a language model to process the input text and entity types, generating a step-by-step reasoning process to identify and relate the entities, and finally outputting the entities and their relationships in a structured JSON format. The program ensures type annotations are enforced and includes mechanisms for handling errors and retries to improve the accuracy of the output.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "task_demos Input Text: (CNN)A ban on cellphones in classrooms was announced in the Australian state of Victoria on Wednesday. The ban, which will apply to primary and secondary students, is designed to crack down on cyber-bulling and classroom distractions, the Victorian government said in a statement. While some schools in Australia have already banned cell-phone use, this marks the first statewide ban in the country. 
It will begin from the first school term of 2020.\"Students will be required to switch off their phones and store them securely in lockers from the start of the school day until the final bell. When emergencies occur, parents or guardians can reach their child by calling the school,\" the statement said.Victoria Education Minister James Merlino said he didn't expect the move to be \"universally popular\" but said it was the \"right thing to do\" on his official social media. Read MoreA report says young people are growing horns on their skulls. Critics don't buy itThe move isn't without precedent. Australia's largest state, New South Wales, banned mobile phones in primary schools in 2018, and France has banned all smart phones and smart devices from schools.Australian Education Minister Dan Tehan said in a statement that he supported the Victoria ban.\"When I talk to parents and teachers the overwhelming majority want action on banning mobile phones in the classroom,\" Tehan said in a statement. In a 2019 survey of about 2,000 Australian adults, by Monash University professor Neil Selywn, almost 80% supported a restriction on cell-phone use in schools, while one-third supported an outright ban.Ann Marie, a secondary school principal at a college in Victoria, told ABC Radio Melbourne that pupils at her school had agreed to restrict their mobile phone use during the day, explaining that they hoped to reduce \"bullying, distraction and one-upmanship.\"But there have been some concerns. The Australian Education Union Victoria said in a statement that while it supports the ban: \"We can't lose sight of the benefits that technology, including mobile phones, can bring to high quality teaching and learning.\"Some pupils with special needs will be exempt from the ban, including those who require telephones to monitor health conditions or those who have received permission from a teacher. 
\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering the context and descriptions provided in the text. Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Victoria\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The Australian state where the cellphone ban in classrooms was announced.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"James Merlino\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The Victoria Education Minister who announced the cellphone ban.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"New South Wales\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"Australia's largest state that previously banned mobile phones in primary schools.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + 
"\"entity_name\": \"France\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A country that has banned all smart phones and smart devices from schools.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Dan Tehan\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The Australian Education Minister who supported the Victoria ban.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Monash University\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The university where a survey on cell-phone use in schools was conducted.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Neil Selwyn\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A professor at Monash University who conducted the survey on cell-phone use in schools.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ann Marie\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A secondary school principal in Victoria who discussed the impact of the cellphone ban.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Australian Education Union Victoria\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"An organization that supports the cellphone ban but highlights the benefits of technology in education.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"James Merlino\",\n", + "\n", + "\"description\": \"James Merlino, the Victoria Education Minister, announced the cellphone ban in Victoria.\",\n", + "\n", + "\"weight\": 
0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"New South Wales\",\n", + "\n", + "\"description\": \"The cellphone ban in Victoria follows a similar ban in New South Wales.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"France\",\n", + "\n", + "\"description\": \"The cellphone ban in Victoria is similar to a ban in France.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Dan Tehan\",\n", + "\n", + "\"description\": \"Dan Tehan, the Australian Education Minister, supported the Victoria ban.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Monash University\",\n", + "\n", + "\"description\": \"A survey conducted at Monash University showed support for the Victoria ban.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Neil Selwyn\",\n", + "\n", + "\"description\": \"Neil Selwyn, a professor at Monash University, conducted the survey on cell-phone use in schools.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Ann Marie\",\n", + "\n", + "\"description\": \"Ann Marie, a secondary school principal in Victoria, discussed the impact of the cellphone ban.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Australian Education Union Victoria\",\n", + "\n", + "\"description\": \"The 
Australian Education Union Victoria supports the cellphone ban but highlights the benefits of technology in education.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: Manchester, England (CNN)Boris Johnson wants to keep his Brexit plan private. But the Prime Minister isn't having much luck these days. On Monday night, Irish broadcaster RTE reported what it claimed were details of part of Johnson's plan to break the Brexit deadlock. According to RTE, the UK's plan to avoid post-Brexit infrastructure on the frontier between Northern Ireland and the Republic of Ireland involve \"customs clearance centers\" several miles from the border -- and on both sides of it. The women of Boris Johnson's past are starting to catch up with himThat, it hopes, will remove the need for something called the Irish border backstop from any final Brexit deal. The backstop is an emergency measure which would keep the two Irish nations in regulatory alignment, removing the need for any border or \"clearance centers\" at all. It's deeply unpopular with many in Johnson's Conservative party and is one of the main reasons his predecessor, Theresa May, failed to get her Withdrawal Agreement through Parliament.Ireland quickly dismissed the idea as a \"non starter\". And Johnson himself told the BBC on Tuesday, \"that's not what we're proposing at all... you'll forgive me, I would like to veil our proposals in decent obscurity until we've been able to share them properly with our friends\". Read MoreSo why is this story a big deal? Keeping the government's formal proposals from Brussels for the time being might be wise. The crunch EU summit, at which any Brexit deal will be struck, is now under three weeks away. 
JUST WATCHEDBoris Johnson tells opponents to hold no-confidence voteReplayMore Videos ...MUST WATCHBoris Johnson tells opponents to hold no-confidence vote 02:41The EU is a notoriously tough institution to negotiate with and has a history of tearing apart any Brexit proposal from the moment it leaves the door. Johnson has repeatedly said that the only way to make the EU budge on ditching the backstop is to show that he is both deadly serious about getting a deal but also prepared to leave without one. Some of the ideas reported by RTE chime with speculation about the UK government's plans in recent weeks. And anyone who has followed Brexit for the last three years will tell you, the EU and Ireland will refuse to sign a deal that involves infrastructure between Ireland and Northern Ireland. So, even though the Prime Minister has distanced himself from the proposals, these headlines will have done little to reassure Brussels that Johnson and his government are serious about getting a deal. It's also a bad look for Johnson back home. He is currently out of London and at his Conservative party's annual conference in Manchester, in the north of England. Johnson will close the conference with a keynote speech on Wednesday afternoon. British politics has been taken over by trollsIt's widely thought that he wanted to trail his final Brexit plan during this speech before formally submitting something to Brussels in the subsequent days. The problem with the leak of any Brexit plan now is that it increases the risk of Johnson being greeted by a hostile audience at his conference on Wednesday. Despite being the Brexit hero of 2016, Conservative hard-Brexiteers are deeply worried that Johnson, if he succeeds in getting rid of the backstop, will simply try and push through the rest May's Brexit deal.For Brexiteers, this would be a huge sellout. There are people sitting in Johnson's cabinet who resigned from Theresa May's government to vote against her Brexit deal. 
And Johnson himself resigned as May's Foreign Secretary over her general approach to Brexit. Up here in Manchester, the mood among party members is generally positive. Johnson has always been a popular Conservative MP and he seems to have broad support for his hard Brexit language. That could turn sour if he's seen to make concessions to Brussels. Which would all be somewhat ironic, for a man who has spent the past few days accusing his opponents of surrendering. \n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. We then determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Boris Johnson\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The Prime Minister of the United Kingdom who is leading the Brexit negotiations.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Brexit\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"The process of the United Kingdom withdrawing from the European Union.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"European Union\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A political and economic union of 27 European countries.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Irish border backstop\",\n", + "\n", + "\"entity_type\": \"LAW\",\n", + "\n", + "\"description\": \"An emergency measure to keep the two Irish nations in regulatory alignment, removing the need for any border or 'clearance centers'.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Theresa May\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Former Prime Minister of the United Kingdom and predecessor of Boris Johnson.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Conservative party\",\n", + "\n", + "\"entity_type\": \"POLITICAL_PARTY\",\n", + "\n", + "\"description\": \"The political party in the United Kingdom to which Boris Johnson and Theresa May belong.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": 
\"RTE\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"Irish broadcaster that reported details of Johnson's Brexit plan.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Boris Johnson\",\n", + "\n", + "\"tgt_id\": \"Brexit\",\n", + "\n", + "\"description\": \"Boris Johnson is leading the Brexit negotiations as the Prime Minister of the United Kingdom.\",\n", + "\n", + "\"weight\": 1.0,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Brexit\",\n", + "\n", + "\"tgt_id\": \"European Union\",\n", + "\n", + "\"description\": \"Brexit involves the United Kingdom withdrawing from the European Union.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Boris Johnson\",\n", + "\n", + "\"tgt_id\": \"Irish border backstop\",\n", + "\n", + "\"description\": \"Boris Johnson aims to remove the Irish border backstop from any final Brexit deal.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Theresa May\",\n", + "\n", + "\"tgt_id\": \"Brexit\",\n", + "\n", + "\"description\": \"Theresa May failed to get her Withdrawal Agreement through Parliament due to the Irish border backstop.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Boris Johnson\",\n", + "\n", + "\"tgt_id\": \"Conservative party\",\n", + "\n", + "\"description\": \"Boris Johnson is a member of the Conservative party and its leader.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"RTE\",\n", + "\n", + "\"tgt_id\": \"Boris Johnson\",\n", + "\n", + "\"description\": \"RTE reported details of part of Johnson's plan to break the Brexit deadlock.\",\n", + "\n", + "\"weight\": 
0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)Across the sports world, athletes took a stand for Black Lives Matter on Wednesday night by holding an unprecedented wildcat strike and refusing to play their regularly scheduled games.The decision started with the NBA's Milwaukee Bucks and stemmed from the police shooting of Jacob Blake in Kenosha, Wisconsin, on Sunday. The stand quickly spread across a number of sports -- from basketball courts in Florida to baseball diamonds in California to soccer fields in places like Miami and Salt Lake City and even into the broadcast booth, where TNT's \"Inside the NBA\" analyst Kenny Smith took off his mic and walked off stage.But the issue is bigger than Jacob Blake or George Floyd or Breonna Taylor. Malcolm Jenkins of the NFL's New Orleans Saints positioned the protests as a response to the systemic racism baked into America, including police violence, cycles of incarceration, and the plunder of generational wealth.\"Until we continue to demand it, until 'Black Lives Matter' goes from just an idea or a goal that we're trying to attain as a society and is actually realized in the streets, we won't see any peace,\" he said. 
\"And I think we'll continue to see athletes, entertainers as well as citizens disrupt the status quo until that's recognized.\"Here's a look at the teams and athletes who pulled back in order to have their voices be heard.Read MoreNBA Photos: The sports world has been taking a standMembers of the Los Angeles Lakers and the Miami Heat kneel during the National Anthem before Game 1 of the NBA Finals on Wednesday, September 30.Hide Caption 1 of 42 Photos: The sports world has been taking a standThe Philadelphia Eagles and the Washington Football Team stand together before their NFL season opener on September 13.Hide Caption 2 of 42 Photos: The sports world has been taking a standTyler Wright, a two-time World Surf League Champion, takes a knee before competing in an event in Tweed Heads South, Australia, on September 13.Hide Caption 3 of 42 Photos: The sports world has been taking a standTennis star Naomi Osaka wears a face mask with Tamir Rice's name before winning the US Open final on September 12. Osaka wore a different name for each of her seven matches. 
Rice, a 12-year-old boy, was killed by police gunfire in Cleveland while he was holding a toy replica pistol in 2014.Hide Caption 4 of 42 Photos: The sports world has been taking a standChelsea's Pernille Harder walks out for a warm-up before a soccer match in Leigh, England, on September 6.Hide Caption 5 of 42 Photos: The sports world has been taking a standThe University of Alabama football team, led by coach Nick Saban, marches on campus in support of the Black Lives Matter movement.Hide Caption 6 of 42 Photos: The sports world has been taking a standBlack Lives Matter signage is seen in New York's Louis Armstrong Stadium as Cori Gauff plays against Anastasija Sevastova at the US Open.Hide Caption 7 of 42 Photos: The sports world has been taking a standMembers of the WNBA's Indiana Fever wear Black Lives Matter shirts before their game against the Chicago Sky on August 31.Hide Caption 8 of 42 Photos: The sports world has been taking a standMiami Marlins outfielder Lewis Brinson walks off the field August 27 after placing a Black Lives Matter T-shirt on home plate in New York. The Marlins and New York Mets walked off the field after a moment of silence, choosing not to play their scheduled baseball game. 
Hide Caption 9 of 42 Photos: The sports world has been taking a standNew Orleans Saints wide receiver Emmanuel Sanders wears Jacob Blake's name on his helmet during a practice on August 27.Hide Caption 10 of 42 Photos: The sports world has been taking a standThe Washington Mystics pay tribute to Jacob Blake after their WNBA game was postponed in Palmetto, Florida, on August 26.Hide Caption 11 of 42 Photos: The sports world has been taking a standA grounds crew covers the field at San Francisco's Oracle Park after a Major League Baseball game was postponed on August 26.Hide Caption 12 of 42 Photos: The sports world has been taking a standThe Milwaukee Bucks make a statement to the media after boycotting a playoff game on August 26.Hide Caption 13 of 42 Photos: The sports world has been taking a standDuring a practice round for the BMW Championship, pro golfer Cameron Champ used his shoes to show his support for Blake and the Black Lives Matter movement.Hide Caption 14 of 42 Photos: The sports world has been taking a standNBA superstar LeBron James has been one of the most outspoken athletes in recent months. \"Having two boys of my own and me being African American in America and to see what continues to happen with the police brutality towards my kind, continue to see what goes on with just the (unjust), it's very troubling. It's very troubling,\" James told Turner Sports' Jared Greenberg. \"My prayers go to (the Blake) family, and hopefully we can have some change.\" Hide Caption 15 of 42 Photos: The sports world has been taking a stand\"End racism\" banners are shown in Toronto's Scotiabank Arena before an NHL playoff game on August 26.Hide Caption 16 of 42 Photos: The sports world has been taking a standDoc Rivers, head coach of the NBA's Los Angeles Clippers, became emotional while talking about the Blake shooting and the Republican National Convention. \"All you hear is Donald Trump and all of them talking about fear,\" Rivers said. 
\"We're the ones getting killed. We're the ones getting shot. We're the ones who were denied to live in certain communities. We've been hung. We've been shot.\"Hide Caption 17 of 42 Photos: The sports world has been taking a standDetroit Lions defensive end Trey Flowers addresses the media with his teammates outside their practice facility on August 25. The Lions canceled their practice in light of the Blake shooting, and they discussed the incident in a team meeting that lasted hours. \"We're going to spread our message; we're going to do it as a team,\" Flowers said. \"We understand that somebody's going to try to skew the narrative ... get the message lost, but we're going to stay focused on the topic.\"Hide Caption 18 of 42 Photos: The sports world has been taking a standA Real Salt Lake fan sits in the stands after the Major League Soccer team had its game postponed on August 26.Hide Caption 19 of 42 Photos: The sports world has been taking a standNASCAR driver Bubba Wallace tweeted this selfie before a Cup Series race in Talladega, Alabama, on June 22. Fellow drivers and pit crew members walked alongside Wallace's car to show their support for him. Wallace, the only Black driver in NASCAR's top circuit, has been an outspoken advocate of the Black Lives Matter movement. Hide Caption 20 of 42 Photos: The sports world has been taking a standMembers of the NHL's Colorado Avalanche, Vegas Golden Knights, Dallas Stars and Vancouver Canucks gather together after their playoff games were postponed in Edmonton, Alberta.Hide Caption 21 of 42 Photos: The sports world has been taking a standThe Black Lives Matter protests haven't been limited to just North America. Here, professional soccer players from Aston Villa and Sheffield United take a knee as their match kicked off in Birmingham, England, on June 17. 
Premier League teams sported the words \"Black Lives Matter\" on the back of their jerseys when their seasons resumed.Hide Caption 22 of 42 Photos: The sports world has been taking a standFormula One champion Lewis Hamilton raises his fist before a race in Northampton, England, on August 9.Hide Caption 23 of 42 Photos: The sports world has been taking a standMatt Dumba of the Minnesota Wild kneels during the US National Anthem as it is played before an NHL game in Edmonton, Alberta, on August 1. Dumba, a member of the Hockey Diversity Alliance, gave a short speech before kneeling. He is circled by members of the Edmonton Oilers and the Chicago Blackhawks, who were about to play on the first day of the NHL's return.Hide Caption 24 of 42 Photos: The sports world has been taking a standSoccer players from PSG and Lyon take a knee before a Champions League semifinal in Bilbao, Spain, on August 26.Hide Caption 25 of 42 Photos: The sports world has been taking a standBeach volleyball player April Ross wears a temporary Black Lives Matter tattoo during a match in Long Beach, California, on July 19.Hide Caption 26 of 42 Photos: The sports world has been taking a standTennis players Dan Evans and Kyle Edmund join match umpire James Keothavong in taking a knee in London on June 28.Hide Caption 27 of 42 Photos: The sports world has been taking a standCricketers with the Kent Spitfires take a knee during a match in Canterbury, England.Hide Caption 28 of 42 Photos: The sports world has been taking a standFormula One driver Kimi Raikkonen wears an \"end racism\" shirt before a race in Barcelona, Spain, on August 16.Hide Caption 29 of 42 Photos: The sports world has been taking a standMembers of the Trinbago Knight Riders kneel before a cricket match in Port of Spain, Trinidad and Tobago.Hide Caption 30 of 42 Photos: The sports world has been taking a standThe Washington Nationals observe a moment of silence before Major League Baseball's opening game on July 23. 
Their opponents, The New York Yankees, also took a knee, and the initials BLM were on the pitcher's mound for the game.Hide Caption 31 of 42 Photos: The sports world has been taking a standMajor League Soccer players participate in a pregame ceremony before the league's restart on July 8.Hide Caption 32 of 42 Photos: The sports world has been taking a standHouston Dynamo goalkeeper Cody Cropper warms up in a Black Lives Matter T-shirt on August 25.Hide Caption 33 of 42 Photos: The sports world has been taking a standThe Boston Red Sox Foundation put this sign behind Fenway Park's famous \"Green Monster\" outfield wall.Hide Caption 34 of 42 Photos: The sports world has been taking a standRugby players from Bath and Northampton show their support before a game in Northampton, England.Hide Caption 35 of 42 Photos: The sports world has been taking a standKansas City outfielder Jorge Soler catches a ball in front of a George Floyd sign in Minneapolis on August 15.Hide Caption 36 of 42 Photos: The sports world has been taking a standA Houston Dash player wears a Black Lives Matter armband before a NWSL match in Herriman, Utah, on July 17.Hide Caption 37 of 42 Photos: The sports world has been taking a standAfter winning the Champions League, Bayern Munich defender David Alaba wears a shirt that says \"Black Lives Still Matter.\"Hide Caption 38 of 42 Photos: The sports world has been taking a standA Black Lives Matter sign is seen during a Major League Soccer match in Harrison, New Jersey, on August 24.Hide Caption 39 of 42 Photos: The sports world has been taking a standMembers of the San Antonio Spurs huddle before a game against Sacramento on July 31.Hide Caption 40 of 42 Photos: The sports world has been taking a standKilmarnock manager Alex Dyer, second from right, takes a knee before a soccer match in Glasgow, Scotland.Hide Caption 41 of 42 Photos: The sports world has been taking a standMembers of the Orlando Magic and Brooklyn Nets kneel during the National Anthem 
before the start of an NBA game on July 31.Hide Caption 42 of 42The Milwaukee Bucks did not emerge from their locker room before the scheduled tip at 4 p.m. Wednesday as they decided to refuse to play their playoff game.Soon after, the NBA announced it would postpone Game 5 of three different playoff series -- the Bucks vs. Orlando Magic, Houston Rockets vs. Oklahoma City Thunder and Los Angeles Lakers vs. Portland Trail Blazers.The postponed games have not yet been rescheduled.Three playoff games on Thursday also were postponed but the league said it hoped to resume games Friday or Saturday. Strikes are banned under the NBA's collective bargaining agreement, which means the Bucks players broke their own contract to protest racial injustice and police violence. In a statement issued on Wednesday afternoon, the Bucks players said they are \"calling for justice for Jacob Blake and demand the officers be held accountable. \"Full statement from the Milwaukee Bucks: pic.twitter.com/jjGEyVcCmB— Milwaukee Bucks (@Bucks) August 26, 2020\n", + "\n", + "\"The past four months have shed a light on the ongoing racial injustices facing our African American communities. Citizens around the country have used their voices and platforms to speak out against these wrongdoings,\" they said in a statement.\"Despite the overwhelming plea for change, there has been no action, so our focus today cannot be on basketball.\"The Los Angeles Lakers also offered a statement in support of the players' decision.\"Eighty percent of NBA players are Black men. 
We cannot love them for the joy and entertainment that they bring to the world, yet sit in silence and fail to use our platforms and resources to amplify their voices when they demand the justice and equality that America has promised us all, but denied Black people for too long,\" the Lakers said.WNBAElizabeth Williams of the Atlanta Dream said WNBA players were standing in solidarity with \"our brothers in the NBA.\"Representing the six teams slated to play on Wednesday, Atlanta Dream player Elizabeth Williams announced that WNBA players were standing in solidarity with \"our brothers in the NBA\" and also would not play. The ESPN2 broadcast showed players from the six teams scheduled to take the court in locked arms and kneeling while wearing shirts spelling out Jacob Blake's name. The WNBA announced that the three games scheduled for Wednesday evening had been postponed. On Thursday, the league announced the postponement of the day's three scheduled games as well.Instead of playing Thursday, the entire league of WNBA players stood arm-in-arm in solidarity.\"It is important to note that this is not a strike,\" said Women's National Basketball Players Association (WNBPA) president and Los Angeles Sparks star Nneka Ogwumike, reading a statement from the players, to ESPN's Holly Rowe. \"This is not a boycott. This is affirmatively a day of reflection. A day of informed action and mobilization.\"Ogwumike said players plan to play in games that are scheduled for Friday.\"I feel like as a group we decided last night that we want to play,\" Ogwumike said. \"There are games scheduled tomorrow. That's what we're ready to do, but that doesn't come without, of course, demands of players to continue to amplify our voices in more ways than when we came here.\"We realize that the work is not easy, but we also understand that the work is never done. 
There are things that happen that just allow us times to take a moment -- that's fair; we can take a moment -- and that moment we came together and we decided we need time for ourselves to come back, regroup and continue to amplify our voices, and be there for our communities and demand change from the leaders that are representing us right now.\"MLBThe words 'Black Lives Matter' are displayed on the digital screen after the postponement of the game between the San Francisco Giants and the Los Angeles Dodgers Wednesday.Three MLB games were postponed Wednesday: Cincinnati Reds vs. Milwaukee Brewers, Seattle Mariners vs. San Diego Padres, Los Angeles Dodgers vs. San Francisco Giants.\"With our community and our nation in such pain, we wanted to draw as much attention to the issues that really matter, especially racial injustice and systemic oppression,\" a joint statement from the Brewers and Reds said.\"Given the pain in the communities of Wisconsin and beyond following the shooting of Jacob Blake, we respect the decisions of a number of players not to play tonight,\" a league statement Wednesday said. \"Major League Baseball remains united for change in our society and we will be allies in the fight to end racism and injustice.\"Mets player Dominic Smith, who is Black, was driven to tears on Wednesday, telling reporters after the Mets defeated the Marlins, \"I think the most difficult part is to see people still don't care. And for this to continuously happen it just shows just the hate in people's heart and that just sucks. Being a Black man in America is not easy.\"On Thursday, seven games would not take place: Minnesota Twins vs. Detroit Tigers, Colorado Rockies vs. Arizona Diamondbacks, Baltimore Orioles vs. Tampa Bay Rays, Oakland Athletics vs. Texas Rangers, Philadelphia Phillies vs. Washington Nationals, Boston Red Sox vs. Toronto Blue Jays and Miami Marlins vs. 
New York Mets.Athletes across US sports take a stand, as games are called off in solidarity with Bucks' boycottIn a powerful moment Thursday at Citi Field in New York, the Mets starting defense took the field as scheduled, and then the remaining players for both teams came out of their dugouts and a moment of silence was observed, which the SNY television broadcast said lasted 42 seconds, and then walked off the field.Before he left the field, Miami Marlins player Lewis Brinson laid a \"Black Lives Matter\" shirt at home plate.\"Enough,\" the Marlins tweeted.Enough. pic.twitter.com/25SnNMCfBu— Miami Marlins (@Marlins) August 27, 2020 MLB is scheduled to observe Jackie Robinson Day on Friday to honor the color-barrier-breaking Hall of Famer, who wore number 42.NFLSeveral National Football League teams canceled practice Thursday to focus on conversations about race.The Denver Broncos joined the Arizona Cardinals, Indianapolis Colts, Washington Football Team and New York Jets in forgoing practice to allow players to weigh in on what they can do to effect change.The Chicago Bears said in a statement Thursday that the organization had decided to pause football activities \"to voice to each other, our coaches and our staff where we stand on the real issues around race and police brutality in our country.\"\"We had a productive discussion, but we all agreed that talks and discussions are simply not enough anymore and we need action,\" the statement said. \"We are putting in plans to take action in our communities and together we believe we can make a real difference. 
We need action not only today, but in the days to come.\"MLSInter Miami midfielder Lee Nguyen exits the field after Inter Miami and Atlanta United players decided not to play.Five Major League Soccer games scheduled to take place on Wednesday night were postponed, the league announced.\"The entire Major League Soccer family unequivocally condemns racism and has always stood for equality, but we need to do more to take tangible steps to impact change,\" MLS said. \"We will continue to work without players, our clubs and the broader soccer community to harness our collective power to fight for equality and social justice.\"TennisTennis superstar Naomi Osaka announced Wednesday that she would not play in the Western & Southern Open semifinals on Thursday. \"Watching the continued genocide of Black people at the hand of the police is honestly making me sick to my stomach,\" she wrote.Later on Wednesday, after Osaka's post, it was announced that the Western & Southern Open would pause tournament play on Thursday, resuming on Friday.By Thursday, a statement by her management team to The Guardian and The New York Times indicated she planned to play Friday instead.\"They offered to postpone all matches until Friday and in my mind that brings more attention to the movement,\" Osaka said. She also thanked the WTA and the tournament for their support.A WTA spokesperson tells CNN that Osaka did not officially withdraw from the Western & Southern Open on Wednesday. She is scheduled to play Elise Mertens in the semifinals on Friday.NHLThe NHL's Stanley Cup Playoffs continued Wednesday with two games. However, the league opted to postpone two games scheduled for Thursday and two scheduled for Friday.\"After much discussion, NHL Players believe that the best course of action would be to take a step back and not play tonight's and tomorrow's games as scheduled,\" a Thursday joint statement from the league and the players association said. 
\"The NHL supports the Players' decision and will reschedule those four games beginning Saturday and adjust the remainder of the Second Round schedule accordingly. \"Black and Brown communities continue to face real, painful experiences. The NHL and NHLPA recognize that much work remains to be done before we can play an appropriate role in a discussion centered on diversity, inclusion and social justice.\"Before the start of Game 3 of the second-round series between the Tampa Bay Lightning and Boston Bruins on Wednesday, there was a moment of reflection for racism in response to the Blake shooting.CNN's Ray Sanchez, Eric Levenson and Cesar Marin contributed to this report.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We will first identify the entities in the text based on the provided entity types. Then, we will determine the relationships between these entities, focusing on the interactions and associations described in the text. 
Finally, we will format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Milwaukee Bucks\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"An NBA team that initiated the wildcat strike.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Jacob Blake\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A person who was shot by police, leading to the sports world's protest.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Kenosha, Wisconsin\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The location where Jacob Blake was shot by police.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"NBA\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The National Basketball Association, which saw multiple teams participating in the strike.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Los Angeles Lakers\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"An NBA team that supported the strike.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"WNBA\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The Women's National Basketball Association, which also participated in the strike.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Major League Baseball (MLB)\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A 
professional baseball organization where games were postponed in solidarity.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"National Football League (NFL)\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A professional football league where teams canceled practices to discuss racial issues.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Major League Soccer (MLS)\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A professional soccer league where games were postponed in solidarity.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Naomi Osaka\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A tennis superstar who refused to play in a tournament to protest racial injustice.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"National Hockey League (NHL)\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A professional hockey league where games were postponed in solidarity.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Milwaukee Bucks\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The Milwaukee Bucks initiated a wildcat strike in the NBA.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Milwaukee Bucks\",\n", + "\n", + "\"tgt_id\": \"Jacob Blake\",\n", + "\n", + "\"description\": \"The Milwaukee Bucks' strike was in response to the police shooting of Jacob Blake.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Los Angeles Lakers\",\n", 
+ "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The Los Angeles Lakers supported the strike in the NBA.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"WNBA\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The WNBA stood in solidarity with the NBA during the strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Major League Baseball (MLB)\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The MLB postponed games in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"National Football League (NFL)\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The NFL canceled practices to discuss racial issues in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Major League Soccer (MLS)\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The MLS postponed games in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Naomi Osaka\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"Naomi Osaka refused to play in a tournament in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"National Hockey League (NHL)\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The NHL postponed games in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + 
"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/beta/chat/completions \"HTTP/1.1 200 OK\"\n", + "/opt/homebrew/Caskroom/miniconda/base/envs/nano-graphrag/lib/python3.10/site-packages/optuna/samplers/_tpe/sampler.py:319: ExperimentalWarning: ``multivariate`` option is an experimental feature. The interface can change in the future.\n", + " warnings.warn(\n", + "[I 2024-09-20 18:37:04,086] A new study created in memory with name: no-name-80b7b333-e37b-4380-94c7-15183cc4517f\n", + "INFO:root:Starting trial num: 0\n", + "INFO:root:instruction_idx 1\n", + "INFO:root:demos_idx 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "DATASET SUMMARY: A description of the dataset that we are using.\n", + "\n", + "PROGRAM CODE: Language model program designed to solve a particular task.\n", + "\n", + "PROGRAM DESCRIPTION: Summary of the task the program is designed to solve, and how it goes about solving it.\n", + "\n", + "MODULE: The module to create an instruction for.\n", + "\n", + "TASK DEMO(S): Example inputs/outputs of our module.\n", + "\n", + "BASIC INSTRUCTION: Basic instruction.\n", + "\n", + "TIP: A suggestion for how to go about generating the new instruction.\n", + "\n", + "PROPOSED INSTRUCTION: Propose an instruction that will be used to prompt a Language Model to perform this task.\n", + "\n", + "---\n", + "\n", + "DATASET SUMMARY: The dataset consists of news articles from CNN, covering a wide range of topics with a structured format including headlines, summaries, and detailed bodies. 
Key entities are highlighted with importance scores, and multimedia elements are prevalent, making it suitable for natural language processing and information extraction tasks.\n", + "\n", + "PROGRAM CODE:\n", + "StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions='Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")\n", + "\n", + "\n", + "\n", + "class TypedPredictor(dspy.Module):\n", + " def __init__(self, signature, instructions=None, *, max_retries=3, wrap_json=False, explain_errors=False):\n", + " \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + " Args:\n", + " signature: The signature of the module. Can use type annotations.\n", + " instructions: A description of what the model should do.\n", + " max_retries: The number of times to retry the prediction if the output is invalid.\n", + " wrap_json: If True, json objects in the input will be wrapped in ```json ... 
```\n", + " explain_errors: If True, the model will try to explain the errors it encounters.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.signature = ensure_signature(signature, instructions)\n", + " self.predictor = dspy.Predict(signature)\n", + " self.max_retries = max_retries\n", + " self.wrap_json = wrap_json\n", + " self.explain_errors = explain_errors\n", + "\n", + " def copy(self) -> \"TypedPredictor\":\n", + " return TypedPredictor(\n", + " self.signature,\n", + " max_retries=self.max_retries,\n", + " wrap_json=self.wrap_json,\n", + " explain_errors=self.explain_errors,\n", + " )\n", + "\n", + " def __repr__(self):\n", + " \"\"\"Return a string representation of the TypedPredictor object.\"\"\"\n", + " return f\"TypedPredictor({self.signature})\"\n", + "\n", + " def _make_example(self, type_) -> str:\n", + " # Note: DSPy will cache this call so we only pay the first time TypedPredictor is called.\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " schema = \"```json\\n\" + schema + \"\\n```\\n\"\n", + " json_object = dspy.Predict(\n", + " make_signature(\n", + " \"json_schema -> json_object\",\n", + " \"Make a very succinct json object that validates with the following schema\",\n", + " ),\n", + " )(json_schema=schema).json_object\n", + " # We use the model_validate_json method to make sure the example is valid\n", + " try:\n", + " type_.model_validate_json(_unwrap_json(json_object, type_.model_validate_json))\n", + " except (pydantic.ValidationError, ValueError):\n", + " return \"\" # Unable to make an example\n", + " return json_object\n", + " # TODO: Another fun idea is to only (but automatically) do this if the output fails.\n", + " # We could also have a more general \"suggest solution\" prompt that tries to fix the output\n", + " # More directly.\n", + " # TODO: Instead of using a language model to create the example, we can also just use a\n", + " # library like https://pypi.org/project/polyfactory/ 
that's made exactly to do this.\n", + "\n", + " def _format_error(\n", + " self,\n", + " error: Exception,\n", + " task_description: Union[str, FieldInfo],\n", + " model_output: str,\n", + " lm_explain: bool,\n", + " ) -> str:\n", + " if isinstance(error, pydantic.ValidationError):\n", + " errors = []\n", + " for e in error.errors():\n", + " fields = \", \".join(map(str, e[\"loc\"]))\n", + " errors.append(f\"{e['msg']}: {fields} (error type: {e['type']})\")\n", + " error_text = \"; \".join(errors)\n", + " else:\n", + " error_text = repr(error)\n", + "\n", + " if self.explain_errors and lm_explain:\n", + " if isinstance(task_description, FieldInfo):\n", + " args = task_description.json_schema_extra\n", + " task_description = args[\"prefix\"] + \" \" + args[\"desc\"]\n", + " return (\n", + " error_text\n", + " + \"\\n\"\n", + " + self._make_explanation(\n", + " task_description=task_description,\n", + " model_output=model_output,\n", + " error=error_text,\n", + " )\n", + " )\n", + "\n", + " return error_text\n", + "\n", + " def _make_explanation(self, task_description: str, model_output: str, error: str) -> str:\n", + " class Signature(dspy.Signature):\n", + " \"\"\"I gave my language model a task, but it failed.\n", + "\n", + " Figure out what went wrong, and write instructions to help it avoid the error next time.\n", + " \"\"\"\n", + "\n", + " task_description: str = dspy.InputField(desc=\"What I asked the model to do\")\n", + " language_model_output: str = dspy.InputField(desc=\"The output of the model\")\n", + " error: str = dspy.InputField(desc=\"The validation error trigged by the models output\")\n", + " explanation: str = dspy.OutputField(desc=\"Explain what the model did wrong\")\n", + " advice: str = dspy.OutputField(\n", + " desc=\"Instructions for the model to do better next time. A single paragraph.\",\n", + " )\n", + "\n", + " # TODO: We could also try repair the output here. 
For example, if the output is a float, but the\n", + " # model returned a \"float + explanation\", the repair could be to remove the explanation.\n", + "\n", + " return dspy.Predict(Signature)(\n", + " task_description=task_description,\n", + " language_model_output=model_output,\n", + " error=error,\n", + " ).advice\n", + "\n", + " def _prepare_signature(self) -> dspy.Signature:\n", + " \"\"\"Add formats and parsers to the signature fields, based on the type annotations of the fields.\"\"\"\n", + " signature = self.signature\n", + " for name, field in self.signature.fields.items():\n", + " is_output = field.json_schema_extra[\"__dspy_field_type\"] == \"output\"\n", + " type_ = field.annotation\n", + " if is_output:\n", + " if type_ is bool:\n", + "\n", + " def parse(x):\n", + " x = x.strip().lower()\n", + " if x not in (\"true\", \"false\"):\n", + " raise ValueError(\"Respond with true or false\")\n", + " return x == \"true\"\n", + "\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\" (Respond with true or false)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=parse,\n", + " )\n", + " elif type_ in (str, int, float):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (f\" (Respond with a single {type_.__name__} value)\" if type_ != str else \"\"),\n", + " format=lambda x: x if isinstance(x, str) else str(x),\n", + " parser=type_,\n", + " )\n", + " elif False:\n", + " # TODO: I don't like forcing the model to write \"value\" in the output.\n", + " if not (inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel)):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()[9:-1] # {\"value\":\"123\"}\n", + " from_json = lambda 
x, type_=type_: type_.model_validate_json('{\"value\":' + x + \"}\").value\n", + " schema = json.dumps(type_.model_json_schema()[\"properties\"][\"value\"])\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " # Anything else we wrap in a pydantic object\n", + " if not (\n", + " inspect.isclass(type_)\n", + " and typing.get_origin(type_) not in (list, tuple) # To support Python 3.9\n", + " and issubclass(type_, pydantic.BaseModel)\n", + " ):\n", + " type_ = pydantic.create_model(\"Output\", value=(type_, ...), __base__=pydantic.BaseModel)\n", + " to_json = lambda x, type_=type_: type_(value=x).model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x).value\n", + " schema = json.dumps(type_.model_json_schema())\n", + " else:\n", + " to_json = lambda x: x.model_dump_json()\n", + " from_json = lambda x, type_=type_: type_.model_validate_json(x)\n", + " schema = json.dumps(type_.model_json_schema())\n", + " if self.wrap_json:\n", + " to_json = lambda x, inner=to_json: \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " schema = \"```json\\n\" + schema + \"\\n```\"\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=field.json_schema_extra.get(\"desc\", \"\")\n", + " + (\". Respond with a single JSON object. JSON Schema: \" + schema),\n", + " format=lambda x, to_json=to_json: (x if isinstance(x, str) else to_json(x)),\n", + " parser=lambda x, from_json=from_json: from_json(_unwrap_json(x, from_json)),\n", + " type_=type_,\n", + " )\n", + " else: # If input field\n", + " is_json = False\n", + " format_ = lambda x: x if isinstance(x, str) else str(x)\n", + " if type_ in (List[str], list[str], Tuple[str], tuple[str]):\n", + " format_ = passages2text\n", + " # Special formatting for lists of known types. 
Maybe the output fields sohuld have this too?\n", + " elif typing.get_origin(type_) in (List, list, Tuple, tuple):\n", + " (inner_type,) = typing.get_args(type_)\n", + " if inspect.isclass(inner_type) and issubclass(inner_type, pydantic.BaseModel):\n", + " format_ = (\n", + " lambda x: x if isinstance(x, str) else \"[\" + \",\".join(i.model_dump_json() for i in x) + \"]\"\n", + " )\n", + " else:\n", + " format_ = lambda x: x if isinstance(x, str) else json.dumps(x)\n", + " is_json = True\n", + " elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel):\n", + " format_ = lambda x: x if isinstance(x, str) else x.model_dump_json()\n", + " is_json = True\n", + " if self.wrap_json and is_json:\n", + " format_ = lambda x, inner=format_: x if isinstance(x, str) else \"```json\\n\" + inner(x) + \"\\n```\\n\"\n", + " signature = signature.with_updated_fields(name, format=format_)\n", + "\n", + " return signature\n", + "\n", + " def forward(self, **kwargs) -> dspy.Prediction:\n", + " modified_kwargs = kwargs.copy()\n", + " # We have to re-prepare the signature on every forward call, because the base\n", + " # signature might have been modified by an optimizer or something like that.\n", + " signature = self._prepare_signature()\n", + " for try_i in range(self.max_retries):\n", + " result = self.predictor(**modified_kwargs, new_signature=signature)\n", + " errors = {}\n", + " parsed_results = []\n", + " # Parse the outputs\n", + " for completion in result.completions:\n", + " parsed = {}\n", + " for name, field in signature.output_fields.items():\n", + " try:\n", + " value = completion[name]\n", + " parser = field.json_schema_extra.get(\"parser\", lambda x: x)\n", + " parsed[name] = parser(value)\n", + " except (pydantic.ValidationError, ValueError) as e:\n", + " errors[name] = self._format_error(\n", + " e,\n", + " signature.fields[name],\n", + " value,\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + "\n", + " # If we can, we add an example to 
the error message\n", + " current_desc = field.json_schema_extra.get(\"desc\", \"\")\n", + " i = current_desc.find(\"JSON Schema: \")\n", + " if i == -1:\n", + " continue # Only add examples to JSON objects\n", + " suffix, current_desc = current_desc[i:], current_desc[:i]\n", + " prefix = \"You MUST use this format: \"\n", + " if (\n", + " try_i + 1 < self.max_retries\n", + " and prefix not in current_desc\n", + " and (example := self._make_example(field.annotation))\n", + " ):\n", + " signature = signature.with_updated_fields(\n", + " name,\n", + " desc=current_desc + \"\\n\" + prefix + example + \"\\n\" + suffix,\n", + " )\n", + " # No reason trying to parse the general signature, or run more completions, if we already have errors\n", + " if errors:\n", + " break\n", + " # Instantiate the actual signature with the parsed values.\n", + " # This allow pydantic to validate the fields defined in the signature.\n", + " try:\n", + " _ = self.signature(**kwargs, **parsed)\n", + " parsed_results.append(parsed)\n", + " except pydantic.ValidationError as e:\n", + " errors[\"general\"] = self._format_error(\n", + " e,\n", + " signature.instructions,\n", + " \"\\n\\n\".join(\n", + " \"> \" + field.json_schema_extra[\"prefix\"] + \" \" + completion[name]\n", + " for name, field in signature.output_fields.items()\n", + " ),\n", + " lm_explain=try_i + 1 < self.max_retries,\n", + " )\n", + " if errors:\n", + " # Add new fields for each error\n", + " for name, error in errors.items():\n", + " modified_kwargs[f\"error_{name}_{try_i}\"] = error\n", + " if name == \"general\":\n", + " error_prefix = \"General:\"\n", + " else:\n", + " error_prefix = signature.output_fields[name].json_schema_extra[\"prefix\"]\n", + " number = \"\" if try_i == 0 else f\" ({try_i+1})\"\n", + " signature = signature.append(\n", + " f\"error_{name}_{try_i}\",\n", + " dspy.InputField(\n", + " prefix=f\"Past Error{number} in {error_prefix}\",\n", + " desc=\"An error to avoid in the future\",\n", + " ),\n", 
+ " )\n", + " else:\n", + " # If there are no errors, we return the parsed results\n", + " return Prediction.from_completions(\n", + " {key: [r[key] for r in parsed_results] for key in signature.output_fields},\n", + " )\n", + " raise ValueError(\n", + " \"Too many retries trying to get the correct output format. \" + \"Try simplifying the requirements.\",\n", + " errors,\n", + " )\n", + "\n", + "class TypedEntityRelationshipExtractorException(dspy.Module):\n", + " def __init__(self, predictor: dspy.Module, exception_types: tuple[type[Exception]] = (Exception,)):\n", + " super().__init__()\n", + " self.predictor = predictor\n", + " self.exception_types = exception_types\n", + "\n", + " def copy(self):\n", + " return TypedEntityRelationshipExtractorException(self.predictor)\n", + "\n", + " def forward(self, **kwargs):\n", + " try:\n", + " prediction = self.predictor(**kwargs)\n", + " return prediction\n", + "\n", + " except Exception as e:\n", + " if isinstance(e, self.exception_types):\n", + " return dspy.Prediction(entities_relationships=[])\n", + "\n", + " raise e\n", + "\n", + "class TypedEntityRelationshipExtractor(dspy.Module):\n", + " def __init__(self, instructions: str = None, reasoning: str = None, max_retries: int = 3, lm: dspy.LM = None):\n", + " super().__init__()\n", + " self.lm = lm\n", + " self.entity_types = ENTITY_TYPES\n", + " self.extractor = dspy.TypedChainOfThought(\n", + " signature=CombinedExtraction, \n", + " instructions=instructions, \n", + " reasoning=reasoning, \n", + " max_retries=max_retries\n", + " )\n", + " self.extractor = TypedEntityRelationshipExtractorException(self.extractor, exception_types=(ValueError,))\n", + "\n", + " def forward(self, input_text: str) -> dspy.Prediction:\n", + " with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm):\n", + " extraction_result = self.extractor(input_text=input_text, entity_types=self.entity_types)\n", + "\n", + " entities = [\n", + " {\n", + " \"entity_name\": 
clean_str(entity['entity_name'].upper()),\n", + " \"entity_type\": clean_str(entity['entity_type'].upper()),\n", + " \"description\": entity['description'],\n", + " \"importance_score\": float(entity['importance_score'])\n", + " }\n", + " for entity in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Entity)]\n", + " ]\n", + "\n", + " relationships = [\n", + " {\n", + " \"src_id\": clean_str(relationship['src_id'].upper()),\n", + " \"tgt_id\": clean_str(relationship['tgt_id'].upper()),\n", + " \"description\": relationship['description'],\n", + " \"weight\": float(relationship['weight']),\n", + " \"order\": int(relationship['order'])\n", + " }\n", + " for relationship in [item.model_dump() for item in extraction_result.entities_relationships if isinstance(item, Relationship)]\n", + " ]\n", + " return dspy.Prediction(entities=entities, relationships=relationships)\n", + "\n", + "\n", + "PROGRAM DESCRIPTION: The program is designed to solve the task of extracting entities and relationships from text, specifically focusing on identifying entities based on predefined types and determining the relationships between these entities. It works by using a language model to process the input text and entity types, generating a step-by-step reasoning process to identify and relate the entities, and finally outputting the entities and their relationships in a structured JSON format. The program ensures type annotations are enforced and includes mechanisms for handling errors and retries to improve the accuracy of the output.\n", + "\n", + "MODULE: \"\"\"Like dspy.Predict, but enforces type annotations in the signature.\n", + "\n", + "TASK DEMO(S):\n", + "Input Text: (CNN)A ban on cellphones in classrooms was announced in the Australian state of Victoria on Wednesday. 
The ban, which will apply to primary and secondary students, is designed to crack down on cyber-bulling and classroom distractions, the Victorian government said in a statement. While some schools in Australia have already banned cell-phone use, this marks the first statewide ban in the country. It will begin from the first school term of 2020.\"Students will be required to switch off their phones and store them securely in lockers from the start of the school day until the final bell. When emergencies occur, parents or guardians can reach their child by calling the school,\" the statement said.Victoria Education Minister James Merlino said he didn't expect the move to be \"universally popular\" but said it was the \"right thing to do\" on his official social media. Read MoreA report says young people are growing horns on their skulls. Critics don't buy itThe move isn't without precedent. Australia's largest state, New South Wales, banned mobile phones in primary schools in 2018, and France has banned all smart phones and smart devices from schools.Australian Education Minister Dan Tehan said in a statement that he supported the Victoria ban.\"When I talk to parents and teachers the overwhelming majority want action on banning mobile phones in the classroom,\" Tehan said in a statement. In a 2019 survey of about 2,000 Australian adults, by Monash University professor Neil Selywn, almost 80% supported a restriction on cell-phone use in schools, while one-third supported an outright ban.Ann Marie, a secondary school principal at a college in Victoria, told ABC Radio Melbourne that pupils at her school had agreed to restrict their mobile phone use during the day, explaining that they hoped to reduce \"bullying, distraction and one-upmanship.\"But there have been some concerns. 
The Australian Education Union Victoria said in a statement that while it supports the ban: \"We can't lose sight of the benefits that technology, including mobile phones, can bring to high quality teaching and learning.\"Some pupils with special needs will be exempt from the ban, including those who require telephones to monitor health conditions or those who have received permission from a teacher. \n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering the context and descriptions provided in the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Victoria\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The Australian state where the cellphone ban in classrooms was announced.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"James Merlino\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The Victoria Education Minister who announced the cellphone ban.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"New South Wales\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"Australia's largest state that previously banned mobile phones in primary schools.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"France\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"A country that has banned all smart phones and smart devices from schools.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Dan Tehan\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The Australian Education Minister who supported the Victoria ban.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Monash University\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The university where a survey on cell-phone use in schools was conducted.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Neil Selwyn\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + 
"\"description\": \"A professor at Monash University who conducted the survey on cell-phone use in schools.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Ann Marie\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A secondary school principal in Victoria who discussed the impact of the cellphone ban.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Australian Education Union Victoria\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"An organization that supports the cellphone ban but highlights the benefits of technology in education.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"James Merlino\",\n", + "\n", + "\"description\": \"James Merlino, the Victoria Education Minister, announced the cellphone ban in Victoria.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"New South Wales\",\n", + "\n", + "\"description\": \"The cellphone ban in Victoria follows a similar ban in New South Wales.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"France\",\n", + "\n", + "\"description\": \"The cellphone ban in Victoria is similar to a ban in France.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Dan Tehan\",\n", + "\n", + "\"description\": \"Dan Tehan, the Australian Education Minister, supported the Victoria ban.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + 
"\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Monash University\",\n", + "\n", + "\"description\": \"A survey conducted at Monash University showed support for the Victoria ban.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Neil Selwyn\",\n", + "\n", + "\"description\": \"Neil Selwyn, a professor at Monash University, conducted the survey on cell-phone use in schools.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Ann Marie\",\n", + "\n", + "\"description\": \"Ann Marie, a secondary school principal in Victoria, discussed the impact of the cellphone ban.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Victoria\",\n", + "\n", + "\"tgt_id\": \"Australian Education Union Victoria\",\n", + "\n", + "\"description\": \"The Australian Education Union Victoria supports the cellphone ban but highlights the benefits of technology in education.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: Manchester, England (CNN)Boris Johnson wants to keep his Brexit plan private. But the Prime Minister isn't having much luck these days. On Monday night, Irish broadcaster RTE reported what it claimed were details of part of Johnson's plan to break the Brexit deadlock. According to RTE, the UK's plan to avoid post-Brexit infrastructure on the frontier between Northern Ireland and the Republic of Ireland involve \"customs clearance centers\" several miles from the border -- and on both sides of it. 
The women of Boris Johnson's past are starting to catch up with himThat, it hopes, will remove the need for something called the Irish border backstop from any final Brexit deal. The backstop is an emergency measure which would keep the two Irish nations in regulatory alignment, removing the need for any border or \"clearance centers\" at all. It's deeply unpopular with many in Johnson's Conservative party and is one of the main reasons his predecessor, Theresa May, failed to get her Withdrawal Agreement through Parliament.Ireland quickly dismissed the idea as a \"non starter\". And Johnson himself told the BBC on Tuesday, \"that's not what we're proposing at all... you'll forgive me, I would like to veil our proposals in decent obscurity until we've been able to share them properly with our friends\". Read MoreSo why is this story a big deal? Keeping the government's formal proposals from Brussels for the time being might be wise. The crunch EU summit, at which any Brexit deal will be struck, is now under three weeks away. JUST WATCHEDBoris Johnson tells opponents to hold no-confidence voteReplayMore Videos ...MUST WATCHBoris Johnson tells opponents to hold no-confidence vote 02:41The EU is a notoriously tough institution to negotiate with and has a history of tearing apart any Brexit proposal from the moment it leaves the door. Johnson has repeatedly said that the only way to make the EU budge on ditching the backstop is to show that he is both deadly serious about getting a deal but also prepared to leave without one. Some of the ideas reported by RTE chime with speculation about the UK government's plans in recent weeks. And anyone who has followed Brexit for the last three years will tell you, the EU and Ireland will refuse to sign a deal that involves infrastructure between Ireland and Northern Ireland. 
So, even though the Prime Minister has distanced himself from the proposals, these headlines will have done little to reassure Brussels that Johnson and his government are serious about getting a deal. It's also a bad look for Johnson back home. He is currently out of London and at his Conservative party's annual conference in Manchester, in the north of England. Johnson will close the conference with a keynote speech on Wednesday afternoon. British politics has been taken over by trollsIt's widely thought that he wanted to trail his final Brexit plan during this speech before formally submitting something to Brussels in the subsequent days. The problem with the leak of any Brexit plan now is that it increases the risk of Johnson being greeted by a hostile audience at his conference on Wednesday. Despite being the Brexit hero of 2016, Conservative hard-Brexiteers are deeply worried that Johnson, if he succeeds in getting rid of the backstop, will simply try and push through the rest May's Brexit deal.For Brexiteers, this would be a huge sellout. There are people sitting in Johnson's cabinet who resigned from Theresa May's government to vote against her Brexit deal. And Johnson himself resigned as May's Foreign Secretary over her general approach to Brexit. Up here in Manchester, the mood among party members is generally positive. Johnson has always been a popular Conservative MP and he seems to have broad support for his hard Brexit language. That could turn sour if he's seen to make concessions to Brussels. Which would all be somewhat ironic, for a man who has spent the past few days accusing his opponents of surrendering. 
\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. We then determine the relationships between these entities, considering their interactions and dependencies within the context of the text. Finally, we format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Boris Johnson\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"The Prime Minister of the United Kingdom who is leading the Brexit negotiations.\",\n", + "\n", + "\"importance_score\": 1.0\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Brexit\",\n", + "\n", + "\"entity_type\": \"EVENT\",\n", + "\n", + "\"description\": \"The process of the United Kingdom withdrawing from the European Union.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"European Union\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A political and economic union of 27 European countries.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + 
"\"entity_name\": \"Irish border backstop\",\n", + "\n", + "\"entity_type\": \"LAW\",\n", + "\n", + "\"description\": \"An emergency measure to keep the two Irish nations in regulatory alignment, removing the need for any border or 'clearance centers'.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Theresa May\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"Former Prime Minister of the United Kingdom and predecessor of Boris Johnson.\",\n", + "\n", + "\"importance_score\": 0.6\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Conservative party\",\n", + "\n", + "\"entity_type\": \"POLITICAL_PARTY\",\n", + "\n", + "\"description\": \"The political party in the United Kingdom to which Boris Johnson and Theresa May belong.\",\n", + "\n", + "\"importance_score\": 0.5\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"RTE\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"Irish broadcaster that reported details of Johnson's Brexit plan.\",\n", + "\n", + "\"importance_score\": 0.4\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Boris Johnson\",\n", + "\n", + "\"tgt_id\": \"Brexit\",\n", + "\n", + "\"description\": \"Boris Johnson is leading the Brexit negotiations as the Prime Minister of the United Kingdom.\",\n", + "\n", + "\"weight\": 1.0,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Brexit\",\n", + "\n", + "\"tgt_id\": \"European Union\",\n", + "\n", + "\"description\": \"Brexit involves the United Kingdom withdrawing from the European Union.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Boris Johnson\",\n", + "\n", + "\"tgt_id\": \"Irish border backstop\",\n", + "\n", + "\"description\": \"Boris Johnson aims to remove the Irish border backstop from any 
final Brexit deal.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Theresa May\",\n", + "\n", + "\"tgt_id\": \"Brexit\",\n", + "\n", + "\"description\": \"Theresa May failed to get her Withdrawal Agreement through Parliament due to the Irish border backstop.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Boris Johnson\",\n", + "\n", + "\"tgt_id\": \"Conservative party\",\n", + "\n", + "\"description\": \"Boris Johnson is a member of the Conservative party and its leader.\",\n", + "\n", + "\"weight\": 0.6,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"RTE\",\n", + "\n", + "\"tgt_id\": \"Boris Johnson\",\n", + "\n", + "\"description\": \"RTE reported details of part of Johnson's plan to break the Brexit deadlock.\",\n", + "\n", + "\"weight\": 0.5,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + "Input Text: (CNN)Across the sports world, athletes took a stand for Black Lives Matter on Wednesday night by holding an unprecedented wildcat strike and refusing to play their regularly scheduled games.The decision started with the NBA's Milwaukee Bucks and stemmed from the police shooting of Jacob Blake in Kenosha, Wisconsin, on Sunday. The stand quickly spread across a number of sports -- from basketball courts in Florida to baseball diamonds in California to soccer fields in places like Miami and Salt Lake City and even into the broadcast booth, where TNT's \"Inside the NBA\" analyst Kenny Smith took off his mic and walked off stage.But the issue is bigger than Jacob Blake or George Floyd or Breonna Taylor. 
Malcolm Jenkins of the NFL's New Orleans Saints positioned the protests as a response to the systemic racism baked into America, including police violence, cycles of incarceration, and the plunder of generational wealth.\"Until we continue to demand it, until 'Black Lives Matter' goes from just an idea or a goal that we're trying to attain as a society and is actually realized in the streets, we won't see any peace,\" he said. \"And I think we'll continue to see athletes, entertainers as well as citizens disrupt the status quo until that's recognized.\"Here's a look at the teams and athletes who pulled back in order to have their voices be heard.Read MoreNBA Photos: The sports world has been taking a standMembers of the Los Angeles Lakers and the Miami Heat kneel during the National Anthem before Game 1 of the NBA Finals on Wednesday, September 30.Hide Caption 1 of 42 Photos: The sports world has been taking a standThe Philadelphia Eagles and the Washington Football Team stand together before their NFL season opener on September 13.Hide Caption 2 of 42 Photos: The sports world has been taking a standTyler Wright, a two-time World Surf League Champion, takes a knee before competing in an event in Tweed Heads South, Australia, on September 13.Hide Caption 3 of 42 Photos: The sports world has been taking a standTennis star Naomi Osaka wears a face mask with Tamir Rice's name before winning the US Open final on September 12. Osaka wore a different name for each of her seven matches. 
Rice, a 12-year-old boy, was killed by police gunfire in Cleveland while he was holding a toy replica pistol in 2014.Hide Caption 4 of 42 Photos: The sports world has been taking a standChelsea's Pernille Harder walks out for a warm-up before a soccer match in Leigh, England, on September 6.Hide Caption 5 of 42 Photos: The sports world has been taking a standThe University of Alabama football team, led by coach Nick Saban, marches on campus in support of the Black Lives Matter movement.Hide Caption 6 of 42 Photos: The sports world has been taking a standBlack Lives Matter signage is seen in New York's Louis Armstrong Stadium as Cori Gauff plays against Anastasija Sevastova at the US Open.Hide Caption 7 of 42 Photos: The sports world has been taking a standMembers of the WNBA's Indiana Fever wear Black Lives Matter shirts before their game against the Chicago Sky on August 31.Hide Caption 8 of 42 Photos: The sports world has been taking a standMiami Marlins outfielder Lewis Brinson walks off the field August 27 after placing a Black Lives Matter T-shirt on home plate in New York. The Marlins and New York Mets walked off the field after a moment of silence, choosing not to play their scheduled baseball game. 
Hide Caption 9 of 42 Photos: The sports world has been taking a standNew Orleans Saints wide receiver Emmanuel Sanders wears Jacob Blake's name on his helmet during a practice on August 27.Hide Caption 10 of 42 Photos: The sports world has been taking a standThe Washington Mystics pay tribute to Jacob Blake after their WNBA game was postponed in Palmetto, Florida, on August 26.Hide Caption 11 of 42 Photos: The sports world has been taking a standA grounds crew covers the field at San Francisco's Oracle Park after a Major League Baseball game was postponed on August 26.Hide Caption 12 of 42 Photos: The sports world has been taking a standThe Milwaukee Bucks make a statement to the media after boycotting a playoff game on August 26.Hide Caption 13 of 42 Photos: The sports world has been taking a standDuring a practice round for the BMW Championship, pro golfer Cameron Champ used his shoes to show his support for Blake and the Black Lives Matter movement.Hide Caption 14 of 42 Photos: The sports world has been taking a standNBA superstar LeBron James has been one of the most outspoken athletes in recent months. \"Having two boys of my own and me being African American in America and to see what continues to happen with the police brutality towards my kind, continue to see what goes on with just the (unjust), it's very troubling. It's very troubling,\" James told Turner Sports' Jared Greenberg. \"My prayers go to (the Blake) family, and hopefully we can have some change.\" Hide Caption 15 of 42 Photos: The sports world has been taking a stand\"End racism\" banners are shown in Toronto's Scotiabank Arena before an NHL playoff game on August 26.Hide Caption 16 of 42 Photos: The sports world has been taking a standDoc Rivers, head coach of the NBA's Los Angeles Clippers, became emotional while talking about the Blake shooting and the Republican National Convention. \"All you hear is Donald Trump and all of them talking about fear,\" Rivers said. 
\"We're the ones getting killed. We're the ones getting shot. We're the ones who were denied to live in certain communities. We've been hung. We've been shot.\"Hide Caption 17 of 42 Photos: The sports world has been taking a standDetroit Lions defensive end Trey Flowers addresses the media with his teammates outside their practice facility on August 25. The Lions canceled their practice in light of the Blake shooting, and they discussed the incident in a team meeting that lasted hours. \"We're going to spread our message; we're going to do it as a team,\" Flowers said. \"We understand that somebody's going to try to skew the narrative ... get the message lost, but we're going to stay focused on the topic.\"Hide Caption 18 of 42 Photos: The sports world has been taking a standA Real Salt Lake fan sits in the stands after the Major League Soccer team had its game postponed on August 26.Hide Caption 19 of 42 Photos: The sports world has been taking a standNASCAR driver Bubba Wallace tweeted this selfie before a Cup Series race in Talladega, Alabama, on June 22. Fellow drivers and pit crew members walked alongside Wallace's car to show their support for him. Wallace, the only Black driver in NASCAR's top circuit, has been an outspoken advocate of the Black Lives Matter movement. Hide Caption 20 of 42 Photos: The sports world has been taking a standMembers of the NHL's Colorado Avalanche, Vegas Golden Knights, Dallas Stars and Vancouver Canucks gather together after their playoff games were postponed in Edmonton, Alberta.Hide Caption 21 of 42 Photos: The sports world has been taking a standThe Black Lives Matter protests haven't been limited to just North America. Here, professional soccer players from Aston Villa and Sheffield United take a knee as their match kicked off in Birmingham, England, on June 17. 
Premier League teams sported the words \"Black Lives Matter\" on the back of their jerseys when their seasons resumed.Hide Caption 22 of 42 Photos: The sports world has been taking a standFormula One champion Lewis Hamilton raises his fist before a race in Northampton, England, on August 9.Hide Caption 23 of 42 Photos: The sports world has been taking a standMatt Dumba of the Minnesota Wild kneels during the US National Anthem as it is played before an NHL game in Edmonton, Alberta, on August 1. Dumba, a member of the Hockey Diversity Alliance, gave a short speech before kneeling. He is circled by members of the Edmonton Oilers and the Chicago Blackhawks, who were about to play on the first day of the NHL's return.Hide Caption 24 of 42 Photos: The sports world has been taking a standSoccer players from PSG and Lyon take a knee before a Champions League semifinal in Bilbao, Spain, on August 26.Hide Caption 25 of 42 Photos: The sports world has been taking a standBeach volleyball player April Ross wears a temporary Black Lives Matter tattoo during a match in Long Beach, California, on July 19.Hide Caption 26 of 42 Photos: The sports world has been taking a standTennis players Dan Evans and Kyle Edmund join match umpire James Keothavong in taking a knee in London on June 28.Hide Caption 27 of 42 Photos: The sports world has been taking a standCricketers with the Kent Spitfires take a knee during a match in Canterbury, England.Hide Caption 28 of 42 Photos: The sports world has been taking a standFormula One driver Kimi Raikkonen wears an \"end racism\" shirt before a race in Barcelona, Spain, on August 16.Hide Caption 29 of 42 Photos: The sports world has been taking a standMembers of the Trinbago Knight Riders kneel before a cricket match in Port of Spain, Trinidad and Tobago.Hide Caption 30 of 42 Photos: The sports world has been taking a standThe Washington Nationals observe a moment of silence before Major League Baseball's opening game on July 23. 
Their opponents, The New York Yankees, also took a knee, and the initials BLM were on the pitcher's mound for the game.Hide Caption 31 of 42 Photos: The sports world has been taking a standMajor League Soccer players participate in a pregame ceremony before the league's restart on July 8.Hide Caption 32 of 42 Photos: The sports world has been taking a standHouston Dynamo goalkeeper Cody Cropper warms up in a Black Lives Matter T-shirt on August 25.Hide Caption 33 of 42 Photos: The sports world has been taking a standThe Boston Red Sox Foundation put this sign behind Fenway Park's famous \"Green Monster\" outfield wall.Hide Caption 34 of 42 Photos: The sports world has been taking a standRugby players from Bath and Northampton show their support before a game in Northampton, England.Hide Caption 35 of 42 Photos: The sports world has been taking a standKansas City outfielder Jorge Soler catches a ball in front of a George Floyd sign in Minneapolis on August 15.Hide Caption 36 of 42 Photos: The sports world has been taking a standA Houston Dash player wears a Black Lives Matter armband before a NWSL match in Herriman, Utah, on July 17.Hide Caption 37 of 42 Photos: The sports world has been taking a standAfter winning the Champions League, Bayern Munich defender David Alaba wears a shirt that says \"Black Lives Still Matter.\"Hide Caption 38 of 42 Photos: The sports world has been taking a standA Black Lives Matter sign is seen during a Major League Soccer match in Harrison, New Jersey, on August 24.Hide Caption 39 of 42 Photos: The sports world has been taking a standMembers of the San Antonio Spurs huddle before a game against Sacramento on July 31.Hide Caption 40 of 42 Photos: The sports world has been taking a standKilmarnock manager Alex Dyer, second from right, takes a knee before a soccer match in Glasgow, Scotland.Hide Caption 41 of 42 Photos: The sports world has been taking a standMembers of the Orlando Magic and Brooklyn Nets kneel during the National Anthem 
before the start of an NBA game on July 31.Hide Caption 42 of 42The Milwaukee Bucks did not emerge from their locker room before the scheduled tip at 4 p.m. Wednesday as they decided to refuse to play their playoff game.Soon after, the NBA announced it would postpone Game 5 of three different playoff series -- the Bucks vs. Orlando Magic, Houston Rockets vs. Oklahoma City Thunder and Los Angeles Lakers vs. Portland Trail Blazers.The postponed games have not yet been rescheduled.Three playoff games on Thursday also were postponed but the league said it hoped to resume games Friday or Saturday. Strikes are banned under the NBA's collective bargaining agreement, which means the Bucks players broke their own contract to protest racial injustice and police violence. In a statement issued on Wednesday afternoon, the Bucks players said they are \"calling for justice for Jacob Blake and demand the officers be held accountable. \"Full statement from the Milwaukee Bucks: pic.twitter.com/jjGEyVcCmB— Milwaukee Bucks (@Bucks) August 26, 2020\n", + "\n", + "\"The past four months have shed a light on the ongoing racial injustices facing our African American communities. Citizens around the country have used their voices and platforms to speak out against these wrongdoings,\" they said in a statement.\"Despite the overwhelming plea for change, there has been no action, so our focus today cannot be on basketball.\"The Los Angeles Lakers also offered a statement in support of the players' decision.\"Eighty percent of NBA players are Black men. 
We cannot love them for the joy and entertainment that they bring to the world, yet sit in silence and fail to use our platforms and resources to amplify their voices when they demand the justice and equality that America has promised us all, but denied Black people for too long,\" the Lakers said.WNBAElizabeth Williams of the Atlanta Dream said WNBA players were standing in solidarity with \"our brothers in the NBA.\"Representing the six teams slated to play on Wednesday, Atlanta Dream player Elizabeth Williams announced that WNBA players were standing in solidarity with \"our brothers in the NBA\" and also would not play. The ESPN2 broadcast showed players from the six teams scheduled to take the court in locked arms and kneeling while wearing shirts spelling out Jacob Blake's name. The WNBA announced that the three games scheduled for Wednesday evening had been postponed. On Thursday, the league announced the postponement of the day's three scheduled games as well.Instead of playing Thursday, the entire league of WNBA players stood arm-in-arm in solidarity.\"It is important to note that this is not a strike,\" said Women's National Basketball Players Association (WNBPA) president and Los Angeles Sparks star Nneka Ogwumike, reading a statement from the players, to ESPN's Holly Rowe. \"This is not a boycott. This is affirmatively a day of reflection. A day of informed action and mobilization.\"Ogwumike said players plan to play in games that are scheduled for Friday.\"I feel like as a group we decided last night that we want to play,\" Ogwumike said. \"There are games scheduled tomorrow. That's what we're ready to do, but that doesn't come without, of course, demands of players to continue to amplify our voices in more ways than when we came here.\"We realize that the work is not easy, but we also understand that the work is never done. 
There are things that happen that just allow us times to take a moment -- that's fair; we can take a moment -- and that moment we came together and we decided we need time for ourselves to come back, regroup and continue to amplify our voices, and be there for our communities and demand change from the leaders that are representing us right now.\"MLBThe words 'Black Lives Matter' are displayed on the digital screen after the postponement of the game between the San Francisco Giants and the Los Angeles Dodgers Wednesday.Three MLB games were postponed Wednesday: Cincinnati Reds vs. Milwaukee Brewers, Seattle Mariners vs. San Diego Padres, Los Angeles Dodgers vs. San Francisco Giants.\"With our community and our nation in such pain, we wanted to draw as much attention to the issues that really matter, especially racial injustice and systemic oppression,\" a joint statement from the Brewers and Reds said.\"Given the pain in the communities of Wisconsin and beyond following the shooting of Jacob Blake, we respect the decisions of a number of players not to play tonight,\" a league statement Wednesday said. \"Major League Baseball remains united for change in our society and we will be allies in the fight to end racism and injustice.\"Mets player Dominic Smith, who is Black, was driven to tears on Wednesday, telling reporters after the Mets defeated the Marlins, \"I think the most difficult part is to see people still don't care. And for this to continuously happen it just shows just the hate in people's heart and that just sucks. Being a Black man in America is not easy.\"On Thursday, seven games would not take place: Minnesota Twins vs. Detroit Tigers, Colorado Rockies vs. Arizona Diamondbacks, Baltimore Orioles vs. Tampa Bay Rays, Oakland Athletics vs. Texas Rangers, Philadelphia Phillies vs. Washington Nationals, Boston Red Sox vs. Toronto Blue Jays and Miami Marlins vs. 
New York Mets.Athletes across US sports take a stand, as games are called off in solidarity with Bucks' boycottIn a powerful moment Thursday at Citi Field in New York, the Mets starting defense took the field as scheduled, and then the remaining players for both teams came out of their dugouts and a moment of silence was observed, which the SNY television broadcast said lasted 42 seconds, and then walked off the field.Before he left the field, Miami Marlins player Lewis Brinson laid a \"Black Lives Matter\" shirt at home plate.\"Enough,\" the Marlins tweeted.Enough. pic.twitter.com/25SnNMCfBu— Miami Marlins (@Marlins) August 27, 2020 MLB is scheduled to observe Jackie Robinson Day on Friday to honor the color-barrier-breaking Hall of Famer, who wore number 42.NFLSeveral National Football League teams canceled practice Thursday to focus on conversations about race.The Denver Broncos joined the Arizona Cardinals, Indianapolis Colts, Washington Football Team and New York Jets in forgoing practice to allow players to weigh in on what they can do to effect change.The Chicago Bears said in a statement Thursday that the organization had decided to pause football activities \"to voice to each other, our coaches and our staff where we stand on the real issues around race and police brutality in our country.\"\"We had a productive discussion, but we all agreed that talks and discussions are simply not enough anymore and we need action,\" the statement said. \"We are putting in plans to take action in our communities and together we believe we can make a real difference. 
We need action not only today, but in the days to come.\"MLSInter Miami midfielder Lee Nguyen exits the field after Inter Miami and Atlanta United players decided not to play.Five Major League Soccer games scheduled to take place on Wednesday night were postponed, the league announced.\"The entire Major League Soccer family unequivocally condemns racism and has always stood for equality, but we need to do more to take tangible steps to impact change,\" MLS said. \"We will continue to work without players, our clubs and the broader soccer community to harness our collective power to fight for equality and social justice.\"TennisTennis superstar Naomi Osaka announced Wednesday that she would not play in the Western & Southern Open semifinals on Thursday. \"Watching the continued genocide of Black people at the hand of the police is honestly making me sick to my stomach,\" she wrote.Later on Wednesday, after Osaka's post, it was announced that the Western & Southern Open would pause tournament play on Thursday, resuming on Friday.By Thursday, a statement by her management team to The Guardian and The New York Times indicated she planned to play Friday instead.\"They offered to postpone all matches until Friday and in my mind that brings more attention to the movement,\" Osaka said. She also thanked the WTA and the tournament for their support.A WTA spokesperson tells CNN that Osaka did not officially withdraw from the Western & Southern Open on Wednesday. She is scheduled to play Elise Mertens in the semifinals on Friday.NHLThe NHL's Stanley Cup Playoffs continued Wednesday with two games. However, the league opted to postpone two games scheduled for Thursday and two scheduled for Friday.\"After much discussion, NHL Players believe that the best course of action would be to take a step back and not play tonight's and tomorrow's games as scheduled,\" a Thursday joint statement from the league and the players association said. 
\"The NHL supports the Players' decision and will reschedule those four games beginning Saturday and adjust the remainder of the Second Round schedule accordingly. \"Black and Brown communities continue to face real, painful experiences. The NHL and NHLPA recognize that much work remains to be done before we can play an appropriate role in a discussion centered on diversity, inclusion and social justice.\"Before the start of Game 3 of the second-round series between the Tampa Bay Lightning and Boston Bruins on Wednesday, there was a moment of reflection for racism in response to the Blake shooting.CNN's Ray Sanchez, Eric Levenson and Cesar Marin contributed to this report.\n", + "Entity Types: ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENTAGE', 'PRODUCT', 'EVENT', 'LANGUAGE', 'NATIONALITY', 'RELIGION', 'TITLE', 'PROFESSION', 'ANIMAL', 'PLANT', 'DISEASE', 'MEDICATION', 'CHEMICAL', 'MATERIAL', 'COLOR', 'SHAPE', 'MEASUREMENT', 'WEATHER', 'NATURAL_DISASTER', 'AWARD', 'LAW', 'CRIME', 'TECHNOLOGY', 'SOFTWARE', 'HARDWARE', 'VEHICLE', 'FOOD', 'DRINK', 'SPORT', 'MUSIC_GENRE', 'INSTRUMENT', 'ARTWORK', 'BOOK', 'MOVIE', 'TV_SHOW', 'ACADEMIC_SUBJECT', 'SCIENTIFIC_THEORY', 'POLITICAL_PARTY', 'CURRENCY', 'STOCK_SYMBOL', 'FILE_TYPE', 'PROGRAMMING_LANGUAGE', 'MEDICAL_PROCEDURE', 'CELESTIAL_BODY']\n", + "Reasoning: Let's think step by step in order to produce the entities_relationships. We will first identify the entities in the text based on the provided entity types. Then, we will determine the relationships between these entities, focusing on the interactions and associations described in the text. 
Finally, we will format the entities and relationships according to the specified JSON schema.\n", + "Entities Relationships: ```json\n", + "\n", + "{\n", + "\n", + "\"value\": [\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Milwaukee Bucks\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"An NBA team that initiated the wildcat strike.\",\n", + "\n", + "\"importance_score\": 0.9\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Jacob Blake\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A person who was shot by police, leading to the sports world's protest.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Kenosha, Wisconsin\",\n", + "\n", + "\"entity_type\": \"LOCATION\",\n", + "\n", + "\"description\": \"The location where Jacob Blake was shot by police.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"NBA\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The National Basketball Association, which saw multiple teams participating in the strike.\",\n", + "\n", + "\"importance_score\": 0.8\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Los Angeles Lakers\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"An NBA team that supported the strike.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"WNBA\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"The Women's National Basketball Association, which also participated in the strike.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Major League Baseball (MLB)\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A 
professional baseball organization where games were postponed in solidarity.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"National Football League (NFL)\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A professional football league where teams canceled practices to discuss racial issues.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Major League Soccer (MLS)\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A professional soccer league where games were postponed in solidarity.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"Naomi Osaka\",\n", + "\n", + "\"entity_type\": \"PERSON\",\n", + "\n", + "\"description\": \"A tennis superstar who refused to play in a tournament to protest racial injustice.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"entity_name\": \"National Hockey League (NHL)\",\n", + "\n", + "\"entity_type\": \"ORGANIZATION\",\n", + "\n", + "\"description\": \"A professional hockey league where games were postponed in solidarity.\",\n", + "\n", + "\"importance_score\": 0.7\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Milwaukee Bucks\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The Milwaukee Bucks initiated a wildcat strike in the NBA.\",\n", + "\n", + "\"weight\": 0.9,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Milwaukee Bucks\",\n", + "\n", + "\"tgt_id\": \"Jacob Blake\",\n", + "\n", + "\"description\": \"The Milwaukee Bucks' strike was in response to the police shooting of Jacob Blake.\",\n", + "\n", + "\"weight\": 0.8,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Los Angeles Lakers\",\n", 
+ "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The Los Angeles Lakers supported the strike in the NBA.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"WNBA\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The WNBA stood in solidarity with the NBA during the strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Major League Baseball (MLB)\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The MLB postponed games in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"National Football League (NFL)\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The NFL canceled practices to discuss racial issues in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Major League Soccer (MLS)\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The MLS postponed games in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"Naomi Osaka\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"Naomi Osaka refused to play in a tournament in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "},\n", + "\n", + "{\n", + "\n", + "\"src_id\": \"National Hockey League (NHL)\",\n", + "\n", + "\"tgt_id\": \"NBA\",\n", + "\n", + "\"description\": \"The NHL postponed games in solidarity with the NBA strike.\",\n", + "\n", + "\"weight\": 0.7,\n", + "\n", + "\"order\": 1\n", + "\n", + "}\n", + "\n", + "]\n", + "\n", + "}\n", + "\n", + "```\n", + 
"\n", + "\n", + "BASIC INSTRUCTION: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "TIP: Make sure your instruction is very informative and descriptive.\n", + "\n", + "Please provide the output field PROPOSED INSTRUCTION. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with only the field PROPOSED INSTRUCTION.\n", + "\n", + "\u001b[32mPROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, your task is to meticulously extract entities and their relationships from the text. First, identify all entities in the text that match the provided entity types. Then, analyze the context to determine the relationships between these entities, considering their interactions and dependencies within the text. Finally, format the identified entities and their relationships according to the specified JSON schema, ensuring all fields are accurately populated with relevant details such as entity names, types, descriptions, and importance scores for entities, and source IDs, target IDs, descriptions, weights, and orders for relationships.\u001b[0m\n", + "\n", + "\n", + "\n", + "PROPOSED INSTRUCTION: Given the `input_text` and a list of `entity_types`, your task is to meticulously extract entities and their relationships from the text. First, identify all entities in the text that match the provided entity types. Then, analyze the context to determine the relationships between these entities, considering their interactions and dependencies within the text. 
Finally, format the identified entities and their relationships according to the specified JSON schema, ensuring all fields are accurately populated with relevant details such as entity names, types, descriptions, and importance scores for entities, and source IDs, target IDs, descriptions, weights, and orders for relationships.\n", + "CANDIDATE PROGRAM:\n", + "Predictor 0\n", + "i: Given the `input_text` and a list of `entity_types`, meticulously extract and identify all entities and their relationships within the text. Ensure that each entity is accurately classified according to the provided entity types, and clearly define the relationships between entities, including their descriptions, weights, and orders. Provide a step-by-step reasoning process to justify the extraction and classification of each entity and relationship.\n", + "p: Entities Relationships:\n", + "\n", + "\n", + "...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/25 [00:00 reasoning, entities_relationships\n", + " instructions='Given the `input_text` and a list of `entity_types`, meticulously identify and extract all entities present in the text that match the provided entity types. Subsequently, determine the relationships between these entities based on their interactions and the context provided. 
Ensure that the entities and relationships are formatted according to the specified JSON schema, including providing detailed descriptions and importance scores for each entity, as well as defining the source and target IDs, description, weight, and order for each relationship.'\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + "))" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimizer = MIPROv2(\n", + " prompt_model=deepseek,\n", + " task_model=qwen2,\n", + " metric=entity_recall_metric,\n", + " init_temperature=1.4,\n", + " num_candidates=10,\n", + " verbose=False\n", + ")\n", + "kwargs = dict(num_threads=os.cpu_count(), display_progress=True, display_table=0)\n", + "miprov2_model = optimizer.compile(\n", + " model, \n", + " trainset=trainset[:50],\n", + " valset=valset[:20],\n", + " requires_permission_to_run=False,\n", + " num_batches=20, \n", + " max_labeled_demos=5, \n", + " max_bootstrapped_demos=3, \n", + " eval_kwargs=kwargs\n", + ")\n", + "miprov2_model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "---\n", + "\n", + "Follow the following format.\n", + "\n", + "Input Text: The text to extract entities and relationships from.\n", + "\n", + "Entity Types: List of entity types used for extraction.\n", + "\n", + "Reasoning: Let's think step by step in order to ${produce the entities_relationships}. We ...\n", + "\n", + "Entities Relationships: List of entities and relationships extracted from the text.. Respond with a single JSON object. JSON Schema: {\"$defs\": {\"Entity\": {\"properties\": {\"entity_name\": {\"description\": \"The name of the entity.\", \"title\": \"Entity Name\", \"type\": \"string\"}, \"entity_type\": {\"description\": \"The type of the entity.\", \"title\": \"Entity Type\", \"type\": \"string\"}, \"description\": {\"description\": \"The description of the entity, in details and comprehensive.\", \"title\": \"Description\", \"type\": \"string\"}, \"importance_score\": {\"description\": \"Importance score of the entity. Should be between 0 and 1 with 1 being the most important.\", \"maximum\": 1.0, \"minimum\": 0.0, \"title\": \"Importance Score\", \"type\": \"number\"}}, \"required\": [\"entity_name\", \"entity_type\", \"description\", \"importance_score\"], \"title\": \"Entity\", \"type\": \"object\"}, \"Relationship\": {\"properties\": {\"src_id\": {\"description\": \"The name of the source entity.\", \"title\": \"Src Id\", \"type\": \"string\"}, \"tgt_id\": {\"description\": \"The name of the target entity.\", \"title\": \"Tgt Id\", \"type\": \"string\"}, \"description\": {\"description\": \"The description of the relationship between the source and target entity, in details and comprehensive.\", \"title\": \"Description\", \"type\": \"string\"}, \"weight\": {\"description\": \"The weight of the relationship. 
Should be between 0 and 1 with 1 being the strongest relationship.\", \"maximum\": 1.0, \"minimum\": 0.0, \"title\": \"Weight\", \"type\": \"number\"}, \"order\": {\"description\": \"The order of the relationship. 1 for direct relationships, 2 for second-order, 3 for third-order.\", \"maximum\": 3, \"minimum\": 1, \"title\": \"Order\", \"type\": \"integer\"}}, \"required\": [\"src_id\", \"tgt_id\", \"description\", \"weight\", \"order\"], \"title\": \"Relationship\", \"type\": \"object\"}}, \"properties\": {\"value\": {\"items\": {\"anyOf\": [{\"$ref\": \"#/$defs/Entity\"}, {\"$ref\": \"#/$defs/Relationship\"}]}, \"title\": \"Value\", \"type\": \"array\"}}, \"required\": [\"value\"], \"title\": \"Output\", \"type\": \"object\"}\n", + "\n", + "---\n", + "\n", + "Input Text:\n", + "Melbourne, Australia (CNN)After spending part of the off-season training with Mike Tyson, Serena Williams is hoping to deliver a knockout punch at the Australian Open. Follow @cnnsport\n", + "\n", + "For Williams that would mean winning a record-tying 24th grand slam title, which has so far proved elusive despite getting close four times since returning to the tour after giving birth to daughter Alexis Olympia. Her preparation for the year's first major couldn't have gone much better, suggesting the mini grand slam drought for arguably tennis' greatest ever player is about to cease. Williams let rip into a punching bag in December -- drawing a compliment from former heavyweight boxing champion Tyson, whose daughter happens to be a budding tennis star -- and then won a buildup tournament in Auckland last week to incredibly land a title in a fourth straight decade. She also mingled with one of tennis' youngest sensations, Coco Gauff, as part of the training camp in Florida organized by her coach, Patrick Mouratoglou. 
JUST WATCHEDMouratoglou on Serena comeback and McEnroeReplayMore Videos ...MUST WATCHMouratoglou on Serena comeback and McEnroe 02:19Read MoreWilliams overcame singles opponents with differing styles, reached the doubles final with one of her best friends -- the soon-to-be retired Caroline Wozniacki -- and most crucially, ended a five-match losing streak in finals with her daughter and husband Alexis Ohanian looking on. No wonder the 38-year-old said following her straight-set victory over fellow American Jessica Pegula: \"It's pretty satisfying just to get a win in the final. That was really important for me. And I just want to build on it,\" added Williams, who donated her prize money check of $43,000 to bush fire relief efforts in Australia. \"It's just a step towards the next goal.\"Indeed. READ: Can Rafael Nadal match Roger Federer's all-time grand slam record?READ: Player brands Australian Open email a 'slap in the face'Eyes on bigger prizeYes, as nice as it was to be holding the winners' trophy in Auckland -- where Williams once hit 88 unforced errors in a loss to Madison Brengle -- she didn't make the long trip to simply prosper in New Zealand. The much bigger prize is the Australian Open, where Williams triumphed while in the early stages of pregnancy in 2017. If Williams makes the final in Melbourne -- and she might have to defeat the likes of twice grand slam winner Naomi Osaka and current world No. 1 Ashleigh Barty along the way -- she will probably have to then defeat someone with a heftier reputation than the 66th-ranked Pegula. Helping Williams, however, is that one of the main contenders, Bianca Andreescu, isn't in Melbourne because of another injury, this time to a knee. 
But winning any final -- after losses in grand slam finals to Osaka, Andreescu, Angelique Kerber and Simona Halep and retiring against Andreescu in the Rogers Cup finale last August -- could potentially be turning point as Williams attempts to draw level with the grand slam haul of Australia's Margaret Court. JUST WATCHEDSerena Williams falls short in major title chaseReplayMore Videos ...MUST WATCHSerena Williams falls short in major title chase 01:00\"Serena, she certainly looks hungry, and I think she's got a little momentum going into the Australian Open,\" Chris Evert, the 18-time grand slam champion, told reporters in an ESPN conference call last week. \"And it would probably be the least pressure, this grand slam, to win for her. \"I think every other tournament, the French Open, the clay isn't her best surface. Wimbledon is a lot of pressure, US Open is a lot of pressure. \"This one, the first one of the year, it's a 'Happy Slam,'\" referring to the Australian Open's nickname. \"I think if she just takes a little bit of pressure off herself and she can just play her brand of tennis, I think she's got a good shot at winning it.\"She's better at grand slams than any other player when you look at the last two years.\"The way Wozniacki put it, Williams has a \"big chance\" to match Court. pic.twitter.com/skoZilynH8— Serena Williams (@serenawilliams) January 12, 2020 Other high-profile players, such as Halep and former world No. 1 Karolina Pliskova -- who saved four match points last year in Melbourne against Williams and rallied from 5-1 down in the third set after the American rolled her ankle -- aren't discounting her chances, either, despite just falling short recently at grand slams. \"I'm very impressed about her, that she keeps playing at this level, with being a mother and also being a little bit older than us,\" said Halep, who made a mere three unforced errors in downing Williams in the 2019 Wimbledon final. 
\"It's impressive what she does.\"Challenges aheadBut Evert also underscored the stiffer challenges facing Williams. \"There are some darned good players out there that I have a lot of respect for, and the way that Osaka, Barty, Pliskova, Halep ... there's not one or two threats to Serena, there's probably about eight threats, eight players that can probably do some damage and that can compete against her.\"And one more thing is -- I always felt this way -- the older you get, I think the more bad days you may have, days when you feel burned out, days when you don't want to get out of bed, days when you don't have incentive. You don't want to have that day during a grand slam, but sometimes you can't help it.\"JUST WATCHEDSimona Halep on winning her first Wimbledon titleReplayMore Videos ...MUST WATCHSimona Halep on winning her first Wimbledon title 01:56Visit our tennis page for more news and videosWilliams figures to still be around towards the end of the Australian Open, but whether it is as the last person standing continues to be the question. 
\n", + "\n", + "Entity Types:\n", + "[1] «PERSON»\n", + "[2] «ORGANIZATION»\n", + "[3] «LOCATION»\n", + "[4] «DATE»\n", + "[5] «TIME»\n", + "[6] «MONEY»\n", + "[7] «PERCENTAGE»\n", + "[8] «PRODUCT»\n", + "[9] «EVENT»\n", + "[10] «LANGUAGE»\n", + "[11] «NATIONALITY»\n", + "[12] «RELIGION»\n", + "[13] «TITLE»\n", + "[14] «PROFESSION»\n", + "[15] «ANIMAL»\n", + "[16] «PLANT»\n", + "[17] «DISEASE»\n", + "[18] «MEDICATION»\n", + "[19] «CHEMICAL»\n", + "[20] «MATERIAL»\n", + "[21] «COLOR»\n", + "[22] «SHAPE»\n", + "[23] «MEASUREMENT»\n", + "[24] «WEATHER»\n", + "[25] «NATURAL_DISASTER»\n", + "[26] «AWARD»\n", + "[27] «LAW»\n", + "[28] «CRIME»\n", + "[29] «TECHNOLOGY»\n", + "[30] «SOFTWARE»\n", + "[31] «HARDWARE»\n", + "[32] «VEHICLE»\n", + "[33] «FOOD»\n", + "[34] «DRINK»\n", + "[35] «SPORT»\n", + "[36] «MUSIC_GENRE»\n", + "[37] «INSTRUMENT»\n", + "[38] «ARTWORK»\n", + "[39] «BOOK»\n", + "[40] «MOVIE»\n", + "[41] «TV_SHOW»\n", + "[42] «ACADEMIC_SUBJECT»\n", + "[43] «SCIENTIFIC_THEORY»\n", + "[44] «POLITICAL_PARTY»\n", + "[45] «CURRENCY»\n", + "[46] «STOCK_SYMBOL»\n", + "[47] «FILE_TYPE»\n", + "[48] «PROGRAMMING_LANGUAGE»\n", + "[49] «MEDICAL_PROCEDURE»\n", + "[50] «CELESTIAL_BODY»\n", + "\n", + "Please provide the output fields Reasoning then Entities Relationships. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with the field Reasoning.\n", + "\n", + "\u001b[32mReasoning: Let's think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\n", + "\n", + "Entities Relationships:\n", + "```json\n", + "{\n", + " \"value\": [\n", + " {\n", + " \"entity_name\": \"Melbourne\",\n", + " \"entity_type\": \"LOCATION\",\n", + " \"description\": \"A city in Australia where the Australian Open is held.\",\n", + " \"importance_score\": 0.8\n", + " },\n", + " {\n", + " \"entity_name\": \"Australia\",\n", + " \"entity_type\": \"LOCATION\",\n", + " \"description\": \"The country where Melbourne is located and where the Australian Open takes place.\",\n", + " \"importance_score\": 0.7\n", + " },\n", + " {\n", + " \"entity_name\": \"Serena Williams\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A professional tennis player aiming to win the Australian Open.\",\n", + " \"importance_score\": 1.0\n", + " },\n", + " {\n", + " \"entity_name\": \"Mike Tyson\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A former heavyweight boxing champion who trained with Serena Williams.\",\n", + " \"importance_score\": 0.6\n", + " },\n", + " {\n", + " \"entity_name\": \"Australian Open\",\n", + " \"entity_type\": \"EVENT\",\n", + " \"description\": \"A major tennis tournament held annually in Melbourne, Australia.\",\n", + " \"importance_score\": 0.9\n", + " },\n", + " {\n", + " \"entity_name\": \"Alexis Olympia\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"The daughter of Serena Williams.\",\n", + " \"importance_score\": 0.5\n", + " },\n", + " {\n", + " \"entity_name\": \"Caroline Wozniacki\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A professional tennis player and friend of Serena Williams.\",\n", + " \"importance_score\": 0.6\n", + " },\n", + " {\n", + " \"entity_name\": \"Coco Gauff\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A young tennis sensation who trained with Serena Williams.\",\n", + " \"importance_score\": 0.5\n", + " },\n", + " 
{\n", + " \"entity_name\": \"Patrick Mouratoglou\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"The coach of Serena Williams who organized the training camp.\",\n", + " \"importance_score\": 0.6\n", + " },\n", + " {\n", + " \"entity_name\": \"Naomi Osaka\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A professional tennis player and grand slam winner who could face Serena Williams.\",\n", + " \"importance_score\": 0.7\n", + " },\n", + " {\n", + " \"entity_name\": \"Ashleigh Barty\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"The current world No. 1 tennis player who could face Serena Williams.\",\n", + " \"importance_score\": 0.7\n", + " },\n", + " {\n", + " \"entity_name\": \"Bianca Andreescu\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A professional tennis player who is a main contender but is not in Melbourne due to injury.\",\n", + " \"importance_score\": 0.6\n", + " },\n", + " {\n", + " \"entity_name\": \"Simona Halep\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A professional tennis player who defeated Serena Williams in the 2019 Wimbledon final.\",\n", + " \"importance_score\": 0.6\n", + " },\n", + " {\n", + " \"entity_name\": \"Chris Evert\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"An 18-time grand slam champion who commented on Serena Williams' chances at the Australian Open.\",\n", + " \"importance_score\": 0.5\n", + " },\n", + " {\n", + " \"entity_name\": \"Margaret Court\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"An Australian tennis player with a record-tying 24 grand slam titles.\",\n", + " \"importance_score\": 0.5\n", + " },\n", + " {\n", + " \"entity_name\": \"Jessica Pegula\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A fellow American tennis player who Serena Williams defeated in a match.\",\n", + " \"importance_score\": 0.4\n", + " },\n", + " {\n", + " \"entity_name\": 
\"Alexis Ohanian\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"The husband of Serena Williams.\",\n", + " \"importance_score\": 0.4\n", + " },\n", + " {\n", + " \"entity_name\": \"Karolina Pliskova\",\n", + " \"entity_type\": \"PERSON\",\n", + " \"description\": \"A former world No. 1 tennis player who saved match points against Serena Williams in Melbourne.\",\n", + " \"importance_score\": 0.5\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Mike Tyson\",\n", + " \"description\": \"Serena Williams trained with Mike Tyson during the off-season.\",\n", + " \"weight\": 0.7,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Australian Open\",\n", + " \"description\": \"Serena Williams is aiming to win the Australian Open.\",\n", + " \"weight\": 0.9,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Alexis Olympia\",\n", + " \"description\": \"Serena Williams is the mother of Alexis Olympia.\",\n", + " \"weight\": 0.6,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Caroline Wozniacki\",\n", + " \"description\": \"Serena Williams reached the doubles final with Caroline Wozniacki.\",\n", + " \"weight\": 0.6,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Coco Gauff\",\n", + " \"description\": \"Serena Williams trained with Coco Gauff.\",\n", + " \"weight\": 0.5,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Patrick Mouratoglou\",\n", + " \"description\": \"Patrick Mouratoglou is the coach of Serena Williams.\",\n", + " \"weight\": 0.6,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Naomi Osaka\",\n", + " \"description\": \"Serena Williams might have to defeat Naomi Osaka to reach 
the final of the Australian Open.\",\n", + " \"weight\": 0.7,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Ashleigh Barty\",\n", + " \"description\": \"Serena Williams might have to defeat Ashleigh Barty to reach the final of the Australian Open.\",\n", + " \"weight\": 0.7,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Bianca Andreescu\",\n", + " \"description\": \"Bianca Andreescu is a main contender but is not in Melbourne due to injury.\",\n", + " \"weight\": 0.6,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Simona Halep\",\n", + " \"description\": \"Simona Halep defeated Serena Williams in the 2019 Wimbledon final.\",\n", + " \"weight\": 0.6,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Chris Evert\",\n", + " \"description\": \"Chris Evert commented on Serena Williams' chances at the Australian Open.\",\n", + " \"weight\": 0.5,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Margaret Court\",\n", + " \"description\": \"Serena Williams is attempting to match Margaret Court's record of 24 grand slam titles.\",\n", + " \"weight\": 0.5,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Jessica Pegula\",\n", + " \"description\": \"Serena Williams defeated Jessica Pegula in a match.\",\n", + " \"weight\": 0.4,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Alexis Ohanian\",\n", + " \"description\": \"Alexis Ohanian is the husband of Serena Williams.\",\n", + " \"weight\": 0.4,\n", + " \"order\": 1\n", + " },\n", + " {\n", + " \"src_id\": \"Serena Williams\",\n", + " \"tgt_id\": \"Karolina Pliskova\",\n", + " \"description\": \"Karolina Pliskova saved match points 
against Serena Williams in Melbourne.\",\n", + " \"weight\": 0.5,\n", + " \"order\": 1\n", + " }\n", + " ]\n", + "}\n", + "```\u001b[0m\n", + "\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "'\\n\\n\\nGiven the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\\n\\n---\\n\\nFollow the following format.\\n\\nInput Text: The text to extract entities and relationships from.\\n\\nEntity Types: List of entity types used for extraction.\\n\\nReasoning: Let\\'s think step by step in order to ${produce the entities_relationships}. We ...\\n\\nEntities Relationships: List of entities and relationships extracted from the text.. Respond with a single JSON object. JSON Schema: {\"$defs\": {\"Entity\": {\"properties\": {\"entity_name\": {\"description\": \"The name of the entity.\", \"title\": \"Entity Name\", \"type\": \"string\"}, \"entity_type\": {\"description\": \"The type of the entity.\", \"title\": \"Entity Type\", \"type\": \"string\"}, \"description\": {\"description\": \"The description of the entity, in details and comprehensive.\", \"title\": \"Description\", \"type\": \"string\"}, \"importance_score\": {\"description\": \"Importance score of the entity. 
Should be between 0 and 1 with 1 being the most important.\", \"maximum\": 1.0, \"minimum\": 0.0, \"title\": \"Importance Score\", \"type\": \"number\"}}, \"required\": [\"entity_name\", \"entity_type\", \"description\", \"importance_score\"], \"title\": \"Entity\", \"type\": \"object\"}, \"Relationship\": {\"properties\": {\"src_id\": {\"description\": \"The name of the source entity.\", \"title\": \"Src Id\", \"type\": \"string\"}, \"tgt_id\": {\"description\": \"The name of the target entity.\", \"title\": \"Tgt Id\", \"type\": \"string\"}, \"description\": {\"description\": \"The description of the relationship between the source and target entity, in details and comprehensive.\", \"title\": \"Description\", \"type\": \"string\"}, \"weight\": {\"description\": \"The weight of the relationship. Should be between 0 and 1 with 1 being the strongest relationship.\", \"maximum\": 1.0, \"minimum\": 0.0, \"title\": \"Weight\", \"type\": \"number\"}, \"order\": {\"description\": \"The order of the relationship. 1 for direct relationships, 2 for second-order, 3 for third-order.\", \"maximum\": 3, \"minimum\": 1, \"title\": \"Order\", \"type\": \"integer\"}}, \"required\": [\"src_id\", \"tgt_id\", \"description\", \"weight\", \"order\"], \"title\": \"Relationship\", \"type\": \"object\"}}, \"properties\": {\"value\": {\"items\": {\"anyOf\": [{\"$ref\": \"#/$defs/Entity\"}, {\"$ref\": \"#/$defs/Relationship\"}]}, \"title\": \"Value\", \"type\": \"array\"}}, \"required\": [\"value\"], \"title\": \"Output\", \"type\": \"object\"}\\n\\n---\\n\\nInput Text:\\nMelbourne, Australia (CNN)After spending part of the off-season training with Mike Tyson, Serena Williams is hoping to deliver a knockout punch at the Australian Open. Follow @cnnsport\\n\\nFor Williams that would mean winning a record-tying 24th grand slam title, which has so far proved elusive despite getting close four times since returning to the tour after giving birth to daughter Alexis Olympia. 
Her preparation for the year\\'s first major couldn\\'t have gone much better, suggesting the mini grand slam drought for arguably tennis\\' greatest ever player is about to cease. Williams let rip into a punching bag in December -- drawing a compliment from former heavyweight boxing champion Tyson, whose daughter happens to be a budding tennis star -- and then won a buildup tournament in Auckland last week to incredibly land a title in a fourth straight decade. She also mingled with one of tennis\\' youngest sensations, Coco Gauff, as part of the training camp in Florida organized by her coach, Patrick Mouratoglou. JUST WATCHEDMouratoglou on Serena comeback and McEnroeReplayMore Videos ...MUST WATCHMouratoglou on Serena comeback and McEnroe 02:19Read MoreWilliams overcame singles opponents with differing styles, reached the doubles final with one of her best friends -- the soon-to-be retired Caroline Wozniacki -- and most crucially, ended a five-match losing streak in finals with her daughter and husband Alexis Ohanian looking on. No wonder the 38-year-old said following her straight-set victory over fellow American Jessica Pegula: \"It\\'s pretty satisfying just to get a win in the final. That was really important for me. And I just want to build on it,\" added Williams, who donated her prize money check of $43,000 to bush fire relief efforts in Australia. \"It\\'s just a step towards the next goal.\"Indeed. READ: Can Rafael Nadal match Roger Federer\\'s all-time grand slam record?READ: Player brands Australian Open email a \\'slap in the face\\'Eyes on bigger prizeYes, as nice as it was to be holding the winners\\' trophy in Auckland -- where Williams once hit 88 unforced errors in a loss to Madison Brengle -- she didn\\'t make the long trip to simply prosper in New Zealand. The much bigger prize is the Australian Open, where Williams triumphed while in the early stages of pregnancy in 2017. 
If Williams makes the final in Melbourne -- and she might have to defeat the likes of twice grand slam winner Naomi Osaka and current world No. 1 Ashleigh Barty along the way -- she will probably have to then defeat someone with a heftier reputation than the 66th-ranked Pegula. Helping Williams, however, is that one of the main contenders, Bianca Andreescu, isn\\'t in Melbourne because of another injury, this time to a knee. But winning any final -- after losses in grand slam finals to Osaka, Andreescu, Angelique Kerber and Simona Halep and retiring against Andreescu in the Rogers Cup finale last August -- could potentially be turning point as Williams attempts to draw level with the grand slam haul of Australia\\'s Margaret Court. JUST WATCHEDSerena Williams falls short in major title chaseReplayMore Videos ...MUST WATCHSerena Williams falls short in major title chase 01:00\"Serena, she certainly looks hungry, and I think she\\'s got a little momentum going into the Australian Open,\" Chris Evert, the 18-time grand slam champion, told reporters in an ESPN conference call last week. \"And it would probably be the least pressure, this grand slam, to win for her. \"I think every other tournament, the French Open, the clay isn\\'t her best surface. Wimbledon is a lot of pressure, US Open is a lot of pressure. \"This one, the first one of the year, it\\'s a \\'Happy Slam,\\'\" referring to the Australian Open\\'s nickname. \"I think if she just takes a little bit of pressure off herself and she can just play her brand of tennis, I think she\\'s got a good shot at winning it.\"She\\'s better at grand slams than any other player when you look at the last two years.\"The way Wozniacki put it, Williams has a \"big chance\" to match Court. pic.twitter.com/skoZilynH8— Serena Williams (@serenawilliams) January 12, 2020 Other high-profile players, such as Halep and former world No. 
1 Karolina Pliskova -- who saved four match points last year in Melbourne against Williams and rallied from 5-1 down in the third set after the American rolled her ankle -- aren\\'t discounting her chances, either, despite just falling short recently at grand slams. \"I\\'m very impressed about her, that she keeps playing at this level, with being a mother and also being a little bit older than us,\" said Halep, who made a mere three unforced errors in downing Williams in the 2019 Wimbledon final. \"It\\'s impressive what she does.\"Challenges aheadBut Evert also underscored the stiffer challenges facing Williams. \"There are some darned good players out there that I have a lot of respect for, and the way that Osaka, Barty, Pliskova, Halep ... there\\'s not one or two threats to Serena, there\\'s probably about eight threats, eight players that can probably do some damage and that can compete against her.\"And one more thing is -- I always felt this way -- the older you get, I think the more bad days you may have, days when you feel burned out, days when you don\\'t want to get out of bed, days when you don\\'t have incentive. You don\\'t want to have that day during a grand slam, but sometimes you can\\'t help it.\"JUST WATCHEDSimona Halep on winning her first Wimbledon titleReplayMore Videos ...MUST WATCHSimona Halep on winning her first Wimbledon title 01:56Visit our tennis page for more news and videosWilliams figures to still be around towards the end of the Australian Open, but whether it is as the last person standing continues to be the question. 
\\n\\nEntity Types:\\n[1] «PERSON»\\n[2] «ORGANIZATION»\\n[3] «LOCATION»\\n[4] «DATE»\\n[5] «TIME»\\n[6] «MONEY»\\n[7] «PERCENTAGE»\\n[8] «PRODUCT»\\n[9] «EVENT»\\n[10] «LANGUAGE»\\n[11] «NATIONALITY»\\n[12] «RELIGION»\\n[13] «TITLE»\\n[14] «PROFESSION»\\n[15] «ANIMAL»\\n[16] «PLANT»\\n[17] «DISEASE»\\n[18] «MEDICATION»\\n[19] «CHEMICAL»\\n[20] «MATERIAL»\\n[21] «COLOR»\\n[22] «SHAPE»\\n[23] «MEASUREMENT»\\n[24] «WEATHER»\\n[25] «NATURAL_DISASTER»\\n[26] «AWARD»\\n[27] «LAW»\\n[28] «CRIME»\\n[29] «TECHNOLOGY»\\n[30] «SOFTWARE»\\n[31] «HARDWARE»\\n[32] «VEHICLE»\\n[33] «FOOD»\\n[34] «DRINK»\\n[35] «SPORT»\\n[36] «MUSIC_GENRE»\\n[37] «INSTRUMENT»\\n[38] «ARTWORK»\\n[39] «BOOK»\\n[40] «MOVIE»\\n[41] «TV_SHOW»\\n[42] «ACADEMIC_SUBJECT»\\n[43] «SCIENTIFIC_THEORY»\\n[44] «POLITICAL_PARTY»\\n[45] «CURRENCY»\\n[46] «STOCK_SYMBOL»\\n[47] «FILE_TYPE»\\n[48] «PROGRAMMING_LANGUAGE»\\n[49] «MEDICAL_PROCEDURE»\\n[50] «CELESTIAL_BODY»\\n\\nPlease provide the output fields Reasoning then Entities Relationships. Do so immediately, without additional content before or after, and precisely as the format above shows. Begin with the field Reasoning.\\n\\n\\x1b[32mReasoning: Let\\'s think step by step in order to produce the entities_relationships. We first identify the entities in the text based on the provided entity types. Then, we determine the relationships between these entities, considering their interactions and dependencies within the context of the text. 
Finally, we format the entities and relationships according to the specified JSON schema.\\n\\nEntities Relationships:\\n```json\\n{\\n \"value\": [\\n {\\n \"entity_name\": \"Melbourne\",\\n \"entity_type\": \"LOCATION\",\\n \"description\": \"A city in Australia where the Australian Open is held.\",\\n \"importance_score\": 0.8\\n },\\n {\\n \"entity_name\": \"Australia\",\\n \"entity_type\": \"LOCATION\",\\n \"description\": \"The country where Melbourne is located and where the Australian Open takes place.\",\\n \"importance_score\": 0.7\\n },\\n {\\n \"entity_name\": \"Serena Williams\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"A professional tennis player aiming to win the Australian Open.\",\\n \"importance_score\": 1.0\\n },\\n {\\n \"entity_name\": \"Mike Tyson\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"A former heavyweight boxing champion who trained with Serena Williams.\",\\n \"importance_score\": 0.6\\n },\\n {\\n \"entity_name\": \"Australian Open\",\\n \"entity_type\": \"EVENT\",\\n \"description\": \"A major tennis tournament held annually in Melbourne, Australia.\",\\n \"importance_score\": 0.9\\n },\\n {\\n \"entity_name\": \"Alexis Olympia\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"The daughter of Serena Williams.\",\\n \"importance_score\": 0.5\\n },\\n {\\n \"entity_name\": \"Caroline Wozniacki\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"A professional tennis player and friend of Serena Williams.\",\\n \"importance_score\": 0.6\\n },\\n {\\n \"entity_name\": \"Coco Gauff\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"A young tennis sensation who trained with Serena Williams.\",\\n \"importance_score\": 0.5\\n },\\n {\\n \"entity_name\": \"Patrick Mouratoglou\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"The coach of Serena Williams who organized the training camp.\",\\n \"importance_score\": 0.6\\n },\\n {\\n \"entity_name\": \"Naomi Osaka\",\\n \"entity_type\": 
\"PERSON\",\\n \"description\": \"A professional tennis player and grand slam winner who could face Serena Williams.\",\\n \"importance_score\": 0.7\\n },\\n {\\n \"entity_name\": \"Ashleigh Barty\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"The current world No. 1 tennis player who could face Serena Williams.\",\\n \"importance_score\": 0.7\\n },\\n {\\n \"entity_name\": \"Bianca Andreescu\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"A professional tennis player who is a main contender but is not in Melbourne due to injury.\",\\n \"importance_score\": 0.6\\n },\\n {\\n \"entity_name\": \"Simona Halep\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"A professional tennis player who defeated Serena Williams in the 2019 Wimbledon final.\",\\n \"importance_score\": 0.6\\n },\\n {\\n \"entity_name\": \"Chris Evert\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"An 18-time grand slam champion who commented on Serena Williams\\' chances at the Australian Open.\",\\n \"importance_score\": 0.5\\n },\\n {\\n \"entity_name\": \"Margaret Court\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"An Australian tennis player with a record-tying 24 grand slam titles.\",\\n \"importance_score\": 0.5\\n },\\n {\\n \"entity_name\": \"Jessica Pegula\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"A fellow American tennis player who Serena Williams defeated in a match.\",\\n \"importance_score\": 0.4\\n },\\n {\\n \"entity_name\": \"Alexis Ohanian\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"The husband of Serena Williams.\",\\n \"importance_score\": 0.4\\n },\\n {\\n \"entity_name\": \"Karolina Pliskova\",\\n \"entity_type\": \"PERSON\",\\n \"description\": \"A former world No. 
1 tennis player who saved match points against Serena Williams in Melbourne.\",\\n \"importance_score\": 0.5\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Mike Tyson\",\\n \"description\": \"Serena Williams trained with Mike Tyson during the off-season.\",\\n \"weight\": 0.7,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Australian Open\",\\n \"description\": \"Serena Williams is aiming to win the Australian Open.\",\\n \"weight\": 0.9,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Alexis Olympia\",\\n \"description\": \"Serena Williams is the mother of Alexis Olympia.\",\\n \"weight\": 0.6,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Caroline Wozniacki\",\\n \"description\": \"Serena Williams reached the doubles final with Caroline Wozniacki.\",\\n \"weight\": 0.6,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Coco Gauff\",\\n \"description\": \"Serena Williams trained with Coco Gauff.\",\\n \"weight\": 0.5,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Patrick Mouratoglou\",\\n \"description\": \"Patrick Mouratoglou is the coach of Serena Williams.\",\\n \"weight\": 0.6,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Naomi Osaka\",\\n \"description\": \"Serena Williams might have to defeat Naomi Osaka to reach the final of the Australian Open.\",\\n \"weight\": 0.7,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Ashleigh Barty\",\\n \"description\": \"Serena Williams might have to defeat Ashleigh Barty to reach the final of the Australian Open.\",\\n \"weight\": 0.7,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Bianca Andreescu\",\\n \"description\": \"Bianca Andreescu is a main contender but is not in Melbourne due to injury.\",\\n \"weight\": 0.6,\\n \"order\": 1\\n },\\n 
{\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Simona Halep\",\\n \"description\": \"Simona Halep defeated Serena Williams in the 2019 Wimbledon final.\",\\n \"weight\": 0.6,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Chris Evert\",\\n \"description\": \"Chris Evert commented on Serena Williams\\' chances at the Australian Open.\",\\n \"weight\": 0.5,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Margaret Court\",\\n \"description\": \"Serena Williams is attempting to match Margaret Court\\'s record of 24 grand slam titles.\",\\n \"weight\": 0.5,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Jessica Pegula\",\\n \"description\": \"Serena Williams defeated Jessica Pegula in a match.\",\\n \"weight\": 0.4,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Alexis Ohanian\",\\n \"description\": \"Alexis Ohanian is the husband of Serena Williams.\",\\n \"weight\": 0.4,\\n \"order\": 1\\n },\\n {\\n \"src_id\": \"Serena Williams\",\\n \"tgt_id\": \"Karolina Pliskova\",\\n \"description\": \"Karolina Pliskova saved match points against Serena Williams in Melbourne.\",\\n \"weight\": 0.5,\\n \"order\": 1\\n }\\n ]\\n}\\n```\\x1b[0m\\n\\n\\n'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "deepseek.inspect_history(n=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcYAAAE8CAYAAABaaxFWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABJl0lEQVR4nO3deVhUZf8/8PeAMAzLoCKrgiCIoqIWLrmgqciiIj7uWom5tWiKmus3FNw1M9RMq8ew9AF3fTIXAst9LUMtzdRUXHDJRxkQgWHm/P7wxzQj6wzDLPB+XRdXzTn3ued9jsN8OPfZRIIgCCAiIiIAgIWxAxAREZkSFkYiIiI1LIxERERqWBiJiIjUsDASERGpYWEkIiJSw8JIRESkhoWRiIhIDQsjERGRGhZGIjOxYcMGiEQi/Pzzz8aOQlStsTASlaCoCKn/uLi4oFu3bti/f7/O/S5atAi7d+/WX1AtHTt2DBEREahfvz5sbGzg5eWFyMhIJCUlGS0TkalhYSQqw7x587Bx40Z8++23mD59Oh49eoRevXrh+++/16k/YxbGbdu2oUuXLnjw4AEmTZqE1atX480338STJ0/w1VdfGSUTkSmqZewARKYsIiICbdq0Ub0ePXo0XF1dkZycjD59+hgxmfbi4uLQrFkznDp1CtbW1hrzHj58aLAcgiAgLy8PEonEYO9JpA3uMRJpoXbt2pBIJKhVS/NvyuXLl6Njx45wcnKCRCJBUFAQtm/frtFGJBLh2bNn+Oabb1TDsyNHjlTNv3v3LkaPHg0PDw+IxWL4+PjgvffeQ0FBgUY/+fn5mDJlCpydnWFnZ4d//etfePToUbnZr1+/jrZt2xYrigDg4uKi8VqpVGLlypUIDAyEjY0NnJ2dER4ernF8s7CwEPPnz4evry/EYjG8vb0xe/Zs5Ofna/Tl7e2NPn36ICUlBW3atIFEIsEXX3wBAHj69CliYmLg6ekJsVgMPz8/LF26FEqlUqOPzZs3IygoCA4ODpBKpQgMDMTKlSvLXWciXXCPkagMWVlZ+PvvvyEIAh4+fIjVq1cjJycHb775pka7lStXom/fvnjjjTdQUFCAzZs3Y9CgQfj+++/Ru3dvAMDGjRsxZswYtGvXDuPGjQMA+Pr6AgDu3buHdu3a4enTpxg3bhyaNm2Ku3fvYvv27cjNzdUoZh988AHq1KmDuXPn4ubNm0hISMCECROwZcuWMtelYcOGOHjwIO7cuYMGDRqU2Xb06NHYsGEDIiIiMGbMGBQWFuLo0aM4deqUag96zJgx+OabbzBw4EBMnToVp0+fxuLFi3H58mXs2rVLo78rV65g2LBheOeddzB27Fg0adIEubm56Nq1K+7evYt33nkHXl5eOHHiBGbNmoXMzEwkJCQAAFJTUzFs2DD06NEDS5cuBQBcvnwZx48fx6RJk8pcDyKdCERUTGJiogCg2I9YLBY2bNhQrH1ubq7G64KCAqFFixZC9+7dNabb2dkJ0dHRxZYfMWKEYGFhIZw9e7bYPKVSqZEpJCRENU0QBGHy5MmCpaWl8PTp0zLXaf369QIAwdraWujWrZsQGxsrHD16VFAoFBrtfvzxRwGAMHHixFKzpKenCwCEMWPGaMz/8MMPBQDCjz/+qJrWsGFDAYBw4MABjbbz588X7OzshD///FNj+syZMwVLS0shIyNDEARBmDRpkiCVSoXCwsIy149IXziUSlSGNWvWIDU1Fampqdi0aRO6deuGMWPGYOfOnRrt1I+XPXnyBFlZWQgODsa5c+fKfQ+lUondu3cjMjJS43hmEZFIpPF63LhxGtOCg4OhUChw69atMt9n1KhROHDgAF5//XUcO3YM8+fPR3BwMBo3bowTJ06o2u3YsQMikQhz584tNcu+ffsAAFOmTNGYP3XqVADA3r17Nab7+PggLCxMY9q2bdsQHByMOnXq4O+//1b9hISEQKFQ4MiRIwBeDF8/e/YMqampZa4fkb5wKJWoDO3atdM
oVsOGDcMrr7yCCRMmoE+fPqohzu+//x4LFixAenq6xjG2l4taSR49egSZTIYWLVpUKJOXl5fG6zp16gB4UZDLExYWhrCwMOTm5uKXX37Bli1bsG7dOvTp0wd//PEHXFxccP36dXh4eKBu3bql9nPr1i1YWFjAz89PY7qbmxtq165drEj7+PgU6+Pq1au4cOECnJ2dS3yPohOC3n//fWzdulV1mUloaCgGDx6M8PDwcteXSBcsjERasLCwQLdu3bBy5UpcvXoVzZs3x9GjR9G3b1906dIFn3/+Odzd3WFlZYXExMQquT7Q0tKyxOmCIFS4D1tbWwQHByM4OBj16tVDfHw89u/fj+joaK2yVKTwAyjxDFSlUomePXti+vTpJS7j7+8P4MWJQenp6UhJScH+/fuxf/9+JCYmYsSIEfjmm2+0yktUESyMRFoqLCwEAOTk5AB4MfRoY2ODlJQUiMViVbvExMRiy5ZUSJydnSGVSvHbb79VUeKyFe0RZ2ZmAnhxQlBKSgr+97//lbrX2LBhQyiVSly9ehUBAQGq6Q8ePMDTp0/RsGHDct/X19cXOTk5CAkJKbettbU1IiMjERkZCaVSiffffx9ffPEFYmNji+21ElUWjzESaUEul+OHH36AtbW1qiBYWlpCJBJBoVCo2t28ebPEC/nt7Ozw9OlTjWkWFhbo168f9uzZU+Lt3rTZEyzLwYMHS5xedLywSZMmAIABAwZAEATEx8eXmqVXr14AoDpztMiKFSsAQHUmblkGDx6MkydPIiUlpdi8p0+fqv4Aefz4scY8CwsLtGzZEgCKXRpCpA/cYyQqw/79+/HHH38AeHHMKykpCVevXsXMmTMhlUoBvCgCK1asQHh4OIYPH46HDx9izZo18PPzw4ULFzT6CwoKQlpaGlasWAEPDw/4+Pigffv2WLRoEX744Qd07doV48aNQ0BAADIzM7Ft2zYcO3YMtWvXrvS6REVFwcfHB5GRkfD19cWzZ8+QlpaGPXv2oG3btoiMjAQAdOvWDW+99RZWrVqFq1evIjw8HEqlEkePHkW3bt0wYcIEtGrVCtHR0fjyyy/x9OlTdO3aFWfOnME333yDfv36oVu3buXmmTZtGr777jv06dMHI0eORFBQEJ49e4aLFy9i+/btuHnzJurVq4cxY8bgf//7H7p3744GDRrg1q1bWL16NVq3bq2xt0qkN8Y9KZbINJV0uYaNjY3QunVrYe3atRqXSwjCi0shGjduLIjFYqFp06ZCYmKiMHfuXOHlX7E//vhD6NKliyCRSAQAGpdu3Lp1SxgxYoTg7OwsiMVioVGjRsL48eOF/Px8jUwvX9Lx008/CQCEn376qcx1Sk5OFoYOHSr4+voKEolEsLGxEZo1ayb83//9nyCTyTTaFhYWCh9//LHQtGlTwdraWnB2dhYiIiKEX375RdVGLpcL8fHxgo+Pj2BlZSV4enoKs2bNEvLy8jT6atiwodC7d+8SM2VnZwuzZs0S/Pz8BGtra6FevXpCx44dheXLlwsFBQWCIAjC9u3bhdDQUMHFxUWwtrYWvLy8hHfeeUfIzMwsc32JdCUSBD2N0xAREVUDPMZIRESkhoWRiIhIDQsjERGRGhZGIiIiNSyMREREalgYiYiI1FT7C/yVSiXu3bsHBweHCt/XkYiIqhdBEJCdnQ0PDw9YWJS9T1jtC+O9e/fg6elp7BhERGQCbt++Xe6Duqt9YXRwcADwYmMU3cJLF0X3yAwNDYWVlZW+4lU55jY8c83O3IZnrtnNMbdMJoOnp6eqJpSl2hfGouFTqVRa6cJoa2sLqVRqNh8EgLmNwVyzM7fhmWt2c80NVOxRaTz5hoiISA0LIxERkZpqP5RKRESGpVAqcDTjKDKzM+Hu4I5gr2BYWlgaO1aFsTASEZHe7Ly8E5MOTMId2R3VtAbSBlgZvhL9A/obMVnFcSiViIj0YuflnRi4daBGUQSAu7K7GLh1IHZe3mmkZNphYSQiokp
TKBWYdGASBBR/xG/RtJgDMVAoFYaOpjWjFsbs7GzExMSgYcOGkEgk6NixI86ePauaLwgC5syZA3d3d0gkEoSEhODq1atGTExERCU5mnG02J6iOgECbstu42jGUQOm0o1RC+OYMWOQmpqKjRs34uLFiwgNDUVISAju3r0LAFi2bBlWrVqFdevW4fTp07Czs0NYWBjy8vKMGdskKJQKHLp5CMkXk3Ho5iGz+CuMiPTPVL4LMrMz9drOmIx28s3z58+xY8cO/Pe//0WXLl0AAHFxcdizZw/Wrl2L+fPnIyEhAR999BGioqIAAN9++y1cXV2xe/duDB061FjRja46HNwmosozpe8Cdwd3vbYzJqMVxsLCQigUCtjY2GhMl0gkOHbsGG7cuIH79+8jJCRENc/R0RHt27fHyZMnSy2M+fn5yM/PV72WyWQAXtypQS6X65y3aNnK9KEPe67swVu73oIAARILiWr6/3L+h7e2vwX8C4hsEqmabiq5tWWuuQHzzc7chleZ7Np+F+hTSblfc38Nfo5+uJd9r8TjjCKIUN+hPl5zf80o/1bavKdIEITia2AgHTt2hLW1NZKSkuDq6ork5GRER0fDz88PiYmJ6NSpE+7duwd393/+whg8eDBEIhG2bNlSYp9xcXGIj48vNj0pKQm2trZVti5ERGS6cnNzMXz4cGRlZZV7e1CjXse4ceNGjBo1CvXr14elpSVeffVVDBs2DL/88ovOfc6aNQtTpkxRvS66cWxoaGil75WampqKnj17Gu3egMcyjqF3Uu9y2+0dvhedvToDMI3cujDX3ID5Zmduw9M1uy7fBfpUVu49V/ZgRtoM3M2+q5rWwKEBloQsqbI92IooGj2sCKMWRl9fXxw+fBjPnj2DTCaDu7s7hgwZgkaNGsHNzQ0A8ODBA409xgcPHqB169al9ikWiyEWi4tNt7Ky0ssvjb760cX93Pt4rnxeoXYvZzRm7sow19yA+WZnbsPTNntlvgv0qaTc/Vv0R1SzKJO7840228Ek7nxjZ2cHOzs7PHnyBCkpKVi2bBl8fHzg5uaGgwcPqgqhTCbD6dOn8d577xk3sJFUp4PbRKQ7U/8usLSwxOverxvlvfXBqIUxJSUFgiCgSZMmuHbtGqZNm4amTZvi7bffhkgkQkxMDBYsWIDGjRvDx8cHsbGx8PDwQL9+/YwZ22iCvYLRQNoAd2V3Sz243UDaAMFewUZIR0SGwu+CqmXU6xizsrIwfvx4NG3aFCNGjEDnzp2RkpKi2uWdPn06PvjgA4wbNw5t27ZFTk4ODhw4UOxM1prC0sISK8NXAnjxwVdX9DohPMHoQxZEVLX4XVC1jFoYBw8ejOvXryM/Px+ZmZn47LPP4OjoqJovEokwb9483L9/H3l5eUhLS4O/v78RExtf/4D+2D54O+pL62tMbyBtgO2Dt/M6RqIagt8FVcckjjGSdvoH9EdUE9M7uE1EhsXvgqrBwmimzP3gNhHpB78L9I+FkYiIAJj/A4b1hYWRiIhM6r6rxsbnMRIR1XDV5QHD+sLCSERUg1WnBwzrCwsjEVENVp0eMKwvLIxERDVYdXrAsL6wMBIR1WCmft9VY2BhJCKqwYruu/ryreWKiCCCp9SzRt13lYWRiKgG431Xi2NhJDIwhVKBQzcPIfliMg7dPFSjzvYj08T7rmriBf5EBsSLqMlU8b6r/2BhJDKQoouoX75erOgi6pr4lzmZFt539QUOpRIZAC+iJjIfLIxEBlBTLqLm8VOqDjiUSmQANeEiah4/peqCe4xEBlDdL6LmTaipOmFhJDKA6nwRtTkcP+UQL2mDhZHIAKrzRdSmfvx05+Wd8F7pjW7fdMPwncPR7Ztu8F7pzb1YKhULI1EF6GOPw5QvolYoFTiWcQwAcCzjmFbrZ8rHTznES7rgyTdE5dDnSSWmeBF10fo9znmM5JbJ6J3UG072ThVeP1M9flreEK8IIsQciEFUkyiz3FOnqmPUPUaFQoH
Y2Fj4+PhAIpHA19cX8+fPhyD880EeOXIkRCKRxk94eLgRU1NNUhV7HEUXUQ8LHIbXvV83elGs7PqZ6vFTUx/iJdNl1MK4dOlSrF27Fp999hkuX76MpUuXYtmyZVi9erVGu/DwcGRmZqp+kpOTjZSYahJzOKmkMvS1fqZ6/NSUh3jJtBm1MJ44cQJRUVHo3bs3vL29MXDgQISGhuLMmTMa7cRiMdzc3FQ/derUMVLiyuGZcealuu9x6HP9TPH4qakO8ZLpM+oxxo4dO+LLL7/En3/+CX9/f5w/fx7Hjh3DihUrNNodOnQILi4uqFOnDrp3744FCxbAycmpxD7z8/ORn5+vei2TyQAAcrkccrlc56xFy+rax54rezAjbQbuZt9VTavvUB9LQ5YiskmkzrnKU9ncxmIKuTOzMiGxkFSonXpOU8heES+vX9H/v7zOL69faSL9ItHr/V44eeck7ufch5u9Gzo06ABLC8sq3Ralbe/X3F+Dn6Mf7mXfK3GvWAQR6jvUx2vurxnt38pcPisvM8fc2mQVCeoH9AxMqVRi9uzZWLZsGSwtLaFQKLBw4ULMmjVL1Wbz5s2wtbWFj48Prl+/jtmzZ8Pe3h4nT56EpWXxoZm4uDjEx8cXm56UlARbW9sqXR8iIjJNubm5GD58OLKysiCVSstsa9TCuHnzZkybNg0ff/wxmjdvjvT0dMTExGDFihWIjo4ucZm//voLvr6+SEtLQ48ePYrNL2mP0dPTE3///Xe5G6Mscrkcqamp6NmzJ6ysrCq8nEKpQODaQI09RXVFf7VeeO9ClRyD0TW3sZlC7qJ/u/L2OF7+tzOF7BXx8vpJLCT4usXXGPXbKDxXPq/yz6a+lLe9SxqtaeDQAEtCllTpaE1FmMtn5WXmmFsmk6FevXoVKoxGHUqdNm0aZs6ciaFDhwIAAgMDcevWLSxevLjUwtioUSPUq1cP165dK7EwisViiMXiYtOtrKz08g+obT/Hbx7HtaxrZba5mnUVpzJPVenjXvS1/oZmzNxWsMLSsKUYuHUgAGgUx6KTSpaELYGN2Kbk5as4u0KpqNRlHy+vX5HnyufIU+YBKHv9TE1p27t/i/6IamZal8i8jL+fVU+bnEYtjLm5ubCw0Dz/x9LSEkqlstRl7ty5g8ePH8Pd3TwOmPPMOPNWdFJJSdcxJoQnGO2ifH1dW6m+fo9zHmv0Zcz10zc+Z5C0YdTCGBkZiYULF8LLywvNmzfHr7/+ihUrVmDUqFEAgJycHMTHx2PAgAFwc3PD9evXMX36dPj5+SEsLMyY0SuMZ8aZP1O7KF/fDzwuWr8jN45A9psMe4fvRRefLia1R0VkSEYtjKtXr0ZsbCzef/99PHz4EB4eHnjnnXcwZ84cAC/2Hi9cuIBvvvkGT58+hYeHB0JDQzF//vwSh0tNUdHFz3dld0s9TtVA2sAsbx5dk5jKHkdV3c3F0sISnb06Y99v+9DZq7NJFMXKDhXXlEykf0YtjA4ODkhISEBCQkKJ8yUSCVJSUgwbSs+KLn4euHUgRBCVeJzKXG8eTYanzbWHplDIdWWKz3Y0xUxUNXgTcQMwxYufyTzVhGPWpnjjb1PMRFWHNxE3EFM7TlUVOMxU9ar7MWtTvPG3KWaiqsXCaECmcpyqKnCYyTCq+zFrUxwqNsVMVLU4lEqVxmEmwzHVG3briykOFZtiJqpaLIxUKdX9CRSmqDofszbFoWJTzERVi0OpVCkcZjKO6nrM2hSHik0xE1Ut7jFSpXCYyXhM6YHH+mKKQ8WmmImqFgsjVQqHmUjfTHGo2BQzUdXhUCpVCoeZqCqY4lCxKWaiqsHCSJXCO/tQVTHFy5tMMRPpH4dSqdI4zERE1Qn3GEkvOMxERNUFC2MFKJQKHMs4BgA4lnGMj+QpBYeZiKg64FBqOXZe3gnvld7ondQbANA7qTe8V3rzbi5ERAagUCpw6OYhJF9MxqGbhwxysxDuMZZB/YGwEguJarquD4QlIqK
KM9Y9mLnHWAre6oyIyHiMeQ9mFsZSaHOrMyIi0h9j75iwMJaCtzojIjIOY++YsDCWgrc6IyIyDmPvmLAwlqLoVmcv3zS4iAgieEo9easzIiI9M/aOCQtjKXhHfSIi4zD2jolRC6NCoUBsbCx8fHwgkUjg6+uL+fPnQxD+OeAqCALmzJkDd3d3SCQShISE4OrVqwbJx1udEREZnrF3TIxaGJcuXYq1a9fis88+w+XLl7F06VIsW7YMq1evVrVZtmwZVq1ahXXr1uH06dOws7NDWFgY8vLyDJKxf0B/3Jx0E3uH7wUA7B2+Fzcm3WBRJCKqQsbcMTHqBf4nTpxAVFQUevd+cVcZb29vJCcn48yZMwBe7C0mJCTgo48+QlRUFADg22+/haurK3bv3o2hQ4caJKelhSU6e3XGvt/2obNXZw6fEhEZgLHuwWzUwtixY0d8+eWX+PPPP+Hv74/z58/j2LFjWLFiBQDgxo0buH//PkJCQlTLODo6on379jh58mSJhTE/Px/5+fmq1zKZDAAgl8shl8t1zlq0bGX6MAbmNjxzzc7chmeu2Q2du1P9Tqr/VyqUUCqUWvehTVaRoH5Az8CUSiVmz56NZcuWwdLSEgqFAgsXLsSsWbMAvNij7NSpE+7duwd393/OPho8eDBEIhG2bNlSrM+4uDjEx8cXm56UlARbW9uqWxkiIjJZubm5GD58OLKysiCVSstsa9Q9xq1bt+I///kPkpKS0Lx5c6SnpyMmJgYeHh6Ijo7Wqc9Zs2ZhypQpqtcymQyenp4IDQ0td2OURS6XIzU1FT179oSVlZXO/RgacxueuWZnbsMz1+zmmLto9LAijFoYp02bhpkzZ6qGRAMDA3Hr1i0sXrwY0dHRcHNzAwA8ePBAY4/xwYMHaN26dYl9isViiMXiYtOtrKz08g+or34MjbkNz1yzM7fhmWt2c8qtTU6jnpWam5sLCwvNCJaWllAqX4wf+/j4wM3NDQcPHlTNl8lkOH36NDp06GDQrEREVDMYdY8xMjISCxcuhJeXF5o3b45ff/0VK1aswKhRowAAIpEIMTExWLBgARo3bgwfHx/ExsbCw8MD/fr1M2Z0IiKqpoxaGFevXo3Y2Fi8//77ePjwITw8PPDOO+9gzpw5qjbTp0/Hs2fPMG7cODx9+hSdO3fGgQMHYGNjY8TkRERUXRm1MDo4OCAhIQEJCQmlthGJRJg3bx7mzZtnuGBERFRj8V6pREREalgYiYiI1LAwEhERqWFhJCIiUsPCSEREpIaFkYiISA0LIxERkRoWRiIiIjUsjERERGpYGImIiNSwMBIREalhYSQiIlLDwkhERKSmUoWxoKAAV65cQWFhob7yEBERGZVOhTE3NxejR4+Gra0tmjdvjoyMDADABx98gCVLlug1IBERkSHpVBhnzZqF8+fP49ChQxoPDA4JCcGWLVv0Fo6IiMjQdHpQ8e7du7Flyxa89tprEIlEqunNmzfH9evX9RaOiIjI0HTaY3z06BFcXFyKTX/27JlGoSQiIjI3OhXGNm3aYO/evarXRcXw3//+Nzp06KCfZEREREag01DqokWLEBERgUuXLqGwsBArV67EpUuXcOLECRw+fFjfGYmIiAxGpz3Gzp074/z58ygsLERgYCB++OEHuLi44OTJkwgKCtJ3RqphFEoFjmUcAwAcyzgGhVJh5EREVJNoXRjlcjlGjRoFkUiEr776CmfOnMGlS5ewadMmBAYGVkVGqkF2Xt4J75Xe6J3UGwDQO6k3vFd6Y+flnUZORkQ1hdaF0crKCjt27NDLm3t7e0MkEhX7GT9+PADg9ddfLzbv3Xff1ct7k+nZeXknBm4diDuyOxrT78ruYuDWgSyORGQQOg2l9uvXD7t37670m589exaZmZmqn9TUVADAoEGDVG3Gjh2r0WbZsmWVfl8yPQqlApMOTIIAodi8omkxB2I4rEpEVU6nk28aN26MefPm4fjx4wgKCoKdnZ3G/IkTJ1a
oH2dnZ43XS5Ysga+vL7p27aqaZmtrCzc3twpny8/PR35+vuq1TCYD8GIIWC6XV7iflxUtW5k+jMFcch/LOIbHOY8hsZAAQLH/AsDfOX/jyI0j6OzV2SgZK8pctvnLmNvwzDW7OebWJqtIEITif6KXw8fHp/QORSL89ddf2naJgoICeHh4YMqUKZg9ezaAF0Opv//+OwRBgJubGyIjIxEbGwtbW9tS+4mLi0N8fHyx6UlJSWUuR0RE1Vdubi6GDx+OrKwsSKXSMtvqVBirwtatWzF8+HBkZGTAw8MDAPDll1+iYcOG8PDwwIULFzBjxgy0a9cOO3eWfqyppD1GT09P/P333+VujLLI5XKkpqaiZ8+esLKy0rkfQzOX3McyjqlOuAFe7Cl+3eJrjPptFJ4rn6um7x2+1yz2GM1hm7+MuQ3PXLObY26ZTIZ69epVqDDqNJSqrqiuVvaON+vXr0dERISqKALAuHHjVP8fGBgId3d39OjRA9evX4evr2+J/YjFYojF4mLTrays9PIPqK9+DM3Uc3fx6QIneyfcld3VOM74XPkcz5XPIYIIDaQN0MWnCywtLI2YtOJMfZuXhrkNz1yzm1NubXLq/Nipb7/9FoGBgZBIJJBIJGjZsiU2btyoU1+3bt1CWloaxowZU2a79u3bAwCuXbum0/uQ6bK0sMTK8JUAABE0/8gqep0QnmA2RZGIzJdOhXHFihV477330KtXL2zduhVbt25FeHg43n33XXz66ada95eYmAgXFxf07t27zHbp6ekAAHd3d11ik4nrH9Af2wdvR31pfY3pDaQNsH3wdvQP6G+kZERUk+g0lLp69WqsXbsWI0aMUE3r27cvmjdvjri4OEyePLnCfSmVSiQmJiI6Ohq1av0T5/r160hKSkKvXr3g5OSECxcuYPLkyejSpQtatmypS2wyA/0D+iOqSRSO3DgC2W8y7B2+16yGT4nI/Om0x5iZmYmOHTsWm96xY0dkZmZq1VdaWhoyMjIwatQojenW1tZIS0tDaGgomjZtiqlTp2LAgAHYs2ePLpHJjFhaWKpOsOns1ZlFkYgMSqc9Rj8/P2zdulV1WUWRLVu2oHHjxlr1FRoaipJOjPX09OQNyYmo2nr5nsAcGTEdOhXG+Ph4DBkyBEeOHEGnTp0AAMePH8fBgwexdetWvQYkIqpudl7eiUkHJuFxzmMkt0xG76TecLJ3wsrwlTyWbgJ0GkodMGAATp8+jXr16mH37t3YvXs36tWrhzNnzuBf//qXvjMSEVUbvCew6dP5OsagoCBs2rRJn1mIiKq18u4JLIIIMQdiENUkisOqRqTTHuO+ffuQkpJSbHpKSgr2799f6VBERNXR0YyjxfYU1QkQcFt2G0czjhowFb1Mp8I4c+ZMKBTFn3IgCAJmzpxZ6VBERNVRZnbFztqvaDuqGjoVxqtXr6JZs2bFpjdt2pR3pSEiKoW7Q8VuTlLRdlQ1dCqMjo6OJT5B49q1a8UeQUVERC8EewWjgbRBsdseFhFBBE+pJ4K9gg2cjNTpVBijoqIQExOD69evq6Zdu3YNU6dORd++ffUWjoioOuE9gc2DToVx2bJlsLOzQ9OmTeHj4wMfHx80bdoUTk5OWL58ub4zEhFVG7wnsOnT6XINR0dHnDhxAqmpqTh//jwkEglatWqF4GDu/hMRlYf3BDZtWu0xnjx5Et9//z2AF89fDA0NhYuLC5YvX44BAwZg3LhxGg8JJiKikvGewKZLq8I4b948/P7776rXFy9exNixY9GzZ0/MnDkTe/bsweLFi/UekoiIyFC0Kozp6eno0aOH6vXmzZvRrl07fPXVV5gyZQpWrVrFe6USEZFZ06owPnnyBK6urqrXhw8fRkREhOp127Ztcfv2bf2lIyIiMjCtCqOrqytu3LgBACgoKMC5c+fw2muvqeZnZ2fDyspKvwmJiIgMSKvC2KtXL8ycORNHjx7FrFmzYGtrq3Em6oULF+Dr66v3kERERIa
i1eUa8+fPR//+/dG1a1fY29vjm2++gbW1tWr+119/jdDQUL2HJCIiMhStCmO9evVw5MgRZGVlwd7eHpaWmqcXb9u2Dfb29noNSEREZEg6X+Bfkrp161YqDBERkbHpdEs4IiKi6oqFkYiISI1RC6O3tzdEIlGxn/HjxwMA8vLyMH78eDg5OcHe3h4DBgzAgwcPjBmZiIiqOaMWxrNnzyIzM1P1k5qaCgAYNGgQAGDy5MnYs2cPtm3bhsOHD+PevXvo3593nicioqqj08k3+uLs7KzxesmSJfD19UXXrl2RlZWF9evXIykpCd27dwcAJCYmIiAgAKdOndK4sQAREZG+GLUwqisoKMCmTZswZcoUiEQi/PLLL5DL5QgJCVG1adq0Kby8vHDy5MlSC2N+fr7GEz5kMhkAQC6XQy6X65yvaNnK9GEMzG145pqduQ3PXLObY25tsooEQRCqMEuFbd26FcOHD0dGRgY8PDyQlJSEt99+u9hjrNq1a4du3bph6dKlJfYTFxeH+Pj4YtOTkpJga2tbJdmJiMi05ebmYvjw4cjKyoJUKi2zrcnsMa5fvx4RERHw8PCoVD+zZs3ClClTVK9lMhk8PT0RGhpa7sYoi1wuR2pqKnr27GlW94NlbsMz1+zMbXjmmt0ccxeNHlaESRTGW7duIS0tDTt37lRNc3NzQ0FBAZ4+fYratWurpj948ABubm6l9iUWiyEWi4tNt7Ky0ss/oL76MTTmNjxzzc7chmeu2c0ptzY5TeI6xsTERLi4uKB3796qaUFBQbCyssLBgwdV065cuYKMjAx06NDBGDGJiKgGMPoeo1KpRGJiIqKjo1Gr1j9xHB0dMXr0aEyZMgV169aFVCrFBx98gA4dOvCMVCIiqjJGL4xpaWnIyMjAqFGjis379NNPYWFhgQEDBiA/Px9hYWH4/PPPjZCSiIhqCqMXxtDQUJR2YqyNjQ3WrFmDNWvWGDgVERHVVCZxjJGIiMhUsDASERGpYWEkIiJSw8JIRESkhoWRiIhIDQsjERGRGhZGIiIiNSyMREREalgYiYiI1LAwEhERqWFhJCIiUsPCSEREpIaFkYiISA0LIxERkRoWRiIiIjUsjERERGpYGImIiNSwMBIREalhYSQiIlLDwkhERKSGhZGIiEiN0Qvj3bt38eabb8LJyQkSiQSBgYH4+eefVfNHjhwJkUik8RMeHm7ExEREVJ3VMuabP3nyBJ06dUK3bt2wf/9+ODs74+rVq6hTp45Gu/DwcCQmJqpei8ViQ0clIqIawqiFcenSpfD09NQoej4+PsXaicViuLm5GTIaERHVUEYtjN999x3CwsIwaNAgHD58GPXr18f777+PsWPHarQ7dOgQXFxcUKdOHXTv3h0LFiyAk5NTiX3m5+cjPz9f9VomkwEA5HI55HK5zlmLlq1MH8bA3IZnrtmZ2/DMNbs55tYmq0gQBKEKs5TJxsYGADBlyhQMGjQIZ8+exaRJk7Bu3TpER0cDADZv3gxbW1v4+Pjg+vXrmD17Nuzt7XHy5ElYWloW6zMuLg7x8fHFpiclJcHW1rZqV4iIiExSbm4uhg8fjqysLEil0jLbGrUwWltbo02bNjhx4oRq2sSJE3H27FmcPHmyxGX++usv+Pr6Ii0tDT169Cg2v6Q9Rk9PT/z999/lboyyyOVypKamomfPnrCystK5H0NjbsMz1+zMbXjmmt0cc8tkMtSrV69ChdGoQ6nu7u5o1qyZxrSAgADs2LGj1GUaNWqEevXq4dq1ayUWRrFYXOLJOVZWVnr5B9RXP4bG3IZnrtmZ2/DMNbs55dYmp1Ev1+jUqROuXLmiMe3PP/9Ew4YNS13mzp07ePz4Mdzd3as6HhER1UBGLYyTJ0/GqVOnsGjRIly7dg1JSUn48ssvMX78eABATk4Opk2bhlOnTuHmzZs4ePAgoqKi4Ofnh7CwMGNGJyKiasqohbFt27bYtWsXkpOT0aJFC8yfPx8JCQl44403AAC
Wlpa4cOEC+vbtC39/f4wePRpBQUE4evQor2UkIqIqYdRjjADQp08f9OnTp8R5EokEKSkpBk5EREQ1mdFvCUdERGRKWBiJiIjUsDASERGpYWEkIiJSw8JIRESkhoWRiIhIDQsjERGRGhZGIiIiNSyMREREalgYiYiI1LAwEhERqWFhJCIiUsPCSEREpIaFkYiISA0LIxERkRoWRiIiIjVGf1AxEZknQRBQWFgIhUJh1BxyuRy1atVCXl6e0bNoy1yzm2puKysrWFpaVrofFkYi0lpBQQEyMzORm5tr7CgQBAFubm64ffs2RCKRseNoxVyzm2pukUiEBg0awN7evlL9sDASkVaUSiVu3LgBS0tLeHh4wNra2qhfjkqlEjk5ObC3t4eFhXkdHTLX7KaYWxAEPHr0CHfu3EHjxo0rtefIwkhEWikoKIBSqYSnpydsbW2NHQdKpRIFBQWwsbExmS/pijLX7Kaa29nZGTdv3oRcLq9UYTSdNSIis2JKX4hEAPQ2csFPNhERkRqjF8a7d+/izTffhJOTEyQSCQIDA/Hzzz+r5guCgDlz5sDd3R0SiQQhISG4evWqERMTEVF1ZtTC+OTJE3Tq1AlWVlbYv38/Ll26hE8++QR16tRRtVm2bBlWrVqFdevW4fTp07Czs0NYWBjy8vKMmJyIyLQcP34cgYGBsLKyQr9+/Sq0TFxcHFq3bl2luSrD29sbCQkJBn9foxbGpUuXwtPTE4mJiWjXrh18fHwQGhoKX19fAC/2FhMSEvDRRx8hKioKLVu2xLfffot79+5h9+7dxoxORGbm0aNHeO+99+Dl5QWxWAw3NzeEhYXh+PHjxo6mF1OmTEHr1q1x48YNbNiwQS993rx5EyKRSPVjbW0NPz8/LFy4EIIgaNWXSCQym+9to56V+t133yEsLAyDBg3C4cOHUb9+fbz//vsYO3YsAODGjRu4f/8+QkJCVMs4Ojqiffv2OHnyJIYOHVqsz/z8fOTn56tey2QyAC8uSJXL5TpnLVq2Mn0YA3Mbnrlmr2huuVwOQRCgVCqhVCp1fj+FUoGjGUeRmZMJd3t3BHsFw9JC+zMJi76gizKVZsCAASgoKEBiYiIaNWqEBw8e4Mcff8SjR48qtR6VUdHsFXH9+nWMGzcOHh4eAFCh/orev7S2RdN/+OEHNG/eHPn5+Th27BjGjRuH2rVr4/3339cqty6fGW22jVKphCAIJZ6Vqs3vo1EL419//YW1a9diypQpmD17Ns6ePYuJEyfC2toa0dHRuH//PgDA1dVVYzlXV1fVvJctXrwY8fHxxab/8MMPejm1PDU1tdJ9GANzG565Zi8vd61ateDm5oacnBwUFBTo9B57ru3BzMMzcS/nnmqah70HlnRdgki/SJ36zM7OLnVeVlYWjh49iu+//x5BQUEAgDp16qBp06YAXvwBnZGRgVatWuHIkSMIDAxULeft7Y09e/agc+fOAIDLly8jLi4OJ0+ehCAIaNGiBT7//HP4+PgAADZt2oQ1a9bgr7/+Qp06dRAZGYmPP/5Y1V9sbCz27duHgoICtG7dGgsXLlS938WLFzF79mykp6dDJBKhUaNG+PTTT/HKK68gIyMD06dPx6lTpyCXy+Hl5YX4+Hg0bdoUrVq1AgCMGTMGY8aMwZo1awAAs2bNwq1bt1TbYe/evXjzzTfx5MkTAC92JBQKhWoH4mU5OTkAABsbG9ja2sLW1haRkZFo3749zp8/r9rm586dw/z583HhwgXI5XIEBgZi0aJFqlwtW7YE8OKPEwDw9PTEhQsXAAD79+/Hxx9/jEuXLsHOzg4dOnTApk2bALwodP/73/8wYsQI/Pe//4WjoyM+/PBDjBw5ssS8BQUFeP78OY4cOYLCwkKNedrcjMKohVGpVKJNmzZYtGgRAOCVV17Bb7/9hnXr1iE6OlqnPmfNmoUpU6aoXstkMnh6eiI0NBRSqVTnrHK5HKmpqejZsyesrKx07sfQanLuPVf2YEb
aDNzNvquaVt+hPpaGLEVkE92+fCuium/zvLw83L59G/b29rCxsdH6fXZe3onovdEQoDkUl5mTiei90dg6cCv6B/SvcH+CICA7OxsODg6lnq5va2sLe3t7pKamonv37hCLxcXaFN0txc7OTvVdUbSnYmtrC6lUirt376JPnz7o2rUr0tLSIJVKcfz4cdjY2EAqlWLt2rWYNm0aFi9ejPDwcGRlZeHEiROq/gYOHAiJRIJ9+/bB0dERX3zxBfr164c//vgDTk5OeO+999C6dWt88cUXsLS0RHp6OmrXrg2pVIpZs2ZBoVDg8OHDsLOzw6VLlyCVShEQEIC7d+8iICAA8fHxGDx4MBwdHbFlyxaIRCKN7z2JRAIAqmlisRiWlpalfjeWtE1+/vlnnD9/HkOHDlVtc6VSibfffhtt2rSBIAhYsWIFhgwZgitXrsDBwQFnz56Fm5sb1q9fj/DwcNV77t27F2+99RZmz56NjRs3oqCgAPv371e9l4WFBT7//HPMmzcPc+bMwY4dOzB16lSEhYWhSZMmxfLm5eVBIpGgS5cuxT6bpRX/khi1MLq7u6NZs2Ya0wICArBjxw4AgJubGwDgwYMHcHd3V7V58OBBqQeMxWJxiR96KysrvXxJ6asfQ6tpuXde3omBOwYW+/K9nnUdA3cMxPbB27X68tVFdd3mCoUCIpEIFhYWWl/LqFAqMPmHycX+XQBAgAARRJjywxT8K+BfFR5WLSpeRZlKYm1tjQ0bNmDs2LH44osv8Oqrr6Jr164YOnSoam+maFn19Xp52tq1a1VFp2gbFe11AsCiRYswdepUxMTEqKa1b98eAHDs2DGcPXsWDx8+VH1HLV++HLt378aOHTvw7rvvIiMjA9OmTVN9L6p/+d++fRsDBgxQ7YX5+fmp5nl4eEAkEqF27dqqodSX16GkaUV/SJS23Yqmd+7cGRYWFigoKIBcLsfYsWMxdOhQ1TZXP9wFAF999RVq166No0ePok+fPqpRv7p166ryAS9G+IYOHYp58+appr3yyisaffXq1Qvjx48HAMycORMJCQk4fPgwAgICSswrEolK/Axr87to1JNvOnXqhCtXrmhM+/PPP9GwYUMAgI+PD9zc3HDw4EHVfJlMhtOnT6NDhw4GzUrmQ6FUYNKBSaV++QJAzIEYKJSmc/PjmuJoxlHckd0pdb4AAbdlt3E046je33vAgAG4d+8evvvuO4SHh+PQoUN49dVXtTpRJT09HcHBwSV+yT58+BD37t1Djx49Slz2/PnzyMnJgZOTE+zt7WFvbw+pVIpbt27hr7/+AvDiBJoxY8YgJCQES5YswfXr11XLT5w4EQsWLECnTp0wd+5c1VCkIWzZsgXp6ek4f/48tm7diu+++w5xcXGq+Q8ePMDYsWPRuHFjODo6QiqVIicnBxkZGWX2m56eXur2KlL0hwvwopC7ubnh4cOHlVqf8hi1ME6ePBmnTp3CokWLcO3aNSQlJeHLL79U/XUgEokQExODBQsW4LvvvsPFixcxYsQIeHh4VPh0ZKp5jPnlS2XLzM7Uaztt2djYoGfPnoiNjcWJEycwcuRIzJ07F8A/e0fqZ1u+fMJG0VBkScqaB7w4Xufu7o709HTVz7lz53D27Fl8+OGHAF5cPvH777+jd+/e+PHHH9GsWTPs2rULwIvjh3/99RfeeustXLx4EW3atMHq1atLfT8LC4tiZ47qekKYp6cn/Pz8EBAQgEGDBmHSpElYs2aN6rK56OhopKenY+XKlThx4gTS09Ph5ORU7jHo8rYZUHxPr2jotioZtTC2bdsWu3btQnJyMlq0aIH58+cjISEBb7zxhqrN9OnT8cEHH2DcuHFo27YtcnJycODAAZ2ObVDNYOwvXyqdu4N7+Y20aFdZzZo1w7NnzwC8uM8mAGRm/vO5SE9P12jfsmVLHD16tMQC4+DgAG9vb40RLnWvvvoq7t+/j1q1asHPz0/106hRI9SrV0/Vzt/fH5MnT8Y
PP/yA/v37IzExUTXP09MT7777Lnbu3ImpU6fiq6++KnXdnJ2dkZ2drVq/ktZHV5aWligsLFQVvuPHj2PixIno1asXmjdvDrFYjL///ltjGSsrq2KPqGrZsmWp28uYjH4T8T59+qBPnz6lzheJRJg3b57GGDRRWUzty5f+EewVjAbSBrgru1viULcIIjSQNkCwV7Be3/fx48cYNGgQRo0ahZYtW8LBwQE///wzli1bhqioKAAv9l5ee+01LFmyBD4+Pnj48CE++ugjjX4mTJiA1atXY+jQoZg1axYcHR1x6tQptGvXDk2aNEFcXBzeffdduLi4ICIiAtnZ2Th+/Dg++OADhISEoEOHDujXrx+WLVsGf39/3LlzB7t27cKQIUMQGBiIadOmYeDAgfDx8cGdO3dw9uxZ1ZmcMTExiIiIgL+/P548eYKffvqpxONsRdq3bw9bW1vMnj0bEydOxOnTp3W+vvHx48e4f/8+CgsLcfHiRaxatQrBwcGqk2QaN26MjRs3ok2bNpDJZJg2bVqxvcGiPxo6deoEsViMOnXqYO7cuejRowd8fX0xdOhQFBYWYt++fZgxY4ZOOfXF6LeEI+NSKBU4dPMQki8m49DNQ9XiuFvRl68IJZ+hKIIInlJPvX/5UvksLSyxMnwlABT79yl6nRCeoNP1jGWxt7dH+/bt8emnn6JLly5o0aIFYmNjMXbsWHz22Weqdl9//TUKCwsRFBSkOoyjzsnJCT/++CNycnLQtWtXBAUF4auvvlIN90VHRyMhIQGff/45mjdvjj59+qhuYSkSibBv3z506dIFb7/9Nvz9/TF8+HDcvn0brq6usLS0xOPHjzFixAj4+/tj8ODBiIiIUF1+plAoMH78eAQEBCA8PBz+/v74/PPPS13nunXrYtOmTdi3bx8CAwORnJyscVxQGyEhIXB3d4e3tzfGjRuHiIgIfP3116r569evx5MnT/Dqq6/irbfewsSJE+Hi4qLRxyeffILU1FR4enqqTrB5/fXXsW3bNnz33Xdo3bo1unfvjjNnzuiUUZ9Egra3LzAzMpkMjo6OyMrKqvTlGvv27UOvXr3M6kzDsnLvvLwTkw5M0jge10DaACvDV1b5GZvlqez23nl5JwZuHQgAGnsmRV++VXlWanX8rKjLy8vDjRs34OPjo/MhjZI+e55STySEJ2j976JUKiGTySCVSs3uiR/mmt1Uc5f12dSmFhh9KJWMo6hwvDycdVd2FwO3GuZyhqrUP6A/tg/eXmLh1+XLl/Srf0B/RDWJenHnm+xMuDvofucbIn1jYayByrucQQQRYg7EIKpJlFl/UfHL17RZWljide/XjR2DqBgWxhpIm8sZzP2Li1++RKQt0xkcJoPh5QxERKVjYayBeDkD6UM1P2+PzJC+PpMsjDUQL2egyig6Y1WbpxUQGULRDQdefuSUtniMsQYqupZs4NaBEEFU4uUMVXEtGVUPlpaWqF27tup+lba2tqU+1cIQlEolCgoKkJeXZ1KXDlSEuWY3xdxKpRKPHj2Cra0tatWqXGljYayheDkDVUbRk2+q+mbOFSEIAp4/fw6JRGLUAq0Lc81uqrktLCzg5eVV6UwsjDUYL2cgXYlEIri7u8PFxUXnG1Pri1wux5EjR9ClSxezuqECYL7ZTTW3tbW1XvZgWRhrOF7OQJVhaWlZ6eM5+shQWFgIGxsbk/qSrghzzW6uuSvKNAaHiYiITAQLIxERkRoWRiIiIjXV/hhj0QWfMpmsUv3I5XLk5uZCJpOZ1Zg6cxueuWZnbsMz1+zmmLuoBlTkJgDVvjBmZ2cDePHkayIiqtmys7Ph6OhYZptq/zxGpVKJe/fuwcHBoVLXtshkMnh6euL27duVeq6joTG34ZlrduY2PHPNbo65BUFAdnY2PDw8yr2ko9rvMVpYWKBBgwZ6608qlZrNB0EdcxueuWZnbsMz1+zmlru8PcUiPPmGiIhIDQsjERGRGhbGChKLxZg7dy7EYrG
xo2iFuQ3PXLMzt+GZa3ZzzV1R1f7kGyIiIm1wj5GIiEgNCyMREZEaFkYiIiI1LIxERERqWBj/vzVr1sDb2xs2NjZo3749zpw5U2b7bdu2oWnTprCxsUFgYCD27dtnoKT/WLx4Mdq2bQsHBwe4uLigX79+uHLlSpnLbNiwASKRSOPHxsbGQIlfiIuLK5ahadOmZS5jCtsbALy9vYtlF4lEGD9+fIntjbW9jxw5gsjISHh4eEAkEmH37t0a8wVBwJw5c+Du7g6JRIKQkBBcvXq13H61/T3RZ265XI4ZM2YgMDAQdnZ28PDwwIgRI3Dv3r0y+9Tl86bv7AAwcuTIYjnCw8PL7deY2xxAiZ93kUiEjz/+uNQ+DbXNqwoLI4AtW7ZgypQpmDt3Ls6dO4dWrVohLCwMDx8+LLH9iRMnMGzYMIwePRq//vor+vXrh379+uG3334zaO7Dhw9j/PjxOHXqFFJTUyGXyxEaGopnz56VuZxUKkVmZqbq59atWwZK/I/mzZtrZDh27FipbU1lewPA2bNnNXKnpqYCAAYNGlTqMsbY3s+ePUOrVq2wZs2aEucvW7YMq1atwrp163D69GnY2dkhLCwMeXl5pfap7e+JvnPn5ubi3LlziI2Nxblz57Bz505cuXIFffv2LbdfbT5vVZG9SHh4uEaO5OTkMvs09jYHoJE3MzMTX3/9NUQiEQYMGFBmv4bY5lVGIKFdu3bC+PHjVa8VCoXg4eEhLF68uMT2gwcPFnr37q0xrX379sI777xTpTnL8/DhQwGAcPjw4VLbJCYmCo6OjoYLVYK5c+cKrVq1qnB7U93egiAIkyZNEnx9fQWlUlnifFPY3gCEXbt2qV4rlUrBzc1N+Pjjj1XTnj59KojFYiE5ObnUfrT9PdF37pKcOXNGACDcunWr1Dbaft70oaTs0dHRQlRUlFb9mOI2j4qKErp3715mG2Nsc32q8XuMBQUF+OWXXxASEqKaZmFhgZCQEJw8ebLEZU6ePKnRHgDCwsJKbW8oWVlZAIC6deuW2S4nJwcNGzaEp6cnoqKi8PvvvxsinoarV6/Cw8MDjRo1whtvvIGMjIxS25rq9i4oKMCmTZswatSoMm9QbwrbW92NGzdw//59jW3q6OiI9u3bl7pNdfk9MYSsrCyIRCLUrl27zHbafN6q0qFDh+Di4oImTZrgvffew+PHj0tta4rb/MGDB9i7dy9Gjx5dbltT2ea6qPGF8e+//4ZCoYCrq6vGdFdXV9y/f7/EZe7fv69Ve0NQKpWIiYlBp06d0KJFi1LbNWnSBF9//TX++9//YtOmTVAqlejYsSPu3LljsKzt27fHhg0bcODAAaxduxY3btxAcHCw6hFhLzPF7Q0Au3fvxtOnTzFy5MhS25jC9n5Z0XbTZpvq8ntS1fLy8jBjxgwMGzaszBtZa/t5qyrh4eH49ttvcfDgQSxduhSHDx9GREQEFApFie1NcZt/8803cHBwQP/+/ctsZyrbXFfV/ukaNcX48ePx22+/lTuO36FDB3To0EH1umPHjggICMAXX3yB+fPnV3VMAEBERITq/1u2bIn27dujYcOG2Lp1a4X+EjUV69evR0REBDw8PEptYwrbuzqSy+UYPHgwBEHA2rVry2xrKp+3oUOHqv4/MDAQLVu2hK+vLw4dOoQePXoYLEdlfP3113jjjTfKPYHMVLa5rmr8HmO9evVgaWmJBw8eaEx/8OAB3NzcSlzGzc1Nq/ZVbcKECfj+++/x008/af2ILSsrK7zyyiu4du1aFaUrX+3ateHv719qBlPb3gBw69YtpKWlYcyYMVotZwrbu2i7abNNdfk9qSpFRfHWrVtITU3V+rFH5X3eDKVRo0aoV69eqTlMaZsDwNGjR3HlyhWtP/OA6WzziqrxhdHa2hpBQUE4ePCgappSqcTBgwc1/tJX16FDB432AJCamlpq+6oiCAImTJiAXbt24ccff4SPj4/WfSgUCly8eBHu7u5VkLBicnJ
ycP369VIzmMr2VpeYmAgXFxf07t1bq+VMYXv7+PjAzc1NY5vKZDKcPn261G2qy+9JVSgqilevXkVaWhqcnJy07qO8z5uh3LlzB48fPy41h6ls8yLr169HUFAQWrVqpfWyprLNK8zYZ/+Ygs2bNwtisVjYsGGDcOnSJWHcuHFC7dq1hfv37wuCIAhvvfWWMHPmTFX748ePC7Vq1RKWL18uXL58WZg7d65gZWUlXLx40aC533vvPcHR0VE4dOiQkJmZqfrJzc1VtXk5e3x8vJCSkiJcv35d+OWXX4ShQ4cKNjY2wu+//26w3FOnThUOHTok3LhxQzh+/LgQEhIi1KtXT3j48GGJmU1lexdRKBSCl5eXMGPGjGLzTGV7Z2dnC7/++qvw66+/CgCEFStWCL/++qvq7M0lS5YItWvXFv773/8KFy5cEKKiogQfHx/h+fPnqj66d+8urF69WvW6vN+Tqs5dUFAg9O3bV2jQoIGQnp6u8ZnPz88vNXd5nzdDZM/OzhY+/PBD4eTJk8KNGzeEtLQ04dVXXxUaN24s5OXllZrd2Nu8SFZWlmBrayusXbu2xD6Mtc2rCgvj/7d69WrBy8tLsLa2Ftq1ayecOnVKNa9r165CdHS0RvutW7cK/v7+grW1tdC8eXNh7969Bk784tTqkn4SExNVbV7OHhMTo1pPV1dXoVevXsK5c+cMmnvIkCGCu7u7YG1tLdSvX18YMmSIcO3atVIzC4JpbO8iKSkpAgDhypUrxeaZyvb+6aefSvxsFGVTKpVCbGys4OrqKojFYqFHjx7F1qdhw4bC3LlzNaaV9XtS1blv3LhR6mf+p59+KjV3eZ83Q2TPzc0VQkNDBWdnZ8HKykpo2LChMHbs2GIFztS2eZEvvvhCkEgkwtOnT0vsw1jbvKrwsVNERERqavwxRiIiInUsjERERGpYGImIiNSwMBIREalhYSQiIlLDwkhERKSGhZGIiEgNCyMREZEaFkYiE7dhw4ZynzdoikaOHIl+/foZOwaR1lgYiSpg5MiREIlEqh8nJyeEh4fjwoULWvUTFxeH1q1bV01INTdv3oRIJIKLi0uxZ+C1bt0acXFxVZ6ByFyxMBJVUHh4ODIzM5GZmYmDBw+iVq1a6NOnj7FjlSk7OxvLly83dgy9EQQBhYWFxo5B1RwLI1EFicViuLm5wc3NDa1bt8bMmTNx+/ZtPHr0SNVmxowZ8Pf3h62tLRo1aoTY2FjI5XIAL4ZE4+Pjcf78edWe54YNGwAAT58+xTvvvANXV1fY2NigRYsW+P777zXePyUlBQEBAbC3t1cV6fJ88MEHWLFiBR4+fFhqG5FIhN27d2tMq127tipb0d7n1q1bERwcDIlEgrZt2+LPP//E2bNn0aZNG9jb2yMiIkJjWxSJj4+Hs7MzpFIp3n33XRQUFKjmKZVKLF68GD4+PpBIJGjVqhW2b9+umn/o0CGIRCLs378fQUFBEIvF5T6Mm6iyahk7AJE5ysnJwaZNm+Dn56fxTEAHBwds2LABHh4euHjxIsaOHQsHBwdMnz4dQ4YMwW+//YYDBw4gLS0NAODo6AilUomIiAhkZ2dj06ZN8PX1xaVLl2BpaanqNzc3F8uXL8fGjRthYWGBN998Ex9++CH+85//lJlz2LBhSE1Nxbx58/DZZ59Vap3nzp2LhIQEeHl5YdSoURg+fDgcHBywcuVK2NraYvDgwZgzZw7Wrl2rWubgwYOwsbHBoUOHcPPmTbz99ttwcnLCwoULAQCLFy/Gpk2bsG7dOjRu3BhHjhzBm2++CWdnZ3Tt2lXVz8yZM7F8+XI0atQIderUqdR6EJXLyE/3IDIL0dHRgqWlpWBnZyfY2dkJAAR3d3fhl19+KXO5jz/+WAgKClK9njt3rtCqVSuNNikpKYKFhUWJj7ESBEFITEwUAGg8tmfNmjWCq6trqe9b9IimX3/9VThw4IBgZWWlWr5Vq1YajwgCIOzatUtjeUdHR9Xjy4r6+ve
//62an5ycLAAQDh48qJq2ePFioUmTJqrX0dHRQt26dYVnz56ppq1du1awt7cXFAqFkJeXJ9ja2gonTpzQeO/Ro0cLw4YNEwThn0ci7d69u9R1JdI37jESVVC3bt1Ue0NPnjzB559/joiICJw5cwYNGzYEAGzZsgWrVq3C9evXkZOTg8LCQkil0jL7TU9PR4MGDeDv719qG1tbW/j6+qpeu7u7lzk8qi4sLAydO3dGbGwskpKSKrRMSVq2bKn6f1dXVwBAYGCgxrSXM7Vq1Qq2traq1x06dEBOTg5u376NnJwc5ObmomfPnhrLFBQU4JVXXtGY1qZNG51zE2mLhZGoguzs7ODn56d6/e9//xuOjo746quvsGDBApw8eRJvvPEG4uPjERYWBkdHR2zevBmffPJJmf1KJJJy39vKykrjtUgkgqDFo1SXLFmCDh06YNq0acXmldRX0XHR0jKIRKISpymVygpnysnJAQDs3bsX9evX15gnFos1XtvZ2VW4X6LKYmEk0pFIJIKFhQWeP38OADhx4gQaNmyI//u//1O1uXXrlsYy1tbWUCgUGtNatmyJO3fu4M8//yxzr7Ey2rVrh/79+2PmzJnF5jk7O2ucyHP16lXk5ubq5X3Pnz+P58+fq4r/qVOnYG9vD09PT9StWxdisRgZGRkaxxOJjI2FkaiC8vPzcf/+fQAvhlI/++wz5OTkIDIyEgDQuHFjZGRkYPPmzWjbti327t2LXbt2afTh7e2NGzduqIZPHRwc0LVrV3Tp0gUDBgzAihUr4Ofnhz/++AMikQjh4eF6y79w4UI0b94ctWpp/tp3794dn332GTp06ACFQoEZM2YU20PVVUFBAUaPHo2PPvoIN2/exNy5czFhwgRYWFjAwcEBH374ISZPngylUonOnTsjKysLx48fh1QqRXR0tF4yEGmLl2sQVdCBAwfg7u4Od3d3tG/fHmfPnsW2bdvw+uuvAwD69u2LyZMnY8KECWjdujVOnDiB2NhYjT4GDBiA8PBwdOvWDc7OzkhOTgYA7NixA23btsWwYcPQrFkzTJ8+vdieZWX5+/tj1KhRyMvL05j+ySefwNPTE8HBwRg+fDg+/PBDjeOCldGjRw80btwYXbp0wZAhQ9C3b1+NmwvMnz8fsbGxWLx4MQICAhAeHo69e/fCx8dHL+9PpAuRoM2BCiIiomqOe4xERERqWBiJiIjUsDASERGpYWEkIiJSw8JIRESkhoWRiIhIDQsjERGRGhZGIiIiNSyMREREalgYiYiI1LAwEhERqfl/1iPUxfzqDdwAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "trial_logs = miprov2_model.trial_logs\n", + "trial_numbers = list(trial_logs.keys())\n", + "scores = [trial_logs[trial]['score'] for trial in trial_numbers]\n", + "pruning_status = [trial_logs[trial]['pruned'] for trial in trial_numbers]\n", + "\n", + "plt.figure(figsize=(5, 3))\n", + "for trial_number, score, pruned in zip(trial_numbers, scores, pruning_status):\n", + " if pruned:\n", + " plt.scatter(trial_number, score, color='grey', label='Pruned Batch' if 'Pruned Batch' not in plt.gca().get_legend_handles_labels()[1] else \"\")\n", + " else:\n", + " plt.scatter(trial_number, score, color='green', label='Successful Batch' if 'Successful Batch' not in plt.gca().get_legend_handles_labels()[1] else \"\")\n", + "\n", + "plt.xlabel('Batch Number')\n", + "plt.ylabel('Score')\n", + "plt.title('Batch Scores')\n", + "plt.grid(True)\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline program | Score: 0:\n", + "Prompt 1 Instruction: Given the fields `input_text`, `entity_types`, produce the fields `entities_relationships`.\n", + "\n", + "----------------\n", + "Best program after 0 batches | Score: 81.23:\n", + "Prompt 1 Instruction: Given the `input_text` and a list of `entity_types`, meticulously extract and identify all entities and their relationships within the text. Ensure that each entity is accurately classified according to the provided entity types, and clearly define the relationships between entities, including their descriptions, weights, and orders. 
Provide a step-by-step reasoning process to justify the extraction and classification of each entity and relationship.\n", + "\n", + "Best program after 5 batches | Score: 81.23:\n", + "Prompt 1 Instruction: Given the `input_text` and a list of `entity_types`, meticulously extract and identify all entities and their relationships within the text. Ensure that each entity is accurately classified according to the provided entity types, and clearly define the relationships between entities, including their descriptions, weights, and orders. Provide a step-by-step reasoning process to justify the extraction and classification of each entity and relationship.\n", + "\n", + "Best program after 10 batches | Score: 85.16:\n", + "Prompt 1 Instruction: Given the `input_text` and `entity_types`, meticulously identify and extract entities from the text. For each identified entity, provide its `entity_name`, `entity_type`, a concise `description`, and an `importance_score`. Subsequently, determine and list the relationships between these entities, specifying the `src_id` (source entity), `tgt_id` (target entity), a `description` of the relationship, a `weight` indicating the strength of the relationship, and an `order` to indicate the sequence of relationships. Ensure that the relationships are coherent and directly derived from the context provided in the `input_text`.\n", + "\n", + "Best program after 15 batches | Score: 85.16:\n", + "Prompt 1 Instruction: Given the `input_text` and `entity_types`, meticulously identify and extract entities from the text. For each identified entity, provide its `entity_name`, `entity_type`, a concise `description`, and an `importance_score`. Subsequently, determine and list the relationships between these entities, specifying the `src_id` (source entity), `tgt_id` (target entity), a `description` of the relationship, a `weight` indicating the strength of the relationship, and an `order` to indicate the sequence of relationships. 
Ensure that the relationships are coherent and directly derived from the context provided in the `input_text`.\n", + "\n" + ] + } + ], + "source": [ + "best_score = 0\n", + "\n", + "def get_signature(predictor):\n", + " if (hasattr(predictor, 'extended_signature')):\n", + " return predictor.extended_signature\n", + " elif (hasattr(predictor, 'signature')):\n", + " return predictor.signature\n", + "\n", + "print(f\"Baseline program | Score: {best_score}:\")\n", + "for i,predictor in enumerate(model.predictors()):\n", + " print(f\"Prompt {i+1} Instruction: {get_signature(predictor).instructions}\")\n", + "print()\n", + "\n", + "print(\"----------------\")\n", + "\n", + "for trial_num in miprov2_model.trial_logs:\n", + " program_score = miprov2_model.trial_logs[trial_num][\"score\"]\n", + " program_pruned = miprov2_model.trial_logs[trial_num][\"pruned\"]\n", + " if program_score > best_score and not program_pruned and miprov2_model.trial_logs[trial_num][\"full_eval\"]:\n", + " best_score = program_score\n", + " best_program_so_far = miprov2_model.trial_logs[trial_num][\"program\"]\n", + " if trial_num % 5 == 0:\n", + " print(f\"Best program after {trial_num} batches | Score: {best_score}:\")\n", + " for i,predictor in enumerate(best_program_so_far.predictors()):\n", + " print(f\"Prompt {i+1} Instruction: {get_signature(predictor).instructions}\")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/20 [00:00\n", + "#T_a2fb4 th {\n", + " text-align: left;\n", + "}\n", + "#T_a2fb4 td {\n", + " text-align: left;\n", + "}\n", + "#T_a2fb4_row0_col0, #T_a2fb4_row0_col1, #T_a2fb4_row0_col2, #T_a2fb4_row0_col3, #T_a2fb4_row0_col4, #T_a2fb4_row0_col5, #T_a2fb4_row1_col0, #T_a2fb4_row1_col1, #T_a2fb4_row1_col2, #T_a2fb4_row1_col3, #T_a2fb4_row1_col4, #T_a2fb4_row1_col5, #T_a2fb4_row2_col0, #T_a2fb4_row2_col1, #T_a2fb4_row2_col2, 
#T_a2fb4_row2_col3, #T_a2fb4_row2_col4, #T_a2fb4_row2_col5, #T_a2fb4_row3_col0, #T_a2fb4_row3_col1, #T_a2fb4_row3_col2, #T_a2fb4_row3_col3, #T_a2fb4_row3_col4, #T_a2fb4_row3_col5, #T_a2fb4_row4_col0, #T_a2fb4_row4_col1, #T_a2fb4_row4_col2, #T_a2fb4_row4_col3, #T_a2fb4_row4_col4, #T_a2fb4_row4_col5 {\n", + " text-align: left;\n", + " white-space: pre-wrap;\n", + " word-wrap: break-word;\n", + " max-width: 400px;\n", + "}\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 input_textexample_entitiesexample_relationshipspred_entitiespred_relationshipsentity_recall_metric
0As students from Marjory Stoneman Douglas High School confront lawmakers with demands to restrict sales of assault rifles, there were warnings by the president of...[{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where students confronted lawmakers about restricting sales of assault rifles.', 'importance_score':...[{'src_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'tgt_id': 'FLORIDA', 'description': 'Students from Marjory Stoneman Douglas High School are located in the state of Florida.', 'weight': 0.9,...[{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where a mass shooting occurred, leading to demands for restricting assault...[{'src_id': 'NIKOLAS CRUZ', 'tgt_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'description': 'Nikolas Cruz carried out a mass shooting at Marjory Stoneman Douglas High School.', 'weight': 0.9,...✔️ [1.0]
1From ferrying people to and from their place of work to transporting nuclear waste and coal, railways are not only an integral part of 21st...[{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels whose clients include Deutsche Bahn and French rail operator SNCF.', 'importance_score': 0.9}, {'entity_name': 'DEUTSCHE...[{'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description': 'Railnova provides technology solutions to Deutsche Bahn.', 'weight': 0.9, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'SNCF', 'description': 'Railnova provides technology...[{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels that offers solutions to improve train operations.', 'importance_score': 0.9}, {'entity_name': 'CHRISTIAN SPRAUER', 'entity_type': 'PERSON',...[{'src_id': 'CHRISTIAN SPRAUER', 'tgt_id': 'RAILNOVA', 'description': 'Christian Sprauer is the CEO and founder of Railnova.', 'weight': 0.9, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description':...✔️ [0.8888888888888888]
2Jan 22 (Reuters) - Shanghai Stock Exchange Filing * SHOWS BLOCK TRADE OF YONGHUI SUPERSTORES Co LTd's 166.3 MILLION SHARES INVOLVING 1.63 BILLION YUAN ($254.68...[{'entity_name': 'YONGHUI SUPERSTORES CO LTD', 'entity_type': 'ORGANIZATION', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'importance_score': 1.0}, {'entity_name':...[{'src_id': 'YONGHUI SUPERSTORES CO LTD', 'tgt_id': '166.3 MILLION SHARES', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'weight':...[{'entity_name': 'SHANGHAI STOCK EXCHANGE', 'entity_type': 'ORGANIZATION', 'description': \"A stock exchange where the block trade of YONGHUI SUPERSTORES Co Ltd's shares took place.\", 'importance_score': 0.8}, {'entity_name':...[{'src_id': 'SHANGHAI STOCK EXCHANGE', 'tgt_id': 'YONGHUI SUPERSTORES CO LTD', 'description': \"The block trade of YONGHUI SUPERSTORES Co Ltd's shares took place at the Shanghai Stock...✔️ [0.8]
3LONDON (Reuters) - Britain’s economy was weaker than previously thought in 2017, official data showed on Thursday, leaving the country lagging further behind the global...[{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy was weaker than previously thought in 2017.', 'importance_score': 0.9}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The...[{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.9, 'order': 1}, {'src_id': 'BANK OF ENGLAND', 'tgt_id': 'INTEREST RATES',...[{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy was weaker than previously thought in 2017.', 'importance_score': 0.9}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The...[{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.8, 'order': 1}, {'src_id': 'BRITAIN', 'tgt_id': 'BANK OF ENGLAND', 'description':...✔️ [1.0]
4Trump taps White House doctor as new VA secretary 2 Hours Ago CNBC's Kayla Tausche reports President Trump has tapped White House physician Rear Admiral...[{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'President Trump who tapped Ronny Jackson as new VA secretary.', 'importance_score': 1.0}, {'entity_name': 'WHITE HOUSE', 'entity_type': 'ORGANIZATION', 'description': 'The White...[{'src_id': 'TRUMP', 'tgt_id': 'RONNY JACKSON', 'description': 'President Trump taps Ronny Jackson as new VA secretary.', 'weight': 1.0, 'order': 1}, {'src_id': 'RONNY JACKSON', 'tgt_id': 'VA', 'description':...[{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'The President who tapped Rear Admiral Ronny Jackson as the new VA secretary.', 'importance_score': 0.9}, {'entity_name': 'REAR ADMIRAL RONNY JACKSON',...[{'src_id': 'TRUMP', 'tgt_id': 'REAR ADMIRAL RONNY JACKSON', 'description': 'President Trump tapped Rear Admiral Ronny Jackson to run the Department of Veterans Affairs.', 'weight': 0.9, 'order':...✔️ [0.5714285714285714]
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " ... 15 more rows not displayed ...\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/20 [00:00\n", + "#T_a56c4 th {\n", + " text-align: left;\n", + "}\n", + "#T_a56c4 td {\n", + " text-align: left;\n", + "}\n", + "#T_a56c4_row0_col0, #T_a56c4_row0_col1, #T_a56c4_row0_col2, #T_a56c4_row0_col3, #T_a56c4_row0_col4, #T_a56c4_row0_col5, #T_a56c4_row1_col0, #T_a56c4_row1_col1, #T_a56c4_row1_col2, #T_a56c4_row1_col3, #T_a56c4_row1_col4, #T_a56c4_row1_col5, #T_a56c4_row2_col0, #T_a56c4_row2_col1, #T_a56c4_row2_col2, #T_a56c4_row2_col3, #T_a56c4_row2_col4, #T_a56c4_row2_col5, #T_a56c4_row3_col0, #T_a56c4_row3_col1, #T_a56c4_row3_col2, #T_a56c4_row3_col3, #T_a56c4_row3_col4, #T_a56c4_row3_col5, #T_a56c4_row4_col0, #T_a56c4_row4_col1, #T_a56c4_row4_col2, #T_a56c4_row4_col3, #T_a56c4_row4_col4, #T_a56c4_row4_col5 {\n", + " text-align: left;\n", + " white-space: pre-wrap;\n", + " word-wrap: break-word;\n", + " max-width: 400px;\n", + "}\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 input_textexample_entitiesexample_relationshipspred_entitiespred_relationshipsrelationships_similarity_metric
0As students from Marjory Stoneman Douglas High School confront lawmakers with demands to restrict sales of assault rifles, there were warnings by the president of...[{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where students confronted lawmakers about restricting sales of assault rifles.', 'importance_score':...[{'src_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'tgt_id': 'FLORIDA', 'description': 'Students from Marjory Stoneman Douglas High School are located in the state of Florida.', 'weight': 0.9,...[{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where a mass shooting occurred, leading to demands for restricting assault...[{'src_id': 'NIKOLAS CRUZ', 'tgt_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'description': 'Nikolas Cruz carried out a mass shooting at Marjory Stoneman Douglas High School.', 'weight': 0.9,...✔️ [0.4]
1From ferrying people to and from their place of work to transporting nuclear waste and coal, railways are not only an integral part of 21st...[{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels whose clients include Deutsche Bahn and French rail operator SNCF.', 'importance_score': 0.9}, {'entity_name': 'DEUTSCHE...[{'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description': 'Railnova provides technology solutions to Deutsche Bahn.', 'weight': 0.9, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'SNCF', 'description': 'Railnova provides technology...[{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels that offers solutions to improve train operations.', 'importance_score': 0.9}, {'entity_name': 'CHRISTIAN SPRAUER', 'entity_type': 'PERSON',...[{'src_id': 'CHRISTIAN SPRAUER', 'tgt_id': 'RAILNOVA', 'description': 'Christian Sprauer is the CEO and founder of Railnova.', 'weight': 0.9, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description':...✔️ [0.85]
2Jan 22 (Reuters) - Shanghai Stock Exchange Filing * SHOWS BLOCK TRADE OF YONGHUI SUPERSTORES Co LTd's 166.3 MILLION SHARES INVOLVING 1.63 BILLION YUAN ($254.68...[{'entity_name': 'YONGHUI SUPERSTORES CO LTD', 'entity_type': 'ORGANIZATION', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'importance_score': 1.0}, {'entity_name':...[{'src_id': 'YONGHUI SUPERSTORES CO LTD', 'tgt_id': '166.3 MILLION SHARES', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'weight':...[{'entity_name': 'SHANGHAI STOCK EXCHANGE', 'entity_type': 'ORGANIZATION', 'description': \"A stock exchange where the block trade of YONGHUI SUPERSTORES Co Ltd's shares took place.\", 'importance_score': 0.8}, {'entity_name':...[{'src_id': 'SHANGHAI STOCK EXCHANGE', 'tgt_id': 'YONGHUI SUPERSTORES CO LTD', 'description': \"The block trade of YONGHUI SUPERSTORES Co Ltd's shares took place at the Shanghai Stock...✔️ [0.3]
3LONDON (Reuters) - Britain’s economy was weaker than previously thought in 2017, official data showed on Thursday, leaving the country lagging further behind the global...[{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy was weaker than previously thought in 2017.', 'importance_score': 0.9}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The...[{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.9, 'order': 1}, {'src_id': 'BANK OF ENGLAND', 'tgt_id': 'INTEREST RATES',...[{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy was weaker than previously thought in 2017.', 'importance_score': 0.9}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The...[{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.8, 'order': 1}, {'src_id': 'BRITAIN', 'tgt_id': 'BANK OF ENGLAND', 'description':...✔️ [0.4]
4Trump taps White House doctor as new VA secretary 2 Hours Ago CNBC's Kayla Tausche reports President Trump has tapped White House physician Rear Admiral...[{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'President Trump who tapped Ronny Jackson as new VA secretary.', 'importance_score': 1.0}, {'entity_name': 'WHITE HOUSE', 'entity_type': 'ORGANIZATION', 'description': 'The White...[{'src_id': 'TRUMP', 'tgt_id': 'RONNY JACKSON', 'description': 'President Trump taps Ronny Jackson as new VA secretary.', 'weight': 1.0, 'order': 1}, {'src_id': 'RONNY JACKSON', 'tgt_id': 'VA', 'description':...[{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'The President who tapped Rear Admiral Ronny Jackson as the new VA secretary.', 'importance_score': 0.9}, {'entity_name': 'REAR ADMIRAL RONNY JACKSON',...[{'src_id': 'TRUMP', 'tgt_id': 'REAR ADMIRAL RONNY JACKSON', 'description': 'President Trump tapped Rear Admiral Ronny Jackson to run the Department of Veterans Affairs.', 'weight': 0.9, 'order':...✔️ [0.65]
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " ... 15 more rows not displayed ...\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "metrics = [entity_recall_metric, relationships_similarity_metric]\n", + "for metric in metrics:\n", + " evaluate = Evaluate(\n", + " devset=devset[:20], \n", + " metric=metric, \n", + " num_threads=os.cpu_count(), \n", + " display_progress=True,\n", + " display_table=5,\n", + " )\n", + " evaluate(miprov2_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('extractor.predictor', Predict(StringSignature(input_text, entity_types -> reasoning, entities_relationships\n", + " instructions=\"You are a meticulous entity relationship extractor tasked with analyzing a given text and extracting detailed entities and their relationships based on specified entity types. Your process involves:\\n\\n1. Identifying entities in the text that match the provided entity types, ensuring comprehensive descriptions and ignoring duplicates or generic terms.\\n\\n2. Extracting relationships between these entities using 'src_id' and 'tgt_id' keys from the text descriptions.\\n\\n3. Classifying relationships as direct (order:0), second-order (order:2), or third-order (order:3) based on their proximity and context in the text.\\n\\n4. 
Ensuring all entities and relationships are organized into a structured JSON object following the specified schema, providing clear and detailed descriptions for each entity and relationship.\"\n", + " input_text = Field(annotation=str required=True json_schema_extra={'desc': 'The text to extract entities and relationships from.', '__dspy_field_type': 'input', 'prefix': 'Input Text:'})\n", + " entity_types = Field(annotation=list[str] required=True json_schema_extra={'desc': 'List of entity types used for extraction.', '__dspy_field_type': 'input', 'prefix': 'Entity Types:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${produce the entities_relationships}. We ...', '__dspy_field_type': 'output'})\n", + " entities_relationships = Field(annotation=list[Union[Entity, Relationship]] required=True json_schema_extra={'desc': 'List of entities and relationships extracted from the text.', '__dspy_field_type': 'output', 'prefix': 'Entities Relationships:'})\n", + ")))]\n" + ] + } + ], + "source": [ + "miprov2_model.save(entity_relationship_miprov2_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nano-graphrag", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nano-graphrag/examples/generate_entity_relationship_dspy.ipynb b/nano-graphrag/examples/generate_entity_relationship_dspy.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..9c5545e94d7268fb3b96c7fe72c119b85e073a2f --- /dev/null +++ b/nano-graphrag/examples/generate_entity_relationship_dspy.ipynb @@ -0,0 +1,2062 @@ +{ + "cells": [ + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "# Generate Examples for Entity Relationship Extraction\n", + "\n", + "- Taking datasets from Huggingface containing news articles and generate entities and relationships out of each news article.\n", + "- Save them as DSPy examples locally to be used for fine-tuning prompt instructions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniconda/base/envs/nano-graphrag/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import dspy\n", + "import asyncio\n", + "import os\n", + "import numpy as np\n", + "from dotenv import load_dotenv\n", + "from datasets import load_dataset\n", + "import logging\n", + "import pickle\n", + "\n", + "from nano_graphrag._utils import compute_mdhash_id\n", + "from nano_graphrag.entity_extraction.extract import generate_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "WORKING_DIR = \"./nano_graphrag_cache_generate_dspy_examples\"\n", + "\n", + "load_dotenv()\n", + "\n", + "logging.basicConfig(level=logging.WARNING)\n", + "logging.getLogger(\"nano-graphrag\").setLevel(logging.DEBUG)\n", + "\n", + "np.random.seed(1337)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = \"\"\"\n", + " You are a world-class AI system, capable of complex reasoning and reflection. \n", + " Reason through the query, and then provide your final response. 
\n", + " If you detect that you made a mistake in your reasoning at any point, correct yourself.\n", + " Think carefully.\n", + "\"\"\"\n", + "lm = dspy.OpenAI(\n", + " model=\"deepseek-chat\", \n", + " model_type=\"chat\", \n", + " api_key=os.environ[\"DEEPSEEK_API_KEY\"], \n", + " base_url=os.environ[\"DEEPSEEK_BASE_URL\"], \n", + " system_prompt=system_prompt, \n", + " temperature=1.0,\n", + " top_p=1.0,\n", + " max_tokens=4096\n", + ")\n", + "dspy.settings.configure(lm=lm, experimental=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "train_len = 100\n", + "val_len = 100\n", + "dev_len = 200" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(WORKING_DIR, exist_ok=True)\n", + "entity_relationship_trainset_path = os.path.join(WORKING_DIR, \"entity_relationship_extraction_news_trainset.pkl\")\n", + "entity_relationship_valset_path = os.path.join(WORKING_DIR, \"entity_relationship_extraction_news_valset.pkl\")\n", + "entity_relationship_devset_path = os.path.join(WORKING_DIR, \"entity_relationship_extraction_news_devset.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniconda/base/envs/nano-graphrag/lib/python3.10/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by promote_options='default'.\n", + " table = cls._concat_blocks(blocks, axis=0)\n" + ] + } + ], + "source": [ + "fin_news = load_dataset(\"ashraq/financial-news-articles\")\n", + "cnn_news = load_dataset(\"AyoubChLin/CNN_News_Articles_2011-2022\")\n", + "fin_shuffled_indices = np.random.permutation(len(fin_news['train']))\n", + "cnn_train_shuffled_indices = np.random.permutation(len(cnn_news['train']))\n", + "cnn_test_shuffled_indices = np.random.permutation(len(cnn_news['test']))\n", + 
"train_data = cnn_news['train'].select(cnn_train_shuffled_indices[:train_len])\n", + "val_data = cnn_news['test'].select(cnn_test_shuffled_indices[:val_len])\n", + "dev_data = fin_news['train'].select(fin_shuffled_indices[:dev_len])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 100 chunks, 907 entities(duplicated), 667 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:nano-graphrag:Saved 100 examples with keys: ['input_text', 'entities', 'relationships'], filtered 0 examples\n" + ] + } + ], + "source": [ + "train_chunks = {compute_mdhash_id(text, prefix=f\"chunk-trainset-\"): {\"content\": text} for text in train_data[\"text\"]}\n", + "trainset = asyncio.run(generate_dataset(chunks=train_chunks, filepath=entity_relationship_trainset_path))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Example({'input_text': ' (CNN)Former Russian spy Sergei Skripal and his daughter, Yulia, are continuing to recover from a nerve agent attack, and their rehabilitation has been \"slow and extremely painful,\" the daughter said Wednesday. Sergei, 66, and Yulia Skripal were found March 4 slumped on a bench in Salisbury, England, after being exposed to novichok, a military-grade nerve agent.While Yulia Skripal, 33, spent 20 days in a coma before she was released from the hospital in April and taken to a safe location, her father remained hospitalized until last week. \"We are so lucky to have both survived this attempted assassination,\" she said Wednesday from an undisclosed location, her first public appearance since the attack. \"I don\\'t want to describe the details, but the clinical treatment was invasive, painful and depressing. 
Our recovery has been slow and extremely painful.\"JUST WATCHEDYulia Skripal released from hospitalReplayMore Videos ...MUST WATCHYulia Skripal released from hospital 01:48They both are progressing, she said, but her life has been turned upside down, and she\\'s experienced physical and emotional changes, on which she did not elaborate. Read More\"I take one day at a time and want to help care for my father until his full recovery. In the longer term, I hope to return home to my country,\" she said. In requesting privacy and emphasizing that no one speaks for either her or her father, Yulia Skripal thanked the Salisbury District Hospital staff and others involved in their care. She added, \"I\\'m grateful for the offers of assistance from the Russian Embassy, but at the moment I do not wish to avail myself of their services.\"Former Russian spy Sergei Skripal and his daughter, Yulia Skripal, at a restaurant in Salisbury, UK.Lorna Wilkinson, director of nursing at the hospital, said last week that Sergei Skripal would continue his recovery outside the hospital, the UK Press Association reported. The poisoning of the Skripals sparked a diplomatic row between the UK and Russia, which has consistently denied allegations it was behind the poisoningThe UK expelled 23 Russian diplomats who had been declared as unidentified intelligence officers.The United States, Canada, Australia and 18 European Union states kicked out Russian diplomats in a show of support for the UK.Detectives with London\\'s Metropolitan Police believe the Skripals first came into contact with a nerve agent at Sergei Skripal\\'s home.JUST WATCHEDWho is Sergei Skripal? ReplayMore Videos ...MUST WATCHWho is Sergei Skripal? 
01:56In late March, police identified the highest concentration of the nerve agent on the home\\'s front door.Upon Sergei Skripal\\'s release, Russian President Vladimir Putin wished him good health, but questioned the British claim that a military-grade nerve agent was responsible for his illness. \"A military-grade poisonous substance is so powerful that the person dies within seconds or minutes,\" Putin said at a news conference alongside German Chancellor Angela Merkel in Sochi, Russia.\"We repeatedly offered UK authorities our help, and we asked to be given access to the investigation, but there is no response,\" he said. \"Our offer stands.\"CNN\\'s James Masters contributed to this report.', 'entities': [{'entity_name': 'SERGEI SKRIPAL', 'entity_type': 'PERSON', 'description': 'Former Russian spy who was attacked with a nerve agent.', 'importance_score': 1.0}, {'entity_name': 'YULIA SKRIPAL', 'entity_type': 'PERSON', 'description': 'Daughter of Sergei Skripal, also attacked with a nerve agent.', 'importance_score': 0.9}, {'entity_name': 'SALISBURY', 'entity_type': 'LOCATION', 'description': 'City in England where the Skripals were found after the attack.', 'importance_score': 0.8}, {'entity_name': 'UK', 'entity_type': 'LOCATION', 'description': 'Country where the attack on the Skripals took place.', 'importance_score': 0.7}, {'entity_name': 'RUSSIA', 'entity_type': 'LOCATION', 'description': 'Country of origin for Sergei Skripal and the suspected source of the attack.', 'importance_score': 0.8}, {'entity_name': 'VLADIMIR PUTIN', 'entity_type': 'PERSON', 'description': 'Russian President who commented on the Skripal case.', 'importance_score': 0.7}, {'entity_name': 'ANGELA MERKEL', 'entity_type': 'PERSON', 'description': 'German Chancellor who was alongside Putin during a news conference.', 'importance_score': 0.6}, {'entity_name': 'SOCHI', 'entity_type': 'LOCATION', 'description': 'City in Russia where Putin and Merkel held a news conference.', 
'importance_score': 0.5}, {'entity_name': 'NOVICHOK', 'entity_type': 'CHEMICAL', 'description': 'Military-grade nerve agent used in the attack on the Skripals.', 'importance_score': 0.9}], 'relationships': [{'src_id': 'SERGEI SKRIPAL', 'tgt_id': 'YULIA SKRIPAL', 'description': 'Father-daughter relationship between Sergei Skripal and Yulia Skripal.', 'weight': 1.0, 'order': 1}, {'src_id': 'SERGEI SKRIPAL', 'tgt_id': 'SALISBURY', 'description': 'Sergei Skripal was found in Salisbury after the attack.', 'weight': 0.8, 'order': 1}, {'src_id': 'YULIA SKRIPAL', 'tgt_id': 'SALISBURY', 'description': 'Yulia Skripal was found in Salisbury after the attack.', 'weight': 0.8, 'order': 1}, {'src_id': 'SERGEI SKRIPAL', 'tgt_id': 'UK', 'description': 'Sergei Skripal was attacked in the UK.', 'weight': 0.7, 'order': 1}, {'src_id': 'YULIA SKRIPAL', 'tgt_id': 'UK', 'description': 'Yulia Skripal was attacked in the UK.', 'weight': 0.7, 'order': 1}, {'src_id': 'SERGEI SKRIPAL', 'tgt_id': 'RUSSIA', 'description': 'Sergei Skripal is from Russia and the attack is suspected to be from Russia.', 'weight': 0.8, 'order': 2}, {'src_id': 'YULIA SKRIPAL', 'tgt_id': 'RUSSIA', 'description': 'Yulia Skripal is from Russia and the attack is suspected to be from Russia.', 'weight': 0.8, 'order': 2}, {'src_id': 'VLADIMIR PUTIN', 'tgt_id': 'SERGEI SKRIPAL', 'description': 'Vladimir Putin commented on the Skripal case.', 'weight': 0.7, 'order': 2}, {'src_id': 'VLADIMIR PUTIN', 'tgt_id': 'ANGELA MERKEL', 'description': 'Vladimir Putin and Angela Merkel held a news conference together.', 'weight': 0.6, 'order': 1}, {'src_id': 'VLADIMIR PUTIN', 'tgt_id': 'SOCHI', 'description': 'Vladimir Putin and Angela Merkel held a news conference in Sochi.', 'weight': 0.5, 'order': 1}, {'src_id': 'SERGEI SKRIPAL', 'tgt_id': 'NOVICHOK', 'description': 'Sergei Skripal was exposed to novichok in the attack.', 'weight': 0.9, 'order': 1}, {'src_id': 'YULIA SKRIPAL', 'tgt_id': 'NOVICHOK', 'description': 'Yulia Skripal was 
exposed to novichok in the attack.', 'weight': 0.9, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': ' (CNN)Roger Federer thinks the professional tennis circuit won\\'t return for a while due to the coronavirus pandemic but, when the time does come, the Swiss superstar said he would find it difficult to play without fans. While European football\\'s Bundesliga resurfaced last week behind closed doors and Spain\\'s La Liga is set to resume the middle of next month, the last official word from tennis authorities essentially saw all action suspended through July. The hiatus began in March, with Federer already sidelined since he was recuperating from knee surgery. Sunday would have marked the first day of the French Open in its usual spot on the tennis calendar -- in March, though, it was rescheduled for September -- and another grand slam, Wimbledon in July, was called off. \"I\\'m not training at the moment because I don\\'t see a reason for that to be honest,\" Federer told three-time French Open champion Gustavo Kuerten -- who is raising funds for coronavirus relief efforts in his native Brazil -- in a video interview reported by Tennis.com.Read More\"I am happy with my body now and I still believe that the return of the tour\\xa0is a long way off,\" continued the 38-year-old. \"And I think it\\'s important\\xa0mentally to enjoy this break, having played so much tennis.\\xa0\"When I\\'m getting towards returning and have a goal to train for, I think I will be super motivated.\"We should be sliding into @rolandgarros right now, thinking of our mates in Paris 👋 pic.twitter.com/0PLKryyIjj— #AusOpen (@AustralianOpen) May 24, 2020 Federer is arguably tennis\\' best supported player ever, and the prospect of competing without spectators doesn\\'t appeal to him. \"Most of the time when we are training, there is no one,\" said the men\\'s record 20-time grand slam champion. \"For us, of course, it is possible to play if there are no fans. 
But on the other hand, I really hope that the circuit can return as it normally is. \"And hold off till the time is appropriate, minimum\\xa0a third of the stadium or half full. But, for me, completely empty when playing big tournaments is very difficult.\"Federer has been active on social media during the lockdown, sparking a public discussion on the merging of the men\\'s and women\\'s tours with a tweet last month and embarking on a funny Instagram Live with tennis rival Rafael Nadal.Nadal, unlike Federer, has started practicing, though only very recently. The Spaniard would have been favored to win a 20th major and tie Federer had the French Open been played as usual given he has collected a record 12 titles at Roland Garros. Here I am, the first pictures I am posting for you on court. This is my practice earlier today at @rnadalacademy #BackOnCourt #BabolatFamily 🎾👍🏻💪🏻😉 pic.twitter.com/x7tzgLj9pc— Rafa Nadal (@RafaelNadal) May 22, 2020 The next grand slam is scheduled to be late August\\'s US Open in New York, with organizers expected to announce in June if it will go ahead. 
', 'entities': [{'entity_name': 'ROGER FEDERER', 'entity_type': 'PERSON', 'description': \"Swiss tennis superstar who thinks the professional tennis circuit won't return for a while due to the coronavirus pandemic.\", 'importance_score': 1.0}, {'entity_name': 'GUSTAVO KUERTEN', 'entity_type': 'PERSON', 'description': 'Three-time French Open champion who is raising funds for coronavirus relief efforts in his native Brazil.', 'importance_score': 0.8}, {'entity_name': 'RAFAEL NADAL', 'entity_type': 'PERSON', 'description': 'Tennis rival of Roger Federer who has started practicing recently.', 'importance_score': 0.9}, {'entity_name': 'FRENCH OPEN', 'entity_type': 'EVENT', 'description': 'A grand slam tennis tournament that was rescheduled for September.', 'importance_score': 0.7}, {'entity_name': 'WIMBLEDON', 'entity_type': 'EVENT', 'description': 'A grand slam tennis tournament in July that was called off.', 'importance_score': 0.7}, {'entity_name': 'US OPEN', 'entity_type': 'EVENT', 'description': 'A grand slam tennis tournament scheduled for late August in New York.', 'importance_score': 0.7}, {'entity_name': 'ROLAND GARROS', 'entity_type': 'LOCATION', 'description': 'The location where the French Open is usually held.', 'importance_score': 0.6}], 'relationships': [{'src_id': 'ROGER FEDERER', 'tgt_id': 'GUSTAVO KUERTEN', 'description': 'Roger Federer discussed the return of the tennis tour with Gustavo Kuerten in a video interview.', 'weight': 0.8, 'order': 1}, {'src_id': 'ROGER FEDERER', 'tgt_id': 'RAFAEL NADAL', 'description': 'Roger Federer has engaged in social media discussions and a funny Instagram Live with tennis rival Rafael Nadal.', 'weight': 0.9, 'order': 1}, {'src_id': 'ROGER FEDERER', 'tgt_id': 'FRENCH OPEN', 'description': 'Roger Federer mentioned the rescheduling of the French Open in September.', 'weight': 0.7, 'order': 1}, {'src_id': 'ROGER FEDERER', 'tgt_id': 'WIMBLEDON', 'description': 'Roger Federer mentioned the cancellation of Wimbledon.', 
'weight': 0.7, 'order': 1}, {'src_id': 'RAFAEL NADAL', 'tgt_id': 'FRENCH OPEN', 'description': 'Rafael Nadal would have been favored to win the French Open if it had been played as usual.', 'weight': 0.8, 'order': 1}, {'src_id': 'RAFAEL NADAL', 'tgt_id': 'US OPEN', 'description': \"Rafael Nadal's practice is in preparation for the upcoming US Open.\", 'weight': 0.7, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': 'London (CNN)Top archbishops in the Church of England have apologized for guidance issued by the church last week that said only married heterosexuals should have sex -- while same-sex or heterosexual Christians in civil partnerships should remain abstinent. Archbishop of Canterbury Justin Welby and Archbishop of York John Sentamu said in a statement Thursday that they took responsibility for last week\\'s announcement, \"which we acknowledge has jeopardized trust.\" \"We are very sorry and recognize the division and hurt this has caused,\" Welby and Sentamu wrote.The statement stopped short of retracting the \"pastoral guidance\" issued by the bishops of the Church of England, which was in response to the extension of civil partnerships to heterosexual couples in the UK.Church of England backs climate-friendly stock index that excludes BP and Exxon MobilThe guidance, published January 22, said \"for Christians, marriage -- that is the lifelong union between a man and a woman, contracted with the making of vows -- remains the proper context for sexual activity.\"Read MoreWhen it comes to civil partnerships, the church sought \"to affirm the value of committed, sexually abstinent friendships.\" It highlighted division between conservatives -- who want the church to stick to a traditional biblical teachings on marriage -- and the church\\'s progressive members. The guidance was described as \"a laughingstock to a nation that believes it (the church) is obsessed with sex,\" in an open letter to the archbishops. 
The letter, signed by more than 3,000 people, including dozens of members of the clergy, said the bishops\\' guidance \"has significantly damaged the mission of the church and it has broken the trust of those it seeks to serve.\"It goes on to express dismay that a public pronouncement on sex and marriage was made while the church was still undergoing a review of of the issue, called the \"Living in Love and Faith\" project. The letter said its signatories had not expected an announcement on the matter until the project\\'s report was published. \"It seems our trust has been misplaced and we feel badly let down.\" ', 'entities': [{'entity_name': 'LONDON', 'entity_type': 'LOCATION', 'description': 'The capital city of the United Kingdom, mentioned as the location of the news source.', 'importance_score': 0.8}, {'entity_name': 'JUSTIN WELBY', 'entity_type': 'PERSON', 'description': \"The Archbishop of Canterbury, who apologized for the church's guidance on sexual activity.\", 'importance_score': 0.9}, {'entity_name': 'JOHN SENTAMU', 'entity_type': 'PERSON', 'description': \"The Archbishop of York, who also apologized for the church's guidance on sexual activity.\", 'importance_score': 0.9}, {'entity_name': 'CHURCH OF ENGLAND', 'entity_type': 'ORGANIZATION', 'description': 'The established church of England, which issued controversial guidance on sexual activity.', 'importance_score': 0.9}, {'entity_name': 'JANUARY 22', 'entity_type': 'DATE', 'description': 'The date when the pastoral guidance was published by the Church of England.', 'importance_score': 0.7}], 'relationships': [{'src_id': 'JUSTIN WELBY', 'tgt_id': 'CHURCH OF ENGLAND', 'description': \"Justin Welby, as the Archbishop of Canterbury, is responsible for the Church of England's guidance on sexual activity.\", 'weight': 0.9, 'order': 1}, {'src_id': 'JOHN SENTAMU', 'tgt_id': 'CHURCH OF ENGLAND', 'description': \"John Sentamu, as the Archbishop of York, is responsible for the Church of England's guidance on 
sexual activity.\", 'weight': 0.9, 'order': 1}, {'src_id': 'CHURCH OF ENGLAND', 'tgt_id': 'JUSTIN WELBY', 'description': 'The Church of England issued guidance on sexual activity under the leadership of Justin Welby.', 'weight': 0.9, 'order': 1}, {'src_id': 'CHURCH OF ENGLAND', 'tgt_id': 'JOHN SENTAMU', 'description': 'The Church of England issued guidance on sexual activity under the leadership of John Sentamu.', 'weight': 0.9, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': ' (CNN)Germans will file into polling stations to vote in an unpredictable federal election on Sunday -- but for the first time in nearly two decades, their longtime leader won\\'t be in contention.Chancellor Angela Merkel has been a symbol of stability in Europe since she took on the role in 2005; the chemist-turned-political mainstay has withstood a wave of populism, a financial crisis, a pandemic and Brexit to carve out an impressive legacy as the world\\'s most successful female leader.But Merkel, 67, will step down once the repercussions of Sunday\\'s vote become clear, a move that has cast a sense of uncertainty over the weekend\\'s election.Compared to the previous votes in 2017 and 2013, there is \"much more chance of a significant shift in German politics and policy after the election,\" according to Pepijn Bergsen, a research fellow who monitors the country for international think tank Chatham House.The race to become Merkel\\'s successor is tight, and the ultimate victor may not be known for days or even weeks after polls close. Read MoreBut for the first time in a generation, Germans will be deciding what post-Merkel Germany will look like. 
Whoever they turn to will face a catalog of challenges, both at home and abroad.Merkel has provided a steady hand domestically and abroad, but Germans must now decide on her successor.Where is Merkel?Merkel\\'s departure from the front lines of global politics has been a long time coming; she first announced in 2018 that she would not seek re-election at the end of her term, following a series of setbacks in regional elections.In her time in office, she has dealt with five UK prime ministers, four French presidents, seven Italian prime ministers, and four American commanders-in-chief. Her period in power has been a remarkably eventful one, and Merkel\\'s imperturbable presence throughout has earned her an international reputation for stability and level-headedness. \"That worked very well politically for her in Germany, and on the world stage,\" Bergsen told CNN. \"Germany has done very well over the last 15 years from an economic perspective ... (and) Germany didn\\'t do that badly during the financial crisis, but the realization has crept in that that won\\'t last.\"The European refugee crisis of the mid-2010s proved a major challenge to Merkel\\'s party, the Christian Democratic Union (CDU), and she has also earned detractors over her close relationship with China. But after a pandemic which saw Germany fare better than many of its neighbors, analysts and polling suggest Merkel will leave office with the respect of most Germans. \"She\\'s seen very positively in Germany, because she\\'s associated with stability -- people know what they\\'re getting,\" said Ben Schreer, from the International Institute for Strategic Studies\\' (IISS) Berlin-based Europe office.Who\\'s in the race to replace her?German politics is dominated by two parties -- the center-right CDU and the left-leaning Social Democratic Party, or SPD -- who have governed together in a coalition for the past eight years. 
But other parties have grown in popularity over the past decade as the CDU and SPD have lost ground. This election is particularly close; the CDU and SPD have both held polling advantages, and the Green Party has also emerged as a serious contender.Merkel\\'s successor at the helm of the CDU is Armin Laschet, 60, a long-time ally of the Chancellor and the party\\'s deputy leader since 2012. A devout Catholic whose father was at one point a coal mining engineer, he was selected as the party\\'s candidate after a torturous leadership tussle.Laschet has a background in law and journalism, and was elected to the German Bundestag in 1994.Laschet won a protracted leadership campaign to replace Merkel, but he is struggling to attract voters on the national stage.Merkel has voiced her support for Laschet, but despite her efforts to persuade Germans to stick with the CDU, polling suggests her replacement as the party\\'s leader has struggled to win over Germans.His foremost opponent is the SPD\\'s Olaf Scholz, who has taken a surprise lead in the polls in recent weeks, leaving him as the marginal frontrunner heading into Sunday\\'s vote. Like Laschet, Scholz has a long history as a political player in Germany. He has been Merkel\\'s finance minister and vice chancellor since 2018, placing him arguably in a better position to run as her natural successor than her own party\\'s candidate.Scholz has earned increased visibility as he navigated Germany\\'s economic response to the pandemic, and cleared the last electoral hurdle with an assured performance in the final television debate. 
But polls nonetheless suggest a huge number of undecided voters late in the campaign, increasing the unpredictability of the vote.The Green Party\\'s leader Annalena Baerbock caused a brief sensation in German politics when she surged in the polls early in the campaign, prompting voters to wonder whether she could become the country\\'s first ever Green chancellor.Olaf Scholz has taken a surprise lead in polling in recent weeks.A 40-year-old former professional trampolinist, Baerbock stands out in a field of mostly male political leaders. And although her star has faded somewhat in the closing stretch, she has capitalized on voters\\' climate concerns to establish her group as the third party in the race. The far-right AfD remains a stubborn presence on the political scene, scrapping with the liberal Free Democratic Party for fourth place. The refugee crisis that sparked the AfD\\'s surge in German politics has subsided as a pressing political issue, but the party remains an outlet for voters angered by immigration issues. In March, they became the first German party since the Nazi era to be put under government surveillance. How does the voting work?German elections to the Bundestag are run on a system of proportional representation, meaning that each party\\'s vote share relates directly to how many seats they get in parliament. That principle makes it virtually impossible for a party to lead a government alone; coalitions must instead be formed after the vote, and these often contain more than two groups.Many Germans have already cast their ballots; the pandemic has increased the amount of postal voting that took place before polling day. Regardless of how they choose to vote, Germans are asked to pick their local lawmaker, and also their preferred overall party. 
Once the results come in, a race will start to put together enough seats to govern -- meaning smaller parties can become kingmakers.\"Whoever wins on paper on Sunday night probably can\\'t be sure that he or she will actually lead the government, because there\\'s going to be so many permutations,\" Schreer explained, adding: \"We may not know until November, if we\\'re lucky.\"What are the issues?All the candidates are caught in a Merkel-sized conundrum, as they attempt to define their own agendas while allaying Germans\\' fears over a change in leadership. Climate change has been a major factor in the country\\'s national debate, particularly after devastating flooding hit the country in July. In Canada and Germany, the climate crisis is finally on the ballot. But can it win?A push from Merkel has put environmental issues at the heart of German politics, and virtually all parties have stressed their green credentials. In this campaign the Green Party has called for a 70% cut in greenhouse gas emissions from 1990 levels by 2030, compared to the current government goal of a 55% cut.Economic worries have also come to the fore; in a last-gasp pitch to voters, Laschet said Monday that a left-wing coalition led by the SPD would cause a \"severe economic crisis,\" Reuters reported. Laschet has also followed Merkel\\'s line regarding the European Union; in the final pre-election debate, he emphasized European cohesion as one of his flagship policies. 
But the campaign has been mostly defined by domestic matters; a minimum wage hike and pension reforms are at the heart of Scholz\\'s campaign, and he stressed those plans again in the debate.Will a Merkel-less Germany still lead on the world stage?The global consequences of Sunday\\'s vote are clear; Merkel\\'s longevity saw her become Europe\\'s de facto leader, and it\\'s unclear whether her successor will fill the same role.\"Germany\\'s going to be faced with some significant foreign policy challenges which the new government has to take on,\" Schreer said. \"The question is, who\\'s going to replace (Merkel), and will that person have the same charisma and ability that she did?\" he added. \"Allies are skeptical, and Germans as well are quite cautious in that regard.\"A key part of Merkel\\'s role was her steadfast determination to maintain European cohesion and paper over the cracks between EU member states. Merkel has outlasted dozens of major leaders during her 16-year period in power.\"Macron will try to usurp Merkel\\'s position in Europe,\" predicted Bergsen, signaling a possible shift in the balance of power towards France, Germany\\'s western neighbor. \"The German position won\\'t necessarily change, but whoever now comes to power will have to deal with a broader (domestic) coalition so they will find it slightly harder to lead on the international stage.\"Looking further afield, Germany\\'s new leader will also have to balance the country\\'s relationships with the United States and China, two nations with whom Merkel attempted to maintain close ties.And keeping the United Kingdom close after its departure from the EU is key. \"The UK remains an important partner in strategic terms, and Germany knows that if the UK isn\\'t engaged in the European continent, then you will split the Europeans,\" said Schreer.\"(Germany) is a well-respected country at the international stage -- that is undoubtedly the case,\" he added. 
\"The question is: Does that now enable Germany to weather those international storms that are certainly coming?\"', 'entities': [{'entity_name': 'ANGELA MERKEL', 'entity_type': 'PERSON', 'description': 'Chancellor of Germany, symbol of stability in Europe, chemist-turned-political mainstay', 'importance_score': 1.0}, {'entity_name': 'GERMANY', 'entity_type': 'LOCATION', 'description': 'Country where the federal election is taking place', 'importance_score': 0.9}, {'entity_name': 'CHRISTIAN DEMOCRATIC UNION (CDU)', 'entity_type': 'POLITICAL_PARTY', 'description': \"Merkel's party, faced challenges during the European refugee crisis\", 'importance_score': 0.8}, {'entity_name': 'ARMIN LASCHET', 'entity_type': 'PERSON', 'description': \"Merkel's successor at the helm of the CDU, long-time ally of the Chancellor\", 'importance_score': 0.8}, {'entity_name': 'OLAF SCHOLZ', 'entity_type': 'PERSON', 'description': \"SPD's candidate, took a surprise lead in the polls, Merkel's finance minister and vice chancellor\", 'importance_score': 0.8}, {'entity_name': 'SOCIAL DEMOCRATIC PARTY (SPD)', 'entity_type': 'POLITICAL_PARTY', 'description': 'Left-leaning party, governed together with CDU in a coalition', 'importance_score': 0.7}, {'entity_name': 'ANNALENA BAERBOCK', 'entity_type': 'PERSON', 'description': \"Green Party's leader, former professional trampolinist\", 'importance_score': 0.7}, {'entity_name': 'GREEN PARTY', 'entity_type': 'POLITICAL_PARTY', 'description': 'Serious contender in the election, focused on climate change', 'importance_score': 0.7}, {'entity_name': 'AFD', 'entity_type': 'POLITICAL_PARTY', 'description': 'Far-right party, remains a presence on the political scene', 'importance_score': 0.6}], 'relationships': [{'src_id': 'ANGELA MERKEL', 'tgt_id': 'ARMIN LASCHET', 'description': \"Merkel's successor at the helm of the CDU\", 'weight': 0.9, 'order': 1}, {'src_id': 'ANGELA MERKEL', 'tgt_id': 'OLAF SCHOLZ', 'description': \"Merkel's finance minister and vice 
chancellor\", 'weight': 0.8, 'order': 1}, {'src_id': 'ARMIN LASCHET', 'tgt_id': 'CHRISTIAN DEMOCRATIC UNION (CDU)', 'description': 'Laschet is the candidate for the CDU', 'weight': 0.9, 'order': 1}, {'src_id': 'OLAF SCHOLZ', 'tgt_id': 'SOCIAL DEMOCRATIC PARTY (SPD)', 'description': 'Scholz is the candidate for the SPD', 'weight': 0.9, 'order': 1}, {'src_id': 'ANNALENA BAERBOCK', 'tgt_id': 'GREEN PARTY', 'description': 'Baerbock is the leader of the Green Party', 'weight': 0.9, 'order': 1}, {'src_id': 'GERMANY', 'tgt_id': 'CHRISTIAN DEMOCRATIC UNION (CDU)', 'description': 'CDU is a major political party in Germany', 'weight': 0.8, 'order': 1}, {'src_id': 'GERMANY', 'tgt_id': 'SOCIAL DEMOCRATIC PARTY (SPD)', 'description': 'SPD is a major political party in Germany', 'weight': 0.8, 'order': 1}, {'src_id': 'GERMANY', 'tgt_id': 'GREEN PARTY', 'description': 'Green Party is a serious contender in the German election', 'weight': 0.7, 'order': 1}, {'src_id': 'GERMANY', 'tgt_id': 'AFD', 'description': 'AfD is a far-right party in Germany', 'weight': 0.6, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': 'Story highlightsBrazil beat Spain 3-0 in Confederations Cup finalBrazilian striker Neymar named as the best player of Confederations Cup Tournament permeated by protests over staging of 2014 World Cup in BrazilBrazil\\'s 3-0 win over Spain in the Confederations Cup final brought to an end an event that was designed as a test run for the 2014 World Cup hosts.It was a tournament that was permeated by social unrest, with protesters partly unhappy over the degree of public money devoted to the staging of the World Cup.CNN looks at five things we learned from the event.1. Neymar is the real dealNeymar has been touted as the latest in a line of Brazilian superstars that stretches all the way back to the great Pele, but outside Brazil many were beginning to believe the hype surrounding the young star was just that. 
Prior to the Confederations Cup his performances for the national team had been lacklustre, particular in the 2-2 draw against Chile in April, when he was singled out for jeers by Brazil\\'s own fans.JUST WATCHEDTour Brazilian football\\'s spiritual homeReplayMore Videos ...MUST WATCHTour Brazilian football\\'s spiritual home 02:24JUST WATCHEDBrazil wins Confederations CupReplayMore Videos ...MUST WATCHBrazil wins Confederations Cup 01:57JUST WATCHEDSpotlight on Paulinho ReplayMore Videos ...MUST WATCHSpotlight on Paulinho 00:46JUST WATCHEDThiago Silva targets PSG dominance ReplayMore Videos ...MUST WATCHThiago Silva targets PSG dominance 03:07Read: Maracana magic by Brazil to claim Confederations CupWhile most would argue that he\\'s in no position to judge, Marseille midfielder Joey Barton\\'s Tweet likening Neymar to Justin Bieber, and something rather unpleasant relating to cats, seemed to sum up the views of many European observers.Following the Brazilian\\'s $75 million transfer to Barcelona -- just before the tournament kicked off -- some were even suggesting the Catalan club had been fleeced by Brazilian side Santos. Not any more: this was no tantalizing glimpse of what La Liga fans might be watching next season, it was incontrovertible proof of what Neymar can bring. The 21-year-old produced a string of vibrant, flamboyant, but above all effective performances throughout the tournament, his goals and guile serving notice that he is every inch a superstar.2. Time may be up for Buffon and companyAfter winning friends at Euro 2012, the Azzurri continued to show that the days of \"catenaccio\" are long gone, with some fast attacking football. 
Italy\\'s run in the tournament was certainly invigorating, lurching from near disaster in an extraordinary first-half capitulation against Japan, to a sublime (albeit goalless) first-half domination of Spain.Read: Spain edge past Italy In the end they were well worth their third place, but it could and possibly should have been more. Gianluigi Buffon is widely and justifiably regarded as one of the world\\'s finest goalkeepers; yet his penalty saves in the third place play-off against Uruguay masked his culpability in goals conceded at key moments in the tournament.This was not the dominant, steely-eyed Buffon of old -- particularly against Brazil, where he looked anything but solid. Likewise, there are other old hands whose time in the famous blue shirt may be nearing its end. Given Cesare Prandelli\\'s reluctance to start AC Milan\\'s exciting midfielder Stephan El Shaarawy, major surgery is unlikely. Photos: Brazil\\'s most painful moment Photos: Brazil\\'s most painful momentA national tragedy – Moacyr Barbosa Nascimento\\'s life was forever changed after the 1950 World Cup. With Brazil needing just a draw against Uruguay in its final game to lift the trophy for the first time, the team lost 2-1 and he was blamed for the second goal. The goalkeeper\\'s perceived mistake haunted him. Twenty years later he overheard a woman in a supermarket say to her son, \"There is the man who made Brazil cry.\"Hide Caption 1 of 8 Photos: Brazil\\'s most painful momentThe Maracana – The Maracana Stadium in Rio de Janeiro was the venue for the 1950 final, with 200,000 spectators packed into the purpose-built arena. The stadium has been redeveloped and a crowd of 78,000 people will watch the final of 2014 World Cup at the iconic ground.Hide Caption 2 of 8 Photos: Brazil\\'s most painful momentBrazil\\'s golden boy – All eyes will be on Neymar during both June\\'s Confederations Cup and next year\\'s World Cup. 
The attacker, who recently signed for Barcelona in a deal reportedly worth in excess of $80 million, is Brazil\\'s star player and must perform to his best if \"La Selecao\" are to satisfy an expectant public.Hide Caption 3 of 8 Photos: Brazil\\'s most painful momentThe greatest ever? – The Brazil team of 1970, which beat Italy 4-1 in the World Cup final in Mexico, is widely regarded as the greatest of all time. Pele, a three-time World Cup winner seen here leaping on his teammates, says Brazil must recover from the failure of 63 years ago.Hide Caption 4 of 8 Photos: Brazil\\'s most painful momentThree-peat – Carlos Alberto, captain of the 1970 team, lifts the Jules Rimet trophy which Brazil was allowed to keep after becoming the first nation to win the World Cup three times. The former fullback thinks next year\\'s World Cup will come too soon for Brazil\\'s inexperienced team.Hide Caption 5 of 8 Photos: Brazil\\'s most painful momentThe second coming – Luiz Felipe Scolari was the coach of the last Brazil team to lift the World Cup, in Japan and South Korea in 2002. The veteran has been reappointed in a bid to inject life into an ailing Brazil team. His results have so far left much to be desired: two wins, one defeat and four draws since November 2012.Hide Caption 6 of 8 Photos: Brazil\\'s most painful momentGrand reopening – England was Brazil\\'s first opponent at a refurbished Maracana earlier this month. A half-volley from midfielder Paulinho, pictured, rescued a 2-2 draw for the 2014 World Cup host.Hide Caption 7 of 8 Photos: Brazil\\'s most painful momentA flourish against France – Brazil\\'s most recent match, the last before the Confederations Cup starts, ended in a comfortable 3-0 defeat of France. 
A penalty from Lucas Moura, right, completed the scoring.Hide Caption 8 of 8 Photos: Brazil\\'s greatest footballers Photos: Brazil\\'s greatest footballersHeleno – Actor Rodrigo Santoro signs a poster for the film \"Heleno\", in which he plays the mercurial striker. A destructive personality, together with illness and drug problems prevented Heleno from becoming one of Brazil\\'s greatest ever players. But he helped pave the way for some of the world\\'s greatest soccer icons...Hide Caption 1 of 8 Photos: Brazil\\'s greatest footballersPele – Ask many Brazilians who is the greatest footballer of all time and their answer will be simple: \"Pele.\" The striker won three World Cups with Brazil between 1958 and 1970 and is his country\\'s leading goalscorer with 77 goals from 92 caps.Hide Caption 2 of 8 Photos: Brazil\\'s greatest footballersGarrincha – Most football fans would say Argentina\\'s Diego Maradona is the only player who can rival Pele for the title of greatest ever. In Brazil, however, Garrincha is regarded as the only player who comes close to the great man. The tricky winger was a key part of Brazil\\'s World Cup triumphs in 1958 and 1962. Sadly, Garrincha struggled with alcohol problems and died of liver cirrhosis aged 49.Hide Caption 3 of 8 Photos: Brazil\\'s greatest footballersZico – After a Pele-inspired triumph in 1970, Brazil would wait 24 years before lifting the World Cup again. Although the 1980s was a barren decade in terms of trophies for Brazil, the team which the South Americans sent to the 1982 World Cup is heralded as one of the most entertaining in history. Central to its free-flowing, attacking style was Zico, a midfielder of considerable craft and guile who collected 72 caps between 1976 and 1988.Hide Caption 4 of 8 Photos: Brazil\\'s greatest footballersRomario – When Brazil finally won the World Cup for a fourth time in 1994 in the U.S., the team was derided by some for being too functional. 
In a team short of star quality, striker Romario was the shining light, scoring five goals as Brazil lifted the trophy thanks to a penalty-shootout victory over Italy.Hide Caption 5 of 8 Photos: Brazil\\'s greatest footballersRonaldo – Ronaldo watched on as Romario fired Brazil to victory in 1994, four years later he was the star man at France 1998. Brazil lost the final 3-0 to the hosts, with mystery surrounding their starting 11 as Ronaldo was left out of, then reinstated to, the team for the deciding match at the Stade de France. Ronaldo\\'s redemption arrived in 2002, when he scored both goals as Brazil beat Germany 2-0 to lift the World Cup for a fifth time.Hide Caption 6 of 8 Photos: Brazil\\'s greatest footballersRonaldinho – While Ronaldo was the star man in Japan and South Korea, he was ably supported by flamboyant playmaker Ronaldinho. Ronaldinho\\'s performance in the World Cup earned him a move to Barcelona in 2003, where he went on to win the European Champions League in 2006. He was twice named FIFA World Player of the Year.Hide Caption 7 of 8 Photos: Brazil\\'s greatest footballersNeymar? – The latest Brazilian tipped for stardom is Neymar, who recently followed in Ronaldinho\\'s footsteps by joining Barcelona. All eyes will be on the forward when Brazil host the World Cup in 2014. Neymar has made a good start to Brazil\\'s Confederations Cup campaign, scoring two goals in two matches.Hide Caption 8 of 8 Photos: Confederations Cup: Brazil beats Mexico as Fortaleza protests Photos: Confederations Cup: Brazil beats Mexico as Fortaleza protestsGolden Boy – Hours after declaring himself saddened by the need for protests against Brazil\\'s social conditions, Neymar brought joy to his compatriots with the opening goal in a 2-0 win over Mexico. 
Hide Caption 1 of 5 Photos: Confederations Cup: Brazil beats Mexico as Fortaleza protestsNo Ordinary Game – A family of four negotiate their way to the game as riot police prevent protesters from gaining access to Fortaleza\\'s Castelao Stadium. Hide Caption 2 of 5 Photos: Confederations Cup: Brazil beats Mexico as Fortaleza protestsStalemate – Protesters confront riot police officers on the distant outskirts of the Castelao Stadium, which has been newly built for next year\\'s World Cup at a cost of $240 million.Hide Caption 3 of 5 Photos: Confederations Cup: Brazil beats Mexico as Fortaleza protestsLone Marksman – A police officer holds his weapon as protesters continue their demonstration ahead of Brazil\\'s Group A match with Mexico in the ongoing Confederations Cup.Hide Caption 4 of 5 Photos: Confederations Cup: Brazil beats Mexico as Fortaleza protestsPower Play – Fans hold up banners - which state that they are protesting against corruption, rather than the national team - ahead of Brazil\\'s 2-0 win over Mexico.Hide Caption 5 of 5The Italian national side has always been resistant to change, as its ageing spine suggests, but it may need to pick up the pace of its evolution. The likes of Buffon and the peerless Andrea Pirlo may have one more big tournament in them, but Prandelli would do well to blood some fresh talent in the meantime, just in case -- especially given the physical demands next year\\'s World Cup will present. 3. Protests are a wake-up call to FIFA and BrazilFor a country that is synonymous with football, there were times during this tournament when Brazil seemed to have fallen well and truly out of love with the game -- or at least, with FIFA and the Brazilian government\\'s interpretation of what a World Cup should look like. As simmering social unrest threatened to boil into something more serious, the tournament\\'s detractors grew in volume and number. 
Even former star striker Romario joined a critical chorus that cited ticket prices, infrastructure costs and a questionable legacy as reasons why the Confederations Cup, and next year\\'s World Cup, were bad for Brazil. Read: A fair World Cup deal for BrazilIn anticipation of trouble, a reported 10,000 police were on duty in Rio de Janeiro ahead of Brazil\\'s clash with Spain, but in the end protests were relatively low-key. Following their team\\'s emphatic triumph over the world and European champions, crowds spilled out onto the street and into a carnival atmosphere.Clearly this story is not over, and there is much work to do; for now, however, football\\'s primacy has been restored. In fact there was much cause for optimism on the pitch. This was a well-deserved win from a team that looks to be gaining in stature, and the atmosphere inside the Maracana for the final was profoundly imposing.Of course Brazil won the 2009 Confederations Cup before limping out in the quarterfinals in South Africa; but this time they will be at home. The World Cup will be a tougher test; but don\\'t bet against them.4. Spain needs a re-bootFinally, after a seemingly interminable period of dominance, Spain\\'s champions look to have been found out. For clues as to how, it is hard to see beyond the Germans. Italy really should have beaten the Spaniards after playing them off the park for significant chunks of their semi-final; Brazil\\'s 3-0 defeat of the world champions was a muscular and ruthless final execution.Both owed much to the approaches of Bayern Munich and Borussia Dortmund in their UEFA Champions League defeats of Barcelona and Real Madrid.Spain were hustled and hassled, with attempts to impose their intricate passing game met by fiercely committed opponents, closing them down until the Europeans simply folded.Fernando Torres only won the golden boot because of his goals against Tahiti\\'s part-timers. 
Xavi and Iniesta are still fabulously creative, but elsewhere Spain\\'s weak links were exploited with a kind of physicality this team now seems incapable of countering. Drained of confidence, or possibly just appetite, they looked listless and in dire need of fresh ideas. The sight of a defender, Sergio Ramos, taking a penalty that could have brought them back into the final spoke volumes. True Brazil had had an extra day of rest before the final, but the mask of invincibility has slipped; Spain has 12 months to work out how to set it back in place.5. The Confederations Cup comes of ageIn theory at least, the Confederations Cup is the unloved second cousin of the World Cup, the Euros and the Copa America. Traditionally these games are seen by the cynics as little more than jumped up exhibition matches -- a mere aperitif before the main meal of the World Cup. By and large, no one really cares who wins. This time, however, something seemed to click. Maybe it was the location. Brazil may have some serious issues to confront, but there is something about the host country that elevated this tournament onto a different plane.In football terms, this is about as educated as any crowd can get. The enthusiastic way in which local fans adopted teams such as Japan, Uruguay, Tahiti and Italy conveyed atmosphere and meaning to fixtures where normally none would exist, and more than made up for the absence of traveling support. The outcome, for the neutrals at least, was first class entertainment. Brazil 2013 served up some of the most memorable international games of recent memory, played in front of passionate crowds in some spectacular arenas. Read: Brazil\\'s beautiful game?If Brazil 2014 can pick up where this rehearsal left off, then we are truly in for a treat.', 'entities': [{'entity_name': 'BRAZIL', 'entity_type': 'LOCATION', 'description': 'Brazil is a country that hosted the Confederations Cup and is preparing to host the 2014 World Cup. 
It experienced social unrest during the Confederations Cup due to public money being devoted to the World Cup.', 'importance_score': 1.0}, {'entity_name': 'SPAIN', 'entity_type': 'LOCATION', 'description': 'Spain is a country whose national football team was defeated 3-0 by Brazil in the Confederations Cup final. The Spanish team is known for its dominance in international football.', 'importance_score': 0.9}, {'entity_name': 'NEYMAR', 'entity_type': 'PERSON', 'description': 'Neymar is a Brazilian footballer who was named the best player of the Confederations Cup. He is considered a rising superstar in the football world and recently transferred to Barcelona for $75 million.', 'importance_score': 1.0}, {'entity_name': 'CONFEDERATIONS CUP', 'entity_type': 'EVENT', 'description': 'The Confederations Cup is an international football tournament held in Brazil, serving as a test run for the 2014 World Cup. It was marked by social unrest and protests.', 'importance_score': 0.9}, {'entity_name': '2014 WORLD CUP', 'entity_type': 'EVENT', 'description': 'The 2014 World Cup is an upcoming international football tournament to be hosted by Brazil. 
It is preceded by the Confederations Cup, which highlighted social issues and protests.', 'importance_score': 1.0}], 'relationships': [{'src_id': 'BRAZIL', 'tgt_id': 'SPAIN', 'description': 'Brazil defeated Spain 3-0 in the Confederations Cup final, showcasing their strength as potential hosts of the 2014 World Cup.', 'weight': 1.0, 'order': 1}, {'src_id': 'NEYMAR', 'tgt_id': 'CONFEDERATIONS CUP', 'description': 'Neymar was named the best player of the Confederations Cup, demonstrating his skill and potential as a future football superstar.', 'weight': 1.0, 'order': 1}, {'src_id': 'CONFEDERATIONS CUP', 'tgt_id': '2014 WORLD CUP', 'description': 'The Confederations Cup served as a test run for the 2014 World Cup, highlighting both the potential and the challenges faced by Brazil as hosts.', 'weight': 0.9, 'order': 2}]}) (input_keys={'input_text'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainset[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 99 chunks, 901 entities(duplicated), 631 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 400 Bad Request\"\n", + "ERROR:nano-graphrag:Error in TypedEntityRelationshipExtractor: Error code: 400 - {'error': {'message': 'Content Exists Risk', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 100 chunks, 901 entities(duplicated), 631 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:nano-graphrag:Saved 99 examples with keys: ['input_text', 'entities', 'relationships'], filtered 1 examples\n" + ] + } + ], + "source": [ + 
"val_chunks = {compute_mdhash_id(text, prefix=f\"chunk-valset-\"): {\"content\": text} for text in val_data[\"text\"]}\n", + "valset = asyncio.run(generate_dataset(chunks=val_chunks, filepath=entity_relationship_valset_path))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Example({'input_text': 'Story highlightsPortugal qualifies for Euro 2016 finalDefeats Wales 2-0 in semifinalCristiano Ronaldo and Nani on targetFrance or Germany awaits in the final (CNN)Cristiano Ronaldo proved himself the man for the big occasion once again by firing Portugal into the final of Euro 2016 Wednesday.Ronaldo\\'s second-half header and a strike by his former Manchester United teammate Nani secured a 2-0 win against Wales in Lyon.Follow @cnnsport\\n\\nPortugal, which last reached the final of the competition in 2004, will face either host nation France or world champion Germany in Paris on Sunday.This contest had been billed as clash between two of the world\\'s top players -- Ronaldo and his Real Madrid teammate Gareth Bale.But Bale, who has enjoyed a stellar tournament for a country which had never before qualified for the tournament, failed to match the exploits of his rival.Read MoreRonaldo, who equaled Michel Platini\\'s record of scoring nine career goals at the Euros, will now hope to exorcise the ghosts of 2004 when Portugal was beaten by Greece in a final held in its own backyard. When you know you\\'re going to Paris. 🎉 #PORWAL #EURO2016 pic.twitter.com/7FXah0E6QE— UEFA EURO 2016 (@UEFAEURO) July 6, 2016\\n\"I hope that [after the final] we\\'ll be smiling and that it will be tears of joy in the end,\" Ronaldo told the tournament\\'s official website.\"I\\'ve always said my dream was to win a trophy with Portugal. We\\'re closer to doing it and I believe that we\\'ll win. 
\"With a lot of work, humility and the spirit of self-sacrifice, which I have always shown in my career, these sorts of things are doable.\"Bale and Ronaldo embraced at the end of the game.Rocky roadFor Portugal, the road to the final has been long and arduous, often due to the team\\'s own inadequacies.While Wales enjoyed a rather enchanting run to its first major semifinal, Portugal\\'s journey to its fifth could scarcely have been more different.No other team had arrived at the last-four stage of the European Championship finals without winning a game in 90 minutes -- but Portugal, which drew all three of its group matches and only qualified in third place, has always managed to do just enough.A last-16 win over Croatia, secured with just three minutes of extra-time remaining, was then followed by a penalty shootout triumph over Poland after the game had finished 1-1.For all the exciting talent, and there is plenty even if this may not be a vintage Portuguese team, rarely has it looked impressive.Wales paid special attention to Cristiano Ronaldo in the opening stages of the contest.And yet, what it has done, is use its experience, its know-how and a resolute determination to ensure it will not be beaten.In Renato Sanches, an 18-year-old midfielder recently bought by Bayern Munich for $36 million, Portugal has a player who will surely go on to take on Ronaldo\\'s mantle.But even the youngest and most precocious need guidance.That guidance has come from Ronaldo, who despite failing to find his best form, has carried this team for long periods of the tournament.His two goals in the 3-3 draw with Hungary, a game in which it trailed on three separate occasions, enabled Portugal to qualify as the third of the four best third-placed teams.Cristiano Ronaldo: Is Euro 2016 the last chance for Portugal great?\"It\\'s what we have dreamed of since the beginning,\" Ronaldo added.\"We knew it would be a long road and we\\'re still in the tournament. 
We have believed right from the start. \"We had difficult moments, but it\\'s like I always say: it\\'s better to start poorly and have a positive ending. \"We haven\\'t won anything yet as I said a few days ago, but the dream is still alive.\"LeaderFor so long Ronaldo has been the one who has been responsible for pushing his team forward, but even he has appeared to be struggling both on and off the field of play.First there was his outburst after his side\\'s group draw against Iceland, when he accused the island team of having a \"small mentality\" and claiming it would never achieve anything in the tournament -- a prediction which proved rather wide of the mark.Then there was his penalty miss in the goalless draw with Austria which was followed a few days later by his decision to throw a reporter\\'s microphone into a lake while out on a prematch walk.But for all his histrionics and his ability to split public opinion, few can doubt his talents and the records he has broken.🎉 Bravo @selecaoportugal: #EURO2016 finalists! 
🎉#POR #PORWAL pic.twitter.com/kizafyy7EI— UEFA EURO 2016 (@UEFAEURO) July 6, 2016\\nHis 61 goals in 132 games makes him Portugal\\'s top scorer, while he is the only player to have scored in four consecutive European Championship finals.But his struggles in France have been well documented -- his wayward shooting, the lack of usual poise on front of goal and his waning influence on games has become more noticeable.It is in sharp contrast to his Real Madrid teammate Bale, who was responsible for creating one of the only real opportunities of a tight and cagey first period.Bale, who scored in each of Wales\\' first three group games, picked the ball up in his own half and drove through the Portuguese midfield before unleashing a fierce effort straight at goalkeeper Rui Patricio.Turning pointWhile the first half may have been rather uninspiring, the second was anything but as Ronaldo and Portugal took control of the contest.Just five minutes of the second half had passed when Ronaldo rose highest at the far post to meet Raphael Guerreiro\\'s cross and power a header into the top corner.Wales, still dazed from conceding the opening goal, was then dealt another blow just three minutes later.FT #Por 2-0 #Wal. Cristiano Ronaldo leads Portugal into the #Euro2016 final! 
pic.twitter.com/Z8vl4YUfxR— CNN Football Club (@CNNFC) July 6, 2016\\nOnce again it was Ronaldo at the heart of the move, flashing a driven effort into the penalty area -- and Nani, who on Tuesday joined Valencia, diverted the ball past stranded goalkeeper Wayne Hennessey.Wales, pushing forward in search of a way back into the contest, began to leave holes at the back and was fortunate not to fall further behind.First, Hennessey failed to hold Nani\\'s shot and Joao Mario somehow contrived to fire the rebound wide of the post with the goal gaping.Moments later it was Danilo who came close, his short squirming under Hennessey\\'s body before the goalkeeper recovered to stop the ball on the line.Bale had a couple efforts from long-range but failed to find a way past the Portuguese defense.Wales huffed and puffed, with Bale unleashing a couple of efforts from long range as Portugal sat back and hit its opponent on the counter attack.Roared on by thousands of fans, Wales continued to move forward but it lacked the precision and quality which had been so evident during its run to the final four.In the immediate aftermath of this defeat, it is easy to to forget that when Chris Coleman took over this team four years ago that it sat 117th in the world rankings and had not qualified for a major tournament since 1958.Just to play in these finals was an achievement -- but what this team has done over the past month will change the landscape of football in Wales forever.Here\\'s another look at the Ronaldo header that gave #Por the lead. What a leap! #PORWAL pic.twitter.com/2tuSLXU9vI— CNN Football Club (@CNNFC) July 6, 2016\\nNot only did it qualify ahead of England and outlast it, it played with a sense of pride, passion and least of all, quality.Some had labeled Wales a \"one-man team\" with Bale cast as the superstar dragging 10 other players along with him.Yet, such an assertion was to prove folly. 
Read more: Full Euro 2016 coverageWales have been a joy to watch throughout the tournament. Like Iceland, it boasts a connection between the team and supporters which few others can match, bar the Irish of course.It has inspired, it has brought hope and it has revitalized football in the country -- but this was a fairy tale without the ending so many had yearned for.For Ronaldo, though, the end is not yet written.', 'entities': [{'entity_name': 'PORTUGAL', 'entity_type': 'ORGANIZATION', 'description': 'The national football team of Portugal that qualified for the Euro 2016 final.', 'importance_score': 1.0}, {'entity_name': 'CRISTIANO RONALDO', 'entity_type': 'PERSON', 'description': 'A Portuguese footballer who scored in the Euro 2016 semifinal against Wales.', 'importance_score': 1.0}, {'entity_name': 'NANI', 'entity_type': 'PERSON', 'description': 'A Portuguese footballer and former Manchester United teammate of Cristiano Ronaldo who scored in the Euro 2016 semifinal against Wales.', 'importance_score': 0.8}, {'entity_name': 'WALES', 'entity_type': 'ORGANIZATION', 'description': 'The national football team of Wales that was defeated by Portugal in the Euro 2016 semifinal.', 'importance_score': 0.9}, {'entity_name': 'EURO 2016', 'entity_type': 'EVENT', 'description': 'The 2016 UEFA European Championship, a major football tournament held in France.', 'importance_score': 1.0}, {'entity_name': 'FRANCE', 'entity_type': 'LOCATION', 'description': 'The host nation of Euro 2016 and a potential opponent for Portugal in the final.', 'importance_score': 0.8}, {'entity_name': 'GERMANY', 'entity_type': 'ORGANIZATION', 'description': 'The national football team of Germany, the world champion and a potential opponent for Portugal in the final.', 'importance_score': 0.8}], 'relationships': [{'src_id': 'PORTUGAL', 'tgt_id': 'WALES', 'description': 'Portugal defeated Wales 2-0 in the semifinal of Euro 2016.', 'weight': 1.0, 'order': 1}, {'src_id': 'CRISTIANO RONALDO', 'tgt_id': 
'NANI', 'description': 'Cristiano Ronaldo and Nani, both Portuguese footballers, scored in the semifinal against Wales.', 'weight': 0.9, 'order': 1}, {'src_id': 'PORTUGAL', 'tgt_id': 'FRANCE', 'description': 'Portugal may face France in the final of Euro 2016.', 'weight': 0.8, 'order': 2}, {'src_id': 'PORTUGAL', 'tgt_id': 'GERMANY', 'description': 'Portugal may face Germany in the final of Euro 2016.', 'weight': 0.8, 'order': 2}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': ' (CNN)A Swedish train operator is investigating the forcible removal of a pregnant woman from a train in Stockholm.A video clip posted on social media shows security guards working for Stockholm public transport authority SL dragging the woman off the train and pushing her down onto a bench on the platform.The woman had allegedly refused to accept a fine after failing to show a valid ticket, according to transport officials. Security guards stepped in after she allegedly became aggressive.The woman\\'s daughter can be seen crying as she tries to hold on to her mother, before another security guard leads the child away.\"In these two videos you can see the victim dragged out of the train, her child taken from her and whilst she attempts to stand up she is held down,\" wrote popular blogger and activist Lovette Jallow, who posted mobile phone footage of the incident on Instagram. Read More View this post on Instagram Tomorrow all the witnesses that came forth with their evidence will be making their own reports. . . In these two videos you can see the victim dragged out of the train, her child taken from her and whilst she attempts to stand up she is held down. The full video is over 5 minutes long and I will try and edit and put it up tonight or tomorrow. . . After everything the Swedish public was shown in the recent @kallafaktatv4 documentary about how aftiswedes are racially profiled and mistreated this shouldn\\'t surprise anyone even if the victim is pregnant. 
The victim is currently in the hospital and all I can do is hope the baby is alright because if anything happens to that child. There will be hell to pay. A post shared by ᒪOᐯETTE ᒍᗩᒪᒪOᗯ (@action4humanity_se) on Jan 31, 2019 at 1:17pm PST\\nIn the post, Jallow also criticized how Swedes of African origin are \"racially profiled and mistreated.\"The woman visited hospital for treatment after the incident, which took place at Hötorget metro station on January 31, according to a police report.\"All I can do is hope the baby is alright because if anything happens to that child. There will be hell to pay,\" wrote Jallow.The post drew many comments from other Instagram users. Sofie Fisen Jakopsohn, @sofiejakopsohn, left a comment calling the incident a crime against human rights, while others said that things would have been different if the woman had listened to the guards\\' instructions.SL spokesman Henrik Palmer told CNN that the security guards are subject to rigorous training to prevent these kinds of incidents.An internal investigation is being carried out to work out what went wrong, he said.\"We take this very seriously,\" said Palmer, adding that the security guards have been suspended until the investigation is complete.There are cameras on the train and the platform, said Palmer, and police have access to all of the footage.Stockholm police are investigating the incident but could not be reached for comment.', 'entities': [{'entity_name': 'SWEDISH TRAIN OPERATOR', 'entity_type': 'ORGANIZATION', 'description': 'The organization responsible for operating trains in Sweden.', 'importance_score': 0.8}, {'entity_name': 'STOCKHOLM PUBLIC TRANSPORT AUTHORITY SL', 'entity_type': 'ORGANIZATION', 'description': 'The public transport authority in Stockholm responsible for managing public transportation.', 'importance_score': 0.9}, {'entity_name': 'LOVETTE JALLOW', 'entity_type': 'PERSON', 'description': 'A popular blogger and activist who posted the incident on 
Instagram.', 'importance_score': 0.7}, {'entity_name': 'HÖTORGET METRO STATION', 'entity_type': 'LOCATION', 'description': 'The metro station where the incident took place.', 'importance_score': 0.6}, {'entity_name': 'JANUARY 31, 2019', 'entity_type': 'DATE', 'description': 'The date when the incident occurred.', 'importance_score': 0.5}, {'entity_name': 'HENRIK PALMER', 'entity_type': 'PERSON', 'description': 'The spokesman for Stockholm public transport authority SL.', 'importance_score': 0.6}], 'relationships': [{'src_id': 'SWEDISH TRAIN OPERATOR', 'tgt_id': 'STOCKHOLM PUBLIC TRANSPORT AUTHORITY SL', 'description': 'The Swedish train operator is investigating an incident involving the Stockholm public transport authority SL.', 'weight': 0.8, 'order': 1}, {'src_id': 'STOCKHOLM PUBLIC TRANSPORT AUTHORITY SL', 'tgt_id': 'LOVETTE JALLOW', 'description': 'The Stockholm public transport authority SL is mentioned in a post by Lovette Jallow on Instagram.', 'weight': 0.7, 'order': 2}, {'src_id': 'LOVETTE JALLOW', 'tgt_id': 'HÖTORGET METRO STATION', 'description': 'Lovette Jallow posted about an incident that occurred at Hötorget metro station.', 'weight': 0.6, 'order': 2}, {'src_id': 'HÖTORGET METRO STATION', 'tgt_id': 'JANUARY 31, 2019', 'description': 'The incident at Hötorget metro station took place on January 31, 2019.', 'weight': 0.5, 'order': 1}, {'src_id': 'STOCKHOLM PUBLIC TRANSPORT AUTHORITY SL', 'tgt_id': 'HENRIK PALMER', 'description': 'Henrik Palmer is the spokesman for the Stockholm public transport authority SL.', 'weight': 0.6, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': 'Portrush, Northern Ireland (CNN)He grew up with a famous Gaelic football-playing dad, but Shane Lowry\\'s dream as one of the only golfers in a school of 500 was always to win the Open.He accomplished that in spectacular fashion Sunday with a momentous six-shot victory over England\\'s Tommy Fleetwood to clinch his first major title at Royal Portrush.The 
Open, the oldest of golf\\'s four majors, had not been held in Northern Ireland for 68 years because of the Troubles that dogged the nation, but Lowry\\'s charge to the Claret Jug united fans from both sides of the border.They roared him on in thunderous fashion and chanted his name in scenes reminiscent of the raucous atmosphere at the Ryder Cup.Judging by his nerve to build on a four-shot lead in the face of eye-popping pressure, he will be an asset to European captain Padraig Harrington\\'s side in the white-hot atmosphere of Whistling Straits, Wisconsin, in 2020. Read MoreThe 32-year-old described the experience of winning the Open as \"surreal,\" saying he can\\'t quite believe he is a major champion.\"To do it here in Portrush is even more special, it is a dream come true,\" Lowry told CNN Sport in the Royal Portrush clubhouse during a whirlwind tour of media engagements.\"I didn\\'t know if I would achieve anything like this and I have and I\\'m really going to enjoy it.\"READ: Shane Lowry clinches Claret Jug for first major titleShane Lowry was roared on by raucous home support at Royal Portrush. \\'Dream come true\\'Shane Lowry.... That is all https://t.co/B7T3bHR24z— Carl Frampton MBE (@RealCFrampton) July 21, 2019 Lowry joins an illustrious roll call of recent success stories from the Republic of Ireland and Northern Ireland, alongside three-time major champion Harrington, Portrush\\'s Graeme McDowell the 2010 US Open champion, 2011 Open winner Darren Clarke and former world No.1 Rory McIlroy, who bagged four majors between 2011 and 2014.And who should be one of his close group of family and friends watching behind the 18th green but Harrington, who won the Open in 2007 and 2008, and McDowell. \"Paddy and G-Mac (McDowell) are two really good friends of mine now and I\\'m just so happy I can add my name to the list of major champions,\" he said. 
\"Like, you go into Paddy\\'s house and the Claret Jug is sitting on the kitchen table, and I\\'m going to have one on my kitchen table as well.\" Now he has set his sights on making Harrington\\'s team in the biennial Europe against the USA match next year.\"The Ryder Cup, that\\'s the plan,\" he added. \"It was very kind of Paddy to wait for me on the 18th green.\"READ: Emotional McIlroy feels \\'love\\' after epic Open missREAD: Tiger Woods misses Open cut, yearns for \\'hot weeks\\'Shane Lowry joins an illustrious list of major champions from both sides of the border in Ireland. Golf is fickleLowry\\'s previous best finished in a major was tied second in the US Open at Oakmont in 2016 -- but the sting in the tail was blowing a four-shot overnight lead going into the final round.He suffered no such blips on a testing day at Portrush Sunday, feeding off the lively crowd and seemingly oblivious to the at times torrential rain and gusty winds.Lowry won his first European Tour title -- the 2009 Irish Open -- when still an amateur and added a fourth in Abu Dhabi in January.But he missed his fourth consecutive cut in the Open at Carnoustie last year and says he had fallen out of love with the game.\"Carnoustie, that just shows you how fickle golf is,\" said Lowry, whose dad Brendan followed his round Sunday. \"Golf is a weird sport and you never know what\\'s around the corner. That\\'s why you need to fight through the bad times. \"It was something that became very stressful and it was weighing on me and I just didn\\'t like doing it. What a difference a year makes.\"McDowell told a story outlining Lowry\\'s early potential as a member of the Irish amateur team alongside McIIroy. Renowned coach Pete Cowen, mentor now to multiple major champions, was asked down to Dublin to look at the squad and see what he thought. 
According to McDowell, he said: \"Rory McIlroy looks pretty good, but that slightly overweight kid with the glasses looks good, too.\" That was the young Lowry. READ: Why Holywood star McIlroy has always been box officeREAD: Golfers feud after caddie\\'s mom is hit by ballShane Lowry celebrates as he walks up the 18th fairway on his way to winning the Open.Lowry also lost his PGA Tour card last year but credits the turnaround in fortunes to the people around him, including new caddie Bo Martin and coach Neil Manchip, who delivered a crucial pep talk over coffee at a hotel in nearby Bushmills on the eve of the Open.\"I suppose hard work and belief in myself and belief from the people around me,\" he told CNN of his reversal.\"I wouldn\\'t be here without any of them.\"Lowry also says the perspective that comes from becoming a father to Iris, born in 2017, helps him to process the bad days on the golf course.\"It has a lot. If things didn\\'t go to plan [Sunday] I would have been unbelievably disappointed but at the end of the day you\\'re going back to a warm home with a family. 
\"It could be worse.\"He won the Open at Royal Portrush in front of an excited and proud home crowd.It couldn\\'t be much better.', 'entities': [{'entity_name': 'SHANE LOWRY', 'entity_type': 'PERSON', 'description': 'A professional golfer who won the Open at Royal Portrush.', 'importance_score': 1.0}, {'entity_name': 'TOMMY FLEETWOOD', 'entity_type': 'PERSON', 'description': 'A professional golfer from England who was runner-up in the Open.', 'importance_score': 0.8}, {'entity_name': 'ROYAL PORTRUSH', 'entity_type': 'LOCATION', 'description': 'A golf course in Northern Ireland where the Open was held.', 'importance_score': 0.9}, {'entity_name': 'PADRAIG HARRINGTON', 'entity_type': 'PERSON', 'description': 'A professional golfer and European captain who is a friend of Shane Lowry.', 'importance_score': 0.7}, {'entity_name': 'GRAEME MCDOWELL', 'entity_type': 'PERSON', 'description': 'A professional golfer from Portrush who is a friend of Shane Lowry.', 'importance_score': 0.6}, {'entity_name': 'RORY MCILROY', 'entity_type': 'PERSON', 'description': 'A professional golfer and former world No.1 who is from Northern Ireland.', 'importance_score': 0.7}, {'entity_name': 'DARREN CLARKE', 'entity_type': 'PERSON', 'description': 'A professional golfer who won the Open in 2011.', 'importance_score': 0.6}, {'entity_name': 'CLARET JUG', 'entity_type': 'AWARD', 'description': 'The trophy awarded to the winner of the Open.', 'importance_score': 0.8}, {'entity_name': 'RYDER CUP', 'entity_type': 'EVENT', 'description': 'A biennial golf match between Europe and the USA.', 'importance_score': 0.7}, {'entity_name': 'WHISTLING STRAITS', 'entity_type': 'LOCATION', 'description': 'A golf course in Wisconsin where the Ryder Cup will be held in 2020.', 'importance_score': 0.6}], 'relationships': [{'src_id': 'SHANE LOWRY', 'tgt_id': 'TOMMY FLEETWOOD', 'description': 'Shane Lowry won the Open with a six-shot victory over Tommy Fleetwood.', 'weight': 0.9, 'order': 1}, {'src_id': 'SHANE 
LOWRY', 'tgt_id': 'ROYAL PORTRUSH', 'description': 'Shane Lowry won the Open at Royal Portrush.', 'weight': 1.0, 'order': 1}, {'src_id': 'SHANE LOWRY', 'tgt_id': 'PADRAIG HARRINGTON', 'description': \"Shane Lowry is an asset to European captain Padraig Harrington's side in the Ryder Cup.\", 'weight': 0.8, 'order': 1}, {'src_id': 'SHANE LOWRY', 'tgt_id': 'GRAEME MCDOWELL', 'description': 'Shane Lowry is friends with Graeme McDowell.', 'weight': 0.7, 'order': 1}, {'src_id': 'SHANE LOWRY', 'tgt_id': 'RORY MCILROY', 'description': 'Shane Lowry joins an illustrious roll call of recent success stories from Ireland, alongside Rory McIlroy.', 'weight': 0.7, 'order': 1}, {'src_id': 'SHANE LOWRY', 'tgt_id': 'DARREN CLARKE', 'description': 'Shane Lowry joins an illustrious roll call of recent success stories from Ireland, alongside Darren Clarke.', 'weight': 0.6, 'order': 1}, {'src_id': 'SHANE LOWRY', 'tgt_id': 'CLARET JUG', 'description': 'Shane Lowry won the Claret Jug at the Open.', 'weight': 0.9, 'order': 1}, {'src_id': 'SHANE LOWRY', 'tgt_id': 'RYDER CUP', 'description': \"Shane Lowry has set his sights on making Padraig Harrington's team in the Ryder Cup.\", 'weight': 0.7, 'order': 1}, {'src_id': 'RYDER CUP', 'tgt_id': 'WHISTLING STRAITS', 'description': 'The Ryder Cup will be held at Whistling Straits in 2020.', 'weight': 0.6, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': ' (CNN)Tampa Bay Buccaneers head coach Bruce Arians says he cares about wide receiver Antonio Brown and hopes the best for him.\"I wish him well,\" Arians said at his weekly press conference on Monday remotely. \"I hope if he needs help, get some ... It\\'s very hard because I do care about him.\"While Brown\\'s future remains unclear, he was not released by the Buccaneers on Monday, according to the NFL\\'s transactions report.On Sunday, the seven-time Pro Bowler left the game against the New York Jets late in the third quarter. 
Following the Buccaneers\\' victory against the Jets, Arians said Brown was \"no longer a Buc.\"The broadcast showed the 33-year-old appearing to be upset on the sideline, removing his jersey and pads while teammates pleaded with him to stay. A shirtless Brown could be seen throwing his undershirt into the stands as he made his way into the stadium\\'s tunnel while waving a peace sign.Read MoreArians would not share the nature of the conversation he had with Brown and said the last time he saw the receiver was when Brown left the field. The Bucs head coach also said he has not heard from Brown or his representatives. When asked if there was any mental health evaluation performed on Brown before he left the stadium, Arians said he had no idea.Arians said he has no regrets on bringing Brown onto the team and \"hopes the best for him.\" Brown looks on against the New York Jets during the game.Before exiting the game, Brown had three receptions for 26 yards.After the game, Brown tweeted a photo of himself with the cryptic caption \"Super Gremlin.\" He also tweeted a link to his new single, \"Pit Not The Palace\" and posted a series of sponsored images showing himself wearing items from clothing company Fashion Nova on Instagram.The incident comes weeks after Brown said he was suspended in December for three games without pay following a league investigation, which determined he violated Covid-19 protocols.Visit CNN.com/sport for more news, features, and videosThe Pittsburgh Steelers drafted Brown in 2010, and he played for them the first nine seasons of his career. Brown was traded to the then-Oakland Raiders ahead of the 2019 season but was released before playing in a regular season game.Brown then signed with the New England Patriots and played one game before being released again, days after one of his former offseason athletic trainers filed a lawsuit accusing him of rape and assault in 2017 and 2018. 
Brown denied the allegations and said he would fight to clear his name. The NFL said at the time there was an ongoing investigation into Brown\\'s conduct.Brown signed a one-year deal with the Buccaneers in October 2020 and was part of the Bucs team that won the Super Bowl last season. CNN\\'s Amir Vera, Homero De la Fuente and Jacob Lev contributed to this report.', 'entities': [{'entity_name': 'BRUCE ARIANS', 'entity_type': 'PERSON', 'description': 'Tampa Bay Buccaneers head coach who expresses care for Antonio Brown and hopes for the best for him.', 'importance_score': 0.9}, {'entity_name': 'ANTONIO BROWN', 'entity_type': 'PERSON', 'description': 'Wide receiver for the Tampa Bay Buccaneers who left the game against the New York Jets and had a cryptic tweet after the game.', 'importance_score': 0.9}, {'entity_name': 'TAMPA BAY BUCCANEERS', 'entity_type': 'ORGANIZATION', 'description': 'NFL team that Antonio Brown plays for and Bruce Arians coaches.', 'importance_score': 0.8}, {'entity_name': 'NEW YORK JETS', 'entity_type': 'ORGANIZATION', 'description': 'NFL team that played against the Tampa Bay Buccaneers.', 'importance_score': 0.7}], 'relationships': [{'src_id': 'BRUCE ARIANS', 'tgt_id': 'ANTONIO BROWN', 'description': 'Bruce Arians, the head coach of the Tampa Bay Buccaneers, expresses care for Antonio Brown, the wide receiver, and hopes for the best for him.', 'weight': 0.9, 'order': 1}, {'src_id': 'ANTONIO BROWN', 'tgt_id': 'TAMPA BAY BUCCANEERS', 'description': 'Antonio Brown is a wide receiver for the Tampa Bay Buccaneers.', 'weight': 0.8, 'order': 1}, {'src_id': 'TAMPA BAY BUCCANEERS', 'tgt_id': 'NEW YORK JETS', 'description': 'The Tampa Bay Buccaneers played against the New York Jets in an NFL game.', 'weight': 0.7, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': 'Story highlights Death toll rises to more than 6,200Pemba Tamang, 15, shows no apparent signs of serious injury after rescueU.S. 
special forces helicopter 30, including 3 Americans, to safetyKathmandu, Nepal (CNN)On Day Six of Nepal\\'s tragedy, life triumphed as rescuers pulled an 15-year-old from the rubble of a multistory residential building in one of Kathmandu\\'s hard-hit neighborhoods.A large crowd erupted in cheers as Pemba Tamang was carried out on a stretcher. He was wearing a New York shirt and a blue neck brace, was blanketed by dust and had the look of a deer in the headlights. His rescuer, Inspector Lakshman Basnet of the Nepalese Armed Police Force, said Tamang was responsive and showed no apparent signs of serious injury. He was given an IV drip and rushed from the Gongapur area to a temporary emergency hospital run by an Israeli aid team. The Nepalese rescuers had been working for five hours to locate Tamang after they heard his voice coming from under the debris.Read MoreGroup helicoptered out by U.S. special forcesAlso Thursday, a U.S. special operations forces team rescued 30 people, including three Americans, by helicopter from an area of Nepal called Bamboo Village, according to the U.S. ambassador to Nepal.The group was trapped in the village and living in a makeshift shelter, Ambassador Peter Bodde said. The families had contacted the U.S. government to let officials know where their relatives were stuck, he said.They had no other way to get out of the area, Bodde said.\\'It\\'s dangerous, but it\\'s what we do\\'An American disaster response team was also involved in the rescue of the 15 year old boy. The team was at a nearby damaged bus station when it got word that someone might be alive. Andrew Olvera, the head of the U.S. team, said his men rushed over with search dogs and equipment ranging from breaching tools to sophisticated cameras that can probe under the rubble. 
Pemba Tamang being pulled alive from the rubble, five days after a huge earthquake hit Nepal.He said the operation carried enormous risk, as chunks of the collapsed building hung precariously on rebar. Entire floors of what used to be people\\'s homes were visible -- ceiling fans and beds still draped with cotton sheets. It was a mountain of loss and sorrow. \"It\\'s dangerous, but it\\'s what we do,\" said Olvera, who has a daughter and twin 11-year-old boys. \"It\\'s risk versus gain. To save a human life, we will risk almost everything. \"The way the building is, it\\'s definitely a miracle,\" he said. Man \\'survived by good faith\\' Tamang cried for water in a muffled voice. He had been buried for five days under a building that pancaked. He dodged death because of a motorcycle that shielded him from the pressure of the concrete and steel, according to Basnet. Photos: Powerful earthquake hits Nepal Photos: Powerful earthquake hits NepalNepalese police officers clear debris from Durbar Square in Kathmandu on Sunday, May 3. A magnitude-7.8 earthquake centered less than 50 miles from Kathmandu rocked Nepal with devastating force Saturday, April 25. The earthquake and its aftershocks have turned one of the world\\'s most scenic regions into a panorama of devastation, killing and injuring thousands.Hide Caption 1 of 64 Photos: Powerful earthquake hits NepalAn injured Nepalese woman is carried by villagers toward an Indian army helicopter to be airlifted from Philim village in Gorkha district in Nepal on May 3.Hide Caption 2 of 64 Photos: Powerful earthquake hits NepalMembers of the Tsayana family warm themselves next to a fire outside their damaged house on May 3 in Bhaktapur, Nepal.Hide Caption 3 of 64 Photos: Powerful earthquake hits NepalA woman receives comfort during the funeral of her mother, a victim of Nepal\\'s deadly earthquake, on Friday, May 1, in Kathmandu. 
Hide Caption 4 of 64 Photos: Powerful earthquake hits NepalHindu priests perform rituals during the cremations of victims at the Pashupatinath Temple on the banks of the Bagmati River in Kathmandu on May 1.Hide Caption 5 of 64 Photos: Powerful earthquake hits NepalPeople await aid from an Indian army helicopter in front of damaged homes in Kulgaun, Nepal, on May 1.Hide Caption 6 of 64 Photos: Powerful earthquake hits NepalAn injured woman gets carried on a stretcher at Kathmandu\\'s airport after being evacuated from Melamchi, Nepal, on May 1.Hide Caption 7 of 64 Photos: Powerful earthquake hits NepalA member of the Los Angeles County Fire Department guides his sniffing dog through a collapsed building in Kathmandu on Thursday, April 30.Hide Caption 8 of 64 Photos: Powerful earthquake hits NepalA teenage boy gets rushed to a hospital April 30 after being rescued from the debris of a building in Kathmandu days after the earthquake.Hide Caption 9 of 64 Photos: Powerful earthquake hits NepalA man is freed from the ruins of a hotel by French rescuers in the Gangabu area of Kathmandu on Tuesday, April 28. Reuters identified the man as Rishi Khanal.Hide Caption 10 of 64 Photos: Powerful earthquake hits NepalNepalese military police search through rubble outside Kathmandu on April 28.Hide Caption 11 of 64 Photos: Powerful earthquake hits NepalPeople rest April 28 in a temporary housing camp in Kathmandu. Large encampments of tents have sprung up in open areas, including a wide space belonging to the military in the center of the capital. Hide Caption 12 of 64 Photos: Powerful earthquake hits NepalA family collects belongings from their home in Bhaktapur, Nepal, on Monday, April 27. 
Hide Caption 13 of 64 Photos: Powerful earthquake hits NepalDamaged buildings lean to the side in Kathmandu on April 27.Hide Caption 14 of 64 Photos: Powerful earthquake hits NepalHide Caption 15 of 64 Photos: Powerful earthquake hits NepalMembers of the Nepalese army retrieve bodies from a collapsed building in Bhaktapur near Kathmandu on April 27.Hide Caption 16 of 64 Photos: Powerful earthquake hits NepalNepalese soldiers carry a wounded woman to a helicopter as they evacuate people from Trishuli Bazar, Nepal, on April 27.Hide Caption 17 of 64 Photos: Powerful earthquake hits NepalPeople charge their cell phones in an open area in Kathmandu on April 27.Hide Caption 18 of 64 Photos: Powerful earthquake hits NepalEmergency personnel evacuate an injured man to a waiting helicopter in Trishuli Bazar on April 27.Hide Caption 19 of 64 Photos: Powerful earthquake hits NepalAn aerial view of the devastation in Kathmandu on April 27. The destruction in Nepal\\'s capital is stark: revered temples reduced to rubble, people buried in the wreckage of their homes, hospitals short on medical supplies overflowing with patients.Hide Caption 20 of 64 Photos: Powerful earthquake hits NepalResidents rescue items from the debris of houses damaged in the quake in Kathmandu on April 27.Hide Caption 21 of 64 Photos: Powerful earthquake hits NepalAn aervial view shows ruined buildings in Trishuli Bazar on April 27.Hide Caption 22 of 64 Photos: Powerful earthquake hits NepalA woman prays at a ruined temple in Kathmandu on April 27.Hide Caption 23 of 64 Photos: Powerful earthquake hits NepalPeople rest in temporary shelters in Kathmandu on April 27.Hide Caption 24 of 64 Photos: Powerful earthquake hits NepalResidents cycle over damaged roads on the outskirts of Kathmandu on Sunday, April 26.Hide Caption 25 of 64 Photos: Powerful earthquake hits NepalFour-month-old Sonit Awal is held up by Nepalese army soldiers after being rescued from the rubble of his house in Bhaktapur, Nepal, on April 
26.Hide Caption 26 of 64 Photos: Powerful earthquake hits NepalThe newspaper that provided photographs of the baby\\'s rescue says the Nepalese army initially left the site, thinking the baby had not survived. Hours later when the infant\\'s cries were heard, soldiers came back and rescued him.Hide Caption 27 of 64 Photos: Powerful earthquake hits NepalThe newspaper adds the Nepalese Army had initially failed to rescue the baby and left the site thinking the baby had not survived. Hours later when the baby\\'s cries were heard the army came back and rescued him.Hide Caption 28 of 64 Photos: Powerful earthquake hits NepalA woman cries after identifying the body of a relative in Bhaktapur on April 26.Hide Caption 29 of 64 Photos: Powerful earthquake hits NepalMen clear debris in Bhaktapur on April 26.Hide Caption 30 of 64 Photos: Powerful earthquake hits NepalA truck evacuates residents from Kathmandu on April 26. Hide Caption 31 of 64 Photos: Powerful earthquake hits NepalA Buddha statue is surrounded by debris on April 26 from a collapsed temple in the UNESCO world heritage site of Bhaktapur.Hide Caption 32 of 64 Photos: Powerful earthquake hits NepalAn elderly woman is helped to her home after being treated for her injuries in Bhaktapur on April 26. Hide Caption 33 of 64 Photos: Powerful earthquake hits NepalFamily members break down on April 26 during the cremation of a loved one killed in\\xa0Bhaktapur. Hide Caption 34 of 64 Photos: Powerful earthquake hits NepalSmoke from funeral pyres fills the air at the Pashupatinath temple on the banks of Bagmati River in Kathmandu on April 26.Hide Caption 35 of 64 Photos: Powerful earthquake hits NepalMembers of India\\'s National Disaster Response Force look for survivors in Kathmandu on April 26. 
Hide Caption 36 of 64 Photos: Powerful earthquake hits NepalRescue workers remove debris on April 26 as they search for victims in Bhaktapur.Hide Caption 37 of 64 Photos: Powerful earthquake hits NepalPeople look at the debris of one of the oldest temples in Kathmandu on April 26.Hide Caption 38 of 64 Photos: Powerful earthquake hits NepalPeople sleep on a street in Kathmandu, Nepal, on Saturday, April 25. A seemingly endless series of aftershocks continued to roil the area, further traumatizing survivors. Hide Caption 39 of 64 Photos: Powerful earthquake hits NepalCivilian rescuers carry a person on a stretcher in Kathmandu on April 25.Hide Caption 40 of 64 Photos: Powerful earthquake hits NepalPeople try to free a man from the rubble in Kathmandu on April 25. Cheers rose from the piles when people were found alive -- but mostly bodies turned up. Hide Caption 41 of 64 Photos: Powerful earthquake hits NepalPedestrians walk past collapsed buildings in Kathmandu on April 25.Hide Caption 42 of 64 Photos: Powerful earthquake hits NepalAzim Afif, of the Universiti Teknologi Malaysia climbing team, provided this photo of their Mount Everest base camp after it was ravaged by an avalanche triggered by the earthquake on April 25. All of Afif\\'s five-member team survived.Hide Caption 43 of 64 Photos: Powerful earthquake hits NepalRescuers clear rubble in Kathmandu\\'s Basantapur Durbar Square on April 25.Hide Caption 44 of 64 Photos: Powerful earthquake hits NepalA temple on Hanumandhoka Durbar Square lies in ruins after an earthquake in Kathmandu on April 25. 
Hide Caption 45 of 64 Photos: Powerful earthquake hits NepalDharahara, a tower dating back to 1832 that rose more than 60 meters (200 feet) and provided breathtaking views of Kathmandu and the surrounding Himalayas, collapsed in the earthquake on April 25.Hide Caption 46 of 64 Photos: Powerful earthquake hits NepalThe hand of a statue is seen under debris in Basantapur Durbar Square in Kathmandu.Hide Caption 47 of 64 Photos: Powerful earthquake hits NepalA Nepalese man and woman hold each other in Kathmandu\\'s Basantapur Durbar Square on April 25.Hide Caption 48 of 64 Photos: Powerful earthquake hits NepalA victim of Nepal\\'s earthquake lies in the debris of Dharahara after it collapsed on April 25 in Kathmandu, Nepal. Hide Caption 49 of 64 Photos: Powerful earthquake hits NepalRescuers look for victims under a collapsed building in Kathmandu on April 25. Hide Caption 50 of 64 Photos: Powerful earthquake hits NepalVolunteers carry a body recovered from the debris of a collapsed building in Kathmandu.Hide Caption 51 of 64 Photos: Powerful earthquake hits NepalA victim\\'s body is seen in the debris of the collapsed Dharahara on April 25. Hide Caption 52 of 64 Photos: Powerful earthquake hits NepalEmergency rescue workers carry a victim from Dharahara after the tower in Kathmandu collapsed on April 25. Hide Caption 53 of 64 Photos: Powerful earthquake hits NepalPeople free a man from the rubble of a destroyed building in Kathmandu.Hide Caption 54 of 64 Photos: Powerful earthquake hits NepalA man walks past a collapsed temple at Basantapur Durbar Square.Hide Caption 55 of 64 Photos: Powerful earthquake hits NepalRescue workers clear debris in Kathmandu while searching for survivors.Hide Caption 56 of 64 Photos: Powerful earthquake hits NepalPeople huddle together outside a hospital in Kathmandu. 
Eyewitnesses said residents were scared and waiting for aftershocks to end.Hide Caption 57 of 64 Photos: Powerful earthquake hits NepalPeople search for survivors stuck under the rubble of a destroyed building in Kathmandu.Hide Caption 58 of 64 Photos: Powerful earthquake hits NepalInjured people receive treatment in Kathmandu. A CNN reporter said medics were focused on treating the most severely injured.Hide Caption 59 of 64 Photos: Powerful earthquake hits NepalEmergency rescue workers search for survivors in the debris of Dharahara on April 25.Hide Caption 60 of 64 Photos: Powerful earthquake hits NepalAn injured child lies on the ground outside a hospital in Kathmandu on April 25.Hide Caption 61 of 64 Photos: Powerful earthquake hits NepalPeople help with rescue efforts at the site of a collapsed building in Kathmandu.Hide Caption 62 of 64 Photos: Powerful earthquake hits NepalAn injured child receives treatment outside Medicare Hospital in Kathmandu on April 25. Residents, after a relentless series of aftershocks, have been remaining outdoors.Hide Caption 63 of 64 Photos: Powerful earthquake hits NepalThe rubble of collapsed walls fills a street in Lalitpur, on the outskirts of Kathmandu, on April 25.Hide Caption 64 of 64And, Basnet said, he \"survived by good faith.\" Dennis Bautista, who went down to where Tamang was buried to administer medical aid, called the rescue amazing. \"It feels good to be able to help out. I can\\'t imagine what he went through,\" Bautista said. \"He is a brave young man.\" Basnet said once he got closer to Tamang, he tried to reassure him that he would be OK. \"I gave him water and talked to him regularly,\" Basnet said. Rescuers continue searchAfter Tamang was rushed to the hospital, the US Agency for International Development team continued the search. The possibility remained that someone else might have survived, like Tamang. Other search and rescue teams continued to scour through Kathmandu\\'s rubble Thursday. 
They are looking for survivors from the magnitude-7.8 earthquake that struck Saturday, killing at least 6,204 people and wounding almost 14,000, according to Nepali authorities. Another 72 people were reported dead in India and 25 in China.JUST WATCHEDEverest earthquake puts Nepal\\'s economy at riskReplayMore Videos ...MUST WATCHEverest earthquake puts Nepal\\'s economy at risk 02:22In Nepal, 19 of the deaths occurred on Mount Everest, where the quake triggered deadly avalanches. Despite the disaster, Nepali authorities plans to reopen routes up the mountain as soon as next week.Teams are clearing paths and and rebuilding ladders, Tourism Ministry spokesman Krishna Sapkota told CNN on Thursday. Officials are encouraging people who have already received permission to climb this season to go ahead with their plans.As the chances dimmed for finding people alive in the wreckage left by the quake across Nepal, Tamang\\'s rescue boosted hopes for all those who still have loved ones and friends missing. Other people have been saved from under collapsed buildings in previous days, including a 27-year-old man on Tuesday and a 4-month-old boy on Sunday. The Nepali military also released a photo of a dust-caked 11-year-old girl who they said was rescued Wednesday after 90 hours under the rubble.The rescuers, meanwhile, have no intention of giving up looking for more. The 2010 earthquake in Haiti, Olvera said, revised the benchmark for what was thought possible for survival. 
A man there was miraculously pulled from the rubble after 27 long days.360-degree view of earthquake zoneFollow @CNNbrk\\n\\n CNN\\'s Sugam Pokharel contributed to this report.', 'entities': [{'entity_name': 'PEMBA TAMANG', 'entity_type': 'PERSON', 'description': 'A 15-year-old boy rescued from the rubble of a multistory residential building in Kathmandu.', 'importance_score': 0.9}, {'entity_name': 'KATHMANDU', 'entity_type': 'LOCATION', 'description': 'A city in Nepal where the earthquake caused significant damage.', 'importance_score': 0.8}, {'entity_name': 'NEPAL', 'entity_type': 'LOCATION', 'description': 'The country where the earthquake occurred, causing widespread devastation.', 'importance_score': 0.9}, {'entity_name': 'U.S. SPECIAL FORCES', 'entity_type': 'ORGANIZATION', 'description': 'A team that rescued 30 people, including three Americans, by helicopter from Bamboo Village.', 'importance_score': 0.7}, {'entity_name': 'INSPECTOR LAKSHMAN BASNET', 'entity_type': 'PERSON', 'description': 'A rescuer from the Nepalese Armed Police Force who helped save Pemba Tamang.', 'importance_score': 0.7}, {'entity_name': 'BAMBOO VILLAGE', 'entity_type': 'LOCATION', 'description': 'An area in Nepal where a group of people, including Americans, were trapped and later rescued by U.S. special forces.', 'importance_score': 0.6}, {'entity_name': 'ANDREW OLVERA', 'entity_type': 'PERSON', 'description': 'The head of the U.S. disaster response team involved in the rescue of Pemba Tamang.', 'importance_score': 0.6}, {'entity_name': 'DENNIS BAUTISTA', 'entity_type': 'PERSON', 'description': 'A person who administered medical aid to Pemba Tamang after his rescue.', 'importance_score': 0.5}], 'relationships': [{'src_id': 'PEMBA TAMANG', 'tgt_id': 'INSPECTOR LAKSHMAN BASNET', 'description': 'Pemba Tamang was rescued by Inspector Lakshman Basnet.', 'weight': 0.9, 'order': 1}, {'src_id': 'U.S. SPECIAL FORCES', 'tgt_id': 'BAMBOO VILLAGE', 'description': 'U.S. 
special forces rescued people from Bamboo Village.', 'weight': 0.8, 'order': 1}, {'src_id': 'ANDREW OLVERA', 'tgt_id': 'PEMBA TAMANG', 'description': \"Andrew Olvera's team was involved in the rescue of Pemba Tamang.\", 'weight': 0.7, 'order': 1}, {'src_id': 'DENNIS BAUTISTA', 'tgt_id': 'PEMBA TAMANG', 'description': 'Dennis Bautista administered medical aid to Pemba Tamang after his rescue.', 'weight': 0.6, 'order': 1}]}) (input_keys={'input_text'})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "valset[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 49 chunks, 320 entities(duplicated), 222 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP 
Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 50 chunks, 323 entities(duplicated), 224 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 51 chunks, 328 entities(duplicated), 227 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 53 chunks, 338 entities(duplicated), 234 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 54 chunks, 345 entities(duplicated), 238 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 55 chunks, 352 entities(duplicated), 244 
relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 56 chunks, 359 entities(duplicated), 249 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 57 chunks, 367 entities(duplicated), 253 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 58 chunks, 377 entities(duplicated), 259 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 59 chunks, 387 entities(duplicated), 268 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 60 chunks, 397 entities(duplicated), 276 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 61 chunks, 413 entities(duplicated), 281 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 63 chunks, 421 entities(duplicated), 287 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 65 chunks, 430 entities(duplicated), 294 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 66 chunks, 445 entities(duplicated), 303 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 67 chunks, 460 entities(duplicated), 316 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 68 chunks, 468 entities(duplicated), 323 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "⠏ Processed 69 chunks, 474 entities(duplicated), 326 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 70 chunks, 490 entities(duplicated), 334 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 71 chunks, 505 entities(duplicated), 348 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 72 chunks, 510 entities(duplicated), 351 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 74 chunks, 521 entities(duplicated), 359 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 75 chunks, 528 entities(duplicated), 362 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 76 chunks, 537 entities(duplicated), 365 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 78 chunks, 558 entities(duplicated), 380 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 79 chunks, 565 entities(duplicated), 385 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 80 chunks, 573 entities(duplicated), 391 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 81 chunks, 577 entities(duplicated), 394 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 82 chunks, 582 entities(duplicated), 397 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST 
https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 83 chunks, 593 entities(duplicated), 406 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 84 chunks, 601 entities(duplicated), 411 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 85 chunks, 608 entities(duplicated), 417 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 86 chunks, 618 entities(duplicated), 426 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 87 chunks, 623 entities(duplicated), 430 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 88 chunks, 633 entities(duplicated), 439 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 
OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 90 chunks, 646 entities(duplicated), 448 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 91 chunks, 657 entities(duplicated), 455 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 92 chunks, 662 entities(duplicated), 459 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 93 chunks, 667 entities(duplicated), 462 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 94 chunks, 673 entities(duplicated), 466 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 95 chunks, 677 entities(duplicated), 468 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST 
https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 96 chunks, 688 entities(duplicated), 474 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 97 chunks, 692 entities(duplicated), 476 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 98 chunks, 703 entities(duplicated), 485 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 99 chunks, 736 entities(duplicated), 499 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 100 chunks, 746 entities(duplicated), 504 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 101 chunks, 760 entities(duplicated), 517 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 
OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 102 chunks, 765 entities(duplicated), 521 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 103 chunks, 770 entities(duplicated), 525 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 104 chunks, 776 entities(duplicated), 528 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 106 chunks, 788 entities(duplicated), 537 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 107 chunks, 794 entities(duplicated), 541 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 109 chunks, 809 entities(duplicated), 552 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST 
https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 110 chunks, 813 entities(duplicated), 555 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 111 chunks, 820 entities(duplicated), 561 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 112 chunks, 823 entities(duplicated), 563 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 114 chunks, 838 entities(duplicated), 571 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 115 chunks, 844 entities(duplicated), 574 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 116 chunks, 855 
entities(duplicated), 578 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 117 chunks, 858 entities(duplicated), 579 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 118 chunks, 871 entities(duplicated), 590 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 119 chunks, 876 entities(duplicated), 590 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 120 chunks, 882 entities(duplicated), 596 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 121 chunks, 890 entities(duplicated), 598 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 122 chunks, 893 entities(duplicated), 600 relations(duplicated)\r" + ] 
+ }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 123 chunks, 900 entities(duplicated), 615 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 124 chunks, 915 entities(duplicated), 620 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 125 chunks, 921 entities(duplicated), 624 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 126 chunks, 924 entities(duplicated), 627 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 128 chunks, 937 entities(duplicated), 636 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 129 chunks, 950 
entities(duplicated), 644 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 130 chunks, 961 entities(duplicated), 648 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 131 chunks, 967 entities(duplicated), 653 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 132 chunks, 972 entities(duplicated), 654 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 133 chunks, 977 entities(duplicated), 658 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 134 chunks, 983 entities(duplicated), 663 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 135 chunks, 988 entities(duplicated), 667 relations(duplicated)\r" + ] 
+ }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 136 chunks, 992 entities(duplicated), 670 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 137 chunks, 996 entities(duplicated), 671 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 138 chunks, 1010 entities(duplicated), 680 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 139 chunks, 1014 entities(duplicated), 683 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 140 chunks, 1021 entities(duplicated), 689 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 141 chunks, 1024 entities(duplicated), 690 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 143 chunks, 1032 entities(duplicated), 695 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 144 chunks, 1039 entities(duplicated), 699 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 145 chunks, 1045 entities(duplicated), 704 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 146 chunks, 1050 entities(duplicated), 708 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 147 chunks, 1054 entities(duplicated), 710 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 148 chunks, 1062 entities(duplicated), 716 relations(duplicated)\r" 
+ ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 149 chunks, 1066 entities(duplicated), 719 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 150 chunks, 1073 entities(duplicated), 725 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 151 chunks, 1083 entities(duplicated), 730 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 152 chunks, 1089 entities(duplicated), 735 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 153 chunks, 1099 entities(duplicated), 740 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 154 chunks, 1103 entities(duplicated), 742 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 155 chunks, 1108 entities(duplicated), 745 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 156 chunks, 1112 entities(duplicated), 746 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 157 chunks, 1118 entities(duplicated), 750 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 158 chunks, 1124 entities(duplicated), 753 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 159 chunks, 1128 entities(duplicated), 756 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 160 chunks, 1139 entities(duplicated), 766 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: 
POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 161 chunks, 1145 entities(duplicated), 769 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 162 chunks, 1152 entities(duplicated), 775 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 163 chunks, 1156 entities(duplicated), 778 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 164 chunks, 1174 entities(duplicated), 795 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 165 chunks, 1177 entities(duplicated), 797 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 166 chunks, 1184 entities(duplicated), 801 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions 
\"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 167 chunks, 1192 entities(duplicated), 805 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 168 chunks, 1219 entities(duplicated), 824 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 169 chunks, 1230 entities(duplicated), 832 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 170 chunks, 1235 entities(duplicated), 837 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠙ Processed 171 chunks, 1253 entities(duplicated), 842 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠹ Processed 172 chunks, 1255 entities(duplicated), 843 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "⠸ Processed 173 chunks, 1263 entities(duplicated), 849 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠴ Processed 175 chunks, 1285 entities(duplicated), 864 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠦ Processed 176 chunks, 1294 entities(duplicated), 868 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠧ Processed 177 chunks, 1301 entities(duplicated), 873 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠇ Processed 178 chunks, 1308 entities(duplicated), 875 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠏ Processed 179 chunks, 1316 entities(duplicated), 882 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST 
https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 180 chunks, 1319 entities(duplicated), 882 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠼ Processed 184 chunks, 1352 entities(duplicated), 903 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⠋ Processed 200 chunks, 1470 entities(duplicated), 982 relations(duplicated)\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:nano-graphrag:Saved 197 examples with keys: ['input_text', 'entities', 'relationships'], filtered 3 examples\n" + ] + } + ], + "source": [ + "dev_chunks = {compute_mdhash_id(text, prefix=f\"chunk-devset-\"): {\"content\": text} for text in dev_data[\"text\"]}\n", + "devset = asyncio.run(generate_dataset(chunks=dev_chunks, filepath=entity_relationship_devset_path))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Example({'input_text': \"As students from Marjory Stoneman Douglas High School confront lawmakers with demands to restrict sales of assault rifles, there were warnings by the president of the Florida school administration association that schools in 
the state were vulnerable to such an attack.\\nHighlights\\n“Nikolas Cruz was able to purchase an assault rifle before he was able to buy a beer,” said Stoneman Douglas student Laurenzo Prado, referring to a Florida law that allows people as young as 18 to buy assault weapons. Students galvanized by the deadly mass shooting at the Florida high school confronted lawmakers with demands to restrict sales of assault rifles, while President Donald Trump suggested arming teachers as a way to stop more U.S. rampages.\\nTwo weeks before a gunman fatally shot 17 people at a Florida high school, Bill Lee, the president of the state’s school administrators association, warned that Florida’s schools were vulnerable to just such an attack. “It’s not a matter of if, but when,” he wrote in the Orlando Sentinel on Jan. 29, calling on legislators to increase school security spending after two January school shootings in other states.\\nWorld\\nU.S. players pose for a photo as they celebrate their victory over Canada. REUTERS/Grigory Dukor She’s practiced the ‘Oops, I did it again’ thousands of times in training, and her signature trick was worth its weight in Olympic gold as American Jocelyne Lamoureux-Davidson’s shootout winner broke a Canadian 16-year stranglehold on the women’s ice hockey title. 
It was the perfect jaw-dropping finale to a game that was billed as a grudge match but swiftly developed into a classic for the ages at the Pyeongchang Winter Games.\\nWarplanes pounded the last rebel enclave near the Syrian capital for a fifth straight day, as the United Nations pleaded for a truce to halt one of the fiercest air assaults of the seven-year civil war and prevent a “massacre” .\\nBangladesh is racing to turn an uninhabited and muddy Bay of Bengal island into home for 100,000 Rohingya Muslims who have fled a military crackdown in Myanmar , amid conflicting signals from top Bangladeshi officials about whether the refugees would end up being stranded there.\\nCommentary\\nNorth Korea’s participation in the 2018 Winter Olympics has created an opening for renewed dialogue with the United States and South Korea, writes Peter App s. “By taking part so visibly alongside South Korea, Pyongyang has been able to present itself as a credible global and regional power in a way that has eluded North Korean leaders since the war that divided the peninsula. The real strategic winner, however, is the South Korean government, which has shrewdly used the games to reshape the diplomatic landscape.”\\nBusiness\\nLast March, executives at General Electric's power-plant business gave Wall Street a surprisingly bullish forecast for the year. Despite flat demand for new natural gas power plants, they said, GE Power’s revenue and profit would rise. But GE’s forecast turned out to be a mirage .\\nMexican buyers imported ten times more corn from Brazil last year amid concern that NAFTA renegotiations could disrupt their U.S. supplies, according to government data and top grains merchants.\\nWarren Buffett may use part of his annual letter to Berkshire Hathaway shareholders, due on Saturday , to renew his optimism about America, at a time economic growth is on the upswing and U.S. 
stocks sit near record highs despite rising interest rates.\\nWorld stocks tumbled to one-week lows after the U.S. Fed confirmed it was on track to raise interest rates several times this year, sending bond yields to new multi-year highs.\\nTop stories on Reuters TV\\n'Hand grenade' thrown at U.S. embassy in Montenegro\\nGM's potential Korea exit piles pressure on Moon Jae-in\\n \", 'entities': [{'entity_name': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'entity_type': 'ORGANIZATION', 'description': 'A high school in Florida where students confronted lawmakers about restricting sales of assault rifles.', 'importance_score': 0.9}, {'entity_name': 'NIKOLAS CRUZ', 'entity_type': 'PERSON', 'description': 'A person who was able to purchase an assault rifle before he was able to buy a beer, according to a Florida law.', 'importance_score': 0.8}, {'entity_name': 'LAURENZO PRADO', 'entity_type': 'PERSON', 'description': \"A student from Marjory Stoneman Douglas High School who mentioned Nikolas Cruz's ability to purchase an assault rifle.\", 'importance_score': 0.7}, {'entity_name': 'FLORIDA', 'entity_type': 'LOCATION', 'description': 'A state in the United States where the events related to the high school and gun control are taking place.', 'importance_score': 0.8}, {'entity_name': 'BILL LEE', 'entity_type': 'PERSON', 'description': \"The president of the Florida school administrators association who warned about the vulnerability of Florida's schools to attacks.\", 'importance_score': 0.7}, {'entity_name': 'ORLANDO SENTINEL', 'entity_type': 'ORGANIZATION', 'description': 'A newspaper where Bill Lee wrote about the need for increased school security spending.', 'importance_score': 0.6}, {'entity_name': 'DONALD TRUMP', 'entity_type': 'PERSON', 'description': 'The President of the United States who suggested arming teachers as a way to stop more U.S. 
rampages.', 'importance_score': 0.8}, {'entity_name': 'GENERAL ELECTRIC', 'entity_type': 'ORGANIZATION', 'description': 'A company whose power-plant business gave a surprisingly bullish forecast for the year, which turned out to be a mirage.', 'importance_score': 0.6}, {'entity_name': 'WARREN BUFFETT', 'entity_type': 'PERSON', 'description': 'An investor who may use his annual letter to Berkshire Hathaway shareholders to renew his optimism about America.', 'importance_score': 0.7}], 'relationships': [{'src_id': 'MARJORY STONEMAN DOUGLAS HIGH SCHOOL', 'tgt_id': 'FLORIDA', 'description': 'Students from Marjory Stoneman Douglas High School are located in the state of Florida.', 'weight': 0.9, 'order': 1}, {'src_id': 'NIKOLAS CRUZ', 'tgt_id': 'ASSAULT RIFLE', 'description': 'Nikolas Cruz was able to purchase an assault rifle before he was able to buy a beer.', 'weight': 0.8, 'order': 1}, {'src_id': 'LAURENZO PRADO', 'tgt_id': 'NIKOLAS CRUZ', 'description': \"Laurenzo Prado, a student, mentioned Nikolas Cruz's ability to purchase an assault rifle.\", 'weight': 0.7, 'order': 1}, {'src_id': 'BILL LEE', 'tgt_id': \"FLORIDA'S SCHOOLS\", 'description': \"Bill Lee warned that Florida's schools were vulnerable to attacks.\", 'weight': 0.7, 'order': 1}, {'src_id': 'DONALD TRUMP', 'tgt_id': 'ARMING TEACHERS', 'description': 'Donald Trump suggested arming teachers as a way to stop more U.S. 
rampages.', 'weight': 0.8, 'order': 1}, {'src_id': 'GENERAL ELECTRIC', 'tgt_id': 'POWER-PLANT BUSINESS', 'description': \"General Electric's power-plant business gave a surprisingly bullish forecast for the year.\", 'weight': 0.6, 'order': 1}, {'src_id': 'WARREN BUFFETT', 'tgt_id': 'BERKSHIRE HATHAWAY', 'description': 'Warren Buffett may use his annual letter to Berkshire Hathaway shareholders to renew his optimism about America.', 'weight': 0.7, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': 'From ferrying people to and from their place of work to transporting nuclear waste and coal, railways are not only an integral part of 21st century life but have played a crucial role in shaping the modern world.\\nIn Belgium, one business is looking to use innovation to drive further change in the sector. Based in Brussels, Railnova is a technology company whose clients include Deutsche Bahn and French rail operator SNCF.\\nAccording to its CEO and founder Christian Sprauer, Railnova offers train operators the ability to be alerted to issues before failures occur.\\nA device designed and manufactured by the business is fitted to a train. The box gathers a host of data, a subset of which is sent to the cloud where it can be analyzed.\\n\"Before, they would have to wait until the failure happens and bring the train to the workshop,\" Sprauer said. \"Now… they can be alerted up front and they can avoid certain failures and… react much faster to other failures.\"\\nInnovation is being used across the rail industry to improve services. Instead of buying paper tickets, rail users in London can now touch their debit or credit cards on to sensors to pay for their journey. 
In China and Japan, magnetic levitation — or Maglev — trains offer a tantalizing glimpse of the high-speed rail travel that could be just around the corner.\\nExamples of the solutions offered by the Railnova technology range from measuring engine speed and braking force, to measuring traction force. Energy consumption was another area of interest, with Sprauer stating it was a good predictor of whether there was a problem with a train or not.\\nThe wealth of data being mined could, Sprauer said, offer a number of benefits. Reducing energy consumption was one such example. On diesel trains, he said, energy consumption could be cut by coaching drivers on best driving practices.\\n\"We call this the \\'golden run\\'… every driver tries to achieve the golden run and, as a whole, the company reduces… fuel expenses by 5 percent.\"\\nFollow CNBC International on Twitter and Facebook .', 'entities': [{'entity_name': 'RAILNOVA', 'entity_type': 'ORGANIZATION', 'description': 'A technology company based in Brussels whose clients include Deutsche Bahn and French rail operator SNCF.', 'importance_score': 0.9}, {'entity_name': 'DEUTSCHE BAHN', 'entity_type': 'ORGANIZATION', 'description': 'A client of Railnova, a German railway company.', 'importance_score': 0.8}, {'entity_name': 'SNCF', 'entity_type': 'ORGANIZATION', 'description': 'A client of Railnova, a French rail operator.', 'importance_score': 0.8}, {'entity_name': 'CHRISTIAN SPRAUER', 'entity_type': 'PERSON', 'description': 'The CEO and founder of Railnova.', 'importance_score': 0.9}, {'entity_name': 'BRUSSELS', 'entity_type': 'LOCATION', 'description': 'The location where Railnova is based.', 'importance_score': 0.7}, {'entity_name': 'LONDON', 'entity_type': 'LOCATION', 'description': 'A location where rail users can now touch their debit or credit cards on sensors to pay for their journey.', 'importance_score': 0.7}, {'entity_name': 'CHINA', 'entity_type': 'LOCATION', 'description': 'A country where magnetic 
levitation trains are being used.', 'importance_score': 0.7}, {'entity_name': 'JAPAN', 'entity_type': 'LOCATION', 'description': 'A country where magnetic levitation trains are being used.', 'importance_score': 0.7}, {'entity_name': 'MAGLEV', 'entity_type': 'TECHNOLOGY', 'description': 'Magnetic levitation trains that offer high-speed rail travel.', 'importance_score': 0.8}], 'relationships': [{'src_id': 'RAILNOVA', 'tgt_id': 'DEUTSCHE BAHN', 'description': 'Railnova provides technology solutions to Deutsche Bahn.', 'weight': 0.9, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'SNCF', 'description': 'Railnova provides technology solutions to SNCF.', 'weight': 0.9, 'order': 1}, {'src_id': 'CHRISTIAN SPRAUER', 'tgt_id': 'RAILNOVA', 'description': 'Christian Sprauer is the CEO and founder of Railnova.', 'weight': 1.0, 'order': 1}, {'src_id': 'RAILNOVA', 'tgt_id': 'BRUSSELS', 'description': 'Railnova is based in Brussels.', 'weight': 0.8, 'order': 1}, {'src_id': 'LONDON', 'tgt_id': 'DEBIT CARD', 'description': 'In London, rail users can touch their debit cards on sensors to pay for their journey.', 'weight': 0.7, 'order': 2}, {'src_id': 'LONDON', 'tgt_id': 'CREDIT CARD', 'description': 'In London, rail users can touch their credit cards on sensors to pay for their journey.', 'weight': 0.7, 'order': 2}, {'src_id': 'CHINA', 'tgt_id': 'MAGLEV', 'description': 'In China, magnetic levitation trains are being used.', 'weight': 0.8, 'order': 1}, {'src_id': 'JAPAN', 'tgt_id': 'MAGLEV', 'description': 'In Japan, magnetic levitation trains are being used.', 'weight': 0.8, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': \"Jan 22 (Reuters) - Shanghai Stock Exchange Filing\\n* SHOWS BLOCK TRADE OF YONGHUI SUPERSTORES Co LTd's 166.3 MILLION SHARES INVOLVING 1.63 BILLION YUAN ($254.68 million) ON JAN 22 Source text in Chinese: bit.ly/2yJZikT Further company coverage: ($1 = 6.4003 Chinese yuan renminbi) (Reporting by Hong Kong newsroom)\\n \", 'entities': 
[{'entity_name': 'YONGHUI SUPERSTORES CO LTD', 'entity_type': 'ORGANIZATION', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'importance_score': 1.0}, {'entity_name': '166.3 MILLION SHARES', 'entity_type': 'MONEY', 'description': '166.3 million shares are being traded in the block trade involving YONGHUI SUPERSTORES Co LTd.', 'importance_score': 0.9}, {'entity_name': '1.63 BILLION YUAN', 'entity_type': 'MONEY', 'description': 'The block trade involves 1.63 billion yuan ($254.68 million).', 'importance_score': 0.9}, {'entity_name': 'JAN 22', 'entity_type': 'DATE', 'description': 'The block trade occurred on January 22.', 'importance_score': 0.8}, {'entity_name': 'SHANGHAI STOCK EXCHANGE', 'entity_type': 'ORGANIZATION', 'description': 'The block trade was filed with the Shanghai Stock Exchange.', 'importance_score': 0.8}], 'relationships': [{'src_id': 'YONGHUI SUPERSTORES CO LTD', 'tgt_id': '166.3 MILLION SHARES', 'description': 'YONGHUI SUPERSTORES Co LTd is involved in a block trade of 166.3 million shares.', 'weight': 1.0, 'order': 1}, {'src_id': '166.3 MILLION SHARES', 'tgt_id': '1.63 BILLION YUAN', 'description': 'The block trade of 166.3 million shares involves 1.63 billion yuan.', 'weight': 1.0, 'order': 1}, {'src_id': 'SHANGHAI STOCK EXCHANGE', 'tgt_id': 'YONGHUI SUPERSTORES CO LTD', 'description': 'The block trade involving YONGHUI SUPERSTORES Co LTd was filed with the Shanghai Stock Exchange.', 'weight': 0.9, 'order': 1}, {'src_id': 'JAN 22', 'tgt_id': 'YONGHUI SUPERSTORES CO LTD', 'description': 'The block trade involving YONGHUI SUPERSTORES Co LTd occurred on January 22.', 'weight': 0.8, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': 'LONDON (Reuters) - Britain’s economy was weaker than previously thought in 2017, official data showed on Thursday, leaving the country lagging further behind the global recovery as it prepares to leave the European Union.\\nThe downgrade of the 
full-year and fourth-quarter growth rates also raised questions about the strength of the economy as the Bank of England prepares to raise interest rates.\\nGross domestic product growth slowed to a quarterly 0.4 percent from a previous estimate of 0.5 percent, wrong-footing economists and reducing 2017 growth as a whole to 1.7 percent, its lowest since 2012.\\nRelated Coverage UK retail sales growth eases further in February: CBI This was still stronger than most economists feared immediately after Britain voted to leave the EU in June 2016.\\nBut the country has relied heavily on the unexpectedly robust global economy to sustain its economic growth while consumers have been squeezed by higher inflation caused by the fall in the pound after the Brexit vote.\\nSterling was little changed after Thursday’s data and government bond prices rose slightly.\\nAlan Clarke, an economist at Scotiabank, said the figures showed Britain’s economy was growing at roughly the pace the BoE sees its new, lower speed limit, meaning a rate hike was still on the cards.\\nBut Samuel Tombs, at Pantheon Macroeconomics, said the data showed the central bank should delay any action for now.\\n“The latest GDP data suggest that the economy remains in a fragile state and does not need to be cooled with another rate rise as soon as May,” he said in a note to clients.\\nBoE Governor Mark Carney said this month that rates would probably need to rise sooner and by somewhat more than the central bank had thought in November, when it raised borrowing costs for the first time in a decade.\\nMost economists think rates will rise again in May, and financial markets expect a further increase, to 1 percent, by the end of the year.\\nLAGGARD UK Britain’s year-on-year economic growth of 1.4 percent in the last three months of 2017 was not just its weakest in five years but also the weakest of any of the economies in the Group of Seven, including long-term laggards such as Japan and Italy.\\nThere were only 
limited signs of a rebalancing of the economy away from consumer demand and toward business investment and net trade that Carney pointed to as positive signs in an appearance before lawmakers on Wednesday.\\nBusiness investment was flat on the quarter and 2.1 percent higher on the year, both readings coming in weaker than expected in a Reuters poll of economists. Net trade dragged on growth in most quarters of 2017, though it was positive on the year.\\nThe BoE said earlier this month that it expected the economy would grow by 1.8 percent this year, faster than its previous forecast of 1.6 percent, mostly because of the strength of the global economy.\\nReporting by David Milliken; Editing by Toby Chopra\\n ', 'entities': [{'entity_name': 'BRITAIN', 'entity_type': 'LOCATION', 'description': 'The country whose economy was weaker than previously thought in 2017.', 'importance_score': 0.9}, {'entity_name': 'EUROPEAN UNION', 'entity_type': 'ORGANIZATION', 'description': 'The political and economic union that Britain is preparing to leave.', 'importance_score': 0.8}, {'entity_name': 'BANK OF ENGLAND', 'entity_type': 'ORGANIZATION', 'description': 'The central bank of the United Kingdom, preparing to raise interest rates.', 'importance_score': 0.8}, {'entity_name': 'ALAN CLARKE', 'entity_type': 'PERSON', 'description': 'An economist at Scotiabank, commenting on the economic figures.', 'importance_score': 0.7}, {'entity_name': 'SAMUEL TOMBS', 'entity_type': 'PERSON', 'description': 'An economist at Pantheon Macroeconomics, suggesting the central bank should delay any action.', 'importance_score': 0.7}, {'entity_name': 'MARK CARNEY', 'entity_type': 'PERSON', 'description': 'The Governor of the Bank of England, stating that rates would probably need to rise sooner.', 'importance_score': 0.8}, {'entity_name': 'SCOTIABANK', 'entity_type': 'ORGANIZATION', 'description': 'A financial institution where Alan Clarke works as an economist.', 'importance_score': 0.6}, 
{'entity_name': 'PANTHEON MACROECONOMICS', 'entity_type': 'ORGANIZATION', 'description': 'A macroeconomic research firm where Samuel Tombs works as an economist.', 'importance_score': 0.6}], 'relationships': [{'src_id': 'BRITAIN', 'tgt_id': 'EUROPEAN UNION', 'description': 'Britain is preparing to leave the European Union.', 'weight': 0.9, 'order': 1}, {'src_id': 'BANK OF ENGLAND', 'tgt_id': 'INTEREST RATES', 'description': 'The Bank of England is preparing to raise interest rates.', 'weight': 0.8, 'order': 1}, {'src_id': 'ALAN CLARKE', 'tgt_id': 'BANK OF ENGLAND', 'description': \"Alan Clarke, an economist at Scotiabank, comments on the Bank of England's potential rate hike.\", 'weight': 0.7, 'order': 2}, {'src_id': 'SAMUEL TOMBS', 'tgt_id': 'BANK OF ENGLAND', 'description': 'Samuel Tombs, an economist at Pantheon Macroeconomics, suggests the Bank of England should delay any action.', 'weight': 0.7, 'order': 2}, {'src_id': 'MARK CARNEY', 'tgt_id': 'BANK OF ENGLAND', 'description': 'Mark Carney, the Governor of the Bank of England, states that rates would probably need to rise sooner.', 'weight': 0.8, 'order': 1}]}) (input_keys={'input_text'}),\n", + " Example({'input_text': \"Trump taps White House doctor as new VA secretary 2 Hours Ago CNBC's Kayla Tausche reports President Trump has tapped White House physician Rear Admiral Ronny Jackson to run the Department of Veterans Affairs amid a shakeup at the White House.\", 'entities': [{'entity_name': 'TRUMP', 'entity_type': 'PERSON', 'description': 'President Trump who tapped Ronny Jackson as new VA secretary.', 'importance_score': 1.0}, {'entity_name': 'WHITE HOUSE', 'entity_type': 'ORGANIZATION', 'description': 'The White House where Ronny Jackson is a physician.', 'importance_score': 0.8}, {'entity_name': 'VA', 'entity_type': 'ORGANIZATION', 'description': 'Department of Veterans Affairs, the organization Ronny Jackson is tapped to run.', 'importance_score': 0.9}, {'entity_name': 'RONNY JACKSON', 'entity_type': 
'PERSON', 'description': 'Rear Admiral Ronny Jackson, the White House physician tapped to run the VA.', 'importance_score': 0.9}, {'entity_name': 'CNBC', 'entity_type': 'ORGANIZATION', 'description': 'CNBC, the news organization reporting on the event.', 'importance_score': 0.7}, {'entity_name': 'KAYLA TAUSCHE', 'entity_type': 'PERSON', 'description': 'Kayla Tausche, the CNBC reporter covering the story.', 'importance_score': 0.7}, {'entity_name': '2 HOURS AGO', 'entity_type': 'TIME', 'description': 'The time when the report was made.', 'importance_score': 0.5}], 'relationships': [{'src_id': 'TRUMP', 'tgt_id': 'RONNY JACKSON', 'description': 'President Trump taps Ronny Jackson as new VA secretary.', 'weight': 1.0, 'order': 1}, {'src_id': 'RONNY JACKSON', 'tgt_id': 'VA', 'description': 'Ronny Jackson is tapped to run the Department of Veterans Affairs.', 'weight': 1.0, 'order': 1}, {'src_id': 'KAYLA TAUSCHE', 'tgt_id': 'TRUMP', 'description': \"CNBC's Kayla Tausche reports President Trump has tapped Ronny Jackson.\", 'weight': 0.8, 'order': 2}]}) (input_keys={'input_text'})]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "devset[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_trainset = pickle.load(open(entity_relationship_trainset_path, \"rb\"))\n", + "loaded_valset = pickle.load(open(entity_relationship_valset_path, \"rb\"))\n", + "loaded_devset = pickle.load(open(entity_relationship_devset_path, \"rb\"))\n", + "assert loaded_trainset == trainset\n", + "assert loaded_valset == valset\n", + "assert loaded_devset == devset" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nano-graphrag", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nano-graphrag/examples/graphml_visualize.py b/nano-graphrag/examples/graphml_visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..42f0b2ccf8747f9a3fd58e2ff86e3546abe82daa --- /dev/null +++ b/nano-graphrag/examples/graphml_visualize.py @@ -0,0 +1,282 @@ +import networkx as nx +import json +import os +import webbrowser +import http.server +import socketserver +import threading + +# load GraphML file and transfer to JSON +def graphml_to_json(graphml_file): + G = nx.read_graphml(graphml_file) + data = nx.node_link_data(G) + return json.dumps(data) + + +# create HTML file +def create_html(html_path): + html_content = ''' + + + + + + Graph Visualization + + + + + +
+
def create_json(json_data, json_path):
    """Write the graph JSON to *json_path* as a JS file assigning ``graphJson``."""
    # Strip escaped double quotes and newlines, and escape single quotes, so
    # the payload stays a single valid JS string/expression.
    json_data = "var graphJson = " + json_data.replace('\\"', '').replace("'", "\\'").replace("\n", "")
    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(json_data)


# start simple HTTP server
def start_server(port):
    """Serve the current working directory over HTTP on *port* (blocks forever)."""
    handler = http.server.SimpleHTTPRequestHandler
    with socketserver.TCPServer(("", port), handler) as httpd:
        print(f"Server started at http://localhost:{port}")
        httpd.serve_forever()


# main function
def visualize_graphml(graphml_file, html_path, port=8000):
    """Convert *graphml_file* to JSON, emit the viewer HTML, and serve it.

    Opens the default browser at the generated page and blocks until the
    user interrupts with Ctrl+C.
    """
    import time  # local import: stdlib, used only for the idle wait below

    json_data = graphml_to_json(graphml_file)
    html_dir = os.path.dirname(html_path)
    # BUG FIX: guard against html_path having no directory component —
    # os.makedirs("") raises FileNotFoundError. exist_ok avoids a race
    # between the existence check and the creation.
    if html_dir:
        os.makedirs(html_dir, exist_ok=True)
    json_path = os.path.join(html_dir, 'graph_json.js')
    create_json(json_data, json_path)
    create_html(html_path)
    # BUG FIX: the original passed target=start_server(port), which CALLS
    # start_server immediately (blocking forever in the main thread) instead
    # of running it in the background. Pass the callable and its args.
    server_thread = threading.Thread(target=start_server, args=(port,), daemon=True)
    server_thread.start()

    # open default browser
    webbrowser.open(f'http://localhost:{port}/{html_path}')

    print("Visualization is ready. Press Ctrl+C to exit.")
    try:
        # Sleep instead of a busy `while True: pass` loop so we don't pin a core.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Shutting down...")


# usage
if __name__ == "__main__":
    graphml_file = r"nano_graphrag_cache_azure_openai_TEST\graph_chunk_entity_relation.graphml"  # replace with your GraphML file path
    html_path = "graph_visualization.html"
    visualize_graphml(graphml_file, html_path, 11236)
+WORKING_DIR = "./nano_graphrag_cache_ollama_TEST" +MODEL = "qwen2" + +EMBED_MODEL = SentenceTransformer( + "sentence-transformers/all-MiniLM-L6-v2", cache_folder=WORKING_DIR, device="cpu" +) + + +# We're using Sentence Transformers to generate embeddings for the BGE model +@wrap_embedding_func_with_attrs( + embedding_dim=EMBED_MODEL.get_sentence_embedding_dimension(), + max_token_size=EMBED_MODEL.max_seq_length, +) +async def local_embedding(texts: list[str]) -> np.ndarray: + return EMBED_MODEL.encode(texts, normalize_embeddings=True) + + +async def ollama_model_if_cache( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + # remove kwargs that are not supported by ollama + kwargs.pop("max_tokens", None) + kwargs.pop("response_format", None) + + ollama_client = ollama.AsyncClient() + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + # Get the cached response if having------------------- + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(MODEL, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + # ----------------------------------------------------- + response = await ollama_client.chat(model=MODEL, messages=messages, **kwargs) + + result = response["message"]["content"] + # Cache the response if having------------------- + if hashing_kv is not None: + await hashing_kv.upsert({args_hash: {"return": result, "model": MODEL}}) + # ----------------------------------------------------- + return result + + +def remove_if_exist(file): + if os.path.exists(file): + os.remove(file) + + +def query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + best_model_func=ollama_model_if_cache, + cheap_model_func=ollama_model_if_cache, + embedding_func=local_embedding, 
+ ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="global") + ) + ) + + +def insert(): + from time import time + + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + + remove_if_exist(f"{WORKING_DIR}/vdb_entities.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json") + remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml") + + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + best_model_func=ollama_model_if_cache, + cheap_model_func=ollama_model_if_cache, + embedding_func=local_embedding, + ) + start = time() + rag.insert(FAKE_TEXT) + print("indexing time:", time() - start) + # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True) + # rag.insert(FAKE_TEXT[half_len:]) + + +if __name__ == "__main__": + insert() + query() diff --git a/nano-graphrag/examples/using_amazon_bedrock.py b/nano-graphrag/examples/using_amazon_bedrock.py new file mode 100644 index 0000000000000000000000000000000000000000..c8aeac46c433db854d56cb6213b235753682f15f --- /dev/null +++ b/nano-graphrag/examples/using_amazon_bedrock.py @@ -0,0 +1,19 @@ +from nano_graphrag import GraphRAG, QueryParam + +graph_func = GraphRAG( + working_dir="../bedrock_example", + using_amazon_bedrock=True, + best_model_id="us.anthropic.claude-3-sonnet-20240229-v1:0", + cheap_model_id="us.anthropic.claude-3-haiku-20240307-v1:0", +) + +with open("../tests/mock_data.txt") as f: + graph_func.insert(f.read()) + +prompt = "What are the top themes in this story?" 
+ +# Perform global graphrag search +print(graph_func.query(prompt, param=QueryParam(mode="global"))) + +# Perform local graphrag search (I think is better and more scalable one) +print(graph_func.query(prompt, param=QueryParam(mode="local"))) diff --git a/nano-graphrag/examples/using_custom_chunking_method.py b/nano-graphrag/examples/using_custom_chunking_method.py new file mode 100644 index 0000000000000000000000000000000000000000..e1550b606d2aad46a4b0ef99857e631e15ef095e --- /dev/null +++ b/nano-graphrag/examples/using_custom_chunking_method.py @@ -0,0 +1,43 @@ +from nano_graphrag._utils import encode_string_by_tiktoken +from nano_graphrag.base import QueryParam +from nano_graphrag.graphrag import GraphRAG +from nano_graphrag._op import chunking_by_seperators + + +def chunking_by_token_size( + tokens_list: list[list[int]], # nano-graphrag may pass a batch of docs' tokens + doc_keys: list[str], # nano-graphrag may pass a batch of docs' key ids + tiktoken_model, # a titoken model + overlap_token_size=128, + max_token_size=1024, +): + + results = [] + for index, tokens in enumerate(tokens_list): + chunk_token = [] + lengths = [] + for start in range(0, len(tokens), max_token_size - overlap_token_size): + + chunk_token.append(tokens[start : start + max_token_size]) + lengths.append(min(max_token_size, len(tokens) - start)) + + chunk_token = tiktoken_model.decode_batch(chunk_token) + for i, chunk in enumerate(chunk_token): + + results.append( + { + "tokens": lengths[i], + "content": chunk.strip(), + "chunk_order_index": i, + "full_doc_id": doc_keys[index], + } + ) + + return results + + +WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST" +rag = GraphRAG( + working_dir=WORKING_DIR, + chunk_func=chunking_by_seperators, +) diff --git a/nano-graphrag/examples/using_deepseek_api_as_llm+glm_api_as_embedding.py b/nano-graphrag/examples/using_deepseek_api_as_llm+glm_api_as_embedding.py new file mode 100644 index 
@dataclass
class EmbeddingFunc:
    """Callable wrapper pairing an embedding coroutine with its metadata."""

    # Metadata consumed by nano-graphrag's embedding/batching machinery.
    embedding_dim: int
    max_token_size: int
    func: callable

    async def __call__(self, *args, **kwargs) -> np.ndarray:
        # Transparently delegate to the wrapped coroutine function.
        return await self.func(*args, **kwargs)


def wrap_embedding_func_with_attrs(**kwargs):
    """Wrap a function with attributes"""

    def decorator(fn) -> EmbeddingFunc:
        wrapped = EmbeddingFunc(func=fn, **kwargs)
        return wrapped

    return decorator
"__main__": + insert() + # query() diff --git a/nano-graphrag/examples/using_deepseek_as_llm.py b/nano-graphrag/examples/using_deepseek_as_llm.py new file mode 100644 index 0000000000000000000000000000000000000000..4c32d12071a810ec267c07b3ebdd6d677bd9680d --- /dev/null +++ b/nano-graphrag/examples/using_deepseek_as_llm.py @@ -0,0 +1,98 @@ +import os +import logging +from openai import AsyncOpenAI +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag.base import BaseKVStorage +from nano_graphrag._utils import compute_args_hash + +logging.basicConfig(level=logging.WARNING) +logging.getLogger("nano-graphrag").setLevel(logging.INFO) + +DEEPSEEK_API_KEY = "sk-XXXX" +MODEL = "deepseek-chat" + + +async def deepseepk_model_if_cache( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_async_client = AsyncOpenAI( + api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com" + ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + # Get the cached response if having------------------- + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(MODEL, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + # ----------------------------------------------------- + + response = await openai_async_client.chat.completions.create( + model=MODEL, messages=messages, **kwargs + ) + + # Cache the response if having------------------- + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response.choices[0].message.content, "model": MODEL}} + ) + # ----------------------------------------------------- + return response.choices[0].message.content + + +def remove_if_exist(file): + if 
os.path.exists(file): + os.remove(file) + + +WORKING_DIR = "./nano_graphrag_cache_deepseek_TEST" + + +def query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + best_model_func=deepseepk_model_if_cache, + cheap_model_func=deepseepk_model_if_cache, + ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="global") + ) + ) + + +def insert(): + from time import time + + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + + remove_if_exist(f"{WORKING_DIR}/vdb_entities.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json") + remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml") + + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + best_model_func=deepseepk_model_if_cache, + cheap_model_func=deepseepk_model_if_cache, + ) + start = time() + rag.insert(FAKE_TEXT) + print("indexing time:", time() - start) + # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True) + # rag.insert(FAKE_TEXT[half_len:]) + + +if __name__ == "__main__": + insert() + # query() diff --git a/nano-graphrag/examples/using_dspy_entity_extraction.py b/nano-graphrag/examples/using_dspy_entity_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..e618e48dff3dccdaaa3415e754d5e033d6399ef8 --- /dev/null +++ b/nano-graphrag/examples/using_dspy_entity_extraction.py @@ -0,0 +1,144 @@ +import os +from openai import AsyncOpenAI +from dotenv import load_dotenv +import logging +import numpy as np +import dspy +from sentence_transformers import SentenceTransformer +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag._llm import gpt_4o_mini_complete +from nano_graphrag._storage import HNSWVectorStorage +from nano_graphrag.base import BaseKVStorage +from nano_graphrag._utils import compute_args_hash, 
wrap_embedding_func_with_attrs +from nano_graphrag.entity_extraction.extract import extract_entities_dspy + +logging.basicConfig(level=logging.WARNING) +logging.getLogger("nano-graphrag").setLevel(logging.DEBUG) + +WORKING_DIR = "./nano_graphrag_cache_using_dspy_entity_extraction" + +load_dotenv() + + +EMBED_MODEL = SentenceTransformer( + "sentence-transformers/all-MiniLM-L6-v2", cache_folder=WORKING_DIR, device="cpu" +) + + +@wrap_embedding_func_with_attrs( + embedding_dim=EMBED_MODEL.get_sentence_embedding_dimension(), + max_token_size=EMBED_MODEL.max_seq_length, +) +async def local_embedding(texts: list[str]) -> np.ndarray: + return EMBED_MODEL.encode(texts, normalize_embeddings=True) + + +async def deepseepk_model_if_cache( + prompt, model: str = "deepseek-chat", system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_async_client = AsyncOpenAI( + api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com" + ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + # Get the cached response if having------------------- + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + # ----------------------------------------------------- + + response = await openai_async_client.chat.completions.create( + model=model, messages=messages, **kwargs + ) + + # Cache the response if having------------------- + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response.choices[0].message.content, "model": model}} + ) + # ----------------------------------------------------- + return response.choices[0].message.content + + + +def remove_if_exist(file): + if 
os.path.exists(file): + os.remove(file) + + +def insert(): + from time import time + + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + + remove_if_exist(f"{WORKING_DIR}/vdb_entities.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json") + remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml") + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + vector_db_storage_cls=HNSWVectorStorage, + vector_db_storage_cls_kwargs={"max_elements": 1000000, "ef_search": 200, "M": 50}, + best_model_max_async=10, + cheap_model_max_async=10, + best_model_func=deepseepk_model_if_cache, + cheap_model_func=deepseepk_model_if_cache, + embedding_func=local_embedding, + entity_extraction_func=extract_entities_dspy + ) + start = time() + rag.insert(FAKE_TEXT) + print("indexing time:", time() - start) + + +def query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + vector_db_storage_cls=HNSWVectorStorage, + vector_db_storage_cls_kwargs={"max_elements": 1000000, "ef_search": 200, "M": 50}, + best_model_max_token_size=8196, + cheap_model_max_token_size=8196, + best_model_max_async=4, + cheap_model_max_async=4, + best_model_func=gpt_4o_mini_complete, + cheap_model_func=gpt_4o_mini_complete, + embedding_func=local_embedding, + entity_extraction_func=extract_entities_dspy + + ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="global") + ) + ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="local") + ) + ) + + +if __name__ == "__main__": + lm = dspy.LM( + model="deepseek/deepseek-chat", + model_type="chat", + api_provider="openai", + api_key=os.environ["DEEPSEEK_API_KEY"], + base_url=os.environ["DEEPSEEK_BASE_URL"], + temperature=1.0, + max_tokens=8192 + ) + 
dspy.settings.configure(lm=lm, experimental=True) + insert() + query() diff --git a/nano-graphrag/examples/using_faiss_as_vextorDB.py b/nano-graphrag/examples/using_faiss_as_vextorDB.py new file mode 100644 index 0000000000000000000000000000000000000000..543d6fbbcf42d8f4268e05c3254b85074f8af297 --- /dev/null +++ b/nano-graphrag/examples/using_faiss_as_vextorDB.py @@ -0,0 +1,97 @@ +import os +import asyncio +import numpy as np +from nano_graphrag.graphrag import GraphRAG, QueryParam +from nano_graphrag._utils import logger +from nano_graphrag.base import BaseVectorStorage +from dataclasses import dataclass +import faiss +import pickle +import logging +import xxhash +logging.getLogger('msal').setLevel(logging.WARNING) +logging.getLogger('azure').setLevel(logging.WARNING) +logging.getLogger("httpx").setLevel(logging.WARNING) + +WORKING_DIR = "./nano_graphrag_cache_faiss_TEST" + +@dataclass +class FAISSStorage(BaseVectorStorage): + + def __post_init__(self): + self._index_file_name = os.path.join( + self.global_config["working_dir"], f"{self.namespace}_faiss.index" + ) + self._metadata_file_name = os.path.join( + self.global_config["working_dir"], f"{self.namespace}_metadata.pkl" + ) + self._max_batch_size = self.global_config["embedding_batch_num"] + + if os.path.exists(self._index_file_name) and os.path.exists(self._metadata_file_name): + self._index = faiss.read_index(self._index_file_name) + with open(self._metadata_file_name, 'rb') as f: + self._metadata = pickle.load(f) + else: + self._index = faiss.IndexIDMap(faiss.IndexFlatIP(self.embedding_func.embedding_dim)) + self._metadata = {} + + async def upsert(self, data: dict[str, dict]): + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + + contents = [v["content"] for v in data.values()] + batches = [ + contents[i : i + self._max_batch_size] + for i in range(0, len(contents), self._max_batch_size) + ] + embeddings_list = await asyncio.gather( + *[self.embedding_func(batch) for batch in batches] + 
) + embeddings = np.concatenate(embeddings_list) + + ids = [] + for k, v in data.items(): + id = xxhash.xxh32_intdigest(k.encode()) + metadata = {k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields} + metadata['id'] = k + self._metadata[id] = metadata + ids.append(id) + + ids = np.array(ids, dtype=np.int64) + self._index.add_with_ids(embeddings, ids) + + + return len(data) + + async def query(self, query, top_k=5): + embedding = await self.embedding_func([query]) + distances, indices = self._index.search(embedding, top_k) + + results = [] + for _, (distance, id) in enumerate(zip(distances[0], indices[0])): + if id != -1: # FAISS returns -1 for empty slots + if id in self._metadata: + metadata = self._metadata[id] + results.append({**metadata, "distance": 1 - distance}) # Convert to cosine distance + + return results + + async def index_done_callback(self): + faiss.write_index(self._index, self._index_file_name) + with open(self._metadata_file_name, 'wb') as f: + pickle.dump(self._metadata, f) + +if __name__ == "__main__": + + graph_func = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + vector_db_storage_cls=FAISSStorage, + ) + + with open(r"tests/mock_data.txt", encoding='utf-8') as f: + graph_func.insert(f.read()[:30000]) + + # Perform global graphrag search + print(graph_func.query("What are the top themes in this story?")) + + \ No newline at end of file diff --git a/nano-graphrag/examples/using_hnsw_as_vectorDB.py b/nano-graphrag/examples/using_hnsw_as_vectorDB.py new file mode 100644 index 0000000000000000000000000000000000000000..0320d1e4a4f95ec1d939d20865b45e00216bdfd1 --- /dev/null +++ b/nano-graphrag/examples/using_hnsw_as_vectorDB.py @@ -0,0 +1,129 @@ +import os +from openai import AsyncOpenAI +from dotenv import load_dotenv +import logging +import numpy as np +from sentence_transformers import SentenceTransformer +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag._llm import gpt_4o_mini_complete +from 
nano_graphrag._storage import HNSWVectorStorage +from nano_graphrag.base import BaseKVStorage +from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs + +logging.basicConfig(level=logging.WARNING) +logging.getLogger("nano-graphrag").setLevel(logging.DEBUG) + +WORKING_DIR = "./nano_graphrag_cache_using_hnsw_as_vectorDB" + +load_dotenv() + + +EMBED_MODEL = SentenceTransformer( + "sentence-transformers/all-MiniLM-L6-v2", cache_folder=WORKING_DIR, device="cpu" +) + + +@wrap_embedding_func_with_attrs( + embedding_dim=EMBED_MODEL.get_sentence_embedding_dimension(), + max_token_size=EMBED_MODEL.max_seq_length, +) +async def local_embedding(texts: list[str]) -> np.ndarray: + return EMBED_MODEL.encode(texts, normalize_embeddings=True) + + +async def deepseepk_model_if_cache( + prompt, model: str = "deepseek-chat", system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_async_client = AsyncOpenAI( + api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com" + ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + # Get the cached response if having------------------- + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + # ----------------------------------------------------- + + response = await openai_async_client.chat.completions.create( + model=model, messages=messages, **kwargs + ) + + # Cache the response if having------------------- + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response.choices[0].message.content, "model": model}} + ) + # ----------------------------------------------------- + return 
response.choices[0].message.content + + + +def remove_if_exist(file): + if os.path.exists(file): + os.remove(file) + + +def insert(): + from time import time + + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + + remove_if_exist(f"{WORKING_DIR}/vdb_entities.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json") + remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml") + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + vector_db_storage_cls=HNSWVectorStorage, + vector_db_storage_cls_kwargs={"max_elements": 1000000, "ef_search": 200, "M": 50}, + best_model_max_async=10, + cheap_model_max_async=10, + best_model_func=deepseepk_model_if_cache, + cheap_model_func=deepseepk_model_if_cache, + embedding_func=local_embedding + ) + start = time() + rag.insert(FAKE_TEXT) + print("indexing time:", time() - start) + + +def query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + vector_db_storage_cls=HNSWVectorStorage, + vector_db_storage_cls_kwargs={"max_elements": 1000000, "ef_search": 200, "M": 50}, + best_model_max_token_size=8196, + cheap_model_max_token_size=8196, + best_model_max_async=4, + cheap_model_max_async=4, + best_model_func=gpt_4o_mini_complete, + cheap_model_func=gpt_4o_mini_complete, + embedding_func=local_embedding + ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="global") + ) + ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="local") + ) + ) + + +if __name__ == "__main__": + insert() + query() diff --git a/nano-graphrag/examples/using_llm_api_as_llm+ollama_embedding.py b/nano-graphrag/examples/using_llm_api_as_llm+ollama_embedding.py new file mode 100644 index 
0000000000000000000000000000000000000000..04478eb1ae66ee96621416f6ce4a8f78b73c1636 --- /dev/null +++ b/nano-graphrag/examples/using_llm_api_as_llm+ollama_embedding.py @@ -0,0 +1,122 @@ +import os +import logging +import ollama +import numpy as np +from openai import AsyncOpenAI +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag.base import BaseKVStorage +from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs + +logging.basicConfig(level=logging.WARNING) +logging.getLogger("nano-graphrag").setLevel(logging.INFO) + +# Assumed llm model settings +LLM_BASE_URL = "https://your.api.url" +LLM_API_KEY = "your_api_key" +MODEL = "your_model_name" + +# Assumed embedding model settings +EMBEDDING_MODEL = "nomic-embed-text" +EMBEDDING_MODEL_DIM = 768 +EMBEDDING_MODEL_MAX_TOKENS = 8192 + + +async def llm_model_if_cache( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_async_client = AsyncOpenAI( + api_key=LLM_API_KEY, base_url=LLM_BASE_URL + ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + # Get the cached response if having------------------- + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(MODEL, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + # ----------------------------------------------------- + + response = await openai_async_client.chat.completions.create( + model=MODEL, messages=messages, **kwargs + ) + + # Cache the response if having------------------- + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response.choices[0].message.content, "model": MODEL}} + ) + # 
----------------------------------------------------- + return response.choices[0].message.content + + +def remove_if_exist(file): + if os.path.exists(file): + os.remove(file) + + +WORKING_DIR = "./nano_graphrag_cache_llm_TEST" + + +def query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + best_model_func=llm_model_if_cache, + cheap_model_func=llm_model_if_cache, + embedding_func=ollama_embedding, + ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="global") + ) + ) + + +def insert(): + from time import time + + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + + remove_if_exist(f"{WORKING_DIR}/vdb_entities.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json") + remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml") + + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + best_model_func=llm_model_if_cache, + cheap_model_func=llm_model_if_cache, + embedding_func=ollama_embedding, + ) + start = time() + rag.insert(FAKE_TEXT) + print("indexing time:", time() - start) + # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True) + # rag.insert(FAKE_TEXT[half_len:]) + +# We're using Ollama to generate embeddings for the BGE model +@wrap_embedding_func_with_attrs( + embedding_dim= EMBEDDING_MODEL_DIM, + max_token_size= EMBEDDING_MODEL_MAX_TOKENS, +) + +async def ollama_embedding(texts :list[str]) -> np.ndarray: + embed_text = [] + for text in texts: + data = ollama.embeddings(model=EMBEDDING_MODEL, prompt=text) + embed_text.append(data["embedding"]) + + return embed_text + +if __name__ == "__main__": + insert() + query() diff --git a/nano-graphrag/examples/using_local_embedding_model.py b/nano-graphrag/examples/using_local_embedding_model.py new file mode 100644 index 
0000000000000000000000000000000000000000..aeff17bb358503b7b57824c8801693681e81655b --- /dev/null +++ b/nano-graphrag/examples/using_local_embedding_model.py @@ -0,0 +1,38 @@ +import sys + +sys.path.append("..") +import logging +import numpy as np +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag._utils import wrap_embedding_func_with_attrs +from sentence_transformers import SentenceTransformer + +logging.basicConfig(level=logging.WARNING) +logging.getLogger("nano-graphrag").setLevel(logging.INFO) + +WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST" + +EMBED_MODEL = SentenceTransformer( + "sentence-transformers/all-MiniLM-L6-v2", cache_folder=WORKING_DIR, device="cpu" +) + + +# We're using Sentence Transformers to generate embeddings for the BGE model +@wrap_embedding_func_with_attrs( + embedding_dim=EMBED_MODEL.get_sentence_embedding_dimension(), + max_token_size=EMBED_MODEL.max_seq_length, +) +async def local_embedding(texts: list[str]) -> np.ndarray: + return EMBED_MODEL.encode(texts, normalize_embeddings=True) + + +rag = GraphRAG( + working_dir=WORKING_DIR, + embedding_func=local_embedding, +) + +with open("../tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + +# rag.insert(FAKE_TEXT) +print(rag.query("What the main theme of this story?", param=QueryParam(mode="local"))) diff --git a/nano-graphrag/examples/using_milvus_as_vectorDB.py b/nano-graphrag/examples/using_milvus_as_vectorDB.py new file mode 100644 index 0000000000000000000000000000000000000000..bacc36905336c55e4da29761dab116d9b429c5e3 --- /dev/null +++ b/nano-graphrag/examples/using_milvus_as_vectorDB.py @@ -0,0 +1,94 @@ +import os +import asyncio +import numpy as np +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag._utils import logger +from nano_graphrag.base import BaseVectorStorage +from dataclasses import dataclass + + +@dataclass +class MilvusLiteStorge(BaseVectorStorage): + + @staticmethod + def 
create_collection_if_not_exist(client, collection_name: str, **kwargs): + if client.has_collection(collection_name): + return + # TODO add constants for ID max length to 32 + client.create_collection( + collection_name, max_length=32, id_type="string", **kwargs + ) + + def __post_init__(self): + from pymilvus import MilvusClient + + self._client_file_name = os.path.join( + self.global_config["working_dir"], "milvus_lite.db" + ) + self._client = MilvusClient(self._client_file_name) + self._max_batch_size = self.global_config["embedding_batch_num"] + MilvusLiteStorge.create_collection_if_not_exist( + self._client, + self.namespace, + dimension=self.embedding_func.embedding_dim, + ) + + async def upsert(self, data: dict[str, dict]): + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + list_data = [ + { + "id": k, + **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, + } + for k, v in data.items() + ] + contents = [v["content"] for v in data.values()] + batches = [ + contents[i : i + self._max_batch_size] + for i in range(0, len(contents), self._max_batch_size) + ] + embeddings_list = await asyncio.gather( + *[self.embedding_func(batch) for batch in batches] + ) + embeddings = np.concatenate(embeddings_list) + for i, d in enumerate(list_data): + d["vector"] = embeddings[i] + results = self._client.upsert(collection_name=self.namespace, data=list_data) + return results + + async def query(self, query, top_k=5): + embedding = await self.embedding_func([query]) + results = self._client.search( + collection_name=self.namespace, + data=embedding, + limit=top_k, + output_fields=list(self.meta_fields), + search_params={"metric_type": "COSINE", "params": {"radius": 0.2}}, + ) + return [ + {**dp["entity"], "id": dp["id"], "distance": dp["distance"]} + for dp in results[0] + ] + + +def insert(): + data = ["YOUR TEXT DATA HERE", "YOUR TEXT DATA HERE"] + rag = GraphRAG( + working_dir="./nano_graphrag_cache_milvus_TEST", + enable_llm_cache=True, + 
vector_db_storage_cls=MilvusLiteStorge, + ) + rag.insert(data) + + +def query(): + rag = GraphRAG( + working_dir="./nano_graphrag_cache_milvus_TEST", + enable_llm_cache=True, + vector_db_storage_cls=MilvusLiteStorge, + ) + print(rag.query("YOUR QUERY HERE", param=QueryParam(mode="local"))) + + +insert() +query() diff --git a/nano-graphrag/examples/using_ollama_as_llm.py b/nano-graphrag/examples/using_ollama_as_llm.py new file mode 100644 index 0000000000000000000000000000000000000000..e067212ba4738b66bf6c69701acd778638444e46 --- /dev/null +++ b/nano-graphrag/examples/using_ollama_as_llm.py @@ -0,0 +1,96 @@ +import os +import logging +import ollama +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag.base import BaseKVStorage +from nano_graphrag._utils import compute_args_hash + +logging.basicConfig(level=logging.WARNING) +logging.getLogger("nano-graphrag").setLevel(logging.INFO) + +# !!! qwen2-7B maybe produce unparsable results and cause the extraction of graph to fail. 
+MODEL = "qwen2" + + +async def ollama_model_if_cache( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + # remove kwargs that are not supported by ollama + kwargs.pop("max_tokens", None) + kwargs.pop("response_format", None) + + ollama_client = ollama.AsyncClient() + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + # Get the cached response if having------------------- + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(MODEL, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + # ----------------------------------------------------- + response = await ollama_client.chat(model=MODEL, messages=messages, **kwargs) + + result = response["message"]["content"] + # Cache the response if having------------------- + if hashing_kv is not None: + await hashing_kv.upsert({args_hash: {"return": result, "model": MODEL}}) + # ----------------------------------------------------- + return result + + +def remove_if_exist(file): + if os.path.exists(file): + os.remove(file) + + +WORKING_DIR = "./nano_graphrag_cache_ollama_TEST" + + +def query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + best_model_func=ollama_model_if_cache, + cheap_model_func=ollama_model_if_cache, + ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="global") + ) + ) + + +def insert(): + from time import time + + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + + remove_if_exist(f"{WORKING_DIR}/vdb_entities.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json") + 
remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml") + + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + best_model_func=ollama_model_if_cache, + cheap_model_func=ollama_model_if_cache, + ) + start = time() + rag.insert(FAKE_TEXT) + print("indexing time:", time() - start) + # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True) + # rag.insert(FAKE_TEXT[half_len:]) + + +if __name__ == "__main__": + insert() + query() diff --git a/nano-graphrag/examples/using_ollama_as_llm_and_embedding.py b/nano-graphrag/examples/using_ollama_as_llm_and_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..44d669db8635e5523479df63c1011b6b053e5829 --- /dev/null +++ b/nano-graphrag/examples/using_ollama_as_llm_and_embedding.py @@ -0,0 +1,120 @@ +import os +import sys + +sys.path.append("..") +import logging +import ollama +import numpy as np +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag.base import BaseKVStorage +from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs + +logging.basicConfig(level=logging.WARNING) +logging.getLogger("nano-graphrag").setLevel(logging.INFO) + +# Assumed llm model settings +MODEL = "your_model_name" + +# Assumed embedding model settings +EMBEDDING_MODEL = "nomic-embed-text" +EMBEDDING_MODEL_DIM = 768 +EMBEDDING_MODEL_MAX_TOKENS = 8192 + + +async def ollama_model_if_cache( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + # remove kwargs that are not supported by ollama + kwargs.pop("max_tokens", None) + kwargs.pop("response_format", None) + + ollama_client = ollama.AsyncClient() + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + # Get the cached response if having------------------- + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not 
None: + args_hash = compute_args_hash(MODEL, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + # ----------------------------------------------------- + response = await ollama_client.chat(model=MODEL, messages=messages, **kwargs) + + result = response["message"]["content"] + # Cache the response if having------------------- + if hashing_kv is not None: + await hashing_kv.upsert({args_hash: {"return": result, "model": MODEL}}) + # ----------------------------------------------------- + return result + + +def remove_if_exist(file): + if os.path.exists(file): + os.remove(file) + + +WORKING_DIR = "./nano_graphrag_cache_ollama_TEST" + + +def query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + best_model_func=ollama_model_if_cache, + cheap_model_func=ollama_model_if_cache, + embedding_func=ollama_embedding, + ) + print( + rag.query( + "What are the top themes in this story?", param=QueryParam(mode="global") + ) + ) + + +def insert(): + from time import time + + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + + remove_if_exist(f"{WORKING_DIR}/vdb_entities.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json") + remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml") + + rag = GraphRAG( + working_dir=WORKING_DIR, + enable_llm_cache=True, + best_model_func=ollama_model_if_cache, + cheap_model_func=ollama_model_if_cache, + embedding_func=ollama_embedding, + ) + start = time() + rag.insert(FAKE_TEXT) + print("indexing time:", time() - start) + # rag = GraphRAG(working_dir=WORKING_DIR, enable_llm_cache=True) + # rag.insert(FAKE_TEXT[half_len:]) + + +# We're using Ollama to generate embeddings for the BGE model +@wrap_embedding_func_with_attrs( + embedding_dim=EMBEDDING_MODEL_DIM, + 
max_token_size=EMBEDDING_MODEL_MAX_TOKENS, +) +async def ollama_embedding(texts: list[str]) -> np.ndarray: + embed_text = [] + for text in texts: + data = ollama.embeddings(model=EMBEDDING_MODEL, prompt=text) + embed_text.append(data["embedding"]) + + return embed_text + + +if __name__ == "__main__": + insert() + query() diff --git a/nano-graphrag/examples/using_qdrant_as_vectorDB.py b/nano-graphrag/examples/using_qdrant_as_vectorDB.py new file mode 100644 index 0000000000000000000000000000000000000000..280df68010c12856d4a1a7d7fe2a857e93bb0c04 --- /dev/null +++ b/nano-graphrag/examples/using_qdrant_as_vectorDB.py @@ -0,0 +1,113 @@ +import os +import asyncio +import uuid +import numpy as np +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag._utils import logger +from nano_graphrag.base import BaseVectorStorage +from dataclasses import dataclass + +try: + from qdrant_client import QdrantClient + from qdrant_client.models import VectorParams, Distance, PointStruct, SearchParams +except ImportError as original_error: + raise ImportError( + "Qdrant client is not installed. Install it using: pip install qdrant-client\n" + ) from original_error + + +@dataclass +class QdrantStorage(BaseVectorStorage): + def __post_init__(self): + + # Use a local file-based Qdrant storage + # Useful for prototyping and CI. 
+ # For production, refer to: + # https://qdrant.tech/documentation/guides/installation/ + self._client_file_path = os.path.join( + self.global_config["working_dir"], "qdrant_storage" + ) + + self._client = QdrantClient(path=self._client_file_path) + + self._max_batch_size = self.global_config["embedding_batch_num"] + + if not self._client.collection_exists(collection_name=self.namespace): + self._client.create_collection( + collection_name=self.namespace, + vectors_config=VectorParams( + size=self.embedding_func.embedding_dim, distance=Distance.COSINE + ), + ) + + async def upsert(self, data: dict[str, dict]): + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + + list_data = [ + { + "id": k, + **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, + } + for k, v in data.items() + ] + + contents = [v["content"] for v in data.values()] + batches = [ + contents[i : i + self._max_batch_size] + for i in range(0, len(contents), self._max_batch_size) + ] + + embeddings_list = await asyncio.gather( + *[self.embedding_func(batch) for batch in batches] + ) + embeddings = np.concatenate(embeddings_list) + + points = [ + PointStruct( + id=uuid.uuid4().hex, + vector=embeddings[i].tolist(), + payload=data, + ) + for i, data in enumerate(list_data) + ] + + results = self._client.upsert(collection_name=self.namespace, points=points) + return results + + async def query(self, query, top_k=5): + embedding = await self.embedding_func([query]) + + results = self._client.query_points( + collection_name=self.namespace, + query=embedding[0].tolist(), + limit=top_k, + ).points + + return [ + {**result.payload, "score": result.score} + for result in results + ] + + +def insert(): + data = ["YOUR TEXT DATA HERE", "YOUR TEXT DATA HERE"] + rag = GraphRAG( + working_dir="./nano_graphrag_cache_qdrant_TEST", + enable_llm_cache=True, + vector_db_storage_cls=QdrantStorage, + ) + rag.insert(data) + + +def query(): + rag = GraphRAG( + 
working_dir="./nano_graphrag_cache_qdrant_TEST", + enable_llm_cache=True, + vector_db_storage_cls=QdrantStorage, + ) + print(rag.query("YOUR QUERY HERE", param=QueryParam(mode="local"))) + + +if __name__ == "__main__": + insert() + query() diff --git a/nano-graphrag/nano_graphrag/__init__.py b/nano-graphrag/nano_graphrag/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fe20e4ba5d62944f971eb1955c67021fecf515e5 --- /dev/null +++ b/nano-graphrag/nano_graphrag/__init__.py @@ -0,0 +1,7 @@ +from .graphrag import GraphRAG, QueryParam + +__version__ = "0.0.8.2" +__author__ = "Jianbai Ye" +__url__ = "https://github.com/gusye1234/nano-graphrag" + +# dp stands for data pack diff --git a/nano-graphrag/nano_graphrag/_llm.py b/nano-graphrag/nano_graphrag/_llm.py new file mode 100644 index 0000000000000000000000000000000000000000..974c33911badc4d491d9b5420f68bad0e0c18d79 --- /dev/null +++ b/nano-graphrag/nano_graphrag/_llm.py @@ -0,0 +1,294 @@ +import json +import numpy as np +from typing import Optional, List, Any, Callable + +import aioboto3 +from openai import AsyncOpenAI, AsyncAzureOpenAI, APIConnectionError, RateLimitError + +from tenacity import ( + retry, + stop_after_attempt, + wait_exponential, + retry_if_exception_type, +) +import os + +from ._utils import compute_args_hash, wrap_embedding_func_with_attrs +from .base import BaseKVStorage + +global_openai_async_client = None +global_azure_openai_async_client = None +global_amazon_bedrock_async_client = None + + +def get_openai_async_client_instance(): + global global_openai_async_client + if global_openai_async_client is None: + global_openai_async_client = AsyncOpenAI() + return global_openai_async_client + + +def get_azure_openai_async_client_instance(): + global global_azure_openai_async_client + if global_azure_openai_async_client is None: + global_azure_openai_async_client = AsyncAzureOpenAI() + return global_azure_openai_async_client + + +def 
get_amazon_bedrock_async_client_instance(): + global global_amazon_bedrock_async_client + if global_amazon_bedrock_async_client is None: + global_amazon_bedrock_async_client = aioboto3.Session() + return global_amazon_bedrock_async_client + + +@retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError)), +) +async def openai_complete_if_cache( + model, prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_async_client = get_openai_async_client_instance() + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + response = await openai_async_client.chat.completions.create( + model=model, messages=messages, **kwargs + ) + + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response.choices[0].message.content, "model": model}} + ) + await hashing_kv.index_done_callback() + return response.choices[0].message.content + + +@retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError)), +) +async def amazon_bedrock_complete_if_cache( + model, prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + amazon_bedrock_async_client = get_amazon_bedrock_async_client_instance() + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages = [] + messages.extend(history_messages) + messages.append({"role": "user", "content": [{"text": prompt}]}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, 
messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + inference_config = { + "temperature": 0, + "maxTokens": 4096 if "max_tokens" not in kwargs else kwargs["max_tokens"], + } + + async with amazon_bedrock_async_client.client( + "bedrock-runtime", + region_name=os.getenv("AWS_REGION", "us-east-1") + ) as bedrock_runtime: + if system_prompt: + response = await bedrock_runtime.converse( + modelId=model, messages=messages, inferenceConfig=inference_config, + system=[{"text": system_prompt}] + ) + else: + response = await bedrock_runtime.converse( + modelId=model, messages=messages, inferenceConfig=inference_config, + ) + + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response["output"]["message"]["content"][0]["text"], "model": model}} + ) + await hashing_kv.index_done_callback() + return response["output"]["message"]["content"][0]["text"] + + +def create_amazon_bedrock_complete_function(model_id: str) -> Callable: + """ + Factory function to dynamically create completion functions for Amazon Bedrock + + Args: + model_id (str): Amazon Bedrock model identifier (e.g., "us.anthropic.claude-3-sonnet-20240229-v1:0") + + Returns: + Callable: Generated completion function + """ + async def bedrock_complete( + prompt: str, + system_prompt: Optional[str] = None, + history_messages: List[Any] = [], + **kwargs + ) -> str: + return await amazon_bedrock_complete_if_cache( + model_id, + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs + ) + + # Set function name for easier debugging + bedrock_complete.__name__ = f"{model_id}_complete" + + return bedrock_complete + + +async def gpt_4o_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "gpt-4o", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + +async def 
gpt_4o_mini_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "gpt-4o-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + +@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192) +@retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError)), +) +async def amazon_bedrock_embedding(texts: list[str]) -> np.ndarray: + amazon_bedrock_async_client = get_amazon_bedrock_async_client_instance() + + async with amazon_bedrock_async_client.client( + "bedrock-runtime", + region_name=os.getenv("AWS_REGION", "us-east-1") + ) as bedrock_runtime: + embeddings = [] + for text in texts: + body = json.dumps( + { + "inputText": text, + "dimensions": 1024, + } + ) + response = await bedrock_runtime.invoke_model( + modelId="amazon.titan-embed-text-v2:0", body=body, + ) + response_body = await response.get("body").read() + embeddings.append(json.loads(response_body)) + return np.array([dp["embedding"] for dp in embeddings]) + + +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) +@retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError)), +) +async def openai_embedding(texts: list[str]) -> np.ndarray: + openai_async_client = get_openai_async_client_instance() + response = await openai_async_client.embeddings.create( + model="text-embedding-3-small", input=texts, encoding_format="float" + ) + return np.array([dp.embedding for dp in response.data]) + + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError)), +) +async def azure_openai_complete_if_cache( + deployment_name, prompt, system_prompt=None, history_messages=[], 
**kwargs +) -> str: + azure_openai_client = get_azure_openai_async_client_instance() + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(deployment_name, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + response = await azure_openai_client.chat.completions.create( + model=deployment_name, messages=messages, **kwargs + ) + + if hashing_kv is not None: + await hashing_kv.upsert( + { + args_hash: { + "return": response.choices[0].message.content, + "model": deployment_name, + } + } + ) + await hashing_kv.index_done_callback() + return response.choices[0].message.content + + +async def azure_gpt_4o_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await azure_openai_complete_if_cache( + "gpt-4o", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + +async def azure_gpt_4o_mini_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await azure_openai_complete_if_cache( + "gpt-4o-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError)), +) +async def azure_openai_embedding(texts: list[str]) -> np.ndarray: + azure_openai_client = get_azure_openai_async_client_instance() + response = await azure_openai_client.embeddings.create( + model="text-embedding-3-small", input=texts, encoding_format="float" + ) + return 
np.array([dp.embedding for dp in response.data]) diff --git a/nano-graphrag/nano_graphrag/_op.py b/nano-graphrag/nano_graphrag/_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b8cef88184a538344be5a9f47d0f793e9a76f4a4 --- /dev/null +++ b/nano-graphrag/nano_graphrag/_op.py @@ -0,0 +1,1140 @@ +import re +import json +import asyncio +from typing import Union +from collections import Counter, defaultdict +from ._splitter import SeparatorSplitter +from ._utils import ( + logger, + clean_str, + compute_mdhash_id, + is_float_regex, + list_of_list_to_csv, + pack_user_ass_to_openai_messages, + split_string_by_multi_markers, + truncate_list_by_token_size, + + TokenizerWrapper +) +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, + SingleCommunitySchema, + CommunitySchema, + TextChunkSchema, + QueryParam, +) +from .prompt import GRAPH_FIELD_SEP, PROMPTS + + +def chunking_by_token_size( + tokens_list: list[list[int]], + doc_keys, + tokenizer_wrapper: TokenizerWrapper, + overlap_token_size=128, + max_token_size=1024, +): + results = [] + for index, tokens in enumerate(tokens_list): + chunk_token = [] + lengths = [] + for start in range(0, len(tokens), max_token_size - overlap_token_size): + chunk_token.append(tokens[start : start + max_token_size]) + lengths.append(min(max_token_size, len(tokens) - start)) + + + chunk_texts = tokenizer_wrapper.decode_batch(chunk_token) + + for i, chunk in enumerate(chunk_texts): + results.append( + { + "tokens": lengths[i], + "content": chunk.strip(), + "chunk_order_index": i, + "full_doc_id": doc_keys[index], + } + ) + return results + + +def chunking_by_seperators( + tokens_list: list[list[int]], + doc_keys, + tokenizer_wrapper: TokenizerWrapper, + overlap_token_size=128, + max_token_size=1024, +): + from .prompt import PROMPTS + # *** 修改 ***: 直接使用 wrapper 编码,而不是获取底层 tokenizer + separators = [tokenizer_wrapper.encode(s) for s in PROMPTS["default_text_separator"]] + splitter = 
SeparatorSplitter( + separators=separators, + chunk_size=max_token_size, + chunk_overlap=overlap_token_size, + ) + results = [] + for index, tokens in enumerate(tokens_list): + chunk_tokens = splitter.split_tokens(tokens) + lengths = [len(c) for c in chunk_tokens] + + decoded_chunks = tokenizer_wrapper.decode_batch(chunk_tokens) + for i, chunk in enumerate(decoded_chunks): + results.append( + { + "tokens": lengths[i], + "content": chunk.strip(), + "chunk_order_index": i, + "full_doc_id": doc_keys[index], + } + ) + return results + + +def get_chunks(new_docs, chunk_func=chunking_by_token_size, tokenizer_wrapper: TokenizerWrapper = None, **chunk_func_params): + inserting_chunks = {} + new_docs_list = list(new_docs.items()) + docs = [new_doc[1]["content"] for new_doc in new_docs_list] + doc_keys = [new_doc[0] for new_doc in new_docs_list] + + tokens = [tokenizer_wrapper.encode(doc) for doc in docs] + chunks = chunk_func( + tokens, doc_keys=doc_keys, tokenizer_wrapper=tokenizer_wrapper, overlap_token_size=chunk_func_params.get("overlap_token_size", 128), max_token_size=chunk_func_params.get("max_token_size", 1024) + ) + for chunk in chunks: + inserting_chunks.update( + {compute_mdhash_id(chunk["content"], prefix="chunk-"): chunk} + ) + return inserting_chunks + + +async def _handle_entity_relation_summary( + entity_or_relation_name: str, + description: str, + global_config: dict, + tokenizer_wrapper: TokenizerWrapper, +) -> str: + use_llm_func: callable = global_config["cheap_model_func"] + llm_max_tokens = global_config["cheap_model_max_token_size"] + summary_max_tokens = global_config["entity_summary_to_max_tokens"] + + + tokens = tokenizer_wrapper.encode(description) + if len(tokens) < summary_max_tokens: + return description + prompt_template = PROMPTS["summarize_entity_descriptions"] + + use_description = tokenizer_wrapper.decode(tokens[:llm_max_tokens]) + context_base = dict( + entity_name=entity_or_relation_name, + 
description_list=use_description.split(GRAPH_FIELD_SEP), + ) + use_prompt = prompt_template.format(**context_base) + logger.debug(f"Trigger summary: {entity_or_relation_name}") + summary = await use_llm_func(use_prompt, max_tokens=summary_max_tokens) + return summary + + +async def _handle_single_entity_extraction( + record_attributes: list[str], + chunk_key: str, +): + if len(record_attributes) < 4 or record_attributes[0] != '"entity"': + return None + # add this record as a node in the G + entity_name = clean_str(record_attributes[1].upper()) + if not entity_name.strip(): + return None + entity_type = clean_str(record_attributes[2].upper()) + entity_description = clean_str(record_attributes[3]) + entity_source_id = chunk_key + return dict( + entity_name=entity_name, + entity_type=entity_type, + description=entity_description, + source_id=entity_source_id, + ) + + +async def _handle_single_relationship_extraction( + record_attributes: list[str], + chunk_key: str, +): + if len(record_attributes) < 5 or record_attributes[0] != '"relationship"': + return None + # add this record as edge + source = clean_str(record_attributes[1].upper()) + target = clean_str(record_attributes[2].upper()) + edge_description = clean_str(record_attributes[3]) + edge_source_id = chunk_key + weight = ( + float(record_attributes[-1]) if is_float_regex(record_attributes[-1]) else 1.0 + ) + return dict( + src_id=source, + tgt_id=target, + weight=weight, + description=edge_description, + source_id=edge_source_id, + ) + + +async def _merge_nodes_then_upsert( + entity_name: str, + nodes_data: list[dict], + knwoledge_graph_inst: BaseGraphStorage, + global_config: dict, + tokenizer_wrapper, +): + already_entitiy_types = [] + already_source_ids = [] + already_description = [] + + already_node = await knwoledge_graph_inst.get_node(entity_name) + if already_node is not None: + already_entitiy_types.append(already_node["entity_type"]) + already_source_ids.extend( + 
split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP]) + ) + already_description.append(already_node["description"]) + + entity_type = sorted( + Counter( + [dp["entity_type"] for dp in nodes_data] + already_entitiy_types + ).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + description = GRAPH_FIELD_SEP.join( + sorted(set([dp["description"] for dp in nodes_data] + already_description)) + ) + source_id = GRAPH_FIELD_SEP.join( + set([dp["source_id"] for dp in nodes_data] + already_source_ids) + ) + description = await _handle_entity_relation_summary( + entity_name, description, global_config, tokenizer_wrapper + ) + node_data = dict( + entity_type=entity_type, + description=description, + source_id=source_id, + ) + await knwoledge_graph_inst.upsert_node( + entity_name, + node_data=node_data, + ) + node_data["entity_name"] = entity_name + return node_data + + +async def _merge_edges_then_upsert( + src_id: str, + tgt_id: str, + edges_data: list[dict], + knwoledge_graph_inst: BaseGraphStorage, + global_config: dict, + tokenizer_wrapper, +): + already_weights = [] + already_source_ids = [] + already_description = [] + already_order = [] + if await knwoledge_graph_inst.has_edge(src_id, tgt_id): + already_edge = await knwoledge_graph_inst.get_edge(src_id, tgt_id) + already_weights.append(already_edge["weight"]) + already_source_ids.extend( + split_string_by_multi_markers(already_edge["source_id"], [GRAPH_FIELD_SEP]) + ) + already_description.append(already_edge["description"]) + already_order.append(already_edge.get("order", 1)) + + # [numberchiffre]: `Relationship.order` is only returned from DSPy's predictions + order = min([dp.get("order", 1) for dp in edges_data] + already_order) + weight = sum([dp["weight"] for dp in edges_data] + already_weights) + description = GRAPH_FIELD_SEP.join( + sorted(set([dp["description"] for dp in edges_data] + already_description)) + ) + source_id = GRAPH_FIELD_SEP.join( + set([dp["source_id"] for dp in 
edges_data] + already_source_ids) + ) + for need_insert_id in [src_id, tgt_id]: + if not (await knwoledge_graph_inst.has_node(need_insert_id)): + await knwoledge_graph_inst.upsert_node( + need_insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": '"UNKNOWN"', + }, + ) + description = await _handle_entity_relation_summary( + (src_id, tgt_id), description, global_config, tokenizer_wrapper + ) + await knwoledge_graph_inst.upsert_edge( + src_id, + tgt_id, + edge_data=dict( + weight=weight, description=description, source_id=source_id, order=order + ), + ) + + +async def extract_entities( + chunks: dict[str, TextChunkSchema], + knwoledge_graph_inst: BaseGraphStorage, + entity_vdb: BaseVectorStorage, + tokenizer_wrapper, + global_config: dict, + using_amazon_bedrock: bool=False, +) -> Union[BaseGraphStorage, None]: + use_llm_func: callable = global_config["best_model_func"] + entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"] + + ordered_chunks = list(chunks.items()) + + entity_extract_prompt = PROMPTS["entity_extraction"] + context_base = dict( + tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"], + record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"], + completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"], + entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]), + ) + continue_prompt = PROMPTS["entiti_continue_extraction"] + if_loop_prompt = PROMPTS["entiti_if_loop_extraction"] + + already_processed = 0 + already_entities = 0 + already_relations = 0 + + async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): + nonlocal already_processed, already_entities, already_relations + chunk_key = chunk_key_dp[0] + chunk_dp = chunk_key_dp[1] + content = chunk_dp["content"] + hint_prompt = entity_extract_prompt.format(**context_base, input_text=content) + final_result = await use_llm_func(hint_prompt) + if isinstance(final_result, list): + final_result = final_result[0]["text"] + + 
async def extract_entities(
    chunks: dict[str, TextChunkSchema],
    knwoledge_graph_inst: BaseGraphStorage,
    entity_vdb: BaseVectorStorage,
    tokenizer_wrapper,
    global_config: dict,
    using_amazon_bedrock: bool=False,
) -> Union[BaseGraphStorage, None]:
    """Extract entities/relations from every chunk with the LLM and merge them
    into the graph (and optionally the entity vector store).

    Returns the graph storage on success, or None when no entity at all was
    extracted (usually a sign the LLM backend is misbehaving).
    """
    use_llm_func: callable = global_config["best_model_func"]
    entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"]

    ordered_chunks = list(chunks.items())

    entity_extract_prompt = PROMPTS["entity_extraction"]
    context_base = dict(
        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
        completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
        entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
    )
    continue_prompt = PROMPTS["entiti_continue_extraction"]
    if_loop_prompt = PROMPTS["entiti_if_loop_extraction"]

    # Progress counters shared (via nonlocal) by all concurrent chunk tasks.
    already_processed = 0
    already_entities = 0
    already_relations = 0

    async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
        # Extract records from one chunk; returns (nodes_by_name, edges_by_pair).
        nonlocal already_processed, already_entities, already_relations
        chunk_key = chunk_key_dp[0]
        chunk_dp = chunk_key_dp[1]
        content = chunk_dp["content"]
        hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
        final_result = await use_llm_func(hint_prompt)
        if isinstance(final_result, list):
            # Bedrock-style responses come back as a list of content parts.
            final_result = final_result[0]["text"]

        history = pack_user_ass_to_openai_messages(hint_prompt, final_result, using_amazon_bedrock)
        # "Gleaning": ask the LLM up to N more times whether it missed records.
        for now_glean_index in range(entity_extract_max_gleaning):
            glean_result = await use_llm_func(continue_prompt, history_messages=history)

            history += pack_user_ass_to_openai_messages(continue_prompt, glean_result, using_amazon_bedrock)
            final_result += glean_result
            if now_glean_index == entity_extract_max_gleaning - 1:
                break

            # Stop early when the model says nothing is left to extract.
            if_loop_result: str = await use_llm_func(
                if_loop_prompt, history_messages=history
            )
            if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
            if if_loop_result != "yes":
                break

        records = split_string_by_multi_markers(
            final_result,
            [context_base["record_delimiter"], context_base["completion_delimiter"]],
        )

        maybe_nodes = defaultdict(list)
        maybe_edges = defaultdict(list)
        for record in records:
            # Each record is wrapped in parentheses; skip anything else.
            record = re.search(r"\((.*)\)", record)
            if record is None:
                continue
            record = record.group(1)
            record_attributes = split_string_by_multi_markers(
                record, [context_base["tuple_delimiter"]]
            )
            # A record parses as either an entity or a relationship, not both.
            if_entities = await _handle_single_entity_extraction(
                record_attributes, chunk_key
            )
            if if_entities is not None:
                maybe_nodes[if_entities["entity_name"]].append(if_entities)
                continue

            if_relation = await _handle_single_relationship_extraction(
                record_attributes, chunk_key
            )
            if if_relation is not None:
                maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
                    if_relation
                )
        already_processed += 1
        already_entities += len(maybe_nodes)
        already_relations += len(maybe_edges)
        now_ticks = PROMPTS["process_tickers"][
            already_processed % len(PROMPTS["process_tickers"])
        ]
        print(
            f"{now_ticks} Processed {already_processed}({already_processed*100//len(ordered_chunks)}%) chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
            end="",
            flush=True,
        )
        return dict(maybe_nodes), dict(maybe_edges)

    # use_llm_func is wrapped in asyncio.Semaphore, limiting max_async callings
    results = await asyncio.gather(
        *[_process_single_content(c) for c in ordered_chunks]
    )
    print()  # clear the progress bar
    # Regroup per-chunk results so duplicates across chunks can be merged.
    maybe_nodes = defaultdict(list)
    maybe_edges = defaultdict(list)
    for m_nodes, m_edges in results:
        for k, v in m_nodes.items():
            maybe_nodes[k].extend(v)
        for k, v in m_edges.items():
            # it's undirected graph
            maybe_edges[tuple(sorted(k))].extend(v)
    all_entities_data = await asyncio.gather(
        *[
            _merge_nodes_then_upsert(k, v, knwoledge_graph_inst, global_config, tokenizer_wrapper)
            for k, v in maybe_nodes.items()
        ]
    )
    await asyncio.gather(
        *[
            _merge_edges_then_upsert(k[0], k[1], v, knwoledge_graph_inst, global_config, tokenizer_wrapper)
            for k, v in maybe_edges.items()
        ]
    )
    if not len(all_entities_data):
        logger.warning("Didn't extract any entities, maybe your LLM is not working")
        return None
    if entity_vdb is not None:
        # Index each entity as name+description so vector search can match either.
        data_for_vdb = {
            compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
                "content": dp["entity_name"] + dp["description"],
                "entity_name": dp["entity_name"],
            }
            for dp in all_entities_data
        }
        await entity_vdb.upsert(data_for_vdb)
    return knwoledge_graph_inst
def _pack_single_community_by_sub_communities(
    community: SingleCommunitySchema,
    max_token_size: int,
    already_reports: dict,
    tokenizer_wrapper: "TokenizerWrapper",
) -> tuple[str, int, set, set]:
    """Describe a community through its already-generated sub-community reports.

    Returns (csv_describe, token_count_of_describe, covered_nodes, covered_edges)
    so the caller can skip nodes/edges the sub-reports already cover.
    """
    all_sub_communities = [
        already_reports[k] for k in community["sub_communities"] if k in already_reports
    ]
    # Most frequently occurring sub-communities first; they survive truncation.
    all_sub_communities = sorted(
        all_sub_communities, key=lambda x: x["occurrence"], reverse=True
    )

    may_trun_all_sub_communities = truncate_list_by_token_size(
        all_sub_communities,
        key=lambda x: x["report_string"],
        max_token_size=max_token_size,
        tokenizer_wrapper=tokenizer_wrapper,
    )
    sub_fields = ["id", "report", "rating", "importance"]
    sub_communities_describe = list_of_list_to_csv(
        [sub_fields]
        + [
            [
                i,
                c["report_string"],
                c["report_json"].get("rating", -1),
                c["occurrence"],
            ]
            for i, c in enumerate(may_trun_all_sub_communities)
        ]
    )
    already_nodes = []
    already_edges = []
    for c in may_trun_all_sub_communities:
        already_nodes.extend(c["nodes"])
        already_edges.extend([tuple(e) for e in c["edges"]])

    return (
        sub_communities_describe,
        len(tokenizer_wrapper.encode(sub_communities_describe)),
        set(already_nodes),
        set(already_edges),
    )


async def _pack_single_community_describe(
    knwoledge_graph_inst: "BaseGraphStorage",
    community: SingleCommunitySchema,
    tokenizer_wrapper: "TokenizerWrapper",
    max_token_size: int = 12000,
    already_reports: dict = None,
    global_config: dict = None,
) -> str:
    """Build the LLM input ("Reports"/"Entities"/"Relationships" CSV blocks)
    describing one community, within `max_token_size` tokens.

    Large communities fall back to their sub-community reports; nodes/edges
    already covered by those reports are filtered out of the CSV tables.
    """
    # Bug fix: the old signature used mutable default arguments ({}), which
    # Python shares across calls; use None sentinels instead.
    if already_reports is None:
        already_reports = {}
    if global_config is None:
        global_config = {}

    # 1. Gather raw node/edge data in deterministic order.
    nodes_in_order = sorted(community["nodes"])
    edges_in_order = sorted(community["edges"], key=lambda x: x[0] + x[1])

    nodes_data = await asyncio.gather(
        *[knwoledge_graph_inst.get_node(n) for n in nodes_in_order]
    )
    edges_data = await asyncio.gather(
        *[knwoledge_graph_inst.get_edge(src, tgt) for src, tgt in edges_in_order]
    )

    # 2. Template and its fixed token overhead.
    final_template = """-----Reports-----
```csv
{reports}
```
-----Entities-----
```csv
{entities}
```
-----Relationships-----
```csv
{relationships}
```"""
    base_template_tokens = len(tokenizer_wrapper.encode(
        final_template.format(reports="", entities="", relationships="")
    ))
    remaining_budget = max_token_size - base_template_tokens

    # 3. Sub-community reports.
    report_describe = ""
    contain_nodes = set()
    contain_edges = set()

    # Heuristic: treat very large communities as "truncated".
    truncated = len(nodes_in_order) > 100 or len(edges_in_order) > 100

    need_to_use_sub_communities = (
        truncated and
        community["sub_communities"] and
        already_reports
    )
    # Robustness fix: tolerate a missing "addon_params" key (the old code
    # raised KeyError when global_config lacked it).
    force_to_use_sub_communities = global_config.get("addon_params", {}).get(
        "force_to_use_sub_communities", False
    )

    if need_to_use_sub_communities or force_to_use_sub_communities:
        logger.debug(f"Community {community['title']} using sub-communities")
        result = _pack_single_community_by_sub_communities(
            community, remaining_budget, already_reports, tokenizer_wrapper
        )
        report_describe, report_size, contain_nodes, contain_edges = result
        remaining_budget = max(0, remaining_budget - report_size)

    # 4. Node/edge rows, skipping what the sub-community reports already cover.
    def format_row(row: list) -> str:
        # CSV-quote a row the same way list_of_list_to_csv would, for token counting.
        return ','.join('"{}"'.format(str(item).replace('"', '""')) for item in row)

    node_fields = ["id", "entity", "type", "description", "degree"]
    edge_fields = ["id", "source", "target", "description", "rank"]

    node_degrees = await knwoledge_graph_inst.node_degrees_batch(nodes_in_order)
    edge_degrees = await knwoledge_graph_inst.edge_degrees_batch(edges_in_order)

    nodes_list_data = [
        [i, name, data.get("entity_type", "UNKNOWN"),
         data.get("description", "UNKNOWN"), node_degrees[i]]
        for i, (name, data) in enumerate(zip(nodes_in_order, nodes_data))
        if name not in contain_nodes  # skip nodes already in sub-reports
    ]

    edges_list_data = [
        [i, edge[0], edge[1], data.get("description", "UNKNOWN"), edge_degrees[i]]
        for i, (edge, data) in enumerate(zip(edges_in_order, edges_data))
        if (edge[0], edge[1]) not in contain_edges  # skip covered edges
    ]

    # Highest degree/rank first so truncation drops the least important rows.
    nodes_list_data.sort(key=lambda x: x[-1], reverse=True)
    edges_list_data.sort(key=lambda x: x[-1], reverse=True)

    # 5. Split the remaining token budget between node and edge tables,
    # proportionally to their row counts.
    header_tokens = len(tokenizer_wrapper.encode(
        list_of_list_to_csv([node_fields]) + "\n" + list_of_list_to_csv([edge_fields])
    ))

    data_budget = max(0, remaining_budget - header_tokens)
    total_items = len(nodes_list_data) + len(edges_list_data)
    node_ratio = len(nodes_list_data) / max(1, total_items)
    edge_ratio = 1 - node_ratio

    nodes_final = truncate_list_by_token_size(
        nodes_list_data, key=format_row,
        max_token_size=int(data_budget * node_ratio),
        tokenizer_wrapper=tokenizer_wrapper
    )
    edges_final = truncate_list_by_token_size(
        edges_list_data, key=format_row,
        max_token_size=int(data_budget * edge_ratio),
        tokenizer_wrapper=tokenizer_wrapper
    )

    # 6. Assemble the final CSV blocks.
    nodes_describe = list_of_list_to_csv([node_fields] + nodes_final)
    edges_describe = list_of_list_to_csv([edge_fields] + edges_final)

    return final_template.format(
        reports=report_describe,
        entities=nodes_describe,
        relationships=edges_describe
    )
动态分配预算 + # 计算表头开销 + header_tokens = len(tokenizer_wrapper.encode( + list_of_list_to_csv([node_fields]) + "\n" + list_of_list_to_csv([edge_fields]) + )) + + + + data_budget = max(0, remaining_budget - header_tokens) + total_items = len(nodes_list_data) + len(edges_list_data) + node_ratio = len(nodes_list_data) / max(1, total_items) + edge_ratio = 1 - node_ratio + + + + + # 执行截断 + nodes_final = truncate_list_by_token_size( + nodes_list_data, key=format_row, + max_token_size=int(data_budget * node_ratio), + tokenizer_wrapper=tokenizer_wrapper + ) + edges_final = truncate_list_by_token_size( + edges_list_data, key=format_row, + max_token_size= int(data_budget * edge_ratio), + tokenizer_wrapper=tokenizer_wrapper + ) + + # 6. 组装最终输出 + nodes_describe = list_of_list_to_csv([node_fields] + nodes_final) + edges_describe = list_of_list_to_csv([edge_fields] + edges_final) + + + + final_output = final_template.format( + reports=report_describe, + entities=nodes_describe, + relationships=edges_describe + ) + + return final_output + + +def _community_report_json_to_str(parsed_output: dict) -> str: + """refer official graphrag: index/graph/extractors/community_reports""" + title = parsed_output.get("title", "Report") + summary = parsed_output.get("summary", "") + findings = parsed_output.get("findings", []) + + def finding_summary(finding: dict): + if isinstance(finding, str): + return finding + return finding.get("summary") + + def finding_explanation(finding: dict): + if isinstance(finding, str): + return "" + return finding.get("explanation") + + report_sections = "\n\n".join( + f"## {finding_summary(f)}\n\n{finding_explanation(f)}" for f in findings + ) + return f"# {title}\n\n{summary}\n\n{report_sections}" + + +async def generate_community_report( + community_report_kv: BaseKVStorage[CommunitySchema], + knwoledge_graph_inst: BaseGraphStorage, + tokenizer_wrapper: TokenizerWrapper, + global_config: dict, +): + llm_extra_kwargs = 
global_config["special_community_report_llm_kwargs"] + use_llm_func: callable = global_config["best_model_func"] + use_string_json_convert_func: callable = global_config["convert_response_to_json_func"] + + communities_schema = await knwoledge_graph_inst.community_schema() + community_keys, community_values = list(communities_schema.keys()), list(communities_schema.values()) + already_processed = 0 + + prompt_template = PROMPTS["community_report"] + + prompt_overhead = len(tokenizer_wrapper.encode(prompt_template.format(input_text=""))) + + async def _form_single_community_report( + community: SingleCommunitySchema, already_reports: dict[str, CommunitySchema] + ): + nonlocal already_processed + describe = await _pack_single_community_describe( + knwoledge_graph_inst, + community, + tokenizer_wrapper=tokenizer_wrapper, + max_token_size=global_config["best_model_max_token_size"] - prompt_overhead -200, # extra token for chat template and prompt template + already_reports=already_reports, + global_config=global_config, + ) + prompt = prompt_template.format(input_text=describe) + + + response = await use_llm_func(prompt, **llm_extra_kwargs) + data = use_string_json_convert_func(response) + already_processed += 1 + now_ticks = PROMPTS["process_tickers"][already_processed % len(PROMPTS["process_tickers"])] + print(f"{now_ticks} Processed {already_processed} communities\r", end="", flush=True) + return data + + levels = sorted(set([c["level"] for c in community_values]), reverse=True) + logger.info(f"Generating by levels: {levels}") + community_datas = {} + for level in levels: + this_level_community_keys, this_level_community_values = zip( + *[ + (k, v) + for k, v in zip(community_keys, community_values) + if v["level"] == level + ] + ) + this_level_communities_reports = await asyncio.gather( + *[ + _form_single_community_report(c, community_datas) + for c in this_level_community_values + ] + ) + community_datas.update( + { + k: { + "report_string": 
async def _find_most_related_community_from_entities(
    node_datas: list[dict],
    query_param: QueryParam,
    community_reports: BaseKVStorage[CommunitySchema],
    tokenizer_wrapper,
):
    """Collect the community reports most related to the given entities.

    Communities are ranked by (how many entities point at them, report rating)
    and truncated to the query's community-report token budget.
    """
    related_communities = []
    for node_d in node_datas:
        if "clusters" not in node_d:
            continue
        related_communities.extend(json.loads(node_d["clusters"]))
    related_community_dup_keys = [
        str(dp["cluster"])
        for dp in related_communities
        if dp["level"] <= query_param.level
    ]
    # Duplicate count == how many query entities belong to that community.
    related_community_keys_counts = dict(Counter(related_community_dup_keys))
    _related_community_datas = await asyncio.gather(
        *[community_reports.get_by_id(k) for k in related_community_keys_counts.keys()]
    )
    related_community_datas = {
        k: v
        for k, v in zip(related_community_keys_counts.keys(), _related_community_datas)
        if v is not None
    }
    related_community_keys = sorted(
        related_community_keys_counts.keys(),
        key=lambda k: (
            related_community_keys_counts[k],
            related_community_datas[k]["report_json"].get("rating", -1),
        ),
        reverse=True,
    )
    sorted_community_datas = [
        related_community_datas[k] for k in related_community_keys
    ]

    use_community_reports = truncate_list_by_token_size(
        sorted_community_datas,
        key=lambda x: x["report_string"],
        max_token_size=query_param.local_max_token_for_community_report,
        tokenizer_wrapper=tokenizer_wrapper,
    )
    if query_param.local_community_single_one:
        use_community_reports = use_community_reports[:1]
    return use_community_reports


async def _find_most_related_text_unit_from_entities(
    node_datas: list[dict],
    query_param: QueryParam,
    text_chunks_db: BaseKVStorage[TextChunkSchema],
    knowledge_graph_inst: BaseGraphStorage,
    tokenizer_wrapper,
):
    """Pick the text chunks most related to the given entities.

    Chunks are ordered first by which entity referenced them, then by how many
    one-hop neighbor entities also reference them (relation_counts), and
    truncated to the query's text-unit token budget.
    """
    text_units = [
        split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
        for dp in node_datas
    ]
    edges = await knowledge_graph_inst.get_nodes_edges_batch([dp["entity_name"] for dp in node_datas])
    all_one_hop_nodes = set()
    for this_edges in edges:
        if not this_edges:
            continue
        all_one_hop_nodes.update([e[1] for e in this_edges])
    all_one_hop_nodes = list(all_one_hop_nodes)
    all_one_hop_nodes_data = await knowledge_graph_inst.get_nodes_batch(all_one_hop_nodes)
    all_one_hop_text_units_lookup = {
        k: set(split_string_by_multi_markers(v["source_id"], [GRAPH_FIELD_SEP]))
        for k, v in zip(all_one_hop_nodes, all_one_hop_nodes_data)
        if v is not None
    }
    all_text_units_lookup = {}
    for index, (this_text_units, this_edges) in enumerate(zip(text_units, edges)):
        for c_id in this_text_units:
            if c_id in all_text_units_lookup:
                continue
            relation_counts = 0
            for e in this_edges:
                if (
                    e[1] in all_one_hop_text_units_lookup
                    and c_id in all_one_hop_text_units_lookup[e[1]]
                ):
                    relation_counts += 1
            all_text_units_lookup[c_id] = {
                "data": await text_chunks_db.get_by_id(c_id),
                "order": index,
                "relation_counts": relation_counts,
            }
    # Bug fix: lookup values are always dicts, so the old `v is None` check
    # never fired and missing chunks ({"data": None}) slipped through and
    # crashed later on x["data"]["content"].  Check the fetched chunk instead.
    if any([v["data"] is None for v in all_text_units_lookup.values()]):
        logger.warning("Text chunks are missing, maybe the storage is damaged")
    all_text_units = [
        {"id": k, **v}
        for k, v in all_text_units_lookup.items()
        if v["data"] is not None
    ]
    all_text_units = sorted(
        all_text_units, key=lambda x: (x["order"], -x["relation_counts"])
    )
    all_text_units = truncate_list_by_token_size(
        all_text_units,
        key=lambda x: x["data"]["content"],
        max_token_size=query_param.local_max_token_for_text_unit,
        tokenizer_wrapper=tokenizer_wrapper,
    )
    all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units]
    return all_text_units
BaseGraphStorage, + tokenizer_wrapper, +): + all_related_edges = await knowledge_graph_inst.get_nodes_edges_batch([dp["entity_name"] for dp in node_datas]) + + all_edges = [] + seen = set() + + for this_edges in all_related_edges: + for e in this_edges: + sorted_edge = tuple(sorted(e)) + if sorted_edge not in seen: + seen.add(sorted_edge) + all_edges.append(sorted_edge) + + all_edges_pack = await knowledge_graph_inst.get_edges_batch(all_edges) + all_edges_degree = await knowledge_graph_inst.edge_degrees_batch(all_edges) + all_edges_data = [ + {"src_tgt": k, "rank": d, **v} + for k, v, d in zip(all_edges, all_edges_pack, all_edges_degree) + if v is not None + ] + all_edges_data = sorted( + all_edges_data, key=lambda x: (x["rank"], x["weight"]), reverse=True + ) + all_edges_data = truncate_list_by_token_size( + all_edges_data, + key=lambda x: x["description"], + max_token_size=query_param.local_max_token_for_local_context, + tokenizer_wrapper=tokenizer_wrapper, + ) + return all_edges_data + + +async def _build_local_query_context( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + community_reports: BaseKVStorage[CommunitySchema], + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + tokenizer_wrapper, +): + results = await entities_vdb.query(query, top_k=query_param.top_k) + if not len(results): + return None + node_datas = await knowledge_graph_inst.get_nodes_batch([r["entity_name"] for r in results]) + if not all([n is not None for n in node_datas]): + logger.warning("Some nodes are missing, maybe the storage is damaged") + node_degrees = await knowledge_graph_inst.node_degrees_batch([r["entity_name"] for r in results]) + node_datas = [ + {**n, "entity_name": k["entity_name"], "rank": d} + for k, n, d in zip(results, node_datas, node_degrees) + if n is not None + ] + use_communities = await _find_most_related_community_from_entities( + node_datas, query_param, community_reports, tokenizer_wrapper + ) + 
use_text_units = await _find_most_related_text_unit_from_entities( + node_datas, query_param, text_chunks_db, knowledge_graph_inst, tokenizer_wrapper + ) + use_relations = await _find_most_related_edges_from_entities( + node_datas, query_param, knowledge_graph_inst, tokenizer_wrapper + ) + logger.info( + f"Using {len(node_datas)} entites, {len(use_communities)} communities, {len(use_relations)} relations, {len(use_text_units)} text units" + ) + entites_section_list = [["id", "entity", "type", "description", "rank"]] + for i, n in enumerate(node_datas): + entites_section_list.append( + [ + i, + n["entity_name"], + n.get("entity_type", "UNKNOWN"), + n.get("description", "UNKNOWN"), + n["rank"], + ] + ) + entities_context = list_of_list_to_csv(entites_section_list) + + relations_section_list = [ + ["id", "source", "target", "description", "weight", "rank"] + ] + for i, e in enumerate(use_relations): + relations_section_list.append( + [ + i, + e["src_tgt"][0], + e["src_tgt"][1], + e["description"], + e["weight"], + e["rank"], + ] + ) + relations_context = list_of_list_to_csv(relations_section_list) + + communities_section_list = [["id", "content"]] + for i, c in enumerate(use_communities): + communities_section_list.append([i, c["report_string"]]) + communities_context = list_of_list_to_csv(communities_section_list) + + text_units_section_list = [["id", "content"]] + for i, t in enumerate(use_text_units): + text_units_section_list.append([i, t["content"]]) + text_units_context = list_of_list_to_csv(text_units_section_list) + return f""" +-----Reports----- +```csv +{communities_context} +``` +-----Entities----- +```csv +{entities_context} +``` +-----Relationships----- +```csv +{relations_context} +``` +-----Sources----- +```csv +{text_units_context} +``` +""" + + +async def local_query( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + community_reports: BaseKVStorage[CommunitySchema], + text_chunks_db: 
BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + tokenizer_wrapper, + global_config: dict, +) -> str: + use_model_func = global_config["best_model_func"] + context = await _build_local_query_context( + query, + knowledge_graph_inst, + entities_vdb, + community_reports, + text_chunks_db, + query_param, + tokenizer_wrapper, + ) + if query_param.only_need_context: + return context + if context is None: + return PROMPTS["fail_response"] + sys_prompt_temp = PROMPTS["local_rag_response"] + sys_prompt = sys_prompt_temp.format( + context_data=context, response_type=query_param.response_type + ) + response = await use_model_func( + query, + system_prompt=sys_prompt, + ) + return response + + +async def _map_global_communities( + query: str, + communities_data: list[CommunitySchema], + query_param: QueryParam, + global_config: dict, + tokenizer_wrapper, +): + use_string_json_convert_func = global_config["convert_response_to_json_func"] + use_model_func = global_config["best_model_func"] + community_groups = [] + while len(communities_data): + this_group = truncate_list_by_token_size( + communities_data, + key=lambda x: x["report_string"], + max_token_size=query_param.global_max_token_for_community_report, + tokenizer_wrapper=tokenizer_wrapper, # 传入 wrapper + ) + community_groups.append(this_group) + communities_data = communities_data[len(this_group) :] + + async def _process(community_truncated_datas: list[CommunitySchema]) -> dict: + communities_section_list = [["id", "content", "rating", "importance"]] + for i, c in enumerate(community_truncated_datas): + communities_section_list.append( + [ + i, + c["report_string"], + c["report_json"].get("rating", 0), + c["occurrence"], + ] + ) + community_context = list_of_list_to_csv(communities_section_list) + sys_prompt_temp = PROMPTS["global_map_rag_points"] + sys_prompt = sys_prompt_temp.format(context_data=community_context) + response = await use_model_func( + query, + system_prompt=sys_prompt, + 
**query_param.global_special_community_map_llm_kwargs, + ) + data = use_string_json_convert_func(response) + return data.get("points", []) + + logger.info(f"Grouping to {len(community_groups)} groups for global search") + responses = await asyncio.gather(*[_process(c) for c in community_groups]) + return responses + + +async def global_query( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + community_reports: BaseKVStorage[CommunitySchema], + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + tokenizer_wrapper, + global_config: dict, +) -> str: + community_schema = await knowledge_graph_inst.community_schema() + community_schema = { + k: v for k, v in community_schema.items() if v["level"] <= query_param.level + } + if not len(community_schema): + return PROMPTS["fail_response"] + use_model_func = global_config["best_model_func"] + + sorted_community_schemas = sorted( + community_schema.items(), + key=lambda x: x[1]["occurrence"], + reverse=True, + ) + sorted_community_schemas = sorted_community_schemas[ + : query_param.global_max_consider_community + ] + community_datas = await community_reports.get_by_ids( + [k[0] for k in sorted_community_schemas] + ) + community_datas = [c for c in community_datas if c is not None] + community_datas = [ + c + for c in community_datas + if c["report_json"].get("rating", 0) >= query_param.global_min_community_rating + ] + community_datas = sorted( + community_datas, + key=lambda x: (x["occurrence"], x["report_json"].get("rating", 0)), + reverse=True, + ) + logger.info(f"Revtrieved {len(community_datas)} communities") + + map_communities_points = await _map_global_communities( + query, community_datas, query_param, global_config, tokenizer_wrapper + ) + final_support_points = [] + for i, mc in enumerate(map_communities_points): + for point in mc: + if "description" not in point: + continue + final_support_points.append( + { + "analyst": i, + "answer": 
point["description"], + "score": point.get("score", 1), + } + ) + final_support_points = [p for p in final_support_points if p["score"] > 0] + if not len(final_support_points): + return PROMPTS["fail_response"] + final_support_points = sorted( + final_support_points, key=lambda x: x["score"], reverse=True + ) + final_support_points = truncate_list_by_token_size( + final_support_points, + key=lambda x: x["answer"], + max_token_size=query_param.global_max_token_for_community_report, + tokenizer_wrapper=tokenizer_wrapper, # 传入 wrapper + ) + points_context = [] + for dp in final_support_points: + points_context.append( + f"""----Analyst {dp['analyst']}---- +Importance Score: {dp['score']} +{dp['answer']} +""" + ) + points_context = "\n".join(points_context) + if query_param.only_need_context: + return points_context + sys_prompt_temp = PROMPTS["global_reduce_rag_response"] + response = await use_model_func( + query, + sys_prompt_temp.format( + report_data=points_context, response_type=query_param.response_type + ), + ) + return response + + +async def naive_query( + query, + chunks_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + tokenizer_wrapper, + global_config: dict, +): + use_model_func = global_config["best_model_func"] + results = await chunks_vdb.query(query, top_k=query_param.top_k) + if not len(results): + return PROMPTS["fail_response"] + chunks_ids = [r["id"] for r in results] + chunks = await text_chunks_db.get_by_ids(chunks_ids) + + maybe_trun_chunks = truncate_list_by_token_size( + chunks, + key=lambda x: x["content"], + max_token_size=query_param.naive_max_token_for_text_unit, + tokenizer_wrapper=tokenizer_wrapper, # 传入 wrapper + ) + logger.info(f"Truncate {len(chunks)} to {len(maybe_trun_chunks)} chunks") + section = "--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks]) + if query_param.only_need_context: + return section + sys_prompt_temp = PROMPTS["naive_rag_response"] + sys_prompt = 
sys_prompt_temp.format( + content_data=section, response_type=query_param.response_type + ) + response = await use_model_func( + query, + system_prompt=sys_prompt, + ) + return response diff --git a/nano-graphrag/nano_graphrag/_splitter.py b/nano-graphrag/nano_graphrag/_splitter.py new file mode 100644 index 0000000000000000000000000000000000000000..1054d17fd64ae6f86e62b4b9c05964342e10fe33 --- /dev/null +++ b/nano-graphrag/nano_graphrag/_splitter.py @@ -0,0 +1,94 @@ +from typing import List, Optional, Union, Literal + +class SeparatorSplitter: + def __init__( + self, + separators: Optional[List[List[int]]] = None, + keep_separator: Union[bool, Literal["start", "end"]] = "end", + chunk_size: int = 4000, + chunk_overlap: int = 200, + length_function: callable = len, + ): + self._separators = separators or [] + self._keep_separator = keep_separator + self._chunk_size = chunk_size + self._chunk_overlap = chunk_overlap + self._length_function = length_function + + def split_tokens(self, tokens: List[int]) -> List[List[int]]: + splits = self._split_tokens_with_separators(tokens) + return self._merge_splits(splits) + + def _split_tokens_with_separators(self, tokens: List[int]) -> List[List[int]]: + splits = [] + current_split = [] + i = 0 + while i < len(tokens): + separator_found = False + for separator in self._separators: + if tokens[i:i+len(separator)] == separator: + if self._keep_separator in [True, "end"]: + current_split.extend(separator) + if current_split: + splits.append(current_split) + current_split = [] + if self._keep_separator == "start": + current_split.extend(separator) + i += len(separator) + separator_found = True + break + if not separator_found: + current_split.append(tokens[i]) + i += 1 + if current_split: + splits.append(current_split) + return [s for s in splits if s] + + def _merge_splits(self, splits: List[List[int]]) -> List[List[int]]: + if not splits: + return [] + + merged_splits = [] + current_chunk = [] + + for split in splits: + if not 
current_chunk: + current_chunk = split + elif self._length_function(current_chunk) + self._length_function(split) <= self._chunk_size: + current_chunk.extend(split) + else: + merged_splits.append(current_chunk) + current_chunk = split + + if current_chunk: + merged_splits.append(current_chunk) + + if len(merged_splits) == 1 and self._length_function(merged_splits[0]) > self._chunk_size: + return self._split_chunk(merged_splits[0]) + + if self._chunk_overlap > 0: + return self._enforce_overlap(merged_splits) + + return merged_splits + + def _split_chunk(self, chunk: List[int]) -> List[List[int]]: + result = [] + for i in range(0, len(chunk), self._chunk_size - self._chunk_overlap): + new_chunk = chunk[i:i + self._chunk_size] + if len(new_chunk) > self._chunk_overlap: # 只有当 chunk 长度大于 overlap 时才添加 + result.append(new_chunk) + return result + + def _enforce_overlap(self, chunks: List[List[int]]) -> List[List[int]]: + result = [] + for i, chunk in enumerate(chunks): + if i == 0: + result.append(chunk) + else: + overlap = chunks[i-1][-self._chunk_overlap:] + new_chunk = overlap + chunk + if self._length_function(new_chunk) > self._chunk_size: + new_chunk = new_chunk[:self._chunk_size] + result.append(new_chunk) + return result + diff --git a/nano-graphrag/nano_graphrag/_storage/__init__.py b/nano-graphrag/nano_graphrag/_storage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c8184ab2729d0a54d1b47be6a5e02301508b0e22 --- /dev/null +++ b/nano-graphrag/nano_graphrag/_storage/__init__.py @@ -0,0 +1,5 @@ +from .gdb_networkx import NetworkXStorage +from .gdb_neo4j import Neo4jStorage +from .vdb_hnswlib import HNSWVectorStorage +from .vdb_nanovectordb import NanoVectorDBStorage +from .kv_json import JsonKVStorage diff --git a/nano-graphrag/nano_graphrag/_storage/gdb_neo4j.py b/nano-graphrag/nano_graphrag/_storage/gdb_neo4j.py new file mode 100644 index 0000000000000000000000000000000000000000..54739ae96d310907da9a90bb6f3103dc2bff8b8b --- 
/dev/null +++ b/nano-graphrag/nano_graphrag/_storage/gdb_neo4j.py @@ -0,0 +1,529 @@ +import json +import asyncio +from collections import defaultdict +from typing import List +from neo4j import AsyncGraphDatabase +from dataclasses import dataclass +from typing import Union +from ..base import BaseGraphStorage, SingleCommunitySchema +from .._utils import logger +from ..prompt import GRAPH_FIELD_SEP + +neo4j_lock = asyncio.Lock() + + +def make_path_idable(path): + return path.replace(".", "_").replace("/", "__").replace("-", "_").replace(":", "_").replace("\\", "__") + + +@dataclass +class Neo4jStorage(BaseGraphStorage): + def __post_init__(self): + self.neo4j_url = self.global_config["addon_params"].get("neo4j_url", None) + self.neo4j_auth = self.global_config["addon_params"].get("neo4j_auth", None) + self.namespace = ( + f"{make_path_idable(self.global_config['working_dir'])}__{self.namespace}" + ) + logger.info(f"Using the label {self.namespace} for Neo4j as identifier") + if self.neo4j_url is None or self.neo4j_auth is None: + raise ValueError("Missing neo4j_url or neo4j_auth in addon_params") + self.async_driver = AsyncGraphDatabase.driver( + self.neo4j_url, auth=self.neo4j_auth, max_connection_pool_size=50, + ) + + # async def create_database(self): + # async with self.async_driver.session() as session: + # try: + # constraints = await session.run("SHOW CONSTRAINTS") + # # TODO I don't know why CREATE CONSTRAINT IF NOT EXISTS still trigger error + # # so have to check if the constrain exists + # constrain_exists = False + + # async for record in constraints: + # if ( + # self.namespace in record["labelsOrTypes"] + # and "id" in record["properties"] + # and record["type"] == "UNIQUENESS" + # ): + # constrain_exists = True + # break + # if not constrain_exists: + # await session.run( + # f"CREATE CONSTRAINT FOR (n:{self.namespace}) REQUIRE n.id IS UNIQUE" + # ) + # logger.info(f"Add constraint for namespace: {self.namespace}") + + # except Exception as e: + # 
logger.error(f"Error accessing or setting up the database: {str(e)}") + # raise + + async def _init_workspace(self): + await self.async_driver.verify_authentication() + await self.async_driver.verify_connectivity() + # TODOLater: create database if not exists always cause an error when async + # await self.create_database() + + async def index_start_callback(self): + logger.info("Init Neo4j workspace") + await self._init_workspace() + + # create index for faster searching + try: + async with self.async_driver.session() as session: + await session.run( + f"CREATE INDEX IF NOT EXISTS FOR (n:`{self.namespace}`) ON (n.id)" + ) + + await session.run( + f"CREATE INDEX IF NOT EXISTS FOR (n:`{self.namespace}`) ON (n.entity_type)" + ) + + await session.run( + f"CREATE INDEX IF NOT EXISTS FOR (n:`{self.namespace}`) ON (n.communityIds)" + ) + + await session.run( + f"CREATE INDEX IF NOT EXISTS FOR (n:`{self.namespace}`) ON (n.source_id)" + ) + logger.info("Neo4j indexes created successfully") + except Exception as e: + logger.error(f"Failed to create indexes: {e}") + raise e + + async def has_node(self, node_id: str) -> bool: + async with self.async_driver.session() as session: + result = await session.run( + f"MATCH (n:`{self.namespace}`) WHERE n.id = $node_id RETURN COUNT(n) > 0 AS exists", + node_id=node_id, + ) + record = await result.single() + return record["exists"] if record else False + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + async with self.async_driver.session() as session: + result = await session.run( + f""" + MATCH (s:`{self.namespace}`) + WHERE s.id = $source_id + MATCH (t:`{self.namespace}`) + WHERE t.id = $target_id + RETURN EXISTS((s)-[]->(t)) AS exists + """, + source_id=source_node_id, + target_id=target_node_id, + ) + + record = await result.single() + return record["exists"] if record else False + + async def node_degree(self, node_id: str) -> int: + results = await self.node_degrees_batch([node_id]) + return 
results[0] if results else 0 + + async def node_degrees_batch(self, node_ids: List[str]) -> List[str]: + if not node_ids: + return {} + + result_dict = {node_id: 0 for node_id in node_ids} + async with self.async_driver.session() as session: + result = await session.run( + f""" + UNWIND $node_ids AS node_id + MATCH (n:`{self.namespace}`) + WHERE n.id = node_id + OPTIONAL MATCH (n)-[]-(m:`{self.namespace}`) + RETURN node_id, COUNT(m) AS degree + """, + node_ids=node_ids + ) + + async for record in result: + result_dict[record["node_id"]] = record["degree"] + + return [result_dict[node_id] for node_id in node_ids] + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + results = await self.edge_degrees_batch([(src_id, tgt_id)]) + return results[0] if results else 0 + + async def edge_degrees_batch(self, edge_pairs: list[tuple[str, str]]) -> list[int]: + if not edge_pairs: + return [] + + result_dict = {tuple(edge_pair): 0 for edge_pair in edge_pairs} + + edges_params = [{"src_id": src, "tgt_id": tgt} for src, tgt in edge_pairs] + + try: + async with self.async_driver.session() as session: + result = await session.run( + f""" + UNWIND $edges AS edge + + MATCH (s:`{self.namespace}`) + WHERE s.id = edge.src_id + WITH edge, s + OPTIONAL MATCH (s)-[]-(n1:`{self.namespace}`) + WITH edge, COUNT(n1) AS src_degree + + MATCH (t:`{self.namespace}`) + WHERE t.id = edge.tgt_id + WITH edge, src_degree, t + OPTIONAL MATCH (t)-[]-(n2:`{self.namespace}`) + WITH edge.src_id AS src_id, edge.tgt_id AS tgt_id, src_degree, COUNT(n2) AS tgt_degree + + RETURN src_id, tgt_id, src_degree + tgt_degree AS degree + """, + edges=edges_params + ) + + async for record in result: + src_id = record["src_id"] + tgt_id = record["tgt_id"] + degree = record["degree"] + + # 更新结果字典 + edge_pair = (src_id, tgt_id) + result_dict[edge_pair] = degree + + return [result_dict[tuple(edge_pair)] for edge_pair in edge_pairs] + except Exception as e: + logger.error(f"Error in batch edge degree 
calculation: {e}") + return [0] * len(edge_pairs) + + + + async def get_node(self, node_id: str) -> Union[dict, None]: + result = await self.get_nodes_batch([node_id]) + return result[0] if result else None + + async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, Union[dict, None]]: + if not node_ids: + return {} + + result_dict = {node_id: None for node_id in node_ids} + + try: + async with self.async_driver.session() as session: + result = await session.run( + f""" + UNWIND $node_ids AS node_id + MATCH (n:`{self.namespace}`) + WHERE n.id = node_id + RETURN node_id, properties(n) AS node_data + """, + node_ids=node_ids + ) + + async for record in result: + node_id = record["node_id"] + raw_node_data = record["node_data"] + + if raw_node_data: + raw_node_data["clusters"] = json.dumps( + [ + { + "level": index, + "cluster": cluster_id, + } + for index, cluster_id in enumerate( + raw_node_data.get("communityIds", []) + ) + ] + ) + result_dict[node_id] = raw_node_data + return [result_dict[node_id] for node_id in node_ids] + except Exception as e: + logger.error(f"Error in batch node retrieval: {e}") + raise e + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + results = await self.get_edges_batch([(source_node_id, target_node_id)]) + return results[0] if results else None + + async def get_edges_batch( + self, edge_pairs: list[tuple[str, str]] + ) -> list[Union[dict, None]]: + if not edge_pairs: + return [] + + result_dict = {tuple(edge_pair): None for edge_pair in edge_pairs} + + edges_params = [{"source_id": src, "target_id": tgt} for src, tgt in edge_pairs] + + try: + async with self.async_driver.session() as session: + result = await session.run( + f""" + UNWIND $edges AS edge + MATCH (s:`{self.namespace}`)-[r]->(t:`{self.namespace}`) + WHERE s.id = edge.source_id AND t.id = edge.target_id + RETURN edge.source_id AS source_id, edge.target_id AS target_id, properties(r) AS edge_data + """, + 
edges=edges_params + ) + + async for record in result: + source_id = record["source_id"] + target_id = record["target_id"] + edge_data = record["edge_data"] + + edge_pair = (source_id, target_id) + result_dict[edge_pair] = edge_data + + return [result_dict[tuple(edge_pair)] for edge_pair in edge_pairs] + except Exception as e: + logger.error(f"Error in batch edge retrieval: {e}") + return [None] * len(edge_pairs) + + async def get_node_edges( + self, source_node_id: str + ) -> list[tuple[str, str]]: + results = await self.get_nodes_edges_batch([source_node_id]) + return results[0] if results else [] + + async def get_nodes_edges_batch( + self, node_ids: list[str] + ) -> list[list[tuple[str, str]]]: + if not node_ids: + return [] + + result_dict = {node_id: [] for node_id in node_ids} + + try: + async with self.async_driver.session() as session: + result = await session.run( + f""" + UNWIND $node_ids AS node_id + MATCH (s:`{self.namespace}`)-[r]->(t:`{self.namespace}`) + WHERE s.id = node_id + RETURN s.id AS source_id, t.id AS target_id + """, + node_ids=node_ids + ) + + async for record in result: + source_id = record["source_id"] + target_id = record["target_id"] + + if source_id in result_dict: + result_dict[source_id].append((source_id, target_id)) + + return [result_dict[node_id] for node_id in node_ids] + except Exception as e: + logger.error(f"Error in batch node edges retrieval: {e}") + return [[] for _ in node_ids] + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + await self.upsert_nodes_batch([(node_id, node_data)]) + + async def upsert_nodes_batch(self, nodes_data: list[tuple[str, dict[str, str]]]): + if not nodes_data: + return [] + + nodes_by_type = {} + for node_id, node_data in nodes_data: + node_type = node_data.get("entity_type", "UNKNOWN").strip('"') + if node_type not in nodes_by_type: + nodes_by_type[node_type] = [] + nodes_by_type[node_type].append((node_id, node_data)) + + async with self.async_driver.session() as 
session: + for node_type, type_nodes in nodes_by_type.items(): + params = [{"id": node_id, "data": node_data} for node_id, node_data in type_nodes] + + await session.run( + f""" + UNWIND $nodes AS node + MERGE (n:`{self.namespace}`:`{node_type}` {{id: node.id}}) + SET n += node.data + """, + nodes=params + ) + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + await self.upsert_edges_batch([(source_node_id, target_node_id, edge_data)]) + + + async def upsert_edges_batch( + self, edges_data: list[tuple[str, str, dict[str, str]]] + ): + if not edges_data: + return + + edges_params = [] + for source_id, target_id, edge_data in edges_data: + edge_data_copy = edge_data.copy() + edge_data_copy.setdefault("weight", 0.0) + + edges_params.append({ + "source_id": source_id, + "target_id": target_id, + "edge_data": edge_data_copy + }) + + async with self.async_driver.session() as session: + await session.run( + f""" + UNWIND $edges AS edge + MATCH (s:`{self.namespace}`) + WHERE s.id = edge.source_id + WITH edge, s + MATCH (t:`{self.namespace}`) + WHERE t.id = edge.target_id + MERGE (s)-[r:RELATED]->(t) + SET r += edge.edge_data + """, + edges=edges_params + ) + + + + + async def clustering(self, algorithm: str): + if algorithm != "leiden": + raise ValueError( + f"Clustering algorithm {algorithm} not supported in Neo4j implementation" + ) + + random_seed = self.global_config["graph_cluster_seed"] + max_level = self.global_config["max_graph_cluster_size"] + async with self.async_driver.session() as session: + try: + # Project the graph with undirected relationships + await session.run( + f""" + CALL gds.graph.project( + 'graph_{self.namespace}', + ['{self.namespace}'], + {{ + RELATED: {{ + orientation: 'UNDIRECTED', + properties: ['weight'] + }} + }} + ) + """ + ) + + # Run Leiden algorithm + result = await session.run( + f""" + CALL gds.leiden.write( + 'graph_{self.namespace}', + {{ + writeProperty: 'communityIds', + 
includeIntermediateCommunities: True, + relationshipWeightProperty: "weight", + maxLevels: {max_level}, + tolerance: 0.0001, + gamma: 1.0, + theta: 0.01, + randomSeed: {random_seed} + }} + ) + YIELD communityCount, modularities; + """ + ) + result = await result.single() + community_count: int = result["communityCount"] + modularities = result["modularities"] + logger.info( + f"Performed graph clustering with {community_count} communities and modularities {modularities}" + ) + finally: + # Drop the projected graph + await session.run(f"CALL gds.graph.drop('graph_{self.namespace}')") + + async def community_schema(self) -> dict[str, SingleCommunitySchema]: + results = defaultdict( + lambda: dict( + level=None, + title=None, + edges=set(), + nodes=set(), + chunk_ids=set(), + occurrence=0.0, + sub_communities=[], + ) + ) + + async with self.async_driver.session() as session: + # Fetch community data + result = await session.run( + f""" + MATCH (n:`{self.namespace}`) + WITH n, n.communityIds AS communityIds, [(n)-[]-(m:`{self.namespace}`) | m.id] AS connected_nodes + RETURN n.id AS node_id, n.source_id AS source_id, + communityIds AS cluster_key, + connected_nodes + """ + ) + + # records = await result.fetch() + + max_num_ids = 0 + async for record in result: + for index, c_id in enumerate(record["cluster_key"]): + node_id = str(record["node_id"]) + source_id = record["source_id"] + level = index + cluster_key = str(c_id) + connected_nodes = record["connected_nodes"] + + results[cluster_key]["level"] = level + results[cluster_key]["title"] = f"Cluster {cluster_key}" + results[cluster_key]["nodes"].add(node_id) + results[cluster_key]["edges"].update( + [ + tuple(sorted([node_id, str(connected)])) + for connected in connected_nodes + if connected != node_id + ] + ) + chunk_ids = source_id.split(GRAPH_FIELD_SEP) + results[cluster_key]["chunk_ids"].update(chunk_ids) + max_num_ids = max( + max_num_ids, len(results[cluster_key]["chunk_ids"]) + ) + + # Process results + for 
k, v in results.items(): + v["edges"] = [list(e) for e in v["edges"]] + v["nodes"] = list(v["nodes"]) + v["chunk_ids"] = list(v["chunk_ids"]) + v["occurrence"] = len(v["chunk_ids"]) / max_num_ids + + # Compute sub-communities (this is a simplified approach) + for cluster in results.values(): + cluster["sub_communities"] = [ + sub_key + for sub_key, sub_cluster in results.items() + if sub_cluster["level"] > cluster["level"] + and set(sub_cluster["nodes"]).issubset(set(cluster["nodes"])) + ] + + return dict(results) + + async def index_done_callback(self): + await self.async_driver.close() + + async def _debug_delete_all_node_edges(self): + async with self.async_driver.session() as session: + try: + # Delete all relationships in the namespace + await session.run(f"MATCH (n:`{self.namespace}`)-[r]-() DELETE r") + + # Delete all nodes in the namespace + await session.run(f"MATCH (n:`{self.namespace}`) DELETE n") + + logger.info( + f"All nodes and edges in namespace '{self.namespace}' have been deleted." 
+ ) + except Exception as e: + logger.error(f"Error deleting nodes and edges: {str(e)}") + raise diff --git a/nano-graphrag/nano_graphrag/_storage/gdb_networkx.py b/nano-graphrag/nano_graphrag/_storage/gdb_networkx.py new file mode 100644 index 0000000000000000000000000000000000000000..8be8be8150c79992db8ca01c29db4a4d27004049 --- /dev/null +++ b/nano-graphrag/nano_graphrag/_storage/gdb_networkx.py @@ -0,0 +1,268 @@ +import html +import json +import os +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Union, cast, List +import networkx as nx +import numpy as np +import asyncio + +from .._utils import logger +from ..base import ( + BaseGraphStorage, + SingleCommunitySchema, +) +from ..prompt import GRAPH_FIELD_SEP + + +@dataclass +class NetworkXStorage(BaseGraphStorage): + @staticmethod + def load_nx_graph(file_name) -> nx.Graph: + if os.path.exists(file_name): + return nx.read_graphml(file_name) + return None + + @staticmethod + def write_nx_graph(graph: nx.Graph, file_name): + logger.info( + f"Writing graph with {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges" + ) + nx.write_graphml(graph, file_name) + + @staticmethod + def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph: + """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py + Return the largest connected component of the graph, with nodes and edges sorted in a stable way. 
+ """ + from graspologic.utils import largest_connected_component + + graph = graph.copy() + graph = cast(nx.Graph, largest_connected_component(graph)) + node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()} # type: ignore + graph = nx.relabel_nodes(graph, node_mapping) + return NetworkXStorage._stabilize_graph(graph) + + @staticmethod + def _stabilize_graph(graph: nx.Graph) -> nx.Graph: + """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py + Ensure an undirected graph with the same relationships will always be read the same way. + """ + fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph() + + sorted_nodes = graph.nodes(data=True) + sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0]) + + fixed_graph.add_nodes_from(sorted_nodes) + edges = list(graph.edges(data=True)) + + if not graph.is_directed(): + + def _sort_source_target(edge): + source, target, edge_data = edge + if source > target: + temp = source + source = target + target = temp + return source, target, edge_data + + edges = [_sort_source_target(edge) for edge in edges] + + def _get_edge_key(source: Any, target: Any) -> str: + return f"{source} -> {target}" + + edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1])) + + fixed_graph.add_edges_from(edges) + return fixed_graph + + def __post_init__(self): + self._graphml_xml_file = os.path.join( + self.global_config["working_dir"], f"graph_{self.namespace}.graphml" + ) + preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) + if preloaded_graph is not None: + logger.info( + f"Loaded graph from {self._graphml_xml_file} with {preloaded_graph.number_of_nodes()} nodes, {preloaded_graph.number_of_edges()} edges" + ) + self._graph = preloaded_graph or nx.Graph() + self._clustering_algorithms = { + "leiden": self._leiden_clustering, + } + self._node_embed_algorithms = { + "node2vec": self._node2vec_embed, + } + + async def index_done_callback(self): + 
NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file) + + async def has_node(self, node_id: str) -> bool: + return self._graph.has_node(node_id) + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + return self._graph.has_edge(source_node_id, target_node_id) + + async def get_node(self, node_id: str) -> Union[dict, None]: + return self._graph.nodes.get(node_id) + + async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, Union[dict, None]]: + return await asyncio.gather(*[self.get_node(node_id) for node_id in node_ids]) + + async def node_degree(self, node_id: str) -> int: + # [numberchiffre]: node_id not part of graph returns `DegreeView({})` instead of 0 + return self._graph.degree(node_id) if self._graph.has_node(node_id) else 0 + + async def node_degrees_batch(self, node_ids: List[str]) -> List[str]: + return await asyncio.gather(*[self.node_degree(node_id) for node_id in node_ids]) + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + return (self._graph.degree(src_id) if self._graph.has_node(src_id) else 0) + ( + self._graph.degree(tgt_id) if self._graph.has_node(tgt_id) else 0 + ) + + async def edge_degrees_batch(self, edge_pairs: list[tuple[str, str]]) -> list[int]: + return await asyncio.gather(*[self.edge_degree(src_id, tgt_id) for src_id, tgt_id in edge_pairs]) + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + return self._graph.edges.get((source_node_id, target_node_id)) + + async def get_edges_batch( + self, edge_pairs: list[tuple[str, str]] + ) -> list[Union[dict, None]]: + return await asyncio.gather(*[self.get_edge(source_node_id, target_node_id) for source_node_id, target_node_id in edge_pairs]) + + async def get_node_edges(self, source_node_id: str): + if self._graph.has_node(source_node_id): + return list(self._graph.edges(source_node_id)) + return None + + async def get_nodes_edges_batch( + self, node_ids: list[str] + ) -> 
list[list[tuple[str, str]]]: + return await asyncio.gather(*[self.get_node_edges(node_id) for node_id + in node_ids]) + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + self._graph.add_node(node_id, **node_data) + + async def upsert_nodes_batch(self, nodes_data: list[tuple[str, dict[str, str]]]): + await asyncio.gather(*[self.upsert_node(node_id, node_data) for node_id, node_data in nodes_data]) + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + self._graph.add_edge(source_node_id, target_node_id, **edge_data) + + async def upsert_edges_batch( + self, edges_data: list[tuple[str, str, dict[str, str]]] + ): + await asyncio.gather(*[self.upsert_edge(source_node_id, target_node_id, edge_data) + for source_node_id, target_node_id, edge_data in edges_data]) + + async def clustering(self, algorithm: str): + if algorithm not in self._clustering_algorithms: + raise ValueError(f"Clustering algorithm {algorithm} not supported") + await self._clustering_algorithms[algorithm]() + + async def community_schema(self) -> dict[str, SingleCommunitySchema]: + results = defaultdict( + lambda: dict( + level=None, + title=None, + edges=set(), + nodes=set(), + chunk_ids=set(), + occurrence=0.0, + sub_communities=[], + ) + ) + max_num_ids = 0 + levels = defaultdict(set) + for node_id, node_data in self._graph.nodes(data=True): + if "clusters" not in node_data: + continue + clusters = json.loads(node_data["clusters"]) + this_node_edges = self._graph.edges(node_id) + + for cluster in clusters: + level = cluster["level"] + cluster_key = str(cluster["cluster"]) + levels[level].add(cluster_key) + results[cluster_key]["level"] = level + results[cluster_key]["title"] = f"Cluster {cluster_key}" + results[cluster_key]["nodes"].add(node_id) + results[cluster_key]["edges"].update( + [tuple(sorted(e)) for e in this_node_edges] + ) + results[cluster_key]["chunk_ids"].update( + 
node_data["source_id"].split(GRAPH_FIELD_SEP) + ) + max_num_ids = max(max_num_ids, len(results[cluster_key]["chunk_ids"])) + + ordered_levels = sorted(levels.keys()) + for i, curr_level in enumerate(ordered_levels[:-1]): + next_level = ordered_levels[i + 1] + this_level_comms = levels[curr_level] + next_level_comms = levels[next_level] + # compute the sub-communities by nodes intersection + for comm in this_level_comms: + results[comm]["sub_communities"] = [ + c + for c in next_level_comms + if results[c]["nodes"].issubset(results[comm]["nodes"]) + ] + + for k, v in results.items(): + v["edges"] = list(v["edges"]) + v["edges"] = [list(e) for e in v["edges"]] + v["nodes"] = list(v["nodes"]) + v["chunk_ids"] = list(v["chunk_ids"]) + v["occurrence"] = len(v["chunk_ids"]) / max_num_ids + return dict(results) + + def _cluster_data_to_subgraphs(self, cluster_data: dict[str, list[dict[str, str]]]): + for node_id, clusters in cluster_data.items(): + self._graph.nodes[node_id]["clusters"] = json.dumps(clusters) + + async def _leiden_clustering(self): + from graspologic.partition import hierarchical_leiden + + graph = NetworkXStorage.stable_largest_connected_component(self._graph) + community_mapping = hierarchical_leiden( + graph, + max_cluster_size=self.global_config["max_graph_cluster_size"], + random_seed=self.global_config["graph_cluster_seed"], + ) + + node_communities: dict[str, list[dict[str, str]]] = defaultdict(list) + __levels = defaultdict(set) + for partition in community_mapping: + level_key = partition.level + cluster_id = partition.cluster + node_communities[partition.node].append( + {"level": level_key, "cluster": cluster_id} + ) + __levels[level_key].add(cluster_id) + node_communities = dict(node_communities) + __levels = {k: len(v) for k, v in __levels.items()} + logger.info(f"Each level has communities: {dict(__levels)}") + self._cluster_data_to_subgraphs(node_communities) + + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: + 
if algorithm not in self._node_embed_algorithms: + raise ValueError(f"Node embedding algorithm {algorithm} not supported") + return await self._node_embed_algorithms[algorithm]() + + async def _node2vec_embed(self): + from graspologic import embed + + embeddings, nodes = embed.node2vec_embed( + self._graph, + **self.global_config["node2vec_params"], + ) + + nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes] + return embeddings, nodes_ids diff --git a/nano-graphrag/nano_graphrag/_storage/kv_json.py b/nano-graphrag/nano_graphrag/_storage/kv_json.py new file mode 100644 index 0000000000000000000000000000000000000000..b802f268944497411c5130ed1c6693556390dfca --- /dev/null +++ b/nano-graphrag/nano_graphrag/_storage/kv_json.py @@ -0,0 +1,46 @@ +import os +from dataclasses import dataclass + +from .._utils import load_json, logger, write_json +from ..base import ( + BaseKVStorage, +) + + +@dataclass +class JsonKVStorage(BaseKVStorage): + def __post_init__(self): + working_dir = self.global_config["working_dir"] + self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") + self._data = load_json(self._file_name) or {} + logger.info(f"Load KV {self.namespace} with {len(self._data)} data") + + async def all_keys(self) -> list[str]: + return list(self._data.keys()) + + async def index_done_callback(self): + write_json(self._data, self._file_name) + + async def get_by_id(self, id): + return self._data.get(id, None) + + async def get_by_ids(self, ids, fields=None): + if fields is None: + return [self._data.get(id, None) for id in ids] + return [ + ( + {k: v for k, v in self._data[id].items() if k in fields} + if self._data.get(id, None) + else None + ) + for id in ids + ] + + async def filter_keys(self, data: list[str]) -> set[str]: + return set([s for s in data if s not in self._data]) + + async def upsert(self, data: dict[str, dict]): + self._data.update(data) + + async def drop(self): + self._data = {} diff --git 
a/nano-graphrag/nano_graphrag/_storage/vdb_hnswlib.py b/nano-graphrag/nano_graphrag/_storage/vdb_hnswlib.py new file mode 100644 index 0000000000000000000000000000000000000000..3e98c954f6514dfedaafedc60a76ee8d637e5836 --- /dev/null +++ b/nano-graphrag/nano_graphrag/_storage/vdb_hnswlib.py @@ -0,0 +1,141 @@ +import asyncio +import os +from dataclasses import dataclass, field +from typing import Any +import pickle +import hnswlib +import numpy as np +import xxhash + +from .._utils import logger +from ..base import BaseVectorStorage + + +@dataclass +class HNSWVectorStorage(BaseVectorStorage): + ef_construction: int = 100 + M: int = 16 + max_elements: int = 1000000 + ef_search: int = 50 + num_threads: int = -1 + _index: Any = field(init=False) + _metadata: dict[str, dict] = field(default_factory=dict) + _current_elements: int = 0 + + def __post_init__(self): + self._index_file_name = os.path.join( + self.global_config["working_dir"], f"{self.namespace}_hnsw.index" + ) + self._metadata_file_name = os.path.join( + self.global_config["working_dir"], f"{self.namespace}_hnsw_metadata.pkl" + ) + self._embedding_batch_num = self.global_config.get("embedding_batch_num", 100) + + hnsw_params = self.global_config.get("vector_db_storage_cls_kwargs", {}) + self.ef_construction = hnsw_params.get("ef_construction", self.ef_construction) + self.M = hnsw_params.get("M", self.M) + self.max_elements = hnsw_params.get("max_elements", self.max_elements) + self.ef_search = hnsw_params.get("ef_search", self.ef_search) + self.num_threads = hnsw_params.get("num_threads", self.num_threads) + self._index = hnswlib.Index( + space="cosine", dim=self.embedding_func.embedding_dim + ) + + if os.path.exists(self._index_file_name) and os.path.exists( + self._metadata_file_name + ): + self._index.load_index( + self._index_file_name, max_elements=self.max_elements + ) + with open(self._metadata_file_name, "rb") as f: + self._metadata, self._current_elements = pickle.load(f) + logger.info( + f"Loaded 
existing index for {self.namespace} with {self._current_elements} elements" + ) + else: + self._index.init_index( + max_elements=self.max_elements, + ef_construction=self.ef_construction, + M=self.M, + ) + self._index.set_ef(self.ef_search) + self._metadata = {} + self._current_elements = 0 + logger.info(f"Created new index for {self.namespace}") + + async def upsert(self, data: dict[str, dict]) -> np.ndarray: + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + if not data: + logger.warning("You insert an empty data to vector DB") + return [] + + if self._current_elements + len(data) > self.max_elements: + raise ValueError( + f"Cannot insert {len(data)} elements. Current: {self._current_elements}, Max: {self.max_elements}" + ) + + list_data = [ + { + "id": k, + **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, + } + for k, v in data.items() + ] + contents = [v["content"] for v in data.values()] + batch_size = min(self._embedding_batch_num, len(contents)) + embeddings = np.concatenate( + await asyncio.gather( + *[ + self.embedding_func(contents[i : i + batch_size]) + for i in range(0, len(contents), batch_size) + ] + ) + ) + + ids = np.fromiter( + (xxhash.xxh32_intdigest(d["id"].encode()) for d in list_data), + dtype=np.uint32, + count=len(list_data), + ) + self._metadata.update( + { + id_int: { + k: v for k, v in d.items() if k in self.meta_fields or k == "id" + } + for id_int, d in zip(ids, list_data) + } + ) + self._index.add_items(data=embeddings, ids=ids, num_threads=self.num_threads) + self._current_elements = self._index.get_current_count() + return ids + + async def query(self, query: str, top_k: int = 5) -> list[dict]: + if self._current_elements == 0: + return [] + + top_k = min(top_k, self._current_elements) + + if top_k > self.ef_search: + logger.warning( + f"Setting ef_search to {top_k} because top_k is larger than ef_search" + ) + self._index.set_ef(top_k) + + embedding = await self.embedding_func([query]) + labels, 
distances = self._index.knn_query( + data=embedding[0], k=top_k, num_threads=self.num_threads + ) + + return [ + { + **self._metadata.get(label, {}), + "distance": distance, + "similarity": 1 - distance, + } + for label, distance in zip(labels[0], distances[0]) + ] + + async def index_done_callback(self): + self._index.save_index(self._index_file_name) + with open(self._metadata_file_name, "wb") as f: + pickle.dump((self._metadata, self._current_elements), f) diff --git a/nano-graphrag/nano_graphrag/_storage/vdb_nanovectordb.py b/nano-graphrag/nano_graphrag/_storage/vdb_nanovectordb.py new file mode 100644 index 0000000000000000000000000000000000000000..f73ab06279d8fd1b22e3c2b75794fb282f133ae4 --- /dev/null +++ b/nano-graphrag/nano_graphrag/_storage/vdb_nanovectordb.py @@ -0,0 +1,68 @@ +import asyncio +import os +from dataclasses import dataclass +import numpy as np +from nano_vectordb import NanoVectorDB + +from .._utils import logger +from ..base import BaseVectorStorage + + +@dataclass +class NanoVectorDBStorage(BaseVectorStorage): + cosine_better_than_threshold: float = 0.2 + + def __post_init__(self): + + self._client_file_name = os.path.join( + self.global_config["working_dir"], f"vdb_{self.namespace}.json" + ) + self._max_batch_size = self.global_config["embedding_batch_num"] + self._client = NanoVectorDB( + self.embedding_func.embedding_dim, storage_file=self._client_file_name + ) + self.cosine_better_than_threshold = self.global_config.get( + "query_better_than_threshold", self.cosine_better_than_threshold + ) + + async def upsert(self, data: dict[str, dict]): + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + if not len(data): + logger.warning("You insert an empty data to vector DB") + return [] + list_data = [ + { + "__id__": k, + **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, + } + for k, v in data.items() + ] + contents = [v["content"] for v in data.values()] + batches = [ + contents[i : i + self._max_batch_size] + for 
i in range(0, len(contents), self._max_batch_size) + ] + embeddings_list = await asyncio.gather( + *[self.embedding_func(batch) for batch in batches] + ) + embeddings = np.concatenate(embeddings_list) + for i, d in enumerate(list_data): + d["__vector__"] = embeddings[i] + results = self._client.upsert(datas=list_data) + return results + + async def query(self, query: str, top_k=5): + embedding = await self.embedding_func([query]) + embedding = embedding[0] + results = self._client.query( + query=embedding, + top_k=top_k, + better_than_threshold=self.cosine_better_than_threshold, + ) + results = [ + {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results + ] + return results + + async def index_done_callback(self): + self._client.save() diff --git a/nano-graphrag/nano_graphrag/_utils.py b/nano-graphrag/nano_graphrag/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..19457f68c1ceb54a288a18e6343078d3c9bfd0d5 --- /dev/null +++ b/nano-graphrag/nano_graphrag/_utils.py @@ -0,0 +1,305 @@ +import asyncio +import html +import json +import logging +import os +import re +import numbers +from dataclasses import dataclass +from functools import wraps +from hashlib import md5 +from typing import Any, Union, Literal + +import numpy as np +import tiktoken + + +from transformers import AutoTokenizer + +logger = logging.getLogger("nano-graphrag") +logging.getLogger("neo4j").setLevel(logging.ERROR) + +def always_get_an_event_loop() -> asyncio.AbstractEventLoop: + try: + # If there is already an event loop, use it. + loop = asyncio.get_event_loop() + except RuntimeError: + # If in a sub-thread, create a new event loop. 
+ logger.info("Creating a new event loop in a sub-thread.") + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + +def extract_first_complete_json(s: str): + """Extract the first complete JSON object from the string using a stack to track braces.""" + stack = [] + first_json_start = None + + for i, char in enumerate(s): + if char == '{': + stack.append(i) + if first_json_start is None: + first_json_start = i + elif char == '}': + if stack: + start = stack.pop() + if not stack: + first_json_str = s[first_json_start:i+1] + try: + # Attempt to parse the JSON string + return json.loads(first_json_str.replace("\n", "")) + except json.JSONDecodeError as e: + logger.error(f"JSON decoding failed: {e}. Attempted string: {first_json_str[:50]}...") + return None + finally: + first_json_start = None + logger.warning("No complete JSON object found in the input string.") + return None + +def parse_value(value: str): + """Convert a string value to its appropriate type (int, float, bool, None, or keep as string). Work as a more broad 'eval()'""" + value = value.strip() + + if value == "null": + return None + elif value == "true": + return True + elif value == "false": + return False + else: + # Try to convert to int or float + try: + if '.' 
in value: # If there's a dot, it might be a float + return float(value) + else: + return int(value) + except ValueError: + # If conversion fails, return the value as-is (likely a string) + return value.strip('"') # Remove surrounding quotes if they exist + +def extract_values_from_json(json_string, keys=["reasoning", "answer", "data"], allow_no_quotes=False): + """Extract key values from a non-standard or malformed JSON string, handling nested objects.""" + extracted_values = {} + + # Enhanced pattern to match both quoted and unquoted values, as well as nested objects + regex_pattern = r'(?P"?\w+"?)\s*:\s*(?P{[^}]*}|".*?"|[^,}]+)' + + for match in re.finditer(regex_pattern, json_string, re.DOTALL): + key = match.group('key').strip('"') # Strip quotes from key + value = match.group('value').strip() + + # If the value is another nested JSON (starts with '{' and ends with '}'), recursively parse it + if value.startswith('{') and value.endswith('}'): + extracted_values[key] = extract_values_from_json(value) + else: + # Parse the value into the appropriate type (int, float, bool, etc.) 
+ extracted_values[key] = parse_value(value) + + if not extracted_values: + logger.warning("No values could be extracted from the string.") + + return extracted_values + + +def convert_response_to_json(response: str) -> dict: + """Convert response string to JSON, with error handling and fallback to non-standard JSON extraction.""" + prediction_json = extract_first_complete_json(response) + + if prediction_json is None: + logger.info("Attempting to extract values from a non-standard JSON string...") + prediction_json = extract_values_from_json(response, allow_no_quotes=True) + + if not prediction_json: + logger.error("Unable to extract meaningful data from the response.") + else: + logger.info("JSON data successfully extracted.") + + return prediction_json + + + + +class TokenizerWrapper: + def __init__(self, tokenizer_type: Literal["tiktoken", "huggingface"] = "tiktoken", model_name: str = "gpt-4o"): + self.tokenizer_type = tokenizer_type + self.model_name = model_name + self._tokenizer = None + self._lazy_load_tokenizer() + + def _lazy_load_tokenizer(self): + if self._tokenizer is not None: + return + logger.info(f"Loading tokenizer: type='{self.tokenizer_type}', name='{self.model_name}'") + if self.tokenizer_type == "tiktoken": + self._tokenizer = tiktoken.encoding_for_model(self.model_name) + elif self.tokenizer_type == "huggingface": + if AutoTokenizer is None: + raise ImportError("`transformers` is not installed. 
Please install it via `pip install transformers` to use HuggingFace tokenizers.") + self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True) + else: + raise ValueError(f"Unknown tokenizer_type: {self.tokenizer_type}") + + def get_tokenizer(self): + """提供对底层 tokenizer 对象的访问,用于特殊情况(如 decode_batch)。""" + self._lazy_load_tokenizer() + return self._tokenizer + + def encode(self, text: str) -> list[int]: + self._lazy_load_tokenizer() + return self._tokenizer.encode(text) + + def decode(self, tokens: list[int]) -> str: + self._lazy_load_tokenizer() + return self._tokenizer.decode(tokens) + + # +++ 新增 +++: 增加一个批量解码的方法以提高效率,并保持接口一致性 + def decode_batch(self, tokens_list: list[list[int]]) -> list[str]: + self._lazy_load_tokenizer() + # HuggingFace tokenizer 有 decode_batch,但 tiktoken 没有,我们用列表推导来模拟 + if self.tokenizer_type == "tiktoken": + return [self._tokenizer.decode(tokens) for tokens in tokens_list] + elif self.tokenizer_type == "huggingface": + return self._tokenizer.batch_decode(tokens_list, skip_special_tokens=True) + else: + raise ValueError(f"Unknown tokenizer_type: {self.tokenizer_type}") + + + +def truncate_list_by_token_size( + list_data: list, + key: callable, + max_token_size: int, + tokenizer_wrapper: TokenizerWrapper +): + """Truncate a list of data by token size using a provided tokenizer wrapper.""" + if max_token_size <= 0: + return [] + tokens = 0 + for i, data in enumerate(list_data): + tokens += len(tokenizer_wrapper.encode(key(data))) + 1 # 防御性,模拟通过\n拼接列表的情况 + if tokens > max_token_size: + return list_data[:i] + return list_data + + +def compute_mdhash_id(content, prefix: str = ""): + return prefix + md5(content.encode()).hexdigest() + + +def write_json(json_obj, file_name): + with open(file_name, "w", encoding="utf-8") as f: + json.dump(json_obj, f, indent=2, ensure_ascii=False) + + +def load_json(file_name): + if not os.path.exists(file_name): + return None + with open(file_name, encoding="utf-8") as f: + return json.load(f) + + 
+# it's dirty to type, so it's a good way to have fun +def pack_user_ass_to_openai_messages(prompt: str, generated_content: str, using_amazon_bedrock: bool): + if using_amazon_bedrock: + return [ + {"role": "user", "content": [{"text": prompt}]}, + {"role": "assistant", "content": [{"text": generated_content}]}, + ] + else: + return [ + {"role": "user", "content": prompt}, + {"role": "assistant", "content": generated_content}, + ] + + +def is_float_regex(value): + return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) + + +def compute_args_hash(*args): + return md5(str(args).encode()).hexdigest() + + +def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: + """Split a string by multiple markers""" + if not markers: + return [content] + results = re.split("|".join(re.escape(marker) for marker in markers), content) + return [r.strip() for r in results if r.strip()] + + +def enclose_string_with_quotes(content: Any) -> str: + """Enclose a string with quotes""" + if isinstance(content, numbers.Number): + return str(content) + content = str(content) + content = content.strip().strip("'").strip('"') + return f'"{content}"' + + +def list_of_list_to_csv(data: list[list]): + return "\n".join( + [ + ",\t".join([f"{enclose_string_with_quotes(data_dd)}" for data_dd in data_d]) + for data_d in data + ] + ) + + +# ----------------------------------------------------------------------------------- +# Refer the utils functions of the official GraphRAG implementation: +# https://github.com/microsoft/graphrag +def clean_str(input: Any) -> str: + """Clean an input string by removing HTML escapes, control characters, and other unwanted characters.""" + # If we get non-string input, just give it back + if not isinstance(input, str): + return input + + result = html.unescape(input.strip()) + # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python + return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) + + +# Utils types 
----------------------------------------------------------------------- +@dataclass +class EmbeddingFunc: + embedding_dim: int + max_token_size: int + func: callable + + async def __call__(self, *args, **kwargs) -> np.ndarray: + return await self.func(*args, **kwargs) + + +# Decorators ------------------------------------------------------------------------ +def limit_async_func_call(max_size: int, waitting_time: float = 0.0001): + """Add restriction of maximum async calling times for a async func""" + + def final_decro(func): + """Not using async.Semaphore to aovid use nest-asyncio""" + __current_size = 0 + + @wraps(func) + async def wait_func(*args, **kwargs): + nonlocal __current_size + while __current_size >= max_size: + await asyncio.sleep(waitting_time) + __current_size += 1 + result = await func(*args, **kwargs) + __current_size -= 1 + return result + + return wait_func + + return final_decro + + +def wrap_embedding_func_with_attrs(**kwargs): + """Wrap a function with attributes""" + + def final_decro(func) -> EmbeddingFunc: + new_func = EmbeddingFunc(**kwargs, func=func) + return new_func + + return final_decro diff --git a/nano-graphrag/nano_graphrag/base.py b/nano-graphrag/nano_graphrag/base.py new file mode 100644 index 0000000000000000000000000000000000000000..e340941f2f9ffc419f81fa7b23bb01ce13727578 --- /dev/null +++ b/nano-graphrag/nano_graphrag/base.py @@ -0,0 +1,186 @@ +from dataclasses import dataclass, field +from typing import TypedDict, Union, Literal, Generic, TypeVar, List + +import numpy as np + +from ._utils import EmbeddingFunc + + +@dataclass +class QueryParam: + mode: Literal["local", "global", "naive"] = "global" + only_need_context: bool = False + response_type: str = "Multiple Paragraphs" + level: int = 2 + top_k: int = 20 + # naive search + naive_max_token_for_text_unit = 12000 + # local search + local_max_token_for_text_unit: int = 4000 # 12000 * 0.33 + local_max_token_for_local_context: int = 4800 # 12000 * 0.4 + 
local_max_token_for_community_report: int = 3200 # 12000 * 0.27 + local_community_single_one: bool = False + # global search + global_min_community_rating: float = 0 + global_max_consider_community: float = 512 + global_max_token_for_community_report: int = 16384 + global_special_community_map_llm_kwargs: dict = field( + default_factory=lambda: {"response_format": {"type": "json_object"}} + ) + + +TextChunkSchema = TypedDict( + "TextChunkSchema", + {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}, +) + +SingleCommunitySchema = TypedDict( + "SingleCommunitySchema", + { + "level": int, + "title": str, + "edges": list[list[str, str]], + "nodes": list[str], + "chunk_ids": list[str], + "occurrence": float, + "sub_communities": list[str], + }, +) + + +class CommunitySchema(SingleCommunitySchema): + report_string: str + report_json: dict + + +T = TypeVar("T") + + +@dataclass +class StorageNameSpace: + namespace: str + global_config: dict + + async def index_start_callback(self): + """commit the storage operations after indexing""" + pass + + async def index_done_callback(self): + """commit the storage operations after indexing""" + pass + + async def query_done_callback(self): + """commit the storage operations after querying""" + pass + + +@dataclass +class BaseVectorStorage(StorageNameSpace): + embedding_func: EmbeddingFunc + meta_fields: set = field(default_factory=set) + + async def query(self, query: str, top_k: int) -> list[dict]: + raise NotImplementedError + + async def upsert(self, data: dict[str, dict]): + """Use 'content' field from value for embedding, use key as id. 
+ If embedding_func is None, use 'embedding' field from value + """ + raise NotImplementedError + + +@dataclass +class BaseKVStorage(Generic[T], StorageNameSpace): + async def all_keys(self) -> list[str]: + raise NotImplementedError + + async def get_by_id(self, id: str) -> Union[T, None]: + raise NotImplementedError + + async def get_by_ids( + self, ids: list[str], fields: Union[set[str], None] = None + ) -> list[Union[T, None]]: + raise NotImplementedError + + async def filter_keys(self, data: list[str]) -> set[str]: + """return un-exist keys""" + raise NotImplementedError + + async def upsert(self, data: dict[str, T]): + raise NotImplementedError + + async def drop(self): + raise NotImplementedError + + +@dataclass +class BaseGraphStorage(StorageNameSpace): + async def has_node(self, node_id: str) -> bool: + raise NotImplementedError + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + raise NotImplementedError + + async def node_degree(self, node_id: str) -> int: + raise NotImplementedError + + async def node_degrees_batch(self, node_ids: List[str]) -> List[str]: + raise NotImplementedError + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + raise NotImplementedError + + async def edge_degrees_batch(self, edge_pairs: list[tuple[str, str]]) -> list[int]: + raise NotImplementedError + + async def get_node(self, node_id: str) -> Union[dict, None]: + raise NotImplementedError + + async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, Union[dict, None]]: + raise NotImplementedError + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + raise NotImplementedError + + async def get_edges_batch( + self, edge_pairs: list[tuple[str, str]] + ) -> list[Union[dict, None]]: + raise NotImplementedError + + async def get_node_edges( + self, source_node_id: str + ) -> Union[list[tuple[str, str]], None]: + raise NotImplementedError + + async def get_nodes_edges_batch( + self, 
node_ids: list[str] + ) -> list[list[tuple[str, str]]]: + raise NotImplementedError + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + raise NotImplementedError + + async def upsert_nodes_batch(self, nodes_data: list[tuple[str, dict[str, str]]]): + raise NotImplementedError + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + raise NotImplementedError + + async def upsert_edges_batch( + self, edges_data: list[tuple[str, str, dict[str, str]]] + ): + raise NotImplementedError + + async def clustering(self, algorithm: str): + raise NotImplementedError + + async def community_schema(self) -> dict[str, SingleCommunitySchema]: + """Return the community representation with report and nodes""" + raise NotImplementedError + + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: + raise NotImplementedError("Node embedding is not used in nano-graphrag.") diff --git a/nano-graphrag/nano_graphrag/entity_extraction/__init__.py b/nano-graphrag/nano_graphrag/entity_extraction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/nano-graphrag/nano_graphrag/entity_extraction/extract.py b/nano-graphrag/nano_graphrag/entity_extraction/extract.py new file mode 100644 index 0000000000000000000000000000000000000000..45f160713db338ae7e97b8924b78b6bbc27d68b7 --- /dev/null +++ b/nano-graphrag/nano_graphrag/entity_extraction/extract.py @@ -0,0 +1,171 @@ +from typing import Union +import pickle +import asyncio +from openai import BadRequestError +from collections import defaultdict +import dspy +from nano_graphrag.base import ( + BaseGraphStorage, + BaseVectorStorage, + TextChunkSchema, +) +from nano_graphrag.prompt import PROMPTS +from nano_graphrag._utils import logger, compute_mdhash_id +from nano_graphrag.entity_extraction.module import TypedEntityRelationshipExtractor +from nano_graphrag._op import 
_merge_edges_then_upsert, _merge_nodes_then_upsert + + +async def generate_dataset( + chunks: dict[str, TextChunkSchema], + filepath: str, + save_dataset: bool = True, + global_config: dict = {}, +) -> list[dspy.Example]: + entity_extractor = TypedEntityRelationshipExtractor(num_refine_turns=1, self_refine=True) + + if global_config.get("use_compiled_dspy_entity_relationship", False): + entity_extractor.load(global_config["entity_relationship_module_path"]) + + ordered_chunks = list(chunks.items()) + already_processed = 0 + already_entities = 0 + already_relations = 0 + + async def _process_single_content( + chunk_key_dp: tuple[str, TextChunkSchema] + ) -> dspy.Example: + nonlocal already_processed, already_entities, already_relations + chunk_dp = chunk_key_dp[1] + content = chunk_dp["content"] + try: + prediction = await asyncio.to_thread(entity_extractor, input_text=content) + entities, relationships = prediction.entities, prediction.relationships + except BadRequestError as e: + logger.error(f"Error in TypedEntityRelationshipExtractor: {e}") + entities, relationships = [], [] + example = dspy.Example( + input_text=content, entities=entities, relationships=relationships + ).with_inputs("input_text") + already_entities += len(entities) + already_relations += len(relationships) + already_processed += 1 + now_ticks = PROMPTS["process_tickers"][ + already_processed % len(PROMPTS["process_tickers"]) + ] + print( + f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", + end="", + flush=True, + ) + return example + + examples = await asyncio.gather( + *[_process_single_content(c) for c in ordered_chunks] + ) + filtered_examples = [ + example + for example in examples + if len(example.entities) > 0 and len(example.relationships) > 0 + ] + num_filtered_examples = len(examples) - len(filtered_examples) + if save_dataset: + with open(filepath, "wb") as f: + pickle.dump(filtered_examples, f) 
+ logger.info( + f"Saved {len(filtered_examples)} examples with keys: {filtered_examples[0].keys()}, filtered {num_filtered_examples} examples" + ) + + return filtered_examples + + +async def extract_entities_dspy( + chunks: dict[str, TextChunkSchema], + knwoledge_graph_inst: BaseGraphStorage, + entity_vdb: BaseVectorStorage, + global_config: dict, +) -> Union[BaseGraphStorage, None]: + entity_extractor = TypedEntityRelationshipExtractor(num_refine_turns=1, self_refine=True) + + if global_config.get("use_compiled_dspy_entity_relationship", False): + entity_extractor.load(global_config["entity_relationship_module_path"]) + + ordered_chunks = list(chunks.items()) + already_processed = 0 + already_entities = 0 + already_relations = 0 + + async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): + nonlocal already_processed, already_entities, already_relations + chunk_key = chunk_key_dp[0] + chunk_dp = chunk_key_dp[1] + content = chunk_dp["content"] + try: + prediction = await asyncio.to_thread(entity_extractor, input_text=content) + entities, relationships = prediction.entities, prediction.relationships + except BadRequestError as e: + logger.error(f"Error in TypedEntityRelationshipExtractor: {e}") + entities, relationships = [], [] + + maybe_nodes = defaultdict(list) + maybe_edges = defaultdict(list) + + for entity in entities: + entity["source_id"] = chunk_key + maybe_nodes[entity["entity_name"]].append(entity) + already_entities += 1 + + for relationship in relationships: + relationship["source_id"] = chunk_key + maybe_edges[(relationship["src_id"], relationship["tgt_id"])].append( + relationship + ) + already_relations += 1 + + already_processed += 1 + now_ticks = PROMPTS["process_tickers"][ + already_processed % len(PROMPTS["process_tickers"]) + ] + print( + f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", + end="", + flush=True, + ) + return 
dict(maybe_nodes), dict(maybe_edges) + + results = await asyncio.gather( + *[_process_single_content(c) for c in ordered_chunks] + ) + print() + maybe_nodes = defaultdict(list) + maybe_edges = defaultdict(list) + for m_nodes, m_edges in results: + for k, v in m_nodes.items(): + maybe_nodes[k].extend(v) + for k, v in m_edges.items(): + maybe_edges[k].extend(v) + all_entities_data = await asyncio.gather( + *[ + _merge_nodes_then_upsert(k, v, knwoledge_graph_inst, global_config) + for k, v in maybe_nodes.items() + ] + ) + await asyncio.gather( + *[ + _merge_edges_then_upsert(k[0], k[1], v, knwoledge_graph_inst, global_config) + for k, v in maybe_edges.items() + ] + ) + if not len(all_entities_data): + logger.warning("Didn't extract any entities, maybe your LLM is not working") + return None + if entity_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["entity_name"], prefix="ent-"): { + "content": dp["entity_name"] + dp["description"], + "entity_name": dp["entity_name"], + } + for dp in all_entities_data + } + await entity_vdb.upsert(data_for_vdb) + + return knwoledge_graph_inst diff --git a/nano-graphrag/nano_graphrag/entity_extraction/metric.py b/nano-graphrag/nano_graphrag/entity_extraction/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..cfe4b752b2553343ca5ac01ec54c3334c98834e8 --- /dev/null +++ b/nano-graphrag/nano_graphrag/entity_extraction/metric.py @@ -0,0 +1,62 @@ +import dspy +from nano_graphrag.entity_extraction.module import Relationship + + +class AssessRelationships(dspy.Signature): + """ + Assess the similarity between gold and predicted relationships: + 1. Match relationships based on src_id and tgt_id pairs, allowing for slight variations in entity names. + 2. For matched pairs, compare: + a) Description similarity (semantic meaning) + b) Weight similarity + c) Order similarity + 3. Consider unmatched relationships as penalties. + 4. Aggregate scores, accounting for precision and recall. + 5. 
Return a final similarity score between 0 (no similarity) and 1 (perfect match). + + Key considerations: + - Prioritize matching based on entity pairs over exact string matches. + - Use semantic similarity for descriptions rather than exact matches. + - Weight the importance of different aspects (e.g., entity matching, description, weight, order). + - Balance the impact of matched and unmatched relationships in the final score. + """ + + gold_relationships: list[Relationship] = dspy.InputField( + desc="The gold-standard relationships to compare against." + ) + predicted_relationships: list[Relationship] = dspy.InputField( + desc="The predicted relationships to compare against the gold-standard relationships." + ) + similarity_score: float = dspy.OutputField( + desc="Similarity score between 0 and 1, with 1 being the highest similarity." + ) + + +def relationships_similarity_metric( + gold: dspy.Example, pred: dspy.Prediction, trace=None +) -> float: + model = dspy.ChainOfThought(AssessRelationships) + gold_relationships = [Relationship(**item) for item in gold["relationships"]] + predicted_relationships = [Relationship(**item) for item in pred["relationships"]] + similarity_score = float( + model( + gold_relationships=gold_relationships, + predicted_relationships=predicted_relationships, + ).similarity_score + ) + return similarity_score + + +def entity_recall_metric( + gold: dspy.Example, pred: dspy.Prediction, trace=None +) -> float: + true_set = set(item["entity_name"] for item in gold["entities"]) + pred_set = set(item["entity_name"] for item in pred["entities"]) + true_positives = len(pred_set.intersection(true_set)) + false_negatives = len(true_set - pred_set) + recall = ( + true_positives / (true_positives + false_negatives) + if (true_positives + false_negatives) > 0 + else 0 + ) + return recall diff --git a/nano-graphrag/nano_graphrag/entity_extraction/module.py b/nano-graphrag/nano_graphrag/entity_extraction/module.py new file mode 100644 index 
0000000000000000000000000000000000000000..b04272c531060e7b6ce0cab4007f55638223b335 --- /dev/null +++ b/nano-graphrag/nano_graphrag/entity_extraction/module.py @@ -0,0 +1,330 @@ +import dspy +from pydantic import BaseModel, Field +from nano_graphrag._utils import clean_str +from nano_graphrag._utils import logger + + +""" +Obtained from: +https://github.com/SciPhi-AI/R2R/blob/6e958d1e451c1cb10b6fc868572659785d1091cb/r2r/providers/prompts/defaults.jsonl +""" +ENTITY_TYPES = [ + "PERSON", + "ORGANIZATION", + "LOCATION", + "DATE", + "TIME", + "MONEY", + "PERCENTAGE", + "PRODUCT", + "EVENT", + "LANGUAGE", + "NATIONALITY", + "RELIGION", + "TITLE", + "PROFESSION", + "ANIMAL", + "PLANT", + "DISEASE", + "MEDICATION", + "CHEMICAL", + "MATERIAL", + "COLOR", + "SHAPE", + "MEASUREMENT", + "WEATHER", + "NATURAL_DISASTER", + "AWARD", + "LAW", + "CRIME", + "TECHNOLOGY", + "SOFTWARE", + "HARDWARE", + "VEHICLE", + "FOOD", + "DRINK", + "SPORT", + "MUSIC_GENRE", + "INSTRUMENT", + "ARTWORK", + "BOOK", + "MOVIE", + "TV_SHOW", + "ACADEMIC_SUBJECT", + "SCIENTIFIC_THEORY", + "POLITICAL_PARTY", + "CURRENCY", + "STOCK_SYMBOL", + "FILE_TYPE", + "PROGRAMMING_LANGUAGE", + "MEDICAL_PROCEDURE", + "CELESTIAL_BODY", +] + + +class Entity(BaseModel): + entity_name: str = Field(..., description="The name of the entity.") + entity_type: str = Field(..., description="The type of the entity.") + description: str = Field( + ..., description="The description of the entity, in details and comprehensive." + ) + importance_score: float = Field( + ..., + ge=0, + le=1, + description="Importance score of the entity. 
Should be between 0 and 1 with 1 being the most important.", + ) + + def to_dict(self): + return { + "entity_name": clean_str(self.entity_name.upper()), + "entity_type": clean_str(self.entity_type.upper()), + "description": clean_str(self.description), + "importance_score": float(self.importance_score), + } + + +class Relationship(BaseModel): + src_id: str = Field(..., description="The name of the source entity.") + tgt_id: str = Field(..., description="The name of the target entity.") + description: str = Field( + ..., + description="The description of the relationship between the source and target entity, in details and comprehensive.", + ) + weight: float = Field( + ..., + ge=0, + le=1, + description="The weight of the relationship. Should be between 0 and 1 with 1 being the strongest relationship.", + ) + order: int = Field( + ..., + ge=1, + le=3, + description="The order of the relationship. 1 for direct relationships, 2 for second-order, 3 for third-order.", + ) + + def to_dict(self): + return { + "src_id": clean_str(self.src_id.upper()), + "tgt_id": clean_str(self.tgt_id.upper()), + "description": clean_str(self.description), + "weight": float(self.weight), + "order": int(self.order), + } + + +class CombinedExtraction(dspy.Signature): + """ + Given a text document that is potentially relevant to this activity and a list of entity types, + identify all entities of those types from the text and all relationships among the identified entities. + + Entity Guidelines: + 1. Each entity name should be an actual atomic word from the input text. + 2. Avoid duplicates and generic terms. + 3. Make sure descriptions are detailed and comprehensive. Use multiple complete sentences for each point below: + a). The entity's role or significance in the context + b). Key attributes or characteristics + c). Relationships to other entities (if applicable) + d). Historical or cultural relevance (if applicable) + e). Any notable actions or events associated with the entity + 4. 
All entity types from the text must be included. + 5. IMPORTANT: Only use entity types from the provided 'entity_types' list. Do not introduce new entity types. + + Relationship Guidelines: + 1. Make sure relationship descriptions are detailed and comprehensive. Use multiple complete sentences for each point below: + a). The nature of the relationship (e.g., familial, professional, causal) + b). The impact or significance of the relationship on both entities + c). Any historical or contextual information relevant to the relationship + d). How the relationship evolved over time (if applicable) + e). Any notable events or actions that resulted from this relationship + 2. Include direct relationships (order 1) as well as higher-order relationships (order 2 and 3): + a). Direct relationships: Immediate connections between entities. + b). Second-order relationships: Indirect effects or connections that result from direct relationships. + c). Third-order relationships: Further indirect effects that result from second-order relationships. + 3. The "src_id" and "tgt_id" fields must exactly match entity names from the extracted entities list. + """ + + input_text: str = dspy.InputField( + desc="The text to extract entities and relationships from." + ) + entity_types: list[str] = dspy.InputField( + desc="List of entity types used for extraction." + ) + entities: list[Entity] = dspy.OutputField( + desc="List of entities extracted from the text and the entity types." + ) + relationships: list[Relationship] = dspy.OutputField( + desc="List of relationships extracted from the text and the entity types." + ) + + +class CritiqueCombinedExtraction(dspy.Signature): + """ + Critique the current extraction of entities and relationships from a given text. + Focus on completeness, accuracy, and adherence to the provided entity types and extraction guidelines. + + Critique Guidelines: + 1. Evaluate if all relevant entities from the input text are captured and correctly typed. + 2. 
Check if entity descriptions are comprehensive and follow the provided guidelines. + 3. Assess the completeness of relationship extractions, including higher-order relationships. + 4. Verify that relationship descriptions are detailed and follow the provided guidelines. + 5. Identify any inconsistencies, errors, or missed opportunities in the current extraction. + 6. Suggest specific improvements or additions to enhance the quality of the extraction. + """ + + input_text: str = dspy.InputField( + desc="The original text from which entities and relationships were extracted." + ) + entity_types: list[str] = dspy.InputField( + desc="List of valid entity types for this extraction task." + ) + current_entities: list[Entity] = dspy.InputField( + desc="List of currently extracted entities to be critiqued." + ) + current_relationships: list[Relationship] = dspy.InputField( + desc="List of currently extracted relationships to be critiqued." + ) + entity_critique: str = dspy.OutputField( + desc="Detailed critique of the current entities, highlighting areas for improvement for completeness and accuracy.." + ) + relationship_critique: str = dspy.OutputField( + desc="Detailed critique of the current relationships, highlighting areas for improvement for completeness and accuracy.." + ) + + +class RefineCombinedExtraction(dspy.Signature): + """ + Refine the current extraction of entities and relationships based on the provided critique. + Improve completeness, accuracy, and adherence to the extraction guidelines. + + Refinement Guidelines: + 1. Address all points raised in the entity and relationship critiques. + 2. Add missing entities and relationships identified in the critique. + 3. Improve entity and relationship descriptions as suggested. + 4. Ensure all refinements still adhere to the original extraction guidelines. + 5. Maintain consistency between entities and relationships during refinement. + 6. 
Focus on enhancing the overall quality and comprehensiveness of the extraction. + """ + + input_text: str = dspy.InputField( + desc="The original text from which entities and relationships were extracted." + ) + entity_types: list[str] = dspy.InputField( + desc="List of valid entity types for this extraction task." + ) + current_entities: list[Entity] = dspy.InputField( + desc="List of currently extracted entities to be refined." + ) + current_relationships: list[Relationship] = dspy.InputField( + desc="List of currently extracted relationships to be refined." + ) + entity_critique: str = dspy.InputField( + desc="Detailed critique of the current entities to guide refinement." + ) + relationship_critique: str = dspy.InputField( + desc="Detailed critique of the current relationships to guide refinement." + ) + refined_entities: list[Entity] = dspy.OutputField( + desc="List of refined entities, addressing the entity critique and improving upon the current entities." + ) + refined_relationships: list[Relationship] = dspy.OutputField( + desc="List of refined relationships, addressing the relationship critique and improving upon the current relationships." 
+ ) + + +class TypedEntityRelationshipExtractorException(dspy.Module): + def __init__( + self, + predictor: dspy.Module, + exception_types: tuple[type[Exception]] = (Exception,), + ): + super().__init__() + self.predictor = predictor + self.exception_types = exception_types + + def copy(self): + return TypedEntityRelationshipExtractorException(self.predictor) + + def forward(self, **kwargs): + try: + prediction = self.predictor(**kwargs) + return prediction + + except Exception as e: + if isinstance(e, self.exception_types): + return dspy.Prediction(entities=[], relationships=[]) + + raise e + + +class TypedEntityRelationshipExtractor(dspy.Module): + def __init__( + self, + lm: dspy.LM = None, + max_retries: int = 3, + entity_types: list[str] = ENTITY_TYPES, + self_refine: bool = False, + num_refine_turns: int = 1, + ): + super().__init__() + self.lm = lm + self.entity_types = entity_types + self.self_refine = self_refine + self.num_refine_turns = num_refine_turns + + self.extractor = dspy.ChainOfThought( + signature=CombinedExtraction, max_retries=max_retries + ) + self.extractor = TypedEntityRelationshipExtractorException( + self.extractor, exception_types=(ValueError,) + ) + + if self.self_refine: + self.critique = dspy.ChainOfThought( + signature=CritiqueCombinedExtraction, max_retries=max_retries + ) + self.refine = dspy.ChainOfThought( + signature=RefineCombinedExtraction, max_retries=max_retries + ) + + def forward(self, input_text: str) -> dspy.Prediction: + with dspy.context(lm=self.lm if self.lm is not None else dspy.settings.lm): + extraction_result = self.extractor( + input_text=input_text, entity_types=self.entity_types + ) + + current_entities: list[Entity] = extraction_result.entities + current_relationships: list[Relationship] = extraction_result.relationships + + if self.self_refine: + for _ in range(self.num_refine_turns): + critique_result = self.critique( + input_text=input_text, + entity_types=self.entity_types, + 
current_entities=current_entities, + current_relationships=current_relationships, + ) + refined_result = self.refine( + input_text=input_text, + entity_types=self.entity_types, + current_entities=current_entities, + current_relationships=current_relationships, + entity_critique=critique_result.entity_critique, + relationship_critique=critique_result.relationship_critique, + ) + logger.debug( + f"entities: {len(current_entities)} | refined_entities: {len(refined_result.refined_entities)}" + ) + logger.debug( + f"relationships: {len(current_relationships)} | refined_relationships: {len(refined_result.refined_relationships)}" + ) + current_entities = refined_result.refined_entities + current_relationships = refined_result.refined_relationships + + entities = [entity.to_dict() for entity in current_entities] + relationships = [ + relationship.to_dict() for relationship in current_relationships + ] + + return dspy.Prediction(entities=entities, relationships=relationships) diff --git a/nano-graphrag/nano_graphrag/graphrag.py b/nano-graphrag/nano_graphrag/graphrag.py new file mode 100644 index 0000000000000000000000000000000000000000..b1d887ac59eb39859f7bf51176898885549c7a2f --- /dev/null +++ b/nano-graphrag/nano_graphrag/graphrag.py @@ -0,0 +1,382 @@ +import asyncio +import os +from dataclasses import asdict, dataclass, field +from datetime import datetime +from functools import partial +from typing import Callable, Dict, List, Optional, Type, Union, cast + + + +from ._llm import ( + amazon_bedrock_embedding, + create_amazon_bedrock_complete_function, + gpt_4o_complete, + gpt_4o_mini_complete, + openai_embedding, + azure_gpt_4o_complete, + azure_openai_embedding, + azure_gpt_4o_mini_complete, +) +from ._op import ( + chunking_by_token_size, + extract_entities, + generate_community_report, + get_chunks, + local_query, + global_query, + naive_query, +) +from ._storage import ( + JsonKVStorage, + NanoVectorDBStorage, + NetworkXStorage, +) +from ._utils import ( + 
EmbeddingFunc, + compute_mdhash_id, + limit_async_func_call, + convert_response_to_json, + always_get_an_event_loop, + logger, + TokenizerWrapper, +) +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, + StorageNameSpace, + QueryParam, +) + + +@dataclass +class GraphRAG: + working_dir: str = field( + default_factory=lambda: f"./nano_graphrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}" + ) + # graph mode + enable_local: bool = True + enable_naive_rag: bool = False + + # text chunking + tokenizer_type: str = "tiktoken" # or 'huggingface' + tiktoken_model_name: str = "gpt-4o" + huggingface_model_name: str = "bert-base-uncased" # default HF model + chunk_func: Callable[ + [ + list[list[int]], + List[str], + TokenizerWrapper, + Optional[int], + Optional[int], + ], + List[Dict[str, Union[str, int]]], + ] = chunking_by_token_size + chunk_token_size: int = 1200 + chunk_overlap_token_size: int = 100 + + + # entity extraction + entity_extract_max_gleaning: int = 1 + entity_summary_to_max_tokens: int = 500 + + # graph clustering + graph_cluster_algorithm: str = "leiden" + max_graph_cluster_size: int = 10 + graph_cluster_seed: int = 0xDEADBEEF + + # node embedding + node_embedding_algorithm: str = "node2vec" + node2vec_params: dict = field( + default_factory=lambda: { + "dimensions": 1536, + "num_walks": 10, + "walk_length": 40, + "num_walks": 10, + "window_size": 2, + "iterations": 3, + "random_seed": 3, + } + ) + + # community reports + special_community_report_llm_kwargs: dict = field( + default_factory=lambda: {"response_format": {"type": "json_object"}} + ) + + # text embedding + embedding_func: EmbeddingFunc = field(default_factory=lambda: openai_embedding) + embedding_batch_num: int = 32 + embedding_func_max_async: int = 16 + query_better_than_threshold: float = 0.2 + + # LLM + using_azure_openai: bool = False + using_amazon_bedrock: bool = False + best_model_id: str = "us.anthropic.claude-3-sonnet-20240229-v1:0" + cheap_model_id: 
str = "us.anthropic.claude-3-haiku-20240307-v1:0" + best_model_func: callable = gpt_4o_complete + best_model_max_token_size: int = 32768 + best_model_max_async: int = 16 + cheap_model_func: callable = gpt_4o_mini_complete + cheap_model_max_token_size: int = 32768 + cheap_model_max_async: int = 16 + + # entity extraction + entity_extraction_func: callable = extract_entities + + # storage + key_string_value_json_storage_cls: Type[BaseKVStorage] = JsonKVStorage + vector_db_storage_cls: Type[BaseVectorStorage] = NanoVectorDBStorage + vector_db_storage_cls_kwargs: dict = field(default_factory=dict) + graph_storage_cls: Type[BaseGraphStorage] = NetworkXStorage + enable_llm_cache: bool = True + + # extension + always_create_working_dir: bool = True + addon_params: dict = field(default_factory=dict) + convert_response_to_json_func: callable = convert_response_to_json + + def __post_init__(self): + _print_config = ",\n ".join([f"{k} = {v}" for k, v in asdict(self).items()]) + logger.debug(f"GraphRAG init with param:\n\n {_print_config}\n") + + self.tokenizer_wrapper = TokenizerWrapper( + tokenizer_type=self.tokenizer_type, + model_name=self.tiktoken_model_name if self.tokenizer_type == "tiktoken" else self.huggingface_model_name + ) + + if self.using_azure_openai: + # If there's no OpenAI API key, use Azure OpenAI + if self.best_model_func == gpt_4o_complete: + self.best_model_func = azure_gpt_4o_complete + if self.cheap_model_func == gpt_4o_mini_complete: + self.cheap_model_func = azure_gpt_4o_mini_complete + if self.embedding_func == openai_embedding: + self.embedding_func = azure_openai_embedding + logger.info( + "Switched the default openai funcs to Azure OpenAI if you didn't set any of it" + ) + + if self.using_amazon_bedrock: + self.best_model_func = create_amazon_bedrock_complete_function(self.best_model_id) + self.cheap_model_func = create_amazon_bedrock_complete_function(self.cheap_model_id) + self.embedding_func = amazon_bedrock_embedding + logger.info( + 
"Switched the default openai funcs to Amazon Bedrock" + ) + + if not os.path.exists(self.working_dir) and self.always_create_working_dir: + logger.info(f"Creating working directory {self.working_dir}") + os.makedirs(self.working_dir) + + self.full_docs = self.key_string_value_json_storage_cls( + namespace="full_docs", global_config=asdict(self) + ) + + self.text_chunks = self.key_string_value_json_storage_cls( + namespace="text_chunks", global_config=asdict(self) + ) + + self.llm_response_cache = ( + self.key_string_value_json_storage_cls( + namespace="llm_response_cache", global_config=asdict(self) + ) + if self.enable_llm_cache + else None + ) + + self.community_reports = self.key_string_value_json_storage_cls( + namespace="community_reports", global_config=asdict(self) + ) + self.chunk_entity_relation_graph = self.graph_storage_cls( + namespace="chunk_entity_relation", global_config=asdict(self) + ) + + self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( + self.embedding_func + ) + self.entities_vdb = ( + self.vector_db_storage_cls( + namespace="entities", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"entity_name"}, + ) + if self.enable_local + else None + ) + self.chunks_vdb = ( + self.vector_db_storage_cls( + namespace="chunks", + global_config=asdict(self), + embedding_func=self.embedding_func, + ) + if self.enable_naive_rag + else None + ) + + self.best_model_func = limit_async_func_call(self.best_model_max_async)( + partial(self.best_model_func, hashing_kv=self.llm_response_cache) + ) + self.cheap_model_func = limit_async_func_call(self.cheap_model_max_async)( + partial(self.cheap_model_func, hashing_kv=self.llm_response_cache) + ) + + + + def insert(self, string_or_strings): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert(string_or_strings)) + + def query(self, query: str, param: QueryParam = QueryParam()): + loop = always_get_an_event_loop() + return 
loop.run_until_complete(self.aquery(query, param)) + + async def aquery(self, query: str, param: QueryParam = QueryParam()): + if param.mode == "local" and not self.enable_local: + raise ValueError("enable_local is False, cannot query in local mode") + if param.mode == "naive" and not self.enable_naive_rag: + raise ValueError("enable_naive_rag is False, cannot query in naive mode") + if param.mode == "local": + response = await local_query( + query, + self.chunk_entity_relation_graph, + self.entities_vdb, + self.community_reports, + self.text_chunks, + param, + self.tokenizer_wrapper, + asdict(self), + ) + elif param.mode == "global": + response = await global_query( + query, + self.chunk_entity_relation_graph, + self.entities_vdb, + self.community_reports, + self.text_chunks, + param, + self.tokenizer_wrapper, + asdict(self), + ) + elif param.mode == "naive": + response = await naive_query( + query, + self.chunks_vdb, + self.text_chunks, + param, + self.tokenizer_wrapper, + asdict(self), + ) + else: + raise ValueError(f"Unknown mode {param.mode}") + await self._query_done() + return response + + async def ainsert(self, string_or_strings): + await self._insert_start() + try: + if isinstance(string_or_strings, str): + string_or_strings = [string_or_strings] + # ---------- new docs + new_docs = { + compute_mdhash_id(c.strip(), prefix="doc-"): {"content": c.strip()} + for c in string_or_strings + } + _add_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) + new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} + if not len(new_docs): + logger.warning(f"All docs are already in the storage") + return + logger.info(f"[New Docs] inserting {len(new_docs)} docs") + + # ---------- chunking + + inserting_chunks = get_chunks( + new_docs=new_docs, + chunk_func=self.chunk_func, + overlap_token_size=self.chunk_overlap_token_size, + max_token_size=self.chunk_token_size, + tokenizer_wrapper=self.tokenizer_wrapper, + ) + + _add_chunk_keys = await 
self.text_chunks.filter_keys( + list(inserting_chunks.keys()) + ) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } + if not len(inserting_chunks): + logger.warning(f"All chunks are already in the storage") + return + logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") + if self.enable_naive_rag: + logger.info("Insert chunks for naive RAG") + await self.chunks_vdb.upsert(inserting_chunks) + + # TODO: don't support incremental update for communities now, so we have to drop all + await self.community_reports.drop() + + # ---------- extract/summary entity and upsert to graph + logger.info("[Entity Extraction]...") + maybe_new_kg = await self.entity_extraction_func( + inserting_chunks, + knwoledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + tokenizer_wrapper=self.tokenizer_wrapper, + global_config=asdict(self), + using_amazon_bedrock=self.using_amazon_bedrock, + ) + if maybe_new_kg is None: + logger.warning("No new entities found") + return + self.chunk_entity_relation_graph = maybe_new_kg + # ---------- update clusterings of graph + logger.info("[Community Report]...") + await self.chunk_entity_relation_graph.clustering( + self.graph_cluster_algorithm + ) + await generate_community_report( + self.community_reports, self.chunk_entity_relation_graph, self.tokenizer_wrapper, asdict(self) + ) + + # ---------- commit upsertings and indexing + await self.full_docs.upsert(new_docs) + await self.text_chunks.upsert(inserting_chunks) + finally: + await self._insert_done() + + async def _insert_start(self): + tasks = [] + for storage_inst in [ + self.chunk_entity_relation_graph, + ]: + if storage_inst is None: + continue + tasks.append(cast(StorageNameSpace, storage_inst).index_start_callback()) + await asyncio.gather(*tasks) + + async def _insert_done(self): + tasks = [] + for storage_inst in [ + self.full_docs, + self.text_chunks, + self.llm_response_cache, + 
self.community_reports, + self.entities_vdb, + self.chunks_vdb, + self.chunk_entity_relation_graph, + ]: + if storage_inst is None: + continue + tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) + await asyncio.gather(*tasks) + + async def _query_done(self): + tasks = [] + for storage_inst in [self.llm_response_cache]: + if storage_inst is None: + continue + tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) + await asyncio.gather(*tasks) diff --git a/nano-graphrag/nano_graphrag/prompt.py b/nano-graphrag/nano_graphrag/prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..06f1da0d2b805f5ab8e4cad648e177ddb89fe073 --- /dev/null +++ b/nano-graphrag/nano_graphrag/prompt.py @@ -0,0 +1,520 @@ +""" +Reference: + - Prompts are from [graphrag](https://github.com/microsoft/graphrag) +""" + +GRAPH_FIELD_SEP = "" +PROMPTS = {} + +PROMPTS[ + "claim_extraction" +] = """-Target activity- +You are an intelligent assistant that helps a human analyst to analyze claims against certain entities presented in a text document. + +-Goal- +Given a text document that is potentially relevant to this activity, an entity specification, and a claim description, extract all entities that match the entity specification and all claims against those entities. + +-Steps- +1. Extract all named entities that match the predefined entity specification. Entity specification can either be a list of entity names or a list of entity types. +2. For each entity identified in step 1, extract all claims associated with the entity. Claims need to match the specified claim description, and the entity should be the subject of the claim. +For each claim, extract the following information: +- Subject: name of the entity that is subject of the claim, capitalized. The subject entity is one that committed the action described in the claim. Subject needs to be one of the named entities identified in step 1. 
+- Object: name of the entity that is object of the claim, capitalized. The object entity is one that either reports/handles or is affected by the action described in the claim. If object entity is unknown, use **NONE**.
+- Claim Type: overall category of the claim, capitalized. Name it in a way that can be repeated across multiple text inputs, so that similar claims share the same claim type
+- Claim Status: **TRUE**, **FALSE**, or **SUSPECTED**. TRUE means the claim is confirmed, FALSE means the claim is found to be False, SUSPECTED means the claim is not verified.
+- Claim Description: Detailed description explaining the reasoning behind the claim, together with all the related evidence and references.
+- Claim Date: Period (start_date, end_date) when the claim was made. Both start_date and end_date should be in ISO-8601 format. If the claim was made on a single date rather than a date range, set the same date for both start_date and end_date. If date is unknown, return **NONE**.
+- Claim Source Text: List of **all** quotes from the original text that are relevant to the claim.
+
+Format each claim as (<subject_entity>{tuple_delimiter}<object_entity>{tuple_delimiter}<claim_type>{tuple_delimiter}<claim_status>{tuple_delimiter}<claim_start_date>{tuple_delimiter}<claim_end_date>{tuple_delimiter}<claim_description>{tuple_delimiter}<claim_source>)
+
+3. Return output in English as a single list of all the claims identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+
+4. When finished, output {completion_delimiter}
+
+-Examples-
+Example 1:
+Entity specification: organization
+Claim description: red flags associated with an entity
+Text: According to an article on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B. The company is owned by Person C who was suspected of engaging in corruption activities in 2015.
+Output: + +(COMPANY A{tuple_delimiter}GOVERNMENT AGENCY B{tuple_delimiter}ANTI-COMPETITIVE PRACTICES{tuple_delimiter}TRUE{tuple_delimiter}2022-01-10T00:00:00{tuple_delimiter}2022-01-10T00:00:00{tuple_delimiter}Company A was found to engage in anti-competitive practices because it was fined for bid rigging in multiple public tenders published by Government Agency B according to an article published on 2022/01/10{tuple_delimiter}According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B.) +{completion_delimiter} + +Example 2: +Entity specification: Company A, Person C +Claim description: red flags associated with an entity +Text: According to an article on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B. The company is owned by Person C who was suspected of engaging in corruption activities in 2015. +Output: + +(COMPANY A{tuple_delimiter}GOVERNMENT AGENCY B{tuple_delimiter}ANTI-COMPETITIVE PRACTICES{tuple_delimiter}TRUE{tuple_delimiter}2022-01-10T00:00:00{tuple_delimiter}2022-01-10T00:00:00{tuple_delimiter}Company A was found to engage in anti-competitive practices because it was fined for bid rigging in multiple public tenders published by Government Agency B according to an article published on 2022/01/10{tuple_delimiter}According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B.) 
+{record_delimiter} +(PERSON C{tuple_delimiter}NONE{tuple_delimiter}CORRUPTION{tuple_delimiter}SUSPECTED{tuple_delimiter}2015-01-01T00:00:00{tuple_delimiter}2015-12-30T00:00:00{tuple_delimiter}Person C was suspected of engaging in corruption activities in 2015{tuple_delimiter}The company is owned by Person C who was suspected of engaging in corruption activities in 2015) +{completion_delimiter} + +-Real Data- +Use the following input for your answer. +Entity specification: {entity_specs} +Claim description: {claim_description} +Text: {input_text} +Output: """ + +PROMPTS[ + "community_report" +] = """You are an AI assistant that helps a human analyst to perform general information discovery. +Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network. + +# Goal +Write a comprehensive report of a community, given a list of entities that belong to the community as well as their relationships and optional associated claims. The report will be used to inform decision-makers about information associated with the community and their potential impact. The content of this report includes an overview of the community's key entities, their legal compliance, technical capabilities, reputation, and noteworthy claims. + +# Report Structure + +The report should include the following sections: + +- TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title. +- SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant information associated with its entities. +- IMPACT SEVERITY RATING: a float score between 0-10 that represents the severity of IMPACT posed by entities within the community. IMPACT is the scored importance of a community. 
+- RATING EXPLANATION: Give a single sentence explanation of the IMPACT severity rating.
+- DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules below. Be comprehensive.
+
+Return output as a well-formed JSON-formatted string with the following format:
+    {{
+        "title": <report_title>,
+        "summary": <executive_summary>,
+        "rating": <impact_severity_rating>,
+        "rating_explanation": <rating_explanation>,
+        "findings": [
+            {{
+                "summary": <insight_1_summary>,
+                "explanation": <insight_1_explanation>
+            }},
+            {{
+                "summary": <insight_2_summary>,
+                "explanation": <insight_2_explanation>
+            }}
+            ...
+        ]
+    }}
+
+# Grounding Rules
+Do not include information where the supporting evidence for it is not provided.
+
+
+# Example Input
+-----------
+Text:
+```
+Entities:
+```csv
+id,entity,type,description
+5,VERDANT OASIS PLAZA,geo,Verdant Oasis Plaza is the location of the Unity March
+6,HARMONY ASSEMBLY,organization,Harmony Assembly is an organization that is holding a march at Verdant Oasis Plaza
+```
+Relationships:
+```csv
+id,source,target,description
+37,VERDANT OASIS PLAZA,UNITY MARCH,Verdant Oasis Plaza is the location of the Unity March
+38,VERDANT OASIS PLAZA,HARMONY ASSEMBLY,Harmony Assembly is holding a march at Verdant Oasis Plaza
+39,VERDANT OASIS PLAZA,UNITY MARCH,The Unity March is taking place at Verdant Oasis Plaza
+40,VERDANT OASIS PLAZA,TRIBUNE SPOTLIGHT,Tribune Spotlight is reporting on the Unity march taking place at Verdant Oasis Plaza
+41,VERDANT OASIS PLAZA,BAILEY ASADI,Bailey Asadi is speaking at Verdant Oasis Plaza about the march
+43,HARMONY ASSEMBLY,UNITY MARCH,Harmony Assembly is organizing the Unity March
+```
+```
+Output:
+{{
+    "title": "Verdant Oasis Plaza and Unity March",
+    "summary": "The community revolves around the Verdant Oasis Plaza, which is the location of the Unity March.
The plaza has relationships with the Harmony Assembly, Unity March, and Tribune Spotlight, all of which are associated with the march event.", + "rating": 5.0, + "rating_explanation": "The impact severity rating is moderate due to the potential for unrest or conflict during the Unity March.", + "findings": [ + {{ + "summary": "Verdant Oasis Plaza as the central location", + "explanation": "Verdant Oasis Plaza is the central entity in this community, serving as the location for the Unity March. This plaza is the common link between all other entities, suggesting its significance in the community. The plaza's association with the march could potentially lead to issues such as public disorder or conflict, depending on the nature of the march and the reactions it provokes." + }}, + {{ + "summary": "Harmony Assembly's role in the community", + "explanation": "Harmony Assembly is another key entity in this community, being the organizer of the march at Verdant Oasis Plaza. The nature of Harmony Assembly and its march could be a potential source of threat, depending on their objectives and the reactions they provoke. The relationship between Harmony Assembly and the plaza is crucial in understanding the dynamics of this community." + }}, + {{ + "summary": "Unity March as a significant event", + "explanation": "The Unity March is a significant event taking place at Verdant Oasis Plaza. This event is a key factor in the community's dynamics and could be a potential source of threat, depending on the nature of the march and the reactions it provokes. The relationship between the march and the plaza is crucial in understanding the dynamics of this community." + }}, + {{ + "summary": "Role of Tribune Spotlight", + "explanation": "Tribune Spotlight is reporting on the Unity March taking place in Verdant Oasis Plaza. This suggests that the event has attracted media attention, which could amplify its impact on the community. 
The role of Tribune Spotlight could be significant in shaping public perception of the event and the entities involved." + }} + ] +}} + + +# Real Data + +Use the following text for your answer. Do not make anything up in your answer. + +Text: +``` +{input_text} +``` + +The report should include the following sections: + +- TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title. +- SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant information associated with its entities. +- IMPACT SEVERITY RATING: a float score between 0-10 that represents the severity of IMPACT posed by entities within the community. IMPACT is the scored importance of a community. +- RATING EXPLANATION: Give a single sentence explanation of the IMPACT severity rating. +- DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules below. Be comprehensive. + +Return output as a well-formed JSON-formatted string with the following format: + {{ + "title": , + "summary": , + "rating": , + "rating_explanation": , + "findings": [ + {{ + "summary":, + "explanation": + }}, + {{ + "summary":, + "explanation": + }} + ... + ] + }} + +# Grounding Rules +Do not include information where the supporting evidence for it is not provided. + +Output: +""" + +PROMPTS[ + "entity_extraction" +] = """-Goal- +Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. + +-Steps- +1. Identify all entities. 
For each identified entity, extract the following information:
+- entity_name: Name of the entity, capitalized
+- entity_type: One of the following types: [{entity_types}]
+- entity_description: Comprehensive description of the entity's attributes and activities
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
+
+2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
+For each pair of related entities, extract the following information:
+- source_entity: name of the source entity, as identified in step 1
+- target_entity: name of the target entity, as identified in step 1
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
+ Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)
+
+3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+
+4. When finished, output {completion_delimiter}
+
+######################
+-Examples-
+######################
+Example 1:
+
+Entity_types: [person, technology, mission, organization, location]
+Text:
+while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
+
+Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us.
For all of us.” + +The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce. + +It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths +################ +Output: +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter} +("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter} +("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter} +("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter} +("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}6){record_delimiter} 
+("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}8){record_delimiter} +("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}5){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}9){completion_delimiter} +############################# +Example 2: + +Entity_types: [person, technology, mission, organization, location] +Text: +They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve. + +Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril. + +Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. 
A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly +############# +Output: +("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter} +("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter} +("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}9){completion_delimiter} +############################# +Example 3: + +Entity_types: [person, role, technology, organization, event, location, concept] +Text: +their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data. + +"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning." 
+ +Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back." + +Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history. + +The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation +############# +Output: +("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter} +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter} +("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter} +("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter} +("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter} +("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter} +("relationship"{tuple_delimiter}"Sam 
Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}9){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}10){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}8){record_delimiter} +("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}7){completion_delimiter} +############################# +-Real Data- +###################### +Entity_types: {entity_types} +Text: {input_text} +###################### +Output: +""" + + +PROMPTS[ + "summarize_entity_descriptions" +] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. +Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +Make sure it is written in third person, and include the entity names so we the have full context. + +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""" + + +PROMPTS[ + "entiti_continue_extraction" +] = """MANY entities were missed in the last extraction. 
Add them below using the same format: +""" + +PROMPTS[ + "entiti_if_loop_extraction" +] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. +""" + +PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"] +PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>" +PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##" +PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>" + +PROMPTS[ + "local_rag_response" +] = """---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. + +---Target response length and format--- + +{response_type} + + +---Data tables--- + +{context_data} + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. + +If you don't know the answer, just say so. Do not make anything up. + +Do not include information where the supporting evidence for it is not provided. + + +---Target response length and format--- + +{response_type} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" + +PROMPTS[ + "global_map_rag_points" +] = """---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. 
+ + +---Goal--- + +Generate a response consisting of a list of key points that responds to the user's question, summarizing all relevant information in the input data tables. + +You should use the data provided in the data tables below as the primary context for generating the response. +If you don't know the answer or if the input data tables do not contain sufficient information to provide an answer, just say so. Do not make anything up. + +Each key point in the response should have the following element: +- Description: A comprehensive description of the point. +- Importance Score: An integer score between 0-100 that indicates how important the point is in answering the user's question. An 'I don't know' type of response should have a score of 0. + +The response should be JSON formatted as follows: +{{ + "points": [ + {{"description": "Description of point 1...", "score": score_value}}, + {{"description": "Description of point 2...", "score": score_value}} + ] +}} + +The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". +Do not include information where the supporting evidence for it is not provided. + + +---Data tables--- + +{context_data} + +---Goal--- + +Generate a response consisting of a list of key points that responds to the user's question, summarizing all relevant information in the input data tables. + +You should use the data provided in the data tables below as the primary context for generating the response. +If you don't know the answer or if the input data tables do not contain sufficient information to provide an answer, just say so. Do not make anything up. + +Each key point in the response should have the following element: +- Description: A comprehensive description of the point. +- Importance Score: An integer score between 0-100 that indicates how important the point is in answering the user's question. An 'I don't know' type of response should have a score of 0. 
+ +The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". +Do not include information where the supporting evidence for it is not provided. + +The response should be JSON formatted as follows: +{{ + "points": [ + {{"description": "Description of point 1", "score": score_value}}, + {{"description": "Description of point 2", "score": score_value}} + ] +}} +""" + +PROMPTS[ + "global_reduce_rag_response" +] = """---Role--- + +You are a helpful assistant responding to questions about a dataset by synthesizing perspectives from multiple analysts. + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarize all the reports from multiple analysts who focused on different parts of the dataset. + +Note that the analysts' reports provided below are ranked in the **descending order of importance**. + +If you don't know the answer or if the provided reports do not contain sufficient information to provide an answer, just say so. Do not make anything up. + +The final response should remove all irrelevant information from the analysts' reports and merge the cleaned information into a comprehensive answer that provides explanations of all the key points and implications appropriate for the response length and format. + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. + +The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". + +Do not include information where the supporting evidence for it is not provided. + + +---Target response length and format--- + +{response_type} + + +---Analyst Reports--- + +{report_data} + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarize all the reports from multiple analysts who focused on different parts of the dataset. 
+ +Note that the analysts' reports provided below are ranked in the **descending order of importance**. + +If you don't know the answer or if the provided reports do not contain sufficient information to provide an answer, just say so. Do not make anything up. + +The final response should remove all irrelevant information from the analysts' reports and merge the cleaned information into a comprehensive answer that provides explanations of all the key points and implications appropriate for the response length and format. + +The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". + +Do not include information where the supporting evidence for it is not provided. + + +---Target response length and format--- + +{response_type} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" + +PROMPTS[ + "naive_rag_response" +] = """You're a helpful assistant +Below are the knowledge you know: +{content_data} +--- +If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. +---Target response length and format--- +{response_type} +""" + +PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." 
# Spinner frames (Braille dot patterns) cycled to animate CLI progress.
PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]

# Ordered separator candidates for the default text splitter: coarsest
# boundaries (paragraphs) first, then line breaks, sentence punctuation,
# whitespace, and finally invisible break characters.
PROMPTS["default_text_separator"] = [
    # Paragraph separators
    "\n\n",
    "\r\n\r\n",
    # Line breaks
    "\n",
    "\r\n",
    # Sentence ending punctuation
    "。",  # Chinese period
    "．",  # Full-width dot (U+FF0E; was an ASCII "." duplicating the entry below)
    ".",  # English period
    "！",  # Chinese exclamation mark (U+FF01)
    "!",  # English exclamation mark
    "？",  # Chinese question mark (U+FF1F)
    "?",  # English question mark
    # Whitespace characters
    " ",  # Space
    "\t",  # Tab
    "\u3000",  # Full-width space
    # Special characters
    "\u200b",  # Zero-width space (used in some Asian languages)
]
+ + + + Shows the MemoDB logo + + +

A simple, easy-to-hack GraphRAG implementation

+

+ + + + + + + + + + +

+

+ + + + + + +

+
+ + + + + + + + + +😭 [GraphRAG](https://arxiv.org/pdf/2404.16130) is good and powerful, but the official [implementation](https://github.com/microsoft/graphrag/tree/main) is difficult/painful to **read or hack**. + +😊 This project provides a **smaller, faster, cleaner GraphRAG**, while remaining the core functionality(see [benchmark](#benchmark) and [issues](#Issues) ). + +🎁 Excluding `tests` and prompts, `nano-graphrag` is about **1100 lines of code**. + +👌 Small yet [**portable**](#Components)(faiss, neo4j, ollama...), [**asynchronous**](#Async) and fully typed. + + + +> If you're looking for a multi-user RAG solution for long-term user memory, have a look at this project: [memobase](https://github.com/memodb-io/memobase) :) + +## Install + +**Install from source** (recommend) + +```shell +# clone this repo first +cd nano-graphrag +pip install -e . +``` + +**Install from PyPi** + +```shell +pip install nano-graphrag +``` + + + +## Quick Start + +> [!TIP] +> +> **Please set OpenAI API key in environment: `export OPENAI_API_KEY="sk-..."`.** + +> [!TIP] +> If you're using Azure OpenAI API, refer to the [.env.example](./.env.example.azure) to set your azure openai. Then pass `GraphRAG(...,using_azure_openai=True,...)` to enable. + +> [!TIP] +> If you're using Amazon Bedrock API, please ensure your credentials are properly set through commands like `aws configure`. Then enable it by configuring like this: `GraphRAG(...,using_amazon_bedrock=True, best_model_id="us.anthropic.claude-3-sonnet-20240229-v1:0", cheap_model_id="us.anthropic.claude-3-haiku-20240307-v1:0",...)`. Refer to an [example script](./examples/using_amazon_bedrock.py). + +> [!TIP] +> +> If you don't have any key, check out this [example](./examples/no_openai_key_at_all.py) that using `transformers` and `ollama` . If you like to use another LLM or Embedding Model, check [Advances](#Advances). 
+ +download a copy of A Christmas Carol by Charles Dickens: + +```shell +curl https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt > ./book.txt +``` + +Use the below python snippet: + +```python +from nano_graphrag import GraphRAG, QueryParam + +graph_func = GraphRAG(working_dir="./dickens") + +with open("./book.txt") as f: + graph_func.insert(f.read()) + +# Perform global graphrag search +print(graph_func.query("What are the top themes in this story?")) + +# Perform local graphrag search (I think is better and more scalable one) +print(graph_func.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +``` + +Next time you initialize a `GraphRAG` from the same `working_dir`, it will reload all the contexts automatically. + +#### Batch Insert + +```python +graph_func.insert(["TEXT1", "TEXT2",...]) +``` + +
+ Incremental Insert + +`nano-graphrag` supports incremental insert, no duplicated computation or data will be added: + +```python +with open("./book.txt") as f: + book = f.read() + half_len = len(book) // 2 + graph_func.insert(book[:half_len]) + graph_func.insert(book[half_len:]) +``` + +> `nano-graphrag` use md5-hash of the content as the key, so there is no duplicated chunk. +> +> However, each time you insert, the communities of graph will be re-computed and the community reports will be re-generated + +
+ +
+ Naive RAG + +`nano-graphrag` supports naive RAG insert and query as well: + +```python +graph_func = GraphRAG(working_dir="./dickens", enable_naive_rag=True) +... +# Query +print(rag.query( + "What are the top themes in this story?", + param=QueryParam(mode="naive") +) +``` +
+ + +### Async + +For each method `NAME(...)` , there is a corresponding async method `aNAME(...)` + +```python +await graph_func.ainsert(...) +await graph_func.aquery(...) +... +``` + +### Available Parameters + +`GraphRAG` and `QueryParam` are `dataclass` in Python. Use `help(GraphRAG)` and `help(QueryParam)` to see all available parameters! Or check out the [Advances](#Advances) section to see some options. + + + +## Components + +Below are the components you can use: + +| Type | What | Where | +| :-------------- | :----------------------------------------------------------: | :-----------------------------------------------: | +| LLM | OpenAI | Built-in | +| | Amazon Bedrock | Built-in | +| | DeepSeek | [examples](./examples) | +| | `ollama` | [examples](./examples) | +| Embedding | OpenAI | Built-in | +| | Amazon Bedrock | Built-in | +| | Sentence-transformers | [examples](./examples) | +| Vector DataBase | [`nano-vectordb`](https://github.com/gusye1234/nano-vectordb) | Built-in | +| | [`hnswlib`](https://github.com/nmslib/hnswlib) | Built-in, [examples](./examples) | +| | [`milvus-lite`](https://github.com/milvus-io/milvus-lite) | [examples](./examples) | +| | [faiss](https://github.com/facebookresearch/faiss?tab=readme-ov-file) | [examples](./examples) | +| Graph Storage | [`networkx`](https://networkx.org/documentation/stable/index.html) | Built-in | +| | [`neo4j`](https://neo4j.com/) | Built-in([doc](./docs/use_neo4j_for_graphrag.md)) | +| Visualization | graphml | [examples](./examples) | +| Chunking | by token size | Built-in | +| | by text splitter | Built-in | + +- `Built-in` means we have that implementation inside `nano-graphrag`. `examples` means we have that implementation inside an tutorial under [examples](./examples) folder. + +- Check [examples/benchmarks](./examples/benchmarks) to see few comparisons between components. +- **Always welcome to contribute more components.** + +## Advances + + + +
+Some setup options + +- `GraphRAG(...,always_create_working_dir=False,...)` will skip the dir-creating step. Use it if you switch all your components to non-file storages. + +
+ + + +
+Only query the related context + +`graph_func.query` return the final answer without streaming. + +If you like to interagte `nano-graphrag` in your project, you can use `param=QueryParam(..., only_need_context=True,...)`, which will only return the retrieved context from graph, something like: + +```` +# Local mode +-----Reports----- +```csv +id, content +0, # FOX News and Key Figures in Media and Politics... +1, ... +``` +... + +# Global mode +----Analyst 3---- +Importance Score: 100 +Donald J. Trump: Frequently discussed in relation to his political activities... +... +```` + +You can integrate that context into your customized prompt. + +
+ +
+Prompt + +`nano-graphrag` use prompts from `nano_graphrag.prompt.PROMPTS` dict object. You can play with it and replace any prompt inside. + +Some important prompts: + +- `PROMPTS["entity_extraction"]` is used to extract the entities and relations from a text chunk. +- `PROMPTS["community_report"]` is used to organize and summary the graph cluster's description. +- `PROMPTS["local_rag_response"]` is the system prompt template of the local search generation. +- `PROMPTS["global_reduce_rag_response"]` is the system prompt template of the global search generation. +- `PROMPTS["fail_response"]` is the fallback response when nothing is related to the user query. + +
+ +
+Customize Chunking + + +`nano-graphrag` allow you to customize your own chunking method, check out the [example](./examples/using_custom_chunking_method.py). + +Switch to the built-in text splitter chunking method: + +```python +from nano_graphrag._op import chunking_by_seperators + +GraphRAG(...,chunk_func=chunking_by_seperators,...) +``` + +
+ + + +
+LLM Function + +In `nano-graphrag`, we requires two types of LLM, a great one and a cheap one. The former is used to plan and respond, the latter is used to summary. By default, the great one is `gpt-4o` and the cheap one is `gpt-4o-mini` + +You can implement your own LLM function (refer to `_llm.gpt_4o_complete`): + +```python +async def my_llm_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + # pop cache KV database if any + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + # the rest kwargs are for calling LLM, for example, `max_tokens=xxx` + ... + # YOUR LLM calling + response = await call_your_LLM(messages, **kwargs) + return response +``` + +Replace the default one with: + +```python +# Adjust the max token size or the max async requests if needed +GraphRAG(best_model_func=my_llm_complete, best_model_max_token_size=..., best_model_max_async=...) +GraphRAG(cheap_model_func=my_llm_complete, cheap_model_max_token_size=..., cheap_model_max_async=...) +``` + +You can refer to this [example](./examples/using_deepseek_as_llm.py) that use [`deepseek-chat`](https://platform.deepseek.com/api-docs/) as the LLM model + +You can refer to this [example](./examples/using_ollama_as_llm.py) that use [`ollama`](https://github.com/ollama/ollama) as the LLM model + +#### Json Output + +`nano-graphrag` will use `best_model_func` to output JSON with params `"response_format": {"type": "json_object"}`. However there are some open-source model maybe produce unstable JSON. + +`nano-graphrag` introduces a post-process interface for you to convert the response to JSON. This func's signature is below: + +```python +def YOUR_STRING_TO_JSON_FUNC(response: str) -> dict: + "Convert the string response to JSON" + ... +``` + +And pass your own func by `GraphRAG(...convert_response_to_json_func=YOUR_STRING_TO_JSON_FUNC,...)`. 
+ +For example, you can refer to [json_repair](https://github.com/mangiucugna/json_repair) to repair the JSON string returned by LLM. +
+ + + +
+Embedding Function + +You can replace the default embedding functions with any `_utils.EmbedddingFunc` instance. + +For example, the default one is using OpenAI embedding API: + +```python +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) +async def openai_embedding(texts: list[str]) -> np.ndarray: + openai_async_client = AsyncOpenAI() + response = await openai_async_client.embeddings.create( + model="text-embedding-3-small", input=texts, encoding_format="float" + ) + return np.array([dp.embedding for dp in response.data]) +``` + +Replace default embedding function with: + +```python +GraphRAG(embedding_func=your_embed_func, embedding_batch_num=..., embedding_func_max_async=...) +``` + +You can refer to an [example](./examples/using_local_embedding_model.py) that use `sentence-transformer` to locally compute embeddings. +
+ + +
+Storage Component + +You can replace all storage-related components to your own implementation, `nano-graphrag` mainly uses three kinds of storage: + +**`base.BaseKVStorage` for storing key-json pairs of data** + +- By default we use disk file storage as the backend. +- `GraphRAG(.., key_string_value_json_storage_cls=YOURS,...)` + +**`base.BaseVectorStorage` for indexing embeddings** + +- By default we use [`nano-vectordb`](https://github.com/gusye1234/nano-vectordb) as the backend. +- We have a built-in [`hnswlib`](https://github.com/nmslib/hnswlib) storage also, check out this [example](./examples/using_hnsw_as_vectorDB.py). +- Check out this [example](./examples/using_milvus_as_vectorDB.py) that implements [`milvus-lite`](https://github.com/milvus-io/milvus-lite) as the backend (not available in Windows). +- `GraphRAG(.., vector_db_storage_cls=YOURS,...)` + +**`base.BaseGraphStorage` for storing knowledge graph** + +- By default we use [`networkx`](https://github.com/networkx/networkx) as the backend. +- We have a built-in `Neo4jStorage` for graph, check out this [tutorial](./docs/use_neo4j_for_graphrag.md). +- `GraphRAG(.., graph_storage_cls=YOURS,...)` + +You can refer to `nano_graphrag.base` to see detailed interfaces for each components. +
+</details>
+
+
+## FAQ
+
+Check [FAQ](./docs/FAQ.md).
+
+
+
+## Roadmap
+
+See [ROADMAP.md](./docs/ROADMAP.md)
+
+
+
+## Contribute
+
+`nano-graphrag` is open to any kind of contribution. Read [this](./docs/CONTRIBUTING.md) before you contribute.
+
+
+
+
+## Benchmark
+
+- [benchmark for English](./docs/benchmark-en.md)
+- [benchmark for Chinese](./docs/benchmark-zh.md)
+- [An evaluation](./examples/benchmarks/eval_naive_graphrag_on_multi_hop.ipynb) notebook on a [multi-hop RAG task](https://github.com/yixuantt/MultiHop-RAG)
+
+
+
+## Projects that used `nano-graphrag`
+
+- [Medical Graph RAG](https://github.com/MedicineToken/Medical-Graph-RAG): Graph RAG for the Medical Data
+- [LightRAG](https://github.com/HKUDS/LightRAG): Simple and Fast Retrieval-Augmented Generation
+- [fast-graphrag](https://github.com/circlemind-ai/fast-graphrag): RAG that intelligently adapts to your use case, data, and queries
+- [HiRAG](https://github.com/hhy-huang/HiRAG): Retrieval-Augmented Generation with Hierarchical Knowledge
+
+> Welcome to pull requests if your project uses `nano-graphrag`, it will help others to trust this repo❤️
+
+
+
+## Issues
+
+- `nano-graphrag` does not implement the `covariates` feature of `GraphRAG`
+- `nano-graphrag` implements the global search differently from the original. The original uses a map-reduce-like style to fill all the communities into context, while `nano-graphrag` only uses the top-K important and central communities (use `QueryParam.global_max_consider_community` to control, default to 512 communities).
+ diff --git a/nano-graphrag/setup.py b/nano-graphrag/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..8a04ea52516e7b778dc240a2eda2cc5b8f94ec8e --- /dev/null +++ b/nano-graphrag/setup.py @@ -0,0 +1,40 @@ +import setuptools +from setuptools import find_packages + +with open("readme.md", "r") as fh: + long_description = fh.read() + + +vars2find = ["__author__", "__version__", "__url__"] +vars2readme = {} +with open("./nano_graphrag/__init__.py") as f: + for line in f.readlines(): + for v in vars2find: + if line.startswith(v): + line = line.replace(" ", "").replace('"', "").replace("'", "").strip() + vars2readme[v] = line.split("=")[1] + +deps = [] +with open("./requirements.txt") as f: + for line in f.readlines(): + if not line.strip(): + continue + deps.append(line.strip()) + +setuptools.setup( + name="nano-graphrag", + url=vars2readme["__url__"], + version=vars2readme["__version__"], + author=vars2readme["__author__"], + description="A simple, easy-to-hack GraphRAG implementation", + long_description=long_description, + long_description_content_type="text/markdown", + packages=find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires=">=3.9", + install_requires=deps, +) diff --git a/nano-graphrag/tests/__init__.py b/nano-graphrag/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74645628a3d8fc57f4f07e71884d4d79b834f0f3 --- /dev/null +++ b/nano-graphrag/tests/__init__.py @@ -0,0 +1,5 @@ +import logging +import dotenv + +dotenv.load_dotenv() +logging.basicConfig(level=logging.INFO) diff --git a/nano-graphrag/tests/entity_extraction/__init__.py b/nano-graphrag/tests/entity_extraction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/nano-graphrag/tests/entity_extraction/test_extract.py 
b/nano-graphrag/tests/entity_extraction/test_extract.py new file mode 100644 index 0000000000000000000000000000000000000000..58e009b73598e2342586daf17044053426fa89ec --- /dev/null +++ b/nano-graphrag/tests/entity_extraction/test_extract.py @@ -0,0 +1,258 @@ +import pytest +import dspy +from openai import BadRequestError +from unittest.mock import Mock, patch, AsyncMock +from nano_graphrag.entity_extraction.extract import generate_dataset, extract_entities_dspy +from nano_graphrag.base import TextChunkSchema, BaseGraphStorage, BaseVectorStorage +import httpx + + +@pytest.fixture +def mock_chunks(): + return { + "chunk1": TextChunkSchema(content="Apple announced a new iPhone model."), + "chunk2": TextChunkSchema(content="Google released an update for Android.") + } + + +@pytest.fixture +def mock_entity_extractor(): + with patch('nano_graphrag.entity_extraction.extract.TypedEntityRelationshipExtractor') as mock: + mock_instance = Mock() + mock.return_value = mock_instance + yield mock_instance + + +@pytest.fixture +def mock_graph_storage(): + return Mock(spec=BaseGraphStorage) + + +@pytest.fixture +def mock_vector_storage(): + return Mock(spec=BaseVectorStorage) + + +@pytest.fixture +def mock_global_config(): + return { + "use_compiled_dspy_entity_relationship": False, + "entity_relationship_module_path": "path/to/module.json" + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("use_compiled,save_dataset", [ + (True, True), (False, True), (True, False), (False, False) +]) +async def test_generate_dataset(mock_chunks, mock_entity_extractor, tmp_path, use_compiled, save_dataset): + mock_prediction = Mock( + entities=[{"entity_name": "APPLE", "entity_type": "ORGANIZATION"}], + relationships=[{"src_id": "APPLE", "tgt_id": "IPHONE"}] + ) + mock_entity_extractor.return_value = mock_prediction + + filepath = tmp_path / "test_dataset.pkl" + + mock_global_config = { + "use_compiled_dspy_entity_relationship": use_compiled, + "entity_relationship_module_path": 
"test/path.json" if use_compiled else None + } + + with patch('nano_graphrag.entity_extraction.extract.pickle.dump') as mock_dump, \ + patch('nano_graphrag.entity_extraction.extract.TypedEntityRelationshipExtractor') as mock_extractor_class: + + mock_extractor_instance = Mock() + mock_extractor_instance.return_value = mock_prediction + mock_extractor_class.return_value = mock_extractor_instance + + if use_compiled: + mock_extractor_instance.load = Mock() + + result = await generate_dataset(chunks=mock_chunks, filepath=str(filepath), + save_dataset=save_dataset, global_config=mock_global_config) + + assert len(result) == 2 + assert isinstance(result[0], dspy.Example) + assert hasattr(result[0], 'input_text') + assert hasattr(result[0], 'entities') + assert hasattr(result[0], 'relationships') + + if save_dataset: + mock_dump.assert_called_once() + else: + mock_dump.assert_not_called() + + mock_extractor_class.assert_called_once() + assert mock_extractor_instance.call_count == len(mock_chunks) + + if use_compiled: + mock_extractor_instance.load.assert_called_once_with("test/path.json") + else: + assert not hasattr(mock_extractor_instance, 'load') or not mock_extractor_instance.load.called + + +@pytest.mark.asyncio +async def test_generate_dataset_with_empty_chunks(): + chunks = {} + filepath = "test_empty_dataset.pkl" + result = await generate_dataset(chunks, filepath, save_dataset=False) + assert len(result) == 0 + + +@pytest.mark.asyncio +async def test_generate_dataset_with_bad_request_error(): + chunks = {"chunk1": TextChunkSchema(content="Test content")} + filepath = "test_error_dataset.pkl" + + # Create a mock response object + mock_response = Mock(spec=httpx.Response) + mock_response.status_code = 400 + mock_response.headers = {"x-request-id": "test-request-id"} + mock_response.request = Mock(spec=httpx.Request) + + with patch('nano_graphrag.entity_extraction.extract.TypedEntityRelationshipExtractor') as mock_extractor_class: + mock_extractor_instance = Mock() 
+ mock_extractor_instance.side_effect = BadRequestError( + message="Test Error", + response=mock_response, + body={"error": {"message": "Test Error", "type": "invalid_request_error"}} + ) + mock_extractor_class.return_value = mock_extractor_instance + + with patch('nano_graphrag.entity_extraction.extract.asyncio.to_thread', new_callable=AsyncMock) as mock_to_thread: + mock_to_thread.side_effect = BadRequestError( + message="Test Error", + response=mock_response, + body={"error": {"message": "Test Error", "type": "invalid_request_error"}} + ) + + result = await generate_dataset(chunks, filepath, save_dataset=False) + + assert len(result) == 0 + mock_to_thread.assert_called_once() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("use_compiled,entity_vdb", [ + (True, Mock(spec=BaseVectorStorage)), + (False, Mock(spec=BaseVectorStorage)), + (True, None), + (False, None) +]) +async def test_extract_entities_dspy(mock_chunks, mock_graph_storage, entity_vdb, mock_global_config, use_compiled): + mock_entity = { + "entity_name": "APPLE", + "entity_type": "ORGANIZATION", + "description": "A tech company", + "importance_score": 0.9 + } + mock_relationship = { + "src_id": "APPLE", + "tgt_id": "IPHONE", + "description": "Produces", + "weight": 0.8, + "order": 1 + } + mock_prediction = Mock( + entities=[mock_entity], + relationships=[mock_relationship] + ) + + mock_global_config.update({ + "use_compiled_dspy_entity_relationship": use_compiled, + "entity_relationship_module_path": "test/path.json" if use_compiled else None + }) + + with patch('nano_graphrag.entity_extraction.extract.TypedEntityRelationshipExtractor') as mock_extractor_class: + mock_extractor_instance = Mock() + mock_extractor_instance.return_value = mock_prediction + mock_extractor_class.return_value = mock_extractor_instance + + if use_compiled: + mock_extractor_instance.load = Mock() + + with patch('nano_graphrag.entity_extraction.extract._merge_nodes_then_upsert', new_callable=AsyncMock) as 
mock_merge_nodes, \ + patch('nano_graphrag.entity_extraction.extract._merge_edges_then_upsert', new_callable=AsyncMock) as mock_merge_edges: + mock_merge_nodes.return_value = mock_entity + result = await extract_entities_dspy(mock_chunks, mock_graph_storage, entity_vdb, mock_global_config) + + assert result == mock_graph_storage + mock_extractor_class.assert_called_once() + mock_extractor_instance.assert_called() + mock_merge_nodes.assert_called() + mock_merge_edges.assert_called() + + if entity_vdb: + entity_vdb.upsert.assert_called_once() + else: + assert not hasattr(entity_vdb, 'upsert') or not entity_vdb.upsert.called + + assert mock_extractor_instance.call_count == len(mock_chunks) + + if use_compiled: + mock_extractor_instance.load.assert_called_once_with("test/path.json") + else: + assert not hasattr(mock_extractor_instance, 'load') or not mock_extractor_instance.load.called + + +@pytest.mark.asyncio +async def test_extract_entities_dspy_with_empty_chunks(): + chunks = {} + mock_graph_storage = Mock(spec=BaseGraphStorage) + mock_vector_storage = Mock(spec=BaseVectorStorage) + global_config = {} + + result = await extract_entities_dspy(chunks, mock_graph_storage, mock_vector_storage, global_config) + + assert result is None + + +@pytest.mark.asyncio +async def test_extract_entities_dspy_with_no_entities(): + chunks = {"chunk1": TextChunkSchema(content="Test content")} + mock_graph_storage = Mock(spec=BaseGraphStorage) + mock_vector_storage = Mock(spec=BaseVectorStorage) + global_config = {} + + with patch('nano_graphrag.entity_extraction.extract.TypedEntityRelationshipExtractor') as mock_extractor: + mock_extractor.return_value.return_value = Mock(entities=[], relationships=[]) + result = await extract_entities_dspy(chunks, mock_graph_storage, mock_vector_storage, global_config) + + assert result is None + mock_vector_storage.upsert.assert_not_called() + + +@pytest.mark.asyncio +async def test_extract_entities_dspy_with_bad_request_error(): + chunks = 
{"chunk1": TextChunkSchema(content="Test content")} + mock_graph_storage = Mock(spec=BaseGraphStorage) + mock_vector_storage = Mock(spec=BaseVectorStorage) + global_config = {} + + mock_response = Mock(spec=httpx.Response) + mock_response.status_code = 400 + mock_response.headers = {"x-request-id": "test-request-id"} + mock_response.request = Mock(spec=httpx.Request) + + with patch('nano_graphrag.entity_extraction.extract.TypedEntityRelationshipExtractor') as mock_extractor_class: + mock_extractor_instance = Mock() + mock_extractor_instance.side_effect = BadRequestError( + message="Test Error", + response=mock_response, + body={"error": {"message": "Test Error", "type": "invalid_request_error"}} + ) + mock_extractor_class.return_value = mock_extractor_instance + + with patch('nano_graphrag.entity_extraction.extract.asyncio.to_thread', new_callable=AsyncMock) as mock_to_thread: + mock_to_thread.side_effect = BadRequestError( + message="Test Error", + response=mock_response, + body={"error": {"message": "Test Error", "type": "invalid_request_error"}} + ) + + result = await extract_entities_dspy(chunks, mock_graph_storage, mock_vector_storage, global_config) + + assert result is None + mock_to_thread.assert_called_once() + mock_vector_storage.upsert.assert_not_called() diff --git a/nano-graphrag/tests/entity_extraction/test_metric.py b/nano-graphrag/tests/entity_extraction/test_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..2e304edb1a5985a6b11a287b382e6bab9bed2d76 --- /dev/null +++ b/nano-graphrag/tests/entity_extraction/test_metric.py @@ -0,0 +1,228 @@ +import pytest +import dspy +from unittest.mock import Mock, patch +from nano_graphrag.entity_extraction.metric import ( + relationships_similarity_metric, + entity_recall_metric, +) + + +@pytest.fixture +def mock_dspy_predict(): + with patch( + "nano_graphrag.entity_extraction.metric.dspy.ChainOfThought" + ) as mock_predict: + mock_instance = Mock() + mock_instance.return_value = 
dspy.Prediction(similarity_score=0.75)
+        mock_predict.return_value = mock_instance
+        yield mock_predict
+
+
+@pytest.fixture
+def sample_relationship():
+    return {
+        "src_id": "ENTITY1",
+        "tgt_id": "ENTITY2",
+        "description": "Example relationship",
+        "weight": 0.8,
+        "order": 1,
+    }
+
+
+@pytest.fixture
+def sample_entity():
+    return {
+        "entity_name": "EXAMPLE_ENTITY",
+        "entity_type": "PERSON",
+        "description": "An example entity",
+        "importance_score": 0.8,
+    }
+
+
+@pytest.fixture
+def example():
+    def _example(items):
+        return (
+            {"relationships": items}
+            if "src_id" in (items[0] if items else {})
+            else {"entities": items}
+        )
+
+    return _example
+
+
+@pytest.fixture
+def prediction():
+    def _prediction(items):
+        return (
+            {"relationships": items}
+            if "src_id" in (items[0] if items else {})
+            else {"entities": items}
+        )
+
+    return _prediction
+
+
+@pytest.mark.asyncio
+async def test_relationship_similarity_metric(
+    sample_relationship, example, prediction, mock_dspy_predict
+):
+    gold = example(
+        [
+            {
+                **sample_relationship,
+                "src_id": "ENTITY1",
+                "tgt_id": "ENTITY2",
+                "description": "is related to",
+            },
+            {
+                **sample_relationship,
+                "src_id": "ENTITY2",
+                "tgt_id": "ENTITY3",
+                "description": "is connected with",
+            },
+        ]
+    )
+    pred = prediction(
+        [
+            {
+                **sample_relationship,
+                "src_id": "ENTITY1",
+                "tgt_id": "ENTITY2",
+                "description": "is connected to",
+            },
+            {
+                **sample_relationship,
+                "src_id": "ENTITY2",
+                "tgt_id": "ENTITY3",
+                "description": "is linked with",
+            },
+        ]
+    )
+
+    similarity = relationships_similarity_metric(gold, pred)
+    assert 0 <= similarity <= 1
+
+
+@pytest.mark.asyncio
+async def test_entity_recall_metric(sample_entity, example, prediction):
+    gold = example(
+        [
+            {**sample_entity, "entity_name": "ENTITY1"},
+            {**sample_entity, "entity_name": "ENTITY2"},
+            {**sample_entity, "entity_name": "ENTITY3"},
+        ]
+    )
+    pred = prediction(
+        [
+            {**sample_entity, "entity_name": "ENTITY1"},
+            {**sample_entity, "entity_name": "ENTITY3"},
+            {**sample_entity, "entity_name": "ENTITY4"},
+        ]
+    )
+
+    recall = entity_recall_metric(gold, pred)
+    assert recall == 2 / 3
+
+
+@pytest.mark.asyncio
+async def test_relationship_similarity_metric_no_common_keys(
+    sample_relationship, example, prediction, mock_dspy_predict
+):
+    gold = example(
+        [
+            {
+                **sample_relationship,
+                "src_id": "ENTITY1",
+                "tgt_id": "ENTITY2",
+                "description": "is related to",
+            }
+        ]
+    )
+    pred = prediction(
+        [
+            {
+                **sample_relationship,
+                "src_id": "ENTITY3",
+                "tgt_id": "ENTITY4",
+                "description": "is connected with",
+            }
+        ]
+    )
+
+    similarity = relationships_similarity_metric(gold, pred)
+    assert 0 <= similarity <= 1
+
+
+@pytest.mark.asyncio
+async def test_entity_recall_metric_no_true_positives(
+    sample_entity, example, prediction
+):
+    gold = example(
+        [
+            {**sample_entity, "entity_name": "ENTITY1"},
+            {**sample_entity, "entity_name": "ENTITY2"},
+        ]
+    )
+    pred = prediction(
+        [
+            {**sample_entity, "entity_name": "ENTITY3"},
+            {**sample_entity, "entity_name": "ENTITY4"},
+        ]
+    )
+
+    recall = entity_recall_metric(gold, pred)
+    assert recall == 0
+
+
+@pytest.mark.asyncio
+async def test_relationship_similarity_metric_identical_descriptions(
+    sample_relationship, example, prediction, mock_dspy_predict
+):
+    gold = example(
+        [
+            {
+                **sample_relationship,
+                "src_id": "ENTITY1",
+                "tgt_id": "ENTITY2",
+                "description": "is related to",
+            }
+        ]
+    )
+    pred = prediction(
+        [
+            {
+                **sample_relationship,
+                "src_id": "ENTITY1",
+                "tgt_id": "ENTITY2",
+                "description": "is related to",
+            }
+        ]
+    )
+
+    similarity = relationships_similarity_metric(gold, pred)
+    assert similarity == 0.75
+
+
+@pytest.mark.asyncio
+async def test_entity_recall_metric_perfect_recall(sample_entity, example, prediction):
+    entities = [
+        {**sample_entity, "entity_name": "ENTITY1"},
+        {**sample_entity, "entity_name": "ENTITY2"},
+    ]
+    gold = example(entities)
+    pred = prediction(entities)
+
+    recall = entity_recall_metric(gold, pred)
+    assert
recall == 1.0
+
+
+@pytest.mark.asyncio
+async def test_relationship_similarity_metric_no_relationships(
+    example, prediction, mock_dspy_predict
+):
+    gold = example([])
+    pred = prediction([])
+
+    with pytest.raises(KeyError):
+        relationships_similarity_metric(gold, pred)
diff --git a/nano-graphrag/tests/entity_extraction/test_module.py b/nano-graphrag/tests/entity_extraction/test_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee530739e6e617d396d91d5b1e7cf1550671365f
--- /dev/null
+++ b/nano-graphrag/tests/entity_extraction/test_module.py
@@ -0,0 +1,90 @@
+import pytest
+import dspy
+from unittest.mock import Mock, patch
+from nano_graphrag.entity_extraction.module import (
+    TypedEntityRelationshipExtractor,
+    Relationship,
+    Entity,
+)
+
+
+@pytest.mark.parametrize("self_refine,num_refine_turns", [(False, 0), (True, 2)])
+def test_entity_relationship_extractor(self_refine, num_refine_turns):
+    with patch(
+        "nano_graphrag.entity_extraction.module.dspy.ChainOfThought"
+    ) as mock_chain_of_thought:
+        input_text = "Apple announced a new iPhone model."
+ mock_extractor = Mock() + mock_critique = Mock() + mock_refine = Mock() + + mock_chain_of_thought.side_effect = [mock_extractor, mock_critique, mock_refine] + + mock_entities = [ + Entity( + entity_name="APPLE", + entity_type="ORGANIZATION", + description="A technology company", + importance_score=1, + ), + Entity( + entity_name="IPHONE", + entity_type="PRODUCT", + description="A smartphone", + importance_score=1, + ), + ] + mock_relationships = [ + Relationship( + src_id="APPLE", + tgt_id="IPHONE", + description="Apple manufactures iPhone", + weight=1, + order=1, + ) + ] + + mock_extractor.return_value = dspy.Prediction( + entities=mock_entities, relationships=mock_relationships + ) + + if self_refine: + mock_critique.return_value = dspy.Prediction( + entity_critique="Good entities, but could be more detailed.", + relationship_critique="Relationships are accurate but limited.", + ) + mock_refine.return_value = dspy.Prediction( + refined_entities=mock_entities, refined_relationships=mock_relationships + ) + + extractor = TypedEntityRelationshipExtractor( + self_refine=self_refine, num_refine_turns=num_refine_turns + ) + result = extractor.forward(input_text=input_text) + + mock_extractor.assert_called_once_with( + input_text=input_text, entity_types=extractor.entity_types + ) + + if self_refine: + assert mock_critique.call_count == num_refine_turns + assert mock_refine.call_count == num_refine_turns + + assert len(result.entities) == 2 + assert len(result.relationships) == 1 + + assert result.entities[0]["entity_name"] == "APPLE" + assert result.entities[0]["entity_type"] == "ORGANIZATION" + assert result.entities[0]["description"] == "A technology company" + assert result.entities[0]["importance_score"] == 1 + + assert result.entities[1]["entity_name"] == "IPHONE" + assert result.entities[1]["entity_type"] == "PRODUCT" + assert result.entities[1]["description"] == "A smartphone" + assert result.entities[1]["importance_score"] == 1 + + assert 
result.relationships[0]["src_id"] == "APPLE" + assert result.relationships[0]["tgt_id"] == "IPHONE" + assert result.relationships[0]["description"] == "Apple manufactures iPhone" + assert result.relationships[0]["weight"] == 1 + assert result.relationships[0]["order"] == 1 diff --git a/nano-graphrag/tests/fixtures/mock_cache.json b/nano-graphrag/tests/fixtures/mock_cache.json new file mode 100644 index 0000000000000000000000000000000000000000..33a0e24d6c054b8f26ba52876e5be35ddb592aa4 --- /dev/null +++ b/nano-graphrag/tests/fixtures/mock_cache.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03916d1be9a80552e3c1938e67f1b4196dec5a53a23a521436b2a577f0314943 +size 655724 diff --git a/nano-graphrag/tests/test_hnsw_vector_storage.py b/nano-graphrag/tests/test_hnsw_vector_storage.py new file mode 100644 index 0000000000000000000000000000000000000000..b1ea39febf63fac67dba12e2f5200e8743b45fb6 --- /dev/null +++ b/nano-graphrag/tests/test_hnsw_vector_storage.py @@ -0,0 +1,287 @@ +import os +import shutil +import numpy as np +import pytest +from unittest.mock import patch +from dataclasses import asdict +from nano_graphrag import GraphRAG +from nano_graphrag._utils import wrap_embedding_func_with_attrs +from nano_graphrag._storage import HNSWVectorStorage + +WORKING_DIR = "./tests/nano_graphrag_cache_hnsw_vector_storage_test" + + +@pytest.fixture(scope="function") +def setup_teardown(): + if os.path.exists(WORKING_DIR): + shutil.rmtree(WORKING_DIR) + os.mkdir(WORKING_DIR) + + yield + + shutil.rmtree(WORKING_DIR) + + +@wrap_embedding_func_with_attrs(embedding_dim=384, max_token_size=8192) +async def mock_embedding(texts: list[str]) -> np.ndarray: + return np.random.rand(len(texts), 384) + + +@pytest.fixture +def hnsw_storage(setup_teardown): + rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=mock_embedding) + return HNSWVectorStorage( + namespace="test", + global_config=asdict(rag), + embedding_func=mock_embedding, + 
meta_fields={"entity_name"}, + ) + + +@pytest.mark.asyncio +async def test_upsert_and_query(hnsw_storage): + test_data = { + "1": {"content": "Test content 1", "entity_name": "Entity 1"}, + "2": {"content": "Test content 2", "entity_name": "Entity 2"}, + } + + await hnsw_storage.upsert(test_data) + + results = await hnsw_storage.query("Test query", top_k=2) + + assert len(results) == 2 + assert all(isinstance(result, dict) for result in results) + assert all( + "id" in result and "distance" in result and "similarity" in result + for result in results + ) + + +@pytest.mark.asyncio +async def test_persistence(setup_teardown): + rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=mock_embedding) + initial_storage = HNSWVectorStorage( + namespace="test", + global_config=asdict(rag), + embedding_func=mock_embedding, + meta_fields={"entity_name"}, + ) + + test_data = { + "1": {"content": "Test content 1", "entity_name": "Entity 1"}, + } + + await initial_storage.upsert(test_data) + await initial_storage.index_done_callback() + + new_storage = HNSWVectorStorage( + namespace="test", + global_config=asdict(rag), + embedding_func=mock_embedding, + meta_fields={"entity_name"}, + ) + + results = await new_storage.query("Test query", top_k=1) + + assert len(results) == 1 + assert results[0]["id"] == "1" + assert "entity_name" in results[0] + + +@pytest.mark.asyncio +async def test_persistence_large_dataset(setup_teardown): + rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=mock_embedding) + initial_storage = HNSWVectorStorage( + namespace="test_large", + global_config=asdict(rag), + embedding_func=mock_embedding, + meta_fields={"entity_name"}, + max_elements=10000, + ) + + large_data = { + str(i): {"content": f"Test content {i}", "entity_name": f"Entity {i}"} + for i in range(1000) + } + await initial_storage.upsert(large_data) + await initial_storage.index_done_callback() + + new_storage = HNSWVectorStorage( + namespace="test_large", + global_config=asdict(rag), + 
embedding_func=mock_embedding, + meta_fields={"entity_name"}, + max_elements=10000, + ) + + results = await new_storage.query("Test query", top_k=500) + assert len(results) == 500 + assert all(result["id"] in large_data for result in results) + + +@pytest.mark.asyncio +async def test_upsert_with_existing_ids(hnsw_storage): + test_data = { + "1": {"content": "Test content 1", "entity_name": "Entity 1"}, + "2": {"content": "Test content 2", "entity_name": "Entity 2"}, + } + + await hnsw_storage.upsert(test_data) + + updated_data = { + "1": {"content": "Updated content 1", "entity_name": "Updated Entity 1"}, + "3": {"content": "Test content 3", "entity_name": "Entity 3"}, + } + + await hnsw_storage.upsert(updated_data) + + results = await hnsw_storage.query("Updated", top_k=3) + + assert len(results) == 3 + assert any( + result["id"] == "1" and result["entity_name"] == "Updated Entity 1" + for result in results + ) + assert any( + result["id"] == "2" and result["entity_name"] == "Entity 2" + for result in results + ) + assert any( + result["id"] == "3" and result["entity_name"] == "Entity 3" + for result in results + ) + + +@pytest.mark.asyncio +async def test_large_batch_upsert(hnsw_storage): + batch_size = 30 + large_data = { + str(i): {"content": f"Test content {i}", "entity_name": f"Entity {i}"} + for i in range(batch_size) + } + + await hnsw_storage.upsert(large_data) + + results = await hnsw_storage.query("Test query", top_k=batch_size) + assert len(results) == batch_size + assert all(isinstance(result, dict) for result in results) + assert all( + "id" in result and "distance" in result and "similarity" in result + for result in results + ) + + +@pytest.mark.asyncio +async def test_empty_data_insertion(hnsw_storage): + empty_data = {} + await hnsw_storage.upsert(empty_data) + + results = await hnsw_storage.query("Test query", top_k=1) + assert len(results) == 0 + + +@pytest.mark.asyncio +async def test_query_with_no_results(hnsw_storage): + results = await 
hnsw_storage.query("Non-existent query", top_k=5) + assert len(results) == 0 + + test_data = { + "1": {"content": "Test content 1", "entity_name": "Entity 1"}, + } + await hnsw_storage.upsert(test_data) + + results = await hnsw_storage.query("Non-existent query", top_k=5) + assert len(results) == 1 + assert all(0 <= result["similarity"] <= 1 for result in results) + assert "entity_name" in results[0] + + +@pytest.mark.asyncio +async def test_index_done_callback(hnsw_storage): + test_data = { + "1": {"content": "Test content 1", "entity_name": "Entity 1"}, + } + + await hnsw_storage.upsert(test_data) + + with patch("hnswlib.Index.save_index") as mock_save_index: + await hnsw_storage.index_done_callback() + mock_save_index.assert_called_once() + + +@pytest.mark.asyncio +async def test_max_elements_limit(setup_teardown): + rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=mock_embedding) + max_elements = 10 + small_storage = HNSWVectorStorage( + namespace="test_small", + global_config=asdict(rag), + embedding_func=mock_embedding, + meta_fields={"entity_name"}, + max_elements=max_elements, + M=50, + ) + + data = { + str(i): {"content": f"Test content {i}", "entity_name": f"Entity {i}"} + for i in range(max_elements) + } + await small_storage.upsert(data) + + with pytest.raises( + ValueError, + match=f"Cannot insert 1 elements. 
Current: {max_elements}, Max: {max_elements}", + ): + await small_storage.upsert( + { + str(max_elements): { + "content": "Overflow", + "entity_name": "Overflow Entity", + } + } + ) + + large_max_elements = 100 + large_storage = HNSWVectorStorage( + namespace="test_large", + global_config=asdict(rag), + embedding_func=mock_embedding, + meta_fields={"entity_name"}, + max_elements=large_max_elements, + ) + + initial_data_size = int(large_max_elements * 0.3) + initial_data = { + str(i): {"content": f"Test content {i}", "entity_name": f"Entity {i}"} + for i in range(initial_data_size) + } + + await large_storage.upsert(initial_data) + + results = await large_storage.query("Test query", top_k=initial_data_size) + assert len(results) == initial_data_size + + +@pytest.mark.asyncio +async def test_ef_search_values(setup_teardown): + rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=mock_embedding) + storage = HNSWVectorStorage( + namespace="test_ef", + global_config=asdict(rag), + embedding_func=mock_embedding, + meta_fields={"entity_name"}, + ef_search=10, + ) + + data = { + str(i): {"content": f"Test content {i}", "entity_name": f"Entity {i}"} + for i in range(20) + } + await storage.upsert(data) + + results_default = await storage.query("Test query", top_k=5) + assert len(results_default) == 5 + + storage._index.set_ef(20) + results_higher_ef = await storage.query("Test query", top_k=15) + assert len(results_higher_ef) == 15 diff --git a/nano-graphrag/tests/test_json_parsing.py b/nano-graphrag/tests/test_json_parsing.py new file mode 100644 index 0000000000000000000000000000000000000000..34b81b6431c863d1e862c55bf2148db409c2f148 --- /dev/null +++ b/nano-graphrag/tests/test_json_parsing.py @@ -0,0 +1,131 @@ +import unittest +# from loguru import logger +from nano_graphrag._utils import convert_response_to_json + +class TestJSONExtraction(unittest.TestCase): + + def setUp(self): + """Set up runs before each test case.""" + ... 
+ + def test_standard_json(self): + """Test standard JSON extraction.""" + response = ''' + { + "reasoning": "This is a test.", + "answer": 42, + "data": {"key1": "value1", "key2": "value2"} + } + ''' + expected = { + "reasoning": "This is a test.", + "answer": 42, + "data": {"key1": "value1", "key2": "value2"} + } + self.assertEqual(convert_response_to_json(response), expected) + + def test_non_standard_json_without_quotes(self): + """Test non-standard JSON without quotes on numbers and booleans.""" + response = ''' + { + "reasoning": "Boolean and numbers test.", + "answer": 42, + "isCorrect": true, + "data": {key1: value1} + } + ''' + expected = { + "reasoning": "Boolean and numbers test.", + "answer": 42, + "isCorrect": True, + "data": {"key1": "value1"} + } + self.assertEqual(convert_response_to_json(response), expected) + + def test_nested_json(self): + """Test extraction of nested JSON objects.""" + response = ''' + { + "reasoning": "Nested structure.", + "answer": 42, + "data": {"nested": {"key": "value"}} + } + ''' + expected = { + "reasoning": "Nested structure.", + "answer": 42, + "data": { + "nested": {"key": "value"} + } + } + self.assertEqual(convert_response_to_json(response), expected) + + def test_malformed_json(self): + """Test handling of malformed JSON.""" + response = ''' + Some text before JSON + { + "reasoning": "This is malformed.", + "answer": 42, + "data": {"key": "value"} + } + Some text after JSON + ''' + expected = { + "reasoning": "This is malformed.", + "answer": 42, + "data": {"key": "value"} + } + self.assertEqual(convert_response_to_json(response), expected) + + def test_incomplete_json(self): + """Test handling of incomplete JSON.""" + response = ''' + { + "reasoning": "Incomplete structure", + "answer": 42 + ''' + expected = { + "reasoning": "Incomplete structure", + "answer": 42 + } + self.assertEqual(convert_response_to_json(response), expected) + + def test_value_with_special_characters(self): + """Test JSON with special 
characters in values.""" + response = ''' + { + "reasoning": "Special characters !@#$%^&*()", + "answer": 42, + "data": {"key": "value with special characters !@#$%^&*()"} + } + ''' + expected = { + "reasoning": "Special characters !@#$%^&*()", + "answer": 42, + "data": {"key": "value with special characters !@#$%^&*()"} + } + self.assertEqual(convert_response_to_json(response), expected) + + def test_boolean_and_null_values(self): + """Test JSON with boolean and null values.""" + response = ''' + { + "reasoning": "Boolean and null test.", + "isCorrect": true, + "isWrong": false, + "unknown": null, + "answer": 42 + } + ''' + expected = { + "reasoning": "Boolean and null test.", + "isCorrect": True, + "isWrong": False, + "unknown": None, + "answer": 42 + } + self.assertEqual(convert_response_to_json(response), expected) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/nano-graphrag/tests/test_neo4j_storage.py b/nano-graphrag/tests/test_neo4j_storage.py new file mode 100644 index 0000000000000000000000000000000000000000..8a87c3aee118a713eb55b9988de459d614221683 --- /dev/null +++ b/nano-graphrag/tests/test_neo4j_storage.py @@ -0,0 +1,210 @@ +import os +import pytest +import numpy as np +from functools import wraps +from nano_graphrag import GraphRAG +from nano_graphrag._storage import Neo4jStorage +from nano_graphrag._utils import wrap_embedding_func_with_attrs + +if os.environ.get("NANO_GRAPHRAG_TEST_IGNORE_NEO4J", False): + pytest.skip("skipping neo4j tests", allow_module_level=True) + + +@pytest.fixture(scope="module") +def neo4j_config(): + return { + "neo4j_url": os.environ.get("NEO4J_URL", "bolt://localhost:7687"), + "neo4j_auth": ( + os.environ.get("NEO4J_USER", "neo4j"), + os.environ.get("NEO4J_PASSWORD", "neo4j"), + ), + } + + +@wrap_embedding_func_with_attrs(embedding_dim=384, max_token_size=8192) +async def mock_embedding(texts: list[str]) -> np.ndarray: + return np.random.rand(len(texts), 384) + + +@pytest.fixture 
+def neo4j_storage(neo4j_config): + rag = GraphRAG( + working_dir="./tests/neo4j_test", + embedding_func=mock_embedding, + graph_storage_cls=Neo4jStorage, + addon_params=neo4j_config, + ) + storage = rag.chunk_entity_relation_graph + return storage + + +def reset_graph(func): + @wraps(func) + async def new_func(neo4j_storage): + await neo4j_storage._debug_delete_all_node_edges() + await neo4j_storage.index_start_callback() + results = await func(neo4j_storage) + await neo4j_storage._debug_delete_all_node_edges() + return results + + return new_func + + +def test_neo4j_storage_init(): + rag = GraphRAG( + working_dir="./tests/neo4j_test", + embedding_func=mock_embedding, + ) + with pytest.raises(ValueError): + storage = Neo4jStorage( + namespace="nanographrag_test", global_config=rag.__dict__ + ) + + +@pytest.mark.asyncio +@reset_graph +async def test_upsert_and_get_node(neo4j_storage): + node_id = "node1" + node_data = {"attr1": "value1", "attr2": "value2"} + return_data = {"id": node_id, "clusters": "[]", **node_data} + + await neo4j_storage.upsert_node(node_id, node_data) + + result = await neo4j_storage.get_node(node_id) + assert result == return_data + + has_node = await neo4j_storage.has_node(node_id) + assert has_node is True + + +@pytest.mark.asyncio +@reset_graph +async def test_upsert_and_get_edge(neo4j_storage): + source_id = "node1" + target_id = "node2" + edge_data = {"weight": 1.0, "type": "connection"} + + await neo4j_storage.upsert_node(source_id, {}) + await neo4j_storage.upsert_node(target_id, {}) + await neo4j_storage.upsert_edge(source_id, target_id, edge_data) + + result = await neo4j_storage.get_edge(source_id, target_id) + print(result) + assert result == edge_data + + has_edge = await neo4j_storage.has_edge(source_id, target_id) + assert has_edge is True + + +@pytest.mark.asyncio +@reset_graph +async def test_node_degree(neo4j_storage): + node_id = "center" + await neo4j_storage.upsert_node(node_id, {}) + + num_neighbors = 5 + for i in 
range(num_neighbors): + neighbor_id = f"neighbor{i}" + await neo4j_storage.upsert_node(neighbor_id, {}) + await neo4j_storage.upsert_edge(node_id, neighbor_id, {}) + + degree = await neo4j_storage.node_degree(node_id) + assert degree == num_neighbors + + +@pytest.mark.asyncio +@reset_graph +async def test_edge_degree(neo4j_storage): + source_id = "node1" + target_id = "node2" + + await neo4j_storage.upsert_node(source_id, {}) + await neo4j_storage.upsert_node(target_id, {}) + await neo4j_storage.upsert_edge(source_id, target_id, {}) + + num_source_neighbors = 3 + for i in range(num_source_neighbors): + neighbor_id = f"neighbor{i}" + await neo4j_storage.upsert_node(neighbor_id, {}) + await neo4j_storage.upsert_edge(source_id, neighbor_id, {}) + + num_target_neighbors = 2 + for i in range(num_target_neighbors): + neighbor_id = f"target_neighbor{i}" + await neo4j_storage.upsert_node(neighbor_id, {}) + await neo4j_storage.upsert_edge(target_id, neighbor_id, {}) + + expected_edge_degree = (num_source_neighbors + 1) + (num_target_neighbors + 1) + edge_degree = await neo4j_storage.edge_degree(source_id, target_id) + assert edge_degree == expected_edge_degree + + +@pytest.mark.asyncio +@reset_graph +async def test_get_node_edges(neo4j_storage): + center_id = "center" + await neo4j_storage.upsert_node(center_id, {}) + + expected_edges = [] + for i in range(3): + neighbor_id = f"neighbor{i}" + await neo4j_storage.upsert_node(neighbor_id, {}) + await neo4j_storage.upsert_edge(center_id, neighbor_id, {}) + expected_edges.append((center_id, neighbor_id)) + + result = await neo4j_storage.get_node_edges(center_id) + print(result) + assert set(result) == set(expected_edges) + + +@pytest.mark.asyncio +@reset_graph +async def test_leiden_clustering(neo4j_storage): + for i in range(10): + await neo4j_storage.upsert_node(f"NODE{i}", {"source_id": f"chunk{i}"}) + + for i in range(9): + await neo4j_storage.upsert_edge(f"NODE{i}", f"NODE{i+1}", {"weight": 1.0}) + + await 
neo4j_storage.clustering(algorithm="leiden") + + community_schema = await neo4j_storage.community_schema() + + assert len(community_schema) > 0 + + for community in community_schema.values(): + assert "level" in community + assert "title" in community + assert "edges" in community + assert "nodes" in community + assert "chunk_ids" in community + assert "occurrence" in community + assert "sub_communities" in community + print(community) + + +@pytest.mark.asyncio +@reset_graph +async def test_nonexistent_node_and_edge(neo4j_storage): + assert await neo4j_storage.has_node("nonexistent") is False + assert await neo4j_storage.has_edge("node1", "node2") is False + assert await neo4j_storage.get_node("nonexistent") is None + assert await neo4j_storage.get_edge("node1", "node2") is None + assert await neo4j_storage.get_node_edges("nonexistent") == [] + assert await neo4j_storage.node_degree("nonexistent") == 0 + assert await neo4j_storage.edge_degree("node1", "node2") == 0 + + +@pytest.mark.asyncio +@reset_graph +async def test_cluster_error_handling(neo4j_storage): + with pytest.raises( + ValueError, match="Clustering algorithm invalid_algo not supported" + ): + await neo4j_storage.clustering("invalid_algo") + + +@pytest.mark.asyncio +@reset_graph +async def test_index_done(neo4j_storage): + await neo4j_storage.index_done_callback() diff --git a/nano-graphrag/tests/test_networkx_storage.py b/nano-graphrag/tests/test_networkx_storage.py new file mode 100644 index 0000000000000000000000000000000000000000..1b580a4e8282467e43b8dc078a47f6668f6cc62c --- /dev/null +++ b/nano-graphrag/tests/test_networkx_storage.py @@ -0,0 +1,377 @@ +import os +import shutil +import pytest +import networkx as nx +import numpy as np +import asyncio +import json +from nano_graphrag import GraphRAG +from nano_graphrag._storage import NetworkXStorage +from nano_graphrag._utils import wrap_embedding_func_with_attrs + +WORKING_DIR = "./tests/nano_graphrag_cache_networkx_storage_test" + + 
+@pytest.fixture(scope="function") +def setup_teardown(): + if os.path.exists(WORKING_DIR): + shutil.rmtree(WORKING_DIR) + os.mkdir(WORKING_DIR) + + yield + + shutil.rmtree(WORKING_DIR) + + +@wrap_embedding_func_with_attrs(embedding_dim=384, max_token_size=8192) +async def mock_embedding(texts: list[str]) -> np.ndarray: + return np.random.rand(len(texts), 384) + + +@pytest.fixture +def networkx_storage(setup_teardown): + rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=mock_embedding) + return NetworkXStorage( + namespace="test", + global_config=rag.__dict__, + ) + + +@pytest.mark.asyncio +async def test_upsert_and_get_node(networkx_storage): + node_id = "node1" + node_data = {"attr1": "value1", "attr2": "value2"} + + await networkx_storage.upsert_node(node_id, node_data) + + result = await networkx_storage.get_node(node_id) + assert result == node_data + + has_node = await networkx_storage.has_node(node_id) + assert has_node is True + + +@pytest.mark.asyncio +async def test_upsert_and_get_edge(networkx_storage): + source_id = "node1" + target_id = "node2" + edge_data = {"weight": 1.0, "type": "connection"} + + await networkx_storage.upsert_node(source_id, {}) + await networkx_storage.upsert_node(target_id, {}) + await networkx_storage.upsert_edge(source_id, target_id, edge_data) + + result = await networkx_storage.get_edge(source_id, target_id) + assert result == edge_data + + has_edge = await networkx_storage.has_edge(source_id, target_id) + assert has_edge is True + + +@pytest.mark.asyncio +async def test_node_degree(networkx_storage): + node_id = "center" + await networkx_storage.upsert_node(node_id, {}) + + num_neighbors = 5 + for i in range(num_neighbors): + neighbor_id = f"neighbor{i}" + await networkx_storage.upsert_node(neighbor_id, {}) + await networkx_storage.upsert_edge(node_id, neighbor_id, {}) + + degree = await networkx_storage.node_degree(node_id) + assert degree == num_neighbors + + +@pytest.mark.asyncio +async def 
test_edge_degree(networkx_storage): + source_id = "node1" + target_id = "node2" + + await networkx_storage.upsert_node(source_id, {}) + await networkx_storage.upsert_node(target_id, {}) + await networkx_storage.upsert_edge(source_id, target_id, {}) + + num_source_neighbors = 3 + for i in range(num_source_neighbors): + neighbor_id = f"neighbor{i}" + await networkx_storage.upsert_node(neighbor_id, {}) + await networkx_storage.upsert_edge(source_id, neighbor_id, {}) + + num_target_neighbors = 2 + for i in range(num_target_neighbors): + neighbor_id = f"target_neighbor{i}" + await networkx_storage.upsert_node(neighbor_id, {}) + await networkx_storage.upsert_edge(target_id, neighbor_id, {}) + + expected_edge_degree = (num_source_neighbors + 1) + (num_target_neighbors + 1) + edge_degree = await networkx_storage.edge_degree(source_id, target_id) + assert edge_degree == expected_edge_degree + + +@pytest.mark.asyncio +async def test_get_node_edges(networkx_storage): + center_id = "center" + await networkx_storage.upsert_node(center_id, {}) + + expected_edges = [] + for i in range(3): + neighbor_id = f"neighbor{i}" + await networkx_storage.upsert_node(neighbor_id, {}) + await networkx_storage.upsert_edge(center_id, neighbor_id, {}) + expected_edges.append((center_id, neighbor_id)) + + result = await networkx_storage.get_node_edges(center_id) + assert set(result) == set(expected_edges) + + +@pytest.mark.parametrize("algorithm", ["leiden"]) +@pytest.mark.asyncio +async def test_clustering(networkx_storage, algorithm): + # [numberchiffre]: node ID is case-sensitive for clustering with leiden. 
+ for i in range(10): + await networkx_storage.upsert_node(f"NODE{i}", {"source_id": f"chunk{i}"}) + + for i in range(9): + await networkx_storage.upsert_edge(f"NODE{i}", f"NODE{i+1}", {}) + + assert networkx_storage._graph.number_of_nodes() > 0 + assert networkx_storage._graph.number_of_edges() > 0 + await networkx_storage.clustering(algorithm=algorithm) + + community_schema = await networkx_storage.community_schema() + + assert len(community_schema) > 0 + + for community in community_schema.values(): + assert "level" in community + assert "title" in community + assert "edges" in community + assert "nodes" in community + assert "chunk_ids" in community + assert "occurrence" in community + assert "sub_communities" in community + + +@pytest.mark.parametrize("algorithm", ["leiden"]) +@pytest.mark.asyncio +async def test_leiden_clustering_consistency(networkx_storage, algorithm): + for i in range(10): + await networkx_storage.upsert_node(f"NODE{i}", {"source_id": f"chunk{i}"}) + for i in range(9): + await networkx_storage.upsert_edge(f"NODE{i}", f"NODE{i+1}", {}) + + results = [] + for _ in range(3): + await networkx_storage.clustering(algorithm=algorithm) + community_schema = await networkx_storage.community_schema() + results.append(community_schema) + + assert all(len(r) == len(results[0]) for r in results), "Number of communities should be consistent" + + +@pytest.mark.parametrize("algorithm", ["leiden"]) +@pytest.mark.asyncio +async def test_leiden_clustering_community_structure(networkx_storage, algorithm): + for i in range(10): + await networkx_storage.upsert_node(f"A{i}", {"source_id": f"chunkA{i}"}) + await networkx_storage.upsert_node(f"B{i}", {"source_id": f"chunkB{i}"}) + for i in range(9): + await networkx_storage.upsert_edge(f"A{i}", f"A{i+1}", {}) + await networkx_storage.upsert_edge(f"B{i}", f"B{i+1}", {}) + + await networkx_storage.clustering(algorithm=algorithm) + community_schema = await networkx_storage.community_schema() + + assert 
len(community_schema) >= 2, "Should have at least two communities" + + communities = list(community_schema.values()) + a_nodes = set(node for node in communities[0]['nodes'] if node.startswith('A')) + b_nodes = set(node for node in communities[0]['nodes'] if node.startswith('B')) + assert len(a_nodes) == 0 or len(b_nodes) == 0, "Nodes from different groups should be in different communities" + + +@pytest.mark.parametrize("algorithm", ["leiden"]) +@pytest.mark.asyncio +async def test_leiden_clustering_hierarchical_structure(networkx_storage, algorithm): + await networkx_storage.upsert_node("NODE1", {"source_id": "chunk1", "clusters": json.dumps([{"level": 0, "cluster": "0"}, {"level": 1, "cluster": "1"}])}) + await networkx_storage.upsert_node("NODE2", {"source_id": "chunk2", "clusters": json.dumps([{"level": 0, "cluster": "0"}, {"level": 1, "cluster": "2"}])}) + await networkx_storage.upsert_edge("NODE1", "NODE2", {}) + await networkx_storage.clustering(algorithm=algorithm) + community_schema = await networkx_storage.community_schema() + + levels = set(community['level'] for community in community_schema.values()) + assert len(levels) >= 1, "Should have at least one level in the hierarchy" + + communities_per_level = {level: sum(1 for c in community_schema.values() if c['level'] == level) for level in levels} + assert communities_per_level[0] >= communities_per_level.get(max(levels), 0), "Lower levels should have more or equal number of communities" + + +@pytest.mark.asyncio +async def test_persistence(setup_teardown): + rag = GraphRAG(working_dir=WORKING_DIR, embedding_func=mock_embedding) + initial_storage = NetworkXStorage( + namespace="test_persistence", + global_config=rag.__dict__, + ) + + await initial_storage.upsert_node("node1", {"attr": "value"}) + await initial_storage.upsert_node("node2", {"attr": "value"}) + await initial_storage.upsert_edge("node1", "node2", {"weight": 1.0}) + + await initial_storage.index_done_callback() + + new_storage = 
NetworkXStorage( + namespace="test_persistence", + global_config=rag.__dict__, + ) + + assert await new_storage.has_node("node1") + assert await new_storage.has_node("node2") + assert await new_storage.has_edge("node1", "node2") + + node1_data = await new_storage.get_node("node1") + assert node1_data == {"attr": "value"} + + edge_data = await new_storage.get_edge("node1", "node2") + assert edge_data == {"weight": 1.0} + + +@pytest.mark.asyncio +async def test_embed_nodes(networkx_storage): + for i in range(5): + await networkx_storage.upsert_node(f"node{i}", {"id": f"node{i}"}) + + for i in range(4): + await networkx_storage.upsert_edge(f"node{i}", f"node{i+1}", {}) + + embeddings, node_ids = await networkx_storage.embed_nodes("node2vec") + + assert embeddings.shape == (5, networkx_storage.global_config['node2vec_params']['dimensions']) + assert len(node_ids) == 5 + assert all(f"node{i}" in node_ids for i in range(5)) + + +@pytest.mark.asyncio +async def test_stable_largest_connected_component_equal_components(): + G = nx.Graph() + G.add_edges_from([("A", "B"), ("C", "D"), ("E", "F")]) + result = NetworkXStorage.stable_largest_connected_component(G) + assert sorted(result.nodes()) == ["A", "B"] + assert list(result.edges()) == [("A", "B")] + + +@pytest.mark.asyncio +async def test_stable_largest_connected_component_stability(): + G = nx.Graph() + G.add_edges_from([("A", "B"), ("B", "C"), ("C", "D"), ("E", "F")]) + result1 = NetworkXStorage.stable_largest_connected_component(G) + result2 = NetworkXStorage.stable_largest_connected_component(G) + assert nx.is_isomorphic(result1, result2) + assert list(result1.nodes()) == list(result2.nodes()) + assert list(result1.edges()) == list(result2.edges()) + + +@pytest.mark.asyncio +async def test_stable_largest_connected_component_directed_graph(): + G = nx.DiGraph() + G.add_edges_from([("A", "B"), ("B", "C"), ("C", "D"), ("E", "F")]) + result = NetworkXStorage.stable_largest_connected_component(G) + assert 
sorted(result.nodes()) == ["A", "B", "C", "D"] + assert sorted(result.edges()) == [("A", "B"), ("B", "C"), ("C", "D")] + + +@pytest.mark.asyncio +async def test_stable_largest_connected_component_self_loops_and_parallel_edges(): + G = nx.Graph() + G.add_edges_from([("A", "B"), ("B", "C"), ("C", "A"), ("A", "A"), ("B", "B"), ("A", "B")]) + result = NetworkXStorage.stable_largest_connected_component(G) + assert sorted(result.nodes()) == ["A", "B", "C"] + assert sorted(result.edges()) == [('A', 'A'), ('A', 'B'), ('A', 'C'), ('B', 'B'), ('B', 'C')] + + +@pytest.mark.asyncio +async def test_community_schema_with_no_clusters(networkx_storage): + await networkx_storage.upsert_node("node1", {"source_id": "chunk1"}) + await networkx_storage.upsert_node("node2", {"source_id": "chunk2"}) + await networkx_storage.upsert_edge("node1", "node2", {}) + + community_schema = await networkx_storage.community_schema() + assert len(community_schema) == 0 + + +@pytest.mark.asyncio +async def test_community_schema_multiple_levels(networkx_storage): + await networkx_storage.upsert_node("node1", {"source_id": "chunk1", "clusters": json.dumps([{"level": 0, "cluster": "0"}, {"level": 1, "cluster": "1"}])}) + await networkx_storage.upsert_node("node2", {"source_id": "chunk2", "clusters": json.dumps([{"level": 0, "cluster": "0"}, {"level": 1, "cluster": "2"}])}) + await networkx_storage.upsert_edge("node1", "node2", {}) + + community_schema = await networkx_storage.community_schema() + assert len(community_schema) == 3 + assert set(community_schema.keys()) == {"0", "1", "2"} + assert community_schema["0"]["level"] == 0 + assert community_schema["1"]["level"] == 1 + assert community_schema["2"]["level"] == 1 + assert set(community_schema["0"]["sub_communities"]) == {"1", "2"} + + +@pytest.mark.asyncio +async def test_community_schema_occurrence(networkx_storage): + await networkx_storage.upsert_node("node1", {"source_id": "chunk1,chunk2", "clusters": json.dumps([{"level": 0, "cluster": "0"}])}) 
+ await networkx_storage.upsert_node("node2", {"source_id": "chunk3", "clusters": json.dumps([{"level": 0, "cluster": "0"}])}) + await networkx_storage.upsert_node("node3", {"source_id": "chunk4", "clusters": json.dumps([{"level": 0, "cluster": "1"}])}) + + community_schema = await networkx_storage.community_schema() + assert len(community_schema) == 2 + assert community_schema["0"]["occurrence"] == 1 + assert community_schema["1"]["occurrence"] == 0.5 + + +@pytest.mark.asyncio +async def test_community_schema_sub_communities(networkx_storage): + await networkx_storage.upsert_node("node1", {"source_id": "chunk1", "clusters": json.dumps([{"level": 0, "cluster": "0"}, {"level": 1, "cluster": "1"}])}) + await networkx_storage.upsert_node("node2", {"source_id": "chunk2", "clusters": json.dumps([{"level": 0, "cluster": "0"}, {"level": 1, "cluster": "2"}])}) + await networkx_storage.upsert_node("node3", {"source_id": "chunk3", "clusters": json.dumps([{"level": 0, "cluster": "3"}, {"level": 1, "cluster": "4"}])}) + + community_schema = await networkx_storage.community_schema() + assert len(community_schema) == 5 + assert set(community_schema["0"]["sub_communities"]) == {"1", "2"} + assert community_schema["3"]["sub_communities"] == ["4"] + assert community_schema["1"]["sub_communities"] == [] + assert community_schema["2"]["sub_communities"] == [] + assert community_schema["4"]["sub_communities"] == [] + + +@pytest.mark.asyncio +async def test_concurrent_operations(networkx_storage): + async def add_nodes(start, end): + for i in range(start, end): + await networkx_storage.upsert_node(f"node{i}", {"value": i}) + + await asyncio.gather( + add_nodes(0, 500), + add_nodes(500, 1000) + ) + + assert await networkx_storage.node_degree("node0") == 0 + assert len(networkx_storage._graph.nodes) == 1000 + + +@pytest.mark.asyncio +async def test_nonexistent_node_and_edge(networkx_storage): + assert await networkx_storage.has_node("nonexistent") is False + assert await 
networkx_storage.has_edge("node1", "node2") is False + assert await networkx_storage.get_node("nonexistent") is None + assert await networkx_storage.get_edge("node1", "node2") is None + assert await networkx_storage.get_node_edges("nonexistent") is None + assert await networkx_storage.node_degree("nonexistent") == 0 + assert await networkx_storage.edge_degree("node1", "node2") == 0 + + +@pytest.mark.asyncio +async def test_error_handling(networkx_storage): + with pytest.raises(ValueError, match="Clustering algorithm invalid_algo not supported"): + await networkx_storage.clustering("invalid_algo") + + with pytest.raises(ValueError, match="Node embedding algorithm invalid_algo not supported"): + await networkx_storage.embed_nodes("invalid_algo") diff --git a/nano-graphrag/tests/test_openai.py b/nano-graphrag/tests/test_openai.py new file mode 100644 index 0000000000000000000000000000000000000000..9751aee71e1a6da6e17afc63f2d17f494edd9b0f --- /dev/null +++ b/nano-graphrag/tests/test_openai.py @@ -0,0 +1,132 @@ +import pytest +import numpy as np +from unittest.mock import AsyncMock, Mock, patch +from nano_graphrag import _llm + + +def test_get_openai_async_client_instance(): + with patch("nano_graphrag._llm.AsyncOpenAI") as mock_openai: + mock_openai.return_value = "CLIENT" + client = _llm.get_openai_async_client_instance() + assert client == "CLIENT" + + +def test_get_azure_openai_async_client_instance(): + with patch("nano_graphrag._llm.AsyncAzureOpenAI") as mock_openai: + mock_openai.return_value = "AZURE_CLIENT" + client = _llm.get_azure_openai_async_client_instance() + assert client == "AZURE_CLIENT" + + +@pytest.fixture +def mock_openai_client(): + with patch("nano_graphrag._llm.get_openai_async_client_instance") as mock_openai: + mock_client = AsyncMock() + mock_openai.return_value = mock_client + yield mock_client + + +@pytest.fixture +def mock_azure_openai_client(): + with patch( + "nano_graphrag._llm.get_azure_openai_async_client_instance" + ) as mock_openai: 
+ mock_client = AsyncMock() + mock_openai.return_value = mock_client + yield mock_client + + +@pytest.mark.asyncio +async def test_openai_gpt4o(mock_openai_client): + mock_response = AsyncMock() + mock_response.choices = [Mock(message=Mock(content="1"))] + messages = [{"role": "system", "content": "3"}, {"role": "user", "content": "2"}] + mock_openai_client.chat.completions.create.return_value = mock_response + + response = await _llm.gpt_4o_complete("2", system_prompt="3") + + mock_openai_client.chat.completions.create.assert_awaited_once_with( + model="gpt-4o", + messages=messages, + ) + assert response == "1" + + +@pytest.mark.asyncio +async def test_openai_gpt4omini(mock_openai_client): + mock_response = AsyncMock() + mock_response.choices = [Mock(message=Mock(content="1"))] + messages = [{"role": "system", "content": "3"}, {"role": "user", "content": "2"}] + mock_openai_client.chat.completions.create.return_value = mock_response + + response = await _llm.gpt_4o_mini_complete("2", system_prompt="3") + + mock_openai_client.chat.completions.create.assert_awaited_once_with( + model="gpt-4o-mini", + messages=messages, + ) + assert response == "1" + + +@pytest.mark.asyncio +async def test_azure_openai_gpt4o(mock_azure_openai_client): + mock_response = AsyncMock() + mock_response.choices = [Mock(message=Mock(content="1"))] + messages = [{"role": "system", "content": "3"}, {"role": "user", "content": "2"}] + mock_azure_openai_client.chat.completions.create.return_value = mock_response + + response = await _llm.azure_gpt_4o_complete("2", system_prompt="3") + + mock_azure_openai_client.chat.completions.create.assert_awaited_once_with( + model="gpt-4o", + messages=messages, + ) + assert response == "1" + + +@pytest.mark.asyncio +async def test_azure_openai_gpt4omini(mock_azure_openai_client): + mock_response = AsyncMock() + mock_response.choices = [Mock(message=Mock(content="1"))] + messages = [{"role": "system", "content": "3"}, {"role": "user", "content": "2"}] + 
mock_azure_openai_client.chat.completions.create.return_value = mock_response + + response = await _llm.azure_gpt_4o_mini_complete("2", system_prompt="3") + + mock_azure_openai_client.chat.completions.create.assert_awaited_once_with( + model="gpt-4o-mini", + messages=messages, + ) + assert response == "1" + + +@pytest.mark.asyncio +async def test_openai_embedding(mock_openai_client): + mock_response = AsyncMock() + mock_response.data = [Mock(embedding=[1, 1, 1])] + texts = ["Hello world"] + mock_openai_client.embeddings.create.return_value = mock_response + + response = await _llm.openai_embedding(texts) + + mock_openai_client.embeddings.create.assert_awaited_once_with( + model="text-embedding-3-small", input=texts, encoding_format="float" + ) + # print(response) + assert np.allclose(response, np.array([[1, 1, 1]])) + + +@pytest.mark.asyncio +async def test_azure_openai_embedding(mock_azure_openai_client): + mock_response = AsyncMock() + mock_response.data = [Mock(embedding=[1, 1, 1])] + texts = ["Hello world"] + mock_azure_openai_client.embeddings.create.return_value = mock_response + + response = await _llm.azure_openai_embedding(texts) + + mock_azure_openai_client.embeddings.create.assert_awaited_once_with( + model="text-embedding-3-small", input=texts, encoding_format="float" + ) + # print(response) + assert np.allclose(response, np.array([[1, 1, 1]])) diff --git a/nano-graphrag/tests/test_rag.py b/nano-graphrag/tests/test_rag.py new file mode 100644 index 0000000000000000000000000000000000000000..e5ec0134339bbf1a1172987299caa1ed93eaccb5 --- /dev/null +++ b/nano-graphrag/tests/test_rag.py @@ -0,0 +1,101 @@ +import os +import json +import shutil +import numpy as np +from nano_graphrag import GraphRAG, QueryParam +from nano_graphrag._utils import wrap_embedding_func_with_attrs + +os.environ["OPENAI_API_KEY"] = "FAKE" + +WORKING_DIR = "./tests/nano_graphrag_cache_TEST" +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) +else: + shutil.rmtree(WORKING_DIR) 
+ os.mkdir(WORKING_DIR) + +shutil.copy( + "./tests/fixtures/mock_cache.json", + os.path.join(WORKING_DIR, "kv_store_llm_response_cache.json"), +) +FAKE_RESPONSE = "Hello world" +FAKE_JSON = json.dumps({"points": [{"description": "Hello world", "score": 1}]}) + + +def remove_if_exist(file): + if os.path.exists(file): + os.remove(file) + + +# We're using random embedding function for testing +@wrap_embedding_func_with_attrs(embedding_dim=384, max_token_size=8192) +async def local_embedding(texts: list[str]) -> np.ndarray: + return np.random.rand(len(texts), 384) + + +def test_insert(): + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + + rag = GraphRAG( + working_dir=WORKING_DIR, embedding_func=local_embedding, enable_naive_rag=True + ) + rag.insert(FAKE_TEXT) + + +async def fake_model(prompt, system_prompt=None, history_messages=[], **kwargs) -> str: + return FAKE_RESPONSE + + +def test_local_query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + best_model_func=fake_model, + embedding_func=local_embedding, + ) + result = rag.query("Dickens", param=QueryParam(mode="local")) + assert result == FAKE_RESPONSE + + +async def fake_json_model( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return FAKE_JSON + + +def test_global_query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + best_model_func=fake_json_model, + embedding_func=local_embedding, + ) + result = rag.query("Dickens") + assert result == FAKE_JSON + + +def test_naive_query(): + rag = GraphRAG( + working_dir=WORKING_DIR, + best_model_func=fake_model, + embedding_func=local_embedding, + enable_naive_rag=True, + ) + result = rag.query("Dickens", param=QueryParam(mode="naive")) + assert result == FAKE_RESPONSE + + +def test_subcommunity_insert(): + with open("./tests/mock_data.txt", encoding="utf-8-sig") as f: + FAKE_TEXT = f.read() + remove_if_exist(f"{WORKING_DIR}/milvus_lite.db") + remove_if_exist(f"{WORKING_DIR}/kv_store_full_docs.json") + 
remove_if_exist(f"{WORKING_DIR}/kv_store_text_chunks.json") + remove_if_exist(f"{WORKING_DIR}/kv_store_community_reports.json") + remove_if_exist(f"{WORKING_DIR}/graph_chunk_entity_relation.graphml") + rag = GraphRAG( + working_dir=WORKING_DIR, + embedding_func=local_embedding, + enable_naive_rag=True, + addon_params={"force_to_use_sub_communities": True}, + ) + rag.insert(FAKE_TEXT) diff --git a/nano-graphrag/tests/test_splitter.py b/nano-graphrag/tests/test_splitter.py new file mode 100644 index 0000000000000000000000000000000000000000..c20fbe6f5e8c75445ddac45f821f8d05212e35b6 --- /dev/null +++ b/nano-graphrag/tests/test_splitter.py @@ -0,0 +1,72 @@ +import unittest +from typing import List +import tiktoken +from nano_graphrag._splitter import SeparatorSplitter +from nano_graphrag._op import chunking_by_seperators + +# Assuming the SeparatorSplitter class is already imported + + +class TestSeparatorSplitter(unittest.TestCase): + + def setUp(self): + self.tokenize = lambda text: [ + ord(c) for c in text + ] # Simple tokenizer for testing + self.detokenize = lambda tokens: "".join(chr(t) for t in tokens) + + def test_split_with_custom_separator(self): + splitter = SeparatorSplitter( + separators=[self.tokenize("\n"), self.tokenize(".")], + chunk_size=19, + chunk_overlap=0, + keep_separator="end", + ) + text = "This is a test.\nAnother test." 
+ tokens = self.tokenize(text) + expected = [ + self.tokenize("This is a test.\n"), + self.tokenize("Another test."), + ] + result = splitter.split_tokens(tokens) + + self.assertEqual(result, expected) + + def test_chunk_size_limit(self): + splitter = SeparatorSplitter( + chunk_size=5, chunk_overlap=0, separators=[self.tokenize("\n")] + ) + text = "1234567890" + tokens = self.tokenize(text) + expected = [self.tokenize("12345"), self.tokenize("67890")] + result = splitter.split_tokens(tokens) + self.assertEqual(result, expected) + + def test_chunk_overlap(self): + splitter = SeparatorSplitter( + chunk_size=5, chunk_overlap=2, separators=[self.tokenize("\n")] + ) + text = "1234567890" + tokens = self.tokenize(text) + expected = [ + self.tokenize("12345"), + self.tokenize("45678"), + self.tokenize("7890"), + ] + result = splitter.split_tokens(tokens) + self.assertEqual(result, expected) + + def test_chunking_by_seperators(self): + encoder = tiktoken.encoding_for_model("gpt-4o") + text = "This is a test.\nAnother test." + tokens_list = [encoder.encode(text)] + doc_keys = ["doc1"] + results = chunking_by_seperators(tokens_list, doc_keys, encoder) + assert len(results) == 1 + assert results[0]["chunk_order_index"] == 0 + assert results[0]["full_doc_id"] == "doc1" + assert results[0]["content"] == text + + +if __name__ == "__main__": + unittest.main()