chenzihong-gavin committed on
Commit
0682cc6
·
1 Parent(s): 4b2a9c2
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. hf-repo/LICENSE +0 -201
  2. hf-repo/README.md +0 -43
  3. hf-repo/app.py +0 -587
  4. hf-repo/graphgen/__init__.py +0 -0
  5. hf-repo/graphgen/configs/README.md +0 -1
  6. hf-repo/graphgen/configs/aggregated_config.yaml +0 -21
  7. hf-repo/graphgen/configs/atomic_config.yaml +0 -21
  8. hf-repo/graphgen/configs/cot_config.yaml +0 -13
  9. hf-repo/graphgen/configs/multi_hop_config.yaml +0 -21
  10. hf-repo/graphgen/evaluate.py +0 -142
  11. hf-repo/graphgen/generate.py +0 -103
  12. hf-repo/graphgen/graphgen.py +0 -395
  13. hf-repo/graphgen/judge.py +0 -60
  14. hf-repo/graphgen/models/__init__.py +0 -45
  15. hf-repo/graphgen/models/community/__init__.py +0 -0
  16. hf-repo/graphgen/models/community/community_detector.py +0 -95
  17. hf-repo/graphgen/models/embed/__init__.py +0 -0
  18. hf-repo/graphgen/models/embed/embedding.py +0 -29
  19. hf-repo/graphgen/models/evaluate/__init__.py +0 -0
  20. hf-repo/graphgen/models/evaluate/base_evaluator.py +0 -51
  21. hf-repo/graphgen/models/evaluate/length_evaluator.py +0 -22
  22. hf-repo/graphgen/models/evaluate/mtld_evaluator.py +0 -76
  23. hf-repo/graphgen/models/evaluate/reward_evaluator.py +0 -101
  24. hf-repo/graphgen/models/evaluate/uni_evaluator.py +0 -159
  25. hf-repo/graphgen/models/llm/__init__.py +0 -0
  26. hf-repo/graphgen/models/llm/limitter.py +0 -88
  27. hf-repo/graphgen/models/llm/openai_model.py +0 -155
  28. hf-repo/graphgen/models/llm/tokenizer.py +0 -73
  29. hf-repo/graphgen/models/llm/topk_token_model.py +0 -48
  30. hf-repo/graphgen/models/search/__init__.py +0 -0
  31. hf-repo/graphgen/models/search/db/__init__.py +0 -0
  32. hf-repo/graphgen/models/search/db/uniprot_search.py +0 -64
  33. hf-repo/graphgen/models/search/kg/__init__.py +0 -0
  34. hf-repo/graphgen/models/search/kg/wiki_search.py +0 -37
  35. hf-repo/graphgen/models/search/web/__init__.py +0 -0
  36. hf-repo/graphgen/models/search/web/bing_search.py +0 -43
  37. hf-repo/graphgen/models/search/web/google_search.py +0 -45
  38. hf-repo/graphgen/models/storage/__init__.py +0 -0
  39. hf-repo/graphgen/models/storage/base_storage.py +0 -115
  40. hf-repo/graphgen/models/storage/json_storage.py +0 -87
  41. hf-repo/graphgen/models/storage/networkx_storage.py +0 -159
  42. hf-repo/graphgen/models/strategy/__init__.py +0 -0
  43. hf-repo/graphgen/models/strategy/base_strategy.py +0 -5
  44. hf-repo/graphgen/models/strategy/travserse_strategy.py +0 -30
  45. hf-repo/graphgen/models/text/__init__.py +0 -0
  46. hf-repo/graphgen/models/text/chunk.py +0 -7
  47. hf-repo/graphgen/models/text/text_pair.py +0 -9
  48. hf-repo/graphgen/models/vis/__init__.py +0 -0
  49. hf-repo/graphgen/models/vis/community_visualizer.py +0 -48
  50. hf-repo/graphgen/operators/__init__.py +0 -22
hf-repo/LICENSE DELETED
@@ -1,201 +0,0 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
hf-repo/README.md DELETED
@@ -1,43 +0,0 @@
- ---
- title: GraphGen Demo
- emoji: 📊
- colorFrom: blue
- colorTo: green
- sdk: gradio
- sdk_version: "5.44.0"
- python_version: "3.10"
- app_file: app.py
- suggested_hardware: cpu-basic
- pinned: false
- short_description: "Knowledge-driven synthetic data generation demo"
- tags:
-   - synthetic-data
-   - knowledge-graph
-   - gradio-demo
- ---
-
- # GraphGen Space 🤖📊
-
- This is the **official Hugging Face Space** for [GraphGen](https://github.com/open-sciencelab/GraphGen) – a framework that leverages knowledge graphs to generate high-quality synthetic question–answer pairs for supervised fine-tuning of LLMs.
-
- 🔗 Paper: [arXiv 2505.20416](https://arxiv.org/abs/2505.20416)
- 🐙 GitHub: [open-sciencelab/GraphGen](https://github.com/open-sciencelab/GraphGen)
-
- ---
-
- ## How to use (🖱️ 3 clicks)
-
- 1. Open the **Gradio app** above.
- 2. Upload or paste your source text → click **Generate KG**.
- 3. Download the generated QA pairs directly.
-
- ---
-
- ## Local quick start (optional)
-
- ```bash
- git clone https://github.com/open-sciencelab/GraphGen
- cd GraphGen
- uv venv --python 3.10 && uv pip install -r requirements.txt
- uv run webui/app.py # http://localhost:7860
- ```
hf-repo/app.py DELETED
@@ -1,587 +0,0 @@
- import json
- import os
- import sys
- import tempfile
-
- import gradio as gr
- import pandas as pd
- from gradio_i18n import Translate
- from gradio_i18n import gettext as _
-
- from webui.base import GraphGenParams
- from webui.cache_utils import cleanup_workspace, setup_workspace
- from webui.count_tokens import count_tokens
- from webui.test_api import test_api_connection
-
- # pylint: disable=wrong-import-position
- root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- sys.path.append(root_dir)
-
- from graphgen.graphgen import GraphGen
- from graphgen.models import OpenAIModel, Tokenizer, TraverseStrategy
- from graphgen.models.llm.limitter import RPM, TPM
- from graphgen.utils import set_logger
-
- css = """
- .center-row {
-     display: flex;
-     justify-content: center;
-     align-items: center;
- }
- """
-
-
- def init_graph_gen(config: dict, env: dict) -> GraphGen:
-     # Set up working directory
-     log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache"))
-
-     set_logger(log_file, if_stream=False)
-     graph_gen = GraphGen(working_dir=working_dir)
-
-     # Set up LLM clients
-     graph_gen.synthesizer_llm_client = OpenAIModel(
-         model_name=env.get("SYNTHESIZER_MODEL", ""),
-         base_url=env.get("SYNTHESIZER_BASE_URL", ""),
-         api_key=env.get("SYNTHESIZER_API_KEY", ""),
-         request_limit=True,
-         rpm=RPM(env.get("RPM", 1000)),
-         tpm=TPM(env.get("TPM", 50000)),
-     )
-
-     graph_gen.trainee_llm_client = OpenAIModel(
-         model_name=env.get("TRAINEE_MODEL", ""),
-         base_url=env.get("TRAINEE_BASE_URL", ""),
-         api_key=env.get("TRAINEE_API_KEY", ""),
-         request_limit=True,
-         rpm=RPM(env.get("RPM", 1000)),
-         tpm=TPM(env.get("TPM", 50000)),
-     )
-
-     graph_gen.tokenizer_instance = Tokenizer(config.get("tokenizer", "cl100k_base"))
-
-     strategy_config = config.get("traverse_strategy", {})
-     graph_gen.traverse_strategy = TraverseStrategy(
-         qa_form=strategy_config.get("qa_form"),
-         expand_method=strategy_config.get("expand_method"),
-         bidirectional=strategy_config.get("bidirectional"),
-         max_extra_edges=strategy_config.get("max_extra_edges"),
-         max_tokens=strategy_config.get("max_tokens"),
-         max_depth=strategy_config.get("max_depth"),
-         edge_sampling=strategy_config.get("edge_sampling"),
-         isolated_node_strategy=strategy_config.get("isolated_node_strategy"),
-         loss_strategy=str(strategy_config.get("loss_strategy")),
-     )
-
-     return graph_gen
-
-
- # pylint: disable=too-many-statements
- def run_graphgen(params, progress=gr.Progress()):
-     def sum_tokens(client):
-         return sum(u["total_tokens"] for u in client.token_usage)
-
-     config = {
-         "if_trainee_model": params.if_trainee_model,
-         "input_file": params.input_file,
-         "tokenizer": params.tokenizer,
-         "quiz_samples": params.quiz_samples,
-         "traverse_strategy": {
-             "qa_form": params.qa_form,
-             "bidirectional": params.bidirectional,
-             "expand_method": params.expand_method,
-             "max_extra_edges": params.max_extra_edges,
-             "max_tokens": params.max_tokens,
-             "max_depth": params.max_depth,
-             "edge_sampling": params.edge_sampling,
-             "isolated_node_strategy": params.isolated_node_strategy,
-             "loss_strategy": params.loss_strategy,
-         },
-         "chunk_size": params.chunk_size,
-     }
-
-     env = {
-         "SYNTHESIZER_BASE_URL": params.synthesizer_url,
-         "SYNTHESIZER_MODEL": params.synthesizer_model,
-         "TRAINEE_BASE_URL": params.trainee_url,
-         "TRAINEE_MODEL": params.trainee_model,
-         "SYNTHESIZER_API_KEY": params.api_key,
-         "TRAINEE_API_KEY": params.trainee_api_key,
-         "RPM": params.rpm,
-         "TPM": params.tpm,
-     }
-
-     # Test API connection
-     test_api_connection(
-         env["SYNTHESIZER_BASE_URL"],
-         env["SYNTHESIZER_API_KEY"],
-         env["SYNTHESIZER_MODEL"],
-     )
-     if config["if_trainee_model"]:
-         test_api_connection(
-             env["TRAINEE_BASE_URL"], env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]
-         )
-
-     # Initialize GraphGen
-     graph_gen = init_graph_gen(config, env)
-     graph_gen.clear()
-
-     graph_gen.progress_bar = progress
-
-     try:
-         # Load input data
-         file = config["input_file"]
-         if isinstance(file, list):
-             file = file[0]
-
-         data = []
-
-         if file.endswith(".jsonl"):
-             data_type = "raw"
-             with open(file, "r", encoding="utf-8") as f:
-                 data.extend(json.loads(line) for line in f)
-         elif file.endswith(".json"):
-             data_type = "chunked"
-             with open(file, "r", encoding="utf-8") as f:
-                 data.extend(json.load(f))
-         elif file.endswith(".txt"):
-             # Read the file, then convert it into raw-format chunks of chunk_size characters
-             data_type = "raw"
-             content = ""
-             with open(file, "r", encoding="utf-8") as f:
-                 lines = f.readlines()
-                 for line in lines:
-                     content += line.strip() + " "
-             size = int(config.get("chunk_size", 512))
-             chunks = [content[i : i + size] for i in range(0, len(content), size)]
-             data.extend([{"content": chunk} for chunk in chunks])
-         else:
-             raise ValueError(f"Unsupported file type: {file}")
-
-         # Process the data
-         graph_gen.insert(data, data_type)
-
-         if config["if_trainee_model"]:
-             # Generate quiz
-             graph_gen.quiz(max_samples=config["quiz_samples"])
-
-             # Judge statements
-             graph_gen.judge()
-         else:
-             graph_gen.traverse_strategy.edge_sampling = "random"
-             # Skip judge statements
-             graph_gen.judge(skip=True)
-
-         # Traverse graph
-         graph_gen.traverse(traverse_strategy=graph_gen.traverse_strategy)
-
-         # Save output
-         output_data = graph_gen.qa_storage.data
-         with tempfile.NamedTemporaryFile(
-             mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
-         ) as tmpfile:
-             json.dump(output_data, tmpfile, ensure_ascii=False)
-             output_file = tmpfile.name
-
-         synthesizer_tokens = sum_tokens(graph_gen.synthesizer_llm_client)
-         trainee_tokens = (
-             sum_tokens(graph_gen.trainee_llm_client)
-             if config["if_trainee_model"]
-             else 0
-         )
-         total_tokens = synthesizer_tokens + trainee_tokens
-
-         data_frame = params.token_counter
-         try:
-             _update_data = [
-                 [data_frame.iloc[0, 0], data_frame.iloc[0, 1], str(total_tokens)]
-             ]
-             new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
-             data_frame = new_df
-
-         except Exception as e:
-             raise gr.Error(f"DataFrame operation error: {str(e)}")
-
-         return output_file, gr.DataFrame(
-             label="Token Stats",
-             headers=["Source Text Token Count", "Expected Token Usage", "Token Used"],
-             datatype="str",
-             interactive=False,
-             value=data_frame,
-             visible=True,
-             wrap=True,
-         )
-
-     except Exception as e:  # pylint: disable=broad-except
-         raise gr.Error(f"Error occurred: {str(e)}")
-
-     finally:
-         # Clean up workspace
-         cleanup_workspace(graph_gen.working_dir)
-
-
- with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
-     # Header
-     gr.Image(
-         value=os.path.join(root_dir, "resources", "images", "logo.png"),
-         label="GraphGen Banner",
-         elem_id="banner",
-         interactive=False,
-         container=False,
-         show_download_button=False,
-         show_fullscreen_button=False,
-     )
-     lang_btn = gr.Radio(
-         choices=[
-             ("English", "en"),
-             ("简体中文", "zh"),
-         ],
-         value="en",
-         # label=_("Language"),
-         render=False,
-         container=False,
-         elem_classes=["center-row"],
-     )
-
-     gr.HTML(
-         """
-         <div style="display: flex; gap: 8px; margin-left: auto; align-items: center; justify-content: center;">
-             <a href="https://github.com/open-sciencelab/GraphGen/releases">
-                 <img src="https://img.shields.io/badge/Version-v0.1.0-blue" alt="Version">
-             </a>
-             <a href="https://graphgen-docs.example.com">
-                 <img src="https://img.shields.io/badge/Docs-Latest-brightgreen" alt="Documentation">
-             </a>
-             <a href="https://github.com/open-sciencelab/GraphGen/issues/10">
-                 <img src="https://img.shields.io/github/stars/open-sciencelab/GraphGen?style=social" alt="GitHub Stars">
-             </a>
-             <a href="https://arxiv.org/abs/2505.20416">
-                 <img src="https://img.shields.io/badge/arXiv-pdf-yellow" alt="arXiv">
-             </a>
-         </div>
-         """
-     )
-     with Translate(
-         os.path.join(root_dir, "webui", "translation.json"),
-         lang_btn,
-         placeholder_langs=["en", "zh"],
-         persistant=False,  # True to save the language setting in the browser. Requires gradio >= 5.6.0
-     ):
-         lang_btn.render()
-
-         gr.Markdown(
-             value="# "
-             + _("Title")
-             + "\n\n"
-             + "### [GraphGen](https://github.com/open-sciencelab/GraphGen) "
-             + _("Intro")
-         )
-
-         if_trainee_model = gr.Checkbox(
-             label=_("Use Trainee Model"), value=False, interactive=True
-         )
-
-         with gr.Accordion(label=_("Model Config"), open=False):
-             synthesizer_url = gr.Textbox(
-                 label="Synthesizer URL",
-                 value="https://api.siliconflow.cn/v1",
-                 info=_("Synthesizer URL Info"),
-                 interactive=True,
-             )
-             synthesizer_model = gr.Textbox(
-                 label="Synthesizer Model",
-                 value="Qwen/Qwen2.5-7B-Instruct",
-                 info=_("Synthesizer Model Info"),
-                 interactive=True,
-             )
-             trainee_url = gr.Textbox(
-                 label="Trainee URL",
-                 value="https://api.siliconflow.cn/v1",
-                 info=_("Trainee URL Info"),
-                 interactive=True,
-                 visible=if_trainee_model.value is True,
-             )
-             trainee_model = gr.Textbox(
-                 label="Trainee Model",
-                 value="Qwen/Qwen2.5-7B-Instruct",
-                 info=_("Trainee Model Info"),
-                 interactive=True,
-                 visible=if_trainee_model.value is True,
-             )
-             trainee_api_key = gr.Textbox(
-                 label=_("SiliconFlow Token for Trainee Model"),
-                 type="password",
-                 value="",
-                 info="https://cloud.siliconflow.cn/account/ak",
-                 visible=if_trainee_model.value is True,
-             )
-
-         with gr.Accordion(label=_("Generation Config"), open=False):
-             chunk_size = gr.Slider(
-                 label="Chunk Size",
-                 minimum=256,
-                 maximum=4096,
-                 value=512,
-                 step=256,
-                 interactive=True,
-             )
-             tokenizer = gr.Textbox(
-                 label="Tokenizer", value="cl100k_base", interactive=True
-             )
-             qa_form = gr.Radio(
-                 choices=["atomic", "multi_hop", "aggregated"],
-                 label="QA Form",
-                 value="aggregated",
-                 interactive=True,
-             )
-             quiz_samples = gr.Number(
-                 label="Quiz Samples",
-                 value=2,
-                 minimum=1,
-                 interactive=True,
-                 visible=if_trainee_model.value is True,
-             )
-             bidirectional = gr.Checkbox(
-                 label="Bidirectional", value=True, interactive=True
-             )
-
-             expand_method = gr.Radio(
-                 choices=["max_width", "max_tokens"],
-                 label="Expand Method",
-                 value="max_tokens",
-                 interactive=True,
-             )
-             max_extra_edges = gr.Slider(
-                 minimum=1,
-                 maximum=10,
-                 value=5,
-                 label="Max Extra Edges",
-                 step=1,
-                 interactive=True,
-                 visible=expand_method.value == "max_width",
-             )
-             max_tokens = gr.Slider(
-                 minimum=64,
-                 maximum=1024,
-                 value=256,
-                 label="Max Tokens",
-                 step=64,
-                 interactive=True,
-                 visible=(expand_method.value != "max_width"),
-             )
-
-             max_depth = gr.Slider(
-                 minimum=1,
-                 maximum=5,
-                 value=2,
-                 label="Max Depth",
-                 step=1,
-                 interactive=True,
-             )
-             edge_sampling = gr.Radio(
-                 choices=["max_loss", "min_loss", "random"],
-                 label="Edge Sampling",
-                 value="max_loss",
-                 interactive=True,
-                 visible=if_trainee_model.value is True,
-             )
-             isolated_node_strategy = gr.Radio(
-                 choices=["add", "ignore"],
-                 label="Isolated Node Strategy",
-                 value="ignore",
-                 interactive=True,
-             )
-             loss_strategy = gr.Radio(
-                 choices=["only_edge", "both"],
-                 label="Loss Strategy",
-                 value="only_edge",
-                 interactive=True,
-             )
-
-         with gr.Row(equal_height=True):
-             with gr.Column(scale=3):
-                 api_key = gr.Textbox(
-                     label=_("SiliconFlow Token"),
-                     type="password",
-                     value="",
-                     info="https://cloud.siliconflow.cn/account/ak",
-                 )
-             with gr.Column(scale=1):
-                 test_connection_btn = gr.Button(_("Test Connection"))
-
-         with gr.Blocks():
-             with gr.Row(equal_height=True):
-                 with gr.Column():
-                     rpm = gr.Slider(
-                         label="RPM",
-                         minimum=10,
-                         maximum=10000,
-                         value=1000,
-                         step=100,
-                         interactive=True,
-                         visible=True,
-                     )
-                 with gr.Column():
-                     tpm = gr.Slider(
-                         label="TPM",
-                         minimum=5000,
-                         maximum=5000000,
-                         value=50000,
-                         step=1000,
-                         interactive=True,
-                         visible=True,
-                     )
-
-         with gr.Blocks():
-             with gr.Row(equal_height=True):
-                 with gr.Column(scale=1):
-                     upload_file = gr.File(
-                         label=_("Upload File"),
-                         file_count="single",
-                         file_types=[".txt", ".json", ".jsonl"],
-                         interactive=True,
-                     )
-                     examples_dir = os.path.join(root_dir, "webui", "examples")
-                     gr.Examples(
-                         examples=[
-                             [os.path.join(examples_dir, "txt_demo.txt")],
-                             [os.path.join(examples_dir, "raw_demo.jsonl")],
-                             [os.path.join(examples_dir, "chunked_demo.json")],
-                         ],
-                         inputs=upload_file,
-                         label=_("Example Files"),
-                         examples_per_page=3,
-                     )
-                 with gr.Column(scale=1):
-                     output = gr.File(
-                         label="Output(See Github FAQ)",
-                         file_count="single",
-                         interactive=False,
-                     )
-
-         with gr.Blocks():
-             token_counter = gr.DataFrame(
-                 label="Token Stats",
-                 headers=[
-                     "Source Text Token Count",
-                     "Estimated Token Usage",
-                     "Token Used",
-                 ],
-                 datatype="str",
-                 interactive=False,
-                 visible=False,
-                 wrap=True,
-             )
-
-         submit_btn = gr.Button(_("Run GraphGen"))
-
-         # Test Connection
-         test_connection_btn.click(
-             test_api_connection,
-             inputs=[synthesizer_url, api_key, synthesizer_model],
-             outputs=[],
-         )
-
-         if if_trainee_model.value:
-             test_connection_btn.click(
-                 test_api_connection,
-                 inputs=[trainee_url, api_key, trainee_model],
-                 outputs=[],
-             )
-
-         expand_method.change(
-             lambda method: (
-                 gr.update(visible=method == "max_width"),
-                 gr.update(visible=method != "max_width"),
-             ),
-             inputs=expand_method,
-             outputs=[max_extra_edges, max_tokens],
-         )
-
-         if_trainee_model.change(
-             lambda use_trainee: [gr.update(visible=use_trainee)] * 5,
-             inputs=if_trainee_model,
-             outputs=[
-                 trainee_url,
-                 trainee_model,
-                 quiz_samples,
-                 edge_sampling,
-                 trainee_api_key,
-             ],
-         )
-
-         upload_file.change(
-             lambda x: (gr.update(visible=True)),
-             inputs=[upload_file],
-             outputs=[token_counter],
-         ).then(
-             count_tokens,
-             inputs=[upload_file, tokenizer, token_counter],
-             outputs=[token_counter],
-         )
-
-         # run GraphGen
-         submit_btn.click(
-             lambda x: (gr.update(visible=False)),
-             inputs=[token_counter],
-             outputs=[token_counter],
-         )
-
-         submit_btn.click(
-             lambda *args: run_graphgen(
-                 GraphGenParams(
-                     if_trainee_model=args[0],
-                     input_file=args[1],
-                     tokenizer=args[2],
-                     qa_form=args[3],
-                     bidirectional=args[4],
-                     expand_method=args[5],
-                     max_extra_edges=args[6],
-                     max_tokens=args[7],
-                     max_depth=args[8],
-                     edge_sampling=args[9],
-                     isolated_node_strategy=args[10],
-                     loss_strategy=args[11],
-                     synthesizer_url=args[12],
-                     synthesizer_model=args[13],
-                     trainee_model=args[14],
-                     api_key=args[15],
-                     chunk_size=args[16],
-                     rpm=args[17],
-                     tpm=args[18],
-                     quiz_samples=args[19],
-                     trainee_url=args[20],
-                     trainee_api_key=args[21],
-                     token_counter=args[22],
-                 )
-             ),
-             inputs=[
-                 if_trainee_model,
-                 upload_file,
-                 tokenizer,
-                 qa_form,
-                 bidirectional,
-                 expand_method,
-                 max_extra_edges,
-                 max_tokens,
-                 max_depth,
-                 edge_sampling,
-                 isolated_node_strategy,
-                 loss_strategy,
-                 synthesizer_url,
-                 synthesizer_model,
-                 trainee_model,
-                 api_key,
-                 chunk_size,
-                 rpm,
-                 tpm,
-                 quiz_samples,
-                 trainee_url,
-                 trainee_api_key,
-                 token_counter,
-             ],
-             outputs=[output, token_counter],
-         )
-
- if __name__ == "__main__":
-     demo.queue(api_open=False, default_concurrency_limit=2)
-     demo.launch(server_name="0.0.0.0")
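
The `.txt` branch in `run_graphgen` above is the only place where the demo does its own chunking. The following standalone sketch reproduces that logic for reference; `chunk_text` is an illustrative name, not a function from this repo:

```python
# Sketch of the .txt preprocessing in run_graphgen: lines are stripped and
# space-joined, then sliced into fixed-size, non-overlapping character windows.
def chunk_text(path: str, size: int = 512) -> list[dict]:
    with open(path, "r", encoding="utf-8") as f:
        content = " ".join(line.strip() for line in f)
    # Windows are counted in characters, while chunk sizes elsewhere in the
    # pipeline are token-oriented, so a window can split a word mid-way.
    return [{"content": content[i : i + size]} for i in range(0, len(content), size)]
```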
hf-repo/graphgen/__init__.py DELETED
File without changes
hf-repo/graphgen/configs/README.md DELETED
@@ -1 +0,0 @@
- # Configs for GraphGen
hf-repo/graphgen/configs/aggregated_config.yaml DELETED
@@ -1,21 +0,0 @@
- input_data_type: raw # raw, chunked
- input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
- output_data_type: aggregated # atomic, aggregated, multi_hop, cot
- output_data_format: ChatML # Alpaca, Sharegpt, ChatML
- tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
- search: # web search configuration
-   enabled: false # whether to enable web search
-   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
- quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
-   enabled: true
-   quiz_samples: 2 # number of quiz samples to generate
-   re_judge: false # whether to re-judge the existing quiz samples
- traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-   bidirectional: true # whether to traverse the graph in both directions
-   edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-   expand_method: max_width # expand method, support: max_width, max_tokens
-   isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-   max_depth: 5 # maximum depth for graph traversal
-   max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
-   max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-   loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
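
As a usage note, a config like the one above is consumed by `generate.py` and `graphgen.py` later in this diff. A minimal sketch of that flow, assuming the package layout shown here (the paths are placeholders, and the `SYNTHESIZER_*`/`TRAINEE_*` environment variables must be set):

```python
# Minimal pipeline sketch based on generate.py / graphgen.py below.
import yaml

from graphgen.graphgen import GraphGen

with open("graphgen/configs/aggregated_config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# GraphGen.__post_init__ builds TraverseStrategy(**config["traverse_strategy"])
graph_gen = GraphGen(working_dir="cache", config=config)
graph_gen.insert()    # chunk input_file and extract the knowledge graph
graph_gen.quiz()      # only meaningful when quiz_and_judge_strategy.enabled is true
graph_gen.judge()
graph_gen.traverse()  # emit QA pairs in output_data_format
```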
hf-repo/graphgen/configs/atomic_config.yaml DELETED
@@ -1,21 +0,0 @@
- input_data_type: raw # raw, chunked
- input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
- output_data_type: atomic # atomic, aggregated, multi_hop, cot
- output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
- tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
- search: # web search configuration
-   enabled: false # whether to enable web search
-   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
- quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
-   enabled: true
-   quiz_samples: 2 # number of quiz samples to generate
-   re_judge: false # whether to re-judge the existing quiz samples
- traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-   bidirectional: true # whether to traverse the graph in both directions
-   edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-   expand_method: max_width # expand method, support: max_width, max_tokens
-   isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-   max_depth: 3 # maximum depth for graph traversal
-   max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
-   max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-   loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
hf-repo/graphgen/configs/cot_config.yaml DELETED
@@ -1,13 +0,0 @@
- input_data_type: raw # raw, chunked
- input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
- output_data_type: cot # atomic, aggregated, multi_hop, cot
- output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
- tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
- search: # web search configuration
-   enabled: false # whether to enable web search
-   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
- method_params:
-   method: leiden
-   max_size: 20 # Maximum size of communities
-   use_lcc: false
-   random_seed: 42
hf-repo/graphgen/configs/multi_hop_config.yaml DELETED
@@ -1,21 +0,0 @@
- input_data_type: raw # raw, chunked
- input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
- output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
- output_data_format: ChatML # Alpaca, Sharegpt, ChatML
- tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
- search: # web search configuration
-   enabled: false # whether to enable web search
-   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
- quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
-   enabled: true
-   quiz_samples: 2 # number of quiz samples to generate
-   re_judge: false # whether to re-judge the existing quiz samples
- traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-   bidirectional: true # whether to traverse the graph in both directions
-   edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-   expand_method: max_width # expand method, support: max_width, max_tokens
-   isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-   max_depth: 1 # maximum depth for graph traversal
-   max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
-   max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-   loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
hf-repo/graphgen/evaluate.py DELETED
@@ -1,142 +0,0 @@
- """Evaluate the quality of the generated text using various metrics"""
-
- import os
- import json
- import argparse
- import pandas as pd
- from dotenv import load_dotenv
- from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, TextPair, UniEvaluator
- from .utils import logger, set_logger
-
- sys_path = os.path.abspath(os.path.dirname(__file__))
- set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log"))
-
- load_dotenv()
-
-
- def evaluate_length(corpus, tokenizer_name):
-     length_evaluator = LengthEvaluator(
-         tokenizer_name=tokenizer_name
-     )
-     logger.info("Length evaluator loaded")
-     scores = length_evaluator.get_average_score(corpus)
-     logger.info("Length scores: %s", scores)
-     return scores
-
-
- def evaluate_mtld(corpus):
-     mtld_evaluator = MTLDEvaluator()
-     logger.info("MTLD evaluator loaded")
-     scores = mtld_evaluator.get_average_score(corpus)
-     logger.info("MTLD scores: %s", scores)
-     min_max_scores = mtld_evaluator.get_min_max_score(corpus)
-     logger.info("MTLD min max scores: %s", min_max_scores)
-     return scores, min_max_scores
-
-
- def evaluate_reward(corpus, reward_model_names):
-     scores = []
-     for reward_name in reward_model_names:
-         reward_evaluator = RewardEvaluator(
-             reward_name=reward_name
-         )
-         logger.info("Loaded reward model: %s", reward_name)
-         average_score = reward_evaluator.get_average_score(corpus)
-         logger.info("%s scores: %s", reward_name, average_score)
-         min_max_scores = reward_evaluator.get_min_max_score(corpus)
-         logger.info("%s min max scores: %s", reward_name, min_max_scores)
-         scores.append({
-             'reward_name': reward_name.split('/')[-1],
-             'score': average_score,
-             'min_max_scores': min_max_scores
-         })
-         del reward_evaluator
-         clean_gpu_cache()
-     return scores
-
-
- def evaluate_uni(corpus, uni_model_name):
-     uni_evaluator = UniEvaluator(
-         model_name=uni_model_name
-     )
-     logger.info("Uni evaluator loaded with model %s", uni_model_name)
-     uni_scores = uni_evaluator.get_average_score(corpus)
-     for key, value in uni_scores.items():
-         logger.info("Uni %s scores: %s", key, value)
-     min_max_scores = uni_evaluator.get_min_max_score(corpus)
-     for key, value in min_max_scores.items():
-         logger.info("Uni %s min max scores: %s", key, value)
-     del uni_evaluator
-     clean_gpu_cache()
-     return (uni_scores['naturalness'], uni_scores['coherence'], uni_scores['understandability'],
-             min_max_scores['naturalness'], min_max_scores['coherence'], min_max_scores['understandability'])
-
-
- def clean_gpu_cache():
-     import torch
-     if torch.cuda.is_available():
-         torch.cuda.empty_cache()
-
-
- if __name__ == '__main__':
-     import torch.multiprocessing as mp
-     parser = argparse.ArgumentParser()
-
-     parser.add_argument('--folder', type=str, default='cache/data', help='folder to load data')
-     parser.add_argument('--output', type=str, default='cache/output', help='path to save output')
-
-     parser.add_argument('--tokenizer', type=str, default='cl100k_base', help='tokenizer name')
-     parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2',
-                         help='Comma-separated list of reward models')
-     parser.add_argument('--uni', type=str, default='MingZhong/unieval-sum', help='uni model name')
-
-     args = parser.parse_args()
-
-     if not os.path.exists(args.folder):
-         raise ValueError(f"Folder {args.folder} does not exist")
-
-     if not os.path.exists(args.output):
-         os.makedirs(args.output)
-
-     reward_models = args.reward.split(',')
-
-     results = []
-
-     logger.info("Data loaded from %s", args.folder)
-     mp.set_start_method('spawn')
-
-     for file in os.listdir(args.folder):
-         if file.endswith('.json'):
-             logger.info("Processing %s", file)
-             with open(os.path.join(args.folder, file), 'r', encoding='utf-8') as f:
-                 data = json.load(f)
-                 data = [TextPair(
-                     question=data[key]['question'],
-                     answer=data[key]['answer']
-                 ) for key in data]
-
-             length_scores = evaluate_length(data, args.tokenizer)
-             mtld_scores, min_max_mtld_scores = evaluate_mtld(data)
-             reward_scores = evaluate_reward(data, reward_models)
-             uni_naturalness_scores, uni_coherence_scores, uni_understandability_scores, \
-                 min_max_uni_naturalness_scores, min_max_uni_coherence_scores, min_max_uni_understandability_scores \
-                 = evaluate_uni(data, args.uni)
-
-             result = {
-                 'file': file,
-                 'number': len(data),
-                 'length': length_scores,
-                 'mtld': mtld_scores,
-                 'mtld_min_max': min_max_mtld_scores,
-                 'uni_naturalness': uni_naturalness_scores,
-                 'uni_coherence': uni_coherence_scores,
-                 'uni_understandability': uni_understandability_scores,
-                 'uni_naturalness_min_max': min_max_uni_naturalness_scores,
-                 'uni_coherence_min_max': min_max_uni_coherence_scores,
-                 'uni_understandability_min_max': min_max_uni_understandability_scores
-             }
-             for reward_score in reward_scores:
-                 result[reward_score['reward_name']] = reward_score['score']
-                 result[f"{reward_score['reward_name']}_min_max"] = reward_score['min_max_scores']
-
-             results.append(result)
-
-     results = pd.DataFrame(results)
-     results.to_csv(os.path.join(args.output, 'evaluation.csv'), index=False)
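
The `MTLDEvaluator` used above lives outside this 50-file view. For context, the metric it is named after, MTLD (Measure of Textual Lexical Diversity, McCarthy & Jarvis 2010), can be sketched as follows; this is the textbook formulation, not necessarily the repo's implementation:

```python
# Textbook MTLD sketch: count "factors" (segments whose type-token ratio
# falls to a threshold, conventionally 0.72) and divide token count by them.
def mtld_one_pass(tokens: list[str], ttr_threshold: float = 0.72) -> float:
    factors, types, count = 0.0, set(), 0
    for tok in tokens:
        count += 1
        types.add(tok.lower())
        if len(types) / count <= ttr_threshold:
            factors += 1.0       # a full factor is complete; reset the window
            types, count = set(), 0
    if count > 0:                # partial credit for the leftover segment
        ttr = len(types) / count
        factors += (1.0 - ttr) / (1.0 - ttr_threshold)
    return len(tokens) / factors if factors else float(len(tokens))


def mtld(tokens: list[str]) -> float:
    # The original paper averages a forward and a reversed pass.
    return (mtld_one_pass(tokens) + mtld_one_pass(list(reversed(tokens)))) / 2
```

Higher MTLD means the text sustains a high type-token ratio for longer, i.e. richer vocabulary.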
hf-repo/graphgen/generate.py DELETED
@@ -1,103 +0,0 @@
- import argparse
- import os
- import time
- from importlib.resources import files
-
- import yaml
- from dotenv import load_dotenv
-
- from .graphgen import GraphGen
- from .utils import logger, set_logger
-
- sys_path = os.path.abspath(os.path.dirname(__file__))
-
- load_dotenv()
-
-
- def set_working_dir(folder):
-     os.makedirs(folder, exist_ok=True)
-     os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
-     os.makedirs(os.path.join(folder, "logs"), exist_ok=True)
-
-
- def save_config(config_path, global_config):
-     if not os.path.exists(os.path.dirname(config_path)):
-         os.makedirs(os.path.dirname(config_path))
-     with open(config_path, "w", encoding="utf-8") as config_file:
-         yaml.dump(
-             global_config, config_file, default_flow_style=False, allow_unicode=True
-         )
-
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument(
-         "--config_file",
-         help="Config parameters for GraphGen.",
-         default=files("graphgen").joinpath("configs", "aggregated_config.yaml"),
-         type=str,
-     )
-     parser.add_argument(
-         "--output_dir",
-         help="Output directory for GraphGen.",
-         default=sys_path,
-         required=True,
-         type=str,
-     )
-
-     args = parser.parse_args()
-
-     working_dir = args.output_dir
-     set_working_dir(working_dir)
-
-     with open(args.config_file, "r", encoding="utf-8") as f:
-         config = yaml.load(f, Loader=yaml.FullLoader)
-
-     output_data_type = config["output_data_type"]
-     unique_id = int(time.time())
-     set_logger(
-         os.path.join(
-             working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
-         ),
-         if_stream=True,
-     )
-     logger.info(
-         "GraphGen with unique ID %s logging to %s",
-         unique_id,
-         os.path.join(
-             working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
-         ),
-     )
-
-     graph_gen = GraphGen(working_dir=working_dir, unique_id=unique_id, config=config)
-
-     graph_gen.insert()
-
-     if config["search"]["enabled"]:
-         graph_gen.search()
-
-     # Use pipeline according to the output data type
-     if output_data_type in ["atomic", "aggregated", "multi_hop"]:
-         if "quiz_and_judge_strategy" in config and config[
-             "quiz_and_judge_strategy"
-         ].get("enabled", False):
-             graph_gen.quiz()
-             graph_gen.judge()
-         else:
-             logger.warning(
-                 "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
-             )
-             graph_gen.traverse_strategy.edge_sampling = "random"
-         graph_gen.traverse()
-     elif output_data_type == "cot":
-         graph_gen.generate_reasoning(method_params=config["method_params"])
-     else:
-         raise ValueError(f"Unsupported output data type: {output_data_type}")
-
-     output_path = os.path.join(working_dir, "data", "graphgen", str(unique_id))
-     save_config(os.path.join(output_path, f"config-{unique_id}.yaml"), config)
-     logger.info("GraphGen completed successfully. Data saved to %s", output_path)
-
-
- if __name__ == "__main__":
-     main()
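
A hypothetical end-to-end invocation of this entry point, for orientation only: the model name, key, and paths below are placeholders, and the environment variable names are the ones read in `graphgen.py`'s `__post_init__` further down (the `TRAINEE_*` variables matter when `quiz_and_judge_strategy` is enabled):

```python
# Hypothetical driver; values are placeholders, not defaults of the repo.
import os
import sys

os.environ["SYNTHESIZER_MODEL"] = "Qwen/Qwen2.5-7B-Instruct"
os.environ["SYNTHESIZER_BASE_URL"] = "https://api.siliconflow.cn/v1"
os.environ["SYNTHESIZER_API_KEY"] = "sk-..."  # placeholder

# generate.py reads argparse flags, so patch argv before calling main().
sys.argv = [
    "graphgen.generate",
    "--config_file", "graphgen/configs/aggregated_config.yaml",
    "--output_dir", "cache",
]

from graphgen.generate import main  # noqa: E402  (relative imports need the package)

main()
```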
hf-repo/graphgen/graphgen.py DELETED
@@ -1,395 +0,0 @@
- import asyncio
- import os
- import time
- from dataclasses import dataclass, field
- from typing import Dict, List, Union, cast
-
- import gradio as gr
- from tqdm.asyncio import tqdm as tqdm_async
-
- from .models import (
-     Chunk,
-     JsonKVStorage,
-     JsonListStorage,
-     NetworkXStorage,
-     OpenAIModel,
-     Tokenizer,
-     TraverseStrategy,
- )
- from .models.storage.base_storage import StorageNameSpace
- from .operators import (
-     extract_kg,
-     generate_cot,
-     judge_statement,
-     quiz,
-     search_all,
-     traverse_graph_atomically,
-     traverse_graph_by_edge,
-     traverse_graph_for_multi_hop,
- )
- from .utils import (
-     compute_content_hash,
-     create_event_loop,
-     format_generation_results,
-     logger,
-     read_file,
- )
-
- sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-
-
- @dataclass
- class GraphGen:
-     unique_id: int = int(time.time())
-     working_dir: str = os.path.join(sys_path, "cache")
-     config: Dict = field(default_factory=dict)
-
-     # llm
-     tokenizer_instance: Tokenizer = None
-     synthesizer_llm_client: OpenAIModel = None
-     trainee_llm_client: OpenAIModel = None
-
-     # text chunking
-     # TODO: make it configurable
-     chunk_size: int = 1024
-     chunk_overlap_size: int = 100
-
-     # search
-     search_config: dict = field(
-         default_factory=lambda: {"enabled": False, "search_types": ["wikipedia"]}
-     )
-
-     # traversal
-     traverse_strategy: TraverseStrategy = None
-
-     # webui
-     progress_bar: gr.Progress = None
-
-     def __post_init__(self):
-         self.tokenizer_instance: Tokenizer = Tokenizer(
-             model_name=self.config["tokenizer"]
-         )
-         self.synthesizer_llm_client: OpenAIModel = OpenAIModel(
-             model_name=os.getenv("SYNTHESIZER_MODEL"),
-             api_key=os.getenv("SYNTHESIZER_API_KEY"),
-             base_url=os.getenv("SYNTHESIZER_BASE_URL"),
-             tokenizer_instance=self.tokenizer_instance,
-         )
-         self.trainee_llm_client: OpenAIModel = OpenAIModel(
-             model_name=os.getenv("TRAINEE_MODEL"),
-             api_key=os.getenv("TRAINEE_API_KEY"),
-             base_url=os.getenv("TRAINEE_BASE_URL"),
-             tokenizer_instance=self.tokenizer_instance,
-         )
-         self.search_config = self.config["search"]
-
-         if "traverse_strategy" in self.config:
-             self.traverse_strategy = TraverseStrategy(
-                 **self.config["traverse_strategy"]
-             )
-
-         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
-             self.working_dir, namespace="full_docs"
-         )
-         self.text_chunks_storage: JsonKVStorage = JsonKVStorage(
-             self.working_dir, namespace="text_chunks"
-         )
-         self.graph_storage: NetworkXStorage = NetworkXStorage(
-             self.working_dir, namespace="graph"
-         )
-         self.search_storage: JsonKVStorage = JsonKVStorage(
-             self.working_dir, namespace="search"
-         )
-         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
-             self.working_dir, namespace="rephrase"
-         )
-         self.qa_storage: JsonListStorage = JsonListStorage(
-             os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)),
-             namespace=f"qa-{self.unique_id}",
-         )
-
-     async def async_split_chunks(
-         self, data: List[Union[List, Dict]], data_type: str
-     ) -> dict:
-         # TODO: configurable whether to use coreference resolution
-         if len(data) == 0:
-             return {}
-
-         inserting_chunks = {}
-         if data_type == "raw":
-             assert isinstance(data, list) and isinstance(data[0], dict)
-             # compute hash for each document
-             new_docs = {
-                 compute_content_hash(doc["content"], prefix="doc-"): {
-                     "content": doc["content"]
-                 }
-                 for doc in data
-             }
-             _add_doc_keys = await self.full_docs_storage.filter_keys(
-                 list(new_docs.keys())
-             )
-             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-             if len(new_docs) == 0:
-                 logger.warning("All docs are already in the storage")
-                 return {}
-             logger.info("[New Docs] inserting %d docs", len(new_docs))
-
-             cur_index = 1
-             doc_number = len(new_docs)
-             async for doc_key, doc in tqdm_async(
-                 new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
-             ):
-                 chunks = {
-                     compute_content_hash(dp["content"], prefix="chunk-"): {
-                         **dp,
-                         "full_doc_id": doc_key,
-                     }
-                     for dp in self.tokenizer_instance.chunk_by_token_size(
-                         doc["content"], self.chunk_overlap_size, self.chunk_size
-                     )
-                 }
-                 inserting_chunks.update(chunks)
-
-                 if self.progress_bar is not None:
-                     self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
-                     cur_index += 1
-
-             _add_chunk_keys = await self.text_chunks_storage.filter_keys(
-                 list(inserting_chunks.keys())
-             )
-             inserting_chunks = {
-                 k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
-             }
-         elif data_type == "chunked":
-             assert isinstance(data, list) and isinstance(data[0], list)
-             new_docs = {
-                 compute_content_hash("".join(chunk["content"]), prefix="doc-"): {
-                     "content": "".join(chunk["content"])
-                 }
-                 for doc in data
-                 for chunk in doc
-             }
-             _add_doc_keys = await self.full_docs_storage.filter_keys(
-                 list(new_docs.keys())
-             )
-             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-             if len(new_docs) == 0:
-                 logger.warning("All docs are already in the storage")
-                 return {}
-             logger.info("[New Docs] inserting %d docs", len(new_docs))
-             async for doc in tqdm_async(
-                 data, desc="[1/4]Chunking documents", unit="doc"
-             ):
-                 doc_str = "".join([chunk["content"] for chunk in doc])
-                 for chunk in doc:
-                     chunk_key = compute_content_hash(chunk["content"], prefix="chunk-")
-                     inserting_chunks[chunk_key] = {
-                         **chunk,
-                         "full_doc_id": compute_content_hash(doc_str, prefix="doc-"),
-                     }
-             _add_chunk_keys = await self.text_chunks_storage.filter_keys(
-                 list(inserting_chunks.keys())
-             )
-             inserting_chunks = {
-                 k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
-             }
-         else:
-             raise ValueError(f"Unknown data type: {data_type}")
-
-         await self.full_docs_storage.upsert(new_docs)
-         await self.text_chunks_storage.upsert(inserting_chunks)
-
-         return inserting_chunks
-
-     def insert(self):
-         loop = create_event_loop()
-         loop.run_until_complete(self.async_insert())
-
-     async def async_insert(self):
-         """
-         insert chunks into the graph
-         """
-
-         input_file = self.config["input_file"]
-         data_type = self.config["input_data_type"]
-         data = read_file(input_file)
-
-         inserting_chunks = await self.async_split_chunks(data, data_type)
-
-         if len(inserting_chunks) == 0:
-             logger.warning("All chunks are already in the storage")
-             return
-         logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
-
-         logger.info("[Entity and Relation Extraction]...")
-         _add_entities_and_relations = await extract_kg(
-             llm_client=self.synthesizer_llm_client,
-             kg_instance=self.graph_storage,
-             tokenizer_instance=self.tokenizer_instance,
-             chunks=[
-                 Chunk(id=k, content=v["content"]) for k, v in inserting_chunks.items()
-             ],
-             progress_bar=self.progress_bar,
-         )
-         if not _add_entities_and_relations:
-             logger.warning("No entities or relations extracted")
-             return
-
-         await self._insert_done()
-
-     async def _insert_done(self):
-         tasks = []
-         for storage_instance in [
-             self.full_docs_storage,
-             self.text_chunks_storage,
-             self.graph_storage,
-             self.search_storage,
-         ]:
-             if storage_instance is None:
-                 continue
-             tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
-         await asyncio.gather(*tasks)
-
-     def search(self):
-         loop = create_event_loop()
-         loop.run_until_complete(self.async_search())
-
-     async def async_search(self):
-         logger.info(
-             "Search is %s", "enabled" if self.search_config["enabled"] else "disabled"
-         )
-         if self.search_config["enabled"]:
-             logger.info(
-                 "[Search] %s ...", ", ".join(self.search_config["search_types"])
-             )
-             all_nodes = await self.graph_storage.get_all_nodes()
-             all_nodes_names = [node[0] for node in all_nodes]
-             new_search_entities = await self.full_docs_storage.filter_keys(
-                 all_nodes_names
-             )
-             logger.info(
-                 "[Search] Found %d entities to search", len(new_search_entities)
-             )
-             _add_search_data = await search_all(
-                 search_types=self.search_config["search_types"],
-                 search_entities=new_search_entities,
-             )
-             if _add_search_data:
-                 await self.search_storage.upsert(_add_search_data)
-                 logger.info("[Search] %d entities searched", len(_add_search_data))
-
-                 # Format search results for inserting
-                 search_results = []
-                 for _, search_data in _add_search_data.items():
-                     search_results.extend(
-                         [
-                             {"content": search_data[key]}
-                             for key in list(search_data.keys())
-                         ]
-                     )
-                 # TODO: fix insert after search
-                 await self.async_insert()
-
-     def quiz(self):
-         loop = create_event_loop()
-         loop.run_until_complete(self.async_quiz())
-
-     async def async_quiz(self):
-         max_samples = self.config["quiz_and_judge_strategy"]["quiz_samples"]
-         await quiz(
-             self.synthesizer_llm_client,
-             self.graph_storage,
-             self.rephrase_storage,
-             max_samples,
-         )
-         await self.rephrase_storage.index_done_callback()
-
-     def judge(self):
-         loop = create_event_loop()
-         loop.run_until_complete(self.async_judge())
-
-     async def async_judge(self):
-         re_judge = self.config["quiz_and_judge_strategy"]["re_judge"]
-         _update_relations = await judge_statement(
-             self.trainee_llm_client,
-             self.graph_storage,
-             self.rephrase_storage,
-             re_judge,
-         )
-         await _update_relations.index_done_callback()
-
-     def traverse(self):
-         loop = create_event_loop()
-         loop.run_until_complete(self.async_traverse())
-
-     async def async_traverse(self):
-         output_data_type = self.config["output_data_type"]
-
-         if output_data_type == "atomic":
-             results = await traverse_graph_atomically(
-                 self.synthesizer_llm_client,
-                 self.tokenizer_instance,
-                 self.graph_storage,
-                 self.traverse_strategy,
-                 self.text_chunks_storage,
-                 self.progress_bar,
-             )
-         elif output_data_type == "multi_hop":
-             results = await traverse_graph_for_multi_hop(
-                 self.synthesizer_llm_client,
-                 self.tokenizer_instance,
-                 self.graph_storage,
-                 self.traverse_strategy,
-                 self.text_chunks_storage,
-                 self.progress_bar,
345
- )
346
- elif output_data_type == "aggregated":
347
- results = await traverse_graph_by_edge(
348
- self.synthesizer_llm_client,
349
- self.tokenizer_instance,
350
- self.graph_storage,
351
- self.traverse_strategy,
352
- self.text_chunks_storage,
353
- self.progress_bar,
354
- )
355
- else:
356
- raise ValueError(f"Unknown qa_form: {output_data_type}")
357
-
358
- results = format_generation_results(
359
- results, output_data_format=self.config["output_data_format"]
360
- )
361
-
362
- await self.qa_storage.upsert(results)
363
- await self.qa_storage.index_done_callback()
364
-
365
- def generate_reasoning(self, method_params):
366
- loop = create_event_loop()
367
- loop.run_until_complete(self.async_generate_reasoning(method_params))
368
-
369
- async def async_generate_reasoning(self, method_params):
370
- results = await generate_cot(
371
- self.graph_storage,
372
- self.synthesizer_llm_client,
373
- method_params=method_params,
374
- )
375
-
376
- results = format_generation_results(
377
- results, output_data_format=self.config["output_data_format"]
378
- )
379
-
380
- await self.qa_storage.upsert(results)
381
- await self.qa_storage.index_done_callback()
382
-
383
- def clear(self):
384
- loop = create_event_loop()
385
- loop.run_until_complete(self.async_clear())
386
-
387
- async def async_clear(self):
388
- await self.full_docs_storage.drop()
389
- await self.text_chunks_storage.drop()
390
- await self.search_storage.drop()
391
- await self.graph_storage.clear()
392
- await self.rephrase_storage.drop()
393
- await self.qa_storage.drop()
394
-
395
- logger.info("All caches are cleared")
hf-repo/graphgen/judge.py DELETED
@@ -1,60 +0,0 @@
- import os
- import argparse
- import asyncio
- from dotenv import load_dotenv
-
- from .models import NetworkXStorage, JsonKVStorage, OpenAIModel
- from .operators import judge_statement
-
- sys_path = os.path.abspath(os.path.dirname(__file__))
-
- load_dotenv()
-
- def calculate_average_loss(graph: NetworkXStorage):
-     """
-     Calculate the average loss of the graph.
-
-     :param graph: NetworkXStorage
-     :return: float
-     """
-     edges = asyncio.run(graph.get_all_edges())
-     total_loss = 0
-     for edge in edges:
-         total_loss += edge[2]['loss']
-     return total_loss / len(edges)
-
-
- if __name__ == '__main__':
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--input', type=str, default=os.path.join(sys_path, "cache"), help='path to load input graph')
-     parser.add_argument('--output', type=str, default='cache/output/new_graph.graphml', help='path to save output')
-
-     args = parser.parse_args()
-
-     llm_client = OpenAIModel(
-         model_name=os.getenv("TRAINEE_MODEL"),
-         api_key=os.getenv("TRAINEE_API_KEY"),
-         base_url=os.getenv("TRAINEE_BASE_URL")
-     )
-
-     graph_storage = NetworkXStorage(
-         args.input,
-         namespace="graph"
-     )
-     average_loss = calculate_average_loss(graph_storage)
-     print(f"Average loss of the graph: {average_loss}")
-
-     rephrase_storage = JsonKVStorage(
-         os.path.join(sys_path, "cache"),
-         namespace="rephrase"
-     )
-
-     new_graph = asyncio.run(judge_statement(llm_client, graph_storage, rephrase_storage, re_judge=True))
-
-     graph_file = asyncio.run(graph_storage.get_graph())
-
-     new_graph.write_nx_graph(graph_file, args.output)
-
-     average_loss = calculate_average_loss(new_graph)
-     print(f"Average loss of the graph: {average_loss}")
hf-repo/graphgen/models/__init__.py DELETED
@@ -1,45 +0,0 @@
- from .community.community_detector import CommunityDetector
- from .evaluate.length_evaluator import LengthEvaluator
- from .evaluate.mtld_evaluator import MTLDEvaluator
- from .evaluate.reward_evaluator import RewardEvaluator
- from .evaluate.uni_evaluator import UniEvaluator
- from .llm.openai_model import OpenAIModel
- from .llm.tokenizer import Tokenizer
- from .llm.topk_token_model import Token, TopkTokenModel
- from .search.db.uniprot_search import UniProtSearch
- from .search.kg.wiki_search import WikiSearch
- from .search.web.bing_search import BingSearch
- from .search.web.google_search import GoogleSearch
- from .storage.json_storage import JsonKVStorage, JsonListStorage
- from .storage.networkx_storage import NetworkXStorage
- from .strategy.travserse_strategy import TraverseStrategy
- from .text.chunk import Chunk
- from .text.text_pair import TextPair
-
- __all__ = [
-     # llm models
-     "OpenAIModel",
-     "TopkTokenModel",
-     "Token",
-     "Tokenizer",
-     # storage models
-     "Chunk",
-     "NetworkXStorage",
-     "JsonKVStorage",
-     "JsonListStorage",
-     # search models
-     "WikiSearch",
-     "GoogleSearch",
-     "BingSearch",
-     "UniProtSearch",
-     # evaluate models
-     "TextPair",
-     "LengthEvaluator",
-     "MTLDEvaluator",
-     "RewardEvaluator",
-     "UniEvaluator",
-     # strategy models
-     "TraverseStrategy",
-     # community models
-     "CommunityDetector",
- ]
hf-repo/graphgen/models/community/__init__.py DELETED
File without changes
hf-repo/graphgen/models/community/community_detector.py DELETED
@@ -1,95 +0,0 @@
- from collections import defaultdict
- from dataclasses import dataclass
- from typing import Any, Dict, List
-
- from graphgen.models.storage.networkx_storage import NetworkXStorage
-
-
- @dataclass
- class CommunityDetector:
-     """Class for community detection algorithms."""
-
-     graph_storage: NetworkXStorage = None
-     method: str = "leiden"
-     method_params: Dict[str, Any] = None
-
-     async def detect_communities(self) -> Dict[str, int]:
-         if self.method == "leiden":
-             return await self._leiden_communities(**self.method_params or {})
-         raise ValueError(f"Unknown community detection method: {self.method}")
-
-     async def get_graph(self):
-         return await self.graph_storage.get_graph()
-
-     async def _leiden_communities(
-         self, max_size: int = None, **kwargs
-     ) -> Dict[str, int]:
-         """
-         Detect communities using the Leiden algorithm.
-         If max_size is given, any community larger than max_size will be split
-         into smaller sub-communities each having at most max_size nodes.
-         """
-         import igraph as ig
-         import networkx as nx
-         from leidenalg import ModularityVertexPartition, find_partition
-
-         graph = await self.get_graph()
-         graph.remove_nodes_from(list(nx.isolates(graph)))
-
-         ig_graph = ig.Graph.TupleList(graph.edges(), directed=False)
-
-         random_seed = kwargs.get("random_seed", 42)
-         use_lcc = kwargs.get("use_lcc", False)
-
-         communities: Dict[str, int] = {}
-         if use_lcc:
-             lcc = ig_graph.components().giant()
-             partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed)
-             for part, cluster in enumerate(partition):
-                 for v in cluster:
-                     communities[lcc.vs[v]["name"]] = part
-         else:
-             offset = 0
-             for component in ig_graph.components():
-                 subgraph = ig_graph.induced_subgraph(component)
-                 partition = find_partition(
-                     subgraph, ModularityVertexPartition, seed=random_seed
-                 )
-                 for part, cluster in enumerate(partition):
-                     for v in cluster:
-                         original_node = subgraph.vs[v]["name"]
-                         communities[original_node] = part + offset
-                 offset += len(partition)
-
-         # split large communities if max_size is specified
-         if max_size is None or max_size <= 0:
-             return communities
-
-         return await self._split_communities(communities, max_size)
-
-     @staticmethod
-     async def _split_communities(
-         communities: Dict[str, int], max_size: int
-     ) -> Dict[str, int]:
-         """
-         Split communities larger than max_size into smaller sub-communities.
-         """
-         cid2nodes: Dict[int, List[str]] = defaultdict(list)
-         for node, cid in communities.items():
-             cid2nodes[cid].append(node)
-
-         new_communities: Dict[str, int] = {}
-         new_cid = 0
-         for cid, nodes in cid2nodes.items():
-             if len(nodes) <= max_size:
-                 for n in nodes:
-                     new_communities[n] = new_cid
-                 new_cid += 1
-             else:
-                 for start in range(0, len(nodes), max_size):
-                     sub = nodes[start : start + max_size]
-                     for n in sub:
-                         new_communities[n] = new_cid
-                     new_cid += 1
-
-         return new_communities
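For context, a minimal sketch of driving this detector; it assumes a populated graph already exists at cache/graph.graphml and that the leidenalg and python-igraph packages are installed:

import asyncio

from graphgen.models import CommunityDetector, NetworkXStorage

storage = NetworkXStorage(working_dir="cache", namespace="graph")
detector = CommunityDetector(
    graph_storage=storage,
    method="leiden",
    method_params={"max_size": 20, "random_seed": 42},
)
node2cid = asyncio.run(detector.detect_communities())  # {node name -> community id}
print(f"{len(set(node2cid.values()))} communities over {len(node2cid)} nodes")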
hf-repo/graphgen/models/embed/__init__.py DELETED
File without changes
hf-repo/graphgen/models/embed/embedding.py DELETED
@@ -1,29 +0,0 @@
- from dataclasses import dataclass
- import asyncio
- import numpy as np
-
- class UnlimitedSemaphore:
-     """A context manager that allows unlimited access."""
-
-     async def __aenter__(self):
-         pass
-
-     async def __aexit__(self, exc_type, exc, tb):
-         pass
-
- @dataclass
- class EmbeddingFunc:
-     embedding_dim: int
-     max_token_size: int
-     func: callable
-     concurrent_limit: int = 16
-
-     def __post_init__(self):
-         if self.concurrent_limit != 0:
-             self._semaphore = asyncio.Semaphore(self.concurrent_limit)
-         else:
-             self._semaphore = UnlimitedSemaphore()
-
-     async def __call__(self, *args, **kwargs) -> np.ndarray:
-         async with self._semaphore:
-             return await self.func(*args, **kwargs)
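A minimal sketch of wrapping an async embedding callable with EmbeddingFunc; the random vectors below are a stand-in for a real model, not part of the library:

import asyncio

import numpy as np

from graphgen.models.embed.embedding import EmbeddingFunc

async def fake_embed(texts: list[str]) -> np.ndarray:
    # stand-in for a real embedding model call
    return np.random.default_rng(0).random((len(texts), 768))

embed = EmbeddingFunc(embedding_dim=768, max_token_size=8192, func=fake_embed)
vectors = asyncio.run(embed(["hello", "world"]))  # concurrency-gated by the semaphore
print(vectors.shape)  # (2, 768)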
hf-repo/graphgen/models/evaluate/__init__.py DELETED
File without changes
hf-repo/graphgen/models/evaluate/base_evaluator.py DELETED
@@ -1,51 +0,0 @@
- import asyncio
-
- from dataclasses import dataclass
- from tqdm.asyncio import tqdm as tqdm_async
- from graphgen.utils import create_event_loop
- from graphgen.models.text.text_pair import TextPair
-
- @dataclass
- class BaseEvaluator:
-     max_concurrent: int = 100
-     results: list[float] = None
-
-     def evaluate(self, pairs: list[TextPair]) -> list[float]:
-         """
-         Evaluate the text and return a score.
-         """
-         return create_event_loop().run_until_complete(self.async_evaluate(pairs))
-
-     async def async_evaluate(self, pairs: list[TextPair]) -> list[float]:
-         semaphore = asyncio.Semaphore(self.max_concurrent)
-
-         async def evaluate_with_semaphore(pair):
-             async with semaphore:  # acquire the semaphore
-                 return await self.evaluate_single(pair)
-
-         results = []
-         for result in tqdm_async(
-             asyncio.as_completed([evaluate_with_semaphore(pair) for pair in pairs]),
-             total=len(pairs),
-         ):
-             results.append(await result)
-         return results
-
-     async def evaluate_single(self, pair: TextPair) -> float:
-         raise NotImplementedError()
-
-     def get_average_score(self, pairs: list[TextPair]) -> float:
-         """
-         Get the average score of a batch of texts.
-         """
-         results = self.evaluate(pairs)
-         self.results = results
-         return sum(self.results) / len(pairs)
-
-     def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]:
-         """
-         Get the min and max score of a batch of texts.
-         """
-         if self.results is None:
-             self.get_average_score(pairs)
-         return min(self.results), max(self.results)
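Concrete evaluators only need to override evaluate_single. A minimal sketch, assuming TextPair exposes question and answer fields (consistent with how the other evaluators use it):

from dataclasses import dataclass

from graphgen.models.evaluate.base_evaluator import BaseEvaluator
from graphgen.models.text.text_pair import TextPair

@dataclass
class WordCountEvaluator(BaseEvaluator):
    async def evaluate_single(self, pair: TextPair) -> float:
        # score an answer by its whitespace-token count
        return float(len(pair.answer.split()))

pairs = [TextPair(question="q", answer="a short answer")]
print(WordCountEvaluator(max_concurrent=10).get_average_score(pairs))  # 3.0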
hf-repo/graphgen/models/evaluate/length_evaluator.py DELETED
@@ -1,22 +0,0 @@
- from dataclasses import dataclass
- from graphgen.models.evaluate.base_evaluator import BaseEvaluator
- from graphgen.models.llm.tokenizer import Tokenizer
- from graphgen.models.text.text_pair import TextPair
- from graphgen.utils import create_event_loop
-
-
- @dataclass
- class LengthEvaluator(BaseEvaluator):
-     tokenizer_name: str = "cl100k_base"
-
-     def __post_init__(self):
-         self.tokenizer = Tokenizer(
-             model_name=self.tokenizer_name
-         )
-
-     async def evaluate_single(self, pair: TextPair) -> float:
-         loop = create_event_loop()
-         return await loop.run_in_executor(None, self._calculate_length, pair.answer)
-
-     def _calculate_length(self, text: str) -> float:
-         tokens = self.tokenizer.encode_string(text)
-         return len(tokens)
hf-repo/graphgen/models/evaluate/mtld_evaluator.py DELETED
@@ -1,76 +0,0 @@
- from dataclasses import dataclass, field
- from typing import Set
-
- from graphgen.models.evaluate.base_evaluator import BaseEvaluator
- from graphgen.models.text.text_pair import TextPair
- from graphgen.utils import detect_main_language, NLTKHelper, create_event_loop
-
-
- nltk_helper = NLTKHelper()
-
- @dataclass
- class MTLDEvaluator(BaseEvaluator):
-     """
-     Measure of the lexical diversity of a text.
-     """
-     stopwords_en: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("english")))
-     stopwords_zh: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("chinese")))
-
-     async def evaluate_single(self, pair: TextPair) -> float:
-         loop = create_event_loop()
-         return await loop.run_in_executor(None, self._calculate_mtld_score, pair.answer)
-
-     def _calculate_mtld_score(self, text: str, threshold=0.72) -> float:
-         """
-         Compute MTLD (the mean of the forward and backward passes).
-
-         min is 1.0
-         higher is better
-         """
-         if not text or not text.strip():
-             return 0.0
-
-         lang = detect_main_language(text)
-         tokens = nltk_helper.word_tokenize(text, lang)
-
-         stopwords = self.stopwords_zh if lang == "zh" else self.stopwords_en
-         filtered_tokens = [word for word in tokens if word not in stopwords]
-         filtered_tokens = [word for word in filtered_tokens if word.isalnum()]
-
-         if not filtered_tokens:
-             return 0
-
-         # forward MTLD
-         forward_factors = self._compute_factors(filtered_tokens, threshold)
-
-         # backward MTLD
-         backward_factors = self._compute_factors(filtered_tokens[::-1], threshold)
-
-         # average of the two passes
-         return (forward_factors + backward_factors) / 2
-
-     @staticmethod
-     def _compute_factors(tokens: list, threshold: float) -> float:
-         factors = 0
-         current_segment = []
-         unique_words = set()
-
-         for token in tokens:
-             current_segment.append(token)
-             unique_words.add(token)
-             ttr = len(unique_words) / len(current_segment)
-
-             if ttr <= threshold:
-                 factors += 1
-                 current_segment = []
-                 unique_words = set()
-
-         # handle the final partial segment
-         if current_segment:
-             ttr = len(unique_words) / len(current_segment)
-             if ttr <= threshold:
-                 factors += 1
-             else:
-                 factors += (1 - (ttr - threshold) / (1 - threshold))
-
-         return len(tokens) / factors if factors > 0 else len(tokens)
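The factor count walks the token stream and closes a segment whenever its type-token ratio falls to the 0.72 threshold; MTLD is tokens divided by factors, averaged over the forward and backward passes. A minimal sketch (again assuming the TextPair question/answer constructor; only pair.answer is scored):

from graphgen.models import MTLDEvaluator, TextPair

evaluator = MTLDEvaluator()
pairs = [
    TextPair(question="", answer="The quick brown fox jumps over the lazy dog."),
    TextPair(question="", answer="word word word word word word word word"),
]
# the varied sentence should score well above the repetitive one
print(evaluator.get_average_score(pairs))
print(evaluator.get_min_max_score(pairs))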
hf-repo/graphgen/models/evaluate/reward_evaluator.py DELETED
@@ -1,101 +0,0 @@
- from dataclasses import dataclass
- from tqdm import tqdm
- from graphgen.models.text.text_pair import TextPair
-
-
- @dataclass
- class RewardEvaluator:
-     """
-     Reward Model Evaluator.
-     OpenAssistant/reward-model-deberta-v3-large-v2: scores range over (-inf, inf); higher is better.
-     """
-     reward_name: str = "OpenAssistant/reward-model-deberta-v3-large-v2"
-     max_length: int = 2560
-     results: list[float] = None
-
-     def __post_init__(self):
-         import torch
-         self.num_gpus = torch.cuda.device_count()
-
-     @staticmethod
-     def process_chunk(rank, pairs, reward_name, max_length, return_dict):
-         import torch
-         from transformers import AutoModelForSequenceClassification, AutoTokenizer
-         device = f'cuda:{rank}'
-         torch.cuda.set_device(rank)
-
-         rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name)
-         tokenizer = AutoTokenizer.from_pretrained(reward_name)
-         rank_model.to(device)
-         rank_model.eval()
-
-         results = []
-         with torch.no_grad():
-             for pair in tqdm(pairs):
-                 inputs = tokenizer(
-                     pair.question,
-                     pair.answer,
-                     return_tensors="pt",
-                     max_length=max_length,
-                     truncation=True
-                 )
-                 inputs = {k: v.to(device) for k, v in inputs.items()}
-                 score = rank_model(**inputs).logits[0].item()
-                 results.append(score)
-
-         return_dict[rank] = results
-
-     def evaluate(self, pairs: list[TextPair]) -> list[float]:
-         import torch.multiprocessing as mp
-         chunk_size = len(pairs) // self.num_gpus
-         chunks = []
-         for i in range(self.num_gpus):
-             start = i * chunk_size
-             end = start + chunk_size
-             if i == self.num_gpus - 1:
-                 end = len(pairs)
-             chunks.append(pairs[start:end])
-
-         # multi-process
-         manager = mp.Manager()
-         return_dict = manager.dict()
-         processes = []
-
-         for rank, chunk in enumerate(chunks):
-             p = mp.Process(
-                 target=self.process_chunk,
-                 args=(rank, chunk, self.reward_name, self.max_length, return_dict)
-             )
-             p.start()
-             processes.append(p)
-
-         for p in processes:
-             p.join()
-
-         # merge per-GPU results
-         results = []
-         for rank in range(len(chunks)):
-             results.extend(return_dict[rank])
-
-         for p in processes:
-             if p.is_alive():
-                 p.terminate()
-                 p.join()
-
-         return results
-
-     def get_average_score(self, pairs: list[TextPair]) -> float:
-         """
-         Get the average score of a batch of texts.
-         """
-         results = self.evaluate(pairs)
-         self.results = results
-         return sum(self.results) / len(pairs)
-
-     def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]:
-         """
-         Get the min and max score of a batch of texts.
-         """
-         if self.results is None:
-             self.get_average_score(pairs)
-         return min(self.results), max(self.results)
hf-repo/graphgen/models/evaluate/uni_evaluator.py DELETED
@@ -1,159 +0,0 @@
- # https://github.com/maszhongming/UniEval/tree/main
-
- from dataclasses import dataclass, field
- from tqdm import tqdm
- from graphgen.models.text.text_pair import TextPair
-
-
- def _add_questions(dimension: str, question: str, answer: str):
-     if dimension == "naturalness":
-         cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + answer
-     elif dimension == "coherence":
-         cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: ' \
-                     + answer + ' </s> dialogue history: ' + question
-     elif dimension == "understandability":
-         cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + answer
-     else:
-         raise NotImplementedError(
-             'The input format for this dimension is still undefined. Please customize it first.')
-     return cur_input
-
- @dataclass
- class UniEvaluator:
-     model_name: str = "MingZhong/unieval-sum"
-     dimensions: list = field(default_factory=lambda: ['naturalness', 'coherence', 'understandability'])
-     max_length: int = 2560
-     results: dict = None
-
-     def __post_init__(self):
-         import torch
-         self.num_gpus = torch.cuda.device_count()
-         self.results = {}
-
-     @staticmethod
-     def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict):
-         import torch
-         from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-         device = f'cuda:{rank}'
-         torch.cuda.set_device(rank)
-
-         rank_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         rank_model.to(device)
-         rank_model.eval()
-
-         softmax = torch.nn.Softmax(dim=1)
-
-         pos_id = tokenizer("Yes")["input_ids"][0]
-         neg_id = tokenizer("No")["input_ids"][0]
-
-         results = []
-         with torch.no_grad():
-             for pair in tqdm(pairs):
-                 text = _add_questions(dimension, pair.question, pair.answer)
-
-                 tgt = "No"
-
-                 encoded_src = tokenizer(
-                     text,
-                     max_length=max_length,
-                     truncation=True,
-                     padding=True,
-                     return_tensors='pt'
-                 )
-                 encoded_tgt = tokenizer(
-                     tgt,
-                     max_length=max_length,
-                     truncation=True,
-                     padding=True,
-                     return_tensors='pt'
-                 )
-
-                 src_tokens = encoded_src['input_ids'].to(device)
-                 src_mask = encoded_src['attention_mask'].to(device)
-
-                 tgt_tokens = encoded_tgt['input_ids'].to(device)[:, 0].unsqueeze(-1)
-
-                 output = rank_model(
-                     input_ids=src_tokens,
-                     attention_mask=src_mask,
-                     labels=tgt_tokens,
-                     use_cache=False
-                 )
-
-                 logits = output.logits.view(-1, rank_model.config.vocab_size)
-
-                 pos_score = softmax(logits)[:, pos_id]  # Yes
-                 neg_score = softmax(logits)[:, neg_id]
-                 score = pos_score / (pos_score + neg_score)
-
-                 results.append(score.item())
-
-         return_dict[rank] = results
-
-     def evaluate(self, pairs: list[TextPair]) -> list[dict]:
-         import torch.multiprocessing as mp
-         final_results = []
-         for dimension in self.dimensions:
-             chunk_size = len(pairs) // self.num_gpus
-             chunks = []
-             for i in range(self.num_gpus):
-                 start = i * chunk_size
-                 end = start + chunk_size
-                 if i == self.num_gpus - 1:
-                     end = len(pairs)
-                 chunks.append(pairs[start:end])
-
-             # multi-process
-             manager = mp.Manager()
-             return_dict = manager.dict()
-             processes = []
-
-             for rank, chunk in enumerate(chunks):
-                 p = mp.Process(
-                     target=self.process_chunk,
-                     args=(rank, chunk, self.model_name, self.max_length, dimension, return_dict)
-                 )
-                 p.start()
-                 processes.append(p)
-
-             for p in processes:
-                 p.join()
-
-             # merge per-GPU results
-             results = []
-             for rank in range(len(chunks)):
-                 results.extend(return_dict[rank])
-
-             for p in processes:
-                 if p.is_alive():
-                     p.terminate()
-                     p.join()
-
-             final_results.append({
-                 dimension: results
-             })
-         return final_results
-
-     def get_average_score(self, pairs: list[TextPair]) -> dict:
-         """
-         Get the average score of a batch of texts.
-         """
-         results = self.evaluate(pairs)
-         final_results = {}
-         for result in results:
-             for key, value in result.items():
-                 final_results[key] = sum(value) / len(value)
-                 self.results[key] = value
-         return final_results
-
-     def get_min_max_score(self, pairs: list[TextPair]) -> dict:
-         """
-         Get the min and max score of a batch of texts.
-         """
-         if self.results is None:
-             self.get_average_score(pairs)
-         final_results = {}
-         for key, value in self.results.items():
-             final_results[key] = min(value), max(value)
-         return final_results
hf-repo/graphgen/models/llm/__init__.py DELETED
File without changes
hf-repo/graphgen/models/llm/limitter.py DELETED
@@ -1,88 +0,0 @@
- import time
- from datetime import datetime, timedelta
- import asyncio
-
- from graphgen.utils import logger
-
-
- class RPM:
-
-     def __init__(self, rpm: int = 1000):
-         self.rpm = rpm
-         self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0}
-
-     def get_minute_slot(self):
-         current_time = time.time()
-         dt_object = datetime.fromtimestamp(current_time)
-         total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute
-         return total_minutes_since_midnight
-
-     async def wait(self, silent=False):
-         current = time.time()
-         dt_object = datetime.fromtimestamp(current)
-         minute_slot = self.get_minute_slot()
-
-         if self.record['rpm_slot'] == minute_slot:
-             # check whether the RPM limit is exceeded
-             if self.record['counter'] >= self.rpm:
-                 # wait until the next minute
-                 next_minute = dt_object.replace(
-                     second=0, microsecond=0) + timedelta(minutes=1)
-                 _next = next_minute.timestamp()
-                 sleep_time = abs(_next - current)
-                 if not silent:
-                     logger.info('RPM sleep %s', sleep_time)
-                 await asyncio.sleep(sleep_time)
-
-                 self.record = {
-                     'rpm_slot': self.get_minute_slot(),
-                     'counter': 0
-                 }
-         else:
-             self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0}
-         self.record['counter'] += 1
-
-         if not silent:
-             logger.debug(self.record)
-
-
- class TPM:
-
-     def __init__(self, tpm: int = 20000):
-         self.tpm = tpm
-         self.record = {'tpm_slot': self.get_minute_slot(), 'counter': 0}
-
-     def get_minute_slot(self):
-         current_time = time.time()
-         dt_object = datetime.fromtimestamp(current_time)
-         total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute
-         return total_minutes_since_midnight
-
-     async def wait(self, token_count, silent=False):
-         current = time.time()
-         dt_object = datetime.fromtimestamp(current)
-         minute_slot = self.get_minute_slot()
-
-         # a new minute slot has started: reset the counter and return
-         if self.record['tpm_slot'] != minute_slot:
-             self.record = {'tpm_slot': minute_slot, 'counter': token_count}
-             return
-
-         # check whether the TPM limit is exceeded
-         self.record['counter'] += token_count
-         if self.record['counter'] > self.tpm:
-             # wait until the next minute
-             next_minute = dt_object.replace(
-                 second=0, microsecond=0) + timedelta(minutes=1)
-             _next = next_minute.timestamp()
-             sleep_time = abs(_next - current)
-             logger.info('TPM sleep %s', sleep_time)
-             await asyncio.sleep(sleep_time)
-
-             self.record = {
-                 'tpm_slot': self.get_minute_slot(),
-                 'counter': token_count
-             }
-
-         if not silent:
-             logger.debug(self.record)
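A minimal sketch of gating calls with the two limiters above; the prompt list and budgets are placeholders:

import asyncio

from graphgen.models.llm.limitter import RPM, TPM

async def main():
    rpm, tpm = RPM(rpm=60), TPM(tpm=10_000)
    for prompt in ["first prompt", "second prompt"]:
        await rpm.wait(silent=True)       # sleeps into the next minute after 60 requests
        await tpm.wait(500, silent=True)  # same, once the 10k-token budget is spent
        # ... issue the actual API request here ...

asyncio.run(main())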
hf-repo/graphgen/models/llm/openai_model.py DELETED
@@ -1,155 +0,0 @@
- import math
- import re
- from dataclasses import dataclass, field
- from typing import Dict, List, Optional
-
- import openai
- from openai import APIConnectionError, APITimeoutError, AsyncOpenAI, RateLimitError
- from tenacity import (
-     retry,
-     retry_if_exception_type,
-     stop_after_attempt,
-     wait_exponential,
- )
-
- from graphgen.models.llm.limitter import RPM, TPM
- from graphgen.models.llm.tokenizer import Tokenizer
- from graphgen.models.llm.topk_token_model import Token, TopkTokenModel
-
-
- def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]:
-     token_logprobs = response.choices[0].logprobs.content
-     tokens = []
-     for token_prob in token_logprobs:
-         prob = math.exp(token_prob.logprob)
-         candidate_tokens = [
-             Token(t.token, math.exp(t.logprob)) for t in token_prob.top_logprobs
-         ]
-         token = Token(token_prob.token, prob, top_candidates=candidate_tokens)
-         tokens.append(token)
-     return tokens
-
-
- def filter_think_tags(text: str) -> str:
-     """
-     Remove <think> tags from the text.
-     If the text contains <think> and </think>, it removes everything between them and the tags themselves.
-     """
-     think_pattern = re.compile(r"<think>.*?</think>", re.DOTALL)
-     filtered_text = think_pattern.sub("", text).strip()
-     return filtered_text if filtered_text else text.strip()
-
-
- @dataclass
- class OpenAIModel(TopkTokenModel):
-     model_name: str = "gpt-4o-mini"
-     api_key: str = None
-     base_url: str = None
-
-     system_prompt: str = ""
-     json_mode: bool = False
-     seed: int = None
-
-     token_usage: list = field(default_factory=list)
-     request_limit: bool = False
-     rpm: RPM = field(default_factory=lambda: RPM(rpm=1000))
-     tpm: TPM = field(default_factory=lambda: TPM(tpm=50000))
-
-     tokenizer_instance: Tokenizer = field(default_factory=Tokenizer)
-
-     def __post_init__(self):
-         assert self.api_key is not None, "Please provide api key to access openai api."
-         self.client = AsyncOpenAI(
-             api_key=self.api_key or "dummy", base_url=self.base_url
-         )
-
-     def _pre_generate(self, text: str, history: List[str]) -> Dict:
-         kwargs = {
-             "temperature": self.temperature,
-             "top_p": self.topp,
-             "max_tokens": self.max_tokens,
-         }
-         if self.seed:
-             kwargs["seed"] = self.seed
-         if self.json_mode:
-             kwargs["response_format"] = {"type": "json_object"}
-
-         messages = []
-         if self.system_prompt:
-             messages.append({"role": "system", "content": self.system_prompt})
-         messages.append({"role": "user", "content": text})
-
-         if history:
-             assert len(history) % 2 == 0, "History should have even number of elements."
-             messages = history + messages
-
-         kwargs["messages"] = messages
-         return kwargs
-
-     @retry(
-         stop=stop_after_attempt(5),
-         wait=wait_exponential(multiplier=1, min=4, max=10),
-         retry=retry_if_exception_type(
-             (RateLimitError, APIConnectionError, APITimeoutError)
-         ),
-     )
-     async def generate_topk_per_token(
-         self, text: str, history: Optional[List[str]] = None
-     ) -> List[Token]:
-         kwargs = self._pre_generate(text, history)
-         if self.topk_per_token > 0:
-             kwargs["logprobs"] = True
-             kwargs["top_logprobs"] = self.topk_per_token
-
-         # Limit max_tokens to 1 to avoid long completions
-         kwargs["max_tokens"] = 1
-
-         completion = await self.client.chat.completions.create(  # pylint: disable=E1125
-             model=self.model_name, **kwargs
-         )
-
-         tokens = get_top_response_tokens(completion)
-
-         return tokens
-
-     @retry(
-         stop=stop_after_attempt(5),
-         wait=wait_exponential(multiplier=1, min=4, max=10),
-         retry=retry_if_exception_type(
-             (RateLimitError, APIConnectionError, APITimeoutError)
-         ),
-     )
-     async def generate_answer(
-         self, text: str, history: Optional[List[str]] = None, temperature: float = 0
-     ) -> str:
-         kwargs = self._pre_generate(text, history)
-         kwargs["temperature"] = temperature
-
-         prompt_tokens = 0
-         for message in kwargs["messages"]:
-             prompt_tokens += len(
-                 self.tokenizer_instance.encode_string(message["content"])
-             )
-         estimated_tokens = prompt_tokens + kwargs["max_tokens"]
-
-         if self.request_limit:
-             await self.rpm.wait(silent=True)
-             await self.tpm.wait(estimated_tokens, silent=True)
-
-         completion = await self.client.chat.completions.create(  # pylint: disable=E1125
-             model=self.model_name, **kwargs
-         )
-         if hasattr(completion, "usage"):
-             self.token_usage.append(
-                 {
-                     "prompt_tokens": completion.usage.prompt_tokens,
-                     "completion_tokens": completion.usage.completion_tokens,
-                     "total_tokens": completion.usage.total_tokens,
-                 }
-             )
-         return filter_think_tags(completion.choices[0].message.content)
-
-     async def generate_inputs_prob(
-         self, text: str, history: Optional[List[str]] = None
-     ) -> List[Token]:
-         raise NotImplementedError
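A minimal sketch of calling this client directly; the model name is only a default, and the env var names below are assumptions, not part of the deleted code:

import asyncio
import os

from graphgen.models import OpenAIModel

llm = OpenAIModel(
    model_name="gpt-4o-mini",
    api_key=os.environ["SYNTHESIZER_API_KEY"],  # assumed env var name
    base_url=os.getenv("SYNTHESIZER_BASE_URL"),
    request_limit=True,  # opt in to the RPM/TPM limiters above
)
print(asyncio.run(llm.generate_answer("Name one use of a knowledge graph.")))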
hf-repo/graphgen/models/llm/tokenizer.py DELETED
@@ -1,73 +0,0 @@
- from dataclasses import dataclass
- from typing import List
- import tiktoken
-
- try:
-     from transformers import AutoTokenizer
-     TRANSFORMERS_AVAILABLE = True
- except ImportError:
-     AutoTokenizer = None
-     TRANSFORMERS_AVAILABLE = False
-
-
- def get_tokenizer(tokenizer_name: str = "cl100k_base"):
-     """
-     Get a tokenizer instance by name.
-
-     :param tokenizer_name: tokenizer name, tiktoken encoding name or Hugging Face model name
-     :return: tokenizer instance
-     """
-     if tokenizer_name in tiktoken.list_encoding_names():
-         return tiktoken.get_encoding(tokenizer_name)
-     if TRANSFORMERS_AVAILABLE:
-         try:
-             return AutoTokenizer.from_pretrained(tokenizer_name)
-         except Exception as e:
-             raise ValueError(f"Failed to load tokenizer from Hugging Face: {e}") from e
-     else:
-         raise ValueError("Hugging Face Transformers is not available, please install it first.")
-
- @dataclass
- class Tokenizer:
-     model_name: str = "cl100k_base"
-
-     def __post_init__(self):
-         self.tokenizer = get_tokenizer(self.model_name)
-
-     def encode_string(self, text: str) -> List[int]:
-         """
-         Encode text to tokens
-
-         :param text
-         :return: tokens
-         """
-         return self.tokenizer.encode(text)
-
-     def decode_tokens(self, tokens: List[int]) -> str:
-         """
-         Decode tokens to text
-
-         :param tokens
-         :return: text
-         """
-         return self.tokenizer.decode(tokens)
-
-     def chunk_by_token_size(
-         self, content: str, overlap_token_size=128, max_token_size=1024
-     ):
-         tokens = self.encode_string(content)
-         results = []
-         for index, start in enumerate(
-             range(0, len(tokens), max_token_size - overlap_token_size)
-         ):
-             chunk_content = self.decode_tokens(
-                 tokens[start : start + max_token_size]
-             )
-             results.append(
-                 {
-                     "tokens": min(max_token_size, len(tokens) - start),
-                     "content": chunk_content.strip(),
-                     "chunk_order_index": index,
-                 }
-             )
-         return results
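The chunker slides a window of max_token_size tokens with a stride of max_token_size minus overlap_token_size. A minimal sketch (cl100k_base ships with tiktoken, so no download is needed):

from graphgen.models import Tokenizer

tok = Tokenizer(model_name="cl100k_base")  # a Hugging Face model name also works
chunks = tok.chunk_by_token_size(
    "some long text " * 400, overlap_token_size=16, max_token_size=64
)
print(len(chunks), chunks[0]["tokens"], chunks[0]["chunk_order_index"])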
hf-repo/graphgen/models/llm/topk_token_model.py DELETED
@@ -1,48 +0,0 @@
- import math
- from dataclasses import dataclass, field
- from typing import List, Union, Optional
-
-
- @dataclass
- class Token:
-     text: str
-     prob: float
-     top_candidates: List = field(default_factory=list)
-     ppl: Union[float, None] = field(default=None)
-
-     @property
-     def logprob(self) -> float:
-         return math.log(self.prob)
-
-
- @dataclass
- class TopkTokenModel:
-     do_sample: bool = False
-     temperature: float = 0
-     max_tokens: int = 4096
-     repetition_penalty: float = 1.05
-     num_beams: int = 1
-     topk: int = 50
-     topp: float = 0.95
-
-     topk_per_token: int = 5  # number of topk tokens to generate for each token
-
-     async def generate_topk_per_token(self, text: str) -> List[Token]:
-         """
-         Generate prob, text and candidates for each token of the model's output.
-         This function is used to visualize the inference process.
-         """
-         raise NotImplementedError
-
-     async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]:
-         """
-         Generate prob and text for each token of the input text.
-         This function is used to visualize the ppl.
-         """
-         raise NotImplementedError
-
-     async def generate_answer(self, text: str, history: Optional[List[str]] = None) -> str:
-         """
-         Generate answer from the model.
-         """
-         raise NotImplementedError
hf-repo/graphgen/models/search/__init__.py DELETED
File without changes
hf-repo/graphgen/models/search/db/__init__.py DELETED
File without changes
hf-repo/graphgen/models/search/db/uniprot_search.py DELETED
@@ -1,64 +0,0 @@
- from dataclasses import dataclass
-
- import requests
- from fastapi import HTTPException
-
- from graphgen.utils import logger
-
- UNIPROT_BASE = "https://rest.uniprot.org/uniprotkb/search"
-
-
- @dataclass
- class UniProtSearch:
-     """
-     UniProt Search client to search with UniProt.
-     1) Get the protein by accession number.
-     2) Search with keywords or protein names.
-     """
-
-     def get_entry(self, accession: str) -> dict:
-         """
-         Get the UniProt entry by accession number (e.g., P04637).
-         """
-         url = f"{UNIPROT_BASE}/{accession}.json"
-         return self._safe_get(url).json()
-
-     def search(
-         self,
-         query: str,
-         *,
-         size: int = 10,
-         cursor: str = None,
-         fields: list[str] = None,
-     ) -> dict:
-         """
-         Search UniProt with a query string.
-         :param query: The search query.
-         :param size: The number of results to return.
-         :param cursor: The cursor for pagination.
-         :param fields: The fields to return in the response.
-         :return: A dictionary containing the search results.
-         """
-         params = {
-             "query": query,
-             "size": size,
-         }
-         if cursor:
-             params["cursor"] = cursor
-         if fields:
-             params["fields"] = ",".join(fields)
-         url = UNIPROT_BASE
-         return self._safe_get(url, params=params).json()
-
-     @staticmethod
-     def _safe_get(url: str, params: dict = None) -> requests.Response:
-         r = requests.get(
-             url,
-             params=params,
-             headers={"Accept": "application/json"},
-             timeout=10,
-         )
-         if not r.ok:
-             logger.error("Search engine error: %s", r.text)
-             raise HTTPException(r.status_code, "Search engine error.")
-         return r
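A minimal sketch of a keyword search against the live UniProt REST API; the query syntax and the primaryAccession result key are assumptions about UniProt's JSON, not guaranteed by the deleted code:

from graphgen.models import UniProtSearch

client = UniProtSearch()
page = client.search("p53 AND organism_id:9606", size=3)
for entry in page.get("results", []):
    print(entry.get("primaryAccession"))  # assumed key in the UniProt response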
hf-repo/graphgen/models/search/kg/__init__.py DELETED
File without changes
hf-repo/graphgen/models/search/kg/wiki_search.py DELETED
@@ -1,37 +0,0 @@
- from dataclasses import dataclass
- from typing import List, Union
-
- import wikipedia
- from wikipedia import set_lang
-
- from graphgen.utils import detect_main_language, logger
-
-
- @dataclass
- class WikiSearch:
-     @staticmethod
-     def set_language(language: str):
-         assert language in ["en", "zh"], "Only support English and Chinese"
-         set_lang(language)
-
-     async def search(self, query: str, num_results: int = 1) -> Union[List[str], None]:
-         self.set_language(detect_main_language(query))
-         return wikipedia.search(query, results=num_results, suggestion=False)
-
-     async def summary(self, query: str) -> Union[str, None]:
-         self.set_language(detect_main_language(query))
-         try:
-             result = wikipedia.summary(query, auto_suggest=False, redirect=False)
-         except wikipedia.exceptions.DisambiguationError as e:
-             logger.error("DisambiguationError: %s", e)
-             result = None
-         return result
-
-     async def page(self, query: str) -> Union[str, None]:
-         self.set_language(detect_main_language(query))
-         try:
-             result = wikipedia.page(query, auto_suggest=False, redirect=False).content
-         except wikipedia.exceptions.DisambiguationError as e:
-             logger.error("DisambiguationError: %s", e)
-             result = None
-         return result
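A minimal sketch of the Wikipedia client; it needs network access and the wikipedia PyPI package, and the language is picked automatically from the query:

import asyncio

from graphgen.models import WikiSearch

wiki = WikiSearch()
print(asyncio.run(wiki.search("knowledge graph", num_results=3)))  # candidate titles
print(asyncio.run(wiki.summary("Knowledge graph")))                # lead-section text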
hf-repo/graphgen/models/search/web/__init__.py DELETED
File without changes
hf-repo/graphgen/models/search/web/bing_search.py DELETED
@@ -1,43 +0,0 @@
- from dataclasses import dataclass
-
- import requests
- from fastapi import HTTPException
-
- from graphgen.utils import logger
-
- BING_SEARCH_V7_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"
- BING_MKT = "en-US"
-
-
- @dataclass
- class BingSearch:
-     """
-     Bing Search client to search with Bing.
-     """
-
-     subscription_key: str
-
-     def search(self, query: str, num_results: int = 1):
-         """
-         Search with Bing and return the contexts.
-         :param query: The search query.
-         :param num_results: The number of results to return.
-         :return: A list of search results.
-         """
-         params = {"q": query, "mkt": BING_MKT, "count": num_results}
-         response = requests.get(
-             BING_SEARCH_V7_ENDPOINT,
-             headers={"Ocp-Apim-Subscription-Key": self.subscription_key},
-             params=params,
-             timeout=10,
-         )
-         if not response.ok:
-             logger.error("Search engine error: %s", response.text)
-             raise HTTPException(response.status_code, "Search engine error.")
-         json_content = response.json()
-         try:
-             contexts = json_content["webPages"]["value"][:num_results]
-         except KeyError:
-             logger.error("Error encountered: %s", json_content)
-             return []
-         return contexts
hf-repo/graphgen/models/search/web/google_search.py DELETED
@@ -1,45 +0,0 @@
- from dataclasses import dataclass
-
- import requests
- from fastapi import HTTPException
-
- from graphgen.utils import logger
-
- GOOGLE_SEARCH_ENDPOINT = "https://customsearch.googleapis.com/customsearch/v1"
-
-
- @dataclass
- class GoogleSearch:
-     def __init__(self, subscription_key: str, cx: str):
-         """
-         Initialize the Google Search client with the subscription key and custom search engine ID.
-         :param subscription_key: Your Google API subscription key.
-         :param cx: Your custom search engine ID.
-         """
-         self.subscription_key = subscription_key
-         self.cx = cx
-
-     def search(self, query: str, num_results: int = 1):
-         """
-         Search with Google and return the contexts.
-         :param query: The search query.
-         :param num_results: The number of results to return.
-         :return: A list of search results.
-         """
-         params = {
-             "key": self.subscription_key,
-             "cx": self.cx,
-             "q": query,
-             "num": num_results,
-         }
-         response = requests.get(GOOGLE_SEARCH_ENDPOINT, params=params, timeout=10)
-         if not response.ok:
-             logger.error("Search engine error: %s", response.text)
-             raise HTTPException(response.status_code, "Search engine error.")
-         json_content = response.json()
-         try:
-             contexts = json_content["items"][:num_results]
-         except KeyError:
-             logger.error("Error encountered: %s", json_content)
-             return []
-         return contexts
hf-repo/graphgen/models/storage/__init__.py DELETED
File without changes
hf-repo/graphgen/models/storage/base_storage.py DELETED
@@ -1,115 +0,0 @@
- from dataclasses import dataclass
- from typing import Generic, TypeVar, Union
-
- from graphgen.models.embed.embedding import EmbeddingFunc
-
- T = TypeVar("T")
-
-
- @dataclass
- class StorageNameSpace:
-     working_dir: str = None
-     namespace: str = None
-
-     async def index_done_callback(self):
-         """commit the storage operations after indexing"""
-
-     async def query_done_callback(self):
-         """commit the storage operations after querying"""
-
-
- @dataclass
- class BaseListStorage(Generic[T], StorageNameSpace):
-     async def all_items(self) -> list[T]:
-         raise NotImplementedError
-
-     async def get_by_index(self, index: int) -> Union[T, None]:
-         raise NotImplementedError
-
-     async def append(self, data: T):
-         raise NotImplementedError
-
-     async def upsert(self, data: list[T]):
-         raise NotImplementedError
-
-     async def drop(self):
-         raise NotImplementedError
-
-
- @dataclass
- class BaseKVStorage(Generic[T], StorageNameSpace):
-     async def all_keys(self) -> list[str]:
-         raise NotImplementedError
-
-     async def get_by_id(self, id: str) -> Union[T, None]:
-         raise NotImplementedError
-
-     async def get_by_ids(
-         self, ids: list[str], fields: Union[set[str], None] = None
-     ) -> list[Union[T, None]]:
-         raise NotImplementedError
-
-     async def filter_keys(self, data: list[str]) -> set[str]:
-         """return the keys that do not yet exist in storage"""
-         raise NotImplementedError
-
-     async def upsert(self, data: dict[str, T]):
-         raise NotImplementedError
-
-     async def drop(self):
-         raise NotImplementedError
-
-
- @dataclass
- class BaseGraphStorage(StorageNameSpace):
-     embedding_func: EmbeddingFunc = None
-
-     async def has_node(self, node_id: str) -> bool:
-         raise NotImplementedError
-
-     async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
-         raise NotImplementedError
-
-     async def node_degree(self, node_id: str) -> int:
-         raise NotImplementedError
-
-     async def edge_degree(self, src_id: str, tgt_id: str) -> int:
-         raise NotImplementedError
-
-     async def get_node(self, node_id: str) -> Union[dict, None]:
-         raise NotImplementedError
-
-     async def update_node(self, node_id: str, node_data: dict[str, str]):
-         raise NotImplementedError
-
-     async def get_all_nodes(self) -> Union[list[dict], None]:
-         raise NotImplementedError
-
-     async def get_edge(
-         self, source_node_id: str, target_node_id: str
-     ) -> Union[dict, None]:
-         raise NotImplementedError
-
-     async def update_edge(
-         self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
-     ):
-         raise NotImplementedError
-
-     async def get_all_edges(self) -> Union[list[dict], None]:
-         raise NotImplementedError
-
-     async def get_node_edges(
-         self, source_node_id: str
-     ) -> Union[list[tuple[str, str]], None]:
-         raise NotImplementedError
-
-     async def upsert_node(self, node_id: str, node_data: dict[str, str]):
-         raise NotImplementedError
-
-     async def upsert_edge(
-         self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
-     ):
-         raise NotImplementedError
-
-     async def delete_node(self, node_id: str):
-         raise NotImplementedError
hf-repo/graphgen/models/storage/json_storage.py DELETED
@@ -1,87 +0,0 @@
- import os
- from dataclasses import dataclass
-
- from graphgen.models.storage.base_storage import BaseKVStorage, BaseListStorage
- from graphgen.utils import load_json, logger, write_json
-
-
- @dataclass
- class JsonKVStorage(BaseKVStorage):
-     _data: dict[str, str] = None
-
-     def __post_init__(self):
-         self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json")
-         self._data = load_json(self._file_name) or {}
-         logger.info("Load KV %s with %d data", self.namespace, len(self._data))
-
-     @property
-     def data(self):
-         return self._data
-
-     async def all_keys(self) -> list[str]:
-         return list(self._data.keys())
-
-     async def index_done_callback(self):
-         write_json(self._data, self._file_name)
-
-     async def get_by_id(self, id):
-         return self._data.get(id, None)
-
-     async def get_by_ids(self, ids, fields=None) -> list:
-         if fields is None:
-             return [self._data.get(id, None) for id in ids]
-         return [
-             (
-                 {k: v for k, v in self._data[id].items() if k in fields}
-                 if self._data.get(id, None)
-                 else None
-             )
-             for id in ids
-         ]
-
-     async def filter_keys(self, data: list[str]) -> set[str]:
-         return {s for s in data if s not in self._data}
-
-     async def upsert(self, data: dict):
-         left_data = {k: v for k, v in data.items() if k not in self._data}
-         self._data.update(left_data)
-         return left_data
-
-     async def drop(self):
-         self._data = {}
-
-
- @dataclass
- class JsonListStorage(BaseListStorage):
-     _data: list = None
-
-     def __post_init__(self):
-         self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json")
-         self._data = load_json(self._file_name) or []
-         logger.info("Load List %s with %d data", self.namespace, len(self._data))
-
-     @property
-     def data(self):
-         return self._data
-
-     async def all_items(self) -> list:
-         return self._data
-
-     async def index_done_callback(self):
-         write_json(self._data, self._file_name)
-
-     async def get_by_index(self, index: int):
-         if index < 0 or index >= len(self._data):
-             return None
-         return self._data[index]
-
-     async def append(self, data):
-         self._data.append(data)
-
-     async def upsert(self, data: list):
-         left_data = [d for d in data if d not in self._data]
-         self._data.extend(left_data)
-         return left_data
-
-     async def drop(self):
-         self._data = []
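A minimal sketch of the KV store's insert-once semantics, assuming a cache directory already exists:

import asyncio

from graphgen.models import JsonKVStorage

async def main():
    kv = JsonKVStorage(working_dir="cache", namespace="full_docs")
    new_keys = await kv.filter_keys(["doc-1", "doc-2"])  # keys not yet stored
    await kv.upsert({k: {"content": "..."} for k in new_keys})
    await kv.index_done_callback()  # persists to cache/full_docs.json

asyncio.run(main())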
hf-repo/graphgen/models/storage/networkx_storage.py DELETED
@@ -1,159 +0,0 @@
- import os
- import html
- from typing import Any, Union, cast, Optional
- from dataclasses import dataclass
- import networkx as nx
-
- from graphgen.utils import logger
- from .base_storage import BaseGraphStorage
-
- @dataclass
- class NetworkXStorage(BaseGraphStorage):
-     @staticmethod
-     def load_nx_graph(file_name) -> Optional[nx.Graph]:
-         if os.path.exists(file_name):
-             return nx.read_graphml(file_name)
-         return None
-
-     @staticmethod
-     def write_nx_graph(graph: nx.Graph, file_name):
-         logger.info("Writing graph with %d nodes, %d edges", graph.number_of_nodes(), graph.number_of_edges())
-         nx.write_graphml(graph, file_name)
-
-     @staticmethod
-     def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph:
-         """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
-         Return the largest connected component of the graph, with nodes and edges sorted in a stable way.
-         """
-         from graspologic.utils import largest_connected_component
-
-         graph = graph.copy()
-         graph = cast(nx.Graph, largest_connected_component(graph))
-         node_mapping = {
-             node: html.unescape(node.upper().strip()) for node in graph.nodes()
-         }  # type: ignore
-         graph = nx.relabel_nodes(graph, node_mapping)
-         return NetworkXStorage._stabilize_graph(graph)
-
-     @staticmethod
-     def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
-         """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
-         Ensure an undirected graph with the same relationships will always be read the same way.
-         Achieved by sorting the nodes and edges.
-         """
-         fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph()
-
-         sorted_nodes = graph.nodes(data=True)
-         sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0])
-
-         fixed_graph.add_nodes_from(sorted_nodes)
-         edges = list(graph.edges(data=True))
-
-         if not graph.is_directed():
-
-             def _sort_source_target(edge):
-                 source, target, edge_data = edge
-                 if source > target:
-                     source, target = target, source
-                 return source, target, edge_data
-
-             edges = [_sort_source_target(edge) for edge in edges]
-
-         def _get_edge_key(source: Any, target: Any) -> str:
-             return f"{source} -> {target}"
-
-         edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1]))
-
-         fixed_graph.add_edges_from(edges)
-         return fixed_graph
-
-     def __post_init__(self):
-         """
-         Load the graph file if it exists; otherwise create a new graph.
-         """
-         self._graphml_xml_file = os.path.join(
-             self.working_dir, f"{self.namespace}.graphml"
-         )
-         preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
-         if preloaded_graph is not None:
-             logger.info(
-                 "Loaded graph from %s with %d nodes, %d edges", self._graphml_xml_file,
-                 preloaded_graph.number_of_nodes(), preloaded_graph.number_of_edges()
-             )
-         self._graph = preloaded_graph or nx.Graph()
-
-     async def index_done_callback(self):
-         NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file)
-
-     async def has_node(self, node_id: str) -> bool:
-         return self._graph.has_node(node_id)
-
-     async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
-         return self._graph.has_edge(source_node_id, target_node_id)
-
-     async def get_node(self, node_id: str) -> Union[dict, None]:
-         return self._graph.nodes.get(node_id)
-
-     async def get_all_nodes(self) -> Union[list[dict], None]:
-         return list(self._graph.nodes(data=True))
-
-     async def node_degree(self, node_id: str) -> int:
-         return self._graph.degree(node_id)
-
-     async def edge_degree(self, src_id: str, tgt_id: str) -> int:
-         return self._graph.degree(src_id) + self._graph.degree(tgt_id)
-
-     async def get_edge(
-         self, source_node_id: str, target_node_id: str
-     ) -> Union[dict, None]:
-         return self._graph.edges.get((source_node_id, target_node_id))
-
-     async def get_all_edges(self) -> Union[list[dict], None]:
-         return list(self._graph.edges(data=True))
-
-     async def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str, dict]], None]:
-         if self._graph.has_node(source_node_id):
-             return list(self._graph.edges(source_node_id, data=True))
-         return None
-
-     async def get_graph(self) -> nx.Graph:
-         return self._graph
-
-     async def upsert_node(self, node_id: str, node_data: dict[str, str]):
-         self._graph.add_node(node_id, **node_data)
-
-     async def update_node(self, node_id: str, node_data: dict[str, str]):
-         if self._graph.has_node(node_id):
-             self._graph.nodes[node_id].update(node_data)
-         else:
-             logger.warning("Node %s not found in the graph for update.", node_id)
-
-     async def upsert_edge(
-         self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
-     ):
-         self._graph.add_edge(source_node_id, target_node_id, **edge_data)
-
-     async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]):
-         if self._graph.has_edge(source_node_id, target_node_id):
-             self._graph.edges[(source_node_id, target_node_id)].update(edge_data)
-         else:
-             logger.warning("Edge %s -> %s not found in the graph for update.", source_node_id, target_node_id)
-
-     async def delete_node(self, node_id: str):
-         """
-         Delete a node from the graph based on the specified node_id.
-
-         :param node_id: The node_id to delete
-         """
-         if self._graph.has_node(node_id):
-             self._graph.remove_node(node_id)
-             logger.info("Node %s deleted from the graph.", node_id)
-         else:
-             logger.warning("Node %s not found in the graph for deletion.", node_id)
-
-     async def clear(self):
-         """
-         Clear the graph by removing all nodes and edges.
-         """
-         self._graph.clear()
-         logger.info("Graph %s cleared.", self.namespace)
hf-repo/graphgen/models/strategy/__init__.py DELETED
File without changes
hf-repo/graphgen/models/strategy/base_strategy.py DELETED
@@ -1,5 +0,0 @@
- from dataclasses import dataclass
-
- @dataclass
- class BaseStrategy:
-     pass
hf-repo/graphgen/models/strategy/travserse_strategy.py DELETED
@@ -1,30 +0,0 @@
- from dataclasses import dataclass, fields
-
- from graphgen.models.strategy.base_strategy import BaseStrategy
-
-
- @dataclass
- class TraverseStrategy(BaseStrategy):
-     # Form of the generated QA pairs: atomic, multi-hop, or aggregated
-     qa_form: str = "atomic"  # "atomic" or "multi_hop" or "aggregated"
-     # Only one of the max-edge and max-token expansion methods takes effect
-     expand_method: str = "max_tokens"  # "max_width" or "max_tokens"
-     # Expand in one direction or in both directions
-     bidirectional: bool = True
-     # Maximum number of extra edges expanded in each direction
-     max_extra_edges: int = 5
-     # Maximum number of tokens
-     max_tokens: int = 256
-     # Maximum expansion depth in each direction
-     max_depth: int = 2
-     # Edge-selection strategy within a level (for bidirectional expansion, a level is the set of edges connecting the two sides)
-     edge_sampling: str = "max_loss"  # "max_loss" or "min_loss" or "random"
-     # How to handle isolated nodes
-     isolated_node_strategy: str = "add"  # "add" or "ignore"
-     loss_strategy: str = "only_edge"  # only_edge, both
-
-     def to_yaml(self):
-         strategy_dict = {}
-         for f in fields(self):
-             strategy_dict[f.name] = getattr(self, f.name)
-         return {"traverse_strategy": strategy_dict}
hf-repo/graphgen/models/text/__init__.py DELETED
File without changes
hf-repo/graphgen/models/text/chunk.py DELETED
@@ -1,7 +0,0 @@
- from dataclasses import dataclass
-
-
- @dataclass
- class Chunk:
-     id: str
-     content: str
hf-repo/graphgen/models/text/text_pair.py DELETED
@@ -1,9 +0,0 @@
- from dataclasses import dataclass
-
- @dataclass
- class TextPair:
-     """
-     A question-answer pair of input data.
-     """
-     question: str
-     answer: str
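Both text models are plain dataclasses, so constructing them is a one-liner each; the values below are purely illustrative:

```python
from graphgen.models.text.chunk import Chunk
from graphgen.models.text.text_pair import TextPair

chunk = Chunk(id="chunk-0001", content="NetworkX stores graphs as adjacency dicts.")
pair = TextPair(
    question="How does NetworkX store graphs?",
    answer="As adjacency dictionaries.",
)
```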
hf-repo/graphgen/models/vis/__init__.py DELETED
File without changes
hf-repo/graphgen/models/vis/community_visualizer.py DELETED
@@ -1,48 +0,0 @@
- from dataclasses import dataclass
- from typing import Dict
-
- import matplotlib.pyplot as plt
- import networkx as nx
-
-
- @dataclass
- class Visualizer:
-     """
-     Class for visualizing graphs using NetworkX and Matplotlib.
-     """
-
-     graph: nx.Graph = None
-     communities: Dict[str, int] = None
-     layout: str = "spring"
-     max_nodes: int = 1000
-     node_size: int = 10
-     alpha: float = 0.6
-
-     def visualize(self, save_path: str = None):
-         n = self.graph.number_of_nodes()
-         if self.layout == "spring":
-             k = max(0.1, 1.0 / (n**0.5))
-             pos = nx.spring_layout(self.graph, k=k, seed=42)
-         else:
-             raise ValueError(f"Unknown layout: {self.layout}")
-
-         plt.figure(figsize=(10, 10))
-
-         node_colors = [self.communities.get(node, 0) for node in self.graph.nodes()]
-
-         nx.draw_networkx_nodes(
-             self.graph,
-             pos,
-             node_size=self.node_size,
-             node_color=node_colors,
-             cmap=plt.cm.tab20,
-             alpha=self.alpha,
-         )
-         nx.draw_networkx_edges(self.graph, pos, alpha=0.3, width=0.2)
-         plt.axis("off")
-
-         if save_path:
-             plt.savefig(save_path, dpi=300, bbox_inches="tight")
-             print("Saved to", save_path)
-         else:
-             plt.show()
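A minimal sketch of driving the visualizer with a toy graph and a made-up node-to-community mapping; in the real pipeline, `communities` would come from the package's community detector rather than the modular arithmetic used here:

```python
import networkx as nx

from graphgen.models.vis.community_visualizer import Visualizer

# Toy graph and an illustrative node -> community id mapping.
g = nx.karate_club_graph()
communities = {node: node % 3 for node in g.nodes()}

vis = Visualizer(graph=g, communities=communities, node_size=30)
vis.visualize(save_path="communities.png")  # omit save_path to show interactively
```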
hf-repo/graphgen/operators/__init__.py DELETED
@@ -1,22 +0,0 @@
- from graphgen.operators.generate.generate_cot import generate_cot
- from graphgen.operators.kg.extract_kg import extract_kg
- from graphgen.operators.search.search_all import search_all
-
- from .judge import judge_statement
- from .quiz import quiz
- from .traverse_graph import (
-     traverse_graph_atomically,
-     traverse_graph_by_edge,
-     traverse_graph_for_multi_hop,
- )
-
- __all__ = [
-     "extract_kg",
-     "quiz",
-     "judge_statement",
-     "search_all",
-     "traverse_graph_by_edge",
-     "traverse_graph_atomically",
-     "traverse_graph_for_multi_hop",
-     "generate_cot",
- ]
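The names re-exported above were the package's public operator surface; downstream code would have pulled them straight from `graphgen.operators`. The import below uses only the names declared in `__all__` (their call signatures are not shown here and are not assumed):

```python
from graphgen.operators import (
    extract_kg,
    generate_cot,
    judge_statement,
    quiz,
    search_all,
    traverse_graph_atomically,
    traverse_graph_by_edge,
    traverse_graph_for_multi_hop,
)
```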