github-actions[bot] committed · Commit a8c3e2a · 1 Parent(s): bf63ef4

Auto-sync from demo at Tue Dec 23 13:00:55 UTC 2025
Files changed:
- graphgen/models/generator/aggregated_generator.py (+21 -16)
- graphgen/models/generator/atomic_generator.py (+10 -8)
- graphgen/models/generator/cot_generator.py (+20 -13)
- graphgen/models/generator/multi_hop_generator.py (+10 -8)
- graphgen/models/generator/vqa_generator.py (+16 -19)
- graphgen/models/llm/local/vllm_wrapper.py (+7 -5)
- graphgen/operators/generate/generate_service.py (+3 -0)
- graphgen/templates/generation/aggregated_generation.py (+33 -9)
- graphgen/templates/generation/atomic_generation.py (+28 -12)
- graphgen/templates/generation/cot_generation.py (+6 -6)
- graphgen/templates/generation/multi_hop_generation.py (+34 -22)
- graphgen/templates/generation/vqa_generation.py (+15 -12)
graphgen/models/generator/aggregated_generator.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import Any
+import re
+from typing import Any, Optional
 
 from graphgen.bases import BaseGenerator
 from graphgen.templates import AGGREGATED_GENERATION_PROMPT
@@ -56,19 +57,21 @@ class AggregatedGenerator(BaseGenerator):
         return prompt
 
     @staticmethod
-    def parse_rephrased_text(response: str) -> str:
+    def parse_rephrased_text(response: str) -> Optional[str]:
         """
         Parse the rephrased text from the response.
         :param response:
         :return: rephrased text
         """
-        …
-        rephrased_text …
+        rephrased_match = re.search(
+            r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL
+        )
+        if rephrased_match:
+            rephrased_text = rephrased_match.group(1).strip()
         else:
-            …
+            logger.warning("Failed to parse rephrased text from response: %s", response)
+            return None
+        return rephrased_text.strip('"').strip("'")
 
     @staticmethod
     def _build_prompt_for_question_generation(answer: str) -> str:
@@ -85,15 +88,13 @@ class AggregatedGenerator(BaseGenerator):
 
     @staticmethod
    def parse_response(response: str) -> dict:
-        …
-            question = response[len("问题:") :].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        if question_match:
+            question = question_match.group(1).strip()
         else:
-            question …
-        …
-        }
+            logger.warning("Failed to parse question from response: %s", response)
+            return {"question": ""}
+        return {"question": question.strip('"').strip("'")}
 
     async def generate(
         self,
@@ -110,9 +111,13 @@ class AggregatedGenerator(BaseGenerator):
         rephrasing_prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(rephrasing_prompt)
         context = self.parse_rephrased_text(response)
+        if not context:
+            return result
         question_generation_prompt = self._build_prompt_for_question_generation(context)
         response = await self.llm_client.generate_answer(question_generation_prompt)
         question = self.parse_response(response)["question"]
+        if not question:
+            return result
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", context)
         qa_pairs = {
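For reference, a minimal standalone sketch of the tag-based extraction introduced above (function name mirrors the committed parser; the sample strings are illustrative, not from the repo):

import re
from typing import Optional

def parse_rephrased_text(response: str) -> Optional[str]:
    # Same contract as the committed parser: pull the payload out of
    # <rephrased_text>...</rephrased_text>; None tells the caller to skip.
    match = re.search(r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL)
    if not match:
        return None
    return match.group(1).strip().strip('"').strip("'")

print(parse_rephrased_text("<rephrased_text>Paris is in France.</rephrased_text>"))  # Paris is in France.
print(parse_rephrased_text("Sorry, I can't help with that."))  # None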
graphgen/models/generator/atomic_generator.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -29,17 +30,18 @@ class AtomicGenerator(BaseGenerator):
         :param response:
         :return:
         """
-        …
-            question = …
-            answer = …
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        …
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
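The same pattern, exercised end to end. A sketch assuming only the regex contract from the diff (the sample responses are made up):

import re

def parse_qa(response: str) -> dict:
    # Both tags must be present; otherwise {} is returned, a falsy
    # result that the service layer later filters out.
    q = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
    a = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
    if not (q and a):
        return {}
    return {
        "question": q.group(1).strip().strip('"').strip("'"),
        "answer": a.group(1).strip().strip('"').strip("'"),
    }

print(parse_qa("<question>What does BG1 affect?</question><answer>Grain size.</answer>"))
# {'question': 'What does BG1 affect?', 'answer': 'Grain size.'}
print(parse_qa("Question: What does BG1 affect? Answer: Grain size."))  # {} (prose format no longer parses)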
graphgen/models/generator/cot_generator.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -67,22 +68,26 @@ class CoTGenerator(BaseGenerator):
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        …
+        """
+        Parse CoT template from response.
+        :param response:
+        :return: dict with question and reasoning_path
+        """
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        reasoning_path_match = re.search(
+            r"<reasoning_path>(.*?)</reasoning_path>", response, re.DOTALL
+        )
+
+        if question_match and reasoning_path_match:
+            question = question_match.group(1).strip()
+            reasoning_path = reasoning_path_match.group(1).strip()
         else:
-            logger.warning("Failed to parse …
+            logger.warning("Failed to parse response: %s", response)
             return {}
 
-        question = question.strip('"')
-        reasoning_path = reasoning_path.strip('"')
+        question = question.strip('"').strip("'")
+        reasoning_path = reasoning_path.strip('"').strip("'")
+
         logger.debug("CoT Question: %s", question)
         logger.debug("CoT Reasoning Path: %s", reasoning_path)
         return {
@@ -105,6 +110,8 @@
         prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(prompt)
         response = self.parse_response(response)
+        if not response:
+            return result
         question, reasoning_path = response["question"], response["reasoning_path"]
         prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
         cot_answer = await self.llm_client.generate_answer(prompt)
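One detail worth noting: every one of these searches passes re.DOTALL, because reasoning paths and answers routinely span multiple lines. A quick illustration (the sample text is invented):

import re

response = '''<question>Which vitamin do apples provide?</question>
<reasoning_path>
Step 1: anchor on the entity named in the question.
Step 2: follow the relation chain to the target attribute.
</reasoning_path>'''

pattern = r"<reasoning_path>(.*?)</reasoning_path>"
# Without re.DOTALL, '.' stops at newlines, so the multi-line body never matches.
print(re.search(pattern, response) is None)  # True
print(re.search(pattern, response, re.DOTALL).group(1).strip())  # the two steps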
graphgen/models/generator/multi_hop_generator.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -32,17 +33,18 @@ class MultiHopGenerator(BaseGenerator):
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        …
-            question = …
-            answer = …
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        …
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
graphgen/models/generator/vqa_generator.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -38,25 +39,21 @@ class VQAGenerator(BaseGenerator):
         :return: QA pairs
         """
         qa_pairs = {}
-        …
-            logger. …
-            qa_pairs[compute_content_hash(question)] = {
-                "question": question,
-                "answer": answer,
-            }
+        pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
+        matches = re.findall(pattern, response, re.DOTALL)
+
+        if matches:
+            for question, answer in matches:
+                question = question.strip().strip('"').strip("'")
+                answer = answer.strip().strip('"').strip("'")
+                logger.debug("Question: %s", question)
+                logger.debug("Answer: %s", answer)
+                qa_pairs[compute_content_hash(question)] = {
+                    "question": question,
+                    "answer": answer,
+                }
+        else:
+            logger.warning("Error parsing the response %s", response)
         return qa_pairs
 
     async def generate(
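Because a VQA response may contain several pairs, this parser uses re.findall over a combined pattern. A standalone sketch; compute_content_hash here is a stand-in for graphgen's helper of the same name, whose actual implementation may differ:

import hashlib
import re

def compute_content_hash(text: str) -> str:
    # Stand-in for graphgen's compute_content_hash.
    return hashlib.md5(text.encode("utf-8")).hexdigest()

response = (
    "<question>What fruit is shown?</question><answer>An apple.</answer>\n"
    "<question>Which vitamin does it contain?</question><answer>Vitamin C.</answer>"
)
pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
qa_pairs = {
    compute_content_hash(q.strip()): {"question": q.strip(), "answer": a.strip()}
    for q, a in re.findall(pattern, response, re.DOTALL)
}
print(len(qa_pairs))  # 2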
graphgen/models/llm/local/vllm_wrapper.py
CHANGED
@@ -16,7 +16,7 @@ class VLLMWrapper(BaseLLMWrapper):
         model: str,
         tensor_parallel_size: int = 1,
         gpu_memory_utilization: float = 0.9,
-        temperature: float = 0.…,
+        temperature: float = 0.6,
         top_p: float = 1.0,
         topk: int = 5,
         **kwargs: Any,
@@ -66,7 +66,7 @@ class VLLMWrapper(BaseLLMWrapper):
         sp = self.SamplingParams(
             temperature=self.temperature if self.temperature > 0 else 1.0,
             top_p=self.top_p if self.temperature > 0 else 1.0,
-            max_tokens=extra.get("max_new_tokens", …),
+            max_tokens=extra.get("max_new_tokens", 2048),
         )
 
         result_generator = self.engine.generate(full_prompt, sp, request_id=request_id)
@@ -82,7 +82,7 @@ class VLLMWrapper(BaseLLMWrapper):
 
     async def generate_topk_per_token(
         self, text: str, history: Optional[List[str]] = None, **extra: Any
-    ):
+    ) -> List[Token]:
         full_prompt = self._build_inputs(text, history)
         request_id = f"graphgen_topk_{uuid.uuid4()}"
 
@@ -110,7 +110,9 @@ class VLLMWrapper(BaseLLMWrapper):
 
             candidate_tokens = []
             for _, logprob_obj in top_logprobs.items():
-                tok_str = …
+                tok_str = (
+                    logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else ""
+                )
                 prob = float(math.exp(logprob_obj.logprob))
                 candidate_tokens.append(Token(tok_str, prob))
 
@@ -120,7 +122,7 @@ class VLLMWrapper(BaseLLMWrapper):
             main_token = Token(
                 text=candidate_tokens[0].text,
                 prob=candidate_tokens[0].prob,
-                top_candidates=candidate_tokens
+                top_candidates=candidate_tokens,
             )
             return [main_token]
         return []
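On the logprob handling above: vLLM reports per-candidate log-probabilities, and math.exp converts them back to probabilities before they are wrapped in Token objects. A minimal sketch with an assumed, simplified Token type (the real one lives in graphgen) and invented logprob values:

import math
from dataclasses import dataclass, field
from typing import List

@dataclass
class Token:
    # Simplified stand-in for graphgen's Token.
    text: str
    prob: float
    top_candidates: List["Token"] = field(default_factory=list)

logprobs = {"Paris": -0.105, "London": -2.41, "Rome": -3.9}  # invented values
candidates = [Token(tok.strip(), float(math.exp(lp))) for tok, lp in logprobs.items()]
main_token = Token(
    text=candidates[0].text,
    prob=candidates[0].prob,
    top_candidates=candidates,
)
print(main_token.text, round(main_token.prob, 2))  # Paris 0.9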
graphgen/operators/generate/generate_service.py
CHANGED
@@ -61,6 +61,9 @@ class GenerateService(BaseOperator):
             unit="batch",
         )
 
+        # Filter out empty results
+        results = [res for res in results if res]
+
         results = self.generator.format_generation_results(
             results, output_data_format=self.data_format
         )
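This filter is what ties the generator changes together: the parsers now return falsy values ({} or None) on failure, and those batches are dropped before formatting. A tiny illustration, with the per-batch result shape assumed for the example:

results = [{"question": "q1", "answer": "a1"}, {}, None]
results = [res for res in results if res]
print(results)  # [{'question': 'q1', 'answer': 'a1'}]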
graphgen/templates/generation/aggregated_generation.py
CHANGED
@@ -132,6 +132,8 @@ To generate a version of the text that is rephrased and conveys the same meaning
 - Logical consistency throughout
 - Clear cause-and-effect relationships
 
+**Attention: Please directly provide the rephrased text without any additional content or analysis.**
+
 ################
 -ENTITIES-
 ################
@@ -175,6 +177,8 @@ ANSWER_REPHRASING_ZH: str = """---角色---
 - 整体逻辑一致性
 - 清晰的因果关系
 
+**注意: 请你直接给出重述文本,不要输出任何额外的内容,也不要进行任何分析。**
+
 ################
 -实体-
 ################
@@ -191,6 +195,9 @@ REQUIREMENT_ZH = """
 ################
 请在下方直接输出连贯的重述文本,不要输出任何额外的内容。
 
+输出格式:
+<rephrased_text>rephrased_text_here</rephrased_text>
+
 重述文本:
 """
 
@@ -198,25 +205,42 @@ REQUIREMENT_EN = """
 ################
 Please directly output the coherent rephrased text below, without any additional content.
 
+Output format:
+<rephrased_text>rephrased_text_here</rephrased_text>
+
 Rephrased Text:
 """
 
 QUESTION_GENERATION_EN: str = """The answer to a question is provided. Please generate a question that corresponds to the answer.
 
-…
+The answer for which a question needs to be generated is as follows:
+<answer>{answer}</answer>
+
+Please note the following requirements:
+1. Only output one question text without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any fragments of it.
+3. The question must be independently understandable and fully match the answer.
+
+Output format:
+<question>question_text</question>
+
 Question:
 """
 
 QUESTION_GENERATION_ZH: str = """下面提供了一个问题的答案,请生成一个与答案对应的问题。
 
-…
+需要生成问题的答案如下:
+<answer>{answer}</answer>
+
+请注意下列要求:
+1. 仅输出一个问题文本,不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段
+3. 问题必须可独立理解且与答案完全匹配
+
+输出格式:
+<question>question_text</question>
+
+问题:
 """
 
 AGGREGATED_GENERATION_PROMPT = {
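The template and the parser now share a single format contract: the prompt pins the output tags, and AggregatedGenerator.parse_response extracts them with the matching regex. A condensed round-trip sketch (template abbreviated from the diff; the model reply is invented):

import re

QUESTION_GENERATION_EN = """The answer to a question is provided. Please generate a question that corresponds to the answer.

The answer for which a question needs to be generated is as follows:
<answer>{answer}</answer>

Output format:
<question>question_text</question>
"""

prompt = QUESTION_GENERATION_EN.format(answer="Vitamin C")
reply = "<question>Which vitamin is abundant in fruit?</question>"  # well-behaved model output
print(re.search(r"<question>(.*?)</question>", reply, re.DOTALL).group(1))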
graphgen/templates/generation/atomic_generation.py
CHANGED
@@ -1,28 +1,44 @@
 # pylint: disable=C0301
 TEMPLATE_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text.
-The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text.
-For example:
-Question: What is the effect of overexpressing the BG1 gene on grain size and development?
-Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development.
 
-…
+Please note the following requirements:
+1. Output only one QA pair without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any part of it.
+3. The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text.
+
+Output format:
+<question>question_text</question>
+<answer>answer_text</answer>
+
+For example:
+<question>What is the effect of overexpressing the BG1 gene on grain size and development?</question>
+<answer>Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development.</answer>
 
 Here is the text passage you need to generate a QA pair for:
 {context}
+
+Output:
 """
 
 TEMPLATE_ZH: str = """给定一个文本段落。你的任务是根据该文本的内容生成一个问答(QA)对。
-答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。
-例如:
-问题:过表达BG1基因对谷粒大小和发育有什么影响?
-答案:BG1基因的过表达显著增加了谷粒大小,表明其在谷物发育中的作用。
 
-…
+请注意下列要求:
+1. 仅输出一个问答(QA)对,不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段
+3. 答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。
+
+输出格式如下:
+<question>question_text</question>
+<answer>answer_text</answer>
+
+例如:
+<question>过表达BG1基因对谷粒大小和发育有什么影响?</question>
+<answer>BG1基因的过表达显著增加了谷粒大小,表明其在谷物发育中的作用。</answer>
 
 以下是你需要为其生成QA对的文本段落:
 {context}
+
+输出:
 """
graphgen/templates/generation/cot_generation.py
CHANGED
@@ -81,7 +81,7 @@ Input:
 Output:
 """
 
-COT_TEMPLATE_DESIGN_ZH = """你是一位“元推理架构师”。你的任务不是回答问题,\
+COT_TEMPLATE_DESIGN_ZH: str = """你是一位“元推理架构师”。你的任务不是回答问题,\
 而是根据给定的知识图谱中的实体和关系的名称以及描述信息,设计一条可复用、可泛化的 CoT 推理路径模板。\
 
 -步骤-
@@ -115,8 +115,8 @@ COT_TEMPLATE_DESIGN_ZH = """你是一位“元推理架构师”。你的任务
 4. 不要出现具体数值或结论,不要出现“识别实体”、“识别关系”这类无意义的操作描述。
 5. 使用中文作为输出语言。
 6. 输出格式为:
-…
+<question>问题文本</question>
+<reasoning_path>推理路径设计文本</reasoning_path>
 
 -真实数据-
 输入:
@@ -130,7 +130,7 @@ COT_TEMPLATE_DESIGN_ZH = """你是一位“元推理架构师”。你的任务
 """
 
 
-COT_TEMPLATE_DESIGN_EN = """You are a “meta-reasoning architect”. \
+COT_TEMPLATE_DESIGN_EN: str = """You are a “meta-reasoning architect”. \
 Your task is NOT to answer the question, but to design a reusable, generalizable CoT reasoning-path \
 template based solely on the names and descriptions of entities and \
 relationships in the provided knowledge graph.
@@ -168,8 +168,8 @@ relationships in the provided knowledge graph.
 and DO NOT describing meaningless operations like "Identify the entity" or "Identify the relationship".
 5. Use English as the output language.
 6. The output format is:
-…
+<question>question text</question>
+<reasoning_path>reasoning path design text</reasoning_path>
 
 Please summarize the information expressed by the knowledge graph based on the following [Entities:] and [Relationships:] provided.
 
graphgen/templates/generation/multi_hop_generation.py
CHANGED
@@ -1,56 +1,68 @@
 # pylint: disable=C0301
-TEMPLATE_ZH: str = """…
+TEMPLATE_ZH: str = """请基于以下知识子图生成多跳推理问题和答案。你将获得一个知识子图,其中包含一系列实体、关系和事实。
+你的任务是生成一个问答对,其中问题需要经过多次推理才能回答。问题的答案应该是从给定的知识子图中推断出来的。确保问题的难度适中,需要多步推理才能回答。
+
+请注意下列要求:
+1. 仅输出一个问答(QA)对,不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段,不要直接复制示例问题和答案
+3. 答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。
+
+输出格式:
+<question>question_text</question>
+<answer>answer_text</answer>
 
 例如:
-…
+输入为:
 --实体--
 1. 苹果
 2. 水果
 3. 维生素C
-########
 --关系--
 1. 苹果-水果:苹果是一种水果
 2. 水果-维生素C:水果中富含维生素C
-########
-问题:通过吃苹果补充的什么物质,有助于维持健康?
-答案:维生素C
-########
 
-…
+输出:
+<question>通过吃苹果补充的什么物质,有助于维持健康?</question>
+<answer>维生素C</answer>
+
+真实输入如下:
 --实体--
 {entities}
-#########
 --关系--
 {relationships}
-…
-…
+
+输出:
 """
 
-TEMPLATE_EN: str = """Please generate a multi-hop reasoning question and answer based on the following knowledge subgraph. You will be provided with a knowledge subgraph that contains a series of entities, relations, and facts.
+TEMPLATE_EN: str = """Please generate a multi-hop reasoning question and answer based on the following knowledge subgraph. You will be provided with a knowledge subgraph that contains a series of entities, relations, and facts.
+Your task is to generate a question-answer (QA) pair where the question requires multiple steps of reasoning to answer. The answer to the question should be inferred from the given knowledge subgraph. Ensure that the question is of moderate difficulty and requires multiple steps of reasoning to answer.
+
+Please note the following requirements:
+1. Output only one QA pair without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any part of it. Do not directly copy the example question and answer.
+3. The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text.
 
 For example:
-…
+Input:
 --Entities--
 1. Apple
 2. Fruit
 3. Vitamin C
-########
 --Relations--
 1. Apple-Fruit: Apple is a type of fruit
 2. Fruit-Vitamin C: Fruits are rich in Vitamin C
-########
-Question: What substance, obtained through eating apples, helps maintain health?
-Answer: Vitamin C
-########
 
-…
+Output:
+<question>What substance, obtained by eating apples, helps maintain health?</question>
+<answer>Vitamin C</answer>
+
+Real input:
 --Entities--
 {entities}
-########
 --Relations--
 {relationships}
-…
-Output
+
+Output:
 """
 
 MULTI_HOP_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH}
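For completeness, filling the rewritten template. The import path below simply mirrors the file location and is an assumption about the package layout; the entity/relation strings are illustrative:

from graphgen.templates.generation.multi_hop_generation import (
    MULTI_HOP_GENERATION_PROMPT,
)

entities = "1. Apple\n2. Fruit\n3. Vitamin C"
relationships = (
    "1. Apple-Fruit: Apple is a type of fruit\n"
    "2. Fruit-Vitamin C: Fruits are rich in Vitamin C"
)
prompt = MULTI_HOP_GENERATION_PROMPT["en"].format(
    entities=entities, relationships=relationships
)
print(prompt.splitlines()[0])  # first line of the filled prompt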
graphgen/templates/generation/vqa_generation.py
CHANGED
@@ -39,14 +39,16 @@ Create multiple sets of VQA question-answer pairs that satisfy the following:
 ################
 {relationships}
 ################
-Directly output the generated questions and answers, please do not directly copy the example questions and answers, and do not provide irrelevant information.
-Here is the response format you should follow:
-Question: <Question1>
-Answer: <Answer1>
 
-Question: <Question2>
-Answer: <Answer2>
+Please directly output the generated questions and answers, do not directly copy the example questions and answers, and do not provide irrelevant information.
+
+Here is the response format you should follow:
+<question>question1</question>
+<answer>answer1</answer>
+<question>question2</question>
+<answer>answer2</answer>
 
+Output:
 """
 
 TEMPLATE_ZH: str = """---角色---
@@ -91,14 +93,15 @@ TEMPLATE_ZH: str = """---角色---
 ################
 {relationships}
 ################
-直接输出生成的问题和答案,请不要直接复制示例问题和答案,不要输出无关内容。
-以下是你应该遵循的响应格式:
-问题: <问题1>
-答案: <答案1>
 
-问题: <问题2>
-答案: <答案2>
+请直接输出生成的问题和答案,不要直接复制示例问题和答案,也不要提供无关信息。
+以下是你应遵循的响应格式:
+<question>question1</question>
+<answer>answer1</answer>
+<question>question2</question>
+<answer>answer2</answer>
 
+输出:
 """
 
 VQA_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH}