github-actions[bot] committed · Commit a8c3e2a · 1 Parent(s): bf63ef4

Auto-sync from demo at Tue Dec 23 13:00:55 UTC 2025
Files changed:
- graphgen/models/generator/aggregated_generator.py (+21 -16)
- graphgen/models/generator/atomic_generator.py (+10 -8)
- graphgen/models/generator/cot_generator.py (+20 -13)
- graphgen/models/generator/multi_hop_generator.py (+10 -8)
- graphgen/models/generator/vqa_generator.py (+16 -19)
- graphgen/models/llm/local/vllm_wrapper.py (+7 -5)
- graphgen/operators/generate/generate_service.py (+3 -0)
- graphgen/templates/generation/aggregated_generation.py (+33 -9)
- graphgen/templates/generation/atomic_generation.py (+28 -12)
- graphgen/templates/generation/cot_generation.py (+6 -6)
- graphgen/templates/generation/multi_hop_generation.py (+34 -22)
- graphgen/templates/generation/vqa_generation.py (+15 -12)
graphgen/models/generator/aggregated_generator.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import Any
+import re
+from typing import Any, Optional
 
 from graphgen.bases import BaseGenerator
 from graphgen.templates import AGGREGATED_GENERATION_PROMPT
@@ -56,19 +57,21 @@ class AggregatedGenerator(BaseGenerator):
         return prompt
 
     @staticmethod
-    def parse_rephrased_text(response: str) -> str:
+    def parse_rephrased_text(response: str) -> Optional[str]:
         """
         Parse the rephrased text from the response.
         :param response:
         :return: rephrased text
         """
-        …
-        rephrased_text …
+        rephrased_match = re.search(
+            r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL
+        )
+        if rephrased_match:
+            rephrased_text = rephrased_match.group(1).strip()
         else:
-            …
+            logger.warning("Failed to parse rephrased text from response: %s", response)
+            return None
+        return rephrased_text.strip('"').strip("'")
 
     @staticmethod
     def _build_prompt_for_question_generation(answer: str) -> str:
@@ -85,15 +88,13 @@ class AggregatedGenerator(BaseGenerator):
 
     @staticmethod
    def parse_response(response: str) -> dict:
-        …
-            question = response[len("问题:") :].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        if question_match:
+            question = question_match.group(1).strip()
         else:
-            question …
-        …
-        }
+            logger.warning("Failed to parse question from response: %s", response)
+            return {"question": ""}
+        return {"question": question.strip('"').strip("'")}
 
     async def generate(
         self,
@@ -110,9 +111,13 @@ class AggregatedGenerator(BaseGenerator):
         rephrasing_prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(rephrasing_prompt)
         context = self.parse_rephrased_text(response)
+        if not context:
+            return result
         question_generation_prompt = self._build_prompt_for_question_generation(context)
         response = await self.llm_client.generate_answer(question_generation_prompt)
         question = self.parse_response(response)["question"]
+        if not question:
+            return result
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", context)
         qa_pairs = {
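For reference, a minimal standalone sketch of the tag-based extraction introduced above (function name mirrors the committed parser; the sample strings are illustrative, not from the repo):

import re
from typing import Optional

def parse_rephrased_text(response: str) -> Optional[str]:
    # Same contract as the committed parser: pull the payload out of
    # <rephrased_text>...</rephrased_text>; None tells the caller to skip.
    match = re.search(r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL)
    if not match:
        return None
    return match.group(1).strip().strip('"').strip("'")

print(parse_rephrased_text("<rephrased_text>Paris is in France.</rephrased_text>"))  # Paris is in France.
print(parse_rephrased_text("Sorry, I can't help with that."))  # None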
graphgen/models/generator/atomic_generator.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -29,17 +30,18 @@ class AtomicGenerator(BaseGenerator):
         :param response:
         :return:
         """
-        …
-            question = …
-            answer = …
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        …
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
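The same pattern, exercised end to end. A sketch assuming only the regex contract from the diff (the sample responses are made up):

import re

def parse_qa(response: str) -> dict:
    # Both tags must be present; otherwise {} is returned, a falsy
    # result that the service layer later filters out.
    q = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
    a = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
    if not (q and a):
        return {}
    return {
        "question": q.group(1).strip().strip('"').strip("'"),
        "answer": a.group(1).strip().strip('"').strip("'"),
    }

print(parse_qa("<question>What does BG1 affect?</question><answer>Grain size.</answer>"))
# {'question': 'What does BG1 affect?', 'answer': 'Grain size.'}
print(parse_qa("Question: What does BG1 affect? Answer: Grain size."))  # {} (prose format no longer parses)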
graphgen/models/generator/cot_generator.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -67,22 +68,26 @@ class CoTGenerator(BaseGenerator):
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        …
+        """
+        Parse CoT template from response.
+        :param response:
+        :return: dict with question and reasoning_path
+        """
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        reasoning_path_match = re.search(
+            r"<reasoning_path>(.*?)</reasoning_path>", response, re.DOTALL
+        )
+
+        if question_match and reasoning_path_match:
+            question = question_match.group(1).strip()
+            reasoning_path = reasoning_path_match.group(1).strip()
         else:
-            logger.warning("Failed to parse …
+            logger.warning("Failed to parse response: %s", response)
             return {}
 
-        question = question.strip('"')
-        reasoning_path = reasoning_path.strip('"')
+        question = question.strip('"').strip("'")
+        reasoning_path = reasoning_path.strip('"').strip("'")
+
         logger.debug("CoT Question: %s", question)
         logger.debug("CoT Reasoning Path: %s", reasoning_path)
         return {
@@ -105,6 +110,8 @@
         prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(prompt)
         response = self.parse_response(response)
+        if not response:
+            return result
         question, reasoning_path = response["question"], response["reasoning_path"]
         prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
         cot_answer = await self.llm_client.generate_answer(prompt)
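One detail worth noting: every one of these searches passes re.DOTALL, because reasoning paths and answers routinely span multiple lines. A quick illustration (the sample text is invented):

import re

response = '''<question>Which vitamin do apples provide?</question>
<reasoning_path>
Step 1: anchor on the entity named in the question.
Step 2: follow the relation chain to the target attribute.
</reasoning_path>'''

pattern = r"<reasoning_path>(.*?)</reasoning_path>"
# Without re.DOTALL, '.' stops at newlines, so the multi-line body never matches.
print(re.search(pattern, response) is None)  # True
print(re.search(pattern, response, re.DOTALL).group(1).strip())  # the two steps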
graphgen/models/generator/multi_hop_generator.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -32,17 +33,18 @@ class MultiHopGenerator(BaseGenerator):
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        …
-            question = …
-            answer = …
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        …
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
graphgen/models/generator/vqa_generator.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -38,25 +39,21 @@ class VQAGenerator(BaseGenerator):
         :return: QA pairs
         """
         qa_pairs = {}
-        …
-            logger. …
-            qa_pairs[compute_content_hash(question)] = {
-                "question": question,
-                "answer": answer,
-            }
+        pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
+        matches = re.findall(pattern, response, re.DOTALL)
+
+        if matches:
+            for question, answer in matches:
+                question = question.strip().strip('"').strip("'")
+                answer = answer.strip().strip('"').strip("'")
+                logger.debug("Question: %s", question)
+                logger.debug("Answer: %s", answer)
+                qa_pairs[compute_content_hash(question)] = {
+                    "question": question,
+                    "answer": answer,
+                }
+        else:
+            logger.warning("Error parsing the response %s", response)
         return qa_pairs
 
     async def generate(
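Because a VQA response may contain several pairs, this parser uses re.findall over a combined pattern. A standalone sketch; compute_content_hash here is a stand-in for graphgen's helper of the same name, whose actual implementation may differ:

import hashlib
import re

def compute_content_hash(text: str) -> str:
    # Stand-in for graphgen's compute_content_hash.
    return hashlib.md5(text.encode("utf-8")).hexdigest()

response = (
    "<question>What fruit is shown?</question><answer>An apple.</answer>\n"
    "<question>Which vitamin does it contain?</question><answer>Vitamin C.</answer>"
)
pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
qa_pairs = {
    compute_content_hash(q.strip()): {"question": q.strip(), "answer": a.strip()}
    for q, a in re.findall(pattern, response, re.DOTALL)
}
print(len(qa_pairs))  # 2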
graphgen/models/llm/local/vllm_wrapper.py
CHANGED
@@ -16,7 +16,7 @@ class VLLMWrapper(BaseLLMWrapper):
         model: str,
         tensor_parallel_size: int = 1,
         gpu_memory_utilization: float = 0.9,
-        temperature: float = 0.…,
+        temperature: float = 0.6,
         top_p: float = 1.0,
         topk: int = 5,
         **kwargs: Any,
@@ -66,7 +66,7 @@ class VLLMWrapper(BaseLLMWrapper):
         sp = self.SamplingParams(
             temperature=self.temperature if self.temperature > 0 else 1.0,
             top_p=self.top_p if self.temperature > 0 else 1.0,
-            max_tokens=extra.get("max_new_tokens", …),
+            max_tokens=extra.get("max_new_tokens", 2048),
         )
 
         result_generator = self.engine.generate(full_prompt, sp, request_id=request_id)
@@ -82,7 +82,7 @@ class VLLMWrapper(BaseLLMWrapper):
 
     async def generate_topk_per_token(
         self, text: str, history: Optional[List[str]] = None, **extra: Any
-    ):
+    ) -> List[Token]:
         full_prompt = self._build_inputs(text, history)
         request_id = f"graphgen_topk_{uuid.uuid4()}"
 
@@ -110,7 +110,9 @@ class VLLMWrapper(BaseLLMWrapper):
 
             candidate_tokens = []
             for _, logprob_obj in top_logprobs.items():
-                tok_str = …
+                tok_str = (
+                    logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else ""
+                )
                 prob = float(math.exp(logprob_obj.logprob))
                 candidate_tokens.append(Token(tok_str, prob))
 
@@ -120,7 +122,7 @@ class VLLMWrapper(BaseLLMWrapper):
             main_token = Token(
                 text=candidate_tokens[0].text,
                 prob=candidate_tokens[0].prob,
-                top_candidates=candidate_tokens
+                top_candidates=candidate_tokens,
             )
             return [main_token]
         return []
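On the logprob handling above: vLLM reports per-candidate log-probabilities, and math.exp converts them back to probabilities before they are wrapped in Token objects. A minimal sketch with an assumed, simplified Token type (the real one lives in graphgen) and invented logprob values:

import math
from dataclasses import dataclass, field
from typing import List

@dataclass
class Token:
    # Simplified stand-in for graphgen's Token.
    text: str
    prob: float
    top_candidates: List["Token"] = field(default_factory=list)

logprobs = {"Paris": -0.105, "London": -2.41, "Rome": -3.9}  # invented values
candidates = [Token(tok.strip(), float(math.exp(lp))) for tok, lp in logprobs.items()]
main_token = Token(
    text=candidates[0].text,
    prob=candidates[0].prob,
    top_candidates=candidates,
)
print(main_token.text, round(main_token.prob, 2))  # Paris 0.9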
graphgen/operators/generate/generate_service.py
CHANGED
@@ -61,6 +61,9 @@ class GenerateService(BaseOperator):
             unit="batch",
         )
 
+        # Filter out empty results
+        results = [res for res in results if res]
+
         results = self.generator.format_generation_results(
             results, output_data_format=self.data_format
         )
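This filter is what ties the generator changes together: the parsers now return falsy values ({} or None) on failure, and those batches are dropped before formatting. A tiny illustration, with the per-batch result shape assumed for the example:

results = [{"question": "q1", "answer": "a1"}, {}, None]
results = [res for res in results if res]
print(results)  # [{'question': 'q1', 'answer': 'a1'}]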
graphgen/templates/generation/aggregated_generation.py
CHANGED
@@ -132,6 +132,8 @@ To generate a version of the text that is rephrased and conveys the same meaning
 - Logical consistency throughout
 - Clear cause-and-effect relationships
 
+**Attention: Please directly provide the rephrased text without any additional content or analysis.**
+
 ################
 -ENTITIES-
 ################
@@ -175,6 +177,8 @@ ANSWER_REPHRASING_ZH: str = """---角色---
 - 整体逻辑一致性
 - 清晰的因果关系
 
+**注意: 请你直接给出重述文本,不要输出任何额外的内容,也不要进行任何分析。**
+
 ################
 -实体-
 ################
@@ -191,6 +195,9 @@ REQUIREMENT_ZH = """
 ################
 请在下方直接输出连贯的重述文本,不要输出任何额外的内容。
 
+输出格式:
+<rephrased_text>rephrased_text_here</rephrased_text>
+
 重述文本:
 """
 
@@ -198,25 +205,42 @@ REQUIREMENT_EN = """
 ################
 Please directly output the coherent rephrased text below, without any additional content.
 
+Output format:
+<rephrased_text>rephrased_text_here</rephrased_text>
+
 Rephrased Text:
 """
 
 QUESTION_GENERATION_EN: str = """The answer to a question is provided. Please generate a question that corresponds to the answer.
 
-…
+The answer for which a question needs to be generated is as follows:
+<answer>{answer}</answer>
+
+Please note the following requirements:
+1. Only output one question text without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any fragments of it.
+3. The question must be independently understandable and fully match the answer.
+
+Output format:
+<question>question_text</question>
+
 Question:
 """
 
 QUESTION_GENERATION_ZH: str = """下面提供了一个问题的答案,请生成一个与答案对应的问题。
 
-…
+需要生成问题的答案如下:
+<answer>{answer}</answer>
+
+请注意下列要求:
+1. 仅输出一个问题文本,不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段
+3. 问题必须可独立理解且与答案完全匹配
+
+输出格式:
+<question>question_text</question>
+
+问题:
 """
 
 AGGREGATED_GENERATION_PROMPT = {
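The template and the parser now share a single format contract: the prompt pins the output tags, and AggregatedGenerator.parse_response extracts them with the matching regex. A condensed round-trip sketch (template abbreviated from the diff; the model reply is invented):

import re

QUESTION_GENERATION_EN = """The answer to a question is provided. Please generate a question that corresponds to the answer.

The answer for which a question needs to be generated is as follows:
<answer>{answer}</answer>

Output format:
<question>question_text</question>
"""

prompt = QUESTION_GENERATION_EN.format(answer="Vitamin C")
reply = "<question>Which vitamin is abundant in fruit?</question>"  # well-behaved model output
print(re.search(r"<question>(.*?)</question>", reply, re.DOTALL).group(1))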
graphgen/templates/generation/atomic_generation.py
CHANGED
@@ -1,28 +1,44 @@
 # pylint: disable=C0301
 TEMPLATE_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text.
-The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text.
-For example:
-Question: What is the effect of overexpressing the BG1 gene on grain size and development?
-Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development.
 
-…
+Please note the following requirements:
+1. Output only one QA pair without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any part of it.
+3. The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text.
+
+Output format:
+<question>question_text</question>
+<answer>answer_text</answer>
+
+For example:
+<question>What is the effect of overexpressing the BG1 gene on grain size and development?</question>
+<answer>Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development.</answer>
 
 Here is the text passage you need to generate a QA pair for:
 {context}
+
+Output:
 """
 
 TEMPLATE_ZH: str = """给定一个文本段落。你的任务是根据该文本的内容生成一个问答(QA)对。
-答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。
-例如:
-问题:过表达BG1基因对谷粒大小和发育有什么影响?
-答案:BG1基因的过表达显著增加了谷粒大小,表明其在谷物发育中的作用。
 
-…
+请注意下列要求:
+1. 仅输出一个问答(QA)对,不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段
+3. 答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。
+
+输出格式如下:
+<question>question_text</question>
+<answer>answer_text</answer>
+
+例如:
+<question>过表达BG1基因对谷粒大小和发育有什么影响?</question>
+<answer>BG1基因的过表达显著增加了谷粒大小,表明其在谷物发育中的作用。</answer>
 
 以下是你需要为其生成QA对的文本段落:
 {context}
+
+输出:
 """
graphgen/templates/generation/cot_generation.py
CHANGED
@@ -81,7 +81,7 @@ Input:
 Output:
 """
 
-COT_TEMPLATE_DESIGN_ZH = """你是一位“元推理架构师”。你的任务不是回答问题,\
+COT_TEMPLATE_DESIGN_ZH: str = """你是一位“元推理架构师”。你的任务不是回答问题,\
 而是根据给定的知识图谱中的实体和关系的名称以及描述信息,设计一条可复用、可泛化的 CoT 推理路径模板。\
 
 -步骤-
@@ -115,8 +115,8 @@ COT_TEMPLATE_DESIGN_ZH = """你是一位“元推理架构师”。你的任务
 4. 不要出现具体数值或结论,不要出现“识别实体”、“识别关系”这类无意义的操作描述。
 5. 使用中文作为输出语言。
 6. 输出格式为:
-…
+<question>问题文本</question>
+<reasoning_path>推理路径设计文本</reasoning_path>
 
 -真实数据-
 输入:
@@ -130,7 +130,7 @@ COT_TEMPLATE_DESIGN_ZH = """你是一位“元推理架构师”。你的任务
 """
 
 
-COT_TEMPLATE_DESIGN_EN = """You are a “meta-reasoning architect”. \
+COT_TEMPLATE_DESIGN_EN: str = """You are a “meta-reasoning architect”. \
 Your task is NOT to answer the question, but to design a reusable, generalizable CoT reasoning-path \
 template based solely on the names and descriptions of entities and \
 relationships in the provided knowledge graph.
@@ -168,8 +168,8 @@ relationships in the provided knowledge graph.
 and DO NOT describing meaningless operations like "Identify the entity" or "Identify the relationship".
 5. Use English as the output language.
 6. The output format is:
-…
+<question>question text</question>
+<reasoning_path>reasoning path design text</reasoning_path>
 
 Please summarize the information expressed by the knowledge graph based on the following [Entities:] and [Relationships:] provided.
 
graphgen/templates/generation/multi_hop_generation.py
CHANGED
@@ -1,56 +1,68 @@
 # pylint: disable=C0301
-TEMPLATE_ZH: str = """…
+TEMPLATE_ZH: str = """请基于以下知识子图生成多跳推理问题和答案。你将获得一个知识子图,其中包含一系列实体、关系和事实。
+你的任务是生成一个问答对,其中问题需要经过多次推理才能回答。问题的答案应该是从给定的知识子图中推断出来的。确保问题的难度适中,需要多步推理才能回答。
+
+请注意下列要求:
+1. 仅输出一个问答(QA)对,不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段,不要直接复制示例问题和答案
+3. 答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。
+
+输出格式:
+<question>question_text</question>
+<answer>answer_text</answer>
 
 例如:
-…
+输入为:
 --实体--
 1. 苹果
 2. 水果
 3. 维生素C
-########
 --关系--
 1. 苹果-水果:苹果是一种水果
 2. 水果-维生素C:水果中富含维生素C
-########
-问题:通过吃苹果补充的什么物质,有助于维持健康?
-答案:维生素C
-########
 
-…
+输出:
+<question>通过吃苹果补充的什么物质,有助于维持健康?</question>
+<answer>维生素C</answer>
+
+真实输入如下:
 --实体--
 {entities}
-#########
 --关系--
 {relationships}
-…
-…
+
+输出:
 """
 
-TEMPLATE_EN: str = """Please generate a multi-hop reasoning question and answer based on the following knowledge subgraph. You will be provided with a knowledge subgraph that contains a series of entities, relations, and facts.
+TEMPLATE_EN: str = """Please generate a multi-hop reasoning question and answer based on the following knowledge subgraph. You will be provided with a knowledge subgraph that contains a series of entities, relations, and facts.
+Your task is to generate a question-answer (QA) pair where the question requires multiple steps of reasoning to answer. The answer to the question should be inferred from the given knowledge subgraph. Ensure that the question is of moderate difficulty and requires multiple steps of reasoning to answer.
+
+Please note the following requirements:
+1. Output only one QA pair without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any part of it. Do not directly copy the example question and answer.
+3. The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text.
 
 For example:
-…
+Input:
 --Entities--
 1. Apple
 2. Fruit
 3. Vitamin C
-########
 --Relations--
 1. Apple-Fruit: Apple is a type of fruit
 2. Fruit-Vitamin C: Fruits are rich in Vitamin C
-########
-Question: What substance, obtained through eating apples, helps maintain health?
-Answer: Vitamin C
-########
 
-…
+Output:
+<question>What substance, obtained by eating apples, helps maintain health?</question>
+<answer>Vitamin C</answer>
+
+Real input:
 --Entities--
 {entities}
-########
 --Relations--
 {relationships}
-…
-Output
+
+Output:
 """
 
 MULTI_HOP_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH}
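For completeness, filling the rewritten template. The import path below simply mirrors the file location and is an assumption about the package layout; the entity/relation strings are illustrative:

from graphgen.templates.generation.multi_hop_generation import (
    MULTI_HOP_GENERATION_PROMPT,
)

entities = "1. Apple\n2. Fruit\n3. Vitamin C"
relationships = (
    "1. Apple-Fruit: Apple is a type of fruit\n"
    "2. Fruit-Vitamin C: Fruits are rich in Vitamin C"
)
prompt = MULTI_HOP_GENERATION_PROMPT["en"].format(
    entities=entities, relationships=relationships
)
print(prompt.splitlines()[0])  # first line of the filled prompt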
graphgen/templates/generation/vqa_generation.py
CHANGED
@@ -39,14 +39,16 @@ Create multiple sets of VQA question-answer pairs that satisfy the following:
 ################
 {relationships}
 ################
-Directly output the generated questions and answers, please do not directly copy the example questions and answers, and do not provide irrelevant information.
-Here is the response format you should follow:
-Question: <Question1>
-Answer: <Answer1>
 
-Question: <Question2>
-Answer: <Answer2>
+Please directly output the generated questions and answers, do not directly copy the example questions and answers, and do not provide irrelevant information.
+
+Here is the response format you should follow:
+<question>question1</question>
+<answer>answer1</answer>
+<question>question2</question>
+<answer>answer2</answer>
 
+Output:
 """
 
 TEMPLATE_ZH: str = """---角色---
@@ -91,14 +93,15 @@ TEMPLATE_ZH: str = """---角色---
 ################
 {relationships}
 ################
-直接输出生成的问题和答案,请不要直接复制示例问题和答案,不要输出无关内容。
-以下是你应该遵循的响应格式:
-问题: <问题1>
-答案: <答案1>
 
-问题: <问题2>
-答案: <答案2>
+请直接输出生成的问题和答案,不要直接复制示例问题和答案,也不要提供无关信息。
+以下是你应遵循的响应格式:
+<question>question1</question>
+<answer>answer1</answer>
+<question>question2</question>
+<answer>answer2</answer>
 
+输出:
 """
 
 VQA_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH}