Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
5cb2e67
1
Parent(s):
b1922c7
Auto-sync from demo at Thu Jan 15 11:07:22 UTC 2026
Browse files- graphgen/bases/base_generator.py +44 -35
- graphgen/models/__init__.py +3 -0
- graphgen/models/generator/__init__.py +3 -0
- graphgen/models/generator/fill_in_blank_generator.py +99 -0
- graphgen/models/generator/multi_answer_generator.py +118 -0
- graphgen/models/generator/multi_choice_generator.py +118 -0
- graphgen/operators/generate/generate_service.py +33 -8
- graphgen/templates/__init__.py +3 -1
- graphgen/templates/generation/__init__.py +3 -0
- graphgen/templates/generation/classification_generation.py +0 -0
- graphgen/templates/generation/fill_in_blank_generation.py +78 -0
- graphgen/templates/generation/multi_answer_generation.py +100 -0
- graphgen/templates/generation/multi_choice_generation.py +97 -0
- graphgen/templates/question_generation.py +0 -32
graphgen/bases/base_generator.py
CHANGED
|
@@ -46,38 +46,47 @@ class BaseGenerator(ABC):
|
|
| 46 |
def format_generation_results(
|
| 47 |
results: list[dict], output_data_format: str
|
| 48 |
) -> list[dict[str, Any]]:
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
{
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
def format_generation_results(
    results: list[dict], output_data_format: str
) -> list[dict[str, Any]]:
    """
    Flatten parsed QA results into records of the requested training-data format.

    :param results: list of dicts; every value of each dict is a QA dict that may
        carry "question", "answer" and, optionally, "options" (letter -> text).
        The dict keys themselves are ignored.
    :param output_data_format: "Alpaca", "Sharegpt" or "ChatML"
    :return: one formatted record per QA dict, in input order
    :raises ValueError: when a QA item is reached and the format is not one of
        the three supported names (an empty ``results`` therefore never raises)
    """

    def _alpaca(question: str, answer: str) -> dict[str, Any]:
        return {"instruction": question, "input": "", "output": answer}

    def _sharegpt(question: str, answer: str) -> dict[str, Any]:
        return {
            "conversations": [
                {"from": "human", "value": question},
                {"from": "gpt", "value": answer},
            ]
        }

    def _chatml(question: str, answer: str) -> dict[str, Any]:
        return {
            "messages": [
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ]
        }

    # One builder per supported format; resolved once instead of re-testing the
    # format name for every QA item.
    builders = {"Alpaca": _alpaca, "Sharegpt": _sharegpt, "ChatML": _chatml}
    build_record = builders.get(output_data_format)

    flat_results = []
    for item in results:
        for qa_data in item.values():
            question = qa_data.get("question", "")
            answer = qa_data.get("answer", "")

            # Choice-style questions: append the options, ordered by their letter,
            # so the model sees them as part of the question text.
            options = qa_data.get("options")
            if options:
                options_str = "\n".join(
                    f"{key}. {options[key]}" for key in sorted(options)
                )
                question += f"\nOptions:\n{options_str}"

            if build_record is None:
                raise ValueError(
                    f"Unknown output data format: {output_data_format}"
                )
            flat_results.append(build_record(question, answer))
    return flat_results
|
graphgen/models/__init__.py
CHANGED
|
@@ -11,6 +11,9 @@ from .generator import (
|
|
| 11 |
AggregatedGenerator,
|
| 12 |
AtomicGenerator,
|
| 13 |
CoTGenerator,
|
|
|
|
|
|
|
|
|
|
| 14 |
MultiHopGenerator,
|
| 15 |
QuizGenerator,
|
| 16 |
VQAGenerator,
|
|
|
|
| 11 |
AggregatedGenerator,
|
| 12 |
AtomicGenerator,
|
| 13 |
CoTGenerator,
|
| 14 |
+
FillInBlankGenerator,
|
| 15 |
+
MultiAnswerGenerator,
|
| 16 |
+
MultiChoiceGenerator,
|
| 17 |
MultiHopGenerator,
|
| 18 |
QuizGenerator,
|
| 19 |
VQAGenerator,
|
graphgen/models/generator/__init__.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
| 1 |
from .aggregated_generator import AggregatedGenerator
|
| 2 |
from .atomic_generator import AtomicGenerator
|
| 3 |
from .cot_generator import CoTGenerator
|
|
|
|
|
|
|
|
|
|
| 4 |
from .multi_hop_generator import MultiHopGenerator
|
| 5 |
from .quiz_generator import QuizGenerator
|
| 6 |
from .vqa_generator import VQAGenerator
|
|
|
|
| 1 |
from .aggregated_generator import AggregatedGenerator
|
| 2 |
from .atomic_generator import AtomicGenerator
|
| 3 |
from .cot_generator import CoTGenerator
|
| 4 |
+
from .fill_in_blank_generator import FillInBlankGenerator
|
| 5 |
+
from .multi_answer_generator import MultiAnswerGenerator
|
| 6 |
+
from .multi_choice_generator import MultiChoiceGenerator
|
| 7 |
from .multi_hop_generator import MultiHopGenerator
|
| 8 |
from .quiz_generator import QuizGenerator
|
| 9 |
from .vqa_generator import VQAGenerator
|
graphgen/models/generator/fill_in_blank_generator.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Any
|
| 3 |
+
|
| 4 |
+
from graphgen.bases import BaseGenerator
|
| 5 |
+
from graphgen.templates import FILL_IN_BLANK_GENERATION_PROMPT
|
| 6 |
+
from graphgen.utils import compute_content_hash, detect_main_language, logger
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FillInBlankGenerator(BaseGenerator):
    """Builds fill-in-the-blank prompts from graph batches and parses the LLM reply."""

    def __init__(self, llm_client, num_of_questions) -> None:
        """
        :param llm_client: LLM client forwarded to ``BaseGenerator``
        :param num_of_questions: number of questions requested in each prompt
        """
        super().__init__(llm_client)
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse fill-in-the-blank QA pairs from the LLM response.
        Each QA pair contains question text with placeholders and the correct answer(s).

        :param response: The LLM response containing XML-formatted QA pairs
        :return: Dictionary mapping question hash to question data, where each
                 value is a dict with "question", "answer", and "answers" keys.
                 Blocks missing a question or an answer are skipped; an empty
                 dict is returned when nothing could be parsed.
        """
        qa_pairs = {}

        # Extract all QA pair blocks
        qa_blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)

        if not qa_blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        for block in qa_blocks:
            # Extract and clean question text
            q_match = re.search(r"<question>(.*?)</question>", block, re.DOTALL)
            if not q_match:
                logger.warning("Failed to parse question from block: %s", block)
                continue
            question = q_match.group(1).strip().strip('"').strip("'")

            # Extract and clean answer text
            ans_match = re.search(r"<answer>(.*?)</answer>", block, re.DOTALL)
            if not ans_match:
                logger.warning("Failed to parse answer from block: %s", block)
                continue

            answer_text = ans_match.group(1).strip().strip('"').strip("'")

            # Several blanks are answered as a comma-separated list. Chinese
            # responses separate them with the full-width comma ",", so split
            # on both the ASCII and the full-width form.
            # NOTE: an answer that itself contains a comma (e.g. a date such as
            # "October 16, 2014") is still split into fragments here; the
            # unsplit text stays available under "answer".
            answers = [
                part.strip()
                for part in re.split(r"[,,]", answer_text)
                if part.strip()
            ]

            # Ensure at least one valid answer
            if not answers:
                logger.warning("No valid answers found in: %s", answer_text)
                continue

            # Build result entry with question hash as key; a repeated question
            # overwrites the earlier entry.
            question_hash = compute_content_hash(question)
            qa_pairs[question_hash] = {
                "question": question,
                "answer": answer_text,  # Original, unsplit answer text
                "answers": answers,  # Individual answers, one per blank
            }

            logger.debug(
                "Successfully parsed fill-in-the-blank question: %s", question[:50]
            )

        if not qa_pairs:
            logger.error("Failed to parse any valid QA pairs from response")

        return qa_pairs

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """
        Render the fill-in-the-blank prompt for one batch of graph data.

        :param batch: ``(nodes, edges)``; each node is ``(name, data)`` and each
            edge is ``(source, target, data)``, where ``data["description"]`` is read
        :return: the prompt text with the numbered entity and relationship
            descriptions as context
        """
        nodes, edges = batch
        entities_str = "\n".join(
            [
                f"{index + 1}. {node[0]}: {node[1]['description']}"
                for index, node in enumerate(nodes)
            ]
        )

        relationships_str = "\n".join(
            [
                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
                for index, edge in enumerate(edges)
            ]
        )
        context = entities_str + "\n" + relationships_str
        # The detected language selects the template; presumably "zh" or "en",
        # the keys the prompt dict is expected to provide — TODO confirm.
        language = detect_main_language(entities_str + relationships_str)
        prompt = FILL_IN_BLANK_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
        return prompt
|
graphgen/models/generator/multi_answer_generator.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Any
|
| 3 |
+
|
| 4 |
+
from graphgen.bases import BaseGenerator
|
| 5 |
+
from graphgen.templates import MAQ_GENERATION_PROMPT
|
| 6 |
+
from graphgen.utils import compute_content_hash, detect_main_language, logger
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class MultiAnswerGenerator(BaseGenerator):
    """Builds multiple-answer (multi-select) prompts and parses the LLM reply."""

    def __init__(self, llm_client, num_of_questions) -> None:
        """
        :param llm_client: LLM client forwarded to ``BaseGenerator``
        :param num_of_questions: number of questions requested in each prompt
        """
        super().__init__(llm_client)
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse multiple-answer QA pairs from the LLM response.
        Each QA pair contains question text, lettered options, and one or more
        correct answer letters. The number of options is not checked here.

        :param response: The LLM response containing XML-formatted QA pairs
        :return: Dictionary mapping question hash to question data, where each
                 value is a dict with "question", "options", and "answer" keys.
                 "answer" is the answer letters joined with ", ". Blocks that
                 fail any check are skipped.
        """
        qa_pairs = {}

        # Extract all QA pair blocks
        qa_blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)

        if not qa_blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        for block in qa_blocks:
            # Extract and clean question text
            q_match = re.search(r"<question>(.*?)</question>", block, re.DOTALL)
            if not q_match:
                logger.warning("Failed to parse question from block: %s", block)
                continue
            question = q_match.group(1).strip().strip('"').strip("'")

            # Extract and parse options, one "<LETTER>. text" entry per line
            opt_match = re.search(r"<options>(.*?)</options>", block, re.DOTALL)
            if not opt_match:
                logger.warning("Failed to parse options from block: %s", block)
                continue

            options = {}
            options_text = opt_match.group(1).strip()
            for line in options_text.split("\n"):
                line = line.strip()
                if not line:
                    continue
                # Match patterns like "A. text" or "B. text"
                if m := re.match(r"^([A-Z])[.\s]\s*(.*)$", line):
                    letter, text = m.groups()
                    options[letter] = text.strip()

            # Extract and validate answer
            ans_match = re.search(r"<answer>(.*?)</answer>", block, re.DOTALL)
            if not ans_match:
                logger.warning("Failed to parse answer from block: %s", block)
                continue
            answer_text = ans_match.group(1).strip().strip('"').strip("'")

            # Accept both the ASCII and the full-width comma as separator, and
            # drop repeated letters while keeping first-seen order.
            answers = list(
                dict.fromkeys(
                    part.strip().upper()
                    for part in re.split(r"[,,]", answer_text)
                    if part.strip()
                )
            )
            invalid_answers = [ans for ans in answers if ans not in options]
            if invalid_answers:
                logger.warning(
                    "Answers %s not found in options: %s",
                    invalid_answers,
                    list(options.keys()),
                )
                continue

            # Ensure at least one valid answer
            if not answers:
                logger.warning("No valid answers found in: %s", answer_text)
                continue

            # Build result entry with question hash as key; a repeated question
            # overwrites the earlier entry.
            question_hash = compute_content_hash(question)
            qa_pairs[question_hash] = {
                "question": question,
                "options": options,  # Dict like {"A": "text", "B": "text", ...}
                "answer": ", ".join(answers),
            }

            logger.debug("Successfully parsed MAQ: %s", question[:50])

        if not qa_pairs:
            logger.error("Failed to parse any valid MAQ pairs from response")

        return qa_pairs

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """
        Render the multiple-answer prompt for one batch of graph data.

        :param batch: ``(nodes, edges)``; each node is ``(name, data)`` and each
            edge is ``(source, target, data)``, where ``data["description"]`` is read
        :return: the prompt text with the numbered entity and relationship
            descriptions as context
        """
        nodes, edges = batch
        entities_str = "\n".join(
            [
                f"{index + 1}. {node[0]}: {node[1]['description']}"
                for index, node in enumerate(nodes)
            ]
        )

        relationships_str = "\n".join(
            [
                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
                for index, edge in enumerate(edges)
            ]
        )
        context = entities_str + "\n" + relationships_str
        # The detected language selects the template ("zh" or "en" are the keys
        # of the prompt dict).
        language = detect_main_language(entities_str + relationships_str)
        prompt = MAQ_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
        return prompt
|
graphgen/models/generator/multi_choice_generator.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Any
|
| 3 |
+
|
| 4 |
+
from graphgen.bases import BaseGenerator
|
| 5 |
+
from graphgen.templates import MCQ_GENERATION_PROMPT
|
| 6 |
+
from graphgen.utils import compute_content_hash, detect_main_language, logger
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class MultiChoiceGenerator(BaseGenerator):
    """Builds single-answer multiple-choice prompts and parses the LLM reply."""

    def __init__(self, llm_client, num_of_questions) -> None:
        """
        :param llm_client: LLM client forwarded to ``BaseGenerator``
        :param num_of_questions: number of questions requested in each prompt
        """
        super().__init__(llm_client)
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse multiple choice QA pairs from the LLM response.
        Each QA pair contains question text, four options, and the correct answer.

        :param response: The LLM response containing XML-formatted QA pairs
        :return: Dictionary mapping question hash to question data, where each
                 value is a dict with "question", "options", and "answer" keys.
                 Blocks without exactly four A-D options, or whose answer is not
                 one of the option letters, are skipped.
        """
        qa_pairs = {}

        # Extract all QA pair blocks
        qa_blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)

        if not qa_blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        for block in qa_blocks:
            # Extract and clean question text
            q_match = re.search(r"<question>(.*?)</question>", block, re.DOTALL)
            if not q_match:
                logger.warning("Failed to parse question from block: %s", block)
                continue
            question = q_match.group(1).strip().strip('"').strip("'")

            # Extract and parse options (A, B, C, D)
            opt_match = re.search(r"<options>(.*?)</options>", block, re.DOTALL)
            if not opt_match:
                logger.warning("Failed to parse options from block: %s", block)
                continue

            options = {}
            options_text = opt_match.group(1).strip()
            for line in options_text.split("\n"):
                line = line.strip()
                if not line:
                    continue
                # Match patterns like "A. text" or "B. text"
                if m := re.match(r"^([A-D])[.\s]\s*(.*)$", line):
                    letter, text = m.groups()
                    options[letter] = text.strip()

            # Validate options count
            if len(options) != 4:
                logger.warning(
                    "Expected 4 options, found %d: %s", len(options), options_text
                )
                continue

            # Extract and validate answer
            ans_match = re.search(r"<answer>(.*?)</answer>", block, re.DOTALL)
            if not ans_match:
                logger.warning("Failed to parse answer from block: %s", block)
                continue
            answer = ans_match.group(1).strip().strip('"').strip("'")
            # Normalise the letter so "b" or "B." is not rejected: option keys
            # are upper-case letters without punctuation.
            answer = answer.upper().rstrip(".").strip()

            # Ensure answer exists in options
            if answer not in options:
                logger.warning(
                    "Answer '%s' not found in options: %s", answer, list(options.keys())
                )
                continue

            # Build result entry with question hash as key; a repeated question
            # overwrites the earlier entry.
            question_hash = compute_content_hash(question)
            qa_pairs[question_hash] = {
                "question": question,
                "options": options,  # Dict like {"A": "text", "B": "text", ...}
                "answer": answer,  # Single letter: "A", "B", "C", or "D"
            }

            logger.debug("Successfully parsed MCQ: %s", question[:50])

        if not qa_pairs:
            logger.error("Failed to parse any valid MCQ pairs from response")

        return qa_pairs

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """
        Render the multiple-choice prompt for one batch of graph data.

        :param batch: ``(nodes, edges)``; each node is ``(name, data)`` and each
            edge is ``(source, target, data)``, where ``data["description"]`` is read
        :return: the prompt text with the numbered entity and relationship
            descriptions as context
        """
        nodes, edges = batch
        entities_str = "\n".join(
            [
                f"{index + 1}. {node[0]}: {node[1]['description']}"
                for index, node in enumerate(nodes)
            ]
        )

        relationships_str = "\n".join(
            [
                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
                for index, edge in enumerate(edges)
            ]
        )
        context = entities_str + "\n" + relationships_str
        # The detected language selects the template; presumably "zh" or "en",
        # the keys the prompt dict is expected to provide — TODO confirm.
        language = detect_main_language(entities_str + relationships_str)
        prompt = MCQ_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
        return prompt
|
graphgen/operators/generate/generate_service.py
CHANGED
|
@@ -2,13 +2,6 @@ import pandas as pd
|
|
| 2 |
|
| 3 |
from graphgen.bases import BaseLLMWrapper, BaseOperator
|
| 4 |
from graphgen.common import init_llm
|
| 5 |
-
from graphgen.models import (
|
| 6 |
-
AggregatedGenerator,
|
| 7 |
-
AtomicGenerator,
|
| 8 |
-
CoTGenerator,
|
| 9 |
-
MultiHopGenerator,
|
| 10 |
-
VQAGenerator,
|
| 11 |
-
)
|
| 12 |
from graphgen.utils import logger, run_concurrent
|
| 13 |
|
| 14 |
|
|
@@ -22,6 +15,7 @@ class GenerateService(BaseOperator):
|
|
| 22 |
working_dir: str = "cache",
|
| 23 |
method: str = "aggregated",
|
| 24 |
data_format: str = "ChatML",
|
|
|
|
| 25 |
):
|
| 26 |
super().__init__(working_dir=working_dir, op_name="generate_service")
|
| 27 |
self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
|
|
@@ -30,15 +24,46 @@ class GenerateService(BaseOperator):
|
|
| 30 |
self.data_format = data_format
|
| 31 |
|
| 32 |
if self.method == "atomic":
|
|
|
|
|
|
|
| 33 |
self.generator = AtomicGenerator(self.llm_client)
|
| 34 |
elif self.method == "aggregated":
|
|
|
|
|
|
|
| 35 |
self.generator = AggregatedGenerator(self.llm_client)
|
| 36 |
elif self.method == "multi_hop":
|
|
|
|
|
|
|
| 37 |
self.generator = MultiHopGenerator(self.llm_client)
|
| 38 |
elif self.method == "cot":
|
|
|
|
|
|
|
| 39 |
self.generator = CoTGenerator(self.llm_client)
|
| 40 |
-
elif self.method
|
|
|
|
|
|
|
| 41 |
self.generator = VQAGenerator(self.llm_client)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
else:
|
| 43 |
raise ValueError(f"Unsupported generation mode: {method}")
|
| 44 |
|
|
|
|
| 2 |
|
| 3 |
from graphgen.bases import BaseLLMWrapper, BaseOperator
|
| 4 |
from graphgen.common import init_llm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from graphgen.utils import logger, run_concurrent
|
| 6 |
|
| 7 |
|
|
|
|
| 15 |
working_dir: str = "cache",
|
| 16 |
method: str = "aggregated",
|
| 17 |
data_format: str = "ChatML",
|
| 18 |
+
**generate_kwargs,
|
| 19 |
):
|
| 20 |
super().__init__(working_dir=working_dir, op_name="generate_service")
|
| 21 |
self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
|
|
|
|
| 24 |
self.data_format = data_format
|
| 25 |
|
| 26 |
if self.method == "atomic":
|
| 27 |
+
from graphgen.models import AtomicGenerator
|
| 28 |
+
|
| 29 |
self.generator = AtomicGenerator(self.llm_client)
|
| 30 |
elif self.method == "aggregated":
|
| 31 |
+
from graphgen.models import AggregatedGenerator
|
| 32 |
+
|
| 33 |
self.generator = AggregatedGenerator(self.llm_client)
|
| 34 |
elif self.method == "multi_hop":
|
| 35 |
+
from graphgen.models import MultiHopGenerator
|
| 36 |
+
|
| 37 |
self.generator = MultiHopGenerator(self.llm_client)
|
| 38 |
elif self.method == "cot":
|
| 39 |
+
from graphgen.models import CoTGenerator
|
| 40 |
+
|
| 41 |
self.generator = CoTGenerator(self.llm_client)
|
| 42 |
+
elif self.method == "vqa":
|
| 43 |
+
from graphgen.models import VQAGenerator
|
| 44 |
+
|
| 45 |
self.generator = VQAGenerator(self.llm_client)
|
| 46 |
+
elif self.method == "multi_choice":
|
| 47 |
+
from graphgen.models import MultiChoiceGenerator
|
| 48 |
+
|
| 49 |
+
self.generator = MultiChoiceGenerator(
|
| 50 |
+
self.llm_client,
|
| 51 |
+
num_of_questions=generate_kwargs.get("num_of_questions", 5),
|
| 52 |
+
)
|
| 53 |
+
elif self.method == "multi_answer":
|
| 54 |
+
from graphgen.models import MultiAnswerGenerator
|
| 55 |
+
|
| 56 |
+
self.generator = MultiAnswerGenerator(
|
| 57 |
+
self.llm_client,
|
| 58 |
+
num_of_questions=generate_kwargs.get("num_of_questions", 3),
|
| 59 |
+
)
|
| 60 |
+
elif self.method == "fill_in_blank":
|
| 61 |
+
from graphgen.models import FillInBlankGenerator
|
| 62 |
+
|
| 63 |
+
self.generator = FillInBlankGenerator(
|
| 64 |
+
self.llm_client,
|
| 65 |
+
num_of_questions=generate_kwargs.get("num_of_questions", 5),
|
| 66 |
+
)
|
| 67 |
else:
|
| 68 |
raise ValueError(f"Unsupported generation mode: {method}")
|
| 69 |
|
graphgen/templates/__init__.py
CHANGED
|
@@ -6,10 +6,12 @@ from .generation import (
|
|
| 6 |
AGGREGATED_GENERATION_PROMPT,
|
| 7 |
ATOMIC_GENERATION_PROMPT,
|
| 8 |
COT_GENERATION_PROMPT,
|
|
|
|
|
|
|
|
|
|
| 9 |
MULTI_HOP_GENERATION_PROMPT,
|
| 10 |
VQA_GENERATION_PROMPT,
|
| 11 |
)
|
| 12 |
from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
|
| 13 |
-
from .question_generation import QUESTION_GENERATION_PROMPT
|
| 14 |
from .search_judgement import SEARCH_JUDGEMENT_PROMPT
|
| 15 |
from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
|
|
|
|
| 6 |
AGGREGATED_GENERATION_PROMPT,
|
| 7 |
ATOMIC_GENERATION_PROMPT,
|
| 8 |
COT_GENERATION_PROMPT,
|
| 9 |
+
FILL_IN_BLANK_GENERATION_PROMPT,
|
| 10 |
+
MAQ_GENERATION_PROMPT,
|
| 11 |
+
MCQ_GENERATION_PROMPT,
|
| 12 |
MULTI_HOP_GENERATION_PROMPT,
|
| 13 |
VQA_GENERATION_PROMPT,
|
| 14 |
)
|
| 15 |
from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
|
|
|
|
| 16 |
from .search_judgement import SEARCH_JUDGEMENT_PROMPT
|
| 17 |
from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
|
graphgen/templates/generation/__init__.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
| 1 |
from .aggregated_generation import AGGREGATED_GENERATION_PROMPT
|
| 2 |
from .atomic_generation import ATOMIC_GENERATION_PROMPT
|
| 3 |
from .cot_generation import COT_GENERATION_PROMPT
|
|
|
|
|
|
|
|
|
|
| 4 |
from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
|
| 5 |
from .vqa_generation import VQA_GENERATION_PROMPT
|
|
|
|
| 1 |
from .aggregated_generation import AGGREGATED_GENERATION_PROMPT
|
| 2 |
from .atomic_generation import ATOMIC_GENERATION_PROMPT
|
| 3 |
from .cot_generation import COT_GENERATION_PROMPT
|
| 4 |
+
from .fill_in_blank_generation import FILL_IN_BLANK_GENERATION_PROMPT
|
| 5 |
+
from .multi_answer_generation import MAQ_GENERATION_PROMPT
|
| 6 |
+
from .multi_choice_generation import MCQ_GENERATION_PROMPT
|
| 7 |
from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
|
| 8 |
from .vqa_generation import VQA_GENERATION_PROMPT
|
graphgen/templates/generation/classification_generation.py
ADDED
|
File without changes
|
graphgen/templates/generation/fill_in_blank_generation.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Prompt templates for fill-in-the-blank question generation.
# They are rendered with str.format(context=..., num_of_questions=...), so each
# placeholder must use single braces: doubled braces would be emitted literally
# and the context would never reach the prompt.
TEMPLATE_ZH = """请根据上下文资料生成独立的知识问答填空题。填空题的答案必须能在原文中直接找到。

生成要求:
1. **语言一致性**:若上下文资料为中文,则生成中文问题;若为英文,则生成英文问题
2. **数量**:每个上下文资料生成{num_of_questions}个填空题
3. **独立性**:每个问题必须完整独立,不依赖其他问题
4. **准确性**:正确答案必须能从原文直接得出
5. **占位符格式**:使用________(四个下划线)作为填空占位符

输出格式:
<qa_pairs>
<qa_pair>
<question>问题文本(使用________作为占位符)</question>
<answer>正确答案文本(多个空用逗号分隔)</answer>
</qa_pair>
</qa_pairs>

示例(根据iPad Air 2生成2题):
<qa_pairs>
<qa_pair>
<question>iPad Air 2 是由________制造的?</question>
<answer>美国苹果公司(Apple)</answer>
</qa_pair>
<qa_pair>
<question>iPad Air 2 的发布日期是________,上市日期是________。</question>
<answer>2014年10月16日,2014年10月22日</answer>
</qa_pair>
</qa_pairs>


上下文资料:
{context}

请为以下资料生成{num_of_questions}个填空题:
"""


TEMPLATE_EN = """Generate independent fill-in-the-blank questions based on the provided context. \
Answers must be directly derivable from the text.

Requirements:
1. **Language Consistency**: Generate in the same language as the context (Chinese/English)
2. **Quantity**: Generate {num_of_questions} questions per context
3. **Independence**: Each question must be self-contained
4. **Accuracy**: Correct answer must be directly found in the source text
5. **Placeholder Format**: Use ________ (four underscores) as the blank placeholder

Output Format:
<qa_pairs>
<qa_pair>
<question>Question text (use ________ as placeholder)</question>
<answer>Correct answer text (separate multiple blanks with commas)</answer>
</qa_pair>
</qa_pairs>

Example (2 questions):
<qa_pairs>
<qa_pair>
<question>The iPad Air 2 was manufactured by ________?</question>
<answer>Apple Inc.</answer>
</qa_pair>
<qa_pair>
<question>The iPad Air 2 was released on ________ and launched on ________.</question>
<answer>October 16, 2014, October 22, 2014</answer>
</qa_pair>
</qa_pairs>

Context:
{context}

Please generate {num_of_questions} fill-in-the-blank questions for the following context:
"""


# Language code -> template.
FILL_IN_BLANK_GENERATION_PROMPT = {
    "zh": TEMPLATE_ZH,
    "en": TEMPLATE_EN,
}
|
graphgen/templates/generation/multi_answer_generation.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
TEMPLATE_ZH = """请根据上下文资料生成独立的知识问答不定项选择题,每个选择题包含四个选项,其中有若干个正确答案(至少一个),其他为干扰项。
|
| 2 |
+
|
| 3 |
+
生成要求:
|
| 4 |
+
1. **语言一致性**:若上下文资料为中文,则生成中文问题;若为英文,则生成英文问题
|
| 5 |
+
2. **数量**:每个上下文资料生成{num_of_questions}个选择题
|
| 6 |
+
3. **独立性**:每个问题必须完整独立,不依赖其他问题
|
| 7 |
+
4. **准确性**:正确答案必须能从原文直接得出,干扰项需合理且有区分度
|
| 8 |
+
5. **答案格式**:当有多个正确答案时,用逗号分隔选项字母,如"A, B, C"
|
| 9 |
+
|
| 10 |
+
输出格式:
|
| 11 |
+
<qa_pairs>
|
| 12 |
+
<qa_pair>
|
| 13 |
+
<question>问题文本</question>
|
| 14 |
+
<options>A. 选项A文本
|
| 15 |
+
B. 选项B文本
|
| 16 |
+
C. 选项C文本
|
| 17 |
+
D. 选项D文本</options>
|
| 18 |
+
<answer>正确答案选项字母(多个答案用逗号分隔)</answer>
|
| 19 |
+
</qa_pair>
|
| 20 |
+
</qa_pairs>
|
| 21 |
+
|
| 22 |
+
示例(根据iPad Air 2生成2题):
|
| 23 |
+
<qa_pairs>
|
| 24 |
+
<qa_pair>
|
| 25 |
+
<question>iPad Air 2的发布年份是?</question>
|
| 26 |
+
<options>A. 2012年
|
| 27 |
+
B. 2014年
|
| 28 |
+
C. 2015年
|
| 29 |
+
D. 2017年</options>
|
| 30 |
+
<answer>B</answer>
|
| 31 |
+
</qa_pair>
|
| 32 |
+
<qa_pair>
|
| 33 |
+
<question>以下哪些是 iPad Air 2 的特点?</question>
|
| 34 |
+
<options>A. Touch ID指纹识别功能
|
| 35 |
+
B. A8X高效处理器
|
| 36 |
+
C. 十百万像素前置相机
|
| 37 |
+
D. 八百万像素后置相机镜头</options>
|
| 38 |
+
<answer>A, B, D</answer>
|
| 39 |
+
</qa_pair>
|
| 40 |
+
</qa_pairs>
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
上下文资料:
|
| 44 |
+
{context}
|
| 45 |
+
|
| 46 |
+
请为以下资料生成{num_of_questions}个不定项选择题:
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
TEMPLATE_EN = """Generate independent multiple-select knowledge questions \
|
| 51 |
+
based on the provided context. Each question should contain four options \
|
| 52 |
+
with one or more correct answers and distractors.
|
| 53 |
+
|
| 54 |
+
Requirements:
|
| 55 |
+
1. **Language Consistency**: Generate in the same language as the context (Chinese/English)
|
| 56 |
+
2. **Quantity**: Generate {num_of_questions} questions per context
|
| 57 |
+
3. **Independence**: Each question must be self-contained
|
| 58 |
+
4. **Accuracy**: Correct answer(s) must be derivable from text, distractors should be plausible
|
| 59 |
+
5. **Answer Format**: For multiple correct answers, separate option letters with commas, e.g., "A, B, C"
|
| 60 |
+
|
| 61 |
+
Output Format:
|
| 62 |
+
<qa_pairs>
|
| 63 |
+
<qa_pair>
|
| 64 |
+
<question>Question text</question>
|
| 65 |
+
<options>A. Option A text
|
| 66 |
+
B. Option B text
|
| 67 |
+
C. Option C text
|
| 68 |
+
D. Option D text</options>
|
| 69 |
+
<answer>Correct option letter(s) (separate multiple answers with commas)</answer>
|
| 70 |
+
</qa_pair>
|
| 71 |
+
</qa_pairs>
|
| 72 |
+
|
| 73 |
+
Example (2 questions):
|
| 74 |
+
<qa_pairs>
|
| 75 |
+
<qa_pair>
|
| 76 |
+
<question>What are the features of iPad Air 2?</question>
|
| 77 |
+
<options>A. Touch ID fingerprint recognition
|
| 78 |
+
B. A8X processor
|
| 79 |
+
C. Ten-megapixel front camera
|
| 80 |
+
D. Eight-megapixel rear camera</options>
|
| 81 |
+
<answer>A, B, D</answer>
|
| 82 |
+
</qa_pair>
|
| 83 |
+
<qa_pair>
|
| 84 |
+
<question>When was iPad Air 2 discontinued?</question>
|
| 85 |
+
<options>A. March 21, 2016
|
| 86 |
+
B. March 21, 2017
|
| 87 |
+
C. October 22, 2017
|
| 88 |
+
D. October 16, 2016</options>
|
| 89 |
+
<answer>B</answer>
|
| 90 |
+
</qa_pair>
|
| 91 |
+
</qa_pairs>
|
| 92 |
+
|
| 93 |
+
Context:
|
| 94 |
+
{context}
|
| 95 |
+
|
| 96 |
+
Please generate {num_of_questions} multiple-select questions for the following context:
|
| 97 |
+
"""
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
MAQ_GENERATION_PROMPT = {"zh": TEMPLATE_ZH, "en": TEMPLATE_EN}
|
graphgen/templates/generation/multi_choice_generation.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
TEMPLATE_GENERATION_ZH: str = """请根据上下文资料生成独立的知识问答单选题,每个选择题包含四个选项,其中仅有一个正确答案,其他三个为干扰项。
|
| 2 |
+
|
| 3 |
+
生成要求:
|
| 4 |
+
1. **语言一致性**:若上下文资料为中文,则生成中文问题;若为英文,则生成英文问题
|
| 5 |
+
2. **数量**:每个上下文资料生成{num_of_questions}个选择题
|
| 6 |
+
3. **独立性**:每个问题必须完整独立,不依赖其他问题
|
| 7 |
+
4. **准确性**:正确答案必须能从原文直接得出,干扰项需合理且有区分度
|
| 8 |
+
|
| 9 |
+
输出格式:
|
| 10 |
+
<qa_pairs>
|
| 11 |
+
<qa_pair>
|
| 12 |
+
<question>问题文本</question>
|
| 13 |
+
<options>A. 选项A文本
|
| 14 |
+
B. 选项B文本
|
| 15 |
+
C. 选项C文本
|
| 16 |
+
D. 选项D文本</options>
|
| 17 |
+
<answer>正确答案选项字母</answer>
|
| 18 |
+
</qa_pair>
|
| 19 |
+
</qa_pairs>
|
| 20 |
+
|
| 21 |
+
示例(根据iPad Air 2生成2题):
|
| 22 |
+
<qa_pairs>
|
| 23 |
+
<qa_pair>
|
| 24 |
+
<question>iPad Air 2的发布年份是?</question>
|
| 25 |
+
<options>A. 2012年
|
| 26 |
+
B. 2014年
|
| 27 |
+
C. 2015年
|
| 28 |
+
D. 2017年</options>
|
| 29 |
+
<answer>B</answer>
|
| 30 |
+
</qa_pair>
|
| 31 |
+
<qa_pair>
|
| 32 |
+
<question>iPad Air 2搭载的处理器型号是?</question>
|
| 33 |
+
<options>A. A8
|
| 34 |
+
B. A9X
|
| 35 |
+
C. A8X
|
| 36 |
+
D. A10</options>
|
| 37 |
+
<answer>C</answer>
|
| 38 |
+
</qa_pair>
|
| 39 |
+
</qa_pairs>
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
上下文资料:
|
| 43 |
+
{context}
|
| 44 |
+
|
| 45 |
+
请为以上资料生成{num_of_questions}个选择题:
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
TEMPLATE_GENERATION_EN: str = """Generate independent multiple-choice questions \
|
| 49 |
+
based on the provided context. Each question should contain four options \
|
| 50 |
+
with only one correct answer and three distractors.
|
| 51 |
+
|
| 52 |
+
Requirements:
|
| 53 |
+
1. **Language Consistency**: Generate in the same language as the context (Chinese/English)
|
| 54 |
+
2. **Quantity**: Generate {num_of_questions} questions per context
|
| 55 |
+
3. **Independence**: Each question must be self-contained
|
| 56 |
+
4. **Accuracy**: Correct answer must be derivable from text, distractors should be plausible
|
| 57 |
+
|
| 58 |
+
Output Format:
|
| 59 |
+
<qa_pairs>
|
| 60 |
+
<qa_pair>
|
| 61 |
+
<question>Question text</question>
|
| 62 |
+
<options>A. Option A text
|
| 63 |
+
B. Option B text
|
| 64 |
+
C. Option C text
|
| 65 |
+
D. Option D text</options>
|
| 66 |
+
<answer>Correct option letter</answer>
|
| 67 |
+
</qa_pair>
|
| 68 |
+
</qa_pairs>
|
| 69 |
+
|
| 70 |
+
Example (2 questions):
|
| 71 |
+
<qa_pairs>
|
| 72 |
+
<qa_pair>
|
| 73 |
+
<question>What year was the iPad Air 2 released?</question>
|
| 74 |
+
<options>A. 2012
|
| 75 |
+
B. 2014
|
| 76 |
+
C. 2015
|
| 77 |
+
D. 2017</options>
|
| 78 |
+
<answer>B</answer>
|
| 79 |
+
</qa_pair>
|
| 80 |
+
<qa_pair>
|
| 81 |
+
<question>Which processor does iPad Air 2 use?</question>
|
| 82 |
+
<options>A. A8
|
| 83 |
+
B. A9X
|
| 84 |
+
C. A8X
|
| 85 |
+
D. A10</options>
|
| 86 |
+
<answer>C</answer>
|
| 87 |
+
</qa_pair>
|
| 88 |
+
</qa_pairs>
|
| 89 |
+
|
| 90 |
+
Context:
|
| 91 |
+
{context}
|
| 92 |
+
|
| 93 |
+
Please generate {num_of_questions} questions for the context above:
|
| 94 |
+
"""
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
MCQ_GENERATION_PROMPT = {"zh": TEMPLATE_GENERATION_ZH, "en": TEMPLATE_GENERATION_EN}
|
graphgen/templates/question_generation.py
DELETED
|
@@ -1,32 +0,0 @@
|
|
| 1 |
-
# pylint: disable=C0301
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
# TODO: 修改这里的prompt
|
| 5 |
-
TEMPLATE_MULTI_EN = """You are an assistant to help read an article and then rephrase it in a question answering format. The user will provide you with an article with its content. You need to generate a paraphrase of the same article in question and answer format with one tag of "Question: ..." followed by "Answer: ...". Remember to keep the meaning and every content of the article intact.
|
| 6 |
-
|
| 7 |
-
Here is the format you should follow for your response:
|
| 8 |
-
Question: <Question>
|
| 9 |
-
Answer: <Answer>
|
| 10 |
-
|
| 11 |
-
Here is the article you need to rephrase:
|
| 12 |
-
{doc}
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
TEMPLATE_MULTI_ZH = """你是一位助手,帮助阅读一篇文章,然后以问答格式重述它。用户将为您提供一篇带有内容的文章。你需要以一个标签"问题:..."为开头,接着是"答案:...",生成一篇与原文章相同的问答格式的重述。请确保保持文章的意义和每个内容不变。
|
| 16 |
-
|
| 17 |
-
以下是你应该遵循的响应格式:
|
| 18 |
-
问题: <问题>
|
| 19 |
-
答案: <答案>
|
| 20 |
-
|
| 21 |
-
以下是你需要重述的文章:
|
| 22 |
-
{doc}
|
| 23 |
-
"""
|
| 24 |
-
|
| 25 |
-
QUESTION_GENERATION_PROMPT = {
|
| 26 |
-
"English": {
|
| 27 |
-
"MULTI_TEMPLATE": TEMPLATE_MULTI_EN,
|
| 28 |
-
},
|
| 29 |
-
"Chinese": {
|
| 30 |
-
"MULTI_TEMPLATE": TEMPLATE_MULTI_ZH,
|
| 31 |
-
},
|
| 32 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|