github-actions[bot] committed on
Commit
5cb2e67
·
1 Parent(s): b1922c7

Auto-sync from demo at Thu Jan 15 11:07:22 UTC 2026

Browse files
graphgen/bases/base_generator.py CHANGED
@@ -46,38 +46,47 @@ class BaseGenerator(ABC):
46
  def format_generation_results(
47
  results: list[dict], output_data_format: str
48
  ) -> list[dict[str, Any]]:
49
- if output_data_format == "Alpaca":
50
- results = [
51
- {
52
- "instruction": v["question"],
53
- "input": "",
54
- "output": v["answer"],
55
- }
56
- for item in results
57
- for k, v in item.items()
58
- ]
59
- elif output_data_format == "Sharegpt":
60
- results = [
61
- {
62
- "conversations": [
63
- {"from": "human", "value": v["question"]},
64
- {"from": "gpt", "value": v["answer"]},
65
- ]
66
- }
67
- for item in results
68
- for k, v in item.items()
69
- ]
70
- elif output_data_format == "ChatML":
71
- results = [
72
- {
73
- "messages": [
74
- {"role": "user", "content": v["question"]},
75
- {"role": "assistant", "content": v["answer"]},
76
- ]
77
- }
78
- for item in results
79
- for k, v in item.items()
80
- ]
81
- else:
82
- raise ValueError(f"Unknown output data format: {output_data_format}")
83
- return results
 
 
 
 
 
 
 
 
 
 
46
def format_generation_results(
    results: list[dict], output_data_format: str
) -> list[dict[str, Any]]:
    """Flatten generated QA pairs into the requested training-data format.

    :param results: list of dicts mapping a question hash to a QA payload;
        each payload has "question" and "answer" keys and may carry an
        "options" dict (e.g. {"A": "text", ...}) for choice-style questions.
    :param output_data_format: one of "Alpaca", "Sharegpt" or "ChatML".
    :return: list of records in the chosen format, one per QA pair.
    :raises ValueError: if output_data_format is not supported. Raised even
        when results is empty, so a misconfigured format fails fast.
    """
    # Validate upfront: checking inside the loop would silently accept an
    # unknown format whenever `results` happens to be empty.
    if output_data_format not in ("Alpaca", "Sharegpt", "ChatML"):
        raise ValueError(f"Unknown output data format: {output_data_format}")

    flat_results = []
    for item in results:
        for _, qa_data in item.items():
            question = qa_data.get("question", "")
            answer = qa_data.get("answer", "")

            # Choice-style generators attach an options dict; fold it into
            # the question text so every output format stays uniform.
            if "options" in qa_data and qa_data["options"]:
                options = qa_data["options"]
                options_str = "\n".join(
                    f"{key}. {options[key]}" for key in sorted(options.keys())
                )
                question += f"\nOptions:\n{options_str}"

            if output_data_format == "Alpaca":
                flat_results.append(
                    {
                        "instruction": question,
                        "input": "",
                        "output": answer,
                    }
                )
            elif output_data_format == "Sharegpt":
                flat_results.append(
                    {
                        "conversations": [
                            {"from": "human", "value": question},
                            {"from": "gpt", "value": answer},
                        ]
                    }
                )
            else:  # "ChatML" — the only remaining validated format
                flat_results.append(
                    {
                        "messages": [
                            {"role": "user", "content": question},
                            {"role": "assistant", "content": answer},
                        ]
                    }
                )
    return flat_results
graphgen/models/__init__.py CHANGED
@@ -11,6 +11,9 @@ from .generator import (
11
  AggregatedGenerator,
12
  AtomicGenerator,
13
  CoTGenerator,
 
 
 
14
  MultiHopGenerator,
15
  QuizGenerator,
16
  VQAGenerator,
 
11
  AggregatedGenerator,
12
  AtomicGenerator,
13
  CoTGenerator,
14
+ FillInBlankGenerator,
15
+ MultiAnswerGenerator,
16
+ MultiChoiceGenerator,
17
  MultiHopGenerator,
18
  QuizGenerator,
19
  VQAGenerator,
graphgen/models/generator/__init__.py CHANGED
@@ -1,6 +1,9 @@
1
  from .aggregated_generator import AggregatedGenerator
2
  from .atomic_generator import AtomicGenerator
3
  from .cot_generator import CoTGenerator
 
 
 
4
  from .multi_hop_generator import MultiHopGenerator
5
  from .quiz_generator import QuizGenerator
6
  from .vqa_generator import VQAGenerator
 
1
  from .aggregated_generator import AggregatedGenerator
2
  from .atomic_generator import AtomicGenerator
3
  from .cot_generator import CoTGenerator
4
+ from .fill_in_blank_generator import FillInBlankGenerator
5
+ from .multi_answer_generator import MultiAnswerGenerator
6
+ from .multi_choice_generator import MultiChoiceGenerator
7
  from .multi_hop_generator import MultiHopGenerator
8
  from .quiz_generator import QuizGenerator
9
  from .vqa_generator import VQAGenerator
graphgen/models/generator/fill_in_blank_generator.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from typing import Any

from graphgen.bases import BaseGenerator
from graphgen.templates import FILL_IN_BLANK_GENERATION_PROMPT
from graphgen.utils import compute_content_hash, detect_main_language, logger


class FillInBlankGenerator(BaseGenerator):
    """Generate fill-in-the-blank QA pairs from knowledge-graph context."""

    def __init__(self, llm_client, num_of_questions) -> None:
        super().__init__(llm_client)
        # How many questions to request per context.
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse fill-in-the-blank QA pairs from an XML-formatted LLM response.

        :param response: raw LLM output containing <qa_pair> blocks
        :return: dict keyed by question content hash; each value is a dict
            with "question", "answer" (original text) and "answers"
            (the comma-split list of individual blank answers)
        """
        parsed = {}

        blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)
        if not blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        for qa_block in blocks:
            question_match = re.search(
                r"<question>(.*?)</question>", qa_block, re.DOTALL
            )
            if question_match is None:
                logger.warning("Failed to parse question from block: %s", qa_block)
                continue
            question = question_match.group(1).strip().strip('"').strip("'")

            answer_match = re.search(r"<answer>(.*?)</answer>", qa_block, re.DOTALL)
            if answer_match is None:
                logger.warning("Failed to parse answer from block: %s", qa_block)
                continue
            answer_text = answer_match.group(1).strip().strip('"').strip("'")

            # One <answer> may hold several comma-separated blank values.
            answers = [part.strip() for part in answer_text.split(",") if part.strip()]
            if not answers:
                logger.warning("No valid answers found in: %s", answer_text)
                continue

            parsed[compute_content_hash(question)] = {
                "question": question,
                "answer": answer_text,  # original answer text with commas
                "answers": answers,  # individual answers, one per blank
            }
            logger.debug(
                "Successfully parsed fill-in-the-blank question: %s", question[:50]
            )

        if not parsed:
            logger.error("Failed to parse any valid QA pairs from response")
        return parsed

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """Render the generation prompt for one (nodes, edges) batch."""
        nodes, edges = batch

        entities_str = "\n".join(
            f"{pos + 1}. {node[0]}: {node[1]['description']}"
            for pos, node in enumerate(nodes)
        )
        relationships_str = "\n".join(
            f"{pos + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
            for pos, edge in enumerate(edges)
        )

        context = entities_str + "\n" + relationships_str
        language = detect_main_language(entities_str + relationships_str)
        return FILL_IN_BLANK_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
graphgen/models/generator/multi_answer_generator.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from typing import Any

from graphgen.bases import BaseGenerator
from graphgen.templates import MAQ_GENERATION_PROMPT
from graphgen.utils import compute_content_hash, detect_main_language, logger


class MultiAnswerGenerator(BaseGenerator):
    """Generate multi-select (multiple-answer) questions from KG context."""

    def __init__(self, llm_client, num_of_questions) -> None:
        super().__init__(llm_client)
        # How many questions to request per context.
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse multiple-answer QA pairs from an XML-formatted LLM response.

        :param response: raw LLM output containing <qa_pair> blocks
        :return: dict keyed by question content hash; each value is a dict
            with "question", "options" and "answer" (comma-joined letters)
        """
        parsed = {}

        blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)
        if not blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        for qa_block in blocks:
            question_match = re.search(
                r"<question>(.*?)</question>", qa_block, re.DOTALL
            )
            if question_match is None:
                logger.warning("Failed to parse question from block: %s", qa_block)
                continue
            question = question_match.group(1).strip().strip('"').strip("'")

            options_match = re.search(r"<options>(.*?)</options>", qa_block, re.DOTALL)
            if options_match is None:
                logger.warning("Failed to parse options from block: %s", qa_block)
                continue

            # Turn "A. text" style lines into {"A": "text", ...}.
            options = {}
            for raw_line in options_match.group(1).strip().split("\n"):
                raw_line = raw_line.strip()
                if not raw_line:
                    continue
                option_match = re.match(r"^([A-Z])[.\s]\s*(.*)$", raw_line)
                if option_match:
                    options[option_match.group(1)] = option_match.group(2).strip()

            answer_match = re.search(r"<answer>(.*?)</answer>", qa_block, re.DOTALL)
            if answer_match is None:
                logger.warning("Failed to parse answer from block: %s", qa_block)
                continue
            answer_text = answer_match.group(1).strip().strip('"').strip("'")
            answers = [
                part.strip().upper() for part in answer_text.split(",") if part.strip()
            ]

            # Every referenced letter must exist among the parsed options.
            invalid_answers = [letter for letter in answers if letter not in options]
            if invalid_answers:
                logger.warning(
                    "Answers %s not found in options: %s",
                    invalid_answers,
                    list(options.keys()),
                )
                continue

            if not answers:
                logger.warning("No valid answers found in: %s", answer_text)
                continue

            parsed[compute_content_hash(question)] = {
                "question": question,
                "options": options,  # e.g. {"A": "text", "B": "text", ...}
                "answer": ", ".join(answers),
            }
            logger.debug("Successfully parsed MAQ: %s", question[:50])

        if not parsed:
            logger.error("Failed to parse any valid MAQ pairs from response")
        return parsed

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """Render the generation prompt for one (nodes, edges) batch."""
        nodes, edges = batch

        entities_str = "\n".join(
            f"{pos + 1}. {node[0]}: {node[1]['description']}"
            for pos, node in enumerate(nodes)
        )
        relationships_str = "\n".join(
            f"{pos + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
            for pos, edge in enumerate(edges)
        )

        context = entities_str + "\n" + relationships_str
        language = detect_main_language(entities_str + relationships_str)
        return MAQ_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
graphgen/models/generator/multi_choice_generator.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from typing import Any

from graphgen.bases import BaseGenerator
from graphgen.templates import MCQ_GENERATION_PROMPT
from graphgen.utils import compute_content_hash, detect_main_language, logger


class MultiChoiceGenerator(BaseGenerator):
    """Generate single-answer multiple-choice questions from KG context."""

    def __init__(self, llm_client, num_of_questions) -> None:
        super().__init__(llm_client)
        # How many questions to request per context.
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse multiple-choice QA pairs from an XML-formatted LLM response.

        :param response: raw LLM output containing <qa_pair> blocks
        :return: dict keyed by question content hash; each value is a dict
            with "question", "options" and "answer" (a single letter)
        """
        parsed = {}

        blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)
        if not blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        for qa_block in blocks:
            question_match = re.search(
                r"<question>(.*?)</question>", qa_block, re.DOTALL
            )
            if question_match is None:
                logger.warning("Failed to parse question from block: %s", qa_block)
                continue
            question = question_match.group(1).strip().strip('"').strip("'")

            options_match = re.search(r"<options>(.*?)</options>", qa_block, re.DOTALL)
            if options_match is None:
                logger.warning("Failed to parse options from block: %s", qa_block)
                continue

            # Turn "A. text" style lines into {"A": "text", ...}.
            options = {}
            options_text = options_match.group(1).strip()
            for raw_line in options_text.split("\n"):
                raw_line = raw_line.strip()
                if not raw_line:
                    continue
                option_match = re.match(r"^([A-D])[.\s]\s*(.*)$", raw_line)
                if option_match:
                    options[option_match.group(1)] = option_match.group(2).strip()

            # A well-formed MCQ must offer exactly the four options A-D.
            if len(options) != 4:
                logger.warning(
                    "Expected 4 options, found %d: %s", len(options), options_text
                )
                continue

            answer_match = re.search(r"<answer>(.*?)</answer>", qa_block, re.DOTALL)
            if answer_match is None:
                logger.warning("Failed to parse answer from block: %s", qa_block)
                continue
            answer = answer_match.group(1).strip().strip('"').strip("'")

            # The single correct letter must be one of the parsed options.
            if answer not in options:
                logger.warning(
                    "Answer '%s' not found in options: %s", answer, list(options.keys())
                )
                continue

            parsed[compute_content_hash(question)] = {
                "question": question,
                "options": options,  # e.g. {"A": "text", "B": "text", ...}
                "answer": answer,  # single letter: "A", "B", "C", or "D"
            }
            logger.debug("Successfully parsed MCQ: %s", question[:50])

        if not parsed:
            logger.error("Failed to parse any valid MCQ pairs from response")
        return parsed

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """Render the generation prompt for one (nodes, edges) batch."""
        nodes, edges = batch

        entities_str = "\n".join(
            f"{pos + 1}. {node[0]}: {node[1]['description']}"
            for pos, node in enumerate(nodes)
        )
        relationships_str = "\n".join(
            f"{pos + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
            for pos, edge in enumerate(edges)
        )

        context = entities_str + "\n" + relationships_str
        language = detect_main_language(entities_str + relationships_str)
        return MCQ_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
graphgen/operators/generate/generate_service.py CHANGED
@@ -2,13 +2,6 @@ import pandas as pd
2
 
3
  from graphgen.bases import BaseLLMWrapper, BaseOperator
4
  from graphgen.common import init_llm
5
- from graphgen.models import (
6
- AggregatedGenerator,
7
- AtomicGenerator,
8
- CoTGenerator,
9
- MultiHopGenerator,
10
- VQAGenerator,
11
- )
12
  from graphgen.utils import logger, run_concurrent
13
 
14
 
@@ -22,6 +15,7 @@ class GenerateService(BaseOperator):
22
  working_dir: str = "cache",
23
  method: str = "aggregated",
24
  data_format: str = "ChatML",
 
25
  ):
26
  super().__init__(working_dir=working_dir, op_name="generate_service")
27
  self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
@@ -30,15 +24,46 @@ class GenerateService(BaseOperator):
30
  self.data_format = data_format
31
 
32
  if self.method == "atomic":
 
 
33
  self.generator = AtomicGenerator(self.llm_client)
34
  elif self.method == "aggregated":
 
 
35
  self.generator = AggregatedGenerator(self.llm_client)
36
  elif self.method == "multi_hop":
 
 
37
  self.generator = MultiHopGenerator(self.llm_client)
38
  elif self.method == "cot":
 
 
39
  self.generator = CoTGenerator(self.llm_client)
40
- elif self.method in ["vqa"]:
 
 
41
  self.generator = VQAGenerator(self.llm_client)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  else:
43
  raise ValueError(f"Unsupported generation mode: {method}")
44
 
 
2
 
3
  from graphgen.bases import BaseLLMWrapper, BaseOperator
4
  from graphgen.common import init_llm
 
 
 
 
 
 
 
5
  from graphgen.utils import logger, run_concurrent
6
 
7
 
 
15
  working_dir: str = "cache",
16
  method: str = "aggregated",
17
  data_format: str = "ChatML",
18
+ **generate_kwargs,
19
  ):
20
  super().__init__(working_dir=working_dir, op_name="generate_service")
21
  self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
 
24
  self.data_format = data_format
25
 
26
  if self.method == "atomic":
27
+ from graphgen.models import AtomicGenerator
28
+
29
  self.generator = AtomicGenerator(self.llm_client)
30
  elif self.method == "aggregated":
31
+ from graphgen.models import AggregatedGenerator
32
+
33
  self.generator = AggregatedGenerator(self.llm_client)
34
  elif self.method == "multi_hop":
35
+ from graphgen.models import MultiHopGenerator
36
+
37
  self.generator = MultiHopGenerator(self.llm_client)
38
  elif self.method == "cot":
39
+ from graphgen.models import CoTGenerator
40
+
41
  self.generator = CoTGenerator(self.llm_client)
42
+ elif self.method == "vqa":
43
+ from graphgen.models import VQAGenerator
44
+
45
  self.generator = VQAGenerator(self.llm_client)
46
+ elif self.method == "multi_choice":
47
+ from graphgen.models import MultiChoiceGenerator
48
+
49
+ self.generator = MultiChoiceGenerator(
50
+ self.llm_client,
51
+ num_of_questions=generate_kwargs.get("num_of_questions", 5),
52
+ )
53
+ elif self.method == "multi_answer":
54
+ from graphgen.models import MultiAnswerGenerator
55
+
56
+ self.generator = MultiAnswerGenerator(
57
+ self.llm_client,
58
+ num_of_questions=generate_kwargs.get("num_of_questions", 3),
59
+ )
60
+ elif self.method == "fill_in_blank":
61
+ from graphgen.models import FillInBlankGenerator
62
+
63
+ self.generator = FillInBlankGenerator(
64
+ self.llm_client,
65
+ num_of_questions=generate_kwargs.get("num_of_questions", 5),
66
+ )
67
  else:
68
  raise ValueError(f"Unsupported generation mode: {method}")
69
 
graphgen/templates/__init__.py CHANGED
@@ -6,10 +6,12 @@ from .generation import (
6
  AGGREGATED_GENERATION_PROMPT,
7
  ATOMIC_GENERATION_PROMPT,
8
  COT_GENERATION_PROMPT,
 
 
 
9
  MULTI_HOP_GENERATION_PROMPT,
10
  VQA_GENERATION_PROMPT,
11
  )
12
  from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
13
- from .question_generation import QUESTION_GENERATION_PROMPT
14
  from .search_judgement import SEARCH_JUDGEMENT_PROMPT
15
  from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
 
6
  AGGREGATED_GENERATION_PROMPT,
7
  ATOMIC_GENERATION_PROMPT,
8
  COT_GENERATION_PROMPT,
9
+ FILL_IN_BLANK_GENERATION_PROMPT,
10
+ MAQ_GENERATION_PROMPT,
11
+ MCQ_GENERATION_PROMPT,
12
  MULTI_HOP_GENERATION_PROMPT,
13
  VQA_GENERATION_PROMPT,
14
  )
15
  from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
 
16
  from .search_judgement import SEARCH_JUDGEMENT_PROMPT
17
  from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
graphgen/templates/generation/__init__.py CHANGED
@@ -1,5 +1,8 @@
1
  from .aggregated_generation import AGGREGATED_GENERATION_PROMPT
2
  from .atomic_generation import ATOMIC_GENERATION_PROMPT
3
  from .cot_generation import COT_GENERATION_PROMPT
 
 
 
4
  from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
5
  from .vqa_generation import VQA_GENERATION_PROMPT
 
1
  from .aggregated_generation import AGGREGATED_GENERATION_PROMPT
2
  from .atomic_generation import ATOMIC_GENERATION_PROMPT
3
  from .cot_generation import COT_GENERATION_PROMPT
4
+ from .fill_in_blank_generation import FILL_IN_BLANK_GENERATION_PROMPT
5
+ from .multi_answer_generation import MAQ_GENERATION_PROMPT
6
+ from .multi_choice_generation import MCQ_GENERATION_PROMPT
7
  from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
8
  from .vqa_generation import VQA_GENERATION_PROMPT
graphgen/templates/generation/classification_generation.py ADDED
File without changes
graphgen/templates/generation/fill_in_blank_generation.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt templates for generating fill-in-the-blank questions.
# Both templates are rendered with str.format(context=..., num_of_questions=...).
# BUGFIX: the placeholders were written as {{context}}, which str.format
# escapes to the literal text "{context}" — the knowledge-graph context was
# never injected into the prompt. Changed to {context} to match the sibling
# MAQ/MCQ templates.

TEMPLATE_ZH = """请根据上下文资料生成独立的知识问答填空题。填空题的答案必须能在原文中直接找到。

生成要求:
1. **语言一致性**:若上下文资料为中文,则生成中文问题;若为英文,则生成英文问题
2. **数量**:每个上下文资料生成{num_of_questions}个填空题
3. **独立性**:每个问题必须完整独立,不依赖其他问题
4. **准确性**:正确答案必须能从原文直接得出
5. **占位符格式**:使用________(四个下划线)作为填空占位符

输出格式:
<qa_pairs>
<qa_pair>
<question>问题文本(使用________作为占位符)</question>
<answer>正确答案文本(多个空用逗号分隔)</answer>
</qa_pair>
</qa_pairs>

示例(根据iPad Air 2生成2题):
<qa_pairs>
<qa_pair>
<question>iPad Air 2 是由________制造的?</question>
<answer>美国苹果公司(Apple)</answer>
</qa_pair>
<qa_pair>
<question>iPad Air 2 的发布日期是________,上市日期是________。</question>
<answer>2014年10月16日,2014年10月22日</answer>
</qa_pair>
</qa_pairs>


上下文资料:
{context}

请为以下资料生成{num_of_questions}个填空题:
"""


TEMPLATE_EN = """Generate independent fill-in-the-blank questions based on the provided context. \
Answers must be directly derivable from the text.

Requirements:
1. **Language Consistency**: Generate in the same language as the context (Chinese/English)
2. **Quantity**: Generate {num_of_questions} questions per context
3. **Independence**: Each question must be self-contained
4. **Accuracy**: Correct answer must be directly found in the source text
5. **Placeholder Format**: Use ________ (four underscores) as the blank placeholder

Output Format:
<qa_pairs>
<qa_pair>
<question>Question text (use ________ as placeholder)</question>
<answer>Correct answer text (separate multiple blanks with commas)</answer>
</qa_pair>
</qa_pairs>

Example (2 questions):
<qa_pairs>
<qa_pair>
<question>The iPad Air 2 was manufactured by ________?</question>
<answer>Apple Inc.</answer>
</qa_pair>
<qa_pair>
<question>The iPad Air 2 was released on ________ and launched on ________.</question>
<answer>October 16, 2014, October 22, 2014</answer>
</qa_pair>
</qa_pairs>

Context:
{context}

Please generate {num_of_questions} fill-in-the-blank questions for the following context:
"""


# Language-keyed template lookup used by FillInBlankGenerator.build_prompt.
FILL_IN_BLANK_GENERATION_PROMPT = {
    "zh": TEMPLATE_ZH,
    "en": TEMPLATE_EN,
}
graphgen/templates/generation/multi_answer_generation.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt templates for generating multi-select (multiple-answer) questions.
# Each template carries two str.format placeholders — {context} and
# {num_of_questions} — and instructs the model to answer with
# <qa_pairs>/<qa_pair> XML blocks (question, options A-D, answer letters).

# Chinese-language template.
TEMPLATE_ZH = """请根据上下文资料生成独立的知识问答不定项选择题,每个选择题包含四个选项,其中有若干个正确答案(至少一个),其他为干扰项。

生成要求:
1. **语言一致性**:若上下文资料为中文,则生成中文问题;若为英文,则生成英文问题
2. **数量**:每个上下文资料生成{num_of_questions}个选择题
3. **独立性**:每个问题必须完整独立,不依赖其他问题
4. **准确性**:正确答案必须能从原文直接得出,干扰项需合理且有区分度
5. **答案格式**:当有多个正确答案时,用逗号分隔选项字母,如"A, B, C"

输出格式:
<qa_pairs>
<qa_pair>
<question>问题文本</question>
<options>A. 选项A文本
B. 选项B文本
C. 选项C文本
D. 选项D文本</options>
<answer>正确答案选项字母(多个答案用逗号分隔)</answer>
</qa_pair>
</qa_pairs>

示例(根据iPad Air 2生成2题):
<qa_pairs>
<qa_pair>
<question>iPad Air 2的发布年份是?</question>
<options>A. 2012年
B. 2014年
C. 2015年
D. 2017年</options>
<answer>B</answer>
</qa_pair>
<qa_pair>
<question>以下哪些是 iPad Air 2 的特点?</question>
<options>A. Touch ID指纹识别功能
B. A8X高效处理器
C. 十百万像素前置相机
D. 八百万像素后置相机镜头</options>
<answer>A, B, D</answer>
</qa_pair>
</qa_pairs>


上下文资料:
{context}

请为以下资料生成{num_of_questions}个不定项选择题:
"""


# English-language template.
TEMPLATE_EN = """Generate independent multiple-select knowledge questions \
based on the provided context. Each question should contain four options \
with one or more correct answers and distractors.

Requirements:
1. **Language Consistency**: Generate in the same language as the context (Chinese/English)
2. **Quantity**: Generate {num_of_questions} questions per context
3. **Independence**: Each question must be self-contained
4. **Accuracy**: Correct answer(s) must be derivable from text, distractors should be plausible
5. **Answer Format**: For multiple correct answers, separate option letters with commas, e.g., "A, B, C"

Output Format:
<qa_pairs>
<qa_pair>
<question>Question text</question>
<options>A. Option A text
B. Option B text
C. Option C text
D. Option D text</options>
<answer>Correct option letter(s) (separate multiple answers with commas)</answer>
</qa_pair>
</qa_pairs>

Example (2 questions):
<qa_pairs>
<qa_pair>
<question>What are the features of iPad Air 2?</question>
<options>A. Touch ID fingerprint recognition
B. A8X processor
C. Ten-megapixel front camera
D. Eight-megapixel rear camera</options>
<answer>A, B, D</answer>
</qa_pair>
<qa_pair>
<question>When was iPad Air 2 discontinued?</question>
<options>A. March 21, 2016
B. March 21, 2017
C. October 22, 2017
D. October 16, 2016</options>
<answer>B</answer>
</qa_pair>
</qa_pairs>

Context:
{context}

Please generate {num_of_questions} multiple-select questions for the following context:
"""


# Language-keyed lookup consumed by MultiAnswerGenerator.build_prompt.
MAQ_GENERATION_PROMPT = {"zh": TEMPLATE_ZH, "en": TEMPLATE_EN}
graphgen/templates/generation/multi_choice_generation.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt templates for generating single-answer multiple-choice questions.
# Each template carries two str.format placeholders — {context} and
# {num_of_questions} — and instructs the model to answer with
# <qa_pairs>/<qa_pair> XML blocks (question, options A-D, one answer letter).

# Chinese-language template.
TEMPLATE_GENERATION_ZH: str = """请根据上下文资料生成独立的知识问答单选题,每个选择题包含四个选项,其中仅有一个正确答案,其他三个为干扰项。

生成要求:
1. **语言一致性**:若上下文资料为中文,则生成中文问题;若为英文,则生成英文问题
2. **数量**:每个上下文资料生成{num_of_questions}个选择题
3. **独立性**:每个问题必须完整独立,不依赖其他问题
4. **准确性**:正确答案必须能从原文直接得出,干扰项需合理且有区分度

输出格式:
<qa_pairs>
<qa_pair>
<question>问题文本</question>
<options>A. 选项A文本
B. 选项B文本
C. 选项C文本
D. 选项D文本</options>
<answer>正确答案选项字母</answer>
</qa_pair>
</qa_pairs>

示例(根据iPad Air 2生成2题):
<qa_pairs>
<qa_pair>
<question>iPad Air 2的发布年份是?</question>
<options>A. 2012年
B. 2014年
C. 2015年
D. 2017年</options>
<answer>B</answer>
</qa_pair>
<qa_pair>
<question>iPad Air 2搭载的处理器型号是?</question>
<options>A. A8
B. A9X
C. A8X
D. A10</options>
<answer>C</answer>
</qa_pair>
</qa_pairs>


上下文资料:
{context}

请为以下资料生成{num_of_questions}个选择题:
"""

# English-language template.
TEMPLATE_GENERATION_EN: str = """Generate independent multiple-choice questions \
based on the provided context. Each question should contain four options \
with only one correct answer and three distractors.

Requirements:
1. **Language Consistency**: Generate in the same language as the context (Chinese/English)
2. **Quantity**: Generate {num_of_questions} questions per context
3. **Independence**: Each question must be self-contained
4. **Accuracy**: Correct answer must be derivable from text, distractors should be plausible

Output Format:
<qa_pairs>
<qa_pair>
<question>Question text</question>
<options>A. Option A text
B. Option B text
C. Option C text
D. Option D text</options>
<answer>Correct option letter</answer>
</qa_pair>
</qa_pairs>

Example (2 questions):
<qa_pairs>
<qa_pair>
<question>What year was the iPad Air 2 released?</question>
<options>A. 2012
B. 2014
C. 2015
D. 2017</options>
<answer>B</answer>
</qa_pair>
<qa_pair>
<question>Which processor does iPad Air 2 use?</question>
<options>A. A8
B. A9X
C. A8X
D. A10</options>
<answer>C</answer>
</qa_pair>
</qa_pairs>

Context:
{context}

Please generate {num_of_questions} questions for the following context:
"""


# Language-keyed lookup consumed by MultiChoiceGenerator.build_prompt.
MCQ_GENERATION_PROMPT = {"zh": TEMPLATE_GENERATION_ZH, "en": TEMPLATE_GENERATION_EN}
graphgen/templates/question_generation.py DELETED
@@ -1,32 +0,0 @@
1
- # pylint: disable=C0301
2
-
3
-
4
- # TODO: 修改这里的prompt
5
- TEMPLATE_MULTI_EN = """You are an assistant to help read a article and then rephrase it in a question answering format. The user will provide you with an article with its content. You need to generate a paraphrase of the same article in question and answer format with one tag of "Question: ..." followed by "Answer: ...". Remember to keep the meaning and every content of the article intact.
6
-
7
- Here is the format you should follow for your response:
8
- Question: <Question>
9
- Answer: <Answer>
10
-
11
- Here is the article you need to rephrase:
12
- {doc}
13
- """
14
-
15
- TEMPLATE_MULTI_ZH = """你是一位助手,帮助阅读一篇文章,然后以问答格式重述它。用户将为您提供一篇带有内容的文章。你需要以一个标签"问题:..."为开头,接着是"答案:...",生成一篇与原文章相同的问答格式的重述。请确保保持文章的意义和每个内容不变。
16
-
17
- 以下是你应该遵循的响应格式:
18
- 问题: <问题>
19
- 答案: <答案>
20
-
21
- 以下是你需要重述的文章:
22
- {doc}
23
- """
24
-
25
- QUESTION_GENERATION_PROMPT = {
26
- "English": {
27
- "MULTI_TEMPLATE": TEMPLATE_MULTI_EN,
28
- },
29
- "Chinese": {
30
- "MULTI_TEMPLATE": TEMPLATE_MULTI_ZH,
31
- },
32
- }