github-actions[bot] committed on
Commit
1189434
·
1 Parent(s): 8e67692

Auto-sync from demo at Fri Nov 21 06:06:05 UTC 2025

Browse files
graphgen/graphgen.py CHANGED
@@ -237,6 +237,7 @@ class GraphGen:
237
  self.graph_storage,
238
  self.rephrase_storage,
239
  re_judge,
 
240
  )
241
 
242
  await self.rephrase_storage.index_done_callback()
 
237
  self.graph_storage,
238
  self.rephrase_storage,
239
  re_judge,
240
+ progress_bar=self.progress_bar,
241
  )
242
 
243
  await self.rephrase_storage.index_done_callback()
graphgen/operators/partition/pre_tokenize.py CHANGED
@@ -1,6 +1,8 @@
1
  import asyncio
2
  from typing import List, Tuple
3
 
 
 
4
  from graphgen.bases import BaseGraphStorage, BaseTokenizer
5
  from graphgen.utils import run_concurrent
6
 
@@ -10,9 +12,11 @@ async def pre_tokenize(
10
  tokenizer: BaseTokenizer,
11
  edges: List[Tuple],
12
  nodes: List[Tuple],
 
 
13
  ) -> Tuple[List, List]:
14
  """为 edges/nodes 补 token-length 并回写存储,并发 1000,带进度条。"""
15
- sem = asyncio.Semaphore(1000)
16
 
17
  async def _patch_and_write(obj: Tuple, *, is_node: bool) -> Tuple:
18
  async with sem:
@@ -35,11 +39,15 @@ async def pre_tokenize(
35
  lambda e: _patch_and_write(e, is_node=False),
36
  edges,
37
  desc="Pre-tokenizing edges",
 
 
38
  ),
39
  run_concurrent(
40
  lambda n: _patch_and_write(n, is_node=True),
41
  nodes,
42
  desc="Pre-tokenizing nodes",
 
 
43
  ),
44
  )
45
 
 
1
  import asyncio
2
  from typing import List, Tuple
3
 
4
+ import gradio as gr
5
+
6
  from graphgen.bases import BaseGraphStorage, BaseTokenizer
7
  from graphgen.utils import run_concurrent
8
 
 
12
  tokenizer: BaseTokenizer,
13
  edges: List[Tuple],
14
  nodes: List[Tuple],
15
+ progress_bar: gr.Progress = None,
16
+ max_concurrent: int = 1000,
17
  ) -> Tuple[List, List]:
18
  """为 edges/nodes 补 token-length 并回写存储,并发 1000,带进度条。"""
19
+ sem = asyncio.Semaphore(max_concurrent)
20
 
21
  async def _patch_and_write(obj: Tuple, *, is_node: bool) -> Tuple:
22
  async with sem:
 
39
  lambda e: _patch_and_write(e, is_node=False),
40
  edges,
41
  desc="Pre-tokenizing edges",
42
+ unit="edge",
43
+ progress_bar=progress_bar,
44
  ),
45
  run_concurrent(
46
  lambda n: _patch_and_write(n, is_node=True),
47
  nodes,
48
  desc="Pre-tokenizing nodes",
49
+ unit="node",
50
+ progress_bar=progress_bar,
51
  ),
52
  )
53
 
graphgen/operators/quiz_and_judge/judge.py CHANGED
@@ -1,12 +1,11 @@
1
- import asyncio
2
  import math
3
 
4
- from tqdm.asyncio import tqdm as tqdm_async
5
 
6
  from graphgen.bases import BaseLLMWrapper
7
  from graphgen.models import JsonKVStorage, NetworkXStorage
8
  from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
9
- from graphgen.utils import logger, yes_no_loss_entropy
10
 
11
 
12
  async def judge_statement( # pylint: disable=too-many-statements
@@ -14,7 +13,7 @@ async def judge_statement( # pylint: disable=too-many-statements
14
  graph_storage: NetworkXStorage,
15
  rephrase_storage: JsonKVStorage,
16
  re_judge: bool = False,
17
- max_concurrent: int = 1000,
18
  ) -> NetworkXStorage:
19
  """
20
  Get all edges and nodes and judge them
@@ -23,128 +22,124 @@ async def judge_statement( # pylint: disable=too-many-statements
23
  :param graph_storage: graph storage instance
24
  :param rephrase_storage: rephrase storage instance
25
  :param re_judge: re-judge the relations
26
- :param max_concurrent: max concurrent
27
  :return:
28
  """
29
 
30
- semaphore = asyncio.Semaphore(max_concurrent)
31
-
32
  async def _judge_single_relation(
33
  edge: tuple,
34
  ):
35
- async with semaphore:
36
- source_id = edge[0]
37
- target_id = edge[1]
38
- edge_data = edge[2]
39
-
40
- if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
41
- logger.debug(
42
- "Edge %s -> %s already judged, loss: %s, skip",
43
- source_id,
44
- target_id,
45
- edge_data["loss"],
46
- )
47
- return source_id, target_id, edge_data
48
 
49
- description = edge_data["description"]
50
 
51
- try:
52
- descriptions = await rephrase_storage.get_by_id(description)
53
- assert descriptions is not None
54
 
55
- judgements = []
56
- gts = [gt for _, gt in descriptions]
57
- for description, gt in descriptions:
58
- judgement = await trainee_llm_client.generate_topk_per_token(
59
- STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(
60
- statement=description
61
- )
62
  )
63
- judgements.append(judgement[0].top_candidates)
 
64
 
65
- loss = yes_no_loss_entropy(judgements, gts)
66
 
67
- logger.debug(
68
- "Edge %s -> %s description: %s loss: %s",
69
- source_id,
70
- target_id,
71
- description,
72
- loss,
73
- )
74
 
75
- edge_data["loss"] = loss
76
- except Exception as e: # pylint: disable=broad-except
77
- logger.error(
78
- "Error in judging relation %s -> %s: %s", source_id, target_id, e
79
- )
80
- logger.info("Use default loss 0.1")
81
- edge_data["loss"] = -math.log(0.1)
82
 
83
- await graph_storage.update_edge(source_id, target_id, edge_data)
84
- return source_id, target_id, edge_data
85
 
86
  edges = await graph_storage.get_all_edges()
87
 
88
- results = []
89
- for result in tqdm_async(
90
- asyncio.as_completed([_judge_single_relation(edge) for edge in edges]),
91
- total=len(edges),
92
  desc="Judging relations",
93
- ):
94
- results.append(await result)
 
95
 
96
  async def _judge_single_entity(
97
  node: tuple,
98
  ):
99
- async with semaphore:
100
- node_id = node[0]
101
- node_data = node[1]
102
 
103
- if (not re_judge) and "loss" in node_data and node_data["loss"] is not None:
104
- logger.debug(
105
- "Node %s already judged, loss: %s, skip", node_id, node_data["loss"]
106
- )
107
- return node_id, node_data
108
 
109
- description = node_data["description"]
110
 
111
- try:
112
- descriptions = await rephrase_storage.get_by_id(description)
113
- assert descriptions is not None
114
 
115
- judgements = []
116
- gts = [gt for _, gt in descriptions]
117
- for description, gt in descriptions:
118
- judgement = await trainee_llm_client.generate_topk_per_token(
119
- STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(
120
- statement=description
121
- )
122
  )
123
- judgements.append(judgement[0].top_candidates)
 
124
 
125
- loss = yes_no_loss_entropy(judgements, gts)
126
 
127
- logger.debug(
128
- "Node %s description: %s loss: %s", node_id, description, loss
129
- )
130
 
131
- node_data["loss"] = loss
132
- except Exception as e: # pylint: disable=broad-except
133
- logger.error("Error in judging entity %s: %s", node_id, e)
134
- logger.error("Use default loss 0.1")
135
- node_data["loss"] = -math.log(0.1)
136
 
137
- await graph_storage.update_node(node_id, node_data)
138
- return node_id, node_data
139
 
140
  nodes = await graph_storage.get_all_nodes()
141
 
142
- results = []
143
- for result in tqdm_async(
144
- asyncio.as_completed([_judge_single_entity(node) for node in nodes]),
145
- total=len(nodes),
146
  desc="Judging entities",
147
- ):
148
- results.append(await result)
 
149
 
150
  return graph_storage
 
 
1
  import math
2
 
3
+ import gradio as gr
4
 
5
  from graphgen.bases import BaseLLMWrapper
6
  from graphgen.models import JsonKVStorage, NetworkXStorage
7
  from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
8
+ from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy
9
 
10
 
11
  async def judge_statement( # pylint: disable=too-many-statements
 
13
  graph_storage: NetworkXStorage,
14
  rephrase_storage: JsonKVStorage,
15
  re_judge: bool = False,
16
+ progress_bar: gr.Progress = None,
17
  ) -> NetworkXStorage:
18
  """
19
  Get all edges and nodes and judge them
 
22
  :param graph_storage: graph storage instance
23
  :param rephrase_storage: rephrase storage instance
24
  :param re_judge: re-judge the relations
25
+ :param progress_bar
26
  :return:
27
  """
28
 
 
 
29
  async def _judge_single_relation(
30
  edge: tuple,
31
  ):
32
+ source_id = edge[0]
33
+ target_id = edge[1]
34
+ edge_data = edge[2]
35
+
36
+ if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
37
+ logger.debug(
38
+ "Edge %s -> %s already judged, loss: %s, skip",
39
+ source_id,
40
+ target_id,
41
+ edge_data["loss"],
42
+ )
43
+ return source_id, target_id, edge_data
 
44
 
45
+ description = edge_data["description"]
46
 
47
+ try:
48
+ descriptions = await rephrase_storage.get_by_id(description)
49
+ assert descriptions is not None
50
 
51
+ judgements = []
52
+ gts = [gt for _, gt in descriptions]
53
+ for description, gt in descriptions:
54
+ judgement = await trainee_llm_client.generate_topk_per_token(
55
+ STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(
56
+ statement=description
 
57
  )
58
+ )
59
+ judgements.append(judgement[0].top_candidates)
60
 
61
+ loss = yes_no_loss_entropy(judgements, gts)
62
 
63
+ logger.debug(
64
+ "Edge %s -> %s description: %s loss: %s",
65
+ source_id,
66
+ target_id,
67
+ description,
68
+ loss,
69
+ )
70
 
71
+ edge_data["loss"] = loss
72
+ except Exception as e: # pylint: disable=broad-except
73
+ logger.error(
74
+ "Error in judging relation %s -> %s: %s", source_id, target_id, e
75
+ )
76
+ logger.info("Use default loss 0.1")
77
+ edge_data["loss"] = -math.log(0.1)
78
 
79
+ await graph_storage.update_edge(source_id, target_id, edge_data)
80
+ return source_id, target_id, edge_data
81
 
82
  edges = await graph_storage.get_all_edges()
83
 
84
+ await run_concurrent(
85
+ _judge_single_relation,
86
+ edges,
 
87
  desc="Judging relations",
88
+ unit="relation",
89
+ progress_bar=progress_bar,
90
+ )
91
 
92
  async def _judge_single_entity(
93
  node: tuple,
94
  ):
95
+ node_id = node[0]
96
+ node_data = node[1]
 
97
 
98
+ if (not re_judge) and "loss" in node_data and node_data["loss"] is not None:
99
+ logger.debug(
100
+ "Node %s already judged, loss: %s, skip", node_id, node_data["loss"]
101
+ )
102
+ return node_id, node_data
103
 
104
+ description = node_data["description"]
105
 
106
+ try:
107
+ descriptions = await rephrase_storage.get_by_id(description)
108
+ assert descriptions is not None
109
 
110
+ judgements = []
111
+ gts = [gt for _, gt in descriptions]
112
+ for description, gt in descriptions:
113
+ judgement = await trainee_llm_client.generate_topk_per_token(
114
+ STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(
115
+ statement=description
 
116
  )
117
+ )
118
+ judgements.append(judgement[0].top_candidates)
119
 
120
+ loss = yes_no_loss_entropy(judgements, gts)
121
 
122
+ logger.debug(
123
+ "Node %s description: %s loss: %s", node_id, description, loss
124
+ )
125
 
126
+ node_data["loss"] = loss
127
+ except Exception as e: # pylint: disable=broad-except
128
+ logger.error("Error in judging entity %s: %s", node_id, e)
129
+ logger.error("Use default loss 0.1")
130
+ node_data["loss"] = -math.log(0.1)
131
 
132
+ await graph_storage.update_node(node_id, node_data)
133
+ return node_id, node_data
134
 
135
  nodes = await graph_storage.get_all_nodes()
136
 
137
+ await run_concurrent(
138
+ _judge_single_entity,
139
+ nodes,
 
140
  desc="Judging entities",
141
+ unit="entity",
142
+ progress_bar=progress_bar,
143
+ )
144
 
145
  return graph_storage