Spaces:
Sleeping
Sleeping
github-actions[bot]
committed on
Commit
·
1189434
1
Parent(s):
8e67692
Auto-sync from demo at Fri Nov 21 06:06:05 UTC 2025
Browse files
graphgen/graphgen.py
CHANGED
|
@@ -237,6 +237,7 @@ class GraphGen:
|
|
| 237 |
self.graph_storage,
|
| 238 |
self.rephrase_storage,
|
| 239 |
re_judge,
|
|
|
|
| 240 |
)
|
| 241 |
|
| 242 |
await self.rephrase_storage.index_done_callback()
|
|
|
|
| 237 |
self.graph_storage,
|
| 238 |
self.rephrase_storage,
|
| 239 |
re_judge,
|
| 240 |
+
progress_bar=self.progress_bar,
|
| 241 |
)
|
| 242 |
|
| 243 |
await self.rephrase_storage.index_done_callback()
|
graphgen/operators/partition/pre_tokenize.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import asyncio
|
| 2 |
from typing import List, Tuple
|
| 3 |
|
|
|
|
|
|
|
| 4 |
from graphgen.bases import BaseGraphStorage, BaseTokenizer
|
| 5 |
from graphgen.utils import run_concurrent
|
| 6 |
|
|
@@ -10,9 +12,11 @@ async def pre_tokenize(
|
|
| 10 |
tokenizer: BaseTokenizer,
|
| 11 |
edges: List[Tuple],
|
| 12 |
nodes: List[Tuple],
|
|
|
|
|
|
|
| 13 |
) -> Tuple[List, List]:
|
| 14 |
"""为 edges/nodes 补 token-length 并回写存储,并发 1000,带进度条。"""
|
| 15 |
-
sem = asyncio.Semaphore(
|
| 16 |
|
| 17 |
async def _patch_and_write(obj: Tuple, *, is_node: bool) -> Tuple:
|
| 18 |
async with sem:
|
|
@@ -35,11 +39,15 @@ async def pre_tokenize(
|
|
| 35 |
lambda e: _patch_and_write(e, is_node=False),
|
| 36 |
edges,
|
| 37 |
desc="Pre-tokenizing edges",
|
|
|
|
|
|
|
| 38 |
),
|
| 39 |
run_concurrent(
|
| 40 |
lambda n: _patch_and_write(n, is_node=True),
|
| 41 |
nodes,
|
| 42 |
desc="Pre-tokenizing nodes",
|
|
|
|
|
|
|
| 43 |
),
|
| 44 |
)
|
| 45 |
|
|
|
|
| 1 |
import asyncio
|
| 2 |
from typing import List, Tuple
|
| 3 |
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
from graphgen.bases import BaseGraphStorage, BaseTokenizer
|
| 7 |
from graphgen.utils import run_concurrent
|
| 8 |
|
|
|
|
| 12 |
tokenizer: BaseTokenizer,
|
| 13 |
edges: List[Tuple],
|
| 14 |
nodes: List[Tuple],
|
| 15 |
+
progress_bar: gr.Progress = None,
|
| 16 |
+
max_concurrent: int = 1000,
|
| 17 |
) -> Tuple[List, List]:
|
| 18 |
"""为 edges/nodes 补 token-length 并回写存储,并发 1000,带进度条。"""
|
| 19 |
+
sem = asyncio.Semaphore(max_concurrent)
|
| 20 |
|
| 21 |
async def _patch_and_write(obj: Tuple, *, is_node: bool) -> Tuple:
|
| 22 |
async with sem:
|
|
|
|
| 39 |
lambda e: _patch_and_write(e, is_node=False),
|
| 40 |
edges,
|
| 41 |
desc="Pre-tokenizing edges",
|
| 42 |
+
unit="edge",
|
| 43 |
+
progress_bar=progress_bar,
|
| 44 |
),
|
| 45 |
run_concurrent(
|
| 46 |
lambda n: _patch_and_write(n, is_node=True),
|
| 47 |
nodes,
|
| 48 |
desc="Pre-tokenizing nodes",
|
| 49 |
+
unit="node",
|
| 50 |
+
progress_bar=progress_bar,
|
| 51 |
),
|
| 52 |
)
|
| 53 |
|
graphgen/operators/quiz_and_judge/judge.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
import math
|
| 3 |
|
| 4 |
-
|
| 5 |
|
| 6 |
from graphgen.bases import BaseLLMWrapper
|
| 7 |
from graphgen.models import JsonKVStorage, NetworkXStorage
|
| 8 |
from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
|
| 9 |
-
from graphgen.utils import logger, yes_no_loss_entropy
|
| 10 |
|
| 11 |
|
| 12 |
async def judge_statement( # pylint: disable=too-many-statements
|
|
@@ -14,7 +13,7 @@ async def judge_statement( # pylint: disable=too-many-statements
|
|
| 14 |
graph_storage: NetworkXStorage,
|
| 15 |
rephrase_storage: JsonKVStorage,
|
| 16 |
re_judge: bool = False,
|
| 17 |
-
|
| 18 |
) -> NetworkXStorage:
|
| 19 |
"""
|
| 20 |
Get all edges and nodes and judge them
|
|
@@ -23,128 +22,124 @@ async def judge_statement( # pylint: disable=too-many-statements
|
|
| 23 |
:param graph_storage: graph storage instance
|
| 24 |
:param rephrase_storage: rephrase storage instance
|
| 25 |
:param re_judge: re-judge the relations
|
| 26 |
-
:param
|
| 27 |
:return:
|
| 28 |
"""
|
| 29 |
|
| 30 |
-
semaphore = asyncio.Semaphore(max_concurrent)
|
| 31 |
-
|
| 32 |
async def _judge_single_relation(
|
| 33 |
edge: tuple,
|
| 34 |
):
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
return source_id, target_id, edge_data
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
)
|
| 62 |
)
|
| 63 |
-
|
|
|
|
| 64 |
|
| 65 |
-
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
|
| 86 |
edges = await graph_storage.get_all_edges()
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
total=len(edges),
|
| 92 |
desc="Judging relations",
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
|
| 96 |
async def _judge_single_entity(
|
| 97 |
node: tuple,
|
| 98 |
):
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
node_data = node[1]
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
|
| 109 |
-
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
)
|
| 122 |
)
|
| 123 |
-
|
|
|
|
| 124 |
|
| 125 |
-
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
|
| 140 |
nodes = await graph_storage.get_all_nodes()
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
total=len(nodes),
|
| 146 |
desc="Judging entities",
|
| 147 |
-
|
| 148 |
-
|
|
|
|
| 149 |
|
| 150 |
return graph_storage
|
|
|
|
|
|
|
| 1 |
import math
|
| 2 |
|
| 3 |
+
import gradio as gr
|
| 4 |
|
| 5 |
from graphgen.bases import BaseLLMWrapper
|
| 6 |
from graphgen.models import JsonKVStorage, NetworkXStorage
|
| 7 |
from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
|
| 8 |
+
from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy
|
| 9 |
|
| 10 |
|
| 11 |
async def judge_statement( # pylint: disable=too-many-statements
|
|
|
|
| 13 |
graph_storage: NetworkXStorage,
|
| 14 |
rephrase_storage: JsonKVStorage,
|
| 15 |
re_judge: bool = False,
|
| 16 |
+
progress_bar: gr.Progress = None,
|
| 17 |
) -> NetworkXStorage:
|
| 18 |
"""
|
| 19 |
Get all edges and nodes and judge them
|
|
|
|
| 22 |
:param graph_storage: graph storage instance
|
| 23 |
:param rephrase_storage: rephrase storage instance
|
| 24 |
:param re_judge: re-judge the relations
|
| 25 |
+
:param progress_bar
|
| 26 |
:return:
|
| 27 |
"""
|
| 28 |
|
|
|
|
|
|
|
| 29 |
async def _judge_single_relation(
|
| 30 |
edge: tuple,
|
| 31 |
):
|
| 32 |
+
source_id = edge[0]
|
| 33 |
+
target_id = edge[1]
|
| 34 |
+
edge_data = edge[2]
|
| 35 |
+
|
| 36 |
+
if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
|
| 37 |
+
logger.debug(
|
| 38 |
+
"Edge %s -> %s already judged, loss: %s, skip",
|
| 39 |
+
source_id,
|
| 40 |
+
target_id,
|
| 41 |
+
edge_data["loss"],
|
| 42 |
+
)
|
| 43 |
+
return source_id, target_id, edge_data
|
|
|
|
| 44 |
|
| 45 |
+
description = edge_data["description"]
|
| 46 |
|
| 47 |
+
try:
|
| 48 |
+
descriptions = await rephrase_storage.get_by_id(description)
|
| 49 |
+
assert descriptions is not None
|
| 50 |
|
| 51 |
+
judgements = []
|
| 52 |
+
gts = [gt for _, gt in descriptions]
|
| 53 |
+
for description, gt in descriptions:
|
| 54 |
+
judgement = await trainee_llm_client.generate_topk_per_token(
|
| 55 |
+
STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(
|
| 56 |
+
statement=description
|
|
|
|
| 57 |
)
|
| 58 |
+
)
|
| 59 |
+
judgements.append(judgement[0].top_candidates)
|
| 60 |
|
| 61 |
+
loss = yes_no_loss_entropy(judgements, gts)
|
| 62 |
|
| 63 |
+
logger.debug(
|
| 64 |
+
"Edge %s -> %s description: %s loss: %s",
|
| 65 |
+
source_id,
|
| 66 |
+
target_id,
|
| 67 |
+
description,
|
| 68 |
+
loss,
|
| 69 |
+
)
|
| 70 |
|
| 71 |
+
edge_data["loss"] = loss
|
| 72 |
+
except Exception as e: # pylint: disable=broad-except
|
| 73 |
+
logger.error(
|
| 74 |
+
"Error in judging relation %s -> %s: %s", source_id, target_id, e
|
| 75 |
+
)
|
| 76 |
+
logger.info("Use default loss 0.1")
|
| 77 |
+
edge_data["loss"] = -math.log(0.1)
|
| 78 |
|
| 79 |
+
await graph_storage.update_edge(source_id, target_id, edge_data)
|
| 80 |
+
return source_id, target_id, edge_data
|
| 81 |
|
| 82 |
edges = await graph_storage.get_all_edges()
|
| 83 |
|
| 84 |
+
await run_concurrent(
|
| 85 |
+
_judge_single_relation,
|
| 86 |
+
edges,
|
|
|
|
| 87 |
desc="Judging relations",
|
| 88 |
+
unit="relation",
|
| 89 |
+
progress_bar=progress_bar,
|
| 90 |
+
)
|
| 91 |
|
| 92 |
async def _judge_single_entity(
|
| 93 |
node: tuple,
|
| 94 |
):
|
| 95 |
+
node_id = node[0]
|
| 96 |
+
node_data = node[1]
|
|
|
|
| 97 |
|
| 98 |
+
if (not re_judge) and "loss" in node_data and node_data["loss"] is not None:
|
| 99 |
+
logger.debug(
|
| 100 |
+
"Node %s already judged, loss: %s, skip", node_id, node_data["loss"]
|
| 101 |
+
)
|
| 102 |
+
return node_id, node_data
|
| 103 |
|
| 104 |
+
description = node_data["description"]
|
| 105 |
|
| 106 |
+
try:
|
| 107 |
+
descriptions = await rephrase_storage.get_by_id(description)
|
| 108 |
+
assert descriptions is not None
|
| 109 |
|
| 110 |
+
judgements = []
|
| 111 |
+
gts = [gt for _, gt in descriptions]
|
| 112 |
+
for description, gt in descriptions:
|
| 113 |
+
judgement = await trainee_llm_client.generate_topk_per_token(
|
| 114 |
+
STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(
|
| 115 |
+
statement=description
|
|
|
|
| 116 |
)
|
| 117 |
+
)
|
| 118 |
+
judgements.append(judgement[0].top_candidates)
|
| 119 |
|
| 120 |
+
loss = yes_no_loss_entropy(judgements, gts)
|
| 121 |
|
| 122 |
+
logger.debug(
|
| 123 |
+
"Node %s description: %s loss: %s", node_id, description, loss
|
| 124 |
+
)
|
| 125 |
|
| 126 |
+
node_data["loss"] = loss
|
| 127 |
+
except Exception as e: # pylint: disable=broad-except
|
| 128 |
+
logger.error("Error in judging entity %s: %s", node_id, e)
|
| 129 |
+
logger.error("Use default loss 0.1")
|
| 130 |
+
node_data["loss"] = -math.log(0.1)
|
| 131 |
|
| 132 |
+
await graph_storage.update_node(node_id, node_data)
|
| 133 |
+
return node_id, node_data
|
| 134 |
|
| 135 |
nodes = await graph_storage.get_all_nodes()
|
| 136 |
|
| 137 |
+
await run_concurrent(
|
| 138 |
+
_judge_single_entity,
|
| 139 |
+
nodes,
|
|
|
|
| 140 |
desc="Judging entities",
|
| 141 |
+
unit="entity",
|
| 142 |
+
progress_bar=progress_bar,
|
| 143 |
+
)
|
| 144 |
|
| 145 |
return graph_storage
|