GeoLLM / revision /convert.py
Pengfa Li
Upload folder using huggingface_hub
badcf3c verified
import json
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
from datetime import datetime
def parse_knowledge_graph_txt(txt_file):
    """Parse a knowledge-graph text export into node and edge tuples.

    The file is expected to contain a node section (introduced by
    ``【节点信息】``) and an edge section (introduced by ``【边关系信息】``),
    each followed by a dashed separator line and terminated by a line of
    80 ``=`` characters.

    Args:
        txt_file: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(nodes, edges)`` tuple where

        * ``nodes`` is a list of ``(id, label, degree, importance)`` with
          zero-based integer ids, and
        * ``edges`` is a list of ``(source_id, target_id, relation, weight)``.

        Edges whose endpoints do not match any parsed node label are dropped.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(txt_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # --- node section -----------------------------------------------------
    nodes = []
    node_section_match = re.search(
        r'【节点信息】.*?格式: 节点名称.*?\n-{40}\n(.*?)\n={80}', content, re.DOTALL)
    if node_section_match:
        # Line format: "  1. <label> | 度数: X | 重要性: <level>"
        node_pattern = re.compile(
            r'\s*(\d+)\.\s+(.*?)\s+\|\s+度数:\s+(\d+)\s+\|\s+重要性:\s+(\S+)')
        for line in node_section_match.group(1).strip().split('\n'):
            match = node_pattern.match(line)
            if match:
                nodes.append((
                    int(match.group(1)) - 1,  # file numbering is 1-based
                    match.group(2).strip(),
                    int(match.group(3)),
                    match.group(4),
                ))

    # Resolve labels through a dict instead of rescanning the node list for
    # every edge.  When a label occurs more than once the last node wins,
    # which matches what a full linear scan without ``break`` would select.
    label_to_id = {label: node_id for node_id, label, _, _ in nodes}

    # --- edge section -----------------------------------------------------
    edges = []
    edge_section_match = re.search(
        r'【边关系信息】.*?格式: 节点1.*?\n-{60}\n(.*?)\n={80}', content, re.DOTALL)
    if edge_section_match:
        # Line format: "  1. <node1> <--[<relation>]--> <node2> | 重要性分数: X"
        edge_pattern = re.compile(
            r'\s*(\d+)\.\s+(.*?)\s+<--\[(.*?)\]-->\s+(.*?)\s+\|\s+重要性分数:\s+(\d+)')
        for line in edge_section_match.group(1).strip().split('\n'):
            match = edge_pattern.match(line)
            if not match:
                continue
            source_id = label_to_id.get(match.group(2).strip())
            target_id = label_to_id.get(match.group(4).strip())
            # Skip edges that reference a label absent from the node section.
            if source_id is not None and target_id is not None:
                edges.append((
                    source_id,
                    target_id,
                    match.group(3).strip(),
                    int(match.group(5)),
                ))

    return nodes, edges
def create_gexf_from_txt(txt_file, output_file='knowledge_graph.gexf',
                         graph_title="Knowledge Graph", creator="Knowledge Graph Generator"):
    """Convert a knowledge-graph text export into a GEXF 1.2 file.

    Args:
        txt_file: Path of the text file understood by
            ``parse_knowledge_graph_txt``.
        output_file: Path of the GEXF file to write.
        graph_title: Title used in the ``<description>`` meta element.
        creator: Text of the ``<creator>`` meta element.

    Returns:
        ``output_file`` on success, or ``None`` (after printing a warning)
        when the text file yields no nodes.
    """
    nodes, edges = parse_knowledge_graph_txt(txt_file)
    if not nodes:
        print("警告:未找到节点数据")
        return

    # Root element.  The namespace declarations are written as plain
    # attributes so that the "viz:" prefixed tags below serialize verbatim.
    gexf = ET.Element('gexf')
    gexf.set('xmlns', 'http://www.gexf.net/1.2draft')
    gexf.set('xmlns:viz', 'http://www.gexf.net/1.2draft/viz')
    gexf.set('version', '1.2')

    # Meta information.
    meta = ET.SubElement(gexf, 'meta')
    meta.set('lastmodifieddate', datetime.now().strftime('%Y-%m-%d'))
    creator_elem = ET.SubElement(meta, 'creator')
    creator_elem.text = creator
    description = ET.SubElement(meta, 'description')
    description.text = f"{graph_title} - Generated from knowledge graph data"

    # Graph element.
    graph = ET.SubElement(gexf, 'graph')
    graph.set('defaultedgetype', 'undirected')
    graph.set('mode', 'static')

    # Node attribute declarations.
    attributes = ET.SubElement(graph, 'attributes')
    attributes.set('class', 'node')
    degree_attr = ET.SubElement(attributes, 'attribute')
    degree_attr.set('id', 'degree')
    degree_attr.set('title', 'Degree')
    degree_attr.set('type', 'integer')
    importance_attr = ET.SubElement(attributes, 'attribute')
    importance_attr.set('id', 'importance')
    importance_attr.set('title', 'Importance')
    importance_attr.set('type', 'string')

    # Edge attribute declarations: both "label" and "relationship".
    edge_attributes = ET.SubElement(graph, 'attributes')
    edge_attributes.set('class', 'edge')
    label_attr = ET.SubElement(edge_attributes, 'attribute')
    label_attr.set('id', 'label')
    label_attr.set('title', 'Label')
    label_attr.set('type', 'string')
    relationship_attr = ET.SubElement(edge_attributes, 'attribute')
    relationship_attr.set('id', 'relationship')
    relationship_attr.set('title', 'Relationship')
    relationship_attr.set('type', 'string')

    # Nodes.
    nodes_elem = ET.SubElement(graph, 'nodes')
    nodes_elem.set('count', str(len(nodes)))

    # Degree range drives the node size scaling.
    max_degree = max(degree for _, _, degree, _ in nodes) if nodes else 1
    min_degree = min(degree for _, _, degree, _ in nodes) if nodes else 1

    for node_id, label, degree, importance in nodes:
        node = ET.SubElement(nodes_elem, 'node')
        node.set('id', str(node_id))
        node.set('label', label)

        attvalues = ET.SubElement(node, 'attvalues')
        degree_val = ET.SubElement(attvalues, 'attvalue')
        degree_val.set('for', 'degree')
        degree_val.set('value', str(degree))
        importance_val = ET.SubElement(attvalues, 'attvalue')
        importance_val.set('for', 'importance')
        importance_val.set('value', importance)

        # Node size is scaled linearly into the range 5..30 by degree; a
        # fixed size is used when all degrees are equal.
        viz = ET.SubElement(node, 'viz:size')
        if max_degree > min_degree:
            size = 5 + 25 * (degree - min_degree) / (max_degree - min_degree)
        else:
            size = 15
        viz.set('value', f'{size:.1f}')

        # Node colour depends on the importance level.
        color = ET.SubElement(node, 'viz:color')
        if importance == '高':
            color.set('r', '46')  # dark green
            color.set('g', '139')
            color.set('b', '87')
        elif importance == '中':
            color.set('r', '144')  # light green
            color.set('g', '238')
            color.set('b', '144')
        else:  # low (or any other value)
            color.set('r', '224')  # very light green
            color.set('g', '255')
            color.set('b', '224')
        color.set('a', '0.8')

    # Edges.
    edges_elem = ET.SubElement(graph, 'edges')
    edges_elem.set('count', str(len(edges)))

    # The weight range is the same for every edge, so compute it once
    # instead of rescanning all edges on each loop iteration.
    weights = [w for _, _, _, w in edges]
    max_weight = max(weights) if weights else 1
    min_weight = min(weights) if weights else 1

    for edge_id, (source_id, target_id, relation, weight) in enumerate(edges):
        edge = ET.SubElement(edges_elem, 'edge')
        edge.set('id', str(edge_id))
        edge.set('source', str(source_id))
        edge.set('target', str(target_id))
        edge.set('weight', str(weight))
        edge.set('label', relation)  # native edge label

        # The relation is also stored under both declared edge attributes.
        attvalues = ET.SubElement(edge, 'attvalues')
        label_val = ET.SubElement(attvalues, 'attvalue')
        label_val.set('for', 'label')
        label_val.set('value', relation)
        relationship_val = ET.SubElement(attvalues, 'attvalue')
        relationship_val.set('for', 'relationship')
        relationship_val.set('value', relation)  # same value as "label"

        edge_color = ET.SubElement(edge, 'viz:color')
        edge_color.set('r', '128')
        edge_color.set('g', '128')
        edge_color.set('b', '128')
        edge_color.set('a', '0.5')

        # Edge thickness is scaled linearly into 1..5 by weight.
        if max_weight > min_weight:
            thickness = 1 + 4 * (weight - min_weight) / (max_weight - min_weight)
        else:
            thickness = 2
        edge_thickness = ET.SubElement(edge, 'viz:thickness')
        edge_thickness.set('value', f'{thickness:.1f}')

    # Pretty-print through minidom, drop blank lines, and save.
    rough_string = ET.tostring(gexf, encoding='unicode')
    reparsed = minidom.parseString(rough_string)
    pretty_xml = reparsed.toprettyxml(indent=" ", encoding=None)
    pretty_lines = [line for line in pretty_xml.split('\n') if line.strip()]
    pretty_xml = '\n'.join(pretty_lines)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(pretty_xml)

    print(f"GEXF文件已生成: {output_file}")
    print(f"节点数: {len(nodes)}, 边数: {len(edges)}")
    print("边属性: 同时包含label和relationship属性,值完全一致")
    return output_file
# Keyword groups used to guess a node category from its label.  Order
# matters: structural keywords come first because "断层" also contains "层",
# which would otherwise always be claimed by the stratigraphic group.
_NODE_CATEGORY_KEYWORDS = (
    ("构造单元", ("构造", "断层", "褶皱")),
    ("地层单位", ("组", "段", "层")),
    ("岩石类型", ("岩", "石")),
    ("矿物组分", ("矿物", "成分")),
)

# Relation name -> relation type.
_RELATION_TYPES = {
    "位于": "空间关系", "分布于": "空间关系", "出露于": "空间关系",
    "含有": "组成关系", "包含": "组成关系", "夹有": "组成关系",
    "厚度": "数量关系", "长度": "数量关系", "宽度": "数量关系",
    "整合": "接触关系", "不整合": "接触关系", "断层接触": "接触关系",
    "发育": "特征关系", "具有": "特征关系", "呈现": "特征关系",
    "属于": "分类关系", "归属": "分类关系",
}

# Node RGB by importance level, then by category ("default" is the fallback).
_NODE_COLORS = {
    '高': {
        "地层单位": ('34', '139', '34'),     # forest green
        "岩石类型": ('139', '69', '19'),     # brown
        "构造单元": ('220', '20', '60'),     # crimson
        "default": ('46', '139', '87'),      # dark green
    },
    '中': {
        "地层单位": ('124', '252', '0'),     # lawn green
        "岩石类型": ('210', '180', '140'),   # tan
        "构造单元": ('255', '105', '180'),   # pink
        "default": ('144', '238', '144'),    # light green
    },
}
# Palette for the low level and for any unrecognised importance value.
_NODE_COLORS_LOW = {
    "地层单位": ('240', '255', '240'),       # very light green
    "岩石类型": ('245', '222', '179'),       # wheat
    "构造单元": ('255', '228', '225'),       # very light pink
    "default": ('224', '255', '224'),        # very light green
}

# Edge RGB by relation type; unknown types are drawn grey.
_EDGE_COLORS = {
    "空间关系": ('0', '0', '255'),           # blue
    "组成关系": ('255', '165', '0'),         # orange
    "数量关系": ('128', '0', '128'),         # purple
    "接触关系": ('255', '0', '0'),           # red
    "特征关系": ('0', '128', '0'),           # green
    "分类关系": ('255', '20', '147'),        # deep pink
}
_EDGE_COLOR_DEFAULT = ('128', '128', '128')  # grey


def _infer_node_category(label):
    """Return the category of the first keyword group matching *label*."""
    for category, keywords in _NODE_CATEGORY_KEYWORDS:
        if any(keyword in label for keyword in keywords):
            return category
    return "地质实体"


def _set_rgb(element, rgb):
    """Write an ``(r, g, b)`` triple of strings onto a viz colour element."""
    for channel, value in zip(('r', 'g', 'b'), rgb):
        element.set(channel, value)


def create_enhanced_gexf_from_data(nodes_data, edges_data, output_file='enhanced_knowledge_graph.gexf',
                                   graph_title="Enhanced Knowledge Graph"):
    """Build an enriched GEXF 1.2 file directly from node and edge data.

    Compared with the plain converter, every node also gets an inferred
    ``category`` attribute and an initial circular-layout position, and every
    edge gets an inferred ``relation_type`` attribute that selects its colour.

    Args:
        nodes_data: List of ``(label, degree, importance)`` tuples.  Node ids
            are the list indices.
        edges_data: List of ``(source_label, target_label, relation, weight)``
            tuples.  Edges whose endpoints are not node labels are skipped.
        output_file: Path of the GEXF file to write.
        graph_title: Title used in the ``<description>`` meta element.

    Returns:
        ``output_file``.
    """
    import math  # only needed for the circular layout below

    # Label -> node id (list index).
    label_to_id = {label: idx for idx, (label, _, _) in enumerate(nodes_data)}

    # Root element.  The namespace declarations are written as plain
    # attributes so that the "viz:" prefixed tags below serialize verbatim.
    gexf = ET.Element('gexf')
    gexf.set('xmlns', 'http://www.gexf.net/1.2draft')
    gexf.set('xmlns:viz', 'http://www.gexf.net/1.2draft/viz')
    gexf.set('version', '1.2')

    # Meta information.
    meta = ET.SubElement(gexf, 'meta')
    meta.set('lastmodifieddate', datetime.now().strftime('%Y-%m-%d+%H:%M'))
    creator_elem = ET.SubElement(meta, 'creator')
    creator_elem.text = "Knowledge Graph GEXF Generator v1.0"
    description = ET.SubElement(meta, 'description')
    description.text = f"{graph_title} - 地质知识图谱可视化数据"
    keywords = ET.SubElement(meta, 'keywords')
    keywords.text = "知识图谱,地质学,三元组,网络分析"

    # Graph element.
    graph = ET.SubElement(gexf, 'graph')
    graph.set('defaultedgetype', 'undirected')
    graph.set('mode', 'static')
    graph.set('timeformat', 'date')

    # Node attribute declarations (ids 0..2).
    attributes = ET.SubElement(graph, 'attributes')
    attributes.set('class', 'node')
    for attr_id, title, attr_type in (('0', 'degree', 'integer'),
                                      ('1', 'importance', 'string'),
                                      ('2', 'category', 'string')):
        attr = ET.SubElement(attributes, 'attribute')
        attr.set('id', attr_id)
        attr.set('title', title)
        attr.set('type', attr_type)

    # Edge attribute declarations (ids 0..2): label, relationship, relation_type.
    edge_attributes = ET.SubElement(graph, 'attributes')
    edge_attributes.set('class', 'edge')
    for attr_id, title in (('0', 'label'), ('1', 'relationship'), ('2', 'relation_type')):
        attr = ET.SubElement(edge_attributes, 'attribute')
        attr.set('id', attr_id)
        attr.set('title', title)
        attr.set('type', 'string')

    # Nodes.
    nodes_elem = ET.SubElement(graph, 'nodes')
    nodes_elem.set('count', str(len(nodes_data)))

    # Degree range drives the node size scaling.
    degrees = [degree for _, degree, _ in nodes_data]
    max_degree = max(degrees) if degrees else 1
    min_degree = min(degrees) if degrees else 1

    for idx, (label, degree, importance) in enumerate(nodes_data):
        node = ET.SubElement(nodes_elem, 'node')
        node.set('id', str(idx))
        node.set('label', label)

        category = _infer_node_category(label)

        attvalues = ET.SubElement(node, 'attvalues')
        for attr_id, value in (('0', str(degree)), ('1', importance), ('2', category)):
            attvalue = ET.SubElement(attvalues, 'attvalue')
            attvalue.set('for', attr_id)
            attvalue.set('value', value)

        # Node size is scaled linearly into the range 10..50 by degree; a
        # fixed size is used when all degrees are equal.
        viz_size = ET.SubElement(node, 'viz:size')
        if max_degree > min_degree:
            size = 10 + 40 * (degree - min_degree) / (max_degree - min_degree)
        else:
            size = 25
        viz_size.set('value', f'{size:.1f}')

        # Colour by importance level, refined by category.
        color = ET.SubElement(node, 'viz:color')
        palette = _NODE_COLORS.get(importance, _NODE_COLORS_LOW)
        _set_rgb(color, palette.get(category, palette["default"]))
        color.set('a', '0.9')

        # Initial position: nodes are spread evenly around a circle whose
        # radius grows with the node degree.
        position = ET.SubElement(node, 'viz:position')
        angle = 2 * math.pi * idx / len(nodes_data)
        radius = 100 + degree * 10
        position.set('x', f'{radius * math.cos(angle):.2f}')
        position.set('y', f'{radius * math.sin(angle):.2f}')
        position.set('z', '0.0')

    # Edges.
    edges_elem = ET.SubElement(graph, 'edges')

    # Weight range drives the edge thickness scaling.
    weights = [weight for _, _, _, weight in edges_data]
    max_weight = max(weights) if weights else 1
    min_weight = min(weights) if weights else 1

    written_edges = 0
    for edge_id, (source_label, target_label, relation, weight) in enumerate(edges_data):
        if source_label not in label_to_id or target_label not in label_to_id:
            continue  # endpoint is not a known node
        written_edges += 1

        edge = ET.SubElement(edges_elem, 'edge')
        edge.set('id', str(edge_id))
        edge.set('source', str(label_to_id[source_label]))
        edge.set('target', str(label_to_id[target_label]))
        edge.set('weight', str(weight))
        edge.set('label', relation)  # native edge label

        relation_type = _RELATION_TYPES.get(relation, "其他")

        # "label" and "relationship" deliberately carry the same value.
        attvalues = ET.SubElement(edge, 'attvalues')
        for attr_id, value in (('0', relation), ('1', relation), ('2', relation_type)):
            attvalue = ET.SubElement(attvalues, 'attvalue')
            attvalue.set('for', attr_id)
            attvalue.set('value', value)

        edge_color = ET.SubElement(edge, 'viz:color')
        _set_rgb(edge_color, _EDGE_COLORS.get(relation_type, _EDGE_COLOR_DEFAULT))
        edge_color.set('a', '0.7')

        # Edge thickness is scaled linearly into 1..6 by weight.
        if max_weight > min_weight:
            thickness = 1 + 5 * (weight - min_weight) / (max_weight - min_weight)
        else:
            thickness = 2
        edge_thickness = ET.SubElement(edge, 'viz:thickness')
        edge_thickness.set('value', f'{thickness:.1f}')

    # The declared count must match the edges actually written, not the
    # input length, because edges with unknown endpoints were skipped.
    edges_elem.set('count', str(written_edges))

    # Pretty-print through minidom, drop blank lines, and save.
    rough_string = ET.tostring(gexf, encoding='unicode')
    reparsed = minidom.parseString(rough_string)
    pretty_xml = reparsed.toprettyxml(indent=" ", encoding=None)
    pretty_lines = [line for line in pretty_xml.split('\n') if line.strip()]
    pretty_xml = '\n'.join(pretty_lines)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(pretty_xml)

    print(f"增强的GEXF文件已生成: {output_file}")
    print(f"节点数: {len(nodes_data)}, 边数: {written_edges}")
    print("可视化特性:")
    print("- 节点大小: 基于度数")
    print("- 节点颜色: 基于重要性和类别")
    print("- 边颜色: 基于关系类型")
    print("- 边粗细: 基于权重")
    print("- 边标签: 同时包含label和relationship属性,值完全一致")
    print("- Gephi可直接识别label属性,relationship属性可用于分类分析")
    return output_file
def fix_existing_gexf(input_file, output_file=None):
    """Repair a GEXF file whose edges use a ``relation`` attribute.

    The ``relation`` edge attribute is renamed to ``label``, a parallel
    ``relationship`` attribute carrying the same value is added, and each
    edge's native ``label`` XML attribute is set from that value.

    Errors are reported with a printed message instead of being raised, so
    the function always returns ``None``.

    Note: the GEXF and viz namespaces are registered with ElementTree (a
    process-wide setting) so the rewritten file keeps un-prefixed GEXF tags
    and the ``viz:`` prefix rather than generated ``ns0:``/``ns1:`` prefixes.

    Args:
        input_file: Path of the GEXF file to read.
        output_file: Path of the GEXF file to write; when ``None`` the input
            file is overwritten.
    """
    if output_file is None:
        output_file = input_file

    gexf_ns = 'http://www.gexf.net/1.2draft'

    def q(tag):
        """Return *tag* qualified with the GEXF namespace."""
        return '{%s}%s' % (gexf_ns, tag)

    try:
        ET.register_namespace('', gexf_ns)
        ET.register_namespace('viz', gexf_ns + '/viz')

        tree = ET.parse(input_file)
        root = tree.getroot()

        # Standard-library elements have no getparent(); build the
        # child -> parent mapping once instead.
        parent_of = {child: parent for parent in root.iter() for child in parent}

        # Edge attribute declarations.
        for attributes in root.findall('.//' + q('attributes') + '[@class="edge"]'):
            # Rename an existing "relation" declaration to "label".
            for attr in attributes.findall('.//' + q('attribute') + '[@id="relation"]'):
                attr.set('id', 'label')
                attr.set('title', 'Label')
                print("已修改边属性定义: relation -> label")

            # Declare "relationship" unless it is already present.  New
            # elements must carry the GEXF namespace like their siblings.
            existing_relationship = attributes.find(
                './/' + q('attribute') + '[@id="relationship"]')
            if existing_relationship is None:
                relationship_attr = ET.SubElement(attributes, q('attribute'))
                relationship_attr.set('id', 'relationship')
                relationship_attr.set('title', 'Relationship')
                relationship_attr.set('type', 'string')
                print("已添加边属性定义: relationship")

        # Attribute values: rename "relation" -> "label" and add a sibling
        # "relationship" value under the same parent.
        modified_count = 0
        for attvalue in root.findall('.//' + q('attvalue') + '[@for="relation"]'):
            # A missing value is copied as an empty string so the new
            # element can still be serialized.
            relation_value = attvalue.get('value', '')
            attvalue.set('for', 'label')
            modified_count += 1

            edge_attvalues = parent_of.get(attvalue)
            if edge_attvalues is not None:
                existing_relationship_val = edge_attvalues.find(
                    './/' + q('attvalue') + '[@for="relationship"]')
                if existing_relationship_val is None:
                    relationship_val = ET.SubElement(edge_attvalues, q('attvalue'))
                    relationship_val.set('for', 'relationship')
                    relationship_val.set('value', relation_value)

        # Mirror the first non-empty "label" value onto each edge's native
        # label XML attribute.
        for edge in root.findall('.//' + q('edge')):
            for attvalue in edge.findall('.//' + q('attvalue') + '[@for="label"]'):
                relation_value = attvalue.get('value')
                if relation_value:
                    edge.set('label', relation_value)
                    break

        # Pretty-print through minidom, drop blank lines, and save.
        rough_string = ET.tostring(root, encoding='unicode')
        reparsed = minidom.parseString(rough_string)
        pretty_xml = reparsed.toprettyxml(indent=" ", encoding=None)
        pretty_lines = [line for line in pretty_xml.split('\n') if line.strip()]
        pretty_xml = '\n'.join(pretty_lines)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(pretty_xml)

        print(f"GEXF文件修复完成: {output_file}")
        print(f"修改了 {modified_count} 个边属性值")
        print("已同时添加label和relationship属性,值完全一致")
    except Exception as e:
        # Best-effort tool: report the failure rather than propagate it.
        print(f"修复GEXF文件时出错: {e}")
# Usage example / smoke test
def test_conversion():
    """Run a sample txt -> GEXF conversion and print whether it succeeded."""
    source_path, target_path = 'knowledge_graph_export.txt', 'knowledge_graph.gexf'
    try:
        # Convert the exported text file using the default title and creator.
        create_gexf_from_txt(source_path, target_path)
        print(f"转换成功: {source_path} -> {target_path}")
    except Exception as err:
        print(f"转换失败: {err}")
# Script entry point: run the sample conversion only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    test_conversion()