"""Convert a text-exported knowledge graph into GEXF files that Gephi can open."""

import json
import math
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from xml.dom import minidom

# GEXF 1.2 namespaces; _NS is the Clark-notation prefix ElementTree uses in tags.
_GEXF_NS = 'http://www.gexf.net/1.2draft'
_GEXF_VIZ_NS = 'http://www.gexf.net/1.2draft/viz'
_NS = '{' + _GEXF_NS + '}'

# Section and line patterns of the txt export, compiled once at import time.
_NODE_SECTION_RE = re.compile(
    r'【节点信息】.*?格式: 节点名称.*?\n-{40}\n(.*?)\n={80}', re.DOTALL)
_EDGE_SECTION_RE = re.compile(
    r'【边关系信息】.*?格式: 节点1.*?\n-{60}\n(.*?)\n={80}', re.DOTALL)
_NODE_LINE_RE = re.compile(
    r'\s*(\d+)\.\s+(.*?)\s+\|\s+度数:\s+(\d+)\s+\|\s+重要性:\s+(\S+)')
_EDGE_LINE_RE = re.compile(
    r'\s*(\d+)\.\s+(.*?)\s+<--\[(.*?)\]-->\s+(.*?)\s+\|\s+重要性分数:\s+(\d+)')

# Node colours used by create_gexf_from_txt, keyed by importance level.
_BASIC_NODE_COLORS = {'高': (46, 139, 87), '中': (144, 238, 144)}
_BASIC_NODE_DEFAULT_COLOR = (224, 255, 224)

# Node colours used by create_enhanced_gexf_from_data: importance -> category -> RGB.
_ENHANCED_NODE_COLORS = {
    '高': {"地层单位": (34, 139, 34), "岩石类型": (139, 69, 19), "构造单元": (220, 20, 60)},
    '中': {"地层单位": (124, 252, 0), "岩石类型": (210, 180, 140), "构造单元": (255, 105, 180)},
    '低': {"地层单位": (240, 255, 240), "岩石类型": (245, 222, 179), "构造单元": (255, 228, 225)},
}
_ENHANCED_NODE_DEFAULT_COLORS = {
    '高': (46, 139, 87),
    '中': (144, 238, 144),
    '低': (224, 255, 224),
}

# Category keywords, checked in order. Structural keywords come first because
# "断层" contains "层" and would otherwise always be classified as "地层单位".
_CATEGORY_KEYWORDS = (
    ("构造单元", ("构造", "断层", "褶皱")),
    ("地层单位", ("组", "段", "层")),
    ("岩石类型", ("岩", "石")),
    ("矿物组分", ("矿物", "成分")),
)

# Exact relation text -> relation type.
_RELATION_TYPES = {
    "位于": "空间关系", "分布于": "空间关系", "出露于": "空间关系",
    "含有": "组成关系", "包含": "组成关系", "夹有": "组成关系",
    "厚度": "数量关系", "长度": "数量关系", "宽度": "数量关系",
    "整合": "接触关系", "不整合": "接触关系", "断层接触": "接触关系",
    "发育": "特征关系", "具有": "特征关系", "呈现": "特征关系",
    "属于": "分类关系", "归属": "分类关系",
}

# Relation type -> edge colour; anything else is grey.
_EDGE_COLORS = {
    "空间关系": (0, 0, 255),
    "组成关系": (255, 165, 0),
    "数量关系": (128, 0, 128),
    "接触关系": (255, 0, 0),
    "特征关系": (0, 128, 0),
    "分类关系": (255, 20, 147),
}
_EDGE_DEFAULT_COLOR = (128, 128, 128)


def _scaled(value, low, high, base, span, fallback):
    """Map value from [low, high] onto [base, base + span]; fallback if high <= low."""
    if high > low:
        return base + span * (value - low) / (high - low)
    return fallback


def _new_gexf_root():
    """Create the <gexf> root element carrying the GEXF and viz namespace declarations."""
    gexf = ET.Element('gexf')
    # The namespaces are written as plain attributes so that the un-namespaced
    # tags below ('node', 'viz:size', ...) serialize exactly as typed.
    gexf.set('xmlns', _GEXF_NS)
    gexf.set('xmlns:viz', _GEXF_VIZ_NS)
    gexf.set('version', '1.2')
    return gexf


def _declare_attributes(graph, attr_class, definitions):
    """Add an <attributes class=...> block with one <attribute> per (id, title, type)."""
    attributes = ET.SubElement(graph, 'attributes')
    attributes.set('class', attr_class)
    for attr_id, title, attr_type in definitions:
        attribute = ET.SubElement(attributes, 'attribute')
        attribute.set('id', attr_id)
        attribute.set('title', title)
        attribute.set('type', attr_type)


def _add_attvalues(parent, pairs):
    """Add an <attvalues> child to parent with one <attvalue> per (for, value) pair."""
    attvalues = ET.SubElement(parent, 'attvalues')
    for attr_id, value in pairs:
        attvalue = ET.SubElement(attvalues, 'attvalue')
        attvalue.set('for', attr_id)
        attvalue.set('value', value)


def _add_color(parent, rgb, alpha):
    """Add a <viz:color> child to parent with the given RGB triple and alpha string."""
    red, green, blue = rgb
    color = ET.SubElement(parent, 'viz:color')
    color.set('r', str(red))
    color.set('g', str(green))
    color.set('b', str(blue))
    color.set('a', alpha)


def _infer_category(label):
    """Return the category whose keyword appears in label, or "地质实体" if none does."""
    for category, keywords in _CATEGORY_KEYWORDS:
        if any(keyword in label for keyword in keywords):
            return category
    return "地质实体"


def _write_pretty_xml(root, output_file):
    """Serialize root, pretty-print it, drop blank lines and write it as UTF-8."""
    rough_string = ET.tostring(root, encoding='unicode')
    pretty_xml = minidom.parseString(rough_string).toprettyxml(indent=" ", encoding=None)
    pretty_lines = [line for line in pretty_xml.split('\n') if line.strip()]
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(pretty_lines))


def parse_knowledge_graph_txt(txt_file):
    """
    Parse a knowledge-graph txt export into node and edge tuples.

    Parameters:
    - txt_file: path of the txt file (read as UTF-8)

    Returns:
    - nodes: list of (id, label, degree, importance); id is the 1-based
      number in the file minus one
    - edges: list of (source, target, relation, weight) where source/target
      are node ids; edges naming a label that is not a parsed node are dropped
    """
    with open(txt_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Node lines look like: " 1. name | 度数: X | 重要性: level"
    nodes = []
    node_section = _NODE_SECTION_RE.search(content)
    if node_section:
        for line in node_section.group(1).strip().split('\n'):
            match = _NODE_LINE_RE.match(line)
            if match:
                nodes.append((
                    int(match.group(1)) - 1,
                    match.group(2).strip(),
                    int(match.group(3)),
                    match.group(4),
                ))

    # One dict lookup per endpoint instead of scanning every node per edge.
    # With duplicate labels the last node wins, as the full scan did.
    label_to_id = {label: node_id for node_id, label, _, _ in nodes}

    # Edge lines look like: " 1. node1 <--[relation]--> node2 | 重要性分数: X"
    edges = []
    edge_section = _EDGE_SECTION_RE.search(content)
    if edge_section:
        for line in edge_section.group(1).strip().split('\n'):
            match = _EDGE_LINE_RE.match(line)
            if not match:
                continue
            source_id = label_to_id.get(match.group(2).strip())
            target_id = label_to_id.get(match.group(4).strip())
            if source_id is not None and target_id is not None:
                edges.append((
                    source_id,
                    target_id,
                    match.group(3).strip(),
                    int(match.group(5)),
                ))

    return nodes, edges


def create_gexf_from_txt(txt_file, output_file='knowledge_graph.gexf',
                         graph_title="Knowledge Graph",
                         creator="Knowledge Graph Generator"):
    """
    Convert a knowledge-graph txt export into a GEXF file.

    Parameters:
    - txt_file: input txt file path
    - output_file: output gexf file path
    - graph_title: title used in the GEXF description
    - creator: creator string stored in the GEXF meta block

    Returns:
    - output_file, or None (after printing a warning) when no nodes were parsed
    """
    nodes, edges = parse_knowledge_graph_txt(txt_file)
    if not nodes:
        print("警告:未找到节点数据")
        return None

    gexf = _new_gexf_root()

    meta = ET.SubElement(gexf, 'meta')
    meta.set('lastmodifieddate', datetime.now().strftime('%Y-%m-%d'))
    ET.SubElement(meta, 'creator').text = creator
    ET.SubElement(meta, 'description').text = (
        f"{graph_title} - Generated from knowledge graph data")

    graph = ET.SubElement(gexf, 'graph')
    graph.set('defaultedgetype', 'undirected')
    graph.set('mode', 'static')

    _declare_attributes(graph, 'node', [
        ('degree', 'Degree', 'integer'),
        ('importance', 'Importance', 'string'),
    ])
    # Edges carry the relation twice, as 'label' and as 'relationship'.
    _declare_attributes(graph, 'edge', [
        ('label', 'Label', 'string'),
        ('relationship', 'Relationship', 'string'),
    ])

    nodes_elem = ET.SubElement(graph, 'nodes')
    nodes_elem.set('count', str(len(nodes)))

    degrees = [degree for _, _, degree, _ in nodes]
    min_degree, max_degree = min(degrees), max(degrees)

    for node_id, label, degree, importance in nodes:
        node = ET.SubElement(nodes_elem, 'node')
        node.set('id', str(node_id))
        node.set('label', label)
        _add_attvalues(node, [('degree', str(degree)), ('importance', importance)])

        # Node size grows linearly with degree, from 5 to 30.
        size = _scaled(degree, min_degree, max_degree, 5, 25, 15)
        ET.SubElement(node, 'viz:size').set('value', f'{size:.1f}')

        # Darker green means higher importance.
        rgb = _BASIC_NODE_COLORS.get(importance, _BASIC_NODE_DEFAULT_COLOR)
        _add_color(node, rgb, '0.8')

    edges_elem = ET.SubElement(graph, 'edges')
    edges_elem.set('count', str(len(edges)))

    # The weight range is the same for every edge, so compute it once.
    weights = [weight for _, _, _, weight in edges]
    min_weight = min(weights) if weights else 1
    max_weight = max(weights) if weights else 1

    for edge_id, (source_id, target_id, relation, weight) in enumerate(edges):
        edge = ET.SubElement(edges_elem, 'edge')
        edge.set('id', str(edge_id))
        edge.set('source', str(source_id))
        edge.set('target', str(target_id))
        edge.set('weight', str(weight))
        edge.set('label', relation)
        _add_attvalues(edge, [('label', relation), ('relationship', relation)])

        _add_color(edge, (128, 128, 128), '0.5')

        # Edge thickness grows linearly with weight, from 1 to 5.
        thickness = _scaled(weight, min_weight, max_weight, 1, 4, 2)
        ET.SubElement(edge, 'viz:thickness').set('value', f'{thickness:.1f}')

    _write_pretty_xml(gexf, output_file)

    print(f"GEXF文件已生成: {output_file}")
    print(f"节点数: {len(nodes)}, 边数: {len(edges)}")
    print("边属性: 同时包含label和relationship属性,值完全一致")

    return output_file


def create_enhanced_gexf_from_data(nodes_data, edges_data,
                                   output_file='enhanced_knowledge_graph.gexf',
                                   graph_title="Enhanced Knowledge Graph"):
    """
    Build an enhanced GEXF file directly from node and edge data.

    Compared with create_gexf_from_txt this adds an inferred node category,
    an inferred relation type per edge, colours derived from both, and an
    initial circular layout.

    Parameters:
    - nodes_data: list of (label, degree, importance)
    - edges_data: list of (source_label, target_label, relation, weight)
    - output_file: output file path
    - graph_title: title used in the GEXF description

    Returns:
    - output_file

    Edges whose source or target label is not in nodes_data are skipped; the
    <edges count> attribute and the printed total count only written edges.
    """
    # With duplicate labels the last occurrence wins.
    label_to_id = {label: idx for idx, (label, _, _) in enumerate(nodes_data)}

    gexf = _new_gexf_root()

    meta = ET.SubElement(gexf, 'meta')
    meta.set('lastmodifieddate', datetime.now().strftime('%Y-%m-%d+%H:%M'))
    ET.SubElement(meta, 'creator').text = "Knowledge Graph GEXF Generator v1.0"
    ET.SubElement(meta, 'description').text = f"{graph_title} - 地质知识图谱可视化数据"
    ET.SubElement(meta, 'keywords').text = "知识图谱,地质学,三元组,网络分析"

    graph = ET.SubElement(gexf, 'graph')
    graph.set('defaultedgetype', 'undirected')
    graph.set('mode', 'static')
    graph.set('timeformat', 'date')

    _declare_attributes(graph, 'node', [
        ('0', 'degree', 'integer'),
        ('1', 'importance', 'string'),
        ('2', 'category', 'string'),
    ])
    _declare_attributes(graph, 'edge', [
        ('0', 'label', 'string'),
        ('1', 'relationship', 'string'),
        ('2', 'relation_type', 'string'),
    ])

    nodes_elem = ET.SubElement(graph, 'nodes')
    nodes_elem.set('count', str(len(nodes_data)))

    degrees = [degree for _, degree, _ in nodes_data]
    min_degree = min(degrees) if degrees else 1
    max_degree = max(degrees) if degrees else 1

    for idx, (label, degree, importance) in enumerate(nodes_data):
        node = ET.SubElement(nodes_elem, 'node')
        node.set('id', str(idx))
        node.set('label', label)

        category = _infer_category(label)
        _add_attvalues(node, [
            ('0', str(degree)),
            ('1', importance),
            ('2', category),
        ])

        # Node size grows linearly with degree, from 10 to 50.
        size = _scaled(degree, min_degree, max_degree, 10, 40, 25)
        ET.SubElement(node, 'viz:size').set('value', f'{size:.1f}')

        # Any importance other than 高/中 uses the low-importance palette.
        tier = importance if importance in ('高', '中') else '低'
        rgb = _ENHANCED_NODE_COLORS[tier].get(
            category, _ENHANCED_NODE_DEFAULT_COLORS[tier])
        _add_color(node, rgb, '0.9')

        # Initial circular layout; higher-degree nodes sit further out.
        angle = 2 * math.pi * idx / len(nodes_data)
        radius = 100 + degree * 10
        position = ET.SubElement(node, 'viz:position')
        position.set('x', f'{radius * math.cos(angle):.2f}')
        position.set('y', f'{radius * math.sin(angle):.2f}')
        position.set('z', '0.0')

    # The weight range is taken over all supplied edges.
    weights = [weight for _, _, _, weight in edges_data]
    min_weight = min(weights) if weights else 1
    max_weight = max(weights) if weights else 1

    # Keep only edges whose endpoints both exist. Each edge keeps its
    # position in edges_data as its id, so ids stay unique.
    valid_edges = [
        (edge_id, edge)
        for edge_id, edge in enumerate(edges_data)
        if edge[0] in label_to_id and edge[1] in label_to_id
    ]

    edges_elem = ET.SubElement(graph, 'edges')
    edges_elem.set('count', str(len(valid_edges)))

    for edge_id, (source_label, target_label, relation, weight) in valid_edges:
        edge = ET.SubElement(edges_elem, 'edge')
        edge.set('id', str(edge_id))
        edge.set('source', str(label_to_id[source_label]))
        edge.set('target', str(label_to_id[target_label]))
        edge.set('weight', str(weight))
        edge.set('label', relation)

        relation_type = _RELATION_TYPES.get(relation, "其他")
        _add_attvalues(edge, [
            ('0', relation),
            ('1', relation),
            ('2', relation_type),
        ])

        _add_color(edge, _EDGE_COLORS.get(relation_type, _EDGE_DEFAULT_COLOR), '0.7')

        # Edge thickness grows linearly with weight, from 1 to 6.
        thickness = _scaled(weight, min_weight, max_weight, 1, 5, 2)
        ET.SubElement(edge, 'viz:thickness').set('value', f'{thickness:.1f}')

    _write_pretty_xml(gexf, output_file)

    print(f"增强的GEXF文件已生成: {output_file}")
    print(f"节点数: {len(nodes_data)}, 边数: {len(valid_edges)}")
    print("可视化特性:")
    print("- 节点大小: 基于度数")
    print("- 节点颜色: 基于重要性和类别")
    print("- 边颜色: 基于关系类型")
    print("- 边粗细: 基于权重")
    print("- 边标签: 同时包含label和relationship属性,值完全一致")
    print("- Gephi可直接识别label属性,relationship属性可用于分类分析")

    return output_file


def _relabel_edge_relations(root):
    """
    Rewrite 'relation' edge attributes in a parsed GEXF tree, in place.

    Renames the edge attribute definition 'relation' to 'label', adds a
    'relationship' definition and a 'relationship' attvalue per edge, and
    copies each edge's 'label' attvalue onto the edge's own label attribute.

    Returns the number of attvalues that were renamed.
    """
    for attributes in root.findall('.//' + _NS + 'attributes[@class="edge"]'):
        for attr in attributes.findall('.//' + _NS + 'attribute[@id="relation"]'):
            attr.set('id', 'label')
            attr.set('title', 'Label')
            print("已修改边属性定义: relation -> label")

        existing = attributes.find('.//' + _NS + 'attribute[@id="relationship"]')
        if existing is None:
            # New elements need the GEXF namespace, otherwise they are
            # written outside the document's namespace.
            relationship_attr = ET.SubElement(attributes, _NS + 'attribute')
            relationship_attr.set('id', 'relationship')
            relationship_attr.set('title', 'Relationship')
            relationship_attr.set('type', 'string')
            print("已添加边属性定义: relationship")

    modified_count = 0
    for edge in root.iter(_NS + 'edge'):
        # ElementTree elements have no parent pointer, so walk downwards from
        # each <attvalues> container to keep the parent at hand.
        for attvalues in edge.findall(_NS + 'attvalues'):
            for attvalue in attvalues.findall(_NS + 'attvalue[@for="relation"]'):
                relation_value = attvalue.get('value')
                attvalue.set('for', 'label')
                modified_count += 1

                has_relationship = attvalues.find(
                    _NS + 'attvalue[@for="relationship"]') is not None
                if not has_relationship and relation_value is not None:
                    relationship_val = ET.SubElement(attvalues, _NS + 'attvalue')
                    relationship_val.set('for', 'relationship')
                    relationship_val.set('value', relation_value)

        # Copy the first non-empty 'label' attvalue onto the edge itself.
        for attvalue in edge.findall('.//' + _NS + 'attvalue[@for="label"]'):
            relation_value = attvalue.get('value')
            if relation_value:
                edge.set('label', relation_value)
                break

    return modified_count


def fix_existing_gexf(input_file, output_file=None):
    """
    Repair an existing GEXF file: rename the edge attribute 'relation' to
    'label' and add a matching 'relationship' attribute.

    Parameters:
    - input_file: input GEXF file path
    - output_file: output GEXF file path; if None the input file is overwritten

    Returns:
    - output_file on success, None if the file could not be read, parsed or
      written (the error is printed)
    """
    if output_file is None:
        output_file = input_file

    try:
        root = ET.parse(input_file).getroot()
    except (OSError, ET.ParseError) as e:
        print(f"修复GEXF文件时出错: {e}")
        return None

    # Keep 'xmlns=' and 'viz:' in the output instead of generated ns0:/ns1:
    # prefixes. This registration is global to the process.
    ET.register_namespace('', _GEXF_NS)
    ET.register_namespace('viz', _GEXF_VIZ_NS)

    modified_count = _relabel_edge_relations(root)

    try:
        _write_pretty_xml(root, output_file)
    except OSError as e:
        print(f"修复GEXF文件时出错: {e}")
        return None

    print(f"GEXF文件修复完成: {output_file}")
    print(f"修改了 {modified_count} 个边属性值")
    print("已同时添加label和relationship属性,值完全一致")

    return output_file


# Usage example / smoke test
def test_conversion():
    """Convert 'knowledge_graph_export.txt' to 'knowledge_graph.gexf' and report the result."""
    txt_file = 'knowledge_graph_export.txt'
    gexf_file = 'knowledge_graph.gexf'

    try:
        result = create_gexf_from_txt(txt_file, gexf_file)
    except Exception as e:  # script-level boundary: report instead of crashing
        print(f"转换失败: {e}")
        return

    # create_gexf_from_txt returns None when the input held no nodes.
    if result is not None:
        print(f"转换成功: {txt_file} -> {gexf_file}")


if __name__ == "__main__":
    test_conversion()