GeoLLM / revision /translate_specific_gexf.py
Pengfa Li
Upload folder using huggingface_hub
badcf3c verified
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
def create_specific_translation_dict():
    """Build the Chinese-to-English glossary used for the GEXF translation.

    The entries are the exact node labels, edge labels and attribute values
    that this script translates; lookups elsewhere in the file are exact-match
    only, so compound names appear as their own keys.

    Returns:
        dict: a freshly built mapping from Chinese term to English term.
    """
    translation_dict = {
        # Rock types
        "粉砂岩": "Siltstone",
        "灰岩": "Limestone",
        "板岩": "Slate",
        "砂岩": "Sandstone",
        "火山岩": "Volcanic rock",
        "二云石英片岩": "Two-mica quartz schist",
        "岩石": "Rock",
        "硅质岩": "Chert",
        "黑云斜长片麻岩": "Biotite plagioclase gneiss",
        "长石石英砂岩": "Feldspathic quartz sandstone",
        "长石砂岩": "Feldspathic sandstone",
        "岩屑砂岩": "Lithic sandstone",
        "粉砂质板岩": "Silty slate",
        "泥晶灰岩": "Micritic limestone",
        "碎屑岩": "Clastic rock",
        "碳酸盐岩": "Carbonate rock",
        "俄巴达动灰岩": "Obadong Limestone",
        "安山岩": "Andesite",
        "安山玄武岩": "Andesitic basalt",
        "达龙砂岩": "Dalong Sandstone",
        "英安岩": "Dacite",
        # Stratigraphic units and compound rock names
        "九十道班组": "Jiushidaoban Formation",
        "开心岭群": "Kaixinling Group",
        "结隆组": "Jielong Formation",
        "多彩蛇绿混杂岩": "Duocai Ophiolite Complex",
        "通天河蛇绿混杂岩": "Tongtianhe Ophiolite Complex",
        "波里拉组": "Bolila Formation",
        "诺日巴尕日保组": "Nuoribagaribao Formation",
        "含煤碎屑岩组": "Coal-bearing clastic formation",
        "结扎群": "Jiezha Group",
        "尕笛考组": "Gadikao Formation",
        "甲丕拉组": "Jiapila Formation",
        "巴颜喀拉山群": "Bayankala Group",
        "灰色灰绿色厚层中-细粒岩屑长石砂岩": "Gray to grayish-green thick-bedded medium- to fine-grained lithic arkose",
        "碎屑岩组": "Clastic formation",
        "查涌蛇绿混杂岩": "Chayong Ophiolite Complex",
        "杂多群": "Zaduo Group",
        "测区南部岩石": "Southern survey area rocks",
        "岩石组合": "Rock association",
        "杂多群碎屑岩组": "Zaduo Group clastic formation",
        "巴塘群": "Batang Group",
        "古中元古代宁多岩群": "Paleo-Mesoproterozoic Ningduo Group",
        "格仁火山岩": "Geren Volcanics",
        "晚三叠世巴塘群及结扎群": "Late Triassic Batang Group and Jiezha Group",
        "上段": "Upper member",
        "火山岩组": "Volcanic formation",
        "晚三叠世巴塘群": "Late Triassic Batang Group",
        "西金乌兰-金沙江地层分区": "Xijinwulan-Jinsha River stratigraphic division",
        "该套地层": "This stratigraphic sequence",
        "新近纪曲果组": "Neogene Quguo Formation",
        "相似地层": "Correlative strata",
        "含矽线红柱石黑云母斜长片麻岩": "Sillimanite-andalusite-bearing biotite-plagioclase gneiss",
        "沱沱河组": "Tuotuohe Formation",
        "砂岩组(2-3B)": "Sandstone formation (2-3B)",
        "晚三叠世结扎群": "Late Triassic Jiezha Group",
        "巴贡组": "Bagong Formation",
        "测区结扎群": "Survey area Jiezha Group",
        "宁多群": "Ningduo Group",
        "青海省第二区调队": "Qinghai Province Second Geological Survey Team",
        "三叠纪查涌蛇绿混杂岩": "Triassic Chayong Ophiolite Complex",
        "早中二叠世尕笛考组": "Early-Middle Permian Gadikao Formation",
        "晚二叠世-早三叠世火山岩组": "Late Permian-Early Triassic volcanic formation",
        # 荣 is romanized "rong"; the other names in this table are plain pinyin.
        "当江荣火山岩": "Dangjiangrong Volcanics",
        # Mineral components
        "斑晶": "Phenocryst",
        "斜长石": "Plagioclase",
        "石英": "Quartz",
        "黑云母": "Biotite",
        "黄铁矿晶体": "Pyrite crystals",
        "砾石": "Gravel",
        # Fossil groups
        "双壳类": "Bivalvia",
        "腕足类": "Brachiopoda",
        "头足类": "Cephalopoda",
        # Geographic locations
        "测区": "Study area",
        "青海省": "Qinghai Province",
        # Geological ages
        "晚三叠世": "Late Triassic",
        "早中二叠世": "Early-Middle Permian",
        "三叠纪": "Triassic",
        "早石炭世": "Early Carboniferous",
        "二叠纪": "Permian",
        # Colours and descriptions
        "灰绿色": "Grayish green",
        # Structural features
        "片状构造": "Foliated structure",
        "斑状结构": "Porphyritic texture",
        "水平层理": "Horizontal bedding",
        # Geological relationships (edge labels)
        "岩性": "Lithology",
        "出露于": "Exposed",
        "出露地层": "Outcrop",
        "含有": "Contained",
        "古生物": "Paleontological",
        "属于": "Belongs to",
        "断层接触": "Fault contact",
        "包含": "Includes",
        "所属年代": "Age",
        "发育": "Develop",
        "角度不整合": "Angular unconformity",
        "命名": "Named by",
        "创名": "Established by",
        "结构": "Structure",
        "厚度": "Thickness",
        "分布形态": "Distribution pattern",
        "坐标": "Coordinates",
        # 整合 is stratigraphic conformity, not "integration".
        "整合接触": "Conformable contact",
        "不整合接触": "Unconformity contact",
        "海拔": "Elevation",
        "位于": "Located at",
        "面积": "Exposed area",
        "大地构造位置": "Geotectonic position",
        "地层区划": "Stratigraphical zoning",
        # 假整合 is a disconformity (parallel unconformity), not "consolidated".
        "假整合接触": "Disconformable contact",
        "行政区划": "Administrative area",
        "吞噬": "Engulfed",
        "长度": "Length",
        # Igneous bodies intrude their host rock.
        "侵入": "Intrudes",
        # Importance levels
        "高": "High",
        "中": "Medium",
        "低": "Low",
    }
    return translation_dict
def translate_geological_term_specific(chinese_term, translation_dict):
    """Look up one term in the glossary by exact match.

    Args:
        chinese_term: the label or attribute value to translate.
        translation_dict: mapping from Chinese term to English term.

    Returns:
        The mapped English term, or ``chinese_term`` itself when the glossary
        has no entry for it.
    """
    # Exact-match lookup only; anything unknown is passed through unchanged.
    return translation_dict.get(chinese_term, chinese_term)
def _namespace_of(element):
    """Return the namespace URI embedded in *element*'s tag, or '' if it has none."""
    tag = element.tag
    if tag.startswith('{'):
        return tag[1:].partition('}')[0]
    return ''


def _register_declared_namespaces(xml_path):
    """Register every namespace prefix declared in *xml_path* with ElementTree.

    Without this, ElementTree re-serializes the document with generated
    ``ns0:``/``ns1:`` prefixes instead of the prefixes the file was written with.
    """
    for _event, (prefix, uri) in ET.iterparse(xml_path, events=('start-ns',)):
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # ElementTree reserves prefixes of the form "ns<digits>"; skip those.
            continue


def translate_specific_gexf_file(input_file, output_file=None):
    """Translate the Chinese geological terms in a GEXF file into English.

    Node labels, edge labels, edge attvalues whose ``for`` is ``label`` or
    ``relationship``, and every attvalue whose ``for`` is ``importance`` are
    looked up in the glossary from ``create_specific_translation_dict``. Terms
    without an entry are left as they are and listed in a companion
    ``*_untranslated_terms.txt`` file next to the output.

    The GEXF namespace is read from the root element instead of being assumed,
    and the prefixes declared in the input are registered with ElementTree so
    the output keeps them. That registration is process-wide.

    Args:
        input_file: path of the GEXF file to read.
        output_file: path of the GEXF file to write. When None, ``_english``
            is inserted before the extension of ``input_file``.

    Returns:
        The output path on success, or None if any step failed (the error and
        traceback are printed).
    """
    import os  # function-scope import: this module does not import os at top level

    if output_file is None:
        # Split on the real extension so an input that does not end in ".gexf"
        # can never yield an output path equal to the input path.
        stem, extension = os.path.splitext(input_file)
        output_file = f"{stem}_english{extension}"

    translation_dict = create_specific_translation_dict()
    try:
        _register_declared_namespaces(input_file)
        tree = ET.parse(input_file)
        root = tree.getroot()
        namespace = _namespace_of(root)

        def qualified(local_name):
            """Return *local_name* in ElementTree's ``{uri}name`` form."""
            return f'{{{namespace}}}{local_name}' if namespace else local_name

        attvalue_path = './/' + qualified('attvalue')

        translated_nodes = 0
        translated_edges = 0
        untranslated_terms = set()
        print(f"开始处理GEXF文件: {input_file}")

        # Node labels.
        nodes = root.findall('.//' + qualified('node'))
        print(f"找到 {len(nodes)} 个节点")
        for node in nodes:
            original_label = node.get('label', '')
            if not original_label:
                continue
            translated_label = translate_geological_term_specific(original_label, translation_dict)
            if translated_label != original_label:
                node.set('label', translated_label)
                translated_nodes += 1
                print(f"节点翻译: {original_label} -> {translated_label}")
            else:
                untranslated_terms.add(original_label)

        # Edge labels and edge attribute values.
        edges = root.findall('.//' + qualified('edge'))
        print(f"找到 {len(edges)} 条边")
        for edge in edges:
            # An edge is counted once if its label or any of its attvalues changed.
            edge_changed = False

            original_edge_label = edge.get('label', '')
            if original_edge_label:
                translated_edge_label = translate_geological_term_specific(original_edge_label, translation_dict)
                if translated_edge_label != original_edge_label:
                    edge.set('label', translated_edge_label)
                    edge_changed = True
                    print(f"边标签翻译: {original_edge_label} -> {translated_edge_label}")
                else:
                    untranslated_terms.add(original_edge_label)

            for attvalue in edge.findall(attvalue_path):
                attr_for = attvalue.get('for', '')
                original_value = attvalue.get('value', '')
                if attr_for not in ('label', 'relationship') or not original_value:
                    continue
                translated_value = translate_geological_term_specific(original_value, translation_dict)
                if translated_value != original_value:
                    attvalue.set('value', translated_value)
                    edge_changed = True
                else:
                    untranslated_terms.add(original_value)

            if edge_changed:
                translated_edges += 1

        # Importance attribute values; unknown values are not reported.
        importance_attrs = root.findall(attvalue_path + '[@for="importance"]')
        print(f"找到 {len(importance_attrs)} 个重要性属性")
        for attvalue in importance_attrs:
            original_importance = attvalue.get('value', '')
            if not original_importance:
                continue
            translated_importance = translate_geological_term_specific(original_importance, translation_dict)
            if translated_importance != original_importance:
                attvalue.set('value', translated_importance)
                print(f"重要性翻译: {original_importance} -> {translated_importance}")

        # Replace the description and creator in the meta block, when present.
        meta = root.find('.//' + qualified('meta'))
        if meta is not None:
            description = meta.find('.//' + qualified('description'))
            if description is not None:
                description.text = "Geological Knowledge Graph - English Translation"
            creator = meta.find('.//' + qualified('creator'))
            if creator is not None:
                creator.text = "Geological Knowledge Graph Translator"

        # Pretty-print through minidom, then drop the blank lines it produces
        # when the input was already indented.
        rough_string = ET.tostring(root, encoding='unicode')
        reparsed = minidom.parseString(rough_string)
        pretty_xml = reparsed.toprettyxml(indent=" ", encoding=None)
        pretty_xml = '\n'.join(line for line in pretty_xml.split('\n') if line.strip())
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(pretty_xml)

        # Translation summary.
        print(f"\nGEXF英文翻译完成: {output_file}")
        print(f"翻译的节点数: {translated_nodes}")
        print(f"翻译的边关系数: {translated_edges}")
        print(f"翻译词典包含术语: {len(translation_dict)} 个")
        if untranslated_terms:
            remaining = sorted(untranslated_terms)
            print(f"\n保持原文的术语 ({len(remaining)} 个):")
            for term in remaining:
                print(f" - {term}")
            # Derive the report name from the output's stem so it can never
            # collide with (and overwrite) the translated graph itself.
            untranslated_file = os.path.splitext(output_file)[0] + '_untranslated_terms.txt'
            with open(untranslated_file, 'w', encoding='utf-8') as f:
                f.write("未翻译的地质术语列表:\n")
                f.write("=" * 50 + "\n")
                for term in remaining:
                    f.write(f"{term}\n")
            print(f"未翻译术语已保存到: {untranslated_file}")
        else:
            print("所有术语均已成功翻译!")
        print("\n翻译特点:")
        print("- 专门针对此GEXF文件中的术语进行翻译")
        print("- 保持了地质术语的专业性和准确性")
        print("- 纯英文输出,适合国际学术交流")
        print("- 保留了原有的网络结构和可视化属性")
        return output_file
    except Exception as e:
        # Callers rely on a None return to detect failure, so report the error
        # here instead of letting it propagate.
        print(f"翻译GEXF文件时出错: {e}")
        import traceback
        traceback.print_exc()
        return None
# Script entry point
def main():
    """Translate ``knowledge_graph.gexf`` from the current directory.

    Writes ``knowledge_graph_english.gexf`` next to it. Returns None in every
    case; progress and the outcome are reported on standard output.
    """
    import os

    input_file = 'knowledge_graph.gexf'
    print("开始翻译地质知识图谱GEXF文件...")
    print("=" * 60)

    # Bail out early with a hint if the input is not where we expect it.
    if not os.path.exists(input_file):
        print(f"错误:找不到输入文件 {input_file}")
        print("请确保 knowledge_graph.gexf 文件在当前目录中")
        return

    english_file = translate_specific_gexf_file(input_file, 'knowledge_graph_english.gexf')
    print("\n" + "=" * 60)
    # The translator returns None on failure; only claim success when a file
    # was actually written.
    if english_file:
        print("翻译任务完成!")
        print(f"输出文件: {english_file}")
    else:
        print("翻译任务失败!")
# Run the translation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()