# GeoLLM / revision / word_cloud.py
# Author: Pengfa Li
# Uploaded with huggingface_hub (commit badcf3c, verified)
# Reads the 'text' column of ../data/GT_500.json and builds word-cloud figures.
import json
import jieba
import matplotlib.pyplot as plt
from collections import Counter
import re
import numpy as np
import random
from PIL import Image, ImageDraw, ImageFont
import os
def clean_text(text):
    """Strip non-Chinese characters from *text*.

    Removes ASCII letters, digits and whitespace, then a fixed set of
    Chinese/ASCII punctuation marks, leaving only the CJK content that
    jieba should segment.
    """
    latin_and_digits = re.compile(r'[a-zA-Z0-9\s]+')
    punctuation = re.compile(r'[,。!?;:""''()【】《》\-\+\=\*\/\\\|\&\%\$\#\@\^\~\`]')
    return punctuation.sub('', latin_and_digits.sub('', text))
def get_font_path():
    """Return the first available Chinese TrueType font path, or None.

    Probes common Windows font locations in preference order; on systems
    where none exists (e.g. Linux) the caller falls back to PIL's
    default bitmap font.
    """
    candidates = (
        'C:/Windows/Fonts/msyh.ttf',    # Microsoft YaHei
        'C:/Windows/Fonts/simsun.ttf',  # SimSun
        'C:/Windows/Fonts/simhei.ttf',  # SimHei
    )
    return next((p for p in candidates if os.path.exists(p)), None)
def get_english_font_path():
    """Return the first available English TrueType font path, or None.

    Probes common Windows font locations in preference order; returns
    None when none exists so callers can fall back to PIL's default font.
    """
    candidates = (
        'C:/Windows/Fonts/arial.ttf',    # Arial
        'C:/Windows/Fonts/calibri.ttf',  # Calibri
        'C:/Windows/Fonts/times.ttf',    # Times New Roman
        'C:/Windows/Fonts/verdana.ttf',  # Verdana
    )
    return next((p for p in candidates if os.path.exists(p)), None)
def translate_geological_terms():
    """Return a Chinese-to-English glossary of geological terms.

    Keys are Chinese tokens as produced by jieba segmentation of the
    source texts; values are the English renderings used when building
    the English word cloud.  Inline notes record why a translation was
    preferred over a more literal alternative.

    NOTE(review): several distinct keys map to the same English value
    (e.g. '二叠'/'二叠纪' -> 'Permian'); callers merging frequencies by
    English word must accumulate, not overwrite.
    """
    return {
        '砂岩': 'Sandstone',
        '灰色': 'Gray',
        '层状': 'Layered',  # Revised: 'Layered' is more common in geology than 'Stratified'
        '灰岩': 'Limestone',
        '地层': 'Formation',  # Revised: 'Formation' is more accurate than 'Stratum'
        '板岩': 'Slate',
        '长石': 'Feldspar',
        '岩石': 'Rock',
        '分布': 'Distribution',
        '细粒': 'Fine-grained',
        '构造': 'Structure',
        '岩屑': 'Rock fragment',  # Revised: more accurate term
        '碎屑岩': 'Clastic rock',
        '化石': 'Fossil',
        '灰绿色': 'Grayish-green',  # Revised: more accurate colour term
        '整合': 'Conformity',  # Revised: 整合 = Conformity; 不整合 = Unconformity
        '石英砂': 'Quartz sand',
        '火山岩': 'Volcanic rock',
        '深灰色': 'Dark gray',
        '薄层': 'Thin-bedded',
        '三叠': 'Triassic',
        '接触': 'Contact',
        '青海省': 'Qinghai Province',
        '厚度': 'Thickness',
        '粉砂': 'Silt',
        '混杂': 'Melange',
        '一带': 'Belt',
        '组成': 'Composition',
        '层理': 'Bedding',
        '时代': 'Age',
        '二叠': 'Permian',
        '凝灰岩': 'Tuff',
        '断层': 'Fault',
        '岩性': 'Lithology',
        '剖面': 'Section',  # Revised: geological cross-sections are 'Sections'
        '巴尕日': 'Bagari',
        '少量': 'Minor',  # Revised: 'Minor' is the standard descriptive term
        '多彩': 'Variegated',  # Revised: geological colour description
        '保组': 'Bao Formation',
        '区内': 'Within the area',  # Revised: more idiomatic phrasing
        '测区': 'Survey area',
        '巴塘': 'Batang',
        '结构': 'Texture',
        '火山': 'Volcanic',  # Revised: adjectival form is more accurate here
        '碳酸盐岩': 'Carbonate rock',
        '组合': 'Assemblage',
        '上部': 'Upper',  # Revised: simplified stratigraphic term
        '钙质': 'Calcareous',
        '熔岩': 'Lava',
        '生物': 'Biogenic',  # Revised: 'Biogenic' fits the geological context
        '玄武岩': 'Basalt',
        '结扎': 'Jiezha',
        '腕足': 'Brachiopod',
        '发育': 'Well-developed',  # Revised: standard sense of 发育 in geology
        '九十': 'Ninety',
        '理化': 'Physicochemical',  # Revised: standard chemistry spelling
        '东经': 'E longitude',  # Revised: standard geographic-coordinate form
        '北纬': 'N latitude',  # Revised: standard geographic-coordinate form
        '班组': 'Member',  # Revised: 'Member' is the stratigraphic unit
        '中厚': 'Medium-bedded',  # Revised: bed-thickness terminology
        '为主': 'Dominated',  # Revised: simplified phrasing
        '通天河': 'Tongtian River',
        '一套': 'Suite',  # Revised: a geological suite of rocks
        '安山岩': 'Andesite',
        '诺日': 'Nuori',
        '砾岩': 'Conglomerate',
        '碎屑': 'Clast',  # Revised: a single fragment is a 'Clast'
        '紫色': 'Purple',
        '开心': 'Kaixin',
        '杂多群': 'Zaduo Group',
        '角度': 'Angle',
        '高程': 'Elevation',
        '地质': 'Geological',
        '控制': 'Control',
        '出露': 'Outcrop',
        '下部': 'Lower',  # Revised: simplified stratigraphic term
        '杂多县': 'Zaduo County',
        '双壳': 'Bivalve',
        '治多县': 'Zhidoi County',
        '片麻岩': 'Gneiss',
        '珊瑚': 'Coral',
        '中粒': 'Medium-grained',
        '局部': 'Locally',  # Revised: adverbial form is more accurate
        '本次': 'Present',  # Revised: academic-writing usage
        '斜长石': 'Plagioclase',
        '石英': 'Quartz',
        '石炭世': 'Carboniferous',
        '划分': 'Subdivision',  # Revised: stratigraphic-subdivision term
        '其上': 'Overlying',
        '巴颜喀拉山': 'Bayan Har Mountains',  # Revised: standard place name
        '沉积': 'Sedimentary',
        '面积': 'Area',
        '其中': 'Including',  # Revised: more concise phrasing
        '工作': 'Study',  # Revised: 'Study' in an academic context
        '二叠纪': 'Permian',  # Revised: simplified form
        '发现': 'Occurrence',  # Revised: geological finds are 'Occurrences'
        '建立': 'Established',  # Revised: past-participle form
        '三叠纪': 'Triassic',  # Revised: simplified form
        '夹有': 'Intercalated with',  # Revised: stratigraphic term
        '反映': 'Indicating'  # Revised: academic-writing usage
    }
def create_english_wordcloud_image(word_freq, width=1800, height=1200, max_words=100):
    """Draw an English word cloud on a new PIL image.

    Places up to ``max_words`` of the most frequent words on a white
    canvas, scaling font size linearly with frequency and searching
    (spiral, then grid, then bounded random) for a non-overlapping
    position.  Words that still overlap after 2000 attempts are
    silently skipped.

    Args:
        word_freq: collections.Counter mapping word -> frequency.
        width: canvas width in pixels.
        height: canvas height in pixels.
        max_words: maximum number of words to attempt to place.

    Returns:
        PIL.Image.Image: the rendered canvas.
    """
    # White background canvas.
    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)
    # English TrueType font path; may be None outside Windows.
    font_path = get_english_font_path()
    # Candidate words (most frequent first) and a palette cycled by rank.
    top_words = word_freq.most_common(max_words)
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD',
              '#98D8C8', '#F7DC6F', '#BB8FCE', '#85C1E9', '#F8C471', '#82E0AA',
              '#FF8C94', '#FFB3BA', '#BFEFFF', '#FFD700', '#98FB98', '#DDA0DD']
    if not top_words:
        return img
    max_freq = top_words[0][1]
    min_freq = top_words[-1][1] if len(top_words) > 1 else max_freq
    # Fixed seeds so the layout is reproducible from run to run.
    random.seed(42)
    np.random.seed(42)
    placed_boxes = []
    center_x, center_y = width // 2, height // 2
    for i, (word, freq) in enumerate(top_words):
        # Font size scales linearly with frequency, range 24..104 px.
        if max_freq != min_freq:
            font_size = int(24 + (freq - min_freq) / (max_freq - min_freq) * 80)
        else:
            font_size = 54
        # Colour chosen by rank, cycling through the palette.
        color = colors[i % len(colors)]
        # Build the font object; fall back to PIL's default bitmap font.
        try:
            if font_path:
                font = ImageFont.truetype(font_path, font_size)
            else:
                font = ImageFont.load_default()
        except:
            font = ImageFont.load_default()
        # Measure the rendered text.
        bbox = draw.textbbox((0, 0), word, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]
        # Compact placement: spiral -> dense grid -> bounded random search.
        placed = False
        max_attempts = 2000
        attempt = 0
        while not placed and attempt < max_attempts:
            if attempt < 300:
                # Tight spiral out from the canvas centre.
                angle = attempt * 0.15
                radius = 5 + attempt * 1.2
                x = int(center_x + radius * np.cos(angle) - text_width // 2)
                y = int(center_y + radius * np.sin(angle) - text_height // 2)
            elif attempt < 600:
                # Dense grid search around the centre.
                grid_step = 8
                grid_offset = attempt - 300
                grid_x = (grid_offset % 40) - 20
                grid_y = (grid_offset // 40) - 15
                x = int(center_x + grid_x * grid_step - text_width // 2)
                y = int(center_y + grid_y * grid_step - text_height // 2)
            else:
                # Random search within a slowly growing bounded radius.
                search_radius = min(300, 50 + attempt)
                angle = random.uniform(0, 2 * np.pi)
                radius = random.uniform(0, search_radius)
                x = int(center_x + radius * np.cos(angle) - text_width // 2)
                y = int(center_y + radius * np.sin(angle) - text_height // 2)
            # Keep a 15 px margin from every canvas edge.
            if x < 15 or y < 15 or x + text_width > width - 15 or y + text_height > height - 15:
                attempt += 1
                continue
            # Reject positions overlapping any already-placed word
            # (extra margin grows with the font size).
            margin = max(2, font_size // 20)
            current_box = (x - margin, y - margin, x + text_width + margin, y + text_height + margin)
            overlap = False
            for placed_box in placed_boxes:
                if (current_box[0] < placed_box[2] and current_box[2] > placed_box[0] and
                        current_box[1] < placed_box[3] and current_box[3] > placed_box[1]):
                    overlap = True
                    break
            if not overlap:
                draw.text((x, y), word, font=font, fill=color)
                placed_boxes.append(current_box)
                placed = True
            attempt += 1
    return img
def create_english_traditional_wordcloud(word_freq, max_words=100):
    """Render, display and save the English word cloud.

    Saves a matplotlib rendering ('gt500_wordcloud_english_traditional.png')
    and the raw PIL canvas ('gt500_wordcloud_english_traditional_pil.png').
    """
    cloud = create_english_wordcloud_image(word_freq, max_words=max_words)
    # Show the PIL canvas through matplotlib so a title can be attached.
    plt.figure(figsize=(20, 14))
    plt.imshow(np.array(cloud))
    plt.axis('off')
    plt.title('GT_500 Dataset Text Content Word Cloud (English)', fontsize=24, fontweight='bold')
    plt.tight_layout()
    save_opts = dict(dpi=400, bbox_inches='tight', facecolor='white', edgecolor='none')
    plt.savefig('gt500_wordcloud_english_traditional.png', **save_opts)
    cloud.save('gt500_wordcloud_english_traditional_pil.png', 'PNG', quality=95)
    plt.show()
def create_wordcloud_image(word_freq, width=1800, height=1200, max_words=100):
    """Draw a Chinese word cloud on a new PIL image with an overlap-free layout.

    Places up to ``max_words`` of the most frequent words on a white
    canvas, scaling font size linearly with frequency and searching
    (spiral, then grid, then bounded random) for a non-overlapping
    position.  Words that still overlap after 2000 attempts are
    silently skipped.

    Args:
        word_freq: collections.Counter mapping word -> frequency.
        width: canvas width in pixels.
        height: canvas height in pixels.
        max_words: maximum number of words to attempt to place.

    Returns:
        PIL.Image.Image: the rendered canvas.
    """
    # White background canvas.
    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)
    # Chinese TrueType font path; may be None outside Windows.
    font_path = get_font_path()
    # Candidate words (most frequent first) and a palette cycled by rank.
    top_words = word_freq.most_common(max_words)
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD',
              '#98D8C8', '#F7DC6F', '#BB8FCE', '#85C1E9', '#F8C471', '#82E0AA',
              '#FF8C94', '#FFB3BA', '#BFEFFF', '#FFD700', '#98FB98', '#DDA0DD']
    if not top_words:
        return img
    max_freq = top_words[0][1]
    min_freq = top_words[-1][1] if len(top_words) > 1 else max_freq
    # Fixed seeds so the layout is reproducible from run to run.
    random.seed(42)
    np.random.seed(42)
    placed_boxes = []  # bounding boxes of words already drawn
    center_x, center_y = width // 2, height // 2
    for i, (word, freq) in enumerate(top_words):
        # Font size scales linearly with frequency, range 24..104 px.
        if max_freq != min_freq:
            font_size = int(24 + (freq - min_freq) / (max_freq - min_freq) * 80)
        else:
            font_size = 54
        # Colour chosen by rank, cycling through the palette.
        color = colors[i % len(colors)]
        # Build the font object; fall back to PIL's default bitmap font.
        try:
            if font_path:
                font = ImageFont.truetype(font_path, font_size)
            else:
                font = ImageFont.load_default()
        except:
            font = ImageFont.load_default()
        # Measure the rendered text.
        bbox = draw.textbbox((0, 0), word, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]
        # Compact placement: spiral -> dense grid -> bounded random search.
        placed = False
        max_attempts = 2000
        attempt = 0
        while not placed and attempt < max_attempts:
            if attempt < 300:
                # Tight spiral out from the canvas centre (small angle and
                # radius steps keep early words close together).
                angle = attempt * 0.15
                radius = 5 + attempt * 1.2
                x = int(center_x + radius * np.cos(angle) - text_width // 2)
                y = int(center_y + radius * np.sin(angle) - text_height // 2)
            elif attempt < 600:
                # Dense grid search around the centre.
                grid_step = 8
                grid_offset = attempt - 300
                grid_x = (grid_offset % 40) - 20
                grid_y = (grid_offset // 40) - 15
                x = int(center_x + grid_x * grid_step - text_width // 2)
                y = int(center_y + grid_y * grid_step - text_height // 2)
            else:
                # Final stage: random search within a bounded radius.
                search_radius = min(300, 50 + attempt)
                angle = random.uniform(0, 2 * np.pi)
                radius = random.uniform(0, search_radius)
                x = int(center_x + radius * np.cos(angle) - text_width // 2)
                y = int(center_y + radius * np.sin(angle) - text_height // 2)
            # Keep a 15 px margin from every canvas edge.
            if x < 15 or y < 15 or x + text_width > width - 15 or y + text_height > height - 15:
                attempt += 1
                continue
            # Reject positions overlapping any already-placed word
            # (extra margin grows with the font size).
            margin = max(2, font_size // 20)
            current_box = (x - margin, y - margin, x + text_width + margin, y + text_height + margin)
            overlap = False
            for placed_box in placed_boxes:
                if (current_box[0] < placed_box[2] and current_box[2] > placed_box[0] and
                        current_box[1] < placed_box[3] and current_box[3] > placed_box[1]):
                    overlap = True
                    break
            if not overlap:
                # No rotation: keeps the layout compact and readable.
                draw.text((x, y), word, font=font, fill=color)
                placed_boxes.append(current_box)
                placed = True
            attempt += 1
    return img
def create_traditional_wordcloud(word_freq, max_words=100):
    """Render, display and save the Chinese word cloud.

    Saves a matplotlib rendering ('gt500_wordcloud.png') and the raw
    PIL canvas ('gt500_wordcloud_pil.png').
    """
    cloud = create_wordcloud_image(word_freq, max_words=max_words)
    # Show the PIL canvas through matplotlib so a title can be attached.
    plt.figure(figsize=(20, 14))
    plt.imshow(np.array(cloud))
    plt.axis('off')
    plt.title('GT_500数据集文本内容词云图', fontsize=24, fontweight='bold',
              fontfamily='SimHei')
    plt.tight_layout()
    save_opts = dict(dpi=1000, bbox_inches='tight', facecolor='white', edgecolor='none')
    plt.savefig('gt500_wordcloud.png', **save_opts)
    cloud.save('gt500_wordcloud_pil.png', 'PNG', quality=95)
    plt.show()
def create_bar_chart(word_freq, top_n=100):
    """Plot a horizontal bar chart of the top-N word frequencies.

    Args:
        word_freq: collections.Counter mapping word -> frequency.
        top_n: number of most frequent words to plot.

    Side effects: saves 'gt500_word_frequency_bar.png' and shows the figure.
    """
    top_words = word_freq.most_common(top_n)
    # Robustness fix: an empty Counter previously crashed on tuple
    # unpacking of zip(*[]); bail out early instead.
    if not top_words:
        return
    words, freqs = zip(*top_words)
    # Configure matplotlib for Chinese glyphs (fonts tried left to right).
    plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(12, 8))
    bars = plt.barh(range(len(words)), freqs, color='skyblue')
    plt.yticks(range(len(words)), words)
    plt.xlabel('词频', fontsize=12)
    plt.title(f'GT_500数据集词频统计(前{top_n}位)', fontsize=16, fontweight='bold')
    # Annotate each bar with its exact count.  Idiom fix: the old
    # enumerate index was never used, so iterate the pairs directly.
    for bar, freq in zip(bars, freqs):
        plt.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2,
                 str(freq), ha='left', va='center')
    plt.tight_layout()
    # NOTE(review): dpi=1000 on a 12x8 inch figure yields a ~12000x8000 px
    # file; consider lowering if output size becomes a problem.
    plt.savefig('gt500_word_frequency_bar.png', dpi=1000, bbox_inches='tight')
    plt.show()
def read_gt500_and_create_wordcloud():
    """Read the 'text' column of ../data/GT_500.json and generate word clouds.

    Pipeline: load JSON -> clean each record's text -> jieba segmentation ->
    stop-word filtering -> frequency counting -> Chinese word cloud, English
    word cloud (via the glossary), bar chart, and JSON frequency dumps.
    Prints progress and diagnostics (in Chinese) along the way.
    """
    try:
        # Load the dataset; expected to be a list of dict records.
        with open('../data/GT_500.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print("错误:找不到 ../data/GT_500.json 文件")
        return
    except json.JSONDecodeError:
        print("错误:JSON文件格式错误")
        return
    # Concatenate the cleaned text of every record.
    all_text = ""
    text_count = 0
    for item in data:
        if isinstance(item, dict):
            # Be tolerant of different capitalizations / key names.
            text_content = None
            for key in ['text', 'Text', 'TEXT', 'content', 'Content']:
                if key in item and item[key]:
                    text_content = item[key]
                    break
            if text_content:
                cleaned_text = clean_text(str(text_content))
                if cleaned_text.strip():
                    all_text += cleaned_text + " "
                    text_count += 1
    print(f"总共处理了 {len(data)} 条记录")
    print(f"其中 {text_count} 条记录包含有效文本内容")
    print(f"文本总长度: {len(all_text)} 字符")
    if not all_text.strip():
        print("错误:没有找到有效的文本内容")
        # Dump a sample of the data structure to help diagnose the problem.
        if data:
            print("数据结构示例:")
            print(json.dumps(data[0] if isinstance(data, list) else data,
                             ensure_ascii=False, indent=2)[:500] + "...")
        return
    # Chinese word segmentation.
    print("正在进行中文分词...")
    words = jieba.lcut(all_text)
    # Drop single-character tokens and these common stop words.
    stop_words = {'的', '了', '和', '是', '在', '有', '与', '对', '为', '等', '及', '或', '但', '而',
                  '这', '那', '此', '其', '就', '都', '也', '不', '之', '中', '上', '下', '以', '可',
                  '如', '被', '将', '由', '会', '能', '要', '很', '较', '比', '更', '最', '非常',
                  '地区', '区域', '部分', '方面', '主要', '具有', '包括', '通过', '进行', '发生',
                  '形成', '产生', '出现', '存在', '导致', '造成', '影响', '作用', '特点', '特征',
                  '相关', '一般', '通常', '可能', '由于', '因此', '所以', '然后', '同时', '另外'}
    filtered_words = [word for word in words if len(word) >= 2 and word not in stop_words and word.strip()]
    # Word frequency statistics.
    word_freq = Counter(filtered_words)
    print(f"分词后得到 {len(filtered_words)} 个有效词语")
    print(f"去重后有 {len(word_freq)} 个不同的词语")
    if not word_freq:
        print("错误:没有有效的词语用于生成词云")
        return
    print("\n词频前100:")
    for word, freq in word_freq.most_common(100):
        print(f"{word}: {freq}")
    # Chinese word cloud.
    print("\n正在创建传统词云图...")
    create_traditional_wordcloud(word_freq, max_words=100)
    # English word cloud via the glossary; untranslated words pass through.
    print("正在翻译为英文并创建英文词云图...")
    translation_dict = translate_geological_terms()
    english_word_freq = Counter()
    for chinese_word, freq in word_freq.most_common(100):
        # BUG FIX: several Chinese terms share one English translation
        # (e.g. '二叠'/'二叠纪' -> 'Permian'); accumulate with += instead
        # of assignment so merged terms keep their combined frequency.
        english_word = translation_dict.get(chinese_word, chinese_word)
        english_word_freq[english_word] += freq
    create_english_traditional_wordcloud(english_word_freq, max_words=100)
    # Frequency bar chart.
    print("正在创建词频条形图...")
    create_bar_chart(word_freq, top_n=100)
    # Persist the frequency tables.
    with open('word_frequency.json', 'w', encoding='utf-8') as f:
        json.dump(dict(word_freq.most_common(100)), f, ensure_ascii=False, indent=2)
    with open('word_frequency_english.json', 'w', encoding='utf-8') as f:
        json.dump(dict(english_word_freq.most_common(100)), f, ensure_ascii=False, indent=2)
    print("\n完成!")
    print("- 传统词云图已保存为 'gt500_wordcloud.png' 和 'gt500_wordcloud_pil.png'")
    # BUG FIX: removed the stale message claiming 'gt500_wordcloud_spiral.png'
    # was saved — no spiral word cloud is generated by this script.
    print("- 英文传统词云图已保存为 'gt500_wordcloud_english_traditional.png' 和 'gt500_wordcloud_english_traditional_pil.png'")
    print("- 词频条形图已保存为 'gt500_word_frequency_bar.png'")
    print("- 中文词频统计已保存为 'word_frequency.json'")
    print("- 英文词频统计已保存为 'word_frequency_english.json'")
# Entry point: run the full pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    read_gt500_and_create_wordcloud()