|
|
| import json
|
| import jieba
|
| import matplotlib.pyplot as plt
|
| from collections import Counter
|
| import re
|
| import numpy as np
|
| import random
|
| from PIL import Image, ImageDraw, ImageFont
|
| import os
|
|
|
def clean_text(text):
    """Strip Latin letters, digits, whitespace and common punctuation.

    Reduces mixed-language geological descriptions to bare Chinese text
    before jieba segmentation.

    Args:
        text: raw text string.

    Returns:
        The cleaned string (may be empty).
    """
    # First pass: drop ASCII letters, digits and all whitespace runs.
    without_ascii = re.sub(r'[a-zA-Z0-9\s]+', '', text)
    # Second pass: drop full-width punctuation and common ASCII symbols.
    return re.sub(r'[,。!?;:""''()【】《》\-\+\=\*\/\\\|\&\%\$\#\@\^\~\`]', '', without_ascii)
|
|
|
def get_font_path(candidates=None):
    """Return the first existing CJK-capable font path, or None.

    Args:
        candidates: optional list of font file paths to probe in priority
            order.  Defaults to common Windows Chinese font locations
            (kept for backward compatibility with the original behavior).

    Returns:
        The first path in *candidates* that exists on disk, or None when
        none do (callers then fall back to PIL's built-in default font).
    """
    if candidates is None:
        candidates = [
            'C:/Windows/Fonts/msyh.ttf',      # Microsoft YaHei
            'C:/Windows/Fonts/simsun.ttf',    # SimSun
            'C:/Windows/Fonts/simhei.ttf',    # SimHei
        ]
    # Probe in priority order; stop at the first hit.
    return next((p for p in candidates if os.path.exists(p)), None)
|
|
|
def get_english_font_path(candidates=None):
    """Return the first existing Latin font path, or None.

    Args:
        candidates: optional list of font file paths to probe in priority
            order.  Defaults to common Windows Latin font locations
            (kept for backward compatibility with the original behavior).

    Returns:
        The first path in *candidates* that exists on disk, or None when
        none do (callers then fall back to PIL's built-in default font).
    """
    if candidates is None:
        candidates = [
            'C:/Windows/Fonts/arial.ttf',
            'C:/Windows/Fonts/calibri.ttf',
            'C:/Windows/Fonts/times.ttf',
            'C:/Windows/Fonts/verdana.ttf',
        ]
    # Probe in priority order; stop at the first hit.
    return next((p for p in candidates if os.path.exists(p)), None)
|
|
|
def translate_geological_terms():
    """Return a Chinese -> English dictionary of geological terms.

    Covers lithology, stratigraphy, fossils and local place names used in
    the GT_500 dataset.  Used to relabel the top word frequencies when
    building the English word cloud; words absent from this mapping are
    kept in Chinese by the caller.
    """
    mapping = {
        '砂岩': 'Sandstone',
        '灰色': 'Gray',
        '层状': 'Layered',
        '灰岩': 'Limestone',
        '地层': 'Formation',
        '板岩': 'Slate',
        '长石': 'Feldspar',
        '岩石': 'Rock',
        '分布': 'Distribution',
        '细粒': 'Fine-grained',
        '构造': 'Structure',
        '岩屑': 'Rock fragment',
        '碎屑岩': 'Clastic rock',
        '化石': 'Fossil',
        '灰绿色': 'Grayish-green',
        '整合': 'Conformity',
        '石英砂': 'Quartz sand',
        '火山岩': 'Volcanic rock',
        '深灰色': 'Dark gray',
        '薄层': 'Thin-bedded',
        '三叠': 'Triassic',
        '接触': 'Contact',
        '青海省': 'Qinghai Province',
        '厚度': 'Thickness',
        '粉砂': 'Silt',
        '混杂': 'Melange',
        '一带': 'Belt',
        '组成': 'Composition',
        '层理': 'Bedding',
        '时代': 'Age',
        '二叠': 'Permian',
        '凝灰岩': 'Tuff',
        '断层': 'Fault',
        '岩性': 'Lithology',
        '剖面': 'Section',
        '巴尕日': 'Bagari',
        '少量': 'Minor',
        '多彩': 'Variegated',
        '保组': 'Bao Formation',
        '区内': 'Within the area',
        '测区': 'Survey area',
        '巴塘': 'Batang',
        '结构': 'Texture',
        '火山': 'Volcanic',
        '碳酸盐岩': 'Carbonate rock',
        '组合': 'Assemblage',
        '上部': 'Upper',
        '钙质': 'Calcareous',
        '熔岩': 'Lava',
        '生物': 'Biogenic',
        '玄武岩': 'Basalt',
        '结扎': 'Jiezha',
        '腕足': 'Brachiopod',
        '发育': 'Well-developed',
        '九十': 'Ninety',
        '理化': 'Physicochemical',
        '东经': 'E longitude',
        '北纬': 'N latitude',
        '班组': 'Member',
        '中厚': 'Medium-bedded',
        '为主': 'Dominated',
        '通天河': 'Tongtian River',
        '一套': 'Suite',
        '安山岩': 'Andesite',
        '诺日': 'Nuori',
        '砾岩': 'Conglomerate',
        '碎屑': 'Clast',
        '紫色': 'Purple',
        '开心': 'Kaixin',
        '杂多群': 'Zaduo Group',
        '角度': 'Angle',
        '高程': 'Elevation',
        '地质': 'Geological',
        '控制': 'Control',
        '出露': 'Outcrop',
        '下部': 'Lower',
        '杂多县': 'Zaduo County',
        '双壳': 'Bivalve',
        '治多县': 'Zhidoi County',
        '片麻岩': 'Gneiss',
        '珊瑚': 'Coral',
        '中粒': 'Medium-grained',
        '局部': 'Locally',
        '本次': 'Present',
        '斜长石': 'Plagioclase',
        '石英': 'Quartz',
        '石炭世': 'Carboniferous',
        '划分': 'Subdivision',
        '其上': 'Overlying',
        '巴颜喀拉山': 'Bayan Har Mountains',
        '沉积': 'Sedimentary',
        '面积': 'Area',
        '其中': 'Including',
        '工作': 'Study',
        '二叠纪': 'Permian',
        '发现': 'Occurrence',
        '建立': 'Established',
        '三叠纪': 'Triassic',
        '夹有': 'Intercalated with',
        '反映': 'Indicating'
    }
    return mapping
|
|
|
def create_english_wordcloud_image(word_freq, width=1800, height=1200, max_words=100):
    """Render an English word cloud onto a new PIL image.

    The *max_words* most frequent words are drawn with font sizes scaled
    linearly between 24 and 104 px by frequency.  Each word is placed
    without overlapping previously drawn words by trying, in order: an
    outward spiral from the centre, a coarse grid scan around the centre,
    and finally random positions within a growing radius.  Words that
    cannot be placed within 2000 attempts are silently skipped.

    Args:
        word_freq: collections.Counter mapping word -> frequency.
        width: canvas width in pixels.
        height: canvas height in pixels.
        max_words: maximum number of words to attempt to draw.

    Returns:
        A PIL.Image.Image ('RGB', white background) with the rendered cloud.
    """
    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)

    font_path = get_english_font_path()

    top_words = word_freq.most_common(max_words)
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD',
              '#98D8C8', '#F7DC6F', '#BB8FCE', '#85C1E9', '#F8C471', '#82E0AA',
              '#FF8C94', '#FFB3BA', '#BFEFFF', '#FFD700', '#98FB98', '#DDA0DD']

    if not top_words:
        return img

    max_freq = top_words[0][1]
    min_freq = top_words[-1][1] if len(top_words) > 1 else max_freq

    # Fixed seeds so the layout is reproducible run to run.
    random.seed(42)
    np.random.seed(42)

    placed_boxes = []  # padded bounding boxes of already-drawn words
    center_x, center_y = width // 2, height // 2

    for i, (word, freq) in enumerate(top_words):
        # Linear size scaling; one shared size when all frequencies are equal.
        if max_freq != min_freq:
            font_size = int(24 + (freq - min_freq) / (max_freq - min_freq) * 80)
        else:
            font_size = 54

        color = colors[i % len(colors)]

        try:
            if font_path:
                font = ImageFont.truetype(font_path, font_size)
            else:
                font = ImageFont.load_default()
        except Exception:
            # Was a bare `except:` (also caught SystemExit/KeyboardInterrupt);
            # kept broad to cover any PIL font-loading error.
            font = ImageFont.load_default()

        # Measure the rendered text so we can test for collisions.
        bbox = draw.textbbox((0, 0), word, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]

        placed = False
        max_attempts = 2000
        attempt = 0

        while not placed and attempt < max_attempts:
            if attempt < 300:
                # Phase 1: spiral outward from the centre.
                angle = attempt * 0.15
                radius = 5 + attempt * 1.2
                x = int(center_x + radius * np.cos(angle) - text_width // 2)
                y = int(center_y + radius * np.sin(angle) - text_height // 2)
            elif attempt < 600:
                # Phase 2: coarse grid scan around the centre.
                grid_step = 8
                grid_offset = attempt - 300
                grid_x = (grid_offset % 40) - 20
                grid_y = (grid_offset // 40) - 15
                x = int(center_x + grid_x * grid_step - text_width // 2)
                y = int(center_y + grid_y * grid_step - text_height // 2)
            else:
                # Phase 3: random positions within a growing search radius.
                search_radius = min(300, 50 + attempt)
                angle = random.uniform(0, 2 * np.pi)
                radius = random.uniform(0, search_radius)
                x = int(center_x + radius * np.cos(angle) - text_width // 2)
                y = int(center_y + radius * np.sin(angle) - text_height // 2)

            # Keep a 15 px border clear around the canvas edge.
            if x < 15 or y < 15 or x + text_width > width - 15 or y + text_height > height - 15:
                attempt += 1
                continue

            # Pad the candidate box so neighbouring words don't touch.
            margin = max(2, font_size // 20)
            current_box = (x - margin, y - margin, x + text_width + margin, y + text_height + margin)
            overlap = False

            for placed_box in placed_boxes:
                # Axis-aligned rectangle intersection test.
                if (current_box[0] < placed_box[2] and current_box[2] > placed_box[0] and
                        current_box[1] < placed_box[3] and current_box[3] > placed_box[1]):
                    overlap = True
                    break

            if not overlap:
                draw.text((x, y), word, font=font, fill=color)
                placed_boxes.append(current_box)
                placed = True

            attempt += 1

    return img
|
|
|
def create_english_traditional_wordcloud(word_freq, max_words=100):
    """Render the English word cloud, save it twice, then display it.

    Writes a matplotlib render ('gt500_wordcloud_english_traditional.png')
    and the raw PIL canvas ('gt500_wordcloud_english_traditional_pil.png').

    Args:
        word_freq: collections.Counter of (translated) word -> frequency.
        max_words: maximum number of words to draw.
    """
    canvas = create_english_wordcloud_image(word_freq, max_words=max_words)

    plt.figure(figsize=(20, 14))
    plt.imshow(np.array(canvas))
    plt.axis('off')
    plt.title('GT_500 Dataset Text Content Word Cloud (English)',
              fontsize=24, fontweight='bold')
    plt.tight_layout()

    plt.savefig('gt500_wordcloud_english_traditional.png', dpi=400,
                bbox_inches='tight', facecolor='white', edgecolor='none')
    canvas.save('gt500_wordcloud_english_traditional_pil.png', 'PNG', quality=95)
    plt.show()
|
|
|
def create_wordcloud_image(word_freq, width=1800, height=1200, max_words=100):
    """Render a Chinese word cloud onto a new PIL image.

    Same placement algorithm as create_english_wordcloud_image, but uses a
    CJK-capable font: the *max_words* most frequent words are drawn with
    font sizes scaled linearly between 24 and 104 px by frequency, placed
    collision-free via an outward spiral, then a coarse grid scan, then
    random sampling.  Words that cannot be placed within 2000 attempts are
    silently skipped.

    Args:
        word_freq: collections.Counter mapping word -> frequency.
        width: canvas width in pixels.
        height: canvas height in pixels.
        max_words: maximum number of words to attempt to draw.

    Returns:
        A PIL.Image.Image ('RGB', white background) with the rendered cloud.
    """
    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)

    font_path = get_font_path()

    top_words = word_freq.most_common(max_words)
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD',
              '#98D8C8', '#F7DC6F', '#BB8FCE', '#85C1E9', '#F8C471', '#82E0AA',
              '#FF8C94', '#FFB3BA', '#BFEFFF', '#FFD700', '#98FB98', '#DDA0DD']

    if not top_words:
        return img

    max_freq = top_words[0][1]
    min_freq = top_words[-1][1] if len(top_words) > 1 else max_freq

    # Fixed seeds so the layout is reproducible run to run.
    random.seed(42)
    np.random.seed(42)

    placed_boxes = []  # padded bounding boxes of already-drawn words
    center_x, center_y = width // 2, height // 2

    for i, (word, freq) in enumerate(top_words):
        # Linear size scaling; one shared size when all frequencies are equal.
        if max_freq != min_freq:
            font_size = int(24 + (freq - min_freq) / (max_freq - min_freq) * 80)
        else:
            font_size = 54

        color = colors[i % len(colors)]

        try:
            if font_path:
                font = ImageFont.truetype(font_path, font_size)
            else:
                font = ImageFont.load_default()
        except Exception:
            # Was a bare `except:` (also caught SystemExit/KeyboardInterrupt);
            # kept broad to cover any PIL font-loading error.
            font = ImageFont.load_default()

        # Measure the rendered text so we can test for collisions.
        bbox = draw.textbbox((0, 0), word, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]

        placed = False
        max_attempts = 2000
        attempt = 0

        while not placed and attempt < max_attempts:
            if attempt < 300:
                # Phase 1: spiral outward from the centre.
                angle = attempt * 0.15
                radius = 5 + attempt * 1.2
                x = int(center_x + radius * np.cos(angle) - text_width // 2)
                y = int(center_y + radius * np.sin(angle) - text_height // 2)
            elif attempt < 600:
                # Phase 2: coarse grid scan around the centre.
                grid_step = 8
                grid_offset = attempt - 300
                grid_x = (grid_offset % 40) - 20
                grid_y = (grid_offset // 40) - 15
                x = int(center_x + grid_x * grid_step - text_width // 2)
                y = int(center_y + grid_y * grid_step - text_height // 2)
            else:
                # Phase 3: random positions within a growing search radius.
                search_radius = min(300, 50 + attempt)
                angle = random.uniform(0, 2 * np.pi)
                radius = random.uniform(0, search_radius)
                x = int(center_x + radius * np.cos(angle) - text_width // 2)
                y = int(center_y + radius * np.sin(angle) - text_height // 2)

            # Keep a 15 px border clear around the canvas edge.
            if x < 15 or y < 15 or x + text_width > width - 15 or y + text_height > height - 15:
                attempt += 1
                continue

            # Pad the candidate box so neighbouring words don't touch.
            margin = max(2, font_size // 20)
            current_box = (x - margin, y - margin, x + text_width + margin, y + text_height + margin)
            overlap = False

            for placed_box in placed_boxes:
                # Axis-aligned rectangle intersection test.
                if (current_box[0] < placed_box[2] and current_box[2] > placed_box[0] and
                        current_box[1] < placed_box[3] and current_box[3] > placed_box[1]):
                    overlap = True
                    break

            if not overlap:
                draw.text((x, y), word, font=font, fill=color)
                placed_boxes.append(current_box)
                placed = True

            attempt += 1

    return img
|
|
|
def create_traditional_wordcloud(word_freq, max_words=100):
    """Render the Chinese word cloud, save it twice, then display it.

    Writes a high-resolution matplotlib render ('gt500_wordcloud.png') and
    the raw PIL canvas ('gt500_wordcloud_pil.png').

    Args:
        word_freq: collections.Counter of word -> frequency.
        max_words: maximum number of words to draw.
    """
    canvas = create_wordcloud_image(word_freq, max_words=max_words)

    plt.figure(figsize=(20, 14))
    plt.imshow(np.array(canvas))
    plt.axis('off')
    # SimHei supplies the CJK glyphs for the title.
    plt.title('GT_500数据集文本内容词云图', fontsize=24, fontweight='bold',
              fontfamily='SimHei')
    plt.tight_layout()

    plt.savefig('gt500_wordcloud.png', dpi=1000, bbox_inches='tight',
                facecolor='white', edgecolor='none')
    canvas.save('gt500_wordcloud_pil.png', 'PNG', quality=95)
    plt.show()
|
|
|
def create_bar_chart(word_freq, top_n=100):
    """Plot a horizontal bar chart of the top-N word frequencies.

    Saves the figure to 'gt500_word_frequency_bar.png' and shows it.
    Returns early (no plot) when *word_freq* is empty.

    Args:
        word_freq: collections.Counter of word -> frequency.
        top_n: number of most common words to plot.
    """
    top_words = word_freq.most_common(top_n)
    if not top_words:
        # zip(*[]) would raise TypeError on unpacking; nothing to plot.
        return
    words, freqs = zip(*top_words)

    # Prefer fonts with CJK glyph coverage so Chinese labels render.
    plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(12, 8))
    bars = plt.barh(range(len(words)), freqs, color='skyblue')
    plt.yticks(range(len(words)), words)
    plt.xlabel('词频', fontsize=12)
    plt.title(f'GT_500数据集词频统计(前{top_n}位)', fontsize=16, fontweight='bold')

    # Annotate each bar with its exact count (the index from the original
    # enumerate() was unused, so plain zip suffices).
    for bar, freq in zip(bars, freqs):
        plt.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2,
                 str(freq), ha='left', va='center')

    plt.tight_layout()
    plt.savefig('gt500_word_frequency_bar.png', dpi=1000, bbox_inches='tight')
    plt.show()
|
|
|
def read_gt500_and_create_wordcloud():
    """Read the text fields of ../data/GT_500.json and build all outputs.

    Pipeline: load JSON -> clean and concatenate text fields -> jieba
    segmentation -> stop-word/length filtering -> frequency counting ->
    Chinese word cloud, English (translated) word cloud, bar chart, and
    JSON frequency dumps.  Prints progress throughout and returns None;
    aborts early (with a message) when the file is missing/invalid or
    yields no usable text.
    """
    try:
        with open('../data/GT_500.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print("错误:找不到 ../data/GT_500.json 文件")
        return
    except json.JSONDecodeError:
        print("错误:JSON文件格式错误")
        return

    all_text = ""
    text_count = 0

    for item in data:
        if isinstance(item, dict):
            # Accept several spellings of the text field.
            text_content = None
            for key in ['text', 'Text', 'TEXT', 'content', 'Content']:
                if key in item and item[key]:
                    text_content = item[key]
                    break

            if text_content:
                cleaned_text = clean_text(str(text_content))
                if cleaned_text.strip():
                    all_text += cleaned_text + " "
                    text_count += 1

    print(f"总共处理了 {len(data)} 条记录")
    print(f"其中 {text_count} 条记录包含有效文本内容")
    print(f"文本总长度: {len(all_text)} 字符")

    if not all_text.strip():
        print("错误:没有找到有效的文本内容")
        # Show a truncated sample so the user can see why nothing matched.
        if data:
            print("数据结构示例:")
            print(json.dumps(data[0] if isinstance(data, list) else data,
                             ensure_ascii=False, indent=2)[:500] + "...")
        return

    print("正在进行中文分词...")
    words = jieba.lcut(all_text)

    # Generic function words plus domain-generic terms that add no signal.
    stop_words = {'的', '了', '和', '是', '在', '有', '与', '对', '为', '等', '及', '或', '但', '而',
                  '这', '那', '此', '其', '就', '都', '也', '不', '之', '中', '上', '下', '以', '可',
                  '如', '被', '将', '由', '会', '能', '要', '很', '较', '比', '更', '最', '非常',
                  '地区', '区域', '部分', '方面', '主要', '具有', '包括', '通过', '进行', '发生',
                  '形成', '产生', '出现', '存在', '导致', '造成', '影响', '作用', '特点', '特征',
                  '相关', '一般', '通常', '可能', '由于', '因此', '所以', '然后', '同时', '另外'}

    # Keep tokens of length >= 2 that are not stop words or whitespace.
    filtered_words = [word for word in words if len(word) >= 2 and word not in stop_words and word.strip()]

    word_freq = Counter(filtered_words)
    print(f"分词后得到 {len(filtered_words)} 个有效词语")
    print(f"去重后有 {len(word_freq)} 个不同的词语")

    if not word_freq:
        print("错误:没有有效的词语用于生成词云")
        return

    print("\n词频前100:")
    for word, freq in word_freq.most_common(100):
        print(f"{word}: {freq}")

    print("\n正在创建传统词云图...")
    create_traditional_wordcloud(word_freq, max_words=100)

    print("正在翻译为英文并创建英文词云图...")
    translation_dict = translate_geological_terms()
    english_word_freq = Counter()

    for chinese_word, freq in word_freq.most_common(100):
        if chinese_word in translation_dict:
            english_word = translation_dict[chinese_word]
            # Accumulate instead of assign: several Chinese terms map to the
            # same English word ('二叠'/'二叠纪' -> 'Permian', '三叠'/'三叠纪'
            # -> 'Triassic'); plain assignment clobbered earlier counts.
            english_word_freq[english_word] += freq
        else:
            # No translation known: keep the Chinese word as-is.
            english_word_freq[chinese_word] += freq

    create_english_traditional_wordcloud(english_word_freq, max_words=100)

    print("正在创建词频条形图...")
    create_bar_chart(word_freq, top_n=100)

    with open('word_frequency.json', 'w', encoding='utf-8') as f:
        json.dump(dict(word_freq.most_common(100)), f, ensure_ascii=False, indent=2)

    with open('word_frequency_english.json', 'w', encoding='utf-8') as f:
        json.dump(dict(english_word_freq.most_common(100)), f, ensure_ascii=False, indent=2)

    print("\n完成!")
    print("- 传统词云图已保存为 'gt500_wordcloud.png' 和 'gt500_wordcloud_pil.png'")
    # NOTE: the original also announced 'gt500_wordcloud_spiral.png', but no
    # spiral word cloud is generated anywhere in this script, so that
    # misleading line was removed.
    print("- 英文传统词云图已保存为 'gt500_wordcloud_english_traditional.png' 和 'gt500_wordcloud_english_traditional_pil.png'")
    print("- 词频条形图已保存为 'gt500_word_frequency_bar.png'")
    print("- 中文词频统计已保存为 'word_frequency.json'")
    print("- 英文词频统计已保存为 'word_frequency_english.json'")
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full GT_500 word-cloud generation pipeline.
    read_gt500_and_create_wordcloud()