Spaces:

Hafnium49
/

material-universe

Sleeping

App Files Files Community

material-universe / scripts /apply_labels_to_map.py

Hafnium49

Initial deployment: Materials Database Explorer (Japanese)

412d7e5 verified about 1 month ago

raw

history blame contribute delete

12 kB

	"""
	Apply Scientific Labels to Material Universe Map

	Reads cluster definitions and regenerates the visualization with
	human-readable family names instead of cluster IDs, plus a summary table.

	Usage:
	uv run python scripts/apply_labels_to_map.py [--lang ja]

	Reads from:
	- material_universe_cache/materials_clustered.csv
	- docs/cluster_definitions.json

	Outputs:
	- docs/material_universe_map_labeled.html (English)
	- docs/material_universe_map_labeled_ja.html (Japanese, with --lang ja)
	"""

	import pandas as pd
	import json
	import plotly.express as px
	import os
	import argparse

	# CSV loaded by regenerate_map(); the script reads its Cluster, BandGap, x, y,
	# Formula and MP_ID columns.
	INPUT_CSV = "material_universe_cache/materials_clustered.csv"
	# JSON object loaded by regenerate_map() and used to map stringified cluster
	# IDs to family names.
	INPUT_LABELS = "docs/cluster_definitions.json"

	# User-facing strings keyed by language code, then by string ID.
	# Both languages define the same set of IDs; "output_file" is the path the
	# generated page is written to for that language.
	# NOTE: the crystal and family counts (33,973 / 21) are baked into the
	# subtitle and chart title text rather than computed from the data.
	TRANSLATIONS = {
	    "en": {
	        # Page heading, subtitle and chart title
	        "title": "The Material Universe",
	        "subtitle": (
	            "33,973 stable crystals clustered into 21 material families "
	            "using Tri-Fusion embeddings (Orb-v3 + MEGNet + OFM)"
	        ),
	        "chart_title": "The Material Universe: 33,973 Crystals in 21 Families",
	        "family_summary": "Family Summary",
	        "legend_title": "Material Family",
	        # Summary table column headers
	        "col_id": "ID",
	        "col_family": "Material Family",
	        "col_count": "Count",
	        "col_percent": "%",
	        "col_bandgap": "Avg Band Gap",
	        "col_type": "Type",
	        # Electronic type labels
	        "type_metallic": "Metallic",
	        "type_semiconductor": "Semiconductor",
	        "type_insulator": "Insulator",
	        # Destination of the generated page
	        "output_file": "docs/material_universe_map_labeled.html",
	    },
	    "ja": {
	        # Page heading, subtitle and chart title
	        "title": "マテリアル・ユニバース",
	        "subtitle": (
	            "Tri-Fusion埋め込み（Orb-v3 + MEGNet + OFM）を用いて"
	            "21の材料ファミリーにクラスタリングされた33,973の安定結晶"
	        ),
	        "chart_title": "マテリアル・ユニバース: 33,973結晶、21ファミリー",
	        "family_summary": "ファミリー概要",
	        "legend_title": "材料ファミリー",
	        # Summary table column headers
	        "col_id": "ID",
	        "col_family": "材料ファミリー",
	        "col_count": "数",
	        "col_percent": "%",
	        "col_bandgap": "平均バンドギャップ",
	        "col_type": "タイプ",
	        # Electronic type labels
	        "type_metallic": "金属",
	        "type_semiconductor": "半導体",
	        "type_insulator": "絶縁体",
	        # Destination of the generated page
	        "output_file": "docs/material_universe_map_labeled_ja.html",
	    },
	}

	# Japanese display names for material families, keyed by the English family
	# name. Names missing from this table are shown untranslated by
	# translate_family().
	FAMILY_TRANSLATIONS_JA = {
	    "Unclassified / Noise": "未分類 / ノイズ",
	    # regenerate_map() labels clusters that have no entry in the labels file
	    # with the bare string "Unclassified", so that fallback needs its own key.
	    "Unclassified": "未分類",
	    "Intermetallic Alloys (Li/Mg/Rare Earth)": "金属間化合物（Li/Mg/希土類）",
	    "Alkali Metal Fluorides": "アルカリ金属フッ化物",
	    "Molybdates & Vanadates": "モリブデン酸塩 & バナジン酸塩",
	    "Chalcogenide Oxysalts (Sulfates, Selenates, Tellurates)": "カルコゲン酸塩（硫酸塩、セレン酸塩、テルル酸塩）",
	    "Phosphates & Arsenates": "リン酸塩 & ヒ酸塩",
	    "Hydrides & Hydroxides": "水素化物 & 水酸化物",
	    "Zintl Phases (Pnictides)": "ジントル相（プニクタイド）",
	    "Chalcogenides (Sulfides, Selenides, Tellurides)": "カルコゲナイド（硫化物、セレン化物、テルル化物）",
	    "Intermetallics & Zintl Compounds (Al/Ga/Ge/Si)": "金属間化合物 & ジントル化合物（Al/Ga/Ge/Si）",
	    "Perovskite Oxides (Ru, Ir, Pt Group)": "ペロブスカイト酸化物（Ru, Ir, Pt族）",
	    "Nickel Intermetallics": "ニッケル金属間化合物",
	    "Nitrides & Carbonitrides": "窒化物 & 炭窒化物",
	    "Heavy Halides (Bromides & Iodides)": "重ハロゲン化物（臭化物 & ヨウ化物）",
	    "Chlorides": "塩化物",
	    "Borides & Silicides": "ホウ化物 & ケイ化物",
	    "Platinum Group Metal Borides": "白金族金属ホウ化物",
	    "Transition Metal Oxides (Mn, Fe, Co, Ni)": "遷移金属酸化物（Mn, Fe, Co, Ni）",
	    "Perovskite-type Tantalates, Niobates & Titanates": "ペロブスカイト型タンタル酸塩、ニオブ酸塩 & チタン酸塩",
	    "Alkali Metal Oxides": "アルカリ金属酸化物",
	    "Silicates & Aluminosilicates": "ケイ酸塩 & アルミノケイ酸塩",
	    "Borates": "ホウ酸塩",
	}


	def get_electronic_type(avg_gap, lang="en"):
	    """Classify a family's electronic type from its average band gap.

	    Args:
	        avg_gap: Mean band gap of the family (the summary table prints it
	            in eV).
	        lang: Key into TRANSLATIONS selecting the label language.

	    Returns:
	        The localized label: metallic below 0.1, insulator above 3.0 and
	        semiconductor in between, or "N/A" when avg_gap is missing.

	    Raises:
	        KeyError: If lang has no entry in TRANSLATIONS.
	    """
	    labels = TRANSLATIONS[lang]
	    # A missing mean (NaN/None) fails both comparisons below and would
	    # otherwise be reported as a semiconductor.
	    if pd.isna(avg_gap):
	        return "N/A"
	    if avg_gap < 0.1:
	        return labels["type_metallic"]
	    if avg_gap > 3.0:
	        return labels["type_insulator"]
	    return labels["type_semiconductor"]


	def translate_family(family, lang):
	    """Return the display name of a family in the requested language.

	    Only "ja" has a translation table; for every other language, and for
	    names missing from the table, the name is returned unchanged.
	    """
	    if lang != "ja":
	        return family
	    return FAMILY_TRANSLATIONS_JA.get(family, family)


	def generate_summary_table(df, sorted_families, lang="en"):
	    """Generate the HTML summary table with one row per material family.

	    Args:
	        df: DataFrame with "Family", "BandGap" and "ClusterID" columns.
	        sorted_families: Family names in the order the rows should appear.
	        lang: Key into TRANSLATIONS for the headers and type labels; also
	            used to translate the family names.

	    Returns:
	        A string holding a <table class="summary-table"> element. Families
	        with no rows in df are omitted.

	    Raises:
	        KeyError: If lang has no entry in TRANSLATIONS.
	    """
	    # Local import so the block does not depend on the file-level import list.
	    from html import escape

	    t = TRANSLATIONS[lang]
	    total = len(df)
	    rows = []

	    for family in sorted_families:
	        subset = df[df["Family"] == family]
	        count = len(subset)
	        if count == 0:
	            # Nothing to report; iloc[0] below would raise IndexError.
	            continue

	        pct = count / total * 100
	        avg_gap = subset["BandGap"].mean()
	        e_type = get_electronic_type(avg_gap, lang)

	        # Several clusters may map to one family; show the first ID only.
	        # Names and IDs come from data files, so escape them for HTML
	        # (several family names contain "&").
	        cluster_id = escape(str(subset["ClusterID"].iloc[0]))
	        family_display = escape(str(translate_family(family, lang)))

	        rows.append(
	            f"<tr><td>{cluster_id}</td><td>{family_display}</td><td>{count:,}</td>"
	            f"<td>{pct:.1f}%</td><td>{avg_gap:.2f} eV</td><td>{e_type}</td></tr>"
	        )

	    body_rows = "".join(rows)
	    table_html = f"""
	<table class="summary-table">
	<thead>
	<tr>
	<th>{t["col_id"]}</th>
	<th>{t["col_family"]}</th>
	<th>{t["col_count"]}</th>
	<th>{t["col_percent"]}</th>
	<th>{t["col_bandgap"]}</th>
	<th>{t["col_type"]}</th>
	</tr>
	</thead>
	<tbody>
	{body_rows}
	</tbody>
	</table>
	"""
	    return table_html


	def _render_page_html(lang, strings, chart_html, summary_table):
	    """Wrap the chart and the summary table in a standalone HTML page.

	    Args:
	        lang: Language code written to the <html lang="..."> attribute.
	        strings: The TRANSLATIONS entry for that language.
	        chart_html: HTML fragment of the chart, inserted verbatim.
	        summary_table: HTML fragment of the table, inserted verbatim.

	    Returns:
	        The complete HTML document as a string.
	    """
	    return f"""<!DOCTYPE html>
	<html lang="{lang}">
	<head>
	<meta charset="utf-8">
	<title>{strings["title"]}</title>
	<style>
	body {{
	font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Hiragino Sans", "Noto Sans JP", sans-serif;
	margin: 0;
	padding: 20px;
	background: #fafafa;
	}}
	h1 {{
	color: #333;
	margin-bottom: 10px;
	}}
	.subtitle {{
	color: #666;
	margin-bottom: 20px;
	}}
	.chart-container {{
	background: white;
	border-radius: 8px;
	box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	padding: 20px;
	margin-bottom: 30px;
	}}
	.summary-container {{
	background: white;
	border-radius: 8px;
	box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	padding: 20px;
	max-width: 1000px;
	}}
	h2 {{
	color: #333;
	margin-top: 0;
	}}
	.summary-table {{
	width: 100%;
	border-collapse: collapse;
	font-size: 14px;
	}}
	.summary-table th {{
	background: #f5f5f5;
	padding: 12px 8px;
	text-align: left;
	border-bottom: 2px solid #ddd;
	font-weight: 600;
	}}
	.summary-table td {{
	padding: 10px 8px;
	border-bottom: 1px solid #eee;
	}}
	.summary-table tr:hover {{
	background: #f9f9f9;
	}}
	.summary-table tr:nth-child(even) {{
	background: #fafafa;
	}}
	.summary-table tr:nth-child(even):hover {{
	background: #f5f5f5;
	}}
	</style>
	</head>
	<body>
	<h1>{strings["title"]}</h1>
	<p class="subtitle">{strings["subtitle"]}</p>

	<div class="chart-container">
	{chart_html}
	</div>

	<div class="summary-container">
	<h2>{strings["family_summary"]}</h2>
	{summary_table}
	</div>
	</body>
	</html>
	"""


	def regenerate_map(lang="en"):
	    """Generate the labeled map in the specified language.

	    Reads INPUT_CSV and INPUT_LABELS, assigns every row a family name from
	    its cluster ID, and writes an HTML page holding a Plotly scatter plot
	    and the family summary table to the language's "output_file" path.
	    Progress and the ten largest families are printed to stdout.

	    Args:
	        lang: Key into TRANSLATIONS ("en" or "ja").

	    Returns:
	        None. If either input file is missing, an error message is printed
	        and nothing is written.

	    Raises:
	        KeyError: If lang has no entry in TRANSLATIONS.
	    """
	    t = TRANSLATIONS[lang]
	    output_html = t["output_file"]

	    if not os.path.exists(INPUT_LABELS):
	        print(f"Error: {INPUT_LABELS} not found. Run Phase 2 analysis first.")
	        return

	    if not os.path.exists(INPUT_CSV):
	        print(f"Error: {INPUT_CSV} not found. Run clustering script first.")
	        return

	    print(f"Loading data and labels (lang={lang})...")
	    df = pd.read_csv(INPUT_CSV)
	    # Read as UTF-8 explicitly so non-ASCII labels do not depend on the
	    # platform's default encoding.
	    with open(INPUT_LABELS, "r", encoding="utf-8") as f:
	        label_map = json.load(f)

	    # JSON object keys are strings, so compare against stringified IDs.
	    # NOTE(review): if the Cluster column is parsed as float the strings
	    # would look like "1.0" and miss every key - confirm the CSV holds ints.
	    df["ClusterID"] = df["Cluster"].astype(str)

	    # Apply mapping; default to "Unclassified" if not found
	    df["Family"] = df["ClusterID"].map(label_map).fillna("Unclassified")

	    # Count materials per family for sorting
	    family_counts = df["Family"].value_counts()
	    df["FamilySize"] = df["Family"].map(family_counts)

	    # Sort by family size (largest first), then by family name
	    df = df.sort_values(["FamilySize", "Family"], ascending=[False, True])

	    # Largest family first; equal sizes are ordered by name so the legend
	    # order and colors are the same on every run.
	    sorted_families = sorted(
	        family_counts.index,
	        key=lambda name: (-family_counts[name], name),
	    )

	    print(f" Materials: {len(df):,}")
	    print(f" Families: {len(sorted_families)}")

	    # Translate family names for display in chart
	    if lang != "en":
	        df["FamilyDisplay"] = df["Family"].apply(lambda x: translate_family(x, lang))
	        sorted_families_display = [translate_family(f, lang) for f in sorted_families]
	        color_col = "FamilyDisplay"
	    else:
	        sorted_families_display = sorted_families
	        color_col = "Family"

	    print("\nGenerating Labeled Map...")
	    fig = px.scatter(
	        df,
	        x="x",
	        y="y",
	        color=color_col,
	        category_orders={color_col: sorted_families_display},  # Sort legend by size
	        hover_data=["Formula", "MP_ID", "BandGap", "ClusterID"],
	        title=t["chart_title"],
	        template="plotly_white",
	        render_mode="webgl",
	        width=1400,
	        height=900,
	    )

	    # Update layout for better visualization
	    fig.update_traces(marker=dict(size=3, opacity=0.7))
	    fig.update_layout(
	        legend=dict(
	            orientation="v",
	            yanchor="top",
	            y=1,
	            xanchor="left",
	            x=1.02,
	            title=t["legend_title"],
	            font=dict(size=10),
	        ),
	        margin=dict(r=280 if lang == "ja" else 250),  # More space for Japanese
	    )

	    # The table is keyed by the untranslated names; it translates them itself.
	    summary_table = generate_summary_table(df, sorted_families, lang)

	    # Get Plotly chart as HTML div
	    chart_html = fig.to_html(full_html=False, include_plotlyjs="cdn")

	    full_html = _render_page_html(lang, t, chart_html, summary_table)

	    # Ensure output directory exists; a bare filename has no directory part
	    # and os.makedirs("") would raise.
	    output_dir = os.path.dirname(output_html)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    with open(output_html, "w", encoding="utf-8") as f:
	        f.write(full_html)

	    file_size_mb = os.path.getsize(output_html) / (1024 * 1024)
	    print(f"\nLabeled Map saved to: {output_html}")
	    print(f" File size: {file_size_mb:.1f} MB")

	    # Print family distribution
	    print("\nFamily Distribution:")
	    for family in sorted_families[:10]:
	        count = family_counts[family]
	        pct = count / len(df) * 100
	        print(f" {family}: {count:,} ({pct:.1f}%)")
	    if len(sorted_families) > 10:
	        print(f" ... and {len(sorted_families) - 10} more families")


	def _parse_cli_args():
	    """Parse the command line; the only option is --lang (en or ja, default en)."""
	    cli = argparse.ArgumentParser(description="Generate labeled Material Universe map")
	    cli.add_argument(
	        "--lang",
	        default="en",
	        choices=["en", "ja"],
	        help="Language for labels (en=English, ja=Japanese)",
	    )
	    return cli.parse_args()


	# Script entry point: build the map in the language chosen on the command line.
	if __name__ == "__main__":
	    regenerate_map(lang=_parse_cli_args().lang)