Spaces:

jhu-clsp
/

ScienceHierarchography

Sleeping

Muhan Gao

Add citation-based sorting functionality

7720dc2 10 months ago

52.7 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import os
	import json
	import gzip
	import re
	from urllib.parse import quote, unquote

	# Global CSS (plus a small toggle script) injected by main() through st.markdown; forces a white page background.
	CUSTOM_CSS = """
	<style>
	/* Set default background color */
	body {
	background-color: white !important;
	}

	.stApp {
	background-color: white !important;
	}

	h1 {
	color: #2E4053;
	font-family: 'Helvetica Neue', sans-serif;
	font-size: 2.8rem !important;
	border-bottom: 3px solid #3498DB;
	padding-bottom: 0.3em;
	}

	h2, h3, h4 {
	color: #2C3E50 !important;
	font-family: 'Arial Rounded MT Bold', sans-serif;
	}

	.metric-card {
	background: linear-gradient(145deg, #F8F9FA 0%, #FFFFFF 100%);
	border-radius: 12px;
	padding: 1.2rem;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
	border: 1px solid #E0E7FF;
	transition: transform 0.2s;
	}

	.metric-card:hover {
	transform: translateY(-2px);
	}

	.citation-badge:hover::after,
	.influential-badge:hover::after {
	content: attr(title);
	position: absolute;
	bottom: calc(100% + 5px);
	left: 50%;
	transform: translateX(-50%);
	background-color: rgba(0, 0, 0, 0.8);
	color: #fff;
	padding: 5px 10px;
	border-radius: 4px;
	white-space: nowrap;
	z-index: 100;
	opacity: 0;
	pointer-events: none;
	transition: opacity 0.3s ease;
	}

	.citation-badge:hover::after,
	.influential-badge:hover::after {
	opacity: 1;
	}

	.path-nav {
	color: #6C757D;
	font-size: 0.95rem;
	padding: 0.8rem 1rem;
	background: #F8F9FA;
	border-radius: 8px;
	margin: 0.5rem 0; /* 减少上下margin */
	}

	.stButton>button {
	background: #3498DB !important;
	color: white !important;
	border-radius: 8px !important;
	padding: 8px 20px !important;
	border: none !important;
	transition: all 0.3s !important;
	}

	.stButton>button:hover {
	background: #2980B9 !important;
	transform: scale(1.05);
	box-shadow: 0 4px 8px rgba(52, 152, 219, 0.3);
	}

	.paper-card, .cluster-card {
	background: white;
	border-radius: 10px;
	padding: 1.5rem;
	margin: 1rem 0;
	box-shadow: 0 2px 8px rgba(0, 0, 0, 0.06);
	border: 1px solid #EAEDF3;
	overflow: hidden;
	}

	/* 调整标题的字号 - 增大cluster title */
	.paper-title, .cluster-title {
	color: #2C3E50;
	font-size: 1.3rem !important; /* 增大原来的字号 */
	font-weight: 700; /* 加粗 */
	margin-bottom: 0.5rem;
	cursor: pointer;
	}

	.paper-abstract, .cluster-abstract {
	color: #6C757D;
	line-height: 1.6;
	font-size: 0.95rem;
	margin: 1rem 0;
	padding: 0.8rem;
	background: #F9FAFB;
	border-radius: 8px;
	border-left: 4px solid #3498DB;
	}

	/* 减少expander之间的间距 */
	.streamlit-expanderHeader {
	font-weight: 600 !important;
	color: #2C3E50 !important;
	margin-top: 0.5rem !important;
	margin-bottom: 0.5rem !important;
	}

	/* 调整expander的内部和外部间距 */
	.streamlit-expander {
	margin-top: 0.5rem !important;
	margin-bottom: 0.5rem !important;
	}

	/* 更紧凑的expander内容区 */
	.streamlit-expanderContent {
	background: #FAFAFA;
	border-radius: 0 0 8px 8px;
	border: 1px solid #EAEDF3;
	border-top: none;
	padding: 8px 12px !important; /* 减少内部padding */
	}

	/* Additional styles */
	.paper-section, .cluster-section {
	margin-top: 20px;
	padding: 15px;
	border-radius: 8px;
	background: #FAFAFA;
	border-left: 4px solid #3498DB;
	}

	.paper-section-title, .cluster-section-title {
	color: #2C3E50;
	font-weight: 600;
	margin-bottom: 10px;
	border-bottom: 2px solid #EEE;
	padding-bottom: 5px;
	}

	.section-problem {
	border-left-color: #3498DB;
	}

	.section-solution {
	border-left-color: #2ECC71;
	}

	.section-results {
	border-left-color: #9B59B6;
	}

	.label {
	font-weight: 600;
	color: #34495E;
	margin-bottom: 5px;
	}

	.value-box {
	background: #F8F9FA;
	padding: 10px;
	border-radius: 5px;
	margin-bottom: 10px;
	font-size: 0.95rem;
	color: #333;
	line-height: 1.5;
	}

	/* Citation badge styles */
	.citation-badge, .influential-badge {
	display: inline-flex;
	align-items: center;
	padding: 4px 8px;
	border-radius: 6px;
	font-size: 0.85rem;
	font-weight: 600;
	gap: 4px;
	white-space: nowrap;
	}

	.citation-badge {
	background: #EBF5FB;
	color: #2980B9;
	}

	.influential-badge {
	background: #FCF3CF;
	color: #F39C12;
	}

	.citation-icon, .influential-icon {
	font-size: 1rem;
	}

	/* 修改后的引用统计格式 */
	.citation-stats, .influential-stats {
	display: flex;
	align-items: center;
	padding: 4px 12px;
	border-radius: 6px;
	font-size: 0.85rem;
	margin-bottom: 6px;
	white-space: nowrap;
	}

	.citation-stats {
	background: #EBF5FB;
	color: #2980B9;
	}

	.influential-stats {
	background: #FCF3CF;
	color: #F39C12;
	}

	.stats-divider {
	margin: 0 6px;
	color: rgba(0,0,0,0.2);
	}

	/* Field of study badge */
	.field-badge {
	display: inline-block;
	background: #F1F8E9;
	color: #558B2F;
	padding: 3px 10px;
	border-radius: 16px;
	font-size: 0.75rem;
	font-weight: 500;
	border: 1px solid #C5E1A5;
	}

	/* JSON value display */
	.json-value {
	background: #F8F9FA;
	padding: 10px;
	border-radius: 6px;
	margin-bottom: 10px;
	white-space: pre-wrap;
	font-family: monospace;
	font-size: 0.9rem;
	line-height: 1.5;
	color: #2C3E50;
	overflow-x: auto;
	}

	/* Collapsible content */
	.cluster-content {
	display: none;
	}

	.cluster-content.show {
	display: block;
	}

	/* 重新设计集群标题区布局 */
	.cluster-header {
	display: flex;
	flex-wrap: wrap;
	justify-content: space-between;
	align-items: center;
	padding-bottom: 10px;
	border-bottom: 1px solid #eee;
	margin-bottom: 0px;
	}

	/* 左侧标题和集群信息 */
	.cluster-header-left {
	display: flex;
	align-items: center;
	flex: 1;
	min-width: 200px;
	}

	/* 中间区域用于摘要展开器 */
	.cluster-header-middle {
	display: flex;
	flex: 0 0 auto;
	margin: 0 15px;
	}

	/* 右侧统计数据 */
	.cluster-badge-container {
	display: flex;
	flex-wrap: wrap;
	gap: 6px;
	justify-content: flex-end;
	}

	/* 子集群查看按钮 */
	.view-button {
	margin-left: 15px;
	}

	/* 调整h3标题的上下margin */
	h3 {
	margin-top: 1rem !important;
	margin-bottom: 0.5rem !important;
	}

	/* 调整内容区块的上下margin */
	.stBlock {
	margin-top: 0.5rem !important;
	margin-bottom: 0.5rem !important;
	}

	/* 内联expander按钮样式 */
	.inline-expander-button {
	background: #E3F2FD;
	border: 1px solid #BBDEFB;
	border-radius: 4px;
	padding: 4px 8px;
	font-size: 0.85rem;
	color: #1976D2;
	cursor: pointer;
	display: inline-flex;
	align-items: center;
	transition: all 0.2s;
	}

	.inline-expander-button:hover {
	background: #BBDEFB;
	}

	/* 导航路径中的按钮样式 */
	.path-nav-button {
	display: inline-block;
	margin: 0 5px;
	padding: 5px 10px;
	background: #E3F2FD;
	border-radius: 5px;
	color: #1976D2;
	cursor: pointer;
	font-weight: 500;
	font-size: 0.9rem;
	border: none;
	transition: all 0.2s;
	}

	.path-nav-button:hover {
	background: #BBDEFB;
	}

	/* 路径导航容器样式 */
	.path-nav {
	color: #6C757D;
	font-size: 0.95rem;
	padding: 0.8rem 1rem;
	background: #F8F9FA;
	border-radius: 8px;
	margin: 0.8rem 0;
	}

	/* Paper count badge style */
	.paper-count-badge {
	display: inline-flex;
	align-items: center;
	margin-left: 12px;
	background: #E8F4FD;
	color: #2980B9;
	padding: 3px 8px;
	border-radius: 12px;
	font-size: 0.85rem;
	font-weight: 500;
	}
	</style>

	<script>
	function toggleClusterContent(id) {
	const content = document.getElementById('cluster-content-' + id);
	if (content) {
	content.classList.toggle('show');
	}
	}
	</script>
	"""

	def get_hierarchy_files():
	    """Return the hierarchy JSON filenames found in the ``hierarchies`` directory.

	    Both plain ``*.json`` files and gzip-compressed ``*.json.gz`` files are
	    listed. A compressed file is reported under its ``.json`` name, because
	    ``load_hierarchy_data`` takes the ``.json`` name and appends ``.gz`` itself
	    when the plain file is missing.

	    Returns:
	        A sorted list of unique ``*.json`` filenames; an empty list when the
	        directory does not exist.
	    """
	    hierarchy_dir = 'hierarchies'
	    if not os.path.isdir(hierarchy_dir):
	        return []

	    names = set()
	    for entry in os.listdir(hierarchy_dir):
	        if entry.endswith('.json'):
	            names.add(entry)
	        elif entry.endswith('.json.gz'):
	            # Report the logical name so a file present in both forms appears once.
	            names.add(entry[:-len('.gz')])

	    # os.listdir order is arbitrary; sort so the select box is stable across runs.
	    files = sorted(names)
	    print(f"Found files: {files}")
	    return files

	def parse_filename(filename):
	    """Parse a hierarchy filename into its metadata fields.

	    The name is split on ``_`` after removing ``.json``. Fields are read by
	    position: 1 = date, 2 = embedder, 3 = summarizer, 4 = clustering method,
	    5 = placeholder (ignored), 6 = contribution type, then the building
	    method, the ``-``-separated cluster levels and the random seed. The
	    building methods ``top_down`` and ``bottom_up`` span two fields.

	    Args:
	        filename: Hierarchy filename, e.g. ``..._top_down_10-20-30_42.json``.

	    Returns:
	        A dict with keys ``date``, ``embedder``, ``summarizer``,
	        ``clustermethod``, ``contribution_type``, ``building_method``,
	        ``clusterlevel``, ``clusterlevel_array``, ``level_count`` and
	        ``random_seed``. Missing trailing fields are reported as
	        ``'Unknown'``; names with fewer than 7 fields yield all-unknown values.
	    """
	    parts = filename.replace('.json', '').split('_')

	    # Fields 0..6 are read unconditionally below, so 7 fields are required.
	    if len(parts) < 7:
	        return {
	            'date': 'Unknown',
	            'embedder': 'Unknown',
	            'summarizer': 'Unknown',
	            'clustermethod': 'Unknown',
	            'contribution_type': 'Unknown',
	            'building_method': 'Unknown',
	            'clusterlevel': 'Unknown',
	            'clusterlevel_array': [],
	            'level_count': 0,
	            'random_seed': 'Unknown'
	        }

	    date_str, embedder, summarizer, clustermethod = parts[1:5]
	    contribution_type = parts[6]

	    # Everything after the contribution type: building method, levels, seed.
	    remainder = parts[7:]
	    if remainder[:2] in (['top', 'down'], ['bottom', 'up']):
	        # Compound method names contain the separator themselves.
	        building_method = '_'.join(remainder[:2])
	        remainder = remainder[2:]
	    elif remainder:
	        building_method = remainder[0]
	        remainder = remainder[1:]
	    else:
	        building_method = None

	    clusterlevel_str = remainder[0] if remainder else None
	    seed = remainder[1] if len(remainder) > 1 else None

	    # Format an 8-digit date as YYYY/MM/DD for readability.
	    if len(date_str) == 8:
	        formatted_date = f"{date_str[:4]}/{date_str[4:6]}/{date_str[6:]}"
	    else:
	        formatted_date = date_str

	    clusterlevel_array = clusterlevel_str.split('-') if clusterlevel_str else []

	    return {
	        'date': formatted_date,
	        'embedder': embedder,
	        'summarizer': summarizer,
	        'clustermethod': clustermethod,
	        'contribution_type': contribution_type,
	        'building_method': building_method or 'Unknown',
	        'clusterlevel': clusterlevel_str or 'Unknown',
	        'clusterlevel_array': clusterlevel_array,
	        'level_count': len(clusterlevel_array),
	        'random_seed': seed or 'Unknown'
	    }

	def format_hierarchy_option(filename):
	    """Build the human-readable label shown for one hierarchy file.

	    The label is assembled from the fields returned by ``parse_filename``;
	    the cluster levels are joined with a multiplication sign.
	    """
	    meta = parse_filename(filename)
	    joined_levels = "×".join(meta['clusterlevel_array'])

	    details = ", ".join((
	        f"{meta['embedder']}/{meta['summarizer']}",
	        f"{meta['contribution_type']}",
	        f"{meta['building_method']}",
	        f"{meta['level_count']} levels: {joined_levels}",
	        f"seed: {meta['random_seed']}",
	    ))
	    return f"{meta['date']} - {meta['clustermethod']} ({details})"

	@st.cache_data
	def load_hierarchy_data(filename):
	    """Load a hierarchy JSON document from the ``hierarchies`` directory.

	    The plain file is preferred; when it does not exist, ``<filename>.gz`` is
	    read as a gzip-compressed fallback. Both are decoded as UTF-8 so the
	    result does not depend on the platform's default encoding.

	    Args:
	        filename: Name of the ``.json`` file inside ``hierarchies``.

	    Returns:
	        The parsed JSON document, or ``{"clusters": []}`` after reporting the
	        problem with ``st.error`` when no file exists or loading fails.
	    """
	    filepath = os.path.join('hierarchies', filename)
	    gzip_filepath = filepath + '.gz'

	    # Uncompressed version first.
	    if os.path.exists(filepath):
	        try:
	            with open(filepath, 'r', encoding='utf-8') as f:
	                return json.load(f)
	        except Exception as e:
	            # Mirror the compressed branch: report and fall back to an empty hierarchy.
	            st.error(f"Error loading file {filepath}: {str(e)}")
	            return {"clusters": []}

	    # Gzip-compressed version second.
	    if os.path.exists(gzip_filepath):
	        try:
	            with gzip.open(gzip_filepath, 'rt', encoding='utf-8') as f:
	                return json.load(f)
	        except Exception as e:
	            st.error(f"Error loading compressed file {gzip_filepath}: {str(e)}")
	            return {"clusters": []}

	    st.error(f"Could not find hierarchy file: {filepath} or {gzip_filepath}")
	    return {"clusters": []}

	def get_cluster_statistics(clusters):
	    """Summarise paper counts for the clusters shown at one level.

	    Args:
	        clusters: List of ``(cluster, path)`` tuples.

	    Returns:
	        A dict mapping each statistic label to ``{'value': ..., 'tooltip': ...}``,
	        where the tooltip is the hover text for that statistic. All paper
	        statistics are 0 when ``clusters`` is empty.
	    """
	    def papers_under(node):
	        # Number of papers below a node; a node whose first child has a
	        # "paper_id" is treated as holding papers directly.
	        kids = node["children"] if "children" in node else None
	        if not kids:
	            return 0
	        if "paper_id" in kids[0]:
	            return len(kids)
	        return sum(map(papers_under, kids))

	    cluster_count = len(clusters)
	    per_cluster = [papers_under(cluster) for cluster, _path in clusters]

	    if per_cluster:
	        total = sum(per_cluster)
	        values = (
	            cluster_count,
	            total,
	            round(total / cluster_count, 2),
	            round(np.median(per_cluster), 2),
	            round(np.std(per_cluster), 2),
	            max(per_cluster),
	            min(per_cluster),
	        )
	    else:
	        values = (cluster_count, 0, 0, 0, 0, 0, 0)

	    layout = (
	        ('Total Clusters', 'Total number of clusters at this level'),
	        ('Total Papers', 'Total number of papers across all clusters at this level'),
	        ('Average Papers per Cluster', 'Average number of papers per cluster'),
	        ('Median Papers', 'Median number of papers per cluster'),
	        ('Standard Deviation', 'Standard deviation of paper counts across clusters'),
	        ('Max Papers in Cluster', 'Maximum number of papers in any single cluster'),
	        ('Min Papers in Cluster', 'Minimum number of papers in any single cluster'),
	    )
	    return {
	        label: {'value': value, 'tooltip': tooltip}
	        for (label, tooltip), value in zip(layout, values)
	    }

	def calculate_citation_metrics(node):
	    """Aggregate citation statistics over every paper beneath ``node``.

	    A node whose first child is a dict containing ``paper_id`` is treated as
	    holding papers directly; otherwise its dict children are visited
	    recursively. Counts are read from each paper's ``semantic_scholar``
	    entry; a missing or null entry or count is treated as 0.

	    Args:
	        node: A cluster dict, optionally with a ``children`` list.

	    Returns:
	        A dict with ``total_citations``, ``avg_citations``, ``max_citations``,
	        ``total_influential_citations``, ``avg_influential_citations``,
	        ``max_influential_citations`` and ``paper_count``. Averages are
	        rounded to two decimals; every value is 0 when no paper is found.
	    """
	    citation_values = []  # citation count of each paper
	    influential_citation_values = []  # influential citation count of each paper

	    def process_node(n):
	        children = n.get("children") if isinstance(n, dict) else None
	        if not children:
	            return

	        first = children[0]
	        if isinstance(first, dict) and "paper_id" in first:
	            # This node contains papers directly.
	            for paper in children:
	                if not isinstance(paper, dict):
	                    continue
	                semantic_scholar = paper.get('semantic_scholar') or {}
	                # `or 0`: a key that is present with a null value must not
	                # break the sums below.
	                citation_values.append(semantic_scholar.get('citationCount') or 0)
	                influential_citation_values.append(
	                    semantic_scholar.get('influentialCitationCount') or 0
	                )
	        else:
	            # Recurse into child clusters.
	            for child in children:
	                if isinstance(child, dict):
	                    process_node(child)

	    process_node(node)

	    paper_count = len(citation_values)
	    total_citations = sum(citation_values)
	    total_influential_citations = sum(influential_citation_values)

	    return {
	        'total_citations': total_citations,
	        'avg_citations': round(total_citations / paper_count, 2) if paper_count > 0 else 0,
	        'max_citations': max(citation_values) if citation_values else 0,
	        'total_influential_citations': total_influential_citations,
	        'avg_influential_citations': (
	            round(total_influential_citations / paper_count, 2) if paper_count > 0 else 0
	        ),
	        'max_influential_citations': (
	            max(influential_citation_values) if influential_citation_values else 0
	        ),
	        'paper_count': paper_count
	    }

	def find_clusters_in_path(data, path):
	    """Return the items located directly below ``path`` in the hierarchy.

	    Args:
	        data: Hierarchy document with a top-level ``clusters`` list.
	        path: List of ``cluster_id`` values leading from the top level down.

	    Returns:
	        A list of ``(item, item_path)`` tuples. For an empty ``path`` these are
	        the top-level clusters with an empty path. When the target level holds
	        papers, each paper is paired with ``path``; otherwise each sub-cluster
	        is paired with ``path`` extended by its own ``cluster_id``. An empty
	        list is returned when a path segment is not found or has no children.
	    """
	    if not data or "clusters" not in data:
	        return []

	    level = data["clusters"]
	    if not path:
	        return [(cluster, []) for cluster in level]

	    # Descend one level per path segment.
	    for segment in path:
	        match = next(
	            (node for node in level if node.get("cluster_id") == segment),
	            None,
	        )
	        if match is None:
	            # Path segment not found.
	            return []
	        if "children" not in match or not match["children"]:
	            return []
	        level = match["children"]

	    # `level` now holds the children of the last cluster on the path.
	    head = level[0]
	    if isinstance(head, dict) and "paper_id" in head:
	        return [(paper, path) for paper in level]

	    return [
	        (child, path + [child.get("cluster_id")])
	        for child in level
	        if isinstance(child, dict) and child.get("cluster_id") is not None
	    ]

	def parse_json_abstract(abstract_text):
	    """Turn a JSON-formatted abstract into an HTML "Problem" section.

	    Args:
	        abstract_text: Abstract string, possibly a JSON object with a
	            ``"Problem"`` object inside.

	    Returns:
	        An HTML snippet showing the problem domain, challenges and goal
	        (``'N/A'`` for missing fields) when the text is a JSON object whose
	        ``"Problem"`` value is itself an object; otherwise ``abstract_text``
	        unchanged. Field values are inserted into the HTML without escaping.
	    """
	    try:
	        abstract_json = json.loads(abstract_text)
	    except (json.JSONDecodeError, ValueError, TypeError):
	        # Not valid JSON: show the original text.
	        return abstract_text

	    # Valid JSON of another shape (list, string, number, or a "Problem" value
	    # that is not an object) cannot be formatted, so fall back to the raw text.
	    problem = abstract_json.get("Problem") if isinstance(abstract_json, dict) else None
	    if not isinstance(problem, dict):
	        return abstract_text

	    return f"""
	<div class='section-problem paper-section'>
	<div class='paper-section-title'>Problem</div>
	<div class='label'>Domain:</div>
	<div class='value-box'>{problem.get('overarching problem domain', 'N/A')}</div>
	<div class='label'>Challenges:</div>
	<div class='value-box'>{problem.get('challenges/difficulties', 'N/A')}</div>
	<div class='label'>Goal:</div>
	<div class='value-box'>{problem.get('research question/goal', 'N/A')}</div>
	</div>
	"""

	def display_path_details(path, data, level_count):
	    """Render the chain of clusters along the current navigation path.

	    For every cluster on ``path`` this shows its id and title, a
	    ``Level N`` button and a collapsed expander with its abstract. Clicking
	    a level button truncates the path to the entries above that cluster,
	    rewrites the query parameters and reruns the app.

	    Args:
	        path: List of ``cluster_id`` values from the top level downwards.
	        data: Hierarchy document with a top-level ``clusters`` list.
	        level_count: Not used by this function.
	    """
	    if not path:
	        return

	    st.markdown("### Path Details")

	    current = data["clusters"]

	    for i, cluster_id in enumerate(path):
	        level_number = i + 1  # levels are numbered from 1; the top level is Level 1
	        indent = i * 32  # indent 32 pixels per level

	        for c in current:
	            if c["cluster_id"] != cluster_id:
	                continue

	            # Container with the indentation for this level.
	            st.markdown(f"""
	<div style='margin-left: {indent}px; margin-bottom: 10px;'>
	</div>
	""", unsafe_allow_html=True)

	            # Extra spacing at the bottom.
	            st.markdown("<div style='margin-bottom: 25px;'></div>", unsafe_allow_html=True)

	            # Row with the cluster name and the level button.
	            col1, col2 = st.columns([0.85, 0.15])

	            with col1:
	                st.markdown(f"""
	<div style='display: flex; align-items: center;'>
	<div style='width: 12px; height: 12px;
	border-radius: 50%; background: #3B82F6;
	margin-right: 8px;'></div>
	<h4 style='font-size: 1.15rem; font-weight: 600;
	color: #1F2937; margin: 0;'>
	Cluster {c["cluster_id"]}: {c["title"]}
	</h4>
	</div>
	""", unsafe_allow_html=True)

	            with col2:
	                button_clicked = st.button(f'Level {level_number}', key=f'level_btn_{i}_{c["cluster_id"]}')

	            if button_clicked:
	                st.session_state.path = path[:i]
	                new_params = {'hierarchy': st.query_params['hierarchy']}
	                if st.session_state.path:
	                    new_params['path'] = st.session_state.path
	                st.query_params.clear()
	                for key, value in new_params.items():
	                    # Assign list values in one step: setting the same key once
	                    # per element would overwrite it and keep only the last one.
	                    st.query_params[key] = value
	                st.rerun()

	            # Extra container carrying the margin that aligns the expander
	            # with the header.
	            with st.container():
	                st.markdown(f"""
	<div style='margin-left: {indent}px; width: calc(100% - {indent}px);'>
	</div>
	""", unsafe_allow_html=True)

	                with st.expander("📄 Show Cluster Details", expanded=False):
	                    # Parse the abstract if it is in JSON format.
	                    abstract_content = parse_json_abstract(c["abstract"])
	                    st.markdown(f"""
	<div style='color: #374151; line-height: 1.6;'>
	{abstract_content}
	</div>
	""", unsafe_allow_html=True)

	            # Descend into the matched cluster for the next path segment.
	            current = c["children"]
	            break

	def _render_detail_section(heading, accent, rows):
	    """Render one titled detail section with labels left and values right.

	    Args:
	        heading: Section title text.
	        accent: CSS color of the line under the title.
	        rows: Sequence of ``(label, value, border_color)`` tuples, one per
	            field, rendered top to bottom.
	    """
	    st.markdown(f"""
	<div style='margin-top: 25px; margin-bottom: 20px;'>
	<h4 style='color: #2C3E50; border-bottom: 2px solid {accent}; padding-bottom: 8px;'>
	{heading}
	</h4>
	</div>
	""", unsafe_allow_html=True)

	    label_col, value_col = st.columns([1, 2])

	    with label_col:
	        for index, (label, _value, _color) in enumerate(rows):
	            # Every label after the first gets extra space above it.
	            top_margin = "" if index == 0 else "margin-top: 15px; "
	            st.markdown(f"""
	<div style='font-weight: 600; color: #34495E; {top_margin}margin-bottom: 5px;'>
	{label}
	</div>
	""", unsafe_allow_html=True)

	    with value_col:
	        for index, (_label, value, color) in enumerate(rows):
	            # Every value box after the first gets extra space above it.
	            top_margin = "" if index == 0 else " margin-top: 10px;"
	            st.markdown(f"""
	<div style='background: #F8F9FA; padding: 10px; border-radius: 5px;
	border-left: 4px solid {color};{top_margin}'>
	{value}
	</div>
	""", unsafe_allow_html=True)


	def display_paper(item):
	    """Render one paper: a header card plus a collapsed details expander.

	    The header shows the title, a link to the paper's Semantic Scholar URL
	    and the citation / influential-citation counts. The expander shows the
	    abstract and, when present, the problem, solution and results fields
	    and the author list.

	    Args:
	        item: Paper dict. Reads ``title``, ``abstract``, ``problem``,
	            ``solution``, ``results`` and ``semantic_scholar`` (``url``,
	            ``citationCount``, ``influentialCitationCount``, ``authors``).
	            A missing or null ``semantic_scholar`` entry, URL or count is
	            shown as an empty link / 0.
	    """
	    semantic_scholar = item.get('semantic_scholar', {}) or {}
	    # `or` fallbacks: keys present with a null value must not render as "None".
	    url = semantic_scholar.get('url') or ''
	    citation_count = semantic_scholar.get('citationCount') or 0
	    influential_citation_count = semantic_scholar.get('influentialCitationCount') or 0

	    # Basic information with URL link and citation counts - always visible.
	    st.markdown(f"""
	<div class='paper-card'>
	<div style='display: flex; justify-content: space-between; align-items: flex-start;'>
	<div class='paper-title' style='flex-grow: 1;'>
	{item.get('title', 'Untitled Paper')}
	<a href="{url}" target="_blank"
	style='font-size: 0.9em; margin-left: 8px;
	color: #3498DB; text-decoration: none;
	transition: all 0.3s;'
	title='View paper on Semantic Scholar'>
	🔗
	</a>
	</div>
	<div style='display: flex; align-items: center; gap: 12px;'>
	<div class='citation-badge' title='Number of times this paper has been cited by other papers.'>
	<span class='citation-icon'>⭐</span> Citations: {citation_count}
	</div>
	<div class='influential-badge' title='Number of times this paper has been cited by influential papers. Influential citation means that the cited publication has a significant impact on the citing publication.'>
	<span class='influential-icon'>🔥</span> Influential Citations: {influential_citation_count}
	</div>
	</div>
	</div>
	""", unsafe_allow_html=True)

	    # One main expander for all detailed information - collapsed by default.
	    with st.expander("📑 Show Detailed Information", expanded=False):
	        # Abstract section
	        st.markdown("""
	<div style='margin-top: 15px; margin-bottom: 20px;'>
	<h4 style='color: #2C3E50; border-bottom: 2px solid #3498DB; padding-bottom: 8px;'>
	📄 Abstract
	</h4>
	</div>
	""", unsafe_allow_html=True)

	        abstract_text = item.get('abstract', 'No abstract available')
	        st.markdown(f"<div class='paper-abstract'>{abstract_text}</div>", unsafe_allow_html=True)

	        # Problem section
	        problem = item.get('problem')
	        if problem:
	            _render_detail_section("🔍 Problem Details", "#3498DB", [
	                ("Problem Domain", problem.get('overarching problem domain', 'Not specified'), "#3498DB"),
	                ("Challenges/Difficulties", problem.get('challenges/difficulties', 'Not specified'), "#E74C3C"),
	                ("Research Question/Goal", problem.get('research question/goal', 'Not specified'), "#2ECC71"),
	            ])

	        # Solution section
	        solution = item.get('solution')
	        if solution:
	            _render_detail_section("💡 Solution Details", "#2ECC71", [
	                ("Solution Domain", solution.get('overarching solution domain', 'Not specified'), "#3498DB"),
	                ("Solution Approach", solution.get('solution approach', 'Not specified'), "#9B59B6"),
	                ("Novelty of Solution", solution.get('novelty of the solution', 'Not specified'), "#F1C40F"),
	            ])

	        # Results section
	        results = item.get('results')
	        if results:
	            _render_detail_section("📊 Results Details", "#9B59B6", [
	                ("Findings/Results", results.get('findings/results', 'Not specified'), "#3498DB"),
	                ("Potential Impact", results.get('potential impact of the results', 'Not specified'), "#E67E22"),
	            ])

	        # Author information
	        authors = semantic_scholar.get('authors') or []
	        if authors:
	            st.markdown("""
	<div style='margin-top: 25px; margin-bottom: 20px;'>
	<h4 style='color: #2C3E50; border-bottom: 2px solid #E67E22; padding-bottom: 8px;'>
	👥 Authors
	</h4>
	</div>
	""", unsafe_allow_html=True)

	            for author in authors:
	                if not isinstance(author, dict):
	                    continue

	                st.markdown(f"""
	<div style='display: flex; margin-bottom: 15px; padding-bottom: 10px; border-bottom: 1px solid #eee;'>
	<div style='flex: 1;'>
	<div style='font-weight: 600; font-size: 1.05rem;'>{author.get('name', 'Unknown')}</div>
	<div style='color: #666; margin-top: 3px;'>Author ID: {author.get('authorId', 'N/A')}</div>
	</div>
	<div style='display: flex; gap: 15px;'>
	<div title='Papers'>
	<span style='font-size: 0.85rem; color: #666;'>Papers</span>
	<div style='font-weight: 600; color: #3498DB;'>{author.get('paperCount', 0)}</div>
	</div>
	<div title='Citations'>
	<span style='font-size: 0.85rem; color: #666;'>Citations</span>
	<div style='font-weight: 600; color: #3498DB;'>{author.get('citationCount', 0)}</div>
	</div>
	<div title='h-index'>
	<span style='font-size: 0.85rem; color: #666;'>h-index</span>
	<div style='font-weight: 600; color: #3498DB;'>{author.get('hIndex', 0)}</div>
	</div>
	</div>
	</div>
	""", unsafe_allow_html=True)

	    # Close paper-card div.
	    # NOTE(review): each st.markdown call is presumably rendered as its own
	    # element, so this tag may not actually close the div opened in the
	    # header call above - confirm before relying on the card wrapping the
	    # expander.
	    st.markdown("</div>", unsafe_allow_html=True)

	def display_cluster(item, path):
	    """Render one cluster row: title, citation statistics, summary and button.

	    The row shows the cluster title with its paper count, total / average /
	    maximum citation and influential-citation figures, a collapsed
	    "Cluster Summary" expander with the abstract, and - when the cluster has
	    children - a button that appends the cluster id to
	    ``st.session_state.path`` and reruns the app.

	    Args:
	        item: Cluster dict with ``cluster_id``, ``title``, ``abstract`` and
	            optionally ``children``.
	        path: Path of cluster ids leading to this cluster; used only to build
	            a unique widget key.
	    """
	    # Unique key for the button of this cluster.
	    cluster_id = item['cluster_id']
	    unique_id = f"{cluster_id}_{'-'.join(map(str, path))}"

	    citation_metrics = calculate_citation_metrics(item)
	    abstract_content = parse_json_abstract(item['abstract'])

	    # Button text depends on whether the next level holds papers or sub-clusters.
	    children = item.get("children")
	    has_children = bool(children)
	    if has_children:
	        if "paper_id" in children[0]:
	            btn_text = f"View Papers ({citation_metrics['paper_count']})"
	        else:
	            btn_text = f"View Sub-clusters ({len(children)})"

	    # Title and paper count on one horizontal line.
	    st.markdown(f"""
	<div style='display: flex; align-items: center;'>
	<div class='cluster-title' style='margin: 0; font-weight: 700; font-size: 1.3rem;'>
	{item['title']}
	</div>
	<div style='display: inline-flex; align-items: center; margin-left: 12px;
	background: #F4F6F9; color: #566573; padding: 2px 10px;
	border-radius: 6px; font-size: 0.95rem; font-weight: 500;'>
	<span style='margin-right: 4px;'>📑</span>{citation_metrics['paper_count']} papers
	</div>
	</div>
	""", unsafe_allow_html=True)

	    # Two-column layout: statistics on the left, button on the right.
	    cols = st.columns([8, 2])

	    with cols[0]:
	        # Citation statistics, separated by pipe symbols. A plain "|" is used:
	        # "\|" is not a valid string escape and would emit a literal backslash.
	        st.markdown(f"""
	<div>
	<div class='citation-stats'>
	<span style='font-weight: bold; margin-right: 5px;'>⭐</span> Citations:
	Total {citation_metrics['total_citations']} <span class='stats-divider'>|</span>
	Avg {citation_metrics['avg_citations']} <span class='stats-divider'>|</span>
	Max {citation_metrics['max_citations']}
	</div>
	<div class='influential-stats'>
	<span style='font-weight: bold; margin-right: 5px;'>🔥</span> Influential Citations:
	Total {citation_metrics['total_influential_citations']} <span class='stats-divider'>|</span>
	Avg {citation_metrics['avg_influential_citations']} <span class='stats-divider'>|</span>
	Max {citation_metrics['max_influential_citations']}
	</div>
	</div>
	""", unsafe_allow_html=True)

	        # Summary expander.
	        with st.expander("📄 Cluster Summary", expanded=False):
	            st.markdown(f"""
	<div class='cluster-abstract'>{abstract_content}</div>
	""", unsafe_allow_html=True)

	    with cols[1]:
	        # Show the view button only when there are sub-clusters or papers.
	        if has_children:
	            if st.button(btn_text, key=f"btn_{unique_id}"):
	                st.session_state.path.append(item['cluster_id'])
	                st.rerun()

	    # Separator line.
	    st.markdown("<hr style='margin: 0.5rem 0; border-color: #eee;'>", unsafe_allow_html=True)

	def sort_items_by_citations(items, sort_by="citation_count", reverse=True):
	    """
	    Sort papers or clusters by citation metrics.

	    Papers (items containing ``paper_id``) are ranked by their own
	    ``semantic_scholar`` counts; for a single paper the "average" options
	    equal the corresponding plain count. Clusters are ranked by the
	    aggregate returned from ``calculate_citation_metrics``. Missing or null
	    counts are treated as 0, and an unknown ``sort_by`` falls back to the
	    citation count / total citations.

	    Args:
	        items: List of (item, path) tuples
	        sort_by: "citation_count", "influential_citation_count",
	            "avg_citations", "avg_influential_citations"
	        reverse: True for descending (highest first), False for ascending

	    Returns:
	        Sorted list of (item, path) tuples; the original list when sorting
	        fails.
	    """
	    # Per-paper field for each sort option.
	    paper_fields = {
	        "citation_count": "citationCount",
	        "avg_citations": "citationCount",
	        "influential_citation_count": "influentialCitationCount",
	        "avg_influential_citations": "influentialCitationCount",
	    }
	    # Aggregate metric for each sort option when the item is a cluster.
	    cluster_metrics = {
	        "citation_count": "total_citations",
	        "influential_citation_count": "total_influential_citations",
	        "avg_citations": "avg_citations",
	        "avg_influential_citations": "avg_influential_citations",
	    }

	    def get_sort_key(item_tuple):
	        item, _path = item_tuple

	        if "paper_id" in item:
	            # `or` fallbacks: a null entry or count must not break the sort.
	            semantic_scholar = item.get('semantic_scholar') or {}
	            field = paper_fields.get(sort_by, "citationCount")
	            return semantic_scholar.get(field) or 0

	        metrics = calculate_citation_metrics(item)
	        return metrics[cluster_metrics.get(sort_by, "total_citations")]

	    try:
	        return sorted(items, key=get_sort_key, reverse=reverse)
	    except Exception as e:
	        # Best effort: if sorting fails, keep the original order.
	        print(f"Sorting failed: {e}")
	        return items

	def main():
	    """
	    Render the Paper Clusters Explorer page.

	    Steps, in order:
	      1. Configure the page and inject CUSTOM_CSS.
	      2. Let the user pick a hierarchy file (pre-selected from the
	         'hierarchy' query parameter when it names a known file).
	      3. Show the hierarchy metadata parsed from the file name.
	      4. Resolve the navigation path kept in st.session_state.path and,
	         for cluster levels, show cluster statistics.
	      5. Sort the current items by the selected citation metric.
	      6. Draw the back button, the clickable path navigation and finally
	         the papers or clusters of the current level.
	    """
	    st.set_page_config(
	        layout="wide",
	        page_title="Paper Clusters Explorer",
	        initial_sidebar_state="expanded",
	        menu_items=None
	    )
	    # Force the light theme.
	    # NOTE(review): <script> tags passed through st.markdown may not be
	    # executed by the browser - confirm this snippet has any effect.
	    st.markdown("""
	    <script>
	    var elements = window.parent.document.querySelectorAll('.stApp');
	    elements[0].classList.add('light');
	    elements[0].classList.remove('dark');
	    </script>
	    """, unsafe_allow_html=True)
	    st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

	    hierarchy_files = get_hierarchy_files()
	    if not hierarchy_files:
	        st.error("No hierarchy files found in /hierarchies directory")
	        return

	    # Manage file selection via query params
	    current_url = st.query_params.get('hierarchy', None)
	    current_file = unquote(current_url) + '.json' if current_url else None

	    hierarchy_options = {format_hierarchy_option(f): f for f in hierarchy_files}
	    option_files = list(hierarchy_options.values())
	    # The query parameter comes from the URL and may name a file that is not
	    # in the list; fall back to the first option instead of letting
	    # list.index() raise ValueError.
	    default_index = option_files.index(current_file) if current_file in option_files else 0
	    selected_option = st.selectbox(
	        'Select Hierarchy',
	        options=list(hierarchy_options.keys()),
	        index=default_index
	    )
	    selected_file = hierarchy_options[selected_option]

	    # Save selected file in query params
	    if selected_file != current_file:
	        st.query_params['hierarchy'] = quote(selected_file.replace('.json', ''))

	    data = load_hierarchy_data(selected_file)
	    info = parse_filename(selected_file)

	    # Hierarchy metadata and navigation state
	    with st.expander("📋 Hierarchy Metadata", expanded=False):
	        # Create a grid layout for metadata
	        col1, col2, col3 = st.columns(3)

	        with col1:
	            st.markdown(f"""
	            <div class='metric-card'>
	            <h4 style='margin-top: 0; color: #2C3E50; font-size: 0.9rem;'>Date</h4>
	            <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB;'>{info['date']}</p>
	            </div>

	            <div class='metric-card' style='margin-top: 10px;'>
	            <h4 style='margin-top: 0; color: #2C3E50; font-size: 0.9rem;'>Clustering Method</h4>
	            <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB;'>{info['clustermethod']}</p>
	            </div>
	            """, unsafe_allow_html=True)

	        with col2:
	            st.markdown(f"""
	            <div class='metric-card'>
	            <h4 style='margin-top: 0; color: #2C3E50; font-size: 0.9rem;'>Embedder / Summarizer</h4>
	            <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB;'>{info['embedder']} / {info['summarizer']}</p>
	            </div>

	            <div class='metric-card' style='margin-top: 10px;'>
	            <h4 style='margin-top: 0; color: #2C3E50; font-size: 0.9rem;'>Contribution Type</h4>
	            <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB;'>{info['contribution_type']}</p>
	            </div>
	            """, unsafe_allow_html=True)

	        with col3:
	            st.markdown(f"""
	            <div class='metric-card'>
	            <h4 style='margin-top: 0; color: #2C3E50; font-size: 0.9rem;'>Building Method</h4>
	            <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB;'>{info['building_method']}</p>
	            </div>

	            <div class='metric-card' style='margin-top: 10px;'>
	            <h4 style='margin-top: 0; color: #2C3E50; font-size: 0.9rem;'>Cluster Levels</h4>
	            <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB;'>{info['clusterlevel']} (Total: {info['level_count']})</p>
	            </div>
	            """, unsafe_allow_html=True)

	    # Seed the navigation path from the 'path' query parameters on first run
	    if 'path' not in st.session_state:
	        path_params = st.query_params.get_all('path')
	        st.session_state.path = [p for p in path_params if p]

	    current_clusters = find_clusters_in_path(data, st.session_state.path)
	    current_level = len(st.session_state.path)
	    total_levels = info['level_count']
	    level_name = f'Level {current_level + 1}' if current_level < total_levels else 'Papers'

	    # Paper level: either we are past the last cluster level, or the first
	    # item at this level is itself a paper (has a "paper_id" key).
	    is_paper_level = bool(
	        current_level >= total_levels
	        or (current_clusters and "paper_id" in current_clusters[0][0])
	    )

	    if not is_paper_level and current_clusters:
	        with st.expander("📊 Cluster Statistics", expanded=False):
	            stats = get_cluster_statistics(current_clusters)

	            # Two rows of three columns for the six small metric cards
	            row1_col1, row1_col2, row1_col3 = st.columns(3)
	            row2_col1, row2_col2, row2_col3 = st.columns(3)

	            # Row 1 - First 3 metrics
	            with row1_col1:
	                st.markdown(f"""
	                <div class='metric-card' style='padding: 0.8rem;'>
	                <h4 style='margin-top: 0; margin-bottom: 5px; color: #2C3E50; font-size: 0.85rem;'>Total Clusters</h4>
	                <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB; margin: 0;'>{stats['Total Clusters']['value']}</p>
	                </div>
	                """, unsafe_allow_html=True)

	            with row1_col2:
	                st.markdown(f"""
	                <div class='metric-card' style='padding: 0.8rem;'>
	                <h4 style='margin-top: 0; margin-bottom: 5px; color: #2C3E50; font-size: 0.85rem;'>Total Papers</h4>
	                <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB; margin: 0;'>{stats['Total Papers']['value']}</p>
	                </div>
	                """, unsafe_allow_html=True)

	            with row1_col3:
	                st.markdown(f"""
	                <div class='metric-card' style='padding: 0.8rem;'>
	                <h4 style='margin-top: 0; margin-bottom: 5px; color: #2C3E50; font-size: 0.85rem;'>Avg Papers/Cluster</h4>
	                <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB; margin: 0;'>{stats['Average Papers per Cluster']['value']}</p>
	                </div>
	                """, unsafe_allow_html=True)

	            # Row 2 - Next 3 metrics
	            with row2_col1:
	                st.markdown(f"""
	                <div class='metric-card' style='padding: 0.8rem; margin-bottom: 15px;'>
	                <h4 style='margin-top: 0; margin-bottom: 5px; color: #2C3E50; font-size: 0.85rem;'>Median Papers</h4>
	                <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB; margin: 0;'>{stats['Median Papers']['value']}</p>
	                </div>
	                """, unsafe_allow_html=True)

	            with row2_col2:
	                st.markdown(f"""
	                <div class='metric-card' style='padding: 0.8rem; margin-bottom: 15px;'>
	                <h4 style='margin-top: 0; margin-bottom: 5px; color: #2C3E50; font-size: 0.85rem;'>Max Papers in Cluster</h4>
	                <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB; margin: 0;'>{stats['Max Papers in Cluster']['value']}</p>
	                </div>
	                """, unsafe_allow_html=True)

	            with row2_col3:
	                st.markdown(f"""
	                <div class='metric-card' style='padding: 0.8rem; margin-bottom: 15px;'>
	                <h4 style='margin-top: 0; margin-bottom: 5px; color: #2C3E50; font-size: 0.85rem;'>Min Papers in Cluster</h4>
	                <p style='font-size: 0.9rem; font-weight: 600; color: #3498DB; margin: 0;'>{stats['Min Papers in Cluster']['value']}</p>
	                </div>
	                """, unsafe_allow_html=True)

	    # Add simple sorting options if we have items to display
	    if current_clusters:
	        # Simplified sort options - only Total Citations and Total Influential Citations
	        sort_options = {
	            "Total Citations": "citation_count",
	            "Total Influential Citations": "influential_citation_count"
	        }

	        selected_sort = st.selectbox(
	            "📊 Sort by:",
	            options=list(sort_options.keys()),
	            index=0,
	            key="sort_selector"
	        )

	        # Apply sorting - always use descending order (highest first)
	        sort_key = sort_options[selected_sort]
	        current_clusters = sort_items_by_citations(current_clusters, sort_key, reverse=True)

	    # Back navigation button
	    if st.session_state.path:
	        if st.button('← Back', key='back_button'):
	            st.session_state.path.pop()
	            st.rerun()

	    # Current path display
	    if st.session_state.path:
	        # Collect (level, title, cluster_id) for every cluster on the path
	        path_info = []
	        current = data["clusters"]

	        # Walk down the hierarchy one path id at a time
	        for i, cid in enumerate(st.session_state.path):
	            level_num = i + 1  # 1-based level number
	            for c in current:
	                if c["cluster_id"] == cid:
	                    path_info.append((level_num, c["title"], c["cluster_id"]))
	                    current = c["children"]
	                    break
	            else:
	                # Unknown id at this level: stop here so path_info stays a
	                # prefix of st.session_state.path and the button index below
	                # still maps to the right path position.
	                break

	        # Build the path navigation in Streamlit
	        with st.container():
	            st.markdown("<h3 style='margin-top: 0.5rem; margin-bottom: 0.8rem;'>🗂️ Current Path</h3>", unsafe_allow_html=True)

	            # Root entry: jumps back to the top of the hierarchy
	            col1, col2 = st.columns([0.3, 0.7])
	            with col1:
	                st.markdown("<div><strong>Root:</strong></div>", unsafe_allow_html=True)
	            with col2:
	                if st.button("All Papers", key="root_button"):
	                    st.session_state.path = []
	                    st.rerun()

	            # One row per level on the path
	            for i, (level_num, title, cluster_id) in enumerate(path_info):
	                col1, col2 = st.columns([0.3, 0.7])

	                with col1:
	                    st.markdown(f"<div><strong>Level {level_num}:</strong></div>", unsafe_allow_html=True)

	                with col2:
	                    # Button that navigates back to this level
	                    if st.button(f"{title}", key=f"lvl_{i}_{cluster_id}"):
	                        # Truncate the path so it ends at the clicked level
	                        st.session_state.path = st.session_state.path[:i+1]
	                        st.rerun()

	    # Heading for the content list
	    st.markdown(f"""
	    <h3 style='margin: 1rem 0 0.5rem 0; color: #2C3E50;'>
	    {'📑 Papers' if is_paper_level else '📂 ' + level_name}
	    </h3>
	    """, unsafe_allow_html=True)

	    for item, full_path in current_clusters:
	        if is_paper_level:
	            display_paper(item)
	        else:
	            display_cluster(item, full_path)

	if __name__ == "__main__":
	    # Build the page only when this file is run as a script, not on import.
	    main()