Spaces:

juju6326
/

Steam_Market_Analysis

Sleeping

App Files Files Community

Steam_Market_Analysis / utils.py

juju6326

Update utils.py

78f78b3 verified 25 days ago

raw

history blame contribute delete

19.4 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import plotly.express as px
	import matplotlib.pyplot as plt
	from sklearn.manifold import MDS
	from matplotlib import font_manager
	from wordcloud import WordCloud
	import os
	import random

	from config import chinese_name_map, FIXED_GENRE_COLORS, genres_map
	# ==========================================================================
	def categorize_business_model(row):
	    """Label a game record with the business model(s) its metadata matches.

	    Rules are evaluated by case-insensitive substring search on the game name,
	    plus the ``categories`` and ``genres`` fields when they are present.

	    Args:
	        row: Mapping-like record (dict or pandas Series) with a ``name`` key and
	            optional ``categories`` / ``genres`` keys.

	    Returns:
	        The matched labels, de-duplicated and in rule order, joined by the
	        two-character separator backslash + pipe; or
	        ``'真正免費 (Experiment/Legacy)'`` when no rule matches.
	    """
	    name = str(row['name']).lower()
	    categories = str(row.get('categories', '')).lower()
	    genres = str(row.get('genres', '')).lower()

	    def name_has(keywords):
	        # `name` is lowercased, so keywords are lowercased too; a mixed-case
	        # entry would otherwise never match.
	        return any(keyword.lower() in name for keyword in keywords)

	    models = []

	    # --- A. Fix-up: titles recorded with price 0 that are really paid releases ---
	    premium_fix = [
	        'football manager', 'thunder tier one', 'atelier ryza 2',
	        'little nightmares', 'atelier ryza 3', 'battlefield 2042',
	        'nobody',
	    ]
	    if name_has(premium_fix):
	        models.append('資料集標記異常/可能為限時免費')

	    # --- B. Big-IP portal / version update ---
	    # The base game (launcher) is free, content needs a subscription or DLC
	    # (e.g. COD, Halo).
	    ip_keywords = [
	        'command & conquer', 'total war', 'warhammer', 'ea sports',
	        'efootball', 'samurai shodown', 'battlefield', 'call of duty',
	        'final fantasy', 'game of thrones', 'halo',
	    ]
	    if name_has(ip_keywords):
	        models.append('大型 IP 入口/版本更新 (IP Portal)')

	    # --- C. Episodic / prologue / pass ---
	    # First chapter or a friend pass is free to drive full-game purchases.
	    prologue_keywords = [
	        'prologue', 'episode', 'demo', 'nexus', 'chapter',
	        "friend's pass", 'trial', 'poppy',
	    ]
	    if name_has(prologue_keywords):
	        models.append('章節/試玩 (Episodic/Prologue)')

	    # --- D. Live-service F2P with in-app purchases ---
	    # Either tagged with in-app purchases / MMO, or a known F2P title.
	    f2p_iap_names = [
	        'naraka', 'fragpunk', 'stormgate', 'fear surrounds',
	        'zenith', 'hired ops', 'xera', 'vail', 'broke protocol',
	    ]
	    # Verified genuinely free online games share the same label.
	    f2p_verified = ['world war 3', 'quake champions', 'lost ark', 'stumble guys']
	    if ('in-app purchases' in categories
	            or 'massively multiplayer' in genres
	            or name_has(f2p_iap_names)
	            or name_has(f2p_verified)):
	        models.append('線上營運/內購制 (Service-based F2P)')

	    special = ['knightfall']
	    if name_has(special):
	        models.append('2024年開放永久免費')

	    if not models:
	        return '真正免費 (Experiment/Legacy)'

	    # dict.fromkeys de-duplicates while keeping rule order, so the same game
	    # always yields the same string (set() ordering varies between runs).
	    unique_models = list(dict.fromkeys(models))

	    # The separator is backslash + pipe with no surrounding spaces; the backslash
	    # is written explicitly because "\|" is an invalid escape sequence.
	    # NOTE(review): the backslash may be an extraction artifact of a plain "|" —
	    # confirm against the code that splits this value.
	    return "\\|".join(unique_models)
	# Convert an RGB triple to a HEX colour string
	def rgb_to_hex(rgb):
	    """Return ``'#rrggbb'`` for an RGB triple; each channel is scaled by 255 and truncated."""
	    channels = []
	    for component in rgb:
	        channels.append(int(component * 255))
	    return '#' + '%02x%02x%02x' % tuple(channels)

	def get_genre_color(genre_key):
	    """Return the HEX colour for ``genre_key``, or grey ``'#BDBDBD'`` when it has no fixed colour."""
	    fallback_hex = '#BDBDBD'  # grey fallback
	    if genre_key not in FIXED_GENRE_COLORS:
	        return fallback_hex
	    return rgb_to_hex(FIXED_GENRE_COLORS[genre_key])

	def monthly_stats(df):
	    """Summarise ``recommendations`` per ``release_month``.

	    Returns one row per month with count, mean, median and std (rounded to two
	    decimals), plus ``mean_max`` / ``mean_min`` columns that repeat the largest
	    and smallest monthly mean on every row.
	    """
	    per_month = df.groupby('release_month')['recommendations']
	    summary = per_month.agg(['count', 'mean', 'median', 'std'])
	    summary = summary.reset_index().round(2)

	    monthly_mean = summary['mean']
	    summary['mean_max'] = monthly_mean.max()
	    summary['mean_min'] = monthly_mean.min()
	    return summary

	def year_stats(df):
	    """Summarise releases per ``release_year``.

	    Returns one row per year with the number of rows, mean and median price,
	    mean recommendations, and ``growth_rate``: the percentage change of the
	    row count versus the previous year (NaN for the first year).
	    """
	    aggregations = {
	        'count': ('price', 'size'),
	        'avg_price': ('price', 'mean'),
	        'median_price': ('price', 'median'),
	        'avg_recs': ('recommendations', 'mean'),
	    }
	    yearly = df.groupby('release_year').agg(**aggregations)
	    yearly = yearly.reset_index()
	    yearly['growth_rate'] = yearly['count'].pct_change().mul(100)
	    return yearly

	def top_entities(df, entity_col):
	    """Return the ten entities in ``entity_col`` with the most rows.

	    The result is indexed by entity and carries ``game_count``, ``total_rec``,
	    ``avg_rec`` and a ``知名遊戲`` column holding each entity's three
	    most-recommended titles (translated through ``chinese_name_map`` when a
	    translation exists) joined with ``<br>``.
	    """
	    aggregations = {
	        'game_count': ('recommendations', 'size'),
	        'total_rec': ('recommendations', 'sum'),
	        'avg_rec': ('recommendations', 'mean'),
	    }
	    ranked = df.groupby(entity_col).agg(**aggregations)
	    ranked = ranked.sort_values('game_count', ascending=False).head(10)

	    def _headline_titles(entity):
	        # All games of this entity, best three by recommendation count.
	        owned = df[df[entity_col] == entity]
	        best = owned.nlargest(3, 'recommendations')['name'].tolist()
	        return "<br>".join([chinese_name_map.get(title, title) for title in best])

	    ranked['知名遊戲'] = [_headline_titles(entity) for entity in ranked.index]
	    return ranked

	# Like top_entities, but only entities with at least 3 games are considered
	def top_entities_advanced(df, entity_col):
	    """Return the ten entities with the highest total recommendations.

	    Only entities with three or more distinct ``appid`` values are kept. The
	    result is indexed by entity and carries ``game_count``, ``total_rec``,
	    ``avg_rec``, ``avg_price``, ``avg_discount`` and a ``知名遊戲`` column with
	    each entity's three most-recommended titles (translated through
	    ``chinese_name_map`` when a translation exists).
	    """
	    # 1. Base statistics for every entity
	    entity_stats = df.groupby(entity_col).agg(
	        game_count=('appid', 'nunique'),  # count distinct games by appid
	        total_rec=('recommendations', 'sum'),
	        avg_rec=('recommendations', 'mean'),
	        avg_price=('price', 'mean'),
	        avg_discount=('discount(%)', 'mean'),
	    )
	    # 2. Key filter: keep entities that released 3 or more games
	    entity_stats = entity_stats[entity_stats['game_count'] >= 3]
	    entity_stats = entity_stats.sort_values('total_rec', ascending=False).head(10)

	    top_games_list = []
	    for entity in entity_stats.index:
	        # Look the entity's games up in the original frame
	        subset = df[df[entity_col] == entity]
	        top_3 = subset.nlargest(3, 'recommendations')['name'].tolist()
	        # Fall back to the original name when no translation is mapped
	        top_3_cn = [chinese_name_map.get(name, name) for name in top_3]
	        # The backslash is written explicitly: " \| " is an invalid escape
	        # sequence, the emitted text (space, backslash, pipe, space) is unchanged.
	        # NOTE(review): the backslash may be an extraction artifact of " | " —
	        # confirm against how this column is displayed.
	        top_games_list.append(" \\| ".join(top_3_cn))

	    entity_stats['知名遊戲'] = top_games_list
	    return entity_stats

	def plot_top_entities(stats_df, title, global_avg):
	    """Build a bubble chart of game count versus average recommendations.

	    The index of ``stats_df`` is moved into the first column and used for the
	    point colour and hover title. A dashed red line marks ``global_avg``.
	    Returns the Plotly figure.
	    """
	    frame = stats_df.reset_index()
	    entity_name = frame.columns[0]
	    available = frame.columns

	    # Hover fields: two that are always shown, then optional ones that are
	    # added only when the column exists (avoids a KeyError).
	    hover_cols = {'game_count': True, 'avg_rec': ':,.0f'}
	    optional_formats = (
	        ('知名遊戲', True),
	        ('avg_price', ':.2f'),
	        ('avg_discount', ':.1%'),
	    )
	    hover_cols.update(
	        {column: fmt for column, fmt in optional_formats if column in available}
	    )

	    bubble_size = None
	    if "total_rec" in available:
	        bubble_size = "total_rec"

	    fig = px.scatter(
	        frame,
	        x="game_count",
	        y="avg_rec",
	        size=bubble_size,
	        color=entity_name,
	        hover_name=entity_name,
	        hover_data=hover_cols,
	        title=title,
	        labels={"game_count": "發行遊戲量", "avg_rec": "平均推薦數"},
	        template="plotly_white",
	    )

	    # Reference line for the market-wide average
	    fig.add_hline(
	        y=global_avg,
	        line_dash="dash",
	        line_color="red",
	        annotation_text="市場平均推薦數",
	        annotation_position="bottom right",
	    )
	    fig.update_layout(showlegend=False)
	    return fig

	def plot_entity_price_strategy(df_paid, stats_df, entity_type="developer"):
	    """Build a box plot of prices for the entities listed in ``stats_df``'s index.

	    Rows of ``df_paid`` whose ``entity_type`` column is in that index are
	    plotted, one box per entity, ordered by descending median price.
	    Returns the Plotly figure.
	    """
	    selected = stats_df.index.tolist()
	    belongs = df_paid[entity_type].isin(selected)
	    subset = df_paid[belongs].copy()

	    if entity_type == "developer":
	        title = "卓越開發商定價策略分佈"
	    else:
	        title = "卓越發行商定價策略分佈"

	    box_options = dict(
	        x=entity_type,
	        y="price",
	        color=entity_type,
	        points="all",  # draw every game so individual prices are visible
	        title=title,
	        labels={"price": "價格 (USD)", entity_type: "廠商名稱"},
	        template="plotly_white",
	    )
	    fig = px.box(subset, **box_options)
	    fig.update_layout(showlegend=False, xaxis={'categoryorder': 'median descending'})
	    return fig

	def discount_performance_full(df_active_paid):
	    """Compare undiscounted games with games grouped by discount band.

	    Rows with ``discount(%) == 0`` form a single baseline row; rows with a
	    positive discount are cut into the bands (0, 0.3], (0.3, 0.5], (0.5, 0.8]
	    and (0.8, 1.0]. Returns a frame with the baseline first, followed by one
	    row per non-empty band. The input frame is not modified.
	    """
	    discount = df_active_paid['discount(%)']
	    undiscounted = df_active_paid[discount == 0]
	    discounted = df_active_paid[discount > 0].copy()

	    # Band edges and their display labels
	    edges = [0, 0.3, 0.5, 0.8, 1.0]
	    band_names = ['0–30%', '30–50%', '50–80%', '80–100%']
	    discounted['discount_group'] = pd.cut(
	        discounted['discount(%)'],
	        bins=edges,
	        labels=band_names,
	        include_lowest=False,
	    )

	    # Per-band statistics for the discounted games
	    aggregations = {
	        '遊戲數量': ('appid', 'count'),
	        '平均折後價': ('price', 'mean'),
	        '平均原價': ('original_price', 'mean'),
	        '遊玩增長中位數': ('median_playtime_change', 'median'),
	        '遊玩增長平均值': ('median_playtime_change', 'mean'),
	        '平均推薦數': ('recommendations', 'mean'),
	    }
	    grouped = discounted.groupby('discount_group', observed=True)
	    per_band = grouped.agg(**aggregations).reset_index()

	    # Baseline row for the undiscounted games; both price columns use the
	    # mean of `price`.
	    baseline_price = undiscounted['price'].mean()
	    playtime_change = undiscounted['median_playtime_change']
	    baseline = pd.DataFrame([{
	        'discount_group': '無折扣 (基準線)',
	        '遊戲數量': len(undiscounted),
	        '平均折後價': baseline_price,
	        '平均原價': baseline_price,
	        '遊玩增長中位數': playtime_change.median(),
	        '遊玩增長平均值': playtime_change.mean(),
	        '平均推薦數': undiscounted['recommendations'].mean(),
	    }])

	    return pd.concat([baseline, per_band], ignore_index=True)

	def plot_streamlit_wordcloud(
	    df_wc,
	    mapping,
	    color_dict=FIXED_GENRE_COLORS,
	    font_path=None,
	    figsize=(12, 6),
	    bg_color='white',
	    fade_categories_alpha=0.5
	):
	    """Render a tag word cloud in Streamlit.

	    Word sizes come from the column sums of the ``genres_*`` / ``categories_*``
	    columns of ``df_wc`` that have an entry in ``mapping``.

	    Args:
	        df_wc: Frame with ``genres_*`` / ``categories_*`` columns that can be summed.
	        mapping: Dict from column name to the label shown in the cloud.
	        color_dict: Dict from column name to an RGB tuple; the fade formula
	            below assumes components in the 0-1 range.
	        font_path: Font file handed to WordCloud.
	        figsize: Matplotlib figure size.
	        bg_color: Background colour of the cloud.
	        fade_categories_alpha: Share of white blended into ``categories_*`` colours.

	    Returns:
	        None. The figure is drawn with ``st.pyplot``; a warning is shown instead
	        when no tag has a positive count.
	    """
	    mapping = mapping or {}
	    tag_cols = [
	        c for c in df_wc.columns
	        if c.startswith(("genres_", "categories_")) and c in mapping
	    ]

	    freq = {}
	    for col in tag_cols:
	        count = int(df_wc[col].sum())
	        if count > 0:
	            freq[mapping[col]] = count

	    # WordCloud cannot lay out an empty frequency table, so stop early.
	    if not freq:
	        st.warning("⚠️ 數據不足，無法生成文字雲。")
	        return

	    # Reverse lookup label -> column key, built once instead of scanning the
	    # mapping for every word. setdefault keeps the first key for a label.
	    label_to_key = {}
	    for key, label in mapping.items():
	        label_to_key.setdefault(label, key)

	    def color_func(word, **kwargs):
	        tag_key = label_to_key.get(word)
	        if tag_key is None or tag_key not in color_dict:
	            # Fallback: a random mid grey
	            level = random.randint(80, 180)
	            return '#%02x%02x%02x' % (level, level, level)

	        # Read from the color_dict argument so a caller-supplied palette is
	        # honoured (the membership test above uses the same dict).
	        red, green, blue = color_dict[tag_key][:3]

	        if tag_key.startswith("categories_"):
	            # Blend towards white: c + (1 - c) * fade
	            red = red + (1.0 - red) * fade_categories_alpha
	            green = green + (1.0 - green) * fade_categories_alpha
	            blue = blue + (1.0 - blue) * fade_categories_alpha

	        return rgb_to_hex((red, green, blue))

	    # NOTE(review): font_path is passed through unchecked; presumably a missing
	    # file makes WordCloud fail — confirm whether a fallback is wanted.
	    wc = WordCloud(
	        width=900,
	        height=450,
	        background_color=bg_color,
	        font_path=font_path,
	        collocations=False,
	        random_state=2077,
	    ).generate_from_frequencies(freq)

	    fig, ax = plt.subplots(figsize=figsize)
	    ax.imshow(wc.recolor(color_func=color_func))
	    ax.axis("off")
	    st.pyplot(fig)
	    # Release the figure so repeated reruns do not accumulate open figures.
	    plt.close(fig)

	def fade_color(color, alpha=0.5):
	    """Return an RGBA tuple made of the first three components of ``color`` and ``alpha``."""
	    red, green, blue = color[0:3]
	    return red, green, blue, alpha


	@st.cache_data(ttl=3600, show_spinner=False)
	def compute_mds_coordinates(
	    df,
	    top_n_tags,
	    rec_threshold,
	    noise_threshold=0.05,
	    random_state=42
	):
	    """Compute 2-D MDS coordinates for the most frequent tags.

	    This is the expensive step; it is wrapped in ``st.cache_data`` with a
	    one-hour TTL.

	    Args:
	        df: Frame with ``genres_*`` / ``categories_*`` columns and, optionally,
	            a ``recommendations`` column.
	        top_n_tags: Number of most frequent tags to place on the map.
	        rec_threshold: Keep only rows whose ``recommendations`` exceed this
	            value (ignored when the column is absent).
	        noise_threshold: Correlations with a smaller absolute value are set to 0.
	        random_state: Seed passed to ``MDS``.

	    Returns:
	        ``(pos, target_tags, tag_counts, sample_size)``, or four ``None`` values
	        when no row passes the filter or fewer than two tags are available.
	    """
	    # 1. Filter rows by the recommendations threshold
	    if 'recommendations' in df.columns:
	        df_filtered = df[df['recommendations'] > rec_threshold]
	    else:
	        df_filtered = df  # nothing below mutates the frame, so no copy is needed

	    if df_filtered.empty:
	        return None, None, None, None

	    # 2. Pick the top-N tags by frequency
	    tag_cols = [c for c in df.columns if c.startswith(('genres_', 'categories_'))]
	    tag_counts = df_filtered[tag_cols].sum().sort_values(ascending=False).head(top_n_tags)
	    target_tags = tag_counts.index.tolist()

	    # A map needs at least two points; this also covers frames with no tag columns.
	    if len(target_tags) < 2:
	        return None, None, None, None

	    # 3. Correlation matrix -> distance matrix
	    corr_matrix = df_filtered[target_tags].corr().fillna(0)
	    corr_matrix[corr_matrix.abs() < noise_threshold] = 0
	    distance_matrix = 1 - np.abs(corr_matrix.values)
	    # A constant tag column has NaN self-correlation, which fillna turns into a
	    # self-distance of 1; force the diagonal back to 0.
	    np.fill_diagonal(distance_matrix, 0.0)

	    # 4. MDS projection to two dimensions
	    mds = MDS(
	        n_components=2,
	        dissimilarity="precomputed",
	        random_state=random_state,
	        n_init=4,
	        max_iter=300,
	        normalized_stress="auto"
	    )
	    pos = mds.fit_transform(distance_matrix)

	    return pos, target_tags, tag_counts, df_filtered.shape[0]


	# Main MDS plotting routine
	# -------------------------------------------------------
	def plot_mds_streamlit(
	    df,
	    mapping=genres_map,
	    color_dict=FIXED_GENRE_COLORS,
	    font_path="fonts/NotoSansTC-Regular.ttf",
	    fade_alpha=0.8
	):
	    """Render the interactive MDS tag map in Streamlit.

	    Two sliders choose how many tags to show and the minimum recommendation
	    count; the coordinates come from ``compute_mds_coordinates``.

	    Args:
	        df: Frame with ``genres_*`` / ``categories_*`` columns and, optionally,
	            a ``recommendations`` column.
	        mapping: Dict from tag column name to display label.
	        color_dict: Dict from tag column name to an RGB(A) tuple.
	        font_path: Font file for the labels; the Matplotlib default is used
	            when the file does not exist.
	        fade_alpha: Opacity of ``categories_*`` bubbles (other mapped tags use 0.8).

	    Returns:
	        None. Output goes to Streamlit; a warning replaces the chart when there
	        is not enough data.
	    """
	    st.subheader("🔗 遊戲標籤關聯圖(Genre對Categor) (MDS Map)")

	    all_tag_cols = [c for c in df.columns if c.startswith(('genres_', 'categories_'))]
	    max_tags = len(all_tag_cols)
	    if max_tags < 2:
	        st.warning("⚠️ 標籤欄位不足，無法繪製 MDS 關聯圖。")
	        return

	    # --- A. Sliders (side by side) ---
	    # Bounds and defaults are clamped to the data: a default outside
	    # [min_value, max_value], or min_value >= max_value, is rejected by st.slider.
	    col1, col2 = st.columns(2)

	    with col1:
	        min_tags = min(10, max_tags)
	        if max_tags > min_tags:
	            top_n = st.slider(
	                "📍 顯示 Top N 標籤數量",
	                min_value=min_tags,
	                max_value=max_tags,
	                value=min(20, max_tags),
	                step=1
	            )
	        else:
	            # Too few tags for a meaningful range: show them all.
	            top_n = max_tags
	    with col2:
	        if 'recommendations' in df.columns:
	            highest_rec = df['recommendations'].max()
	            # max() is NaN for an empty or all-NaN column; int(NaN) would raise.
	            max_rec = 0 if pd.isna(highest_rec) else int(highest_rec)
	        else:
	            max_rec = 10000
	        if max_rec > 0:
	            rec_th = st.slider(
	                "👍 推薦數門檻 (過濾冷門遊戲)",
	                min_value=0,
	                max_value=max_rec,
	                value=min(1000, max_rec),
	                step=min(500, max_rec)
	            )
	        else:
	            rec_th = 0

	    # --- B. Compute (cached) ---
	    with st.spinner("正在計算標籤距離..."):
	        pos, target_tags, tag_counts, sample_size = compute_mds_coordinates(
	            df, top_n, rec_th
	        )

	    if pos is None:
	        st.warning("⚠️ 篩選條件過於嚴格，沒有足夠的遊戲數據可供分析。")
	        return

	    # --- C. Prepare the plot ---
	    # A CJK-capable font is needed or Matplotlib renders the labels as boxes.
	    if font_path and os.path.exists(font_path):
	        prop = font_manager.FontProperties(fname=font_path)
	    else:
	        prop = None  # fallback to the Matplotlib default

	    base_alpha = 0.8

	    def get_color(tag):
	        # Always return RGBA. The opacity lives in the colour itself because a
	        # scalar `alpha` passed to scatter overrides every per-point alpha,
	        # which would make fade_alpha ineffective.
	        if tag not in color_dict:
	            return (0.5, 0.5, 0.5, 0.7)  # grey for unmapped tags
	        if tag.startswith('categories_'):
	            return fade_color(color_dict[tag], alpha=fade_alpha)
	        return fade_color(color_dict[tag], alpha=base_alpha)

	    final_colors = [get_color(t) for t in target_tags]

	    fig, ax = plt.subplots(figsize=(14, 10))
	    ax.margins(0.1)  # 10% padding around the bubbles; raise to 0.2 if cramped

	    counts = tag_counts.values
	    # Guard against a zero maximum so the relative sizes stay finite.
	    peak = counts.max() or 1

	    # Bubble area proportional to tag frequency
	    sizes = (counts / peak) * 10000

	    ax.scatter(
	        pos[:, 0], pos[:, 1],
	        s=sizes,
	        c=final_colors,
	        edgecolors=[(1.0, 1.0, 1.0, base_alpha)],
	        linewidth=1.5
	    )

	    # Text labels
	    for i, tag in enumerate(target_tags):
	        # Translate through the mapping, else strip the column prefix
	        label_text = mapping.get(tag, tag.replace('genres_', '').replace('categories_', ''))

	        # Font size scales with frequency, clamped to 9-16
	        font_size = max(9, min(16, int(counts[i] / peak * 20)))

	        is_genre = tag in color_dict and not tag.startswith('categories_')

	        ax.text(
	            pos[i, 0], pos[i, 1],
	            label_text,
	            ha='center', va='center',
	            fontsize=font_size,
	            fontproperties=prop,
	            fontweight='bold' if is_genre else 'normal',
	            color='black' if is_genre else '#333333'
	        )

	    ax.set_title(
	        f"MDS Tag Map (樣本數: {sample_size} 款遊戲)",
	        fontsize=18,
	        fontproperties=prop,
	        pad=20
	    )
	    ax.axis('off')  # hide the axes for a cleaner picture

	    # --- D. Output to Streamlit ---
	    st.pyplot(fig)
	    # Release the figure so repeated reruns do not accumulate open figures.
	    plt.close(fig)

	def plot_tag_heatmap(
	    data,
	    font_path=None,
	    top_n=20,
	    figsize=(14, 12)
	):
	    """Render a lower-triangle correlation heatmap of the most frequent tags.

	    Args:
	        data: Frame with ``genres_*`` / ``categories_*`` columns.
	        font_path: Font file for the labels; the Matplotlib default is used
	            when it is missing or does not exist.
	        top_n: Maximum number of tags (by column sum) to include.
	        figsize: Matplotlib figure size.

	    Returns:
	        None. Output goes to Streamlit; a warning replaces the chart when fewer
	        than two tags are available.
	    """
	    st.subheader("🔗 遊戲標籤關係係數圖 (Corr Heatmap)")
	    all_tag_cols = [c for c in data.columns if c.startswith(('genres_', 'categories_'))]

	    # Keep the top-N most frequent tags
	    tag_counts = data[all_tag_cols].sum().sort_values(ascending=False).head(top_n)
	    top_tags = tag_counts.index.tolist()

	    if len(top_tags) < 2:
	        st.warning("⚠️ 數據過濾後標籤過少，無法繪製熱力圖。")
	        return

	    corr_matrix = data[top_tags].corr()

	    # Mask the upper triangle. np.triu with the default k=0 also masks the
	    # diagonal; use k=1 to keep the self-correlation cells.
	    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

	    tick_labels = [t.replace('genres_', '').replace('categories_', '') for t in top_tags]

	    fig, ax = plt.subplots(figsize=figsize)

	    # Labels are handed to seaborn directly: with its automatic setting it may
	    # drop overlapping tick labels, and assigning a full label list afterwards
	    # would then not match the number of ticks.
	    sns.heatmap(
	        corr_matrix,
	        mask=mask,
	        annot=True,
	        fmt=".2f",
	        cmap='coolwarm',
	        vmin=-1, vmax=1,
	        ax=ax,
	        cbar_kws={'shrink': 0.8},
	        square=True,
	        xticklabels=tick_labels,
	        yticklabels=tick_labels
	    )

	    # Use the custom font only when the file is really there
	    prop = None
	    if font_path and os.path.exists(font_path):
	        prop = font_manager.FontProperties(fname=font_path, size=12)

	    for tick in ax.get_xticklabels():
	        tick.set_rotation(45)
	        tick.set_ha('right')
	        if prop is not None:
	            tick.set_fontproperties(prop)

	    for tick in ax.get_yticklabels():
	        tick.set_rotation(0)
	        if prop is not None:
	            tick.set_fontproperties(prop)

	    # Report the number of tags actually plotted, which can be below top_n.
	    ax.set_title(
	        f"Top {len(top_tags)} Tags Correlation (Lower Triangle)",
	        fontproperties=prop,
	        fontsize=18,
	        pad=20
	    )

	    st.pyplot(fig)
	    # Release the figure so repeated reruns do not accumulate open figures.
	    plt.close(fig)