Spaces:
Sleeping
Sleeping
File size: 19,397 Bytes
5b96174 78f78b3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 | import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from matplotlib import font_manager
from wordcloud import WordCloud
import os
import random
from config import chinese_name_map, FIXED_GENRE_COLORS, genres_map
# ==========================================================================
def categorize_business_model(row):
    """Classify a zero-priced game into one or more business-model labels.

    Matching is purely keyword based: the lower-cased ``name`` is searched
    for known substrings, and the lower-cased ``categories`` / ``genres``
    values are searched for store tags.

    Args:
        row: Mapping-like record (for example a ``pandas.Series`` or a dict)
            providing ``row['name']`` and ``row.get(...)`` for the optional
            ``categories`` and ``genres`` entries. Values are converted with
            ``str()`` before matching.

    Returns:
        The matched labels joined with ``"|"`` (no surrounding spaces), in
        rule order and with duplicates removed, or
        ``'真正免費 (Experiment/Legacy)'`` when no rule matched.
    """
    name = str(row['name']).lower()
    categories = str(row.get('categories', '')).lower()
    genres = str(row.get('genres', '')).lower()

    def name_has(keywords):
        # Substring match against the lower-cased title.
        return any(keyword in name for keyword in keywords)

    # --- A. Titles recorded with price 0 that are really paid releases ---
    # Every keyword must be lower case because ``name`` is lower-cased above.
    premium_fix = (
        'football manager', 'thunder tier one', 'atelier ryza 2',
        'little nightmares', 'atelier ryza 3', 'battlefield 2042',
        'nobody',
    )

    # --- B. Big-IP portal / version update ---
    # The base game (launcher) is free, content needs a subscription or DLC.
    ip_keywords = (
        'command & conquer', 'total war', 'warhammer', 'ea sports',
        'efootball', 'samurai shodown', 'battlefield', 'call of duty',
        'final fantasy', 'game of thrones', 'halo',
    )

    # --- C. Episodic / prologue / friend pass ---
    # A free first chapter or pass that leads to buying the full game.
    prologue_keywords = (
        'prologue', 'episode', 'demo', 'nexus', 'chapter',
        "friend's pass", 'trial', 'poppy',
    )

    # --- D. Live-service free-to-play with in-app purchases ---
    # Known F2P titles that the store tags alone do not identify.
    f2p_iap_names = (
        'naraka', 'fragpunk', 'stormgate', 'fear surrounds',
        'zenith', 'hired ops', 'xera', 'vail', 'broke protocol',
    )
    # Online games that are checked by name as genuinely free-to-play.
    f2p_verified = ('world war 3', 'quake champions', 'lost ark', 'stumble guys')

    special = ('knightfall',)

    models = []
    if name_has(premium_fix):
        models.append('資料集標記異常/可能為限時免費')
    if name_has(ip_keywords):
        models.append('大型 IP 入口/版本更新 (IP Portal)')
    if name_has(prologue_keywords):
        models.append('章節/試玩 (Episodic/Prologue)')
    if ('in-app purchases' in categories
            or 'massively multiplayer' in genres
            or name_has(f2p_iap_names)):
        models.append('線上營運/內購制 (Service-based F2P)')
    if name_has(f2p_verified):
        models.append('線上營運/內購制 (Service-based F2P)')
    if name_has(special):
        models.append('2024年開放永久免費')

    if not models:
        return '真正免費 (Experiment/Legacy)'

    # De-duplicate while keeping rule order so the joined string is stable
    # across runs (a set would make the order depend on string hashing).
    # "|" is the only separator, with no spaces, to keep later parsing simple.
    return "|".join(dict.fromkeys(models))
# Convert an RGB colour to a HEX string.
def rgb_to_hex(rgb):
    """Return the ``#rrggbb`` string for a colour given as 0.0-1.0 floats.

    Args:
        rgb: Iterable of channel values. Only the first three are used, so
            an RGBA tuple is accepted and its alpha is ignored.

    Returns:
        A lower-case ``#rrggbb`` string. Each channel is scaled by 255,
        truncated with ``int()`` and clamped to 0-255 so that out-of-range
        inputs still produce a well-formed colour.

    Raises:
        TypeError: If fewer than three channels are supplied.
    """
    channels = tuple(rgb)[:3]
    scaled = tuple(min(255, max(0, int(value * 255))) for value in channels)
    return '#%02x%02x%02x' % scaled
def get_genre_color(genre_key):
    """Return the HEX colour configured for ``genre_key``.

    Keys missing from ``FIXED_GENRE_COLORS`` get the grey fallback
    ``'#BDBDBD'``; known keys are converted with ``rgb_to_hex``.
    """
    if genre_key not in FIXED_GENRE_COLORS:
        return '#BDBDBD'  # grey fallback
    return rgb_to_hex(FIXED_GENRE_COLORS[genre_key])
def monthly_stats(df):
    """Summarise ``recommendations`` per ``release_month``.

    Returns a frame with one row per month holding ``count``, ``mean``,
    ``median`` and ``std`` (numeric columns rounded to 2 decimals), plus two
    constant columns ``mean_max`` / ``mean_min`` carrying the largest and
    smallest of the rounded monthly means.
    """
    per_month = df.groupby('release_month')['recommendations']
    summary = per_month.agg(['count', 'mean', 'median', 'std']).reset_index()
    summary = summary.round(2)

    # The extremes are taken after rounding, and broadcast to every row.
    rounded_means = summary['mean']
    summary['mean_max'] = rounded_means.max()
    summary['mean_min'] = rounded_means.min()
    return summary
def year_stats(df):
    """Aggregate release volume, pricing and recommendations per year.

    Returns one row per ``release_year`` with ``count`` (group size),
    ``avg_price``, ``median_price``, ``avg_recs`` and ``growth_rate``, the
    percentage change of ``count`` relative to the previous row (NaN for the
    first row).
    """
    aggregations = {
        'count': ('price', 'size'),
        'avg_price': ('price', 'mean'),
        'median_price': ('price', 'median'),
        'avg_recs': ('recommendations', 'mean'),
    }
    yearly = df.groupby('release_year').agg(**aggregations).reset_index()
    yearly['growth_rate'] = yearly['count'].pct_change() * 100
    return yearly
def top_entities(df, entity_col):
    """Rank the ten entities in ``entity_col`` with the most games.

    Returns a frame indexed by entity with ``game_count``, ``total_rec`` and
    ``avg_rec``, sorted by ``game_count`` descending, plus a ``知名遊戲``
    column listing each entity's three most recommended titles joined with
    ``<br>``. Titles are translated through ``chinese_name_map`` when an
    entry exists.
    """
    def best_known_titles(entity):
        # All games of this entity, best three by recommendation count.
        owned = df[df[entity_col] == entity]
        leaders = owned.nlargest(3, 'recommendations')['name'].tolist()
        return "<br>".join(chinese_name_map.get(title, title) for title in leaders)

    summary = df.groupby(entity_col).agg(
        game_count=('recommendations', 'size'),
        total_rec=('recommendations', 'sum'),
        avg_rec=('recommendations', 'mean'),
    )
    ranking = summary.sort_values('game_count', ascending=False).head(10)
    ranking['知名遊戲'] = [best_known_titles(entity) for entity in ranking.index]
    return ranking
# Like top_entities, but additionally requires at least 3 games per entity.
def top_entities_advanced(df, entity_col):
    """Rank the ten most recommended entities that have 3 or more games.

    Returns a frame indexed by entity with ``game_count`` (distinct
    ``appid`` values), ``total_rec``, ``avg_rec``, ``avg_price`` and
    ``avg_discount``, sorted by ``total_rec`` descending, plus a ``知名遊戲``
    column listing each entity's three most recommended titles joined with
    ``" | "``. Titles are translated through ``chinese_name_map`` when an
    entry exists.
    """
    def best_known_titles(entity):
        # Look the entity's games up in the unaggregated frame.
        owned = df[df[entity_col] == entity]
        leaders = owned.nlargest(3, 'recommendations')['name'].tolist()
        return " | ".join(chinese_name_map.get(title, title) for title in leaders)

    # 1. Base statistics for every entity.
    summary = df.groupby(entity_col).agg(
        game_count=('appid', 'nunique'),  # count games by distinct appid
        total_rec=('recommendations', 'sum'),
        avg_rec=('recommendations', 'mean'),
        avg_price=('price', 'mean'),
        avg_discount=('discount(%)', 'mean'),
    )

    # 2. Key filter: keep only entities with three or more games.
    prolific = summary[summary['game_count'] >= 3]
    ranking = prolific.sort_values('total_rec', ascending=False).head(10)

    ranking['知名遊戲'] = [best_known_titles(entity) for entity in ranking.index]
    return ranking
def plot_top_entities(stats_df, title, global_avg):
    """Build a bubble chart of game count against average recommendations.

    Args:
        stats_df: Per-entity statistics; its index is reset so the entity
            name becomes the first column. Must contain ``game_count`` and
            ``avg_rec``; ``total_rec`` (bubble size), ``知名遊戲``,
            ``avg_price`` and ``avg_discount`` are used when present.
        title: Chart title.
        global_avg: Y position of the dashed red reference line.

    Returns:
        The plotly figure, with the legend hidden.
    """
    frame = stats_df.reset_index()
    entity_name = frame.columns[0]

    # Hover fields are assembled dynamically so a missing optional column
    # cannot cause a KeyError.
    hover_cols = {
        'game_count': True,
        'avg_rec': ':,.0f',
    }
    optional_fields = {
        '知名遊戲': True,
        'avg_price': ':.2f',
        'avg_discount': ':.1%',
    }
    hover_cols.update(
        {field: fmt for field, fmt in optional_fields.items() if field in frame.columns}
    )

    bubble_size = "total_rec" if "total_rec" in frame.columns else None

    fig = px.scatter(
        frame,
        x="game_count",
        y="avg_rec",
        size=bubble_size,
        color=entity_name,
        hover_name=entity_name,
        hover_data=hover_cols,
        title=title,
        labels={"game_count": "發行遊戲量", "avg_rec": "平均推薦數"},
        template="plotly_white",
    )

    # Reference line for the overall average.
    fig.add_hline(
        y=global_avg,
        line_dash="dash",
        line_color="red",
        annotation_text="市場平均推薦數",
        annotation_position="bottom right",
    )
    fig.update_layout(showlegend=False)
    return fig
def plot_entity_price_strategy(df_paid, stats_df, entity_type="developer"):
    """Draw a price box plot for the entities listed in ``stats_df``.

    Args:
        df_paid: Game-level frame with a ``price`` column and a column named
            by ``entity_type``.
        stats_df: Frame whose index lists the entities to include.
        entity_type: Column in ``df_paid`` to group by; ``"developer"``
            selects the developer title, any other value the publisher title.

    Returns:
        The plotly box figure, categories ordered by descending median and
        the legend hidden.
    """
    selected_entities = stats_df.index.tolist()
    belongs_to_selection = df_paid[entity_type].isin(selected_entities)
    df_top = df_paid[belongs_to_selection].copy()

    if entity_type == "developer":
        title = "卓越開發商定價策略分佈"
    else:
        title = "卓越發行商定價策略分佈"

    fig = px.box(
        df_top,
        x=entity_type,
        y="price",
        color=entity_type,
        points="all",  # show every game so individual prices stay visible
        title=title,
        labels={"price": "價格 (USD)", entity_type: "廠商名稱"},
        template="plotly_white",
    )
    fig.update_layout(
        showlegend=False,
        xaxis={'categoryorder': 'median descending'},
    )
    return fig
def discount_performance_full(df_active_paid):
    """Compare discounted games, bucketed by discount, with undiscounted ones.

    Rows whose ``discount(%)`` is exactly 0 form the baseline; rows with a
    positive discount are cut into the intervals (0, 0.3], (0.3, 0.5],
    (0.5, 0.8] and (0.8, 1.0].

    Args:
        df_active_paid: Frame with ``appid``, ``discount(%)``, ``price``,
            ``original_price``, ``median_playtime_change`` and
            ``recommendations`` columns. It is copied, not modified.

    Returns:
        A frame whose first row is the ``'無折扣 (基準線)'`` baseline,
        followed by one row per discount bucket that actually occurs.
    """
    # 1. Work on a copy so the caller's data is left untouched.
    df = df_active_paid.copy()

    # 2. Split into the undiscounted baseline and the discounted rows.
    discount = df['discount(%)']
    undiscounted = df[discount == 0]
    discounted = df[discount > 0].copy()

    # 3. Bucket the discounted rows.
    discounted['discount_group'] = pd.cut(
        discounted['discount(%)'],
        bins=[0, 0.3, 0.5, 0.8, 1.0],
        labels=['0–30%', '30–50%', '50–80%', '80–100%'],
        include_lowest=False,
    )

    # 4. Per-bucket statistics.
    aggregations = {
        '遊戲數量': ('appid', 'count'),
        '平均折後價': ('price', 'mean'),
        '平均原價': ('original_price', 'mean'),
        '遊玩增長中位數': ('median_playtime_change', 'median'),
        '遊玩增長平均值': ('median_playtime_change', 'mean'),
        '平均推薦數': ('recommendations', 'mean'),
    }
    bucket_stats = (
        discounted.groupby('discount_group', observed=True)
        .agg(**aggregations)
        .reset_index()
    )

    # 5. Baseline row built by hand; both price columns use ``price``.
    baseline_price = undiscounted['price']
    baseline_playtime = undiscounted['median_playtime_change']
    baseline = pd.DataFrame([{
        'discount_group': '無折扣 (基準線)',
        '遊戲數量': len(undiscounted),
        '平均折後價': baseline_price.mean(),
        '平均原價': baseline_price.mean(),
        '遊玩增長中位數': baseline_playtime.median(),
        '遊玩增長平均值': baseline_playtime.mean(),
        '平均推薦數': undiscounted['recommendations'].mean(),
    }])

    # 6. Baseline first, buckets after.
    return pd.concat([baseline, bucket_stats], ignore_index=True)
def plot_streamlit_wordcloud(
    df_wc,
    mapping,
    color_dict=FIXED_GENRE_COLORS,
    font_path=None,
    figsize=(12, 6),
    bg_color='white',
    fade_categories_alpha=0.5
):
    """Render a word cloud of tag frequencies into the Streamlit page.

    Args:
        df_wc: Frame with ``genres_*`` / ``categories_*`` columns; each
            column is summed to obtain the tag's frequency.
        mapping: Dict from column name to the label shown in the cloud.
            Columns that are not keys of ``mapping`` are ignored.
        color_dict: Dict from column name to an RGB(A) tuple of 0.0-1.0
            floats. Words without an entry are drawn in a random grey.
        font_path: Font file handed to ``WordCloud``.
        figsize: Matplotlib figure size.
        bg_color: Word-cloud background colour.
        fade_categories_alpha: Share of white blended into the colour of
            ``categories_*`` tags (0 keeps the colour, 1 gives white).

    Returns:
        None. The figure is shown with ``st.pyplot``; when no tag has a
        positive count a warning is shown instead.
    """
    mapping = mapping or {}

    freq = {}
    for col in df_wc.columns:
        if not col.startswith(("genres_", "categories_")) or col not in mapping:
            continue
        count = int(df_wc[col].sum())
        if count > 0:
            freq[mapping[col]] = count

    # WordCloud cannot be generated from an empty frequency table.
    if not freq:
        st.warning("⚠️ 數據不足,無法生成文字雲。")
        return

    # Words are display labels, colours are keyed by column name, so build
    # the reverse lookup once. The first key wins when labels repeat.
    label_to_key = {}
    for key, label in mapping.items():
        if isinstance(label, str):
            label_to_key.setdefault(label, key)

    def color_func(word, **kwargs):
        key = label_to_key.get(word)
        if not key or key not in color_dict:
            # Fallback: random grey in HEX form.
            level = random.randint(80, 180)
            return '#%02x%02x%02x' % (level, level, level)

        # Read from the ``color_dict`` argument (not the module default) so
        # a caller-supplied palette is honoured; any alpha channel is ignored.
        red, green, blue = color_dict[key][:3]
        if key.startswith("categories_"):
            # Blend towards white: c + (1 - c) * factor, on the 0.0-1.0 scale.
            red = red + (1.0 - red) * fade_categories_alpha
            green = green + (1.0 - green) * fade_categories_alpha
            blue = blue + (1.0 - blue) * fade_categories_alpha
        return rgb_to_hex((red, green, blue))

    wc = WordCloud(
        width=900,
        height=450,
        background_color=bg_color,
        font_path=font_path,
        collocations=False,
        random_state=2077,
    ).generate_from_frequencies(freq)

    fig, ax = plt.subplots(figsize=figsize)
    ax.imshow(wc.recolor(color_func=color_func))
    ax.axis("off")
    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps it alive across reruns.
    plt.close(fig)
def fade_color(color, alpha=0.5):
    """Return ``color``'s first three channels with ``alpha`` appended.

    Any fourth channel already present in ``color`` is discarded.
    """
    red, green, blue = color[0:3]
    return red, green, blue, alpha
@st.cache_data(ttl=3600, show_spinner=False)
def compute_mds_coordinates(
    df,
    top_n_tags,
    rec_threshold,
    noise_threshold=0.05,
    random_state=42
):
    """Project the most frequent tags onto 2-D with MDS.

    This is the expensive step of the tag map; it is wrapped in
    ``st.cache_data`` (``ttl=3600``) so it only reruns when its arguments
    change.

    Args:
        df: Frame with ``genres_*`` / ``categories_*`` columns and,
            optionally, a ``recommendations`` column.
        top_n_tags: Number of most frequent tags to keep.
        rec_threshold: Only rows with ``recommendations`` strictly above this
            value are used (ignored when the column is missing).
        noise_threshold: Correlations with absolute value below this are
            treated as 0.
        random_state: Seed passed to ``MDS``.

    Returns:
        ``(pos, target_tags, tag_counts, n_rows)`` where ``pos`` is the
        coordinate array returned by ``MDS.fit_transform``, or four ``None``
        values when no row passes the filter or fewer than two tags are
        available.
    """
    # 1. Filter rows by the recommendation threshold.
    if 'recommendations' in df.columns:
        df_filtered = df[df['recommendations'] > rec_threshold]
    else:
        df_filtered = df.copy()

    # Nothing left after filtering: bail out early.
    if df_filtered.empty:
        return None, None, None, None

    # 2. Pick the top-N tags by frequency.
    tag_cols = [c for c in df.columns if c.startswith(('genres_', 'categories_'))]
    tag_counts = df_filtered[tag_cols].sum().sort_values(ascending=False).head(top_n_tags)
    target_tags = tag_counts.index.tolist()

    # A map needs at least two points.
    if len(target_tags) < 2:
        return None, None, None, None

    # 3. Correlation matrix -> distance matrix.
    corr_matrix = df_filtered[target_tags].corr().fillna(0)
    corr_matrix[corr_matrix.abs() < noise_threshold] = 0
    distance_matrix = 1 - np.abs(corr_matrix.values)
    # A constant column has NaN self-correlation, which fillna turns into 0
    # and hence distance 1; every tag must be at distance 0 from itself.
    np.fill_diagonal(distance_matrix, 0.0)

    # 4. MDS on the precomputed dissimilarities.
    mds = MDS(
        n_components=2,
        dissimilarity="precomputed",
        random_state=random_state,
        n_init=4,
        max_iter=300,
        normalized_stress="auto"
    )
    pos = mds.fit_transform(distance_matrix)
    return pos, target_tags, tag_counts, df_filtered.shape[0]
# Main entry point for drawing the MDS tag map.
# -------------------------------------------------------
def plot_mds_streamlit(
    df,
    mapping=genres_map,
    color_dict=FIXED_GENRE_COLORS,
    font_path="fonts/NotoSansTC-Regular.ttf",
    fade_alpha=0.8
):
    """Render the interactive MDS tag map in the Streamlit page.

    Two sliders choose how many tags to show and the minimum recommendation
    count; coordinates come from ``compute_mds_coordinates`` and are drawn
    as a bubble chart whose bubble and label sizes scale with tag frequency.

    Args:
        df: Frame with ``genres_*`` / ``categories_*`` columns and,
            optionally, a ``recommendations`` column.
        mapping: Dict from column name to display label; unmapped tags show
            the column name without its prefix.
        color_dict: Dict from column name to an RGB(A) tuple of 0.0-1.0
            floats. Tags without an entry are drawn grey.
        font_path: Font file used for labels; ignored when it does not exist.
        fade_alpha: Opacity applied to ``categories_*`` bubbles.

    Returns:
        None. Output goes to the Streamlit page.
    """
    st.subheader("🔗 遊戲標籤關聯圖(Genre對Categor) (MDS Map)")
    all_tag_cols = [c for c in df.columns if c.startswith(('genres_', 'categories_'))]
    max_tags = len(all_tag_cols)

    # A map needs at least two tags to position.
    if max_tags < 2:
        st.warning("⚠️ 篩選條件過於嚴格,沒有足夠的遊戲數據可供分析。")
        return

    # --- A. Sliders, laid out side by side ---
    col1, col2 = st.columns(2)
    with col1:
        if max_tags > 10:
            top_n = st.slider(
                "📍 顯示 Top N 標籤數量",
                min_value=10,
                max_value=max_tags,
                value=min(20, max_tags),  # default must not exceed the range
                step=1
            )
        else:
            # Too few tags for the slider's range: show them all.
            top_n = max_tags
    with col2:
        max_rec = 10000
        if 'recommendations' in df.columns:
            highest = df['recommendations'].max()
            # An empty or all-NaN column yields NaN, which int() rejects.
            max_rec = int(highest) if pd.notna(highest) else 0
        if max_rec > 0:
            rec_th = st.slider(
                "👍 推薦數門檻 (過濾冷門遊戲)",
                min_value=0,
                max_value=max_rec,
                value=min(1000, max_rec),
                step=min(500, max_rec)
            )
        else:
            rec_th = 0

    # --- B. Compute coordinates (cached) ---
    with st.spinner("正在計算標籤距離..."):
        pos, target_tags, tag_counts, sample_size = compute_mds_coordinates(
            df, top_n, rec_th
        )
    if pos is None:
        st.warning("⚠️ 篩選條件過於嚴格,沒有足夠的遊戲數據可供分析。")
        return

    # --- C. Prepare the plot ---
    # A CJK-capable font is needed or Matplotlib renders the labels as boxes.
    if font_path and os.path.exists(font_path):
        prop = font_manager.FontProperties(fname=font_path)
    else:
        prop = None  # fall back to Matplotlib's default font

    genre_alpha = 0.8

    def get_color(tag):
        # Always return RGBA: a scalar ``alpha`` passed to ``scatter`` would
        # overwrite every per-colour alpha and cancel ``fade_alpha``.
        if tag not in color_dict:
            return (0.5, 0.5, 0.5, 0.7)  # grey
        alpha = fade_alpha if tag.startswith('categories_') else genre_alpha
        return fade_color(color_dict[tag], alpha=alpha)

    final_colors = [get_color(tag) for tag in target_tags]

    counts = tag_counts.values
    peak = counts.max()
    if not peak > 0:
        peak = 1  # avoid dividing by zero when every count is 0

    fig, ax = plt.subplots(figsize=(14, 10))
    ax.margins(0.1)  # 10% padding so edge bubbles are not clipped

    ax.scatter(
        pos[:, 0], pos[:, 1],
        s=counts / peak * 10000,
        c=final_colors,
        edgecolors='white',
        linewidth=1.5
    )

    for tag, count, (x, y) in zip(target_tags, counts, pos):
        label_text = mapping.get(tag, tag.replace('genres_', '').replace('categories_', ''))
        # Font size follows frequency, limited to 9-16 pt.
        font_size = max(9, min(16, int(count / peak * 20)))
        is_genre = tag in color_dict and not tag.startswith('categories_')
        ax.text(
            x, y,
            label_text,
            ha='center', va='center',
            fontsize=font_size,
            fontproperties=prop,
            fontweight='bold' if is_genre else 'normal',
            color='black' if is_genre else '#333333'
        )

    ax.set_title(
        f"MDS Tag Map (樣本數: {sample_size} 款遊戲)",
        fontsize=18,
        fontproperties=prop,
        pad=20
    )
    ax.axis('off')  # hide the axes for a cleaner picture

    # --- D. Output to Streamlit ---
    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps it alive across reruns.
    plt.close(fig)
def plot_tag_heatmap(
    data,
    font_path=None,
    top_n=20,
    figsize=(14, 12)
):
    """Render a lower-triangle correlation heatmap of the most frequent tags.

    Args:
        data: Frame with ``genres_*`` / ``categories_*`` columns.
        font_path: Font file for tick labels and title; ignored when it is
            empty or does not exist.
        top_n: Maximum number of tags to include, chosen by column sum.
        figsize: Matplotlib figure size.

    Returns:
        None. The figure is shown with ``st.pyplot``; a warning is shown
        instead when fewer than two tags are available.
    """
    st.subheader("🔗 遊戲標籤關係係數圖 (Corr Heatmap)")
    all_tag_cols = [c for c in data.columns if c.startswith(('genres_', 'categories_'))]

    # Keep the top-N tags by frequency.
    tag_counts = data[all_tag_cols].sum().sort_values(ascending=False).head(top_n)
    top_tags = tag_counts.index.tolist()
    if len(top_tags) < 2:
        st.warning("⚠️ 數據過濾後標籤過少,無法繪製熱力圖。")
        return

    corr_matrix = data[top_tags].corr()

    # Mask the upper triangle. The default k=0 hides the diagonal as well;
    # use k=1 to keep the diagonal (self-correlation = 1) visible.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    tick_labels = [t.replace('genres_', '').replace('categories_', '') for t in top_tags]

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        corr_matrix,
        mask=mask,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        vmin=-1, vmax=1,
        ax=ax,
        cbar_kws={'shrink': 0.8},
        square=True,
        # Explicit labels make seaborn draw one tick per tag, so the number
        # of ticks always matches the number of labels styled below.
        xticklabels=tick_labels,
        yticklabels=tick_labels
    )

    prop = None
    if font_path and os.path.exists(font_path):
        prop = font_manager.FontProperties(fname=font_path, size=12)

    ax.set_xticklabels(
        ax.get_xticklabels(),
        fontproperties=prop,
        rotation=45,
        ha='right'
    )
    ax.set_yticklabels(
        ax.get_yticklabels(),
        fontproperties=prop,
        rotation=0
    )
    # Report the number of tags actually plotted, which may be below top_n.
    ax.set_title(
        f"Top {len(top_tags)} Tags Correlation (Lower Triangle)",
        fontproperties=prop,
        fontsize=18,
        pad=20
    )

    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps it alive across reruns.
    plt.close(fig)