Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import plotly.express as px | |
| import matplotlib.pyplot as plt | |
| from sklearn.manifold import MDS | |
| from matplotlib import font_manager | |
| from wordcloud import WordCloud | |
| import os | |
| import random | |
| from config import chinese_name_map, FIXED_GENRE_COLORS, genres_map | |
| # ========================================================================== | |
def categorize_business_model(row):
    """Classify a zero-priced game into one or more business-model labels.

    Args:
        row: Mapping-like record (dict or pandas Series). ``row['name']`` is
            required; ``categories`` and ``genres`` are read with ``.get`` and
            default to an empty string.

    Returns:
        The matching labels joined with ``"|"`` (no surrounding spaces, so the
        result is easy to split again), in the order the rules are evaluated
        and without duplicates. When no rule matches, the single label
        ``'真正免費 (Experiment/Legacy)'`` is returned.
    """
    name = str(row['name']).lower()
    categories = str(row.get('categories', '')).lower()
    genres = str(row.get('genres', '')).lower()

    def name_has(keywords):
        # NOTE(review): plain substring match on the lower-cased name, so short
        # keywords such as 'demo', 'vail' or 'trial' also hit longer words
        # ('demon', 'prevail', 'trials') — confirm this is acceptable.
        return any(keyword in name for keyword in keywords)

    models = []

    # --- A. Fix-up: titles flagged as price 0 in the dataset that are really
    # paid releases. Keywords must be lower-case because `name` is lower-cased.
    premium_fix = [
        'football manager', 'thunder tier one', 'atelier ryza 2',
        'little nightmares', 'atelier ryza 3', 'battlefield 2042',
        'nobody',
    ]
    if name_has(premium_fix):
        models.append('資料集標記異常/可能為限時免費')

    # --- B. Large IP portal / version update ---
    # The base game (launcher) is free to download, but the content requires a
    # subscription or DLC purchase (e.g. COD, Halo).
    ip_keywords = [
        'command & conquer', 'total war', 'warhammer', 'ea sports',
        'efootball', 'samurai shodown', 'battlefield', 'call of duty',
        'final fantasy', 'game of thrones', 'halo',
    ]
    if name_has(ip_keywords):
        models.append('大型 IP 入口/版本更新 (IP Portal)')

    # --- C. Episodic / prologue / pass ---
    # The first chapter or a friend pass is free to lead players to the full
    # version.
    prologue_keywords = [
        'prologue', 'episode', 'demo', 'nexus', 'chapter',
        "friend's pass", 'trial', 'poppy',
    ]
    if name_has(prologue_keywords):
        models.append('章節/試玩 (Episodic/Prologue)')

    # --- D. Live-service / in-app-purchase model ---
    # Triggered by the [In-App Purchases] category, the Massively Multiplayer
    # genre, or a known F2P title (including Hired Ops, XERA, VAIL and
    # BROKE PROTOCOL).
    f2p_iap_names = [
        'naraka', 'fragpunk', 'stormgate', 'fear surrounds',
        'zenith', 'hired ops', 'xera', 'vail', 'broke protocol',
    ]
    # Special check: verified free-to-play online games get the same label.
    f2p_verified = ['world war 3', 'quake champions', 'lost ark', 'stumble guys']
    if ('in-app purchases' in categories
            or 'massively multiplayer' in genres
            or name_has(f2p_iap_names)
            or name_has(f2p_verified)):
        models.append('線上營運/內購制 (Service-based F2P)')

    special = ['knightfall']
    if name_has(special):
        models.append('2024年開放永久免費')

    if not models:
        return '真正免費 (Experiment/Legacy)'

    # dict.fromkeys de-duplicates while keeping insertion order, so the joined
    # string is identical from run to run (a set would reorder it arbitrarily).
    return "|".join(dict.fromkeys(models))
# Convert an RGB colour to a HEX string.
def rgb_to_hex(rgb):
    """Convert a colour given as 0.0-1.0 floats to a ``#rrggbb`` string.

    Args:
        rgb: Iterable of at least three channel values (red, green, blue).
            A fourth value (alpha) is accepted and ignored.

    Returns:
        Lower-case hex colour string such as ``'#ff007f'``.

    Raises:
        TypeError: If fewer than three channels are supplied.
    """
    # Only the first three channels are used so RGBA tuples do not break the
    # three-slot format string; each channel is clamped to 0..255 so
    # out-of-range floats cannot yield negative or three-digit hex fields.
    channels = tuple(
        min(255, max(0, int(value * 255))) for value in tuple(rgb)[:3]
    )
    return '#%02x%02x%02x' % channels
def get_genre_color(genre_key):
    """Return the hex colour configured for ``genre_key``.

    Keys that are not present in ``FIXED_GENRE_COLORS`` get the neutral grey
    ``'#BDBDBD'``.
    """
    if genre_key not in FIXED_GENRE_COLORS:
        return '#BDBDBD'  # fallback grey
    return rgb_to_hex(FIXED_GENRE_COLORS[genre_key])
def monthly_stats(df):
    """Summarise ``recommendations`` per ``release_month``.

    Returns one row per month with count, mean, median and std (rounded to two
    decimals), plus ``mean_max`` / ``mean_min`` columns that repeat the largest
    and smallest rounded monthly mean on every row.
    """
    per_month = df.groupby('release_month')['recommendations']
    summary = per_month.agg(['count', 'mean', 'median', 'std']).reset_index().round(2)

    monthly_mean = summary['mean']
    summary['mean_max'] = monthly_mean.max()
    summary['mean_min'] = monthly_mean.min()
    return summary
def year_stats(df):
    """Aggregate release count, price and recommendation figures per year.

    Returns one row per ``release_year`` with the number of rows, mean and
    median ``price``, mean ``recommendations`` and ``growth_rate``: the
    percentage change of the count relative to the previous row.
    """
    yearly_aggregations = {
        'count': ('price', 'size'),
        'avg_price': ('price', 'mean'),
        'median_price': ('price', 'median'),
        'avg_recs': ('recommendations', 'mean'),
    }
    yearly = df.groupby('release_year').agg(**yearly_aggregations).reset_index()

    relative_change = yearly['count'].pct_change()
    yearly['growth_rate'] = relative_change * 100
    return yearly
def top_entities(df, entity_col):
    """Return the 10 entities with the most games, plus their best-known titles.

    The result is indexed by ``entity_col`` and holds ``game_count``,
    ``total_rec``, ``avg_rec`` and ``知名遊戲``: the entity's three most
    recommended games (translated through ``chinese_name_map`` when an entry
    exists) joined with ``<br>``.
    """
    ranking = (
        df.groupby(entity_col)
        .agg(
            game_count=('recommendations', 'size'),
            total_rec=('recommendations', 'sum'),
            avg_rec=('recommendations', 'mean'),
        )
        .sort_values('game_count', ascending=False)
        .head(10)
    )

    def best_known_titles(entity):
        # All games of this entity, three highest recommendation counts first.
        own_games = df[df[entity_col] == entity]
        titles = own_games.nlargest(3, 'recommendations')['name'].tolist()
        return "<br>".join(chinese_name_map.get(title, title) for title in titles)

    ranking['知名遊戲'] = [best_known_titles(entity) for entity in ranking.index]
    return ranking
# Same idea as top_entities, with an extra filter: at least 3 games per entity.
def top_entities_advanced(df, entity_col):
    """Return the 10 most-recommended entities that published 3+ games.

    The result is indexed by ``entity_col`` and holds ``game_count`` (distinct
    ``appid`` values), ``total_rec``, ``avg_rec``, ``avg_price``,
    ``avg_discount`` and ``知名遊戲``: the entity's three most recommended games
    (translated through ``chinese_name_map`` when an entry exists) joined with
    ``" | "``.
    """
    # 1. Base statistics for every entity.
    aggregations = {
        'game_count': ('appid', 'nunique'),  # count games by appid
        'total_rec': ('recommendations', 'sum'),
        'avg_rec': ('recommendations', 'mean'),
        'avg_price': ('price', 'mean'),
        'avg_discount': ('discount(%)', 'mean'),
    }
    per_entity = df.groupby(entity_col).agg(**aggregations)

    # 2. Key filter: keep only entities with three or more games, then take
    #    the ten with the highest total recommendations.
    established = per_entity[per_entity['game_count'] >= 3]
    ranking = established.sort_values('total_rec', ascending=False).head(10)

    showcase = []
    for entity in ranking.index:
        # Look the entity's games up in the original, unaggregated frame.
        own_games = df[df[entity_col] == entity]
        titles = own_games.nlargest(3, 'recommendations')['name'].tolist()
        # Fall back to the original title when the map has no translation.
        showcase.append(" | ".join(chinese_name_map.get(title, title) for title in titles))

    ranking['知名遊戲'] = showcase
    return ranking
def plot_top_entities(stats_df, title, global_avg):
    """Build a bubble chart of games published vs. average recommendations.

    Args:
        stats_df: Per-entity statistics indexed by entity name; must contain
            ``game_count`` and ``avg_rec``. ``total_rec`` (bubble size),
            ``知名遊戲``, ``avg_price`` and ``avg_discount`` are used when present.
        title: Chart title.
        global_avg: Y value of the dashed red reference line.

    Returns:
        The plotly figure.
    """
    frame = stats_df.reset_index()
    # After reset_index the former index (the entity name) is the first column.
    entity_name = frame.columns[0]

    # Hover fields are assembled dynamically so a missing optional column
    # never causes a KeyError.
    hover_cols = {'game_count': True, 'avg_rec': ':,.0f'}
    optional_formats = (
        ('知名遊戲', True),
        ('avg_price', ':.2f'),
        ('avg_discount', ':.1%'),
    )
    hover_cols.update(
        (column, fmt) for column, fmt in optional_formats if column in frame.columns
    )

    bubble_size = "total_rec" if "total_rec" in frame.columns else None

    fig = px.scatter(
        frame,
        x="game_count",
        y="avg_rec",
        size=bubble_size,
        color=entity_name,
        hover_name=entity_name,
        hover_data=hover_cols,
        title=title,
        labels={"game_count": "發行遊戲量", "avg_rec": "平均推薦數"},
        template="plotly_white",
    )

    # Reference line for the market-wide average.
    fig.add_hline(
        y=global_avg,
        line_dash="dash",
        line_color="red",
        annotation_text="市場平均推薦數",
        annotation_position="bottom right",
    )
    fig.update_layout(showlegend=False)
    return fig
def plot_entity_price_strategy(df_paid, stats_df, entity_type="developer"):
    """Build a box plot of game prices for the entities listed in ``stats_df``.

    Args:
        df_paid: Game-level frame with a ``price`` column and a column named
            after ``entity_type``.
        stats_df: Frame whose index lists the entities to include.
        entity_type: Column holding the entity name. ``"developer"`` selects
            the developer title; any other value selects the publisher title.

    Returns:
        The plotly figure, with boxes ordered by descending median price.
    """
    selected = stats_df.index.tolist()
    is_selected = df_paid[entity_type].isin(selected)
    games = df_paid[is_selected].copy()

    if entity_type == "developer":
        title = "卓越開發商定價策略分佈"
    else:
        title = "卓越發行商定價策略分佈"

    # Box plot; points="all" also draws every game so individual prices show.
    fig = px.box(
        games,
        x=entity_type,
        y="price",
        color=entity_type,
        points="all",
        title=title,
        labels={"price": "價格 (USD)", entity_type: "廠商名稱"},
        template="plotly_white",
    )
    fig.update_layout(
        showlegend=False,
        xaxis={'categoryorder': 'median descending'},
    )
    return fig
def discount_performance_full(df_active_paid):
    """Compare discounted games, bucketed by discount depth, with a baseline.

    Rows whose ``discount(%)`` is exactly 0 form the baseline row; rows with a
    positive discount are bucketed into (0, 0.3], (0.3, 0.5], (0.5, 0.8] and
    (0.8, 1.0]. Only buckets that actually occur are reported.

    Returns:
        A frame whose first row is the no-discount baseline, followed by one
        row per observed discount bucket, with game count, mean discounted and
        original price, median/mean playtime change and mean recommendations.
    """
    # Work on a copy so the caller's frame is never modified.
    working = df_active_paid.copy()

    # Split into the undiscounted baseline and the discounted games.
    undiscounted = working[working['discount(%)'] == 0]
    discounted = working[working['discount(%)'] > 0].copy()

    # Bucket the discounted games by discount depth.
    bucket_edges = [0, 0.3, 0.5, 0.8, 1.0]
    bucket_names = ['0–30%', '30–50%', '50–80%', '80–100%']
    discounted['discount_group'] = pd.cut(
        discounted['discount(%)'],
        bins=bucket_edges,
        labels=bucket_names,
        include_lowest=False,
    )

    # Per-bucket statistics.
    bucket_aggregations = {
        '遊戲數量': ('appid', 'count'),
        '平均折後價': ('price', 'mean'),
        '平均原價': ('original_price', 'mean'),
        '遊玩增長中位數': ('median_playtime_change', 'median'),
        '遊玩增長平均值': ('median_playtime_change', 'mean'),
        '平均推薦數': ('recommendations', 'mean'),
    }
    bucket_stats = (
        discounted.groupby('discount_group', observed=True)
        .agg(**bucket_aggregations)
        .reset_index()
    )

    # Baseline row, built by hand; the current price doubles as the original
    # price for games without a discount.
    baseline_playtime = undiscounted['median_playtime_change']
    baseline = pd.DataFrame([{
        'discount_group': '無折扣 (基準線)',
        '遊戲數量': len(undiscounted),
        '平均折後價': undiscounted['price'].mean(),
        '平均原價': undiscounted['price'].mean(),
        '遊玩增長中位數': baseline_playtime.median(),
        '遊玩增長平均值': baseline_playtime.mean(),
        '平均推薦數': undiscounted['recommendations'].mean(),
    }])

    # Baseline first, then the buckets.
    return pd.concat([baseline, bucket_stats], ignore_index=True)
def plot_streamlit_wordcloud(
    df_wc,
    mapping,
    color_dict=FIXED_GENRE_COLORS,
    font_path=None,
    figsize=(12, 6),
    bg_color='white',
    fade_categories_alpha=0.5
):
    """Render a tag word cloud into the current Streamlit container.

    Args:
        df_wc: Frame with ``genres_*`` / ``categories_*`` columns; each
            column's sum is used as that tag's frequency.
        mapping: Dict from tag column name to the label shown in the cloud.
            Columns without an entry are skipped.
        color_dict: Dict from tag column name to a colour tuple of 0.0-1.0
            floats (RGB, or RGBA whose alpha is ignored).
        font_path: Font file passed to ``WordCloud``. When the file does not
            exist a warning is shown and the default font is used.
        figsize: Matplotlib figure size.
        bg_color: Word cloud background colour.
        fade_categories_alpha: Fraction of white mixed into ``categories_*``
            colours (0 keeps the colour, 1 gives white).

    Returns:
        None. The figure is drawn with ``st.pyplot``; if no tag has a positive
        count a warning is shown instead.
    """
    mapping = mapping or {}
    tag_cols = [
        c for c in df_wc.columns
        if c.startswith(("genres_", "categories_")) and c in mapping
    ]

    freq = {}
    for col in tag_cols:
        count = int(df_wc[col].sum())
        if count > 0:
            freq[mapping[col]] = count

    # WordCloud raises on an empty frequency dict, so stop early with a
    # user-facing message instead.
    if not freq:
        st.warning("⚠️ 數據不足,無法生成文字雲。")
        return

    # A missing font file would make WordCloud fail while loading it; warn and
    # fall back to the default font instead.
    if font_path and not os.path.exists(font_path):
        st.warning(f"⚠️ 找不到字體檔案:{font_path},文字雲可能會顯示亂碼方塊。")
        font_path = None

    # Displayed words are labels, colours are keyed by column name. Build the
    # reverse lookup once; when two columns share a label the first one wins.
    label_to_key = {}
    for key, label in mapping.items():
        label_to_key.setdefault(label, key)

    def color_func(word, **kwargs):
        target_key = label_to_key.get(word)
        if target_key is not None and target_key in color_dict:
            # Read from the palette that was passed in, and tolerate RGBA.
            r, g, b = tuple(color_dict[target_key])[:3]
            if target_key.startswith("categories_"):
                # Blend towards white: colour + (1 - colour) * fade factor.
                r = r + (1.0 - r) * fade_categories_alpha
                g = g + (1.0 - g) * fade_categories_alpha
                b = b + (1.0 - b) * fade_categories_alpha
            return rgb_to_hex((r, g, b))
        # Fallback for words without a configured colour: a random grey.
        shade = random.randint(80, 180)
        return '#%02x%02x%02x' % (shade, shade, shade)

    wc = WordCloud(
        width=900,
        height=450,
        background_color=bg_color,
        font_path=font_path,
        collocations=False,
        random_state=2077,
    ).generate_from_frequencies(freq)

    fig, ax = plt.subplots(figsize=figsize)
    ax.imshow(wc.recolor(color_func=color_func))
    ax.axis("off")
    st.pyplot(fig)
    # Figures made with plt.subplots stay registered in pyplot until closed;
    # close explicitly so repeated calls do not accumulate them.
    plt.close(fig)
def fade_color(color, alpha=0.5):
    """Return ``color`` as an RGBA tuple using the given ``alpha``.

    Only the first three components of ``color`` are kept, so an existing
    alpha channel is replaced.
    """
    red, green, blue = color[:3]
    return red, green, blue, alpha
def compute_mds_coordinates(
    df,
    top_n_tags,
    rec_threshold,
    noise_threshold=0.05,
    random_state=42
):
    """Project the most frequent tags onto 2-D with MDS.

    This is the expensive part of the MDS map.
    NOTE(review): an earlier note said the result is cached, but no cache
    decorator is applied to this function here — confirm whether caching is
    expected at this level.

    Args:
        df: Frame with ``genres_*`` / ``categories_*`` tag columns and,
            optionally, a ``recommendations`` column.
        top_n_tags: Number of most frequent tags to keep.
        rec_threshold: Rows must have ``recommendations`` strictly above this
            value (ignored when the column is absent).
        noise_threshold: Correlations whose absolute value is below this are
            treated as 0.
        random_state: Seed passed to ``MDS``.

    Returns:
        ``(pos, target_tags, tag_counts, n_rows)`` where ``pos`` is the MDS
        coordinate array, ``target_tags`` the tag column names in row order of
        ``pos``, ``tag_counts`` their summed counts and ``n_rows`` the number
        of rows left after filtering. ``(None, None, None, None)`` is returned
        when no rows survive the filter or fewer than two tags are available.
    """
    no_result = (None, None, None, None)

    # 1. Filter rows by the recommendations threshold.
    if 'recommendations' in df.columns:
        df_filtered = df[df['recommendations'] > rec_threshold]
    else:
        df_filtered = df.copy()

    # Nothing left after filtering: return early.
    if df_filtered.empty:
        return no_result

    # 2. Pick the top-N tags by total count.
    tag_cols = [c for c in df.columns if c.startswith(('genres_', 'categories_'))]
    tag_counts = df_filtered[tag_cols].sum().sort_values(ascending=False).head(top_n_tags)
    target_tags = tag_counts.index.tolist()

    # A 2-D layout needs at least two points.
    if len(target_tags) < 2:
        return no_result

    # 3. Correlation matrix -> distance matrix.
    corr_matrix = df_filtered[target_tags].corr().fillna(0)
    corr_matrix[corr_matrix.abs() < noise_threshold] = 0
    distance_matrix = 1 - np.abs(corr_matrix.values)
    # A constant column has NaN correlation with everything, itself included;
    # after fillna(0) its self-distance would be 1. A tag is always at
    # distance 0 from itself, so pin the diagonal.
    np.fill_diagonal(distance_matrix, 0.0)

    # 4. MDS dimensionality reduction on the precomputed distances.
    mds = MDS(
        n_components=2,
        dissimilarity="precomputed",
        random_state=random_state,
        n_init=4,
        max_iter=300,
        normalized_stress="auto"
    )
    pos = mds.fit_transform(distance_matrix)

    return pos, target_tags, tag_counts, df_filtered.shape[0]
# Main entry point for drawing the MDS tag map.
# -------------------------------------------------------
def plot_mds_streamlit(
    df,
    mapping=genres_map,
    color_dict=FIXED_GENRE_COLORS,
    font_path="fonts/NotoSansTC-Regular.ttf",
    fade_alpha=0.8
):
    """Draw the interactive MDS tag map in the current Streamlit container.

    Two sliders pick the number of tags and the recommendations threshold;
    the coordinates come from ``compute_mds_coordinates`` and are drawn as a
    labelled bubble chart.

    Args:
        df: Frame with ``genres_*`` / ``categories_*`` tag columns and,
            optionally, a ``recommendations`` column.
        mapping: Dict from tag column name to display label. Tags without an
            entry are shown with their prefix stripped.
        color_dict: Dict from tag column name to a colour tuple of 0.0-1.0
            floats.
        font_path: Font file used for labels and title when it exists.
        fade_alpha: Opacity of ``categories_*`` bubbles. Other configured tags
            are drawn at 0.8 and unconfigured tags in grey at 0.7.

    Returns:
        None. Output goes to Streamlit.
    """
    st.subheader("🔗 遊戲標籤關聯圖(Genre對Categor) (MDS Map)")

    all_tag_cols = [c for c in df.columns if c.startswith(('genres_', 'categories_'))]
    max_tags = len(all_tag_cols)
    if max_tags < 2:
        st.warning("⚠️ 篩選條件過於嚴格,沒有足夠的遊戲數據可供分析。")
        return

    # --- A. Sliders (side by side in two columns) ---
    col1, col2 = st.columns(2)

    with col1:
        # st.slider rejects min_value >= max_value and a default above the
        # maximum, so adapt both to the number of tag columns.
        min_tags = min(10, max_tags)
        if min_tags < max_tags:
            top_n = st.slider(
                "📍 顯示 Top N 標籤數量",
                min_value=min_tags,
                max_value=max_tags,
                value=min(20, max_tags),
                step=1
            )
        else:
            top_n = max_tags

    with col2:
        if 'recommendations' in df.columns:
            highest = df['recommendations'].max()
            # max() is NaN when the column holds no valid values.
            max_rec = 0 if pd.isna(highest) else int(highest)
        else:
            max_rec = 10000
        if max_rec > 0:
            rec_th = st.slider(
                "👍 推薦數門檻 (過濾冷門遊戲)",
                min_value=0,
                max_value=max_rec,
                # Keep default and step inside the range on small datasets.
                value=1000 if max_rec >= 1000 else 0,
                step=500 if max_rec >= 500 else 1
            )
        else:
            rec_th = 0

    # --- B. Compute the coordinates ---
    with st.spinner("正在計算標籤距離..."):
        pos, target_tags, tag_counts, sample_size = compute_mds_coordinates(
            df, top_n, rec_th
        )

    if pos is None:
        st.warning("⚠️ 篩選條件過於嚴格,沒有足夠的遊戲數據可供分析。")
        return

    # --- C. Prepare the plot ---
    # A CJK-capable font is needed, otherwise matplotlib cannot render the
    # Chinese labels.
    if font_path and os.path.exists(font_path):
        prop = font_manager.FontProperties(fname=font_path)
    else:
        prop = None  # fall back to matplotlib's default font

    def get_color(tag):
        # Opacity lives in each RGBA colour: a scatter-level alpha would
        # overwrite the per-colour alpha and cancel the category fade.
        if tag in color_dict:
            alpha = fade_alpha if tag.startswith('categories_') else 0.8
            return fade_color(color_dict[tag], alpha=alpha)
        return (0.5, 0.5, 0.5, 0.7)  # grey for unconfigured tags

    final_colors = [get_color(t) for t in target_tags]

    fig, ax = plt.subplots(figsize=(14, 10))
    ax.margins(0.1)  # 10% padding around the data; raise to 0.2 if cramped

    # Bubble area and font size scale with each tag's share of the largest
    # count; guard against an all-zero selection.
    counts = tag_counts.values
    peak = counts.max()
    scale = peak if peak > 0 else 1
    sizes = (counts / scale) * 10000

    ax.scatter(
        pos[:, 0], pos[:, 1],
        s=sizes,
        c=final_colors,
        edgecolors=[(1.0, 1.0, 1.0, 0.8)],
        linewidth=1.5
    )

    # Labels
    for i, tag in enumerate(target_tags):
        label_text = mapping.get(tag, tag.replace('genres_', '').replace('categories_', ''))
        font_size = max(9, min(16, int(counts[i] / scale * 20)))
        is_genre = tag in color_dict and not tag.startswith('categories_')

        ax.text(
            pos[i, 0], pos[i, 1],
            label_text,
            ha='center', va='center',
            fontsize=font_size,
            fontproperties=prop,
            fontweight='bold' if is_genre else 'normal',
            color='black' if is_genre else '#333333'
        )

    ax.set_title(
        f"MDS Tag Map (樣本數: {sample_size} 款遊戲)",
        fontsize=18,
        fontproperties=prop,
        pad=20
    )
    ax.axis('off')  # hide the axes for a cleaner picture

    # --- D. Send to Streamlit ---
    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def plot_tag_heatmap(
    data,
    font_path=None,
    top_n=20,
    figsize=(14, 12)
):
    """Draw a lower-triangle correlation heatmap of the most frequent tags.

    Args:
        data: Frame with ``genres_*`` / ``categories_*`` tag columns.
        font_path: Font file for tick labels and title; used only when the
            file exists.
        top_n: Maximum number of tags (by total count) to include.
        figsize: Matplotlib figure size.

    Returns:
        None. The figure is drawn with ``st.pyplot``; a warning is shown
        instead when fewer than two tags are available.
    """
    st.subheader("🔗 遊戲標籤關係係數圖 (Corr Heatmap)")

    all_tag_cols = [c for c in data.columns if c.startswith(('genres_', 'categories_'))]

    # Keep the top-N most frequent tags.
    tag_counts = data[all_tag_cols].sum().sort_values(ascending=False).head(top_n)
    top_tags = tag_counts.index.tolist()

    if len(top_tags) < 2:
        st.warning("⚠️ 數據過濾後標籤過少,無法繪製熱力圖。")
        return

    corr_matrix = data[top_tags].corr()

    # np.triu marks the upper triangle, diagonal included, as True; those
    # cells are hidden. Pass k=1 to np.triu to keep the diagonal visible.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    tick_labels = [t.replace('genres_', '').replace('categories_', '') for t in top_tags]

    prop = None
    if font_path and os.path.exists(font_path):
        prop = font_manager.FontProperties(fname=font_path, size=12)

    fig, ax = plt.subplots(figsize=figsize)

    # Labels are handed to seaborn directly: with automatic tick labels it may
    # thin the ticks, and relabelling N names onto fewer ticks fails.
    sns.heatmap(
        corr_matrix,
        mask=mask,
        annot=True,   # print the coefficient in each cell
        fmt=".2f",
        cmap='coolwarm',
        vmin=-1, vmax=1,
        ax=ax,
        cbar_kws={'shrink': 0.8},
        square=True,  # square cells
        xticklabels=tick_labels,
        yticklabels=tick_labels
    )

    for label in ax.get_xticklabels():
        label.set_rotation(45)
        label.set_ha('right')
        if prop is not None:
            label.set_fontproperties(prop)

    for label in ax.get_yticklabels():
        label.set_rotation(0)
        if prop is not None:
            label.set_fontproperties(prop)

    # Report the number of tags actually plotted, which can be below top_n.
    ax.set_title(
        f"Top {len(top_tags)} Tags Correlation (Lower Triangle)",
        fontproperties=prop,
        fontsize=18,
        pad=20
    )

    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)