"""Streamlit demo page for browsing KOL trust scores built from exported data files."""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Any

import joblib
import pandas as pd
import plotly.express as px
import streamlit as st

logger = logging.getLogger(__name__)

APP_DIR = Path(__file__).resolve().parent
DATA_DIR = APP_DIR / "data"
FEATURES_PATH = DATA_DIR / "silver" / "unified" / "vietnam_influencer_features.csv"
TRUST_SCORES_PATH = DATA_DIR / "gold" / "features" / "trust_scores.csv"
EVENTS_PATH = DATA_DIR / "serving" / "kol_events.jsonl"
MANUAL_LABELS_PATH = DATA_DIR / "input" / "manual_trust_labels.csv"
DATASET_STATS_PATH = DATA_DIR / "dataset_stats.json"
MODEL_PATH = APP_DIR / "ml" / "models" / "trust_score" / "kol_trust_model.joblib"

# Columns coerced to numbers (missing/unparseable -> 0.0) by normalize_features().
NUMERIC_FEATURES = [
    "follower_count",
    "content_count",
    "view_count",
    "like_count",
    "comment_count",
    "share_count",
    "engagement_rate",
    "likes_per_view",
    "comments_per_view",
    "shares_per_view",
    "sentiment_score",
    "positive_comment_count",
    "neutral_comment_count",
    "negative_comment_count",
    "upload_frequency",
    "follower_growth_rate",
    "activity_score",
    "is_suspicious",
]
# Columns coerced to strings (missing -> "unknown") by normalize_features().
CATEGORICAL_FEATURES = ["platform"]

# Key columns that identify one content row when joining manual labels.
LABEL_KEY_COLUMNS = ["platform", "creator_id", "content_id"]

st.set_page_config(page_title="KOLTrust Demo", page_icon=":bar_chart:", layout="wide")


def trust_label(score: float) -> str:
    """Map a score to a label: >= 70 is high, >= 45 is medium, anything else is low."""
    if score >= 70:
        return "high_trust"
    if score >= 45:
        return "medium_trust"
    return "low_trust"


def risk_profile(label: str) -> str:
    """Map a trust label to its risk profile name; unknown labels map to "risky"."""
    if label == "high_trust":
        return "trusted"
    if label == "medium_trust":
        return "watch"
    return "risky"


def to_float(value: Any, default: float = 0.0) -> float:
    """Convert *value* to float, returning *default* for None, "" or unparseable input.

    NaN inputs are passed through unchanged (``float(nan)`` succeeds); callers
    that must not see NaN have to check for it themselves.
    """
    try:
        if value is None or value == "":
            return default
        return float(value)
    except (TypeError, ValueError):
        return default


def _as_flag(value: Any, default: bool = False) -> bool:
    """Interpret *value* as a boolean flag, using *default* when it is missing.

    ``bool(float("nan"))`` is True, so NaN has to be handled explicitly;
    otherwise every row without a value would be reported as flagged.
    """
    number = to_float(value, float(default))
    if pd.isna(number):
        return default
    return bool(number)


def _text_or(value: Any, fallback: str) -> str:
    """Return ``str(value)`` unless *value* is None, NaN or falsy, then *fallback*.

    A plain ``value or fallback`` is not enough: NaN is truthy and would be
    rendered as the string "nan".
    """
    if pd.isna(value) or not value:
        return fallback
    return str(value)


@st.cache_data(show_spinner=False)
def read_csv(path: Path) -> pd.DataFrame:
    """Read a CSV file (utf-8-sig) into a DataFrame; empty frame if the file is missing."""
    if not path.exists():
        return pd.DataFrame()
    return pd.read_csv(path, encoding="utf-8-sig")


@st.cache_data(show_spinner=False)
def read_json(path: Path) -> dict[str, Any]:
    """Parse a UTF-8 JSON file; returns an empty dict if the file is missing."""
    if not path.exists():
        return {}
    return json.loads(path.read_text(encoding="utf-8"))


@st.cache_data(show_spinner=False)
def read_jsonl(path: Path) -> pd.DataFrame:
    """Read a JSON Lines file into a DataFrame, one row per non-blank line.

    Returns an empty frame if the file is missing. Lines that are not valid
    JSON are skipped and logged instead of aborting the whole page.
    """
    if not path.exists():
        return pd.DataFrame()
    rows = []
    # Iterate the file rather than str.splitlines(): splitlines() also breaks
    # on characters such as U+2028 that may legally appear inside JSON strings.
    with path.open(encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                logger.warning("Skipping malformed JSON on line %d of %s", line_number, path)
    return pd.DataFrame(rows)


@st.cache_resource(show_spinner=False)
def load_trust_model() -> dict[str, Any] | None:
    """Load the model payload from MODEL_PATH.

    Returns the loaded object if it is a dict, otherwise wraps it as
    ``{"model": payload}``. Returns None when the file is missing or cannot
    be loaded, so callers can fall back to the baseline score.
    """
    if not MODEL_PATH.exists():
        return None
    try:
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; MODEL_PATH must only ever point at a trusted artifact.
        payload = joblib.load(MODEL_PATH)
    except Exception:
        # Best-effort by design: a broken model file must not take the page down.
        logger.exception("Could not load trust model from %s", MODEL_PATH)
        return None
    return payload if isinstance(payload, dict) else {"model": payload}


def normalize_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with feature columns present and consistently typed.

    Numeric feature columns are created if missing and coerced to numbers with
    0.0 for missing/unparseable values; categorical ones are created if missing
    and coerced to strings with "unknown" for missing values. The score columns
    ``trust_score`` and ``human_trust_score`` are coerced to numeric when
    present (NaN is kept). An empty *df* is returned as-is.
    """
    if df.empty:
        return df
    data = df.copy()
    for column in NUMERIC_FEATURES:
        if column not in data.columns:
            data[column] = 0.0
        data[column] = pd.to_numeric(data[column], errors="coerce").fillna(0.0)
    for column in CATEGORICAL_FEATURES:
        if column not in data.columns:
            data[column] = "unknown"
        data[column] = data[column].fillna("unknown").astype(str)
    for column in ("trust_score", "human_trust_score"):
        if column in data.columns:
            data[column] = pd.to_numeric(data[column], errors="coerce")
    return data


def attach_manual_labels(features: pd.DataFrame, manual: pd.DataFrame) -> pd.DataFrame:
    """Left-join manual label columns onto *features* by platform/creator/content id.

    *features* is returned unchanged when either frame is empty or when either
    one lacks a key column. The key columns are compared as strings. The input
    frames are not modified.
    """
    if features.empty or manual.empty:
        return features
    required = set(LABEL_KEY_COLUMNS)
    if not required.issubset(features.columns) or not required.issubset(manual.columns):
        return features
    label_columns = [
        "platform",
        "creator_id",
        "content_id",
        "human_trust_score",
        "human_trust_label",
        "human_is_suspicious",
        "label_reason",
        "annotator",
        "labeled_at",
    ]
    labels = manual[[column for column in label_columns if column in manual.columns]].copy()
    # Work on a copy so the caller's frame keeps its original key dtypes.
    keyed = features.copy()
    for column in LABEL_KEY_COLUMNS:
        labels[column] = labels[column].astype(str)
        keyed[column] = keyed[column].astype(str)
    # Duplicate keys would multiply content rows in the merge. Keeping the last
    # occurrence assumes later rows supersede earlier ones - TODO confirm with
    # whoever maintains the label file.
    labels = labels.drop_duplicates(subset=LABEL_KEY_COLUMNS, keep="last")
    return keyed.merge(labels, on=LABEL_KEY_COLUMNS, how="left")


def predict_scores(features: pd.DataFrame) -> pd.DataFrame:
    """Add ``model_trust_score`` and ``score_source`` columns to normalized features.

    When a model is available and prediction succeeds, scores are the model
    output clipped to [0, 100] and the source is "trained_model". Otherwise the
    existing ``trust_score`` column (NaN if absent) is used and the source is
    "baseline_rule". An empty input is returned as-is.
    """
    data = normalize_features(features)
    if data.empty:
        return data
    payload = load_trust_model()
    model = payload.get("model") if payload else None
    if model is not None:
        try:
            # Column selection is inside the try block on purpose: a model that
            # expects columns this dataset lacks should fall back, not crash.
            numeric_features = list(payload.get("numeric_features") or NUMERIC_FEATURES)
            categorical_features = list(payload.get("categorical_features") or CATEGORICAL_FEATURES)
            model_input = data[numeric_features + categorical_features].copy()
            data["model_trust_score"] = model.predict(model_input).clip(0, 100)
            data["score_source"] = "trained_model"
            return data
        except Exception:
            # Best-effort boundary around an opaque model object.
            logger.exception("Trust model prediction failed; using baseline scores")
    data["model_trust_score"] = data.get("trust_score", pd.Series(dtype=float))
    data["score_source"] = "baseline_rule"
    return data


def final_score(row: pd.Series) -> tuple[float, str, str]:
    """Pick the score to display for one row as ``(score, label, source)``.

    Priority: human score, then model score, then the baseline ``trust_score``.
    A missing human label or score source falls back to the label derived from
    the score and to "trained_model" respectively.
    """
    if pd.notna(row.get("human_trust_score")):
        score = to_float(row.get("human_trust_score"))
        return score, _text_or(row.get("human_trust_label"), trust_label(score)), "human_label"
    if pd.notna(row.get("model_trust_score")):
        score = to_float(row.get("model_trust_score"))
        return score, trust_label(score), _text_or(row.get("score_source"), "trained_model")
    score = to_float(row.get("trust_score"))
    return score, trust_label(score), "baseline_rule"


def _content_record(row: pd.Series) -> dict[str, Any]:
    """Build one display record (a dict) from a feature row."""
    score, label, source = final_score(row)
    # A human flag wins when present; otherwise the rule-based flag is used.
    rule_flag = _as_flag(row.get("is_suspicious"))
    return {
        "platform": row.get("platform"),
        "creator_id": row.get("creator_id"),
        "creator_name": row.get("creator_name"),
        "content_id": row.get("content_id"),
        "content_title": row.get("content_title"),
        "publish_time": row.get("publish_time"),
        "view_count": int(to_float(row.get("view_count"))),
        "like_count": int(to_float(row.get("like_count"))),
        "comment_count": int(to_float(row.get("comment_count"))),
        "share_count": int(to_float(row.get("share_count"))),
        "engagement_rate": to_float(row.get("engagement_rate")),
        "sentiment_score": to_float(row.get("sentiment_score")),
        "activity_score": to_float(row.get("activity_score")),
        "is_suspicious": _as_flag(row.get("human_is_suspicious"), rule_flag),
        "trust_score": round(score, 2),
        "trust_label": label,
        "risk_profile": risk_profile(label),
        "score_source": source,
        "label_reason": row.get("label_reason"),
        "annotator": row.get("annotator"),
        "labeled_at": row.get("labeled_at"),
    }


def build_content_table(features: pd.DataFrame) -> pd.DataFrame:
    """Build the per-content display table, one output row per row of *features*.

    An empty *features* frame yields an empty frame with no columns.
    """
    return pd.DataFrame([_content_record(row) for _, row in features.iterrows()])


def creator_summary(frame: pd.DataFrame) -> pd.DataFrame:
    """Aggregate content rows per (creator_id, creator_name, platform).

    Produces mean ``trust_score``, summed ``views`` and a ``content`` count.
    """
    return (
        frame.groupby(["creator_id", "creator_name", "platform"], dropna=False)
        .agg(
            trust_score=("trust_score", "mean"),
            views=("view_count", "sum"),
            content=("content_id", "count"),
        )
        .reset_index()
    )


# --- Data loading ---------------------------------------------------------

features = read_csv(FEATURES_PATH)
manual_labels = read_csv(MANUAL_LABELS_PATH)
trust_scores = read_csv(TRUST_SCORES_PATH)
events = read_jsonl(EVENTS_PATH)
stats = read_json(DATASET_STATS_PATH)

features = attach_manual_labels(features, manual_labels)
features = predict_scores(features)
content = build_content_table(features)

# --- Page header ----------------------------------------------------------

st.title("KOLTrust Demo")
st.caption("Offline demo for KOL trust evaluation from exported public social metrics.")
st.info("The KOL data in this demo is for simulation purposes only and not real.")

if content.empty:
    st.error("Demo data was not found. Check the data folder in this Space.")
    st.stop()

# --- Sidebar filters ------------------------------------------------------

with st.sidebar:
    st.header("Controls")
    platforms = sorted(content["platform"].dropna().unique().tolist())
    selected_platforms = st.multiselect("Platform", platforms, default=platforms)
    label_options = ["high_trust", "medium_trust", "low_trust"]
    selected_labels = st.multiselect("Trust label", label_options, default=label_options)
    source_options = sorted(content["score_source"].dropna().unique().tolist())
    selected_sources = st.multiselect("Score source", source_options, default=source_options)
    min_views = st.number_input("Min views", min_value=0, value=0, step=1000)

filtered = content[
    content["platform"].isin(selected_platforms)
    & content["trust_label"].isin(selected_labels)
    & content["score_source"].isin(selected_sources)
    & (content["view_count"] >= min_views)
].copy()

# --- Headline metrics -----------------------------------------------------

metric_cols = st.columns(5)
metric_cols[0].metric("Content rows", f"{len(filtered):,}")
metric_cols[1].metric("Creators", f"{filtered['creator_id'].nunique():,}")
# Counted over all content, not only the filtered subset.
metric_cols[2].metric("Manual labels", f"{int((content['score_source'] == 'human_label').sum()):,}")
metric_cols[3].metric("Avg trust", f"{filtered['trust_score'].mean():.1f}" if not filtered.empty else "0")
metric_cols[4].metric("Suspicious", f"{int(filtered['is_suspicious'].sum()):,}")

tab_overview, tab_evaluate, tab_data = st.tabs(["Overview", "Evaluate KOL", "Data"])

with tab_overview:
    chart_cols = st.columns([1.1, 0.9])
    with chart_cols[0]:
        top_creators = creator_summary(filtered).sort_values("trust_score", ascending=False).head(20)
        fig = px.bar(
            # Re-sorted ascending so the highest score ends up at the top of
            # the horizontal bar chart.
            top_creators.sort_values("trust_score"),
            x="trust_score",
            y="creator_name",
            color="platform",
            orientation="h",
            range_x=[0, 100],
            labels={"trust_score": "Avg trust score", "creator_name": ""},
        )
        fig.update_layout(height=500, margin=dict(l=10, r=10, t=20, b=10))
        st.plotly_chart(fig, use_container_width=True)
    with chart_cols[1]:
        label_counts = filtered["trust_label"].value_counts().rename_axis("trust_label").reset_index(name="rows")
        fig = px.pie(label_counts, names="trust_label", values="rows", hole=0.45)
        fig.update_layout(height=500, margin=dict(l=10, r=10, t=20, b=10))
        st.plotly_chart(fig, use_container_width=True)
    st.subheader("Top creator leaderboard")
    st.dataframe(top_creators, use_container_width=True, hide_index=True)

with tab_evaluate:
    creators = creator_summary(filtered).sort_values(["trust_score", "views"], ascending=[False, False])
    if creators.empty:
        st.info("No creators match the selected filters.")
    else:
        # Each option carries both id and platform: the options are grouped per
        # platform, so the same creator_id on two platforms must stay separate.
        creator_options = {
            f"{row.creator_name} ({row.platform}, {row.creator_id})": (row.creator_id, row.platform)
            for row in creators.itertuples(index=False)
        }
        selected_label = st.selectbox("Select KOL", list(creator_options.keys()))
        creator_id, creator_platform = creator_options[selected_label]
        creator_rows = filtered[
            (filtered["creator_id"].astype(str) == str(creator_id))
            & (filtered["platform"].astype(str) == str(creator_platform))
        ].sort_values("publish_time", ascending=False)
        latest = creator_rows.iloc[0]
        avg_score = creator_rows["trust_score"].mean()
        kol_label = trust_label(avg_score)

        cols = st.columns(5)
        cols[0].metric("Creator", str(latest["creator_name"]))
        cols[1].metric("Avg trust", f"{avg_score:.1f}")
        cols[2].metric("Label", kol_label)
        cols[3].metric("Contents", f"{len(creator_rows):,}")
        cols[4].metric("Total views", f"{int(creator_rows['view_count'].sum()):,}")

        signal_rows = pd.DataFrame(
            [
                {"signal": "Engagement rate", "value": f"{creator_rows['engagement_rate'].mean() * 100:.2f}%"},
                {"signal": "Sentiment score", "value": f"{creator_rows['sentiment_score'].mean():.1f}/100"},
                {"signal": "Activity score", "value": f"{creator_rows['activity_score'].mean():.1f}/100"},
                {"signal": "Suspicious content", "value": int(creator_rows["is_suspicious"].sum())},
                {"signal": "Score source", "value": ", ".join(sorted(creator_rows["score_source"].unique()))},
            ]
        )
        st.subheader("Assessment signals")
        st.dataframe(signal_rows, use_container_width=True, hide_index=True)

        fig = px.scatter(
            creator_rows,
            x="view_count",
            y="trust_score",
            size="like_count",
            color="trust_label",
            hover_data=["content_title", "score_source", "engagement_rate", "sentiment_score"],
            range_y=[0, 100],
        )
        fig.update_layout(height=380, margin=dict(l=10, r=10, t=20, b=10))
        st.plotly_chart(fig, use_container_width=True)

        st.subheader("Content evidence")
        evidence_columns = [
            "content_title",
            "publish_time",
            "view_count",
            "like_count",
            "comment_count",
            "engagement_rate",
            "sentiment_score",
            "trust_score",
            "trust_label",
            "score_source",
            "label_reason",
        ]
        st.dataframe(creator_rows[evidence_columns], use_container_width=True, hide_index=True)

with tab_data:
    st.subheader("Filtered content")
    show_columns = [
        "platform",
        "creator_name",
        "content_title",
        "view_count",
        "engagement_rate",
        "sentiment_score",
        "activity_score",
        "is_suspicious",
        "trust_score",
        "trust_label",
        "score_source",
    ]
    st.dataframe(
        filtered[show_columns].sort_values("trust_score", ascending=False),
        use_container_width=True,
        hide_index=True,
    )
    with st.expander("Dataset stats"):
        st.json(stats)
    with st.expander("Manual labels"):
        st.dataframe(manual_labels, use_container_width=True, hide_index=True)
    with st.expander("Raw serving events sample"):
        st.dataframe(events.head(200), use_container_width=True, hide_index=True)
    with st.expander("Creator trust scores"):
        st.dataframe(trust_scores, use_container_width=True, hide_index=True)