Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| from typing import Any | |
| import joblib | |
| import pandas as pd | |
| import plotly.express as px | |
| import streamlit as st | |
# Filesystem layout: every data and model artifact is resolved relative to
# the directory that contains this file.
APP_DIR = Path(__file__).resolve().parent
DATA_DIR = APP_DIR.joinpath("data")
FEATURES_PATH = DATA_DIR.joinpath("silver", "unified", "vietnam_influencer_features.csv")
TRUST_SCORES_PATH = DATA_DIR.joinpath("gold", "features", "trust_scores.csv")
EVENTS_PATH = DATA_DIR.joinpath("serving", "kol_events.jsonl")
MANUAL_LABELS_PATH = DATA_DIR.joinpath("input", "manual_trust_labels.csv")
DATASET_STATS_PATH = DATA_DIR.joinpath("dataset_stats.json")
MODEL_PATH = APP_DIR.joinpath("ml", "models", "trust_score", "kol_trust_model.joblib")

# Columns that normalize_features() coerces to numbers (missing -> 0.0) and that
# predict_scores() feeds to the model when the model payload lists none itself.
NUMERIC_FEATURES = [
    "follower_count",
    "content_count",
    "view_count",
    "like_count",
    "comment_count",
    "share_count",
    "engagement_rate",
    "likes_per_view",
    "comments_per_view",
    "shares_per_view",
    "sentiment_score",
    "positive_comment_count",
    "neutral_comment_count",
    "negative_comment_count",
    "upload_frequency",
    "follower_growth_rate",
    "activity_score",
    "is_suspicious",
]

# Columns that normalize_features() coerces to strings (missing -> "unknown").
CATEGORICAL_FEATURES = ["platform"]
# Browser-tab title and icon, and the wide page layout for the dashboard.
st.set_page_config(
    page_title="KOLTrust Demo",
    page_icon=":bar_chart:",
    layout="wide",
)
def trust_label(score: float) -> str:
    """Bucket a trust score into its label.

    Args:
        score: Trust score to classify.

    Returns:
        ``"high_trust"`` for scores of at least 70, ``"medium_trust"`` for
        scores of at least 45, and ``"low_trust"`` otherwise. NaN fails both
        comparisons and therefore also yields ``"low_trust"``.
    """
    # Thresholds are checked from the highest bucket down.
    for threshold, label in ((70, "high_trust"), (45, "medium_trust")):
        if score >= threshold:
            return label
    return "low_trust"
def risk_profile(label: str) -> str:
    """Translate a trust label into a risk profile name.

    Args:
        label: A trust label such as the ones produced by ``trust_label``.

    Returns:
        ``"trusted"`` for ``"high_trust"``, ``"watch"`` for ``"medium_trust"``
        and ``"risky"`` for every other value.
    """
    profiles = {"high_trust": "trusted", "medium_trust": "watch"}
    return profiles.get(label, "risky")
def to_float(value: Any, default: float = 0.0) -> float:
    """Convert *value* to ``float``, falling back to *default*.

    Args:
        value: Anything ``float()`` may accept.
        default: Result used when *value* is ``None``, the empty string, or
            cannot be converted.

    Returns:
        The converted number, or *default*. NaN input is returned as NaN.
    """
    try:
        # The emptiness check stays inside the ``try`` so that values whose
        # comparison or truth test raises are also mapped to the default.
        is_missing = value is None or value == ""
        return default if is_missing else float(value)
    except (TypeError, ValueError):
        return default
def read_csv(path: Path) -> pd.DataFrame:
    """Read a CSV file, tolerating a missing file.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed frame, or an empty ``DataFrame`` when *path* does not exist.
    """
    if path.exists():
        # "utf-8-sig" drops a leading byte-order mark when one is present.
        return pd.read_csv(path, encoding="utf-8-sig")
    return pd.DataFrame()
def read_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON document, tolerating a missing file.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded document, or an empty dict when *path* does not exist.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return {}
def read_jsonl(path: Path) -> pd.DataFrame:
    """Read a UTF-8 JSON Lines file into a frame, one row per non-blank line.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        A ``DataFrame`` built from the decoded records, or an empty
        ``DataFrame`` when *path* does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not path.exists():
        return pd.DataFrame()
    # Iterate the file handle instead of ``str.splitlines()``: splitlines also
    # breaks on U+2028, U+2029 and U+0085, which may appear unescaped inside a
    # JSON string and would cut that record in half. Text-mode iteration only
    # splits on "\n", "\r" and "\r\n".
    with path.open(encoding="utf-8") as handle:
        rows = [json.loads(line) for line in handle if line.strip()]
    return pd.DataFrame(rows)
def load_trust_model() -> dict[str, Any] | None:
    """Load the trust-model payload stored at ``MODEL_PATH``.

    Returns:
        The loaded object when it is a dict; any other loaded object wrapped
        as ``{"model": obj}``; ``None`` when the file is missing or loading
        fails for any reason.
    """
    if MODEL_PATH.exists():
        try:
            loaded = joblib.load(MODEL_PATH)
        except Exception:
            # Best-effort: an unreadable model means "no model available".
            return None
        if isinstance(loaded, dict):
            return loaded
        return {"model": loaded}
    return None
def normalize_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with model feature columns present and typed.

    Args:
        df: Raw feature rows.

    Returns:
        *df* itself when it is empty. Otherwise a copy in which every
        ``NUMERIC_FEATURES`` column is numeric with gaps filled by 0.0, every
        ``CATEGORICAL_FEATURES`` column is a string with gaps filled by
        ``"unknown"``, and ``trust_score`` / ``human_trust_score`` (when
        present) are numeric with unparseable values left as NaN.
    """
    if df.empty:
        return df

    frame = df.copy()

    for name in NUMERIC_FEATURES:
        if name in frame.columns:
            frame[name] = pd.to_numeric(frame[name], errors="coerce").fillna(0.0)
        else:
            frame[name] = 0.0

    for name in CATEGORICAL_FEATURES:
        if name not in frame.columns:
            frame[name] = "unknown"
        frame[name] = frame[name].fillna("unknown").astype(str)

    # Score columns keep NaN so later code can tell "no score" from 0.
    present_scores = [name for name in ("trust_score", "human_trust_score") if name in frame.columns]
    for name in present_scores:
        frame[name] = pd.to_numeric(frame[name], errors="coerce")

    return frame
def attach_manual_labels(features: pd.DataFrame, manual: pd.DataFrame) -> pd.DataFrame:
    """Left-join manual trust labels onto feature rows.

    Rows are matched on ``platform``, ``creator_id`` and ``content_id``, all
    compared as strings.

    Args:
        features: Feature rows, one per content item.
        manual: Manually entered labels.

    Returns:
        *features* unchanged when either frame is empty or lacks a key column.
        Otherwise a new frame with the available label columns appended; its
        key columns are strings. The caller's *features* is never modified.
    """
    if features.empty or manual.empty:
        return features

    key_columns = ["platform", "creator_id", "content_id"]
    required = set(key_columns)
    if not required.issubset(features.columns) or not required.issubset(manual.columns):
        return features

    label_columns = [
        *key_columns,
        "human_trust_score",
        "human_trust_label",
        "human_is_suspicious",
        "label_reason",
        "annotator",
        "labeled_at",
    ]
    labels = manual[[column for column in label_columns if column in manual.columns]].copy()

    # Cast keys on a copy so the caller's frame keeps its original dtypes.
    keyed_features = features.copy()
    for column in key_columns:
        labels[column] = labels[column].astype(str)
        keyed_features[column] = keyed_features[column].astype(str)

    # Several label rows for one content item would fan a single feature row
    # out into duplicates. Keep the last one listed.
    # NOTE(review): assumes later rows in the label file supersede earlier ones - confirm.
    labels = labels.drop_duplicates(subset=key_columns, keep="last")

    return keyed_features.merge(labels, on=key_columns, how="left")
def predict_scores(features: pd.DataFrame) -> pd.DataFrame:
    """Add ``model_trust_score`` and ``score_source`` columns to the features.

    Args:
        features: Feature rows; they are normalized with
            ``normalize_features`` first.

    Returns:
        The normalized frame. With a usable model, ``model_trust_score`` holds
        its predictions clipped to 0-100 and ``score_source`` is
        ``"trained_model"``. Without a model, or when building the model input
        or predicting fails, ``model_trust_score`` copies ``trust_score`` (NaN
        if that column is absent) and ``score_source`` is ``"baseline_rule"``.
        An empty input is returned as-is, without the extra columns.
    """
    import logging

    data = normalize_features(features)
    if data.empty:
        return data

    def apply_baseline() -> pd.DataFrame:
        # An empty Series aligns to all-NaN when ``trust_score`` is missing.
        data["model_trust_score"] = data.get("trust_score", pd.Series(dtype=float))
        data["score_source"] = "baseline_rule"
        return data

    payload = load_trust_model()
    model = payload.get("model") if payload else None
    if model is None:
        return apply_baseline()

    try:
        # Column selection lives inside the ``try`` so a payload that names a
        # column missing from ``data`` degrades to the baseline instead of
        # raising KeyError. ``list()`` lets the payload store tuples.
        numeric_features = list(payload.get("numeric_features") or NUMERIC_FEATURES)
        categorical_features = list(payload.get("categorical_features") or CATEGORICAL_FEATURES)
        model_input = data[numeric_features + categorical_features].copy()
        data["model_trust_score"] = model.predict(model_input).clip(0, 100)
        data["score_source"] = "trained_model"
    except Exception:
        # Deliberate best-effort: the app must still render when the model is
        # incompatible with the data, but the reason is recorded in the log.
        logging.getLogger(__name__).warning(
            "Trust model prediction failed; using baseline scores.", exc_info=True
        )
        return apply_baseline()
    return data
def _clean_text(value: Any) -> str:
    """Return *value* as stripped text, or ``""`` when it is None or NaN."""
    if pd.isna(value):
        return ""
    return str(value).strip()


def final_score(row: pd.Series) -> tuple[float, str, str]:
    """Pick the score to display for one content row.

    Precedence: human label, then model score, then the ``trust_score`` column.

    Args:
        row: One feature row; missing keys are treated like NaN values.

    Returns:
        ``(score, label, source)``. For a human-labelled row the label is
        ``human_trust_label`` when it holds text, else derived from the score,
        and the source is ``"human_label"``. For a model-scored row the source
        is the row's ``score_source`` when it holds text, else
        ``"trained_model"``. Otherwise the source is ``"baseline_rule"``.
    """
    human_score = row.get("human_trust_score")
    if pd.notna(human_score):
        score = to_float(human_score)
        # NaN is truthy, so a bare ``value or fallback`` would let a missing
        # label through as the string "nan". Clean it before falling back.
        label = _clean_text(row.get("human_trust_label")) or trust_label(score)
        return score, label, "human_label"

    model_score = row.get("model_trust_score")
    if pd.notna(model_score):
        score = to_float(model_score)
        source = _clean_text(row.get("score_source")) or "trained_model"
        return score, trust_label(score), source

    score = to_float(row.get("trust_score"))
    return score, trust_label(score), "baseline_rule"
def _to_count(value: Any) -> int:
    """Convert *value* to ``int``; missing, NaN and infinite values become 0."""
    number = to_float(value)
    # ``int()`` raises on NaN and on infinities. NaN is the only value that
    # differs from itself.
    if number != number or number in (float("inf"), float("-inf")):
        return 0
    return int(number)


def _suspicious_flag(row: pd.Series) -> bool:
    """Return the human suspicious flag if set, else the computed one."""
    for candidate in (row.get("human_is_suspicious"), row.get("is_suspicious")):
        number = to_float(candidate, float("nan"))
        # Rows without a manual label carry NaN after the left merge.
        # ``bool(nan)`` is True, so NaN must be skipped rather than converted.
        if number == number:
            return bool(number)
    return False


def _content_record(row: pd.Series) -> dict[str, Any]:
    """Build the display record for one feature row."""
    score, label, source = final_score(row)
    return {
        "platform": row.get("platform"),
        "creator_id": row.get("creator_id"),
        "creator_name": row.get("creator_name"),
        "content_id": row.get("content_id"),
        "content_title": row.get("content_title"),
        "publish_time": row.get("publish_time"),
        "view_count": _to_count(row.get("view_count")),
        "like_count": _to_count(row.get("like_count")),
        "comment_count": _to_count(row.get("comment_count")),
        "share_count": _to_count(row.get("share_count")),
        "engagement_rate": to_float(row.get("engagement_rate")),
        "sentiment_score": to_float(row.get("sentiment_score")),
        "activity_score": to_float(row.get("activity_score")),
        "is_suspicious": _suspicious_flag(row),
        "trust_score": round(score, 2),
        "trust_label": label,
        "risk_profile": risk_profile(label),
        "score_source": source,
        "label_reason": row.get("label_reason"),
        "annotator": row.get("annotator"),
        "labeled_at": row.get("labeled_at"),
    }


def build_content_table(features: pd.DataFrame) -> pd.DataFrame:
    """Build the per-content table shown in the dashboard.

    Args:
        features: Scored feature rows, one per content item.

    Returns:
        One row per input row with identity fields, integer counters, float
        signals, the final trust score (rounded to 2 decimals) with its label,
        risk profile and source, and the manual-label metadata. Columns absent
        from *features* come back as ``None`` (or 0 / 0.0 / False for numeric
        and flag fields). An empty input yields an empty frame with no columns.
    """
    return pd.DataFrame([_content_record(row) for _, row in features.iterrows()])
# Load every artifact up front; the readers return empty containers for
# files that do not exist.
manual_labels = read_csv(MANUAL_LABELS_PATH)
trust_scores = read_csv(TRUST_SCORES_PATH)
events = read_jsonl(EVENTS_PATH)
stats = read_json(DATASET_STATS_PATH)

# Raw features -> manual labels attached -> scores added -> display table.
raw_features = read_csv(FEATURES_PATH)
features = predict_scores(attach_manual_labels(raw_features, manual_labels))
content = build_content_table(features)

st.title("KOLTrust Demo")
st.caption("Offline demo for KOL trust evaluation from exported public social metrics.")
st.info("The KOL data in this demo is for simulation purposes only and not real.")

# Everything below needs content rows, so halt the script when there are none.
if content.empty:
    st.error("Demo data was not found. Check the data folder in this Space.")
    st.stop()
# Sidebar controls; each multiselect starts with every option selected.
with st.sidebar:
    st.header("Controls")

    platforms = sorted(content["platform"].dropna().unique().tolist())
    selected_platforms = st.multiselect("Platform", platforms, default=platforms)

    label_options = ["high_trust", "medium_trust", "low_trust"]
    selected_labels = st.multiselect("Trust label", label_options, default=label_options)

    source_options = sorted(content["score_source"].dropna().unique().tolist())
    selected_sources = st.multiselect("Score source", source_options, default=source_options)

    min_views = st.number_input("Min views", min_value=0, value=0, step=1000)

# A row is kept only when it passes all four sidebar filters.
platform_match = content["platform"].isin(selected_platforms)
label_match = content["trust_label"].isin(selected_labels)
source_match = content["score_source"].isin(selected_sources)
enough_views = content["view_count"] >= min_views
filtered = content[platform_match & label_match & source_match & enough_views].copy()
# Headline metrics. "Manual labels" counts over the unfiltered content table;
# the other four reflect the current sidebar filters.
row_count = f"{len(filtered):,}"
creator_count = f"{filtered['creator_id'].nunique():,}"
manual_count = f"{int((content['score_source'] == 'human_label').sum()):,}"
if filtered.empty:
    avg_trust = "0"
else:
    avg_trust = f"{filtered['trust_score'].mean():.1f}"
suspicious_count = f"{int(filtered['is_suspicious'].sum()):,}"

metric_cols = st.columns(5)
metric_cols[0].metric("Content rows", row_count)
metric_cols[1].metric("Creators", creator_count)
metric_cols[2].metric("Manual labels", manual_count)
metric_cols[3].metric("Avg trust", avg_trust)
metric_cols[4].metric("Suspicious", suspicious_count)
tab_overview, tab_evaluate, tab_data = st.tabs(["Overview", "Evaluate KOL", "Data"])

# Overview: top-20 creators by mean trust score next to the label distribution.
with tab_overview:
    overview_layout = {"height": 500, "margin": {"l": 10, "r": 10, "t": 20, "b": 10}}
    chart_cols = st.columns([1.1, 0.9])

    with chart_cols[0]:
        per_creator = filtered.groupby(["creator_id", "creator_name", "platform"], dropna=False).agg(
            trust_score=("trust_score", "mean"),
            views=("view_count", "sum"),
            content=("content_id", "count"),
        )
        top_creators = per_creator.reset_index().sort_values("trust_score", ascending=False).head(20)

        # Ascending order for the horizontal bar chart.
        bars_source = top_creators.sort_values("trust_score")
        fig = px.bar(
            bars_source,
            x="trust_score",
            y="creator_name",
            color="platform",
            orientation="h",
            range_x=[0, 100],
            labels={"trust_score": "Avg trust score", "creator_name": ""},
        )
        fig.update_layout(**overview_layout)
        st.plotly_chart(fig, use_container_width=True)

    with chart_cols[1]:
        label_tally = filtered["trust_label"].value_counts()
        label_counts = label_tally.rename_axis("trust_label").reset_index(name="rows")
        fig = px.pie(label_counts, names="trust_label", values="rows", hole=0.45)
        fig.update_layout(**overview_layout)
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("Top creator leaderboard")
    st.dataframe(top_creators, use_container_width=True, hide_index=True)
def _column_matches(column: pd.Series, value: Any) -> pd.Series:
    """Return a mask of rows in *column* equal to *value*; null matches null."""
    # A string comparison alone would miss nulls: None becomes "None" in the
    # column while the grouped key comes back as NaN ("nan").
    if pd.isna(value):
        return column.isna()
    return column.astype(str) == str(value)


def _render_creator_evaluation(creator_rows: pd.DataFrame) -> None:
    """Render metrics, signals, scatter plot and evidence for one creator."""
    if creator_rows.empty:
        # Defensive: avoids an IndexError on ``iloc[0]`` below.
        st.info("No content rows found for the selected KOL.")
        return

    latest = creator_rows.iloc[0]
    avg_score = creator_rows["trust_score"].mean()
    kol_label = trust_label(avg_score)

    cols = st.columns(5)
    cols[0].metric("Creator", str(latest["creator_name"]))
    cols[1].metric("Avg trust", f"{avg_score:.1f}")
    cols[2].metric("Label", kol_label)
    cols[3].metric("Contents", f"{len(creator_rows):,}")
    cols[4].metric("Total views", f"{int(creator_rows['view_count'].sum()):,}")

    # Every "value" is a string so the column has a single type when rendered.
    signal_rows = pd.DataFrame(
        [
            {"signal": "Engagement rate", "value": f"{creator_rows['engagement_rate'].mean() * 100:.2f}%"},
            {"signal": "Sentiment score", "value": f"{creator_rows['sentiment_score'].mean():.1f}/100"},
            {"signal": "Activity score", "value": f"{creator_rows['activity_score'].mean():.1f}/100"},
            {"signal": "Suspicious content", "value": str(int(creator_rows["is_suspicious"].sum()))},
            {"signal": "Score source", "value": ", ".join(sorted(creator_rows["score_source"].unique()))},
        ]
    )
    st.subheader("Assessment signals")
    st.dataframe(signal_rows, use_container_width=True, hide_index=True)

    fig = px.scatter(
        creator_rows,
        x="view_count",
        y="trust_score",
        size="like_count",
        color="trust_label",
        hover_data=["content_title", "score_source", "engagement_rate", "sentiment_score"],
        range_y=[0, 100],
    )
    fig.update_layout(height=380, margin=dict(l=10, r=10, t=20, b=10))
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("Content evidence")
    evidence_columns = [
        "content_title",
        "publish_time",
        "view_count",
        "like_count",
        "comment_count",
        "engagement_rate",
        "sentiment_score",
        "trust_score",
        "trust_label",
        "score_source",
        "label_reason",
    ]
    st.dataframe(creator_rows[evidence_columns], use_container_width=True, hide_index=True)


# Evaluate tab: pick one creator from the filtered set and show their evidence.
with tab_evaluate:
    creators = (
        filtered.groupby(["creator_id", "creator_name", "platform"], dropna=False)
        .agg(trust_score=("trust_score", "mean"), views=("view_count", "sum"), content=("content_id", "count"))
        .reset_index()
        .sort_values(["trust_score", "views"], ascending=[False, False])
    )
    if creators.empty:
        st.info("No creators match the selected filters.")
    else:
        # Map each option to (creator_id, platform): the option text names the
        # platform, so the same id on another platform must not be mixed in.
        labels = {
            f"{row.creator_name} ({row.platform}, {row.creator_id})": (row.creator_id, row.platform)
            for row in creators.itertuples(index=False)
        }
        selected_label = st.selectbox("Select KOL", list(labels.keys()))
        creator_id, creator_platform = labels[selected_label]
        creator_mask = _column_matches(filtered["creator_id"], creator_id) & _column_matches(
            filtered["platform"], creator_platform
        )
        creator_rows = filtered[creator_mask].sort_values("publish_time", ascending=False)
        _render_creator_evaluation(creator_rows)
# Data tab: the filtered table plus the raw inputs behind it.
with tab_data:
    st.subheader("Filtered content")
    show_columns = [
        "platform",
        "creator_name",
        "content_title",
        "view_count",
        "engagement_rate",
        "sentiment_score",
        "activity_score",
        "is_suspicious",
        "trust_score",
        "trust_label",
        "score_source",
    ]
    ranked_content = filtered[show_columns].sort_values("trust_score", ascending=False)
    st.dataframe(ranked_content, use_container_width=True, hide_index=True)

    with st.expander("Dataset stats"):
        st.json(stats)

    # The remaining expanders each hold one table; events are capped at 200 rows.
    raw_tables = (
        ("Manual labels", manual_labels),
        ("Raw serving events sample", events.head(200)),
        ("Creator trust scores", trust_scores),
    )
    for expander_title, table in raw_tables:
        with st.expander(expander_title):
            st.dataframe(table, use_container_width=True, hide_index=True)