# Residue from the Hugging Face file-viewer page (not part of the program):
# uploader "liamxdev", commit e59446c ("Upload folder using huggingface_hub", verified), file size 14.6 kB.
from __future__ import annotations

import json
import logging
import math
from pathlib import Path
from typing import Any

import joblib
import pandas as pd
import plotly.express as px
import streamlit as st
# Directory containing this file; every data and model path below is resolved relative to it.
APP_DIR = Path(__file__).resolve().parent
DATA_DIR = APP_DIR / "data"
# Per-content feature table: the input of the label-merge / scoring / display pipeline.
FEATURES_PATH = DATA_DIR / "silver" / "unified" / "vietnam_influencer_features.csv"
# Loaded and shown unchanged in the "Creator trust scores" expander.
TRUST_SCORES_PATH = DATA_DIR / "gold" / "features" / "trust_scores.csv"
# JSON-lines file; its first 200 rows are shown in the "Raw serving events sample" expander.
EVENTS_PATH = DATA_DIR / "serving" / "kol_events.jsonl"
# Human annotations, joined onto the features by (platform, creator_id, content_id).
MANUAL_LABELS_PATH = DATA_DIR / "input" / "manual_trust_labels.csv"
# JSON document rendered as-is in the "Dataset stats" expander.
DATASET_STATS_PATH = DATA_DIR / "dataset_stats.json"
# joblib artifact read by load_trust_model().
MODEL_PATH = APP_DIR / "ml" / "models" / "trust_score" / "kol_trust_model.joblib"
# Columns that normalize_features() coerces to numbers (missing or unparseable
# values become 0.0). Also the default numeric model inputs when the loaded
# model payload does not list its own "numeric_features".
NUMERIC_FEATURES = [
    "follower_count",
    "content_count",
    "view_count",
    "like_count",
    "comment_count",
    "share_count",
    "engagement_rate",
    "likes_per_view",
    "comments_per_view",
    "shares_per_view",
    "sentiment_score",
    "positive_comment_count",
    "neutral_comment_count",
    "negative_comment_count",
    "upload_frequency",
    "follower_growth_rate",
    "activity_score",
    "is_suspicious",
]
# Columns that normalize_features() keeps as strings, with "unknown" standing in
# for missing values; the default categorical model inputs.
CATEGORICAL_FEATURES = ["platform"]
# Page-level settings (title, icon, wide layout). This is the first Streamlit
# call in the script.
# NOTE(review): Streamlit has historically required set_page_config to run
# before any other Streamlit command - confirm for the pinned version before
# moving this line.
st.set_page_config(page_title="KOLTrust Demo", page_icon=":bar_chart:", layout="wide")
def trust_label(score: float) -> str:
    """Map a numeric trust score to its label.

    Scores of 70 and above are ``"high_trust"``, scores of 45 and above are
    ``"medium_trust"``, and everything else (including NaN, which fails
    every ``>=`` comparison) is ``"low_trust"``.
    """
    # Ordered from the highest cut-off down; the first one the score reaches wins.
    cutoffs = ((70, "high_trust"), (45, "medium_trust"))
    for minimum, name in cutoffs:
        if score >= minimum:
            return name
    return "low_trust"
def risk_profile(label: str) -> str:
    """Translate a trust label into the risk profile shown in the content table.

    ``"high_trust"`` maps to ``"trusted"`` and ``"medium_trust"`` to
    ``"watch"``; any other label is reported as ``"risky"``.
    """
    profiles = {"high_trust": "trusted", "medium_trust": "watch"}
    return profiles.get(label, "risky")
def to_float(value: Any, default: float = 0.0) -> float:
    """Convert *value* to ``float``, falling back to *default* when it is unusable.

    Args:
        value: Anything ``float()`` may accept (number, numeric string, ...).
        default: Returned when *value* is ``None``, an empty string, cannot
            be converted, or converts to NaN.

    Returns:
        The converted number, or *default*.
    """
    try:
        if value is None or value == "":
            return default
        number = float(value)
    except (TypeError, ValueError):
        return default
    # NaN is how pandas represents a missing cell. Letting it through would
    # make ``bool(to_float(cell))`` True and ``int(to_float(cell))`` raise,
    # so it is treated like any other missing value.
    return default if math.isnan(number) else number
@st.cache_data(show_spinner=False)
def read_csv(path: Path) -> pd.DataFrame:
    """Read the CSV at *path* (decoded as ``utf-8-sig``) into a DataFrame.

    A path that does not exist yields an empty DataFrame instead of an
    error. The function is wrapped in ``st.cache_data``.
    """
    return pd.read_csv(path, encoding="utf-8-sig") if path.exists() else pd.DataFrame()
@st.cache_data(show_spinner=False)
def read_json(path: Path) -> dict[str, Any]:
    """Parse the UTF-8 JSON document at *path*.

    A path that does not exist yields ``{}``. The function is wrapped in
    ``st.cache_data``.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return {}
@st.cache_data(show_spinner=False)
def read_jsonl(path: Path) -> pd.DataFrame:
    """Load a UTF-8 JSON-lines file into a DataFrame, one row per non-blank line.

    A path that does not exist yields an empty DataFrame. The function is
    wrapped in ``st.cache_data``.
    """
    if not path.exists():
        return pd.DataFrame()
    lines = path.read_text(encoding="utf-8").splitlines()
    # Whitespace-only lines are skipped; every other line must be valid JSON.
    records = [json.loads(text) for text in lines if text.strip()]
    return pd.DataFrame(records)
@st.cache_resource(show_spinner=False)
def load_trust_model() -> dict[str, Any] | None:
    """Load the trained trust-score artifact from ``MODEL_PATH``, if usable.

    Returns:
        The loaded payload when it is a dict; a bare (non-dict) object is
        wrapped as ``{"model": payload}``. ``None`` when the file does not
        exist or cannot be loaded, in which case callers fall back to the
        baseline score.
    """
    if not MODEL_PATH.exists():
        return None
    try:
        # NOTE(security): joblib.load unpickles the file, and unpickling can
        # execute arbitrary code. Only ship artifacts from a trusted source.
        payload = joblib.load(MODEL_PATH)
    except Exception:
        # Deliberate best effort: a broken artifact must not take the demo
        # down. Log the reason so the fallback is not silent.
        logging.getLogger(__name__).exception("Could not load trust model from %s", MODEL_PATH)
        return None
    return payload if isinstance(payload, dict) else {"model": payload}
def normalize_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the model feature columns present and typed.

    * Every ``NUMERIC_FEATURES`` column exists and is numeric; missing or
      unparseable values become ``0.0``.
    * Every ``CATEGORICAL_FEATURES`` column exists and holds strings;
      missing values become ``"unknown"``.
    * ``trust_score`` / ``human_trust_score`` are coerced to numeric when
      present, keeping NaN for unparseable values.

    An empty frame is returned unchanged (the same object, not a copy).
    """
    if df.empty:
        return df

    data = df.copy()

    # Create absent numeric columns first, then coerce all of them.
    for name in [col for col in NUMERIC_FEATURES if col not in data.columns]:
        data[name] = 0.0
    for name in NUMERIC_FEATURES:
        data[name] = pd.to_numeric(data[name], errors="coerce").fillna(0.0)

    for name in [col for col in CATEGORICAL_FEATURES if col not in data.columns]:
        data[name] = "unknown"
    for name in CATEGORICAL_FEATURES:
        data[name] = data[name].fillna("unknown").astype(str)

    # Score columns keep NaN: final_score() uses it to tell "no score" apart.
    score_columns = [col for col in ("trust_score", "human_trust_score") if col in data.columns]
    for name in score_columns:
        data[name] = pd.to_numeric(data[name], errors="coerce")

    return data
def attach_manual_labels(features: pd.DataFrame, manual: pd.DataFrame) -> pd.DataFrame:
    """Left-join human annotations onto the feature rows.

    Rows are matched on ``(platform, creator_id, content_id)``, compared as
    strings. Only the known label columns that exist in *manual* are carried
    over.

    Args:
        features: One row per piece of content.
        manual: Human labels; may be empty or lack the key columns.

    Returns:
        *features* itself when no join is possible (either frame is empty
        or lacks a key column). Otherwise a new frame with the label columns
        added and exactly one output row per feature row. *features* is
        never modified.
    """
    if features.empty or manual.empty:
        return features

    keys = ["platform", "creator_id", "content_id"]
    if not set(keys).issubset(features.columns) or not set(keys).issubset(manual.columns):
        return features

    label_columns = keys + [
        "human_trust_score",
        "human_trust_label",
        "human_is_suspicious",
        "label_reason",
        "annotator",
        "labeled_at",
    ]
    labels = manual[[column for column in label_columns if column in manual.columns]].copy()

    # Cast the keys on a copy so the caller's frame keeps its original dtypes.
    merged = features.copy()
    for column in keys:
        labels[column] = labels[column].astype(str)
        merged[column] = merged[column].astype(str)

    # Several label rows for one key would duplicate the content row in the
    # left merge and inflate every count downstream; keep the last one listed.
    labels = labels.drop_duplicates(subset=keys, keep="last")

    return merged.merge(labels, on=keys, how="left")
def _feature_names(configured: Any, fallback: list[str]) -> list[str]:
    """Return *configured* as a list, or a copy of *fallback* when it is missing or empty."""
    if configured is None or len(configured) == 0:
        return list(fallback)
    return list(configured)


def _use_baseline_scores(data: pd.DataFrame) -> pd.DataFrame:
    """Fill the model columns from the pre-computed ``trust_score`` column (NaN when absent)."""
    data["model_trust_score"] = data.get("trust_score", pd.Series(dtype=float))
    data["score_source"] = "baseline_rule"
    return data


def predict_scores(features: pd.DataFrame) -> pd.DataFrame:
    """Add ``model_trust_score`` and ``score_source`` columns to the features.

    The features are normalized first. When a trained model is available its
    predictions, clipped to ``[0, 100]``, are used and the source is
    ``"trained_model"``. When no model is available, or anything about
    selecting its inputs or predicting fails, the existing ``trust_score``
    column is used instead and the source is ``"baseline_rule"``.

    Args:
        features: Raw feature rows.

    Returns:
        The normalized frame with the two columns added. An empty input is
        returned without them.
    """
    data = normalize_features(features)
    if data.empty:
        return data

    payload = load_trust_model()
    model = payload.get("model") if payload else None
    if model is None:
        return _use_baseline_scores(data)

    try:
        # Column selection happens inside the try: a payload that names a
        # column the data does not have must fall back, not crash the app.
        columns = _feature_names(payload.get("numeric_features"), NUMERIC_FEATURES) + _feature_names(
            payload.get("categorical_features"), CATEGORICAL_FEATURES
        )
        model_input = data[columns].copy()
        data["model_trust_score"] = model.predict(model_input).clip(0, 100)
        data["score_source"] = "trained_model"
    except Exception:
        # Deliberate best effort, but leave a trace of why the model was skipped.
        logging.getLogger(__name__).exception("Trust model prediction failed; using baseline scores")
        return _use_baseline_scores(data)
    return data
def _has_text(value: Any) -> bool:
    """Return True when a scalar cell holds a real, non-blank value (not None or NaN)."""
    if value is None or pd.isna(value):
        return False
    return bool(value) and bool(str(value).strip())


def final_score(row: pd.Series) -> tuple[float, str, str]:
    """Choose the score, label and provenance to display for one content row.

    Precedence: human score, then model score, then the baseline
    ``trust_score`` column.

    Args:
        row: One row of the scored feature table.

    Returns:
        ``(score, trust_label, score_source)``. For human-scored rows the
        human label is used when present, otherwise it is derived from the
        score; the source is ``"human_label"``.
    """
    human_score = row.get("human_trust_score")
    if pd.notna(human_score):
        score = to_float(human_score)
        human_label = row.get("human_trust_label")
        # NaN is truthy, so a plain ``label or fallback`` would turn a missing
        # label into the string "nan"; test for a real value explicitly.
        label = str(human_label) if _has_text(human_label) else trust_label(score)
        return score, label, "human_label"

    model_score = row.get("model_trust_score")
    if pd.notna(model_score):
        score = to_float(model_score)
        source = row.get("score_source")
        return score, trust_label(score), str(source) if _has_text(source) else "trained_model"

    score = to_float(row.get("trust_score"))
    return score, trust_label(score), "baseline_rule"
def _is_flagged_suspicious(row: pd.Series) -> bool:
    """Return the suspicious flag for a row, preferring the human flag when one exists."""
    baseline_flag = to_float(row.get("is_suspicious"))
    human_flag = row.get("human_is_suspicious")
    # Rows without a human annotation carry None/NaN here after the left
    # merge. NaN must not reach bool(): bool(float("nan")) is True, which
    # would mark every unlabeled row as suspicious.
    if pd.isna(human_flag):
        return bool(baseline_flag)
    return bool(to_float(human_flag, baseline_flag))


def build_content_table(features: pd.DataFrame) -> pd.DataFrame:
    """Build the per-content display table from the scored feature rows.

    Each output row carries identifying columns, integer engagement counts,
    the final trust score (rounded to 2 decimals) with its label, risk
    profile and source, the suspicious flag, and the annotation metadata.

    Args:
        features: Feature rows, expected to have been through the label
            merge and scoring steps.

    Returns:
        One row per input row; an empty frame for empty input.
    """
    rows = []
    for _, row in features.iterrows():
        score, label, source = final_score(row)
        rows.append(
            {
                "platform": row.get("platform"),
                "creator_id": row.get("creator_id"),
                "creator_name": row.get("creator_name"),
                "content_id": row.get("content_id"),
                "content_title": row.get("content_title"),
                "publish_time": row.get("publish_time"),
                "view_count": int(to_float(row.get("view_count"))),
                "like_count": int(to_float(row.get("like_count"))),
                "comment_count": int(to_float(row.get("comment_count"))),
                "share_count": int(to_float(row.get("share_count"))),
                "engagement_rate": to_float(row.get("engagement_rate")),
                "sentiment_score": to_float(row.get("sentiment_score")),
                "activity_score": to_float(row.get("activity_score")),
                "is_suspicious": _is_flagged_suspicious(row),
                "trust_score": round(score, 2),
                "trust_label": label,
                "risk_profile": risk_profile(label),
                "score_source": source,
                "label_reason": row.get("label_reason"),
                "annotator": row.get("annotator"),
                "labeled_at": row.get("labeled_at"),
            }
        )
    return pd.DataFrame(rows)
# Side inputs: only displayed (or joined), never scored themselves.
manual_labels = read_csv(MANUAL_LABELS_PATH)
trust_scores = read_csv(TRUST_SCORES_PATH)
events = read_jsonl(EVENTS_PATH)
stats = read_json(DATASET_STATS_PATH)

# Scoring pipeline: raw features -> + human labels -> + model/baseline scores.
features = (
    read_csv(FEATURES_PATH)
    .pipe(attach_manual_labels, manual_labels)
    .pipe(predict_scores)
)
content = build_content_table(features)

st.title("KOLTrust Demo")
st.caption("Offline demo for KOL trust evaluation from exported public social metrics.")
st.info("The KOL data in this demo is for simulation purposes only and not real.")

# Nothing below can render without content rows, so stop the script here.
if content.empty:
    st.error("Demo data was not found. Check the data folder in this Space.")
    st.stop()
# Sidebar controls; each multiselect starts with every option selected.
with st.sidebar:
    st.header("Controls")

    platforms = sorted(content["platform"].dropna().unique().tolist())
    selected_platforms = st.multiselect("Platform", platforms, default=platforms)

    label_options = ["high_trust", "medium_trust", "low_trust"]
    selected_labels = st.multiselect("Trust label", label_options, default=label_options)

    source_options = sorted(content["score_source"].dropna().unique().tolist())
    selected_sources = st.multiselect("Score source", source_options, default=source_options)

    min_views = st.number_input("Min views", min_value=0, value=0, step=1000)

# A row is kept only when it passes all four filters.
platform_ok = content["platform"].isin(selected_platforms)
label_ok = content["trust_label"].isin(selected_labels)
source_ok = content["score_source"].isin(selected_sources)
views_ok = content["view_count"] >= min_views
filtered = content[platform_ok & label_ok & source_ok & views_ok].copy()
# Headline numbers. All follow the sidebar filters except "Manual labels",
# which counts human-labelled rows over the full content table.
avg_trust_text = f"{filtered['trust_score'].mean():.1f}" if not filtered.empty else "0"
headline_metrics = [
    ("Content rows", f"{len(filtered):,}"),
    ("Creators", f"{filtered['creator_id'].nunique():,}"),
    ("Manual labels", f"{int((content['score_source'] == 'human_label').sum()):,}"),
    ("Avg trust", avg_trust_text),
    ("Suspicious", f"{int(filtered['is_suspicious'].sum()):,}"),
]
metric_cols = st.columns(5)
for metric_col, (metric_title, metric_value) in zip(metric_cols, headline_metrics):
    metric_col.metric(metric_title, metric_value)

tab_overview, tab_evaluate, tab_data = st.tabs(["Overview", "Evaluate KOL", "Data"])
# Overview tab: top-20 creators by average trust, label distribution, and
# the leaderboard table behind the bar chart.
with tab_overview:
    per_creator = (
        filtered.groupby(["creator_id", "creator_name", "platform"], dropna=False)
        .agg(trust_score=("trust_score", "mean"), views=("view_count", "sum"), content=("content_id", "count"))
        .reset_index()
    )
    top_creators = per_creator.sort_values("trust_score", ascending=False).head(20)
    label_counts = filtered["trust_label"].value_counts().rename_axis("trust_label").reset_index(name="rows")

    chart_cols = st.columns([1.1, 0.9])
    bar_col, pie_col = chart_cols

    with bar_col:
        # The frame is re-sorted ascending before plotting.
        bar_fig = px.bar(
            top_creators.sort_values("trust_score"),
            x="trust_score",
            y="creator_name",
            color="platform",
            orientation="h",
            range_x=[0, 100],
            labels={"trust_score": "Avg trust score", "creator_name": ""},
        )
        bar_fig.update_layout(height=500, margin=dict(l=10, r=10, t=20, b=10))
        st.plotly_chart(bar_fig, use_container_width=True)

    with pie_col:
        pie_fig = px.pie(label_counts, names="trust_label", values="rows", hole=0.45)
        pie_fig.update_layout(height=500, margin=dict(l=10, r=10, t=20, b=10))
        st.plotly_chart(pie_fig, use_container_width=True)

    st.subheader("Top creator leaderboard")
    st.dataframe(top_creators, use_container_width=True, hide_index=True)
# Evaluate tab: drill into one creator picked from the filtered rows.
with tab_evaluate:
    creators = (
        filtered.groupby(["creator_id", "creator_name", "platform"], dropna=False)
        .agg(trust_score=("trust_score", "mean"), views=("view_count", "sum"), content=("content_id", "count"))
        .reset_index()
        .sort_values(["trust_score", "views"], ascending=[False, False])
    )
    if creators.empty:
        st.info("No creators match the selected filters.")
    else:
        # Each option maps to (creator_id, platform). Creators are grouped
        # per platform above, so selecting by id alone would mix rows from
        # every platform that happens to share that id.
        labels = {
            f"{row.creator_name} ({row.platform}, {row.creator_id})": (row.creator_id, row.platform)
            for row in creators.itertuples(index=False)
        }
        selected_label = st.selectbox("Select KOL", list(labels.keys()))
        creator_id, creator_platform = labels[selected_label]
        same_creator = filtered["creator_id"].astype(str) == str(creator_id)
        same_platform = filtered["platform"].astype(str) == str(creator_platform)
        creator_rows = filtered[same_creator & same_platform].sort_values("publish_time", ascending=False)

        # First row after the descending publish_time sort.
        latest = creator_rows.iloc[0]
        avg_score = creator_rows["trust_score"].mean()
        kol_label = trust_label(avg_score)

        cols = st.columns(5)
        cols[0].metric("Creator", str(latest["creator_name"]))
        cols[1].metric("Avg trust", f"{avg_score:.1f}")
        cols[2].metric("Label", kol_label)
        cols[3].metric("Contents", f"{len(creator_rows):,}")
        cols[4].metric("Total views", f"{int(creator_rows['view_count'].sum()):,}")

        signal_rows = pd.DataFrame(
            [
                {"signal": "Engagement rate", "value": f"{creator_rows['engagement_rate'].mean() * 100:.2f}%"},
                {"signal": "Sentiment score", "value": f"{creator_rows['sentiment_score'].mean():.1f}/100"},
                {"signal": "Activity score", "value": f"{creator_rows['activity_score'].mean():.1f}/100"},
                {"signal": "Suspicious content", "value": int(creator_rows["is_suspicious"].sum())},
                {"signal": "Score source", "value": ", ".join(sorted(creator_rows["score_source"].unique()))},
            ]
        )
        st.subheader("Assessment signals")
        st.dataframe(signal_rows, use_container_width=True, hide_index=True)

        # One point per piece of content: views against trust, sized by likes.
        fig = px.scatter(
            creator_rows,
            x="view_count",
            y="trust_score",
            size="like_count",
            color="trust_label",
            hover_data=["content_title", "score_source", "engagement_rate", "sentiment_score"],
            range_y=[0, 100],
        )
        fig.update_layout(height=380, margin=dict(l=10, r=10, t=20, b=10))
        st.plotly_chart(fig, use_container_width=True)

        st.subheader("Content evidence")
        evidence_columns = [
            "content_title",
            "publish_time",
            "view_count",
            "like_count",
            "comment_count",
            "engagement_rate",
            "sentiment_score",
            "trust_score",
            "trust_label",
            "score_source",
            "label_reason",
        ]
        st.dataframe(creator_rows[evidence_columns], use_container_width=True, hide_index=True)
# Data tab: the filtered table plus the supporting inputs in expanders.
with tab_data:
    st.subheader("Filtered content")
    show_columns = [
        "platform",
        "creator_name",
        "content_title",
        "view_count",
        "engagement_rate",
        "sentiment_score",
        "activity_score",
        "is_suspicious",
        "trust_score",
        "trust_label",
        "score_source",
    ]
    ranked_content = filtered[show_columns].sort_values("trust_score", ascending=False)
    st.dataframe(ranked_content, use_container_width=True, hide_index=True)

    with st.expander("Dataset stats"):
        st.json(stats)

    # The remaining expanders each show one frame with identical table options.
    frame_sections = (
        ("Manual labels", manual_labels),
        ("Raw serving events sample", events.head(200)),
        ("Creator trust scores", trust_scores),
    )
    for section_title, section_frame in frame_sections:
        with st.expander(section_title):
            st.dataframe(section_frame, use_container_width=True, hide_index=True)