KOALADX / src /stats_tab.py
cyacya123's picture
Upload 2 files
1ea1e6b verified
# stats_tab.py
# -*- coding: utf-8 -*-
import pandas as pd
import streamlit as st
import numpy as np
try:
import altair as alt
except Exception:
alt = None
def _sender_label(user_id: str, user_index) -> str:
    """Return the display label ``"<name> (<last 6 chars of id>)"`` for one user id.

    The name is the nickname if set, otherwise the display name, otherwise the id.
    """
    rec = user_index.get(user_id) or {}
    # ``or ""`` keeps a stored ``None`` from being rendered as the text "None".
    nickname = str(rec.get("nickname") or "").strip()
    display = str(rec.get("display_name") or "").strip()
    base = nickname or display or user_id
    # Slicing a short string returns it unchanged, so no length check is needed.
    return f"{base} ({user_id[-6:]})"


def _window_start(range_choice: str, now_jst: pd.Timestamp, earliest) -> pd.Timestamp:
    """Return the start of the analysis window for the chosen range label."""
    lookbacks = {
        "Past day": pd.Timedelta(days=1),
        "Past week": pd.Timedelta(weeks=1),
        "Past month": pd.Timedelta(days=30),
        "Past year": pd.Timedelta(days=365),
    }
    lookback = lookbacks.get(range_choice)
    if lookback is not None:
        return now_jst - lookback
    # "All": start at the earliest message. ``earliest`` is NaT when there is no
    # valid timestamp; test it with pd.isna rather than relying on truthiness
    # (``NaT or x`` does not reliably select ``x``).
    if pd.isna(earliest):
        return now_jst - lookbacks["Past year"]
    return earliest


def _resample_rule(granularity: str, span: pd.Timedelta):
    """Return the resample rule for a granularity choice ("Auto" depends on span)."""
    # An offset object is used for hours because the "H" string alias is
    # deprecated in recent pandas releases.
    hourly = pd.offsets.Hour()
    explicit = {"Hourly": hourly, "Daily": "D", "Weekly": "W"}
    if granularity in explicit:
        return explicit[granularity]
    # Auto picks Hourly for <= 2 days, else Daily.
    return hourly if span <= pd.Timedelta(days=2) else "D"


def _bucket_counts(timestamps: pd.Series, rule, time_col: str) -> pd.DataFrame:
    """Count timestamps per resample bucket; columns are ``[time_col, "count"]``."""
    ones = pd.Series(1, index=pd.DatetimeIndex(timestamps), dtype="int64")
    counts = ones.resample(rule).sum()  # empty buckets sum to 0
    return counts.rename_axis(time_col).reset_index(name="count")


def _first_mode(values: pd.Series, default):
    """Return the most frequent non-null value, or ``default`` if there is none."""
    modes = values.dropna().mode()
    return default if modes.empty else modes.iloc[0]


def render_stats_tab(df_all_messages: pd.DataFrame, ss):
    """Render the "Usage & Conversation Stats" tab.

    Draws range/metric/granularity controls, overview charts (time series,
    hour-of-day, weekday, role), a Top 10 senders table with per-day counts,
    a per-user breakdown, a rolling 7-day chart and two CSV download buttons.
    All times are shown in Asia/Tokyo; ``ts`` values are parsed as UTC.

    Args:
        df_all_messages: Message table. Columns read: ``ts`` and ``user_id``
            (required), ``text`` and ``role`` (optional). The input is not
            modified. ``None`` or an empty frame shows an info message.
        ss: Mapping-like object (only ``.get("user_index")`` is called), or
            ``None``. ``user_index`` maps a user id to a record whose
            ``nickname`` / ``display_name`` entries are used for labels.

    Returns:
        None. Output goes to the Streamlit page.
    """
    st.subheader("Usage & Conversation Stats")
    if df_all_messages is None or df_all_messages.empty:
        st.info("No messages available for stats. Import from Cloud Pull or CSV first.")
        return

    missing = [col for col in ("ts", "user_id") if col not in df_all_messages.columns]
    if missing:
        st.error(f"Cannot compute stats: missing column(s): {', '.join(missing)}.")
        return

    # Robust UTC→JST handling
    ts_utc = pd.to_datetime(df_all_messages["ts"], errors="coerce", utc=True)
    ts_jst = ts_utc.dt.tz_convert("Asia/Tokyo")
    # Rows with an unparseable timestamp can never fall inside a time window.
    # Dropping them here keeps the derived hour/dow columns integer-typed
    # (a single NaT would otherwise turn them into floats such as 13.0).
    valid = ts_jst.notna()
    df_all = df_all_messages.loc[valid].copy()
    ts_jst = ts_jst.loc[valid]
    df_all["ts_jst"] = ts_jst
    df_all["day"] = ts_jst.dt.strftime("%Y-%m-%d")
    df_all["hour"] = ts_jst.dt.hour
    df_all["dow"] = ts_jst.dt.dayofweek  # 0=Mon..6=Sun
    df_all["dow_name"] = df_all["dow"].map(
        {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}
    )

    # Sender label (nickname > display_name > id)
    idx_map = (ss.get("user_index") or {}) if ss is not None else {}
    user_ids = df_all["user_id"].astype(str)
    # Build each label once per distinct user instead of once per message.
    labels = {uid: _sender_label(uid, idx_map) for uid in user_ids.unique()}
    df_all["sender"] = user_ids.map(labels)

    # Controls
    st.markdown("**Time Range & Metric**")
    colr1, colr2, colr3 = st.columns([1.2, 1, 1.2])
    with colr1:
        range_choice = st.selectbox(
            "Range", ["Past day", "Past week", "Past month", "Past year", "All"], index=1
        )
    with colr2:
        metric_type = st.radio("Metric", ["Message time", "First-seen (follow) time"], index=0)
    with colr3:
        gran_override = st.selectbox(
            "Granularity",
            ["Auto", "Hourly", "Daily", "Weekly"],
            index=0,
            help="Auto picks Hourly for ≤2 days, else Daily.",
        )

    now_jst = pd.Timestamp.now(tz="Asia/Tokyo")
    start_jst = _window_start(range_choice, now_jst, df_all["ts_jst"].min())
    end_jst = now_jst
    in_window = (df_all["ts_jst"] >= start_jst) & (df_all["ts_jst"] <= end_jst)
    dff = df_all[in_window].copy()
    if dff.empty:
        st.info("No messages in the selected window.")
        return

    st.markdown("### Overview")
    span = end_jst - start_jst
    rule = _resample_rule(gran_override, span)

    # Main time series
    if metric_type == "Message time":
        series_df = _bucket_counts(dff["ts_jst"], rule, "time")
        title_main = "Messages over time"
    else:
        # First-seen is computed over ALL messages, then restricted to the window,
        # so a user who first appeared before the window is not counted as new.
        first_seen = df_all.groupby("user_id")["ts_jst"].min().dropna()
        fs_win = first_seen[(first_seen >= start_jst) & (first_seen <= end_jst)]
        series_df = _bucket_counts(fs_win, rule, "time")
        title_main = "New users over time (first seen)"
    if not series_df.empty:
        if alt is not None:
            st.altair_chart(
                alt.Chart(series_df).mark_line(point=True).encode(
                    x=alt.X("time:T", title="Time (JST)"),
                    y=alt.Y("count:Q", title="Count"),
                ).properties(height=240, title=title_main),
                use_container_width=True,
            )
        else:
            st.line_chart(series_df.set_index("time")["count"], height=240)

    # Hour-of-day
    by_hour = dff.groupby("hour").size().reset_index(name="count")
    if alt is not None:
        st.altair_chart(
            alt.Chart(by_hour).mark_bar().encode(
                x=alt.X("hour:O", title="Hour (JST)"),
                y=alt.Y("count:Q", title="Messages"),
            ).properties(height=180, title="Messages by hour"),
            use_container_width=True,
        )
    else:
        st.bar_chart(by_hour.set_index("hour")["count"], height=180)

    # Weekday
    order_dow = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    by_dow = (
        dff.groupby("dow_name")
        .size()
        .reindex(order_dow, fill_value=0)  # fill_value keeps the counts integer
        .rename_axis("weekday")
        .reset_index(name="count")
    )
    if alt is not None:
        st.altair_chart(
            alt.Chart(by_dow).mark_bar().encode(
                x=alt.X("weekday:N", sort=order_dow, title="Weekday"),
                y=alt.Y("count:Q", title="Messages"),
            ).properties(height=180, title="Messages by weekday"),
            use_container_width=True,
        )
    else:
        st.bar_chart(by_dow.set_index("weekday")["count"], height=180)

    # Role breakdown
    with st.expander("Role breakdown"):
        if "role" not in dff.columns:
            st.info("No role column in the data.")
        else:
            role_counts = (
                dff.groupby("role")
                .size()
                .reset_index(name="count")
                .sort_values("count", ascending=False)
            )
            if alt is not None:
                st.altair_chart(
                    alt.Chart(role_counts).mark_bar().encode(
                        x=alt.X("role:N", title="Role"),
                        y=alt.Y("count:Q", title="Messages"),
                    ).properties(height=160, title="Messages by role"),
                    use_container_width=True,
                )
            else:
                st.bar_chart(role_counts.set_index("role")["count"], height=160)

    st.markdown("---")

    # Top 10 senders per-day table
    st.markdown("### Top 10 Senders (with per-day counts)")
    # Count rows, not non-null ``text`` values, so messages without text are
    # included and the totals agree with the other message counts on this tab.
    pivot = dff.groupby(["sender", "day"]).size().unstack(fill_value=0)
    top10 = pd.DataFrame()
    if pivot.empty:
        st.info("No senders in this window.")
    else:
        pivot["__Total"] = pivot.sum(axis=1)
        top10 = pivot.sort_values("__Total", ascending=False).head(10)
        cols = ["__Total"] + [c for c in top10.columns if c != "__Total"]
        st.dataframe(top10[cols], use_container_width=True, height=260)

    st.markdown("---")

    # Per-user breakdown
    st.markdown("### Per-user Breakdown")
    users_list = sorted(dff["sender"].unique())
    if not users_list:
        st.info("No users to analyze in this window.")
        return
    pick_sender = st.selectbox(
        "Select a sender", options=users_list, index=0, key="stats_pick_sender"
    )
    # Filter on the label itself: this is the key the Top 10 table groups by,
    # and it also works when the underlying user_id is missing (NaN).
    dfu = dff[dff["sender"] == pick_sender].copy()
    total_msgs = dfu.shape[0]
    active_days = dfu["day"].nunique()

    # Missing text counts as an empty message rather than the string "nan".
    if "text" in dfu.columns:
        raw_text = dfu["text"].astype(object)
        texts = raw_text.where(raw_text.notna(), "").astype(str)
    else:
        texts = pd.Series("", index=dfu.index, dtype="object")
    lengths = texts.map(len)
    words = texts.map(lambda s: len(s.split()))

    median_gap = 0.0
    if total_msgs > 1:
        # Minutes between consecutive messages of this sender.
        gaps = dfu.sort_values("ts_jst")["ts_jst"].diff().dropna().dt.total_seconds() / 60.0
        if not gaps.empty:
            median_gap = float(gaps.median())

    c1, c2, c3, c4, c5 = st.columns(5)
    c1.metric("Messages", f"{total_msgs}")
    c2.metric("Active days", f"{active_days}")
    c3.metric("Avg length (chars)", f"{float(lengths.mean()):.1f}" if total_msgs else "0.0")
    c4.metric("Avg words", f"{float(words.mean()):.1f}" if total_msgs else "0.0")
    c5.metric("Median gap (min)", f"{median_gap:.1f}")

    # Timeline (always uses the Auto granularity, independent of the override)
    rule_u = _resample_rule("Auto", span)
    ser_u_df = _bucket_counts(dfu["ts_jst"], rule_u, "ts_jst")
    if not ser_u_df.empty:
        if alt is not None:
            st.altair_chart(
                alt.Chart(ser_u_df).mark_line(point=True).encode(
                    x=alt.X("ts_jst:T", title="Time (JST)"),
                    y=alt.Y("count:Q", title="Messages"),
                ).properties(height=220, title=f"Messages over time — {pick_sender}"),
                use_container_width=True,
            )
        else:
            st.line_chart(ser_u_df.set_index("ts_jst")["count"], height=220)

    # Length histogram
    if not dfu.empty:
        if alt is not None:
            hist = pd.DataFrame({"length": lengths})
            st.altair_chart(
                alt.Chart(hist).mark_bar().encode(
                    x=alt.X("length:Q", bin=alt.Bin(maxbins=30), title="Message length (chars)"),
                    y=alt.Y("count():Q", title="Messages"),
                ).properties(height=180, title="Message length distribution"),
                use_container_width=True,
            )
        else:
            st.bar_chart(lengths.value_counts().sort_index(), height=180)

    # Heatmap (weekday × hour)
    if alt is not None and not dfu.empty:
        dfu_heat = dfu.groupby(["dow_name", "hour"]).size().reset_index(name="count")
        st.altair_chart(
            alt.Chart(dfu_heat).mark_rect().encode(
                x=alt.X("hour:O", title="Hour (JST)"),
                y=alt.Y("dow_name:O", sort=order_dow, title="Weekday"),
                color=alt.Color("count:Q", title="Msgs", scale=alt.Scale(scheme="bluegreen")),
            ).properties(height=180, title="Activity heatmap"),
            use_container_width=True,
        )

    st.markdown("---")

    # Extra insights
    st.markdown("### Extra Insights")
    peak_hour = int(_first_mode(dff["hour"], 0))
    peak_dow = _first_mode(dff["dow_name"], "N/A")
    peak_hour_u = int(_first_mode(dfu["hour"], 0))
    peak_dow_u = _first_mode(dfu["dow_name"], "N/A")
    e1, e2, e3, e4 = st.columns(4)
    e1.metric("Global peak hour", f"{peak_hour}:00")
    e2.metric("Global peak weekday", peak_dow)
    e3.metric("User peak hour", f"{peak_hour_u}:00")
    e4.metric("User peak weekday", peak_dow_u)

    # Rolling 7-day sum
    ser_daily_df = _bucket_counts(dff["ts_jst"], "D", "ts_jst")
    if not ser_daily_df.empty:
        # min_periods=1 so the first six days show a partial sum instead of NaN.
        ser_daily_df["rolling_7d"] = ser_daily_df["count"].rolling(7, min_periods=1).sum()
        if alt is not None:
            bars = alt.Chart(ser_daily_df).mark_bar().encode(
                x=alt.X("ts_jst:T", title="Date (JST)"),
                y=alt.Y("count:Q", title="Daily messages"),
                tooltip=["ts_jst:T", "count:Q", "rolling_7d:Q"],
            ).properties(height=200, title="Daily messages & rolling 7-day sum")
            line = alt.Chart(ser_daily_df).mark_line(strokeDash=[4, 2]).encode(
                x="ts_jst:T",
                y=alt.Y("rolling_7d:Q", title="Rolling 7-day sum"),
            )
            st.altair_chart(bars + line, use_container_width=True)
        else:
            st.line_chart(ser_daily_df.set_index("ts_jst")[["count", "rolling_7d"]], height=200)

    # Exports
    st.markdown("#### Export")
    if isinstance(top10, pd.DataFrame) and not top10.empty:
        csv_sum = top10.reset_index().rename(columns={"sender": "User"})
        st.download_button(
            "⬇️ Download Top10 table (CSV)",
            data=csv_sum.to_csv(index=False),
            file_name="top10_senders.csv",
            mime="text/csv",
        )
    st.download_button(
        "⬇️ Download filtered messages (CSV)",
        data=dff.to_csv(index=False),
        file_name="messages_filtered.csv",
        mime="text/csv",
    )