# stats_tab.py
# -*- coding: utf-8 -*-
import pandas as pd
import streamlit as st
import numpy as np

try:
    import altair as alt
except Exception:
    # Altair is optional: on any import-time failure the charts fall back to
    # Streamlit's built-in chart widgets.
    alt = None

# Weekday labels indexed by ``Series.dt.dayofweek`` (0=Mon..6=Sun).
_DOW_ORDER = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

# Look-back length for each fixed "Range" choice ("All" is handled separately).
_RANGE_LOOKBACK = {
    "Past day": pd.Timedelta(days=1),
    "Past week": pd.Timedelta(weeks=1),
    "Past month": pd.Timedelta(days=30),
    "Past year": pd.Timedelta(days=365),
}

# Resample alias for each explicit "Granularity" choice. Lowercase "h" is used
# because the uppercase "H" alias is deprecated in recent pandas releases.
_GRANULARITY_FREQ = {"Hourly": "h", "Daily": "D", "Weekly": "W"}


def _add_time_columns(df: pd.DataFrame) -> None:
    """Add ``ts_jst``, ``day``, ``hour``, ``dow`` and ``dow_name`` to *df* in place.

    ``ts`` is parsed as UTC (unparseable values become NaT) and converted to
    Asia/Tokyo; the remaining columns are derived from that JST timestamp.
    """
    ts_utc = pd.to_datetime(df["ts"], errors="coerce", utc=True)
    ts_jst = ts_utc.dt.tz_convert("Asia/Tokyo")
    df["ts_jst"] = ts_jst
    df["day"] = ts_jst.dt.strftime("%Y-%m-%d")
    df["hour"] = ts_jst.dt.hour
    df["dow"] = ts_jst.dt.dayofweek  # 0=Mon..6=Sun
    df["dow_name"] = df["dow"].map(dict(enumerate(_DOW_ORDER)))


def _add_sender_column(df: pd.DataFrame, ss) -> None:
    """Add a ``sender`` display label (nickname > display_name > id) to *df* in place.

    The label is ``"<name> (<last 6 chars of user id>)"``. Names are looked up
    in ``ss["user_index"]``; a missing/None index or record is treated as empty.
    """
    user_index = (ss.get("user_index") or {}) if ss is not None else {}

    def _label(uid: str) -> str:
        rec = user_index.get(uid) or {}
        # ``or ""`` keeps a stored None from being rendered as the text "None".
        nickname = str(rec.get("nickname") or "").strip()
        display = str(rec.get("display_name") or "").strip()
        base = nickname or display or uid
        suffix = uid[-6:] if len(uid) >= 6 else uid
        return f"{base} ({suffix})"

    ids = df["user_id"].astype(str)
    # Build each label once per distinct user instead of once per message.
    labels = {uid: _label(uid) for uid in ids.unique()}
    df["sender"] = ids.map(labels)


def _window_start(range_choice: str, now_jst: pd.Timestamp, ts_jst: pd.Series) -> pd.Timestamp:
    """Return the start of the analysis window for the selected range.

    Fixed ranges look back from *now_jst*. "All" starts at the earliest valid
    timestamp; if there is none (every value is NaT) it falls back to one year.
    """
    lookback = _RANGE_LOOKBACK.get(range_choice)
    if lookback is not None:
        return now_jst - lookback
    earliest = ts_jst.min()
    # NaT is truthy, so ``earliest or fallback`` would never fall back.
    if pd.isna(earliest):
        return now_jst - _RANGE_LOOKBACK["Past year"]
    return earliest


def _auto_freq(start_jst: pd.Timestamp, end_jst: pd.Timestamp) -> str:
    """Return hourly resampling for windows of at most two days, else daily."""
    return "h" if (end_jst - start_jst) <= pd.Timedelta(days=2) else "D"


def _peak_value(values: pd.Series, default):
    """Return the most frequent non-null value of *values*, or *default* if none."""
    modes = values.mode()
    return default if modes.empty else modes.iloc[0]


def _render_count_line(data: pd.DataFrame, time_field: str, *, y_title: str, title: str, height: int) -> None:
    """Draw a line chart of ``count`` over *time_field*; draws nothing if *data* is empty."""
    if data.empty:
        return
    if alt is None:
        st.line_chart(data.set_index(time_field)["count"], height=height)
        return
    chart = (
        alt.Chart(data)
        .mark_line(point=True)
        .encode(
            x=alt.X(f"{time_field}:T", title="Time (JST)"),
            y=alt.Y("count:Q", title=y_title),
        )
        .properties(height=height, title=title)
    )
    st.altair_chart(chart, use_container_width=True)


def _render_count_bars(
    data: pd.DataFrame,
    x_field: str,
    x_type: str,
    x_title: str,
    *,
    title: str,
    height: int,
    sort=None,
) -> None:
    """Draw a bar chart of ``count`` per *x_field* (Altair if available)."""
    if alt is None:
        st.bar_chart(data.set_index(x_field)["count"], height=height)
        return
    x_kwargs = {"title": x_title}
    # Only pass ``sort`` when requested: an explicit None means "unsorted" in
    # Altair, which differs from leaving the default in place.
    if sort is not None:
        x_kwargs["sort"] = sort
    chart = (
        alt.Chart(data)
        .mark_bar()
        .encode(
            x=alt.X(f"{x_field}:{x_type}", **x_kwargs),
            y=alt.Y("count:Q", title="Messages"),
        )
        .properties(height=height, title=title)
    )
    st.altair_chart(chart, use_container_width=True)


def _render_overview(
    df_all: pd.DataFrame,
    dff: pd.DataFrame,
    start_jst: pd.Timestamp,
    end_jst: pd.Timestamp,
    freq: str,
    metric_type: str,
) -> None:
    """Render the main time series plus hour, weekday and role breakdowns."""
    st.markdown("### Overview")

    # Main time series
    if metric_type == "Message time":
        series = dff.set_index("ts_jst").resample(freq).size()
        title_main = "Messages over time"
    else:
        # First-seen is computed over ALL messages so that a user only counts
        # as new in the window if their earliest message falls inside it.
        first_seen = df_all.groupby("user_id")["ts_jst"].min().dropna()
        new_in_window = first_seen[(first_seen >= start_jst) & (first_seen <= end_jst)]
        series = new_in_window.to_frame("ts_jst").set_index("ts_jst").resample(freq).size()
        title_main = "New users over time (first seen)"
    series_df = series.rename_axis("time").reset_index(name="count")
    _render_count_line(series_df, "time", y_title="Count", title=title_main, height=240)

    # Hour-of-day
    by_hour = dff.groupby("hour").size().reset_index(name="count")
    _render_count_bars(by_hour, "hour", "O", "Hour (JST)", title="Messages by hour", height=180)

    # Weekday (weekdays with no messages are shown as zero)
    by_dow = dff.groupby("dow_name").size().reindex(_DOW_ORDER, fill_value=0).reset_index()
    by_dow.columns = ["weekday", "count"]
    _render_count_bars(
        by_dow, "weekday", "N", "Weekday",
        title="Messages by weekday", height=180, sort=_DOW_ORDER,
    )

    # Role breakdown
    with st.expander("Role breakdown"):
        role_counts = (
            dff.groupby("role").size().reset_index(name="count").sort_values("count", ascending=False)
        )
        _render_count_bars(role_counts, "role", "N", "Role", title="Messages by role", height=160)

    st.markdown("---")


def _render_top_senders(dff: pd.DataFrame) -> pd.DataFrame:
    """Render the top-10 senders table and return it (empty frame if no senders)."""
    st.markdown("### Top 10 Senders (with per-day counts)")
    # Count rows rather than non-null ``text`` so that messages without text
    # are included and totals match the per-user "Messages" metric.
    per_day = dff.groupby(["sender", "day"]).size().unstack(fill_value=0)
    top10 = pd.DataFrame()
    if per_day.empty:
        st.info("No senders in this window.")
    else:
        per_day["__Total"] = per_day.sum(axis=1)
        top10 = per_day.sort_values("__Total", ascending=False).head(10)
        cols = ["__Total"] + [c for c in top10.columns if c != "__Total"]
        st.dataframe(top10[cols], use_container_width=True, height=260)
    st.markdown("---")
    return top10


def _render_user_breakdown(dff: pd.DataFrame, start_jst: pd.Timestamp, end_jst: pd.Timestamp):
    """Render per-user metrics and charts for the sender picked in the UI.

    Returns the selected user's messages, or None when the window has no
    senders (in which case only an info message is shown).
    """
    st.markdown("### Per-user Breakdown")
    users_list = sorted(dff["sender"].unique())
    if not users_list:
        st.info("No users to analyze in this window.")
        return None

    pick_sender = st.selectbox("Select a sender", options=users_list, index=0, key="stats_pick_sender")
    uid_sel = dff.loc[dff["sender"] == pick_sender, "user_id"].iloc[0]
    dfu = dff[dff["user_id"] == uid_sel].copy()

    total_msgs = dfu.shape[0]
    active_days = dfu["day"].nunique()
    # Treat missing text as empty; a plain astype(str) would turn it into the
    # literal "nan" and skew the length/word statistics.
    raw_text = dfu["text"].astype(object)
    texts = raw_text.where(raw_text.notna(), "").astype(str)
    lengths = texts.map(len)
    words = texts.map(lambda s: len(s.split()))

    median_gap = 0.0
    if total_msgs > 1:
        gaps = dfu.sort_values("ts_jst")["ts_jst"].diff().dropna().dt.total_seconds() / 60.0
        if not gaps.empty:
            median_gap = float(gaps.median())

    c1, c2, c3, c4, c5 = st.columns(5)
    c1.metric("Messages", f"{total_msgs}")
    c2.metric("Active days", f"{active_days}")
    c3.metric("Avg length (chars)", f"{float(lengths.mean()):.1f}" if total_msgs else "0.0")
    c4.metric("Avg words", f"{float(words.mean()):.1f}" if total_msgs else "0.0")
    c5.metric("Median gap (min)", f"{median_gap:.1f}")

    # Timeline (always auto granularity, independent of the override control)
    freq_u = _auto_freq(start_jst, end_jst)
    ser_u_df = (
        dfu.set_index("ts_jst").resample(freq_u).size().rename_axis("ts_jst").reset_index(name="count")
    )
    _render_count_line(
        ser_u_df, "ts_jst",
        y_title="Messages", title=f"Messages over time — {pick_sender}", height=220,
    )

    # Length histogram
    if not dfu.empty:
        if alt is not None:
            hist = pd.DataFrame({"length": lengths})
            st.altair_chart(
                alt.Chart(hist).mark_bar().encode(
                    x=alt.X("length:Q", bin=alt.Bin(maxbins=30), title="Message length (chars)"),
                    y=alt.Y("count():Q", title="Messages"),
                ).properties(height=180, title="Message length distribution"),
                use_container_width=True,
            )
        else:
            st.bar_chart(lengths.value_counts().sort_index(), height=180)

    # Heatmap (weekday × hour) — Altair only, there is no built-in fallback
    if alt is not None and not dfu.empty:
        dfu_heat = dfu.groupby(["dow_name", "hour"]).size().reset_index(name="count")
        st.altair_chart(
            alt.Chart(dfu_heat).mark_rect().encode(
                x=alt.X("hour:O", title="Hour (JST)"),
                y=alt.Y("dow_name:O", sort=_DOW_ORDER, title="Weekday"),
                color=alt.Color("count:Q", title="Msgs", scale=alt.Scale(scheme="bluegreen")),
            ).properties(height=180, title="Activity heatmap"),
            use_container_width=True,
        )

    st.markdown("---")
    return dfu


def _render_extra_insights(dff: pd.DataFrame, dfu: pd.DataFrame) -> None:
    """Render peak hour/weekday metrics and the daily + rolling 7-day chart."""
    st.markdown("### Extra Insights")
    peak_hour = int(_peak_value(dff["hour"], 0))
    peak_dow = _peak_value(dff["dow_name"], "N/A")
    peak_hour_u = int(_peak_value(dfu["hour"], 0))
    peak_dow_u = _peak_value(dfu["dow_name"], "N/A")

    e1, e2, e3, e4 = st.columns(4)
    e1.metric("Global peak hour", f"{peak_hour}:00")
    e2.metric("Global peak weekday", peak_dow)
    e3.metric("User peak hour", f"{peak_hour_u}:00")
    e4.metric("User peak weekday", peak_dow_u)

    # Rolling 7-day sum
    ser_daily_df = (
        dff.set_index("ts_jst").resample("D").size().rename_axis("ts_jst").reset_index(name="count")
    )
    if ser_daily_df.empty:
        return
    ser_daily_df["rolling_7d"] = ser_daily_df["count"].rolling(7, min_periods=1).sum()
    if alt is not None:
        bars = alt.Chart(ser_daily_df).mark_bar().encode(
            x=alt.X("ts_jst:T", title="Date (JST)"),
            y=alt.Y("count:Q", title="Daily messages"),
            tooltip=["ts_jst:T", "count:Q", "rolling_7d:Q"],
        ).properties(height=200, title="Daily messages & rolling 7-day sum")
        line = alt.Chart(ser_daily_df).mark_line(strokeDash=[4, 2]).encode(
            x="ts_jst:T",
            y=alt.Y("rolling_7d:Q", title="Rolling 7-day sum"),
        )
        st.altair_chart(bars + line, use_container_width=True)
    else:
        st.line_chart(ser_daily_df.set_index("ts_jst")[["count", "rolling_7d"]], height=200)


def _render_exports(top10: pd.DataFrame, dff: pd.DataFrame) -> None:
    """Render CSV download buttons for the top-10 table and the filtered messages."""
    st.markdown("#### Export")
    if not top10.empty:
        csv_sum = top10.reset_index().rename(columns={"sender": "User"})
        st.download_button(
            "⬇️ Download Top10 table (CSV)",
            data=csv_sum.to_csv(index=False),
            file_name="top10_senders.csv",
            mime="text/csv",
        )
    st.download_button(
        "⬇️ Download filtered messages (CSV)",
        data=dff.to_csv(index=False),
        file_name="messages_filtered.csv",
        mime="text/csv",
    )


def render_stats_tab(df_all_messages: pd.DataFrame, ss) -> None:
    """Render the "Usage & Conversation Stats" tab.

    Args:
        df_all_messages: Messages to analyse; the code reads the ``ts``,
            ``user_id``, ``role`` and ``text`` columns. ``None`` or an empty
            frame shows an info message and renders nothing else. The input
            is copied and never modified.
        ss: Mapping-like state object (``.get`` is the only method used) that
            may hold a ``"user_index"`` dict of ``user_id -> record`` with
            optional ``nickname`` / ``display_name`` entries. May be ``None``.

    All times are shown in Asia/Tokyo (JST); ``ts`` values are parsed as UTC
    and rows whose timestamp cannot be parsed never fall inside a window.
    """
    st.subheader("Usage & Conversation Stats")

    df_all = df_all_messages.copy() if df_all_messages is not None else pd.DataFrame()
    if df_all.empty:
        st.info("No messages available for stats. Import from Cloud Pull or CSV first.")
        return

    _add_time_columns(df_all)
    _add_sender_column(df_all, ss)

    # Controls
    st.markdown("**Time Range & Metric**")
    col_range, col_metric, col_gran = st.columns([1.2, 1, 1.2])
    with col_range:
        range_choice = st.selectbox(
            "Range", ["Past day", "Past week", "Past month", "Past year", "All"], index=1
        )
    with col_metric:
        metric_type = st.radio("Metric", ["Message time", "First-seen (follow) time"], index=0)
    with col_gran:
        gran_override = st.selectbox(
            "Granularity",
            ["Auto", "Hourly", "Daily", "Weekly"],
            index=0,
            help="Auto picks Hourly for ≤2 days, else Daily.",
        )

    now_jst = pd.Timestamp.now(tz="Asia/Tokyo")
    start_jst = _window_start(range_choice, now_jst, df_all["ts_jst"])
    end_jst = now_jst

    # NaT timestamps compare False on both sides and are therefore excluded.
    in_window = (df_all["ts_jst"] >= start_jst) & (df_all["ts_jst"] <= end_jst)
    dff = df_all[in_window].copy()
    if dff.empty:
        st.info("No messages in the selected window.")
        return

    # "Auto" is not in the mapping, so it falls through to the span-based pick.
    freq = _GRANULARITY_FREQ.get(gran_override) or _auto_freq(start_jst, end_jst)

    _render_overview(df_all, dff, start_jst, end_jst, freq, metric_type)
    top10 = _render_top_senders(dff)

    dfu = _render_user_breakdown(dff, start_jst, end_jst)
    if dfu is None:
        return

    _render_extra_insights(dff, dfu)
    _render_exports(top10, dff)