Spaces:

BPEL
/

otree-csv-viewer

Sleeping

File size: 24,719 Bytes

import io
import json
import os
import time
from dotenv import load_dotenv

load_dotenv()
import numpy as np
import pandas as pd
import requests
from flask import Flask, render_template, request, Response, session, redirect, url_for
from werkzeug.middleware.proxy_fix import ProxyFix


app = Flask(__name__)
app.secret_key = os.environ.get("FLASK_SECRET", "otree-csv-viewer-default-key")
app.config["SESSION_COOKIE_SAMESITE"] = "None"
app.config["SESSION_COOKIE_SECURE"] = True
app.config["SESSION_COOKIE_HTTPONLY"] = True
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "bpel123")
app.wsgi_app = ProxyFix(app.wsgi_app, x_proto=1, x_host=1)


@app.before_request
def require_login():
    if request.endpoint in ("login", "static") or request.path == "/favicon.ico":
        return
    if not session.get("authenticated"):
        if request.path.startswith("/api/"):
            return Response("Unauthorized", status=401)
        return redirect(url_for("login"))


def to_json(obj):
    """jsonify replacement that handles numpy types."""
    def default(o):
        if isinstance(o, (np.integer,)):
            return int(o)
        if isinstance(o, (np.floating,)):
            if np.isnan(o):
                return None
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(f"{type(o)} not serializable")
    data = json.dumps(obj, default=default)
    return Response(data, mimetype="application/json")

CSV_PATH = os.path.join(os.path.dirname(__file__), "all_apps_wide-2026-03-31.csv")
OTREE_URL = os.environ.get("OTREE_CSV_URL", "")

# Identifier columns to always include alongside app_collect_results
ID_COLS = [
    "participant.id_in_session",
    "participant.code",
    "participant.label",
    "participant._current_app_name",
    "participant._current_page_name",
    "participant.treatment",
    "participant.payoff",
    "session.code",
]

COLLECT_PREFIX = "app_collect_results."

# In-memory cache so we don't re-read disk on every request
_cache = {"df": None, "mtime": 0}


def _get_df():
    """Return the dataframe, re-reading from disk only if the file changed."""
    try:
        mtime = os.path.getmtime(CSV_PATH)
    except OSError:
        mtime = 0
    if _cache["df"] is None or mtime != _cache["mtime"]:
        _cache["df"] = pd.read_csv(CSV_PATH)
        _cache["mtime"] = mtime
    return _cache["df"]


def load_csv(collect_only=True):
    df = _get_df().copy()
    if collect_only:
        collect_cols = [c for c in df.columns if c.startswith(COLLECT_PREFIX)]
        keep = [c for c in ID_COLS if c in df.columns] + collect_cols
        df = df[keep]
    return df


@app.route("/login", methods=["GET", "POST"])
def login():
    error = None
    if request.method == "POST":
        if request.form.get("password") == ADMIN_PASSWORD:
            session["authenticated"] = True
            return redirect(url_for("index"))
        error = "Incorrect password"
    return render_template("login.html", error=error)


@app.route("/logout")
def logout():
    session.clear()
    return redirect(url_for("login"))


@app.route("/favicon.ico")
def favicon():
    return redirect(url_for("static", filename="favicon.svg"), code=302)


@app.route("/")
def index():
    return render_template("index.html")


@app.route("/api/data")
def api_data():
    collect_only = request.args.get("collect_only", "1") == "1"
    df = load_csv(collect_only=collect_only)

    display_cols = []
    for c in df.columns:
        if c.startswith(COLLECT_PREFIX):
            parts = c.split(".")
            display_cols.append(parts[-1])
        elif c.startswith("participant."):
            display_cols.append(c.replace("participant.", "p."))
        elif c.startswith("session."):
            display_cols.append(c.replace("session.", "s."))
        else:
            display_cols.append(c)

    return to_json({
        "columns": display_cols,
        "raw_columns": list(df.columns),
        "rows": df.fillna("").values.tolist(),
        "total": len(df),
    })


@app.route("/api/fetch-otree", methods=["POST"])
def fetch_otree():
    """Pull fresh CSV from oTree and overwrite the local file."""
    try:
        resp = requests.get(OTREE_URL, timeout=30)
        resp.raise_for_status()
        # Validate it's actually CSV
        df = pd.read_csv(io.StringIO(resp.text))
        # Save to disk
        df.to_csv(CSV_PATH, index=False)
        # Bust cache
        _cache["df"] = df
        _cache["mtime"] = os.path.getmtime(CSV_PATH)
        return to_json({"ok": True, "rows": len(df), "cols": len(df.columns)})
    except Exception as e:
        return to_json({"ok": False, "error": str(e)}), 502


@app.route("/api/upload", methods=["POST"])
def upload_csv():
    """Accept an uploaded CSV file to replace the current dataset."""
    f = request.files.get("file")
    if not f:
        return to_json({"ok": False, "error": "No file uploaded"}), 400
    try:
        df = pd.read_csv(f.stream)
        df.to_csv(CSV_PATH, index=False)
        _cache["df"] = df
        _cache["mtime"] = os.path.getmtime(CSV_PATH)
        return to_json({"ok": True, "rows": len(df), "cols": len(df.columns)})
    except Exception as e:
        return to_json({"ok": False, "error": str(e)}), 400


@app.route("/api/payments")
def api_payments():
    """Return just PC_id, total_bonus, participant_session_id for the payments page."""
    df = _get_df()
    col_map = {
        "app_collect_results.1.player.PC_id_manual_input": "PC_id",
        "app_collect_results.1.player.total_bonus": "total_bonus",
        "app_collect_results.1.player.participant_session_id": "participant_session_id",
    }
    keep = [c for c in col_map if c in df.columns]
    sub = df[keep].rename(columns=col_map).fillna("")

    # Distinct session ids sorted alphabetically
    session_ids = sorted(set(str(v) for v in sub["participant_session_id"] if v != ""))

    return to_json({
        "columns": [col_map[c] for c in keep],
        "rows": sub.values.tolist(),
        "session_ids": session_ids,
    })


@app.route("/api/stats")
def api_stats():
    """Session-level stats dashboard data."""
    df = _get_df().copy()

    sid_col = "app_collect_results.1.player.participant_session_id"
    app_col = "participant._current_app_name"
    page_col = "participant._current_page_name"
    bot_col = "participant._is_bot"
    orphan_col = "app_collect_results.1.player.was_orphan"
    dropout_col = "participant.midgame_dropout"
    timeout_col = "participant.timed_out_from_coord_games"
    duration_col = "participant.completion_duration_time"
    treatment_col = "participant.treatment"

    # Only rows with a non-empty participant_session_id = "completed"
    df["_sid"] = df[sid_col].fillna("")
    completed = df[df["_sid"] != ""]
    incomplete = df[df["_sid"] == ""]

    # App sequence (derived from column order)
    app_order = []
    for c in df.columns:
        parts = c.split(".")
        if len(parts) > 1 and parts[0] not in ("participant", "session") and parts[0] not in app_order:
            app_order.append(parts[0])

    # ---- Global summary ----
    total = len(df)
    n_completed = len(completed)
    n_incomplete = len(incomplete)

    durations = pd.to_numeric(completed[duration_col], errors="coerce").dropna()

    global_summary = {
        "total_rows": total,
        "completed": n_completed,
        "incomplete": n_incomplete,
        "completion_rate": round(n_completed / total * 100, 1) if total else 0,
        "orphans": int((df[orphan_col] == 1).sum()) if orphan_col in df.columns else 0,
        "dropouts": int((pd.to_numeric(df[dropout_col], errors="coerce") == 1).sum()) if dropout_col in df.columns else 0,
        "timed_out": int((pd.to_numeric(df[timeout_col], errors="coerce") == 1).sum()) if timeout_col in df.columns else 0,
        "duration_median": round(durations.median(), 1) if len(durations) else None,
        "duration_mean": round(durations.mean(), 1) if len(durations) else None,
        "duration_min": round(durations.min(), 1) if len(durations) else None,
        "duration_max": round(durations.max(), 1) if len(durations) else None,
    }

    # ---- Per-session breakdown ----
    session_ids = sorted(completed["_sid"].unique())
    per_session = []
    for sid in session_ids:
        s = completed[completed["_sid"] == sid]
        s_dur = pd.to_numeric(s[duration_col], errors="coerce").dropna()
        per_session.append({
            "session_id": sid,
            "n": len(s),
            "orphans": int((s[orphan_col] == 1).sum()) if orphan_col in s.columns else 0,
            "duration_median": round(s_dur.median(), 1) if len(s_dur) else None,
            "duration_mean": round(s_dur.mean(), 1) if len(s_dur) else None,
            "treatments": dict(s[treatment_col].fillna("(none)").value_counts()),
        })

    # ---- Current app/page funnel (where are people RIGHT NOW) ----
    app_page = df[[app_col, page_col, "_sid"]].fillna("(empty)")
    funnel = []
    for a in app_order:
        sub = app_page[app_page[app_col] == a]
        if len(sub) == 0:
            continue
        pages = dict(sub[page_col].value_counts())
        funnel.append({"app": a, "count": len(sub), "pages": pages})
    # Also count those whose current_app is empty
    empty_app = app_page[app_page[app_col] == "(empty)"]
    if len(empty_app):
        funnel.append({"app": "(no app)", "count": len(empty_app), "pages": {}})

    # ---- Completed players: which app they finished at (current_app) ----
    completed_funnel = []
    for a in app_order:
        n = int((completed[app_col] == a).sum())
        if n:
            completed_funnel.append({"app": a, "count": n})

    return to_json({
        "global": global_summary,
        "per_session": per_session,
        "funnel": funnel,
        "completed_funnel": completed_funnel,
        "app_order": app_order,
    })


@app.route("/api/signal")
def api_signal():
    """Signal game analysis for completed participants."""
    df = _get_df()

    sid_col = "app_collect_results.1.player.participant_session_id"
    treat_col = "signal_game.1.group.treatment"
    buys_col = "signal_game.1.player.buys_signal"
    color_col = "participant.signal_color_choice"
    skip_col = "participant.skip_signal_game"
    intend_col = "signal_game.1.player.intends_to_buy"
    intend_count_col = "signal_game.1.group.intend_count"
    group_id_col = "signal_game.1.group.id_in_subsession"
    success_col = "signal_game.1.group.group_success"
    beliefs_col = "signal_game.1.player.beliefs_truthful"
    reason_col = "signal_game.1.player.purchase_reason"
    guess_col = "signal_game.1.player.guess_correct"
    psid_col = "app_collect_results.1.player.participant_session_id"

    # Filter: completed participants (non-empty session letter)
    completed = df[df[sid_col].notna()].copy()
    total_completed = len(completed)

    # Played signal game = did not skip (skip == 0 or NaN with color present)
    played = completed[completed[color_col].notna()].copy()
    skipped = completed[~completed.index.isin(played.index)]
    total_played = len(played)
    total_skipped = len(skipped)

    # Treatment labels
    played["_treat"] = played[treat_col].map({0.0: "Control (0)", 1.0: "Treatment (1)"}).fillna("Unknown")

    # ---- Treatment distribution ----
    treat_dist = dict(played["_treat"].value_counts())

    # ---- Buy rate by treatment ----
    buy_by_treat = {}
    for treat_name, group in played.groupby("_treat"):
        n = len(group)
        bought = int((group[buys_col] == 1).sum())
        buy_by_treat[treat_name] = {
            "n": n,
            "bought": bought,
            "pct": round(bought / n * 100, 1) if n else 0,
        }

    # ---- Color distribution by treatment ----
    color_by_treat = {}
    for treat_name, group in played.groupby("_treat"):
        colors = dict(group[color_col].value_counts())
        color_by_treat[treat_name] = colors

    # ---- Overall color distribution ----
    overall_colors = dict(played[color_col].value_counts())

    # ---- Overall buy rate ----
    total_bought = int((played[buys_col] == 1).sum())
    overall_buy_pct = round(total_bought / total_played * 100, 1) if total_played else 0

    # ---- Intend to buy (among those who have the field) ----
    intend_df = played[played[intend_col].notna()]
    intend_yes = int((intend_df[intend_col] == 1).sum())
    intend_no = int((intend_df[intend_col] == 0).sum())



    # ---- Group success by treatment ----
    success_by_treat = {}
    for treat_name, group in played.groupby("_treat"):
        n = len(group)
        succ = int((group[success_col] == 1).sum())
        success_by_treat[treat_name] = {
            "n": n,
            "success": succ,
            "pct": round(succ / n * 100, 1) if n else 0,
        }

    # ---- Success rate by group intend_count (treatment only) ----
    treat_played = played[played["_treat"] == "Treatment (1)"]
    treat_groups = treat_played.drop_duplicates(subset=[group_id_col])
    treat_groups = treat_groups[treat_groups[intend_count_col].notna()]
    success_by_intend = {}
    for ic, g in treat_groups.groupby(intend_count_col):
        n = len(g)
        succ = int((g[success_col] == 1).sum())
        success_by_intend[str(int(ic))] = {
            "n_groups": n,
            "success": succ,
            "pct": round(succ / n * 100, 1) if n else 0,
        }

    # ---- Buy rate by others' intent count (treatment only) ----
    tp = treat_played[treat_played[intend_count_col].notna() & treat_played[intend_col].notna()].copy()
    tp["_others_intend"] = tp[intend_count_col] - tp[intend_col]
    buy_by_others_intend = {}
    for oi, g in tp.groupby("_others_intend"):
        n = len(g)
        bought = int((g[buys_col] == 1).sum())
        buy_by_others_intend[str(int(oi))] = {
            "n": n,
            "bought": bought,
            "pct": round(bought / n * 100, 1) if n else 0,
        }


    # ---- Buy rate by group intend_count (treatment only, per player) ----
    buy_by_group_intend = {}
    for ic, g in tp.groupby(intend_count_col):
        n = len(g)
        bought = int((g[buys_col] == 1).sum())
        buy_by_group_intend[str(int(ic))] = {
            "n": n,
            "bought": bought,
            "pct": round(bought / n * 100, 1) if n else 0,
        }

    # ---- Beliefs distribution (among those who answered) ----
    beliefs_df = played[played[beliefs_col].notna()]
    beliefs_dist = {str(int(k)): int(v) for k, v in beliefs_df[beliefs_col].value_counts().items()}

    # ---- By session letter ----
    per_session = []
    for sid in sorted(played[psid_col].unique()):
        s = played[played[psid_col] == sid]
        n = len(s)
        bought = int((s[buys_col] == 1).sum())
        treats = dict(s["_treat"].value_counts())
        colors = dict(s[color_col].value_counts())
        per_session.append({
            "session_id": sid,
            "n": n,
            "bought": bought,
            "buy_pct": round(bought / n * 100, 1) if n else 0,
            "treatments": treats,
            "colors": colors,
        })

    return to_json({
        "total_completed": total_completed,
        "total_played": total_played,
        "total_skipped": total_skipped,
        "treat_dist": treat_dist,
        "buy_by_treat": buy_by_treat,
        "color_by_treat": color_by_treat,
        "overall_colors": overall_colors,
        "overall_buy_pct": overall_buy_pct,
        "total_bought": total_bought,
        "intend_yes": intend_yes,
        "intend_no": intend_no,
        "success_by_treat": success_by_treat,
        "success_by_intend": success_by_intend,
        "buy_by_others_intend": buy_by_others_intend,
        "buy_by_group_intend": buy_by_group_intend,
        "beliefs_dist": beliefs_dist,
        "per_session": per_session,
    })


@app.route("/api/coop-games")
def api_coop_games():
    """Combined Prisoner's Dilemma + Stag Hunt dashboard data."""
    df = _get_df()

    sid_col = "app_collect_results.1.player.participant_session_id"

    def analyze_game(prefix):
        treat_col = f"{prefix}.1.player.treatment_cond"
        coop_col = f"{prefix}.1.player.cooperate"
        bot_col = f"{prefix}.1.player.is_bot"
        payoff_col = f"{prefix}.1.player.payoff"
        msg_col = f"{prefix}.1.player.messages"
        comp_col = f"{prefix}.1.player.comprehension_passed"
        gpwait_col = f"{prefix}.1.player.time_spent_gpwait"
        instr_col = f"{prefix}.1.player.time_spent_game_instr_page"
        group_col = f"{prefix}.1.player.persistent_group_id"

        # Filter: non-NaN treatment_cond
        played = df[df[treat_col].notna()].copy()
        n = len(played)

        # Treatment distribution
        treat_dist = dict(played[treat_col].value_counts())

        # Overall cooperation
        n_coop = int((played[coop_col] == 1).sum())
        n_defect = int((played[coop_col] == 0).sum())
        coop_rate = round(n_coop / n * 100, 1) if n else 0

        # Cooperation by treatment
        coop_by_treat = {}
        for t, g in played.groupby(treat_col):
            gn = len(g)
            gc = int((g[coop_col] == 1).sum())
            coop_by_treat[t] = {"n": gn, "coop": gc, "pct": round(gc / gn * 100, 1) if gn else 0}

        # Human vs bot cooperation by treatment (for bot conditions)
        human_bot_coop = {}
        for t, g in played.groupby(treat_col):
            if bot_col in g.columns and g[bot_col].sum() > 0:
                humans = g[g[bot_col] == 0]
                bots = g[g[bot_col] == 1]
                hc = int((humans[coop_col] == 1).sum())
                bc = int((bots[coop_col] == 1).sum())
                human_bot_coop[t] = {
                    "human_n": len(humans), "human_coop": hc,
                    "human_pct": round(hc / len(humans) * 100, 1) if len(humans) else 0,
                    "bot_n": len(bots), "bot_coop": bc,
                    "bot_pct": round(bc / len(bots) * 100, 1) if len(bots) else 0,
                }

        # Bots vs humans overall
        n_bots = int((played[bot_col] == 1).sum()) if bot_col in played.columns else 0
        n_humans = n - n_bots

        # Payoff distribution
        payoff_dist = {}
        for t, g in played.groupby(treat_col):
            vals = pd.to_numeric(g[payoff_col], errors="coerce").dropna()
            payoff_dist[t] = {
                "mean": round(vals.mean(), 2) if len(vals) else None,
                "values": dict(vals.value_counts().sort_index()),
            }

        # Messages: how many non-empty per condition
        msg_by_treat = {}
        if msg_col in played.columns:
            for t, g in played.groupby(treat_col):
                msgs = g[msg_col].fillna("")
                non_empty = int((msgs.str.len() > 0).sum())
                msg_by_treat[t] = {"n": len(g), "with_msg": non_empty}

        # Comprehension
        comp_passed = int((played[comp_col] == 1).sum()) if comp_col in played.columns else n
        comp_failed = n - comp_passed

        # Per-session
        per_session = []
        for sid in sorted(played[sid_col].dropna().unique()):
            s = played[played[sid_col] == sid]
            sn = len(s)
            sc = int((s[coop_col] == 1).sum())
            treats = {}
            for t, g in s.groupby(treat_col):
                gn = len(g)
                gc = int((g[coop_col] == 1).sum())
                treats[t] = {"n": gn, "coop": gc, "pct": round(gc / gn * 100, 1) if gn else 0}
            per_session.append({
                "session_id": sid,
                "n": sn,
                "coop": sc,
                "coop_pct": round(sc / sn * 100, 1) if sn else 0,
                "treatments": treats,
            })

        return {
            "n": n,
            "n_humans": n_humans,
            "n_bots": n_bots,
            "n_coop": n_coop,
            "n_defect": n_defect,
            "coop_rate": coop_rate,
            "treat_dist": treat_dist,
            "coop_by_treat": coop_by_treat,
            "human_bot_coop": human_bot_coop,
            "payoff_dist": payoff_dist,
            "msg_by_treat": msg_by_treat,
            "comp_passed": comp_passed,
            "comp_failed": comp_failed,
            "per_session": per_session,
        }

    prisoner = analyze_game("app_prisoner")
    stag = analyze_game("app_stag")

    # Condition labels for the frontend
    cond_labels = {
        "condition_1": "Human + Chat",
        "condition_2": "Human + No Chat",
        "condition_3": "Bot + No Chat",
        "condition_4": "Bot + Chat",
    }

    return to_json({
        "prisoner": prisoner,
        "stag": stag,
        "cond_labels": cond_labels,
    })


@app.route("/api/session-diagnostic")
def api_session_diagnostic():
    """Detect mistyped session letters using time-bucket analysis."""
    df = _get_df()

    letter_col = "app_consent_consolidated.1.player.session_letter"
    sid_col = "app_collect_results.1.player.participant_session_id"
    time_col = "participant.time_started_utc"
    code_col = "participant.code"
    color_col = "participant.signal_color_choice"
    group_col = "signal_game.1.group.id_in_subsession"
    treat_col = "signal_game.1.group.treatment"

    has_letter = df[df[letter_col].notna()].copy()
    has_letter["_time"] = pd.to_datetime(has_letter[time_col], utc=True).dt.tz_convert("America/New_York")
    has_letter["_bucket"] = has_letter["_time"].dt.floor("10min")

    canonical = {}
    for letter in sorted(has_letter[letter_col].unique()):
        s = has_letter[has_letter[letter_col] == letter]
        mode_bucket = s["_bucket"].mode().iloc[0]
        canonical[letter] = mode_bucket + pd.Timedelta(minutes=5)

    sessions = []
    for letter in sorted(has_letter[letter_col].unique()):
        s = has_letter[has_letter[letter_col] == letter]
        completed = s[s[sid_col].notna()]
        played_signal = s[s[color_col].notna()]
        n_total = len(s)
        n_completed = len(completed)
        n_signal = len(played_signal)
        sessions.append({
            "letter": letter,
            "canonical_bucket": canonical[letter].strftime("%Y-%m-%d %H:%M"),
            "n_total": n_total,
            "n_completed": n_completed,
            "n_signal_played": n_signal,
            "mod7": n_signal % 7,
            "time_min": s["_time"].min().strftime("%Y-%m-%d %H:%M:%S"),
            "time_max": s["_time"].max().strftime("%Y-%m-%d %H:%M:%S"),
        })

    flagged = []
    letters = sorted(canonical.keys())
    for _, row in has_letter.iterrows():
        typed = row[letter_col]
        t = row["_time"]
        if pd.isna(t):
            continue
        dists = {l: abs((t - canonical[l]).total_seconds()) for l in letters}
        nearest = min(dists, key=dists.get)
        if nearest != typed:
            flagged.append({
                "code": row[code_col],
                "typed_letter": typed,
                "suggested_letter": nearest,
                "timestamp": t.strftime("%Y-%m-%d %H:%M:%S"),
                "dist_own_min": round(dists[typed] / 60, 1),
                "dist_suggested_min": round(dists[nearest] / 60, 1),
            })

    played = has_letter[has_letter[group_col].notna()].copy()
    split_groups = []
    if len(played):
        for gid, g in played.groupby(group_col):
            letter_counts = g[letter_col].value_counts().to_dict()
            if len(letter_counts) > 1:
                treat_val = g[treat_col].iloc[0]
                split_groups.append({
                    "group_id": int(gid),
                    "treatment": int(treat_val) if pd.notna(treat_val) else None,
                    "size": len(g),
                    "letters": {str(k): int(v) for k, v in letter_counts.items()},
                })

    bucket_detail = {}
    for letter in sorted(has_letter[letter_col].unique()):
        s = has_letter[has_letter[letter_col] == letter]
        counts = s["_bucket"].value_counts().sort_index()
        bucket_detail[letter] = [
            {"bucket": b.strftime("%Y-%m-%d %H:%M"), "count": int(c)}
            for b, c in counts.items()
        ]

    return to_json({
        "sessions": sessions,
        "flagged": flagged,
        "split_groups": split_groups,
        "bucket_detail": bucket_detail,
        "canonical": {l: v.strftime("%Y-%m-%d %H:%M") for l, v in canonical.items()},
    })


@app.route("/api/columns")
def api_columns():
    df = _get_df()
    groups = {}
    for c in df.columns:
        prefix = c.split(".")[0]
        groups.setdefault(prefix, []).append(c)
    return to_json(groups)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)