Spaces:

apjanco
/

penelope

Running

App Files Files Community

penelope / app.py

apjanco

Upload folder using huggingface_hub

76944c6 verified 3 months ago

raw

history blame contribute delete

20.6 kB

	#!/usr/bin/env python3
	"""Penelope — Streamlit app for comparing SOC analysis results across models.

	Launch: streamlit run app.py

	Deployed on Hugging Face Spaces as a self-contained dashboard.
	The results/ directory contains pre-computed per-model JSON files.
	"""

	from __future__ import annotations

	import json
	import re
	from pathlib import Path

	import pandas as pd
	import streamlit as st

	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------

	# Directory next to this file that holds the pre-computed result JSON files.
	RESULTS_DIR = Path(__file__).parent / "results"

	# Minimum token-overlap ratio for two passages to be considered "matched".
	MIN_OVERLAP_RATIO = 0.35

	# Display names for the raw ``soc_type`` values found in the result files.
	SOC_TYPE_LABELS: dict[str, str] = {
	    "direct_interior_monologue": "Direct Interior Monologue",
	    "indirect_interior_monologue": "Indirect Interior Monologue",
	    "omniscient_description": "Omniscient Description",
	    "soliloquy": "Soliloquy",
	    "free_association": "Free Association",
	    "space_montage": "Space-Montage",
	    "orthographic_marker": "Orthographic Marker",
	    "imagery": "Imagery",
	    "simulation_state_of_mind": "Simulation of State of Mind",
	    "reverie_fantasy": "Reverie / Fantasy",
	    "hybrid": "Hybrid",
	}

	# File stems in results/ that are skipped when loading per-model JSON files.
	_SKIP_STEMS = {
	    "results",
	    "consensus_conservative",
	    "consensus_moderate",
	    "consensus_liberal",
	}

	# ---------------------------------------------------------------------------
	# Data loading
	# ---------------------------------------------------------------------------


	@st.cache_data
	def load_results(results_dir: str | None = None) -> pd.DataFrame:
	    """Load all per-model JSON files from results/ into one DataFrame.

	    Args:
	        results_dir: Directory to scan for ``*.json`` files. ``RESULTS_DIR``
	            is used when this is omitted or empty.

	    Returns:
	        Every loaded record concatenated under a fresh 0..n-1 index, with two
	        derived columns: ``passage_norm`` (the passage after
	        ``_normalise_text``) and ``passage_tokens`` (the set of its
	        whitespace-separated tokens). An empty DataFrame is returned when no
	        file yields any data.
	    """
	    rdir = Path(results_dir) if results_dir else RESULTS_DIR
	    frames: list[pd.DataFrame] = []
	    for path in sorted(rdir.glob("*.json")):
	        # Skip the stems listed in _SKIP_STEMS and any other consensus* file.
	        if path.stem in _SKIP_STEMS or path.stem.startswith("consensus"):
	            continue
	        data = json.loads(path.read_text(encoding="utf-8"))
	        if data:
	            frames.append(pd.DataFrame(data))
	    if not frames:
	        return pd.DataFrame()

	    # ignore_index gives one contiguous index across all models' rows.
	    combined = pd.concat(frames, ignore_index=True)
	    # Normalise whitespace in passages for better matching.
	    combined["passage_norm"] = combined["passage"].apply(_normalise_text)
	    combined["passage_tokens"] = combined["passage_norm"].apply(
	        lambda text: set(text.split())
	    )
	    return combined


	def _normalise_text(text: str) -> str:
	    """Return *text* lowercased, without punctuation, with whitespace collapsed.

	    Every character that is neither a word character nor whitespace is
	    removed, each run of whitespace becomes a single space, and leading and
	    trailing whitespace is stripped.
	    """
	    without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
	    return re.sub(r"\s+", " ", without_punctuation).strip()


	# ---------------------------------------------------------------------------
	# Passage matching — group passages across models that refer to the same text
	# ---------------------------------------------------------------------------


	def _token_overlap(a: set[str], b: set[str]) -> float:
	    """Return the overlap coefficient of two token sets.

	    The number of shared tokens is divided by the size of the *smaller* set
	    (not by the size of the union, as Jaccard similarity would be), so a
	    short passage fully contained in a longer one scores 1.0.

	    Args:
	        a: Tokens of the first passage.
	        b: Tokens of the second passage.

	    Returns:
	        A ratio between 0.0 and 1.0; 0.0 when either set is empty.
	    """
	    if not a or not b:
	        return 0.0
	    # Both sets are non-empty here, so the denominator is at least 1.
	    return len(a & b) / min(len(a), len(b))


	@st.cache_data
	def build_passage_groups(_df_json: str) -> list[dict]:
	    """Cluster passages across models that overlap significantly.

	    Greedy, single pass: within each chunk a passage joins the first group
	    (in creation order) that has at least one member whose token overlap
	    with it is >= MIN_OVERLAP_RATIO; otherwise it starts a new group.
	    Passages from different chunks are never grouped together.

	    NOTE(review): Streamlit's cache decorators do not hash parameters whose
	    names start with an underscore, so the cached result is not keyed on
	    ``_df_json``. Confirm this is intended before calling with varying input.

	    Args:
	        _df_json: A results DataFrame serialised with ``DataFrame.to_json()``.
	            The columns ``passage``, ``model_label``, ``soc_type``,
	            ``chunk_id`` and ``source_file`` are read.

	    Returns:
	        A list of group dicts, multi-model groups first, then ordered by
	        chunk; ``group_id`` is renumbered to match that order::

	            {
	                "group_id": int,
	                "rows": list[int],        # DataFrame index labels
	                "models": list[str],      # distinct, in first-seen order
	                "chunk_id": ...,          # the groupby key of the chunk
	                "source_file": str,       # taken from the group's first row
	                "n_models": int,
	                "representative": str,    # longest passage text
	                "agreement": str,         # "full" / "partial" / "single"
	            }
	    """
	    from io import StringIO

	    df = pd.read_json(StringIO(_df_json), dtype={"chunk_index": int})
	    if df.empty:
	        return []

	    # Token set of every passage, keyed by DataFrame index label.
	    tokens_by_row = {
	        idx: set(_normalise_text(text).split())
	        for idx, text in df["passage"].items()
	    }

	    groups: list[dict] = []

	    # Passages from different chunks can't match, so each chunk only ever
	    # compares against the groups created while processing that same chunk.
	    for chunk_id, chunk_df in df.groupby("chunk_id"):
	        chunk_groups: list[dict] = []
	        for i in chunk_df.index.tolist():
	            tokens_i = tokens_by_row[i]
	            model = df.at[i, "model_label"]

	            # First group with any sufficiently overlapping member wins.
	            matched_group = next(
	                (
	                    g
	                    for g in chunk_groups
	                    if any(
	                        _token_overlap(tokens_i, tokens_by_row[member])
	                        >= MIN_OVERLAP_RATIO
	                        for member in g["rows"]
	                    )
	                ),
	                None,
	            )

	            if matched_group is None:
	                new_group = {
	                    "group_id": len(groups),
	                    "rows": [i],
	                    "models": [model],
	                    "chunk_id": chunk_id,
	                    "source_file": df.at[i, "source_file"],
	                }
	                groups.append(new_group)
	                chunk_groups.append(new_group)
	            else:
	                matched_group["rows"].append(i)
	                if model not in matched_group["models"]:
	                    matched_group["models"].append(model)

	    # Enrich groups with summary fields.
	    for g in groups:
	        g["n_models"] = len(set(g["models"]))
	        g["representative"] = max(
	            (df.at[idx, "passage"] for idx in g["rows"]), key=len
	        )
	        types_in_group = {df.at[idx, "soc_type"] for idx in g["rows"]}
	        if g["n_models"] == 1:
	            g["agreement"] = "single"
	        elif len(types_in_group) == 1:
	            g["agreement"] = "full"
	        else:
	            g["agreement"] = "partial"

	    # Sort: multi-model groups first, then by chunk, then by creation order.
	    groups.sort(key=lambda g: (-g["n_models"], g["chunk_id"], g["group_id"]))
	    # Re-number so group_id follows the final order.
	    for position, g in enumerate(groups):
	        g["group_id"] = position

	    return groups


	# ---------------------------------------------------------------------------
	# Streamlit UI
	# ---------------------------------------------------------------------------

	def main() -> None:
	    """Build the whole page: load results, apply sidebar filters, draw the tabs.

	    Stops the script run early (``st.stop()``) when no result files are found
	    or when the sidebar filters leave no rows.
	    """
	    st.set_page_config(
	        page_title="Penelope — SOC Model Comparison",
	        page_icon="🧶",
	        layout="wide",
	    )
	    st.title("🧶 Penelope — SOC Model Comparison")
	    st.caption(
	        "Compare how different LLMs detect stream of consciousness in literary texts. "
	        "[GitHub](https://github.com/apjanco/penelope)"
	    )

	    # Load data
	    df = load_results()
	    if df.empty:
	        st.error(f"No result JSON files found in `{RESULTS_DIR}/`.")
	        st.stop()

	    all_models = sorted(df["model_label"].unique())
	    all_files = sorted(df["source_file"].unique())

	    # --- Sidebar filters ---
	    st.sidebar.header("Filters")
	    sel_files = st.sidebar.multiselect(
	        "Source files", all_files, default=all_files
	    )
	    sel_models = st.sidebar.multiselect(
	        "Models", all_models, default=all_models
	    )
	    if len(all_models) > 1:
	        min_models = st.sidebar.slider(
	            "Min models marking passage", 1, len(all_models), 2,
	            help="Show only passage groups identified by at least N models",
	        )
	    else:
	        # With a single model the slider would get max 1 and default 2, which
	        # Streamlit rejects; there is nothing to choose, so fix the value.
	        min_models = 1

	    mask = df["source_file"].isin(sel_files) & df["model_label"].isin(sel_models)
	    filtered = df[mask].copy()

	    if filtered.empty:
	        st.warning("No data matches the current filters.")
	        st.stop()

	    tab_overview, tab_compare, tab_detail, tab_data = st.tabs([
	        "📊 Overview", "🔍 Passage Comparison", "📖 Detail View", "📋 Raw Data"
	    ])

	    with tab_overview:
	        _render_overview(filtered, all_models)

	    # The comparison tab also receives the unfiltered frame so that passage
	    # grouping is computed over every model.
	    with tab_compare:
	        _render_comparison(df, filtered, all_models, sel_models, min_models)

	    with tab_detail:
	        _render_detail(filtered, all_models)

	    with tab_data:
	        _render_raw_data(filtered)


	# ---------------------------------------------------------------------------
	# Tab: Overview
	# ---------------------------------------------------------------------------

	def _render_overview(df: pd.DataFrame, all_models: list[str]) -> None:
	    """Draw the Overview tab: headline metrics, count charts and chunk coverage.

	    Args:
	        df: Annotation rows to summarise.
	        all_models: Not used by this function.
	    """
	    st.header("Overview")

	    # KPI cards, one per column.
	    kpi_columns = st.columns(4)
	    kpis = [
	        ("Total instances", len(df)),
	        ("Models", df["model_label"].nunique()),
	        ("Source files", df["source_file"].nunique()),
	        ("Chunks covered", df["chunk_id"].nunique()),
	    ]
	    for column, (label, value) in zip(kpi_columns, kpis):
	        column.metric(label, value)

	    def counts_by_model(field: str) -> pd.DataFrame:
	        """Count rows per (model, field value); rows = field values, columns = models."""
	        long_form = (
	            df.groupby(["model_label", field]).size().reset_index(name="count")
	        )
	        wide = long_form.pivot(index=field, columns="model_label", values="count")
	        return wide.fillna(0)

	    st.subheader("Instances per model")
	    per_model = df.groupby("model_label").size().reset_index(name="count")
	    st.bar_chart(per_model.set_index("model_label")["count"])

	    st.subheader("SOC type distribution by model")
	    st.bar_chart(counts_by_model("soc_type"))

	    st.subheader("Confidence breakdown")
	    conf_pivot = counts_by_model("confidence")
	    levels = ["high", "medium", "low"]
	    # Make sure all three levels exist, then keep only them, in this order.
	    for level in levels:
	        if level not in conf_pivot.index:
	            conf_pivot.loc[level] = 0
	    conf_pivot = conf_pivot.loc[
	        [level for level in levels if level in conf_pivot.index]
	    ]
	    st.bar_chart(conf_pivot)

	    # Coverage table: how many instances each model found in each chunk.
	    st.subheader("Chunk coverage by model")
	    per_chunk = (
	        df.groupby(["chunk_id", "model_label"]).size().reset_index(name="instances")
	    )
	    cov_pivot = (
	        per_chunk.pivot(index="chunk_id", columns="model_label", values="instances")
	        .fillna(0)
	        .sort_index()
	    )
	    # Grow with the number of chunks, capped at 600 px.
	    table_height = min(len(cov_pivot) * 35 + 50, 600)
	    st.dataframe(
	        cov_pivot.style.background_gradient(cmap="YlOrRd", axis=None),
	        use_container_width=True,
	        height=table_height,
	    )


	# ---------------------------------------------------------------------------
	# Tab: Passage Comparison
	# ---------------------------------------------------------------------------

	def _render_comparison(
	    full_df: pd.DataFrame,
	    filtered: pd.DataFrame,
	    all_models: list[str],
	    sel_models: list[str],
	    min_models: int,
	) -> None:
	    """Draw the Passage Comparison tab.

	    Args:
	        full_df: Unfiltered results; passage groups are built from it and it
	            is passed on to ``_render_group`` for row lookups.
	        filtered: Rows left after the sidebar filters; only its
	            ``source_file`` values are used here.
	        all_models: Not used by this function.
	        sel_models: Models currently selected in the sidebar.
	        min_models: Minimum ``n_models`` a group needs to be listed. Note that
	            ``n_models`` counts every model in the group, selected or not.
	    """
	    st.header("Passage Comparison")
	    st.caption(
	        "Passages from different models are grouped when they share significant "
	        f"token overlap (≥{MIN_OVERLAP_RATIO:.0%} of the shorter passage). This catches near-identical "
	        "quotes as well as passages where models quoted slightly different spans."
	    )

	    # Build groups from the full dataset (so matching works across all models).
	    # The token column is dropped before serialising to JSON.
	    groups = build_passage_groups(full_df.drop(columns=["passage_tokens"]).to_json())

	    # Sets built once, so each group's membership test is O(1).
	    selected = set(sel_models)
	    visible_files = set(filtered["source_file"])
	    visible_groups = [
	        g for g in groups
	        if g["n_models"] >= min_models
	        and not selected.isdisjoint(g["models"])
	        and g["source_file"] in visible_files
	    ]

	    if not visible_groups:
	        st.info("No passage groups match the current filters. Try lowering the minimum models slider.")
	        return

	    # Summary metrics
	    c1, c2, c3, c4 = st.columns(4)
	    multi = [g for g in visible_groups if g["n_models"] > 1]
	    full_agree = [g for g in multi if g["agreement"] == "full"]
	    partial = [g for g in multi if g["agreement"] == "partial"]
	    c1.metric("Passage groups", len(visible_groups))
	    c2.metric("Multi-model groups", len(multi))
	    c3.metric("Full type agreement", len(full_agree))
	    c4.metric("Partial agreement", len(partial))

	    # Agreement filter: map the radio label to the stored agreement value.
	    agree_filter = st.radio(
	        "Show", ["All", "Full agreement", "Partial agreement", "Single model"],
	        horizontal=True,
	    )
	    wanted_agreement = {
	        "Full agreement": "full",
	        "Partial agreement": "partial",
	        "Single model": "single",
	    }.get(agree_filter)
	    if wanted_agreement is not None:
	        visible_groups = [
	            g for g in visible_groups if g["agreement"] == wanted_agreement
	        ]

	    st.divider()

	    # Render each group
	    for g in visible_groups:
	        _render_group(g, full_df, sel_models)


	def _format_table_cell(val: object, fmt=None, max_len: int = 300) -> str:
	    """Turn one field value into text that is safe inside a markdown table cell.

	    Missing values (None, NaN, empty string) become an em dash. Lists and
	    tuples are joined with ", " instead of being passed to ``pd.isna``, which
	    returns an array for them and cannot be used in a boolean test.
	    """
	    if isinstance(val, (list, tuple)):
	        text = ", ".join(str(item) for item in val)
	    elif pd.api.types.is_scalar(val) and pd.isna(val):
	        text = ""
	    else:
	        text = str(val)
	    if text == "":
	        return "—"
	    if fmt:
	        text = fmt(text)
	    # Escape pipes and collapse newlines so the cell cannot break the table.
	    text = text.replace("|", "\\|").replace("\n", " ")
	    # Truncate very long cells to keep the table readable.
	    if len(text) > max_len:
	        text = text[: max_len - 3] + "…"
	    return text


	def _render_group(group: dict, df: pd.DataFrame, sel_models: list[str]) -> None:
	    """Render one passage group as an expandable card.

	    Args:
	        group: A group dict as produced by ``build_passage_groups``; its
	            ``rows`` entries are index labels of ``df``.
	        df: The DataFrame the group's row labels refer to.
	        sel_models: Only annotations from these models are displayed.
	    """
	    badge = {
	        "full": "🟢 Full agreement",
	        "partial": "🟡 Partial agreement",
	    }.get(group["agreement"], "⚪ Single model")

	    # Collect each selected model's annotation rows, in group order.
	    rows_by_model: dict[str, list[int]] = {}
	    for idx in group["rows"]:
	        model = df.at[idx, "model_label"]
	        if model in sel_models:
	            rows_by_model.setdefault(model, []).append(idx)

	    # Sorted so the expander label is the same on every rerun.
	    types_in_group = sorted(
	        {
	            str(df.at[idx, "soc_type"])
	            for idxs in rows_by_model.values()
	            for idx in idxs
	        }
	    )

	    representative = group["representative"]
	    preview = representative[:120] + ("…" if len(representative) > 120 else "")
	    header = (
	        f"{badge} | {group['n_models']} model(s) | "
	        f"{', '.join(types_in_group)} | `{group['chunk_id']}`"
	    )

	    with st.expander(f"{preview}\n\n{header}", expanded=False):
	        if not rows_by_model:
	            # Without any model column the markdown table would be malformed.
	            st.caption("None of the selected models annotated this passage.")
	            return

	        # Comparison table: rows = fields, columns = models.
	        models_ordered = sorted(rows_by_model)

	        # Some models may have multiple matches in this group; show the
	        # first one in the main table and note extras below.
	        primary_rows = {m: df.loc[idxs[0]] for m, idxs in rows_by_model.items()}
	        extra_idxs = {
	            m: idxs[1:] for m, idxs in rows_by_model.items() if len(idxs) > 1
	        }

	        fields = [
	            ("SOC Type", "soc_type", lambda v: SOC_TYPE_LABELS.get(v, v)),
	            ("Confidence", "confidence", None),
	            ("Narrator Position", "narrator_position", None),
	            ("Character POV", "character_pov", None),
	            ("Secondary Devices", "secondary_devices", None),
	            ("Affective Register", "affective_register", None),
	            ("Passage", "passage", None),
	            ("Explanation", "explanation", None),
	            ("Evidence", "evidence", None),
	            ("Notes", "notes", None),
	        ]

	        # Build the markdown table.
	        header_row = "| Field | " + " | ".join(f"{m}" for m in models_ordered) + " |"
	        sep_row = "|---|" + "|".join("---" for _ in models_ordered) + "|"
	        table_rows = [header_row, sep_row]
	        for label, key, fmt in fields:
	            cells = [
	                _format_table_cell(primary_rows[m].get(key, ""), fmt)
	                for m in models_ordered
	            ]
	            table_rows.append(f"| {label} | " + " | ".join(cells) + " |")

	        st.markdown("\n".join(table_rows))

	        # If any model had multiple matches, show them below.
	        if extra_idxs:
	            st.markdown("---")
	            st.caption("Additional matches within this group:")
	            for m, idxs in sorted(extra_idxs.items()):
	                for idx in idxs:
	                    row = df.loc[idx]
	                    soc_label = SOC_TYPE_LABELS.get(row["soc_type"], row["soc_type"])
	                    passage = str(row["passage"])
	                    # Only mark the snippet as truncated when it really is.
	                    snippet = passage[:100] + ("…" if len(passage) > 100 else "")
	                    st.caption(
	                        f"{m} — {soc_label} ({row['confidence']}) — "
	                        f"{snippet}"
	                    )


	# ---------------------------------------------------------------------------
	# Tab: Detail View
	# ---------------------------------------------------------------------------

	def _has_value(val: object) -> bool:
	    """Return True when *val* holds something to display (not None/NaN/empty)."""
	    if pd.api.types.is_scalar(val):
	        # NaN is truthy in Python, so a plain ``if val`` would let it through.
	        return not pd.isna(val) and val != ""
	    try:
	        return len(val) > 0
	    except TypeError:
	        return True


	def _render_detail(df: pd.DataFrame, all_models: list[str]) -> None:
	    """Draw the Detail View tab: every annotation of one chunk, grouped by model.

	    Args:
	        df: Annotation rows to browse.
	        all_models: Not used by this function.
	    """
	    st.header("Detail View")
	    st.caption("Browse individual passages. Select a chunk to see all annotations.")

	    chunks = sorted(df["chunk_id"].unique())
	    sel_chunk = st.selectbox("Chunk", chunks)

	    chunk_df = df[df["chunk_id"] == sel_chunk]
	    if chunk_df.empty:
	        st.info("No annotations for this chunk.")
	        return

	    st.subheader(f"Chunk: {sel_chunk}")
	    first_row = chunk_df.iloc[0]
	    chunk_label = first_row.get("chunk_label", "")
	    if not _has_value(chunk_label):
	        chunk_label = ""
	    st.caption(f"Source: {first_row['source_file']} | Label: {chunk_label}")

	    # Group by model
	    for model in sorted(chunk_df["model_label"].unique()):
	        model_df = chunk_df[chunk_df["model_label"] == model]
	        st.markdown(f"### {model} ({len(model_df)} instances)")

	        for _, row in model_df.iterrows():
	            soc_label = SOC_TYPE_LABELS.get(row["soc_type"], row["soc_type"])
	            passage = str(row["passage"])
	            # Only mark the label as truncated when it really is.
	            snippet = passage[:80] + ("…" if len(passage) > 80 else "")

	            narrator = row.get("narrator_position")
	            character = row.get("character_pov")
	            devices = row.get("secondary_devices")
	            register = row.get("affective_register")
	            explanation = row.get("explanation")
	            evidence = row.get("evidence")
	            notes = row.get("notes")

	            with st.expander(
	                f"{soc_label} — {row['confidence']} confidence — "
	                f"{snippet}"
	            ):
	                st.markdown(f"Passage:\n> {row['passage']}")
	                st.markdown(f"SOC Type: {soc_label}")
	                st.markdown(f"Confidence: {row['confidence']}")
	                st.markdown(
	                    f"Narrator position: {narrator if _has_value(narrator) else 'n/a'}"
	                )
	                st.markdown(
	                    f"Character POV: {character if _has_value(character) else 'n/a'}"
	                )
	                if _has_value(devices):
	                    st.markdown(f"Secondary devices: {devices}")
	                if _has_value(register) and register != "n/a":
	                    st.markdown(f"Affective register: {register}")
	                st.markdown(
	                    f"Explanation: {explanation if _has_value(explanation) else 'n/a'}"
	                )
	                if _has_value(evidence):
	                    st.markdown(f"Evidence: {evidence}")
	                if _has_value(notes):
	                    st.markdown(f"Notes: {notes}")


	# ---------------------------------------------------------------------------
	# Tab: Raw Data
	# ---------------------------------------------------------------------------

	def _render_raw_data(df: pd.DataFrame) -> None:
	    """Draw the Raw Data tab: a table of the rows plus a CSV download button.

	    Only the columns named below that are present in ``df`` are shown and
	    exported, in the order listed.
	    """
	    st.header("Raw Data")

	    preferred_columns = (
	        "model_label", "source_file", "chunk_id", "chunk_label",
	        "passage", "soc_type", "secondary_devices", "confidence",
	        "narrator_position", "character_pov", "explanation",
	        "evidence", "notes",
	    )
	    present = [name for name in preferred_columns if name in df.columns]
	    table = df[present]

	    st.dataframe(table, use_container_width=True, height=600)

	    # The download carries exactly the columns shown in the table.
	    st.download_button(
	        "⬇ Download filtered data as CSV",
	        table.to_csv(index=False),
	        file_name="penelope_filtered.csv",
	        mime="text/csv",
	    )


	# Script entry point: build the page only when this file is run directly
	# (e.g. ``streamlit run app.py``), not when it is imported.
	if __name__ == "__main__":
	    main()