Spaces:

caisdev
/

esfiles

Sleeping

esfiles/frontend/src/components/AnomalyPanel.tsx

Besjon Cifliku

feat: implement anomaly detection to filter suspicious word relations

9c3ade2 about 1 month ago

13 kB

	import { useState, useEffect } from "react";
	import { api } from "../api";
	import type {
	BackgroundStatus, AnomalySweepResponse, AnomalyRelationResponse, IncongruenceResponse,
	} from "../types";
	import { useApiCall } from "../hooks/useApiCall";
	import ScoreBar from "./ScoreBar";
	import StatusMessage from "./StatusMessage";

	/** Allowed ranges for the Stage A sweep parameters; shared by the inputs and the request. */
	const SWEEP_LIMITS = {
	  minCount: { min: 1, max: 1000 },
	  neighbours: { min: 5, max: 100 },
	  topN: { min: 1, max: 200 },
	};

	/** Rounds `value` to an integer inside [min, max]; non-finite input falls back to `min`. */
	function clampInt(value: number, min: number, max: number): number {
	  if (!Number.isFinite(value)) return min;
	  return Math.min(max, Math.max(min, Math.round(value)));
	}

	/**
	 * Anomaly-detection panel with three stages:
	 *  - Stage A: sweep the corpus (`api.analyzeAnomalies`) and list flagged words.
	 *  - Stage B: drill into one flagged word (`api.analyzeAnomalyRelations`).
	 *  - Stage C: rank occurrences of a keyword by incongruence (`api.analyzeIncongruence`).
	 *
	 * It also shows the background-model status and offers a button to load the model
	 * when it is not ready. Takes no props; all state is local.
	 */
	export default function AnomalyPanel() {
	  const [bg, setBg] = useState<BackgroundStatus | null>(null);
	  const [bgLoading, setBgLoading] = useState(false);
	  const [bgError, setBgError] = useState("");

	  // Stage A — corpus sweep
	  const [showAdvanced, setShowAdvanced] = useState(false);
	  const [minCount, setMinCount] = useState(5);
	  const [neighbours, setNeighbours] = useState(25);
	  const [topN, setTopN] = useState(30);
	  const sweep = useApiCall<AnomalySweepResponse>();

	  // Stage B — per-word relations
	  const [selectedWord, setSelectedWord] = useState<string | null>(null);
	  const relations = useApiCall<AnomalyRelationResponse>();

	  // Stage C — contextual incongruence (zoom in)
	  const [keyword, setKeyword] = useState("");
	  const [canonical, setCanonical] = useState("");
	  const incong = useApiCall<IncongruenceResponse>();

	  // Fetch the background-model status once on mount. Best-effort: on failure the
	  // status badge is simply not rendered.
	  useEffect(() => {
	    api.backgroundStatus().then(setBg).catch(() => {});
	  }, []);

	  /** Asks the backend to load the background model and stores the returned status. */
	  async function loadBackground() {
	    setBgLoading(true);
	    setBgError("");
	    try {
	      setBg(await api.backgroundLoad());
	    } catch {
	      setBgError("Background model failed to load (network/disk). Anomaly detection needs it.");
	    } finally {
	      setBgLoading(false);
	    }
	  }

	  /** Runs the Stage A sweep and resets any Stage B drilldown. */
	  async function runSweep() {
	    // The number inputs only hint at min/max: a cleared field yields 0 and typed
	    // values can exceed the range. Clamp here so the request is always in range,
	    // and write the values back so the form shows what was actually sent.
	    const params = {
	      min_count: clampInt(minCount, SWEEP_LIMITS.minCount.min, SWEEP_LIMITS.minCount.max),
	      neighbours: clampInt(neighbours, SWEEP_LIMITS.neighbours.min, SWEEP_LIMITS.neighbours.max),
	      top_n: clampInt(topN, SWEEP_LIMITS.topN.min, SWEEP_LIMITS.topN.max),
	    };
	    setMinCount(params.min_count);
	    setNeighbours(params.neighbours);
	    setTopN(params.top_n);

	    setSelectedWord(null);
	    relations.clear();
	    const res = await sweep.run(() => api.analyzeAnomalies(params));
	    // If the status was "not ready" before a successful sweep, re-query it.
	    if (res && !bg?.ready) api.backgroundStatus().then(setBg).catch(() => {});
	  }

	  /** Selects `word` and loads its Stage B relations. */
	  async function drillInto(word: string) {
	    setSelectedWord(word);
	    // Same reset runSweep performs. NOTE(review): assumes `run` may keep the previous
	    // result until the new one arrives — confirm against useApiCall.
	    relations.clear();
	    await relations.run(() => api.analyzeAnomalyRelations({ word, top_k: 15 }));
	  }

	  /**
	   * Runs the Stage C incongruence analysis for `word` and scrolls to its section.
	   *
	   * @param word  Keyword to analyse; also written into the keyword input.
	   * @param gloss Optional canonical meaning. When provided (even as ""), it replaces
	   *              the canonical-meaning input; a blank value is sent as `undefined`.
	   */
	  async function zoomIn(word: string, gloss?: string) {
	    // Ignore empty keywords and re-entrant calls; the Zoom button is disabled in
	    // both cases, and the Enter-key path should behave the same way.
	    if (!word || incong.loading) return;
	    setKeyword(word);
	    if (gloss !== undefined) setCanonical(gloss);
	    // A whitespace-only meaning counts as "not supplied".
	    const meaning = gloss?.trim() || undefined;
	    await incong.run(() =>
	      api.analyzeIncongruence({ keyword: word, canonical_meaning: meaning, top_k: 10 }));
	    document.getElementById("zoom-section")?.scrollIntoView({ behavior: "smooth" });
	  }

	  /** Enter in either Stage C input submits the zoom request. */
	  function submitZoomOnEnter(e: { key: string }) {
	    const word = keyword.trim();
	    if (e.key === "Enter" && word) void zoomIn(word, canonical);
	  }

	  const bgReady = bg?.ready ?? false;
	  // Empty when there is no result or the field is missing.
	  const normalNeighbours = relations.data?.normal_neighbors ?? [];

	  return (
	    <div>
	      {/* Background model status */}
	      <div className="panel">
	        <h2>Anomalous Relations</h2>
	        <p className="panel-desc">
	          Find <strong>code-word candidates</strong>: common English words that behave uncommonly
	          in this corpus. We contrast each word's neighbours in the corpus Word2Vec against a
	          pretrained general-English model (GloVe). A relation is flagged when it is{" "}
	          <em>strong here but weak/absent in normal English</em> — not merely "low similarity".
	        </p>
	        {bg && (
	          <div className="flex-row" style={{ alignItems: "center", gap: 8 }}>
	            <span
	              className="badge"
	              style={{
	                background: `rgba(${bgReady ? "74, 222, 128" : "255, 170, 0"}, 0.15)`,
	                color: bgReady ? "var(--ok)" : "var(--accent)",
	              }}
	            >
	              {bg.model_name}: {bgReady ? `ready (${bg.vocab_size.toLocaleString()} words)` : "not loaded"}
	            </span>
	            {!bgReady && (
	              <button className="btn" onClick={loadBackground} disabled={bgLoading}>
	                {bgLoading ? <><span className="spinner" /> Downloading…</> : "Load background model"}
	              </button>
	            )}
	          </div>
	        )}
	        {bgError && <div className="mt-2"><StatusMessage type="err" message={bgError} /></div>}
	      </div>

	      {/* Stage A — corpus sweep */}
	      <div className="panel">
	        <h3 style={{ marginTop: 0 }}>1 · Scan corpus for anomalous words</h3>
	        <p className="panel-desc">
	          Ranks words by neighbour-set divergence (z-scored across the vocabulary). Higher z = the
	          word's corpus associations look more unlike general English.
	        </p>

	        <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
	          {showAdvanced ? "▾" : "▸"} Advanced Settings
	        </button>
	        {showAdvanced && (
	          <div className="advanced-section">
	            <div className="form-row">
	              <div className="form-group" style={{ maxWidth: 130 }}>
	                <label>Min corpus freq</label>
	                <input
	                  type="number"
	                  value={minCount}
	                  onChange={e => setMinCount(+e.target.value)}
	                  min={SWEEP_LIMITS.minCount.min}
	                  max={SWEEP_LIMITS.minCount.max}
	                />
	              </div>
	              <div className="form-group" style={{ maxWidth: 130 }}>
	                <label>Neighbours (k)</label>
	                <input
	                  type="number"
	                  value={neighbours}
	                  onChange={e => setNeighbours(+e.target.value)}
	                  min={SWEEP_LIMITS.neighbours.min}
	                  max={SWEEP_LIMITS.neighbours.max}
	                />
	              </div>
	              <div className="form-group" style={{ maxWidth: 130 }}>
	                <label>Top N results</label>
	                <input
	                  type="number"
	                  value={topN}
	                  onChange={e => setTopN(+e.target.value)}
	                  min={SWEEP_LIMITS.topN.min}
	                  max={SWEEP_LIMITS.topN.max}
	                />
	              </div>
	            </div>
	          </div>
	        )}

	        <button className="btn btn-primary" onClick={runSweep} disabled={sweep.loading} style={{ marginTop: 8 }}>
	          {sweep.loading ? <><span className="spinner" /> Scanning…</> : "Scan corpus"}
	        </button>

	        {sweep.error && <div className="mt-2"><StatusMessage type="err" message={sweep.error} /></div>}
	        {sweep.data?.note && <div className="mt-2"><StatusMessage type="err" message={sweep.data.note} /></div>}

	        {sweep.data && sweep.data.results.length > 0 && (
	          <div className="mt-2">
	            <div className="section-label">
	              {sweep.data.results.length} flagged · shared vocab {sweep.data.vocab_size.toLocaleString()} ·
	              mean shift {sweep.data.shift_mean}
	            </div>
	            <table className="data-table">
	              <thead>
	                <tr>
	                  <th>Word</th><th>Freq</th><th>z</th>
	                  <th>Surprising neighbours (here, not normal)</th><th></th>
	                </tr>
	              </thead>
	              <tbody>
	                {sweep.data.results.map((r) => (
	                  <tr
	                    key={r.word}
	                    onClick={() => drillInto(r.word)}
	                    style={{ cursor: "pointer", background: selectedWord === r.word ? "rgba(108,140,255,0.08)" : undefined }}
	                  >
	                    <td style={{ fontWeight: 600 }}>{r.word}</td>
	                    <td>{r.corpus_frequency}</td>
	                    <td>
	                      {/* z >= 2 is highlighted with the error colour */}
	                      <span className="badge" style={{
	                        background: `rgba(${r.z_score >= 2 ? "255,107,107" : "108,140,255"},0.15)`,
	                        color: r.z_score >= 2 ? "var(--err)" : "var(--accent)",
	                      }}>{r.z_score.toFixed(2)}</span>
	                    </td>
	                    <td style={{ fontSize: "0.85rem" }}>{r.surprising_neighbors.join(", ") || "—"}</td>
	                    <td style={{ color: "var(--accent)", fontSize: "0.8rem" }}>inspect →</td>
	                  </tr>
	                ))}
	              </tbody>
	            </table>
	          </div>
	        )}
	      </div>

	      {/* Stage B — per-word relations drilldown */}
	      {selectedWord && (
	        <div className="panel">
	          <h3 style={{ marginTop: 0 }}>2 · Relations for "{selectedWord}"</h3>
	          {relations.loading && <StatusMessage type="loading" message="Computing relations…" />}
	          {relations.error && <StatusMessage type="err" message={relations.error} />}
	          {relations.data && !relations.data.found && (
	            <StatusMessage type="err" message={`"${selectedWord}" — ${relations.data.reason}.`} />
	          )}
	          {relations.data?.found && (
	            <>
	              <p className="panel-desc">
	                Surprise = (how strongly tied here) − (how strongly tied in general English), each
	                standardised within its own space. High surprise = the suspicious pairing.
	              </p>
	              <table className="data-table">
	                <thead>
	                  <tr><th>Neighbour</th><th>Surprise</th><th>Corpus sim</th><th>Normal-English sim</th></tr>
	                </thead>
	                <tbody>
	                  {relations.data.relations.map((rel) => (
	                    <tr key={rel.neighbor}>
	                      <td style={{ fontWeight: 600 }}>{rel.neighbor}</td>
	                      <td><ScoreBar score={rel.surprise} max={4} /></td>
	                      <td>{rel.corpus_sim.toFixed(3)}</td>
	                      <td>{rel.background_sim.toFixed(3)}</td>
	                    </tr>
	                  ))}
	                </tbody>
	              </table>
	              {/* Hidden when the list is empty so no dangling label is shown. */}
	              {normalNeighbours.length > 0 && (
	                <div className="mt-2">
	                  <div className="section-label">For contrast — "{selectedWord}" normally relates to:</div>
	                  <div style={{ fontSize: "0.85rem", color: "var(--muted)" }}>
	                    {normalNeighbours.map(n => n.neighbor).join(", ")}
	                  </div>
	                </div>
	              )}
	              {/* Passing "" clears any canonical meaning left over from a previous zoom. */}
	              <button
	                className="btn btn-primary mt-2"
	                disabled={incong.loading}
	                onClick={() => zoomIn(selectedWord, "")}
	              >
	                Zoom in on occurrences →
	              </button>
	            </>
	          )}
	        </div>
	      )}

	      {/* Stage C — contextual incongruence */}
	      <div className="panel" id="zoom-section">
	        <h3 style={{ marginTop: 0 }}>3 · Zoom in — incongruent occurrences</h3>
	        <p className="panel-desc">
	          Uses the transformer to rank each occurrence of a keyword by how unlike its norm it is.
	          Leave the meaning blank to compare against the keyword's <em>typical</em> usage in this
	          corpus, or supply a dictionary meaning (e.g. "pizza, an Italian food") to flag usages that
	          drift from it. Highest-incongruence chunks are the candidate coded usages.
	        </p>
	        <div className="form-row">
	          <div className="form-group">
	            <label>Keyword</label>
	            <input
	              value={keyword}
	              onChange={e => setKeyword(e.target.value)}
	              onKeyDown={submitZoomOnEnter}
	              placeholder="e.g. pizza"
	            />
	          </div>
	          <div className="form-group" style={{ flex: 2 }}>
	            <label>Canonical meaning (optional)</label>
	            <input
	              value={canonical}
	              onChange={e => setCanonical(e.target.value)}
	              onKeyDown={submitZoomOnEnter}
	              placeholder="leave blank to use corpus-typical usage"
	            />
	          </div>
	          <div className="form-group form-group-sm">
	            <label> </label>
	            <button
	              className="btn btn-primary"
	              disabled={incong.loading || !keyword.trim()}
	              onClick={() => zoomIn(keyword.trim(), canonical)}
	            >
	              {incong.loading ? "…" : "Zoom"}
	            </button>
	          </div>
	        </div>

	        {incong.error && <StatusMessage type="err" message={incong.error} />}
	        {incong.data && incong.data.total_occurrences === 0 && (
	          <StatusMessage type="err" message={`No occurrences of "${incong.data.keyword}" found.`} />
	        )}
	        {incong.data && incong.data.occurrences.length > 0 && (
	          <div className="mt-2">
	            <div className="section-label">
	              {incong.data.total_occurrences} occurrences · reference: {incong.data.reference} ·
	              median incongruence {incong.data.median_incongruence}
	            </div>
	            <div className="flex-col gap-3">
	              {incong.data.occurrences.map((occ, i) => (
	                <div key={i} className="result-card">
	                  <div className="result-header">
	                    <span className="context-snippet-source">{occ.doc_id} · chunk {occ.chunk_index}</span>
	                    <span className="badge" style={{
	                      background: "rgba(255,107,107,0.15)", color: "var(--err)",
	                    }}>incongruence {occ.incongruence.toFixed(3)}</span>
	                  </div>
	                  <div className="context-snippet mt-2">{occ.snippet}</div>
	                  {occ.entities.length > 0 && (
	                    <div className="mt-2">
	                      <span className="section-label">Co-occurring: </span>
	                      {occ.entities.map((e, j) => (
	                        <span key={j} className="badge" style={{ marginRight: 4 }}>{e}</span>
	                      ))}
	                    </div>
	                  )}
	                </div>
	              ))}
	            </div>
	          </div>
	        )}
	      </div>
	    </div>
	  );
	}