Spaces:

ImanAndrea
/

citation_annotation_training

Sleeping

AerdnaNami

fixed launch

61544ea 3 months ago

21.8 kB

	import sys
	import json
	import html
	from pathlib import Path
	import gradio as gr

	# Test material lives next to this module: raw papers plus gold annotation JSON.
	TEST_PAPERS_DIR = Path(__file__).parent / "test_papers"
	PAPERS_DIR = TEST_PAPERS_DIR.joinpath("papers")
	GOLD_DIR = TEST_PAPERS_DIR.joinpath("annotated")

	# Highlight colour for each annotation label; insertion order is the display order.
	CATEGORY_COLORS = {
	    "Unsupported claim": "#ffb3b3",
	    "Format": "#ffe8a3",
	    "Coherence": "#b7f0d2",
	    "Lacks synthesis": "#bcd8ff",
	}
	# Labels in display order.
	CATEGORIES = list(CATEGORY_COLORS)
	# Label -> lowercase, dash-separated slug used in CSS classes and element ids.
	CATEGORY_SLUGS = {label: "-".join(label.lower().split(" ")) for label in CATEGORY_COLORS}

	# Static colour legend shown in the side column. Each row is a small colour
	# swatch followed by the category name; every swatch carries the same
	# right margin so the labels line up.
	LEGEND_HTML = """
	<div style="margin:8px 0 4px 0;font-size:0.9em">
	<div style="font-weight:600;margin-bottom:4px">Legend</div>
	<div style="display:flex;flex-direction:column;gap:4px">
	<div><span style="display:inline-block;width:14px;height:14px;background:#ffb3b3;border-radius:3px;margin-right:6px;vertical-align:middle"></span>Unsupported claim</div>
	<div><span style="display:inline-block;width:14px;height:14px;background:#ffe8a3;border-radius:3px;margin-right:6px;vertical-align:middle"></span>Format</div>
	<div><span style="display:inline-block;width:14px;height:14px;background:#b7f0d2;border-radius:3px;margin-right:6px;vertical-align:middle"></span>Coherence</div>
	<div><span style="display:inline-block;width:14px;height:14px;background:#bcd8ff;border-radius:3px;margin-right:6px;vertical-align:middle"></span>Lacks synthesis</div>
	</div>
	</div>
	"""

	# Browser-side callback run on every selection-timer tick. It receives the
	# current start/end textbox values and returns [start, end, status]:
	#   * no usable selection  -> the incoming start/end unchanged, empty status;
	#   * selection inside the paper viewer -> character offsets measured from
	#     the start of the viewer's text content;
	#   * selection elsewhere -> falls back to the first occurrence of the
	#     selected text inside the viewer, if any.
	# Logical ORs must be plain "||": a backslash-escaped form is an invalid
	# Python escape and leaves backslashes in the script, breaking it in the browser.
	CATEGORY_JS = """
	(start, end) => {
	    const findById = (id) => {
	        const direct = document.getElementById(id);
	        if (direct) return direct;
	        const app = document.querySelector('gradio-app');
	        if (app && app.shadowRoot) {
	            const inShadow = app.shadowRoot.getElementById(id);
	            if (inShadow) return inShadow;
	        }
	        return null;
	    };
	    const view = findById('paper_view');
	    const root = findById('paper-content') || view;
	    if (!root) return [start, end, "Text viewer not found."];
	    const sel = window.getSelection();
	    if (!sel || sel.rangeCount === 0) return [start, end, ""];
	    const range = sel.getRangeAt(0);
	    let node = range.commonAncestorContainer;
	    if (node && node.nodeType === 3) node = node.parentNode;
	    const sameRoot = (a, b) => {
	        if (!a || !b || !a.getRootNode || !b.getRootNode) return false;
	        return a.getRootNode() === b.getRootNode();
	    };
	    function inViewer(n){
	        while (n) {
	            if (n.id === 'paper-content' || n.id === 'paper_view') return true;
	            if (n.host) { n = n.host; continue; }
	            n = n.parentNode;
	        }
	        return false;
	    }
	    const selected = range.toString();
	    if (!selected) return [start, end, ""];
	    if (!(inViewer(node) && (root.contains(node) || sameRoot(root, node)))) {
	        const text = root.textContent || "";
	        const idx = text.indexOf(selected);
	        if (idx === -1) return [start, end, ""];
	        const s = idx;
	        const e = idx + selected.length;
	        return [String(s), String(e), `Captured selection (${s}-${e})`];
	    }
	    const preRange = document.createRange();
	    preRange.selectNodeContents(root);
	    preRange.setEnd(range.startContainer, range.startOffset);
	    const s = preRange.toString().length;
	    const e = s + selected.length;
	    return [String(s), String(e), `Captured selection (${s}-${e})`];
	}
	"""


	def _test_files():
	    """Return the names ``paper_6.txt`` .. ``paper_10.txt`` that exist in ``PAPERS_DIR``."""
	    candidates = (f"paper_{number}.txt" for number in range(6, 11))
	    return [name for name in candidates if (PAPERS_DIR / name).exists()]


	# Evaluated once at import time; the navigation helpers read this list.
	TEST_FILES = _test_files()


	def _read_text(filename):
	    """Load one paper from ``PAPERS_DIR``.

	    Returns a ``(text, error_html)`` pair. On success ``error_html`` is
	    ``None``; on failure ``text`` is ``None`` and ``error_html`` is an
	    HTML-escaped ``<em>`` message. Undecodable bytes are replaced rather
	    than raising.
	    """
	    target = PAPERS_DIR / filename
	    if target.exists():
	        try:
	            contents = target.read_text(encoding="utf-8", errors="replace")
	        except Exception as exc:
	            return None, f"<em>Error reading file: {html.escape(str(exc))}</em>"
	        return contents, None
	    return None, f"<em>File not found: {html.escape(filename)}</em>"


	def _render_with_highlights(text, annotations):
	if not annotations:
	escaped = html.escape(text)
	return (
	"<style>"
	".hl{padding:0 2px;border-radius:3px;}"
	"#paper-content{white-space:pre-wrap;font-family:inherit;padding:12px;"
	"border:1px solid #ddd;border-radius:8px;min-height:420px;"
	"max-height:420px;overflow:auto;box-sizing:border-box;"
	"user-select:text;}"
	"#paper-content:focus{outline:none;}"
	"</style>"
	f"<div id='paper-content' contenteditable='false' tabindex='0'>{escaped}</div>"
	)
	slug_for = {k: k.lower().replace(" ", "-") for k in CATEGORY_COLORS.keys()}
	sorted_anns = sorted(annotations, key=lambda r: (int(r.get("start", -1)), int(r.get("end", -1))))
	pieces = []
	cursor = 0
	text_len = len(text)
	for ann in sorted_anns:
	try:
	start = int(ann.get("start"))
	end = int(ann.get("end"))
	except Exception:
	continue
	if start < cursor or start < 0 or end < 0 or end > text_len or end <= start:
	continue
	pieces.append(html.escape(text[cursor:start]))
	label = ann.get("label", "")
	cls = slug_for.get(label, "unknown")
	color = CATEGORY_COLORS.get(label, "#ddd")
	span = html.escape(text[start:end])
	pieces.append(f"<span class='hl {cls}' style='background:{color}'>{span}</span>")
	cursor = end
	pieces.append(html.escape(text[cursor:]))
	styles = (
	"<style>"
	".hl{padding:0 2px;border-radius:3px;}"
	"#paper-content{white-space:pre-wrap;font-family:inherit;padding:12px;"
	"border:1px solid #ddd;border-radius:8px;min-height:420px;"
	"max-height:420px;overflow:auto;box-sizing:border-box;"
	"user-select:text;}"
	"#paper-content:focus{outline:none;}"
	"</style>"
	)
	return f"{styles}<div id='paper-content' contenteditable='false' tabindex='0'>{''.join(pieces)}</div>"


	def _get_annotations(state, filename):
	if not state:
	return []
	return list(state.get(filename, []))


	def read_paper(filename, ann_state):
	    """Build the viewer HTML and the annotations JSON for ``filename``.

	    Returns ``(html, json_text)``. When no file is selected or the file
	    cannot be read, the HTML is a message and the JSON is ``"[]"``.
	    """
	    empty_json = "[]"
	    if not filename:
	        return "<em>No file selected.</em>", empty_json
	    text, problem = _read_text(filename)
	    if problem:
	        return problem, empty_json
	    saved = _get_annotations(ann_state, filename)
	    rendered = _render_with_highlights(text, saved)
	    as_json = json.dumps(saved, ensure_ascii=False, indent=2)
	    return rendered, as_json


	def save_annotation(filename, start, end, category, ann_state):
	    """Record a labelled span for ``filename`` and re-render the paper.

	    ``start`` and ``end`` are the captured character offsets (anything
	    ``int()`` accepts); ``category`` is the label to attach.

	    Returns ``(status, paper_html, annotations_json, state)``. When the
	    request is rejected (missing input, invalid or empty span, unreadable
	    file, span past the end of the file, or an identical annotation
	    already stored) the two middle values are no-op ``gr.update()``
	    objects and ``ann_state`` is returned unchanged. Otherwise a new state
	    dict is returned; ``ann_state`` itself is not mutated.
	    """

	    def _rejected(message):
	        # Leave the viewer, the JSON box and the stored state untouched.
	        return message, gr.update(), gr.update(), ann_state

	    if not filename:
	        return _rejected("No file selected.")
	    if not category:
	        return _rejected("No category selected.")
	    if start in (None, "") or end in (None, ""):
	        return _rejected("No selection captured yet.")
	    try:
	        start_i = int(start)
	        end_i = int(end)
	    except (TypeError, ValueError):
	        return _rejected(f"Invalid start/end positions. start={start!r} end={end!r}")
	    # A zero-length span can never be rendered or overlap anything, so it
	    # is rejected together with negative and reversed ranges.
	    if start_i < 0 or end_i <= start_i:
	        return _rejected(f"Invalid span range. start={start_i} end={end_i}")
	    text, err = _read_text(filename)
	    if err:
	        return _rejected(err)
	    if end_i > len(text):
	        return _rejected("Span exceeds file length.")

	    record = {
	        "file": filename,
	        "start": start_i,
	        "end": end_i,
	        "label": category,
	        "text": text[start_i:end_i],
	    }
	    records = _get_annotations(ann_state, filename)
	    # Compare against every stored record, not only the most recent one.
	    identity_keys = ("file", "start", "end", "label")
	    already_saved = any(
	        isinstance(existing, dict)
	        and all(existing.get(key) == record[key] for key in identity_keys)
	        for existing in records
	    )
	    if already_saved:
	        return _rejected(f"Annotation already saved for {filename}.")

	    records.append(record)
	    new_state = dict(ann_state or {})
	    new_state[filename] = records
	    content = _render_with_highlights(text, records)
	    annotations_json = json.dumps(records, ensure_ascii=False, indent=2)
	    return f"Saved annotation for {filename}.", content, annotations_json, new_state


	def remove_annotation(filename, start, end, ann_state):
	    """Drop every stored annotation that overlaps the selected range.

	    Returns ``(status, paper_html, annotations_json, state)``. Validation
	    failures return no-op ``gr.update()`` values and the unchanged
	    ``ann_state``. Records whose bounds cannot be read as integers are
	    always kept.
	    """
	    if not filename:
	        return "No file selected.", gr.update(), gr.update(), ann_state
	    if start in (None, "") or end in (None, ""):
	        return "No selection captured yet.", gr.update(), gr.update(), ann_state
	    try:
	        start_i, end_i = int(start), int(end)
	    except Exception:
	        return f"Invalid start/end positions. start={start!r} end={end!r}", gr.update(), gr.update(), ann_state
	    if start_i < 0 or end_i < 0 or end_i < start_i:
	        return f"Invalid span range. start={start_i} end={end_i}", gr.update(), gr.update(), ann_state
	    text, err = _read_text(filename)
	    if err:
	        return err, gr.update(), gr.update(), ann_state

	    def _survives(rec):
	        # Keep a record when it is unparsable or lies entirely outside the selection.
	        try:
	            rec_start = int(rec.get("start"))
	            rec_end = int(rec.get("end"))
	        except Exception:
	            return True
	        return rec_end <= start_i or rec_start >= end_i

	    existing = _get_annotations(ann_state, filename)
	    kept = [rec for rec in existing if _survives(rec)]
	    removed = len(existing) - len(kept)

	    new_state = dict(ann_state or {})
	    new_state[filename] = kept
	    content = _render_with_highlights(text, kept)
	    annotations_json = json.dumps(kept, ensure_ascii=False, indent=2)
	    status = f"Removed {removed} highlight(s)." if removed else "No overlapping highlights to remove."
	    return status, content, annotations_json, new_state


	def clear_annotations(filename, ann_state):
	    """Remove all annotations stored for ``filename``.

	    Returns ``(status, paper_html, annotations_json, state)``; on a missing
	    filename or read error the middle values are no-op ``gr.update()``
	    objects and ``ann_state`` is returned as-is.
	    """
	    if not filename:
	        return "No file selected.", gr.update(), gr.update(), ann_state
	    text, problem = _read_text(filename)
	    if problem:
	        return problem, gr.update(), gr.update(), ann_state
	    # Copy the state so the caller's dict is never mutated.
	    cleared_state = {**(ann_state or {}), filename: []}
	    plain_view = _render_with_highlights(text, [])
	    return "Cleared annotations for current paper.", plain_view, "[]", cleared_state


	def _gold_path_for(filename):
	    """Map a paper filename to its gold JSON path inside ``GOLD_DIR``."""
	    paper_stem = Path(filename).stem
	    return GOLD_DIR.joinpath(f"first_ten_agreed_{paper_stem}.json")


	def _load_gold(filename):
	    """Load the gold annotations for ``filename``.

	    Returns a list of annotation dicts. The result is empty when the gold
	    file is missing, unreadable, not valid JSON, or does not hold a
	    top-level list. List entries that are not dicts are dropped so callers
	    can rely on ``.get``.
	    """
	    path = _gold_path_for(filename)
	    try:
	        with open(path, "r", encoding="utf-8") as f:
	            data = json.load(f)
	    except (OSError, ValueError):
	        # OSError covers a missing/unreadable file; ValueError covers both
	        # malformed JSON and undecodable bytes.
	        return []
	    if not isinstance(data, list):
	        return []
	    return [entry for entry in data if isinstance(entry, dict)]


	def _overlap(a_start, a_end, b_start, b_end):
	return not (a_end <= b_start or a_start >= b_end)


	def _score_annotations(filename, ann_state):
	if not filename:
	return "No file selected."
	user_anns = _get_annotations(ann_state, filename)
	gold_anns = _load_gold(filename)
	if not gold_anns:
	return "Gold annotations not found for this paper."
	matched_user = 0
	matched_gold = 0
	for u in user_anns:
	try:
	us = int(u.get("start"))
	ue = int(u.get("end"))
	except Exception:
	continue
	ul = u.get("label")
	ok = False
	for g in gold_anns:
	try:
	gs = int(g.get("start"))
	ge = int(g.get("end"))
	except Exception:
	continue
	gl = g.get("label")
	if ul == gl and _overlap(us, ue, gs, ge):
	ok = True
	break
	if ok:
	matched_user += 1
	for g in gold_anns:
	try:
	gs = int(g.get("start"))
	ge = int(g.get("end"))
	except Exception:
	continue
	gl = g.get("label")
	ok = False
	for u in user_anns:
	try:
	us = int(u.get("start"))
	ue = int(u.get("end"))
	except Exception:
	continue
	ul = u.get("label")
	if ul == gl and _overlap(us, ue, gs, ge):
	ok = True
	break
	if ok:
	matched_gold += 1
	total_user = len(user_anns)
	total_gold = len(gold_anns)
	precision = matched_user / total_user if total_user else 0.0
	recall = matched_gold / total_gold if total_gold else 0.0
	if precision + recall:
	f1 = 2 * precision * recall / (precision + recall)
	else:
	f1 = 0.0
	return (
	f"Score for {filename}: matched {matched_user}/{total_user} user spans and "
	f"{matched_gold}/{total_gold} gold spans. "
	f"Precision={precision:.2f} Recall={recall:.2f} F1={f1:.2f}"
	)


	def _render_gold(filename):
	if not filename:
	return "<em>No file selected.</em>"
	text, err = _read_text(filename)
	if err:
	return err
	gold_anns = _load_gold(filename)
	if not gold_anns:
	return "<em>Gold annotations not found for this paper.</em>"
	return _render_with_highlights(text, gold_anns)


	def _submit_check(filename, ann_state, attempts_state):
	if not filename:
	return "No file selected.", "<em>No file selected.</em>", attempts_state
	attempts = dict(attempts_state or {})
	tries = int(attempts.get(filename, 0))
	user_anns = _get_annotations(ann_state, filename)
	gold_anns = _load_gold(filename)
	if not gold_anns:
	return "Gold annotations not found for this paper.", "<em>Gold annotations not found.</em>", attempts
	matched_gold = 0
	for g in gold_anns:
	try:
	gs = int(g.get("start"))
	ge = int(g.get("end"))
	except Exception:
	continue
	gl = g.get("label")
	ok = False
	for u in user_anns:
	try:
	us = int(u.get("start"))
	ue = int(u.get("end"))
	except Exception:
	continue
	ul = u.get("label")
	if ul == gl and _overlap(us, ue, gs, ge):
	ok = True
	break
	if ok:
	matched_gold += 1
	total_gold = len(gold_anns)
	recall = matched_gold / total_gold if total_gold else 0.0
	tries += 1
	attempts[filename] = tries
	if recall >= 0.6:
	msg = (
	f"Passed: matched {matched_gold}/{total_gold} gold spans "
	f"(Recall={recall:.2f})."
	)
	return msg, _render_gold(filename), attempts
	if tries < 3:
	remaining = 3 - tries
	msg = (
	f"Try again: matched {matched_gold}/{total_gold} gold spans "
	f"(Recall={recall:.2f}). {remaining} attempt(s) left."
	)
	return msg, "<em>Gold highlights will appear after 3 attempts or a pass.</em>", attempts
	msg = (
	f"Attempts used. Matched {matched_gold}/{total_gold} gold spans "
	f"(Recall={recall:.2f}). Showing gold highlights."
	)
	return msg, _render_gold(filename), attempts


	def _reset_attempts_for(filename, attempts_state):
	if not filename:
	return attempts_state
	attempts = dict(attempts_state or {})
	attempts[filename] = 0
	return attempts


	def _attempts_label(filename, attempts_state):
	if not filename:
	return "<div style='text-align:right;font-size:1.35em;'>Attempts: 0/3</div>"
	attempts = dict(attempts_state or {})
	tries = int(attempts.get(filename, 0))
	return f"<div style='text-align:right;font-size:1.35em;'>Attempts: {tries}/3</div>"


	def _cycle_paper(current, direction):
	    """Return a ``gr.update`` selecting the paper ``direction`` steps from ``current``.

	    Navigation wraps around ``TEST_FILES``. An unknown ``current`` jumps to
	    the first paper; with no papers at all a no-op update is returned.
	    """
	    if not TEST_FILES:
	        return gr.update()
	    try:
	        position = TEST_FILES.index(current)
	    except ValueError:
	        return gr.update(value=TEST_FILES[0])
	    target = (position + direction) % len(TEST_FILES)
	    return gr.update(value=TEST_FILES[target])


	def _progress_label(filename):
	    """Return the "Test i of n" HTML for ``filename`` (1-based), or "Test 0 of 0" if unknown."""
	    total = len(TEST_FILES)
	    if total == 0 or filename not in TEST_FILES:
	        return "<div style='font-size:1.35em;'>Test 0 of 0</div>"
	    ordinal = TEST_FILES.index(filename) + 1
	    return f"<div style='font-size:1.35em;'>Test {ordinal} of {total}</div>"


	# UI definition. Layout: a wide column with the paper viewer, navigation and
	# gold view, and a narrow column with the category buttons and controls.
	# NOTE(review): the nesting below is reconstructed from a flattened listing;
	# confirm the column contents against the deployed layout.
	with gr.Blocks(
	    title="Annotation Check",
	    css="""
	#cat_btn_unsupported-claim button,
	#cat_btn_unsupported-claim .gr-button { background:#ffb3b3 !important; border-color:#ffb3b3 !important; color:#222 !important; }
	#cat_btn_format button,
	#cat_btn_format .gr-button { background:#ffe8a3 !important; border-color:#ffe8a3 !important; color:#222 !important; }
	#cat_btn_coherence button,
	#cat_btn_coherence .gr-button { background:#b7f0d2 !important; border-color:#b7f0d2 !important; color:#222 !important; }
	#cat_btn_lacks-synthesis button,
	#cat_btn_lacks-synthesis .gr-button { background:#bcd8ff !important; border-color:#bcd8ff !important; color:#222 !important; }
	#top_intro { font-size:1.50em; line-height:1.3; }
	#progress_label { font-size:1.25em; }
	#attempts_label { font-size:1.25em; }
	""",
	) as demo:
	    gr.HTML(
	        "<div id='top_intro'>"
	        "This is a test-only annotation app.<br>"
	        "Highlight spans and label them exactly like the main task. "
	        "Gold spans appear after you reach at least 60% recall (or after 3 tries)."
	        "</div>"
	    )
	    # Per-session state: {filename: [annotation records]} and {filename: attempts}.
	    ann_state = gr.State({})
	    attempts_state = gr.State({})
	    first_file = TEST_FILES[0] if TEST_FILES else None
	    # Placeholder shown in the gold panel until gold highlights are revealed.
	    gold_placeholder = "<em>Gold highlighted spans will appear here after you click 'Submit & check' and pass or ran out of attempts.</em>"

	    with gr.Row():
	        with gr.Column(scale=3):
	            current_file = gr.State(first_file)
	            with gr.Row():
	                progress = gr.Markdown(
	                    _progress_label(first_file),
	                    elem_id="progress_label",
	                )
	                attempts_label = gr.Markdown(
	                    _attempts_label(first_file, {}),
	                    elem_id="attempts_label",
	                )
	            if TEST_FILES:
	                initial_text, initial_err = _read_text(TEST_FILES[0])
	                initial_html = (
	                    _render_with_highlights(initial_text, []) if not initial_err else initial_err
	                )
	            else:
	                initial_html = "<em>No file selected.</em>"
	            content = gr.HTML(initial_html, elem_id="paper_view")
	            with gr.Row():
	                prev_btn = gr.Button("Previous test")
	                next_btn = gr.Button("Next test")
	            gold_content = gr.HTML(gold_placeholder)
	        with gr.Column(scale=1):
	            gr.Markdown("Category")
	            category_buttons = {}
	            for cat in CATEGORIES:
	                slug = CATEGORY_SLUGS.get(cat, cat.lower().replace(" ", "-"))
	                category_buttons[cat] = gr.Button(cat, elem_id=f"cat_btn_{slug}")
	            remove_btn = gr.Button("Remove highlight", variant="secondary")
	            gr.HTML(LEGEND_HTML)
	            clear_btn = gr.Button("Clear annotations", variant="stop")
	            submit_btn = gr.Button("Submit & check")
	            status = gr.Textbox(label="Status", interactive=False, elem_id="status_box")
	            annotations_view = gr.Textbox(
	                label="Annotations for current file (JSON)", lines=10, interactive=False, elem_id="annotations_view"
	            )

	    # Hidden fields filled by the browser-side selection script on each timer tick.
	    start_pos = gr.Textbox(label="Start (char)", elem_id="start_pos", visible="hidden")
	    end_pos = gr.Textbox(label="End (char)", elem_id="end_pos", visible="hidden")
	    selection_timer = gr.Timer(0.3)

	    # Navigation only changes current_file; everything else reacts to that change.
	    prev_btn.click(lambda f: _cycle_paper(f, -1), inputs=current_file, outputs=current_file)
	    next_btn.click(lambda f: _cycle_paper(f, 1), inputs=current_file, outputs=current_file)
	    current_file.change(read_paper, inputs=[current_file, ann_state], outputs=[content, annotations_view])
	    current_file.change(lambda: gr.update(value=""), None, start_pos)
	    current_file.change(lambda: gr.update(value=""), None, end_pos)
	    current_file.change(_progress_label, inputs=current_file, outputs=progress)
	    current_file.change(lambda: gold_placeholder, None, gold_content)
	    # The label must be refreshed only AFTER attempts_state has been written;
	    # separate listeners on the same event run independently and could read
	    # the stale state, so the refresh is chained with .then().
	    current_file.change(
	        _reset_attempts_for,
	        inputs=[current_file, attempts_state],
	        outputs=[attempts_state],
	    ).then(
	        _attempts_label,
	        inputs=[current_file, attempts_state],
	        outputs=[attempts_label],
	    )

	    selection_timer.tick(
	        fn=None,
	        inputs=[start_pos, end_pos],
	        outputs=[start_pos, end_pos, status],
	        js=CATEGORY_JS,
	    )
	    for cat, btn in category_buttons.items():
	        # c=cat binds the label now; a plain closure would see only the last one.
	        btn.click(
	            lambda filename, start, end, state, c=cat: save_annotation(filename, start, end, c, state),
	            inputs=[current_file, start_pos, end_pos, ann_state],
	            outputs=[status, content, annotations_view, ann_state],
	        )
	    remove_btn.click(
	        remove_annotation,
	        inputs=[current_file, start_pos, end_pos, ann_state],
	        outputs=[status, content, annotations_view, ann_state],
	    )
	    clear_btn.click(
	        clear_annotations,
	        inputs=[current_file, ann_state],
	        outputs=[status, content, annotations_view, ann_state],
	    )
	    # Same ordering requirement as above: grade first, then refresh the label.
	    submit_btn.click(
	        _submit_check,
	        inputs=[current_file, ann_state, attempts_state],
	        outputs=[status, gold_content, attempts_state],
	    ).then(
	        _attempts_label,
	        inputs=[current_file, attempts_state],
	        outputs=[attempts_label],
	    )

	    # Seed the JSON box for the first paper.
	    if TEST_FILES:
	        _, annotations_val = read_paper(TEST_FILES[0], {})
	        annotations_view.value = annotations_val


	if __name__ == "__main__":
	    # An optional first command-line argument selects the server port.
	    # Without it (or when it is not an integer) no port is forced and
	    # Gradio picks its own default, exactly as a bare launch() does.
	    port = None
	    if len(sys.argv) > 1:
	        try:
	            port = int(sys.argv[1])
	        except ValueError:
	            print(
	                f"Ignoring invalid port {sys.argv[1]!r}; using Gradio's default.",
	                file=sys.stderr,
	            )
	    demo.launch(server_port=port)