BibGuard / app.py
thinkwee
v2.0
fcffa22
#!/usr/bin/env python3
"""
BibGuard Gradio web app — minimalist iframe layout.
The right pane embeds the self-contained ``report.html`` produced by
``src/report/html_report.py`` via ``<iframe srcdoc=...>``. This makes the
generated report the single source of truth (per-section filters, full-text
search, dark mode, inline span highlighting all live inside it) and avoids
re-rendering the same content inside Gradio with stale styles.
"""
from __future__ import annotations
import base64
import logging
import os
import tempfile
import time
from pathlib import Path
import gradio as gr
from src.parsers import BibParser, TexParser
from src.fetchers import (
ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher,
OpenAlexFetcher, DBLPFetcher,
)
from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
from src.report.generator import ReportGenerator, EntryReport
from src.config.yaml_config import (
BibGuardConfig, BibliographyConfig, SubmissionConfig, OutputConfig,
)
from src.config.workflow import get_default_workflow
from src.checkers import CHECKER_REGISTRY
from src.checkers.retraction_checker import RetractionChecker
from src.checkers.url_checker import URLChecker
from src.utils import http as http_layer
from src.utils.logging_setup import setup as setup_logging, capture_run
from src.utils.validation import validate_bib, validate_tex, format_report
from app_helper import fetch_and_compare_with_workflow
# Log level comes from BIBGUARD_LOG and falls back to WARNING when unset.
_log_level = os.environ.get("BIBGUARD_LOG", "WARNING")
LOG_PATH = setup_logging(_log_level)
logger = logging.getLogger("bibguard.app")
logger.info("BibGuard app starting (log file: %s)", LOG_PATH)

# Configure HTTP layer once at import time. The contact e-mail is read from
# BIBGUARD_CONTACT_EMAIL and defaults to an empty string.
_http_settings = {
    "contact_email": os.environ.get("BIBGUARD_CONTACT_EMAIL", ""),
    "cache_enabled": True,
    "cache_ttl_hours": 24,
    "retry_total": 5,
    "retry_backoff_factor": 1.5,
}
http_layer.configure(**_http_settings)
# --------------------------------------------------------------------- presets
# Names of the LaTeX submission checkers; every preset enables all of them.
_SUBMISSION_CHECK_NAMES = (
    "caption", "reference", "formatting", "equation",
    "ai_artifacts", "sentence", "consistency", "acronym",
    "number", "citation_quality", "anonymization",
)


def _make_preset(*, check_metadata: bool, url_liveness: bool, retraction: bool) -> dict:
    """Build one preset; only the three network-related flags vary between presets."""
    return {
        "check_metadata": check_metadata,
        "check_duplicates": True,
        "check_usage": True,
        "check_preprint_ratio": True,
        "url_liveness": url_liveness,
        "retraction": retraction,
        # A fresh dict per preset so the presets never share mutable state.
        "submission": dict.fromkeys(_SUBMISSION_CHECK_NAMES, True),
    }


# Preset name -> checkbox values applied to the UI when the preset is selected.
PRESETS = {
    "Quick": _make_preset(check_metadata=False, url_liveness=False, retraction=False),
    "Standard": _make_preset(check_metadata=False, url_liveness=False, retraction=True),
    "Strict": _make_preset(check_metadata=True, url_liveness=True, retraction=True),
}
# ----------------------------------------------------------------------- CSS
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
* { font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; }
/* Reserve space for the vertical scrollbar so expanding the Advanced
accordion (or anything else that adds content) doesn't shift the
layout horizontally. `overflow-y: scroll` on html is the universal
fallback for browsers without scrollbar-gutter.
`overflow-x: hidden` on body kills any page-width jitter coming from
inner elements that briefly overflow during streaming updates. */
html { scrollbar-gutter: stable; overflow-y: scroll; overflow-x: hidden; }
body { overflow-x: hidden; }
.gradio-container {
max-width: 1400px !important;
margin: 0 auto !important;
padding: 0 20px !important;
box-sizing: border-box !important;
width: 100% !important;
overflow-x: hidden !important;
}
/* Header strip */
.bg-header {
padding: 14px 4px 12px !important;
border-bottom: 1px solid #e5e7eb;
margin-bottom: 14px;
}
/* ==================================================================
Top toolbar — single horizontal row with all primary controls.
Every primary control has the SAME explicit 56px height. The little
filename/info chip beneath sits in a fixed 18px slot. The columns
wrap that into a 78px tall toolbar that's identical across cells.
================================================================== */
.bg-toolbar {
margin-bottom: 14px;
gap: 10px !important;
align-items: flex-start !important;
}
.bg-toolbar .gr-form { gap: 0 !important; }
.bg-toolbar .gr-block { border: none !important; box-shadow: none !important; padding: 0 !important; }
/* Common: any direct primary control fills column width */
.bg-toolbar > * { width: 100% !important; }
/* ---- Upload buttons ---- */
.bg-upload-btn,
.bg-upload-btn > .wrap,
.bg-upload-btn > div {
height: 56px !important;
min-height: 56px !important;
max-height: 56px !important;
width: 100% !important;
}
.bg-upload-btn button {
height: 56px !important;
min-height: 56px !important;
max-height: 56px !important;
width: 100% !important;
padding: 0 14px !important;
font-size: 13px !important;
font-weight: 500 !important;
border-radius: 8px !important;
border: 1px dashed #cbd5e1 !important;
background: #f8fafc !important;
color: #334155 !important;
transition: border 0.15s, background 0.15s !important;
line-height: 1 !important;
}
.bg-upload-btn button:hover {
border-color: #2563eb !important;
background: #eff6ff !important;
color: #1e3a8a !important;
}
/* ---- Run / Stop button (same column, visibility-swapped) ---- */
.bg-run-btn,
.bg-run-btn > .wrap,
.bg-run-btn > div {
height: 56px !important;
min-height: 56px !important;
max-height: 56px !important;
width: 100% !important;
}
.bg-run-btn button {
height: 56px !important;
min-height: 56px !important;
max-height: 56px !important;
width: 100% !important;
font-weight: 600 !important;
border-radius: 8px !important;
font-size: 14px !important;
line-height: 1 !important;
padding: 0 16px !important;
}
.bg-stop-btn button {
background: #dc2626 !important;
color: white !important;
border: none !important;
}
.bg-stop-btn button:hover { background: #b91c1c !important; }
/* ---- Preset radio as horizontal pill chips ---- */
.bg-preset,
.bg-preset > div,
.bg-preset > .wrap {
height: 56px !important;
min-height: 56px !important;
max-height: 56px !important;
padding: 0 !important;
}
.bg-preset > label,
.bg-preset .label-wrap { display: none !important; }
.bg-preset .wrap,
.bg-preset > div > div,
.bg-preset fieldset {
display: flex !important;
flex-direction: row !important;
gap: 4px !important;
flex-wrap: nowrap !important;
width: 100% !important;
height: 56px !important;
align-items: stretch !important;
border: none !important;
padding: 0 !important;
margin: 0 !important;
}
.bg-preset label {
flex: 1 1 0 !important;
margin: 0 !important;
padding: 0 8px !important;
height: 56px !important;
min-height: 56px !important;
max-height: 56px !important;
border-radius: 8px !important;
font-size: 13px !important;
font-weight: 500 !important;
border: 1px solid #e5e7eb !important;
background: #ffffff !important;
cursor: pointer !important;
text-align: center !important;
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
line-height: 1 !important;
color: #475569 !important;
transition: background 0.15s, border 0.15s !important;
white-space: nowrap !important;
}
.bg-preset label:hover { background: #f8fafc !important; border-color: #cbd5e1 !important; }
.bg-preset input[type="radio"] { display: none !important; }
.bg-preset label.selected,
.bg-preset label:has(input:checked) {
background: #1e3a8a !important;
color: #ffffff !important;
border-color: #1e3a8a !important;
}
/* ---- Caption chip beneath each toolbar control ---- */
.bg-fname {
font-size: 11.5px;
color: #94a3b8;
padding: 4px 8px 0 8px;
line-height: 1.3;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
height: 18px;
box-sizing: content-box;
}
.bg-fname.ok { color: #166534; font-weight: 500; }
/* ==================================================================
Advanced settings — gr.Row with each Checkbox as its own card.
Trick: `display: contents` on Gradio's intermediate wrapper makes
it vanish from the layout tree, so the actual checkbox blocks
become direct flex children of .bg-row. Card style is applied to
each block, not the wrapper, so we get N cards per row instead of
one big box.
================================================================== */
.bg-row {
display: flex !important;
flex-direction: row !important;
gap: 10px !important;
align-items: stretch !important;
padding: 4px 0 !important;
}
/* Flatten Gradio's intermediate `.form` / `.gr-form` wrapper so its
children become direct flex items of .bg-row. */
.bg-row > .form,
.bg-row > .gr-form {
display: contents !important;
}
/* Some Gradio versions emit a plain `<div>` wrapper instead of `.form`.
We can't safely `display: contents` every direct div (the spacer is
one), but if the wrapper has only blocks inside, contents flatten it. */
.bg-row > div:not(.bg-row-spacer):not(.gr-block):not(.block) {
display: contents !important;
}
/* Each individual checkbox block = a card */
.bg-row .gr-block,
.bg-row .block {
flex: 1 1 0 !important;
min-width: 0 !important;
background: #f8fafc !important;
border: 1px solid #e5e7eb !important;
border-radius: 8px !important;
padding: 8px 12px !important;
box-shadow: none !important;
transition: background 0.15s, border 0.15s !important;
}
.bg-row .gr-block:hover,
.bg-row .block:hover {
background: #eff6ff !important;
border-color: #cbd5e1 !important;
}
.bg-row label,
.bg-row .gr-checkbox label {
font-size: 13px !important;
font-weight: 500 !important;
line-height: 1.3 !important;
color: #334155 !important;
margin: 0 !important;
padding: 0 !important;
}
.bg-row .gr-info, .bg-row [class*="info"] { display: none !important; }
/* Spacer — invisible flex item that just preserves alignment */
.bg-row .bg-row-spacer {
flex: 1 1 0 !important;
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
visibility: hidden !important;
}
/* ==================================================================
Status strip — thin one-liner above the report.
The Gradio HTML wrapper itself is pinned to its parent column's width
so no inner content can change the page geometry during streaming.
================================================================== */
#bg-status-wrap,
#bg-status-wrap > * {
width: 100% !important;
max-width: 100% !important;
min-width: 0 !important;
box-sizing: border-box !important;
overflow-x: hidden !important;
}
.bg-status {
padding: 10px 14px;
border-radius: 10px;
background: #f8fafc;
border: 1px solid #e2e8f0;
font-size: 12.5px;
line-height: 1.45;
color: #334155;
margin: 8px 0 12px 0;
max-width: 100%;
overflow: hidden; /* never let inline content widen the page */
box-sizing: border-box;
}
.bg-status-row {
display: flex;
align-items: center;
gap: 14px;
flex-wrap: nowrap; /* one row, ellipsize the middle */
min-width: 0;
width: 100%;
}
.bg-status .bg-status-stage {
font-weight: 600;
color: #1e3a8a;
display: inline-flex;
align-items: center;
gap: 8px;
flex-shrink: 0;
white-space: nowrap;
}
.bg-status .bg-status-detail {
color: #475569;
flex: 1 1 0;
min-width: 0;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.bg-status .bg-status-detail code {
background: #eef2ff;
padding: 1px 6px;
border-radius: 4px;
font-size: 11.5px;
color: #1e3a8a;
}
.bg-status .bg-status-meta {
color: #64748b;
font-size: 11.5px;
display: inline-flex;
flex-wrap: nowrap;
gap: 12px;
flex-shrink: 0;
white-space: nowrap;
}
.bg-status.done { background: #f0fdf4; border-color: #bbf7d0; }
.bg-status.done .bg-status-stage { color: #166534; }
.bg-status.error { background: #fef2f2; border-color: #fecaca; }
.bg-status.error .bg-status-stage { color: #b91c1c; }
.bg-status .spin {
display: inline-block;
width: 10px; height: 10px;
border: 2px solid #cbd5e1;
border-top-color: #2563eb;
border-radius: 50%;
animation: bg-spin 0.9s linear infinite;
}
@keyframes bg-spin { to { transform: rotate(360deg); } }
/* ==================================================================
Report area — full-width iframe.
================================================================== */
.bg-main { padding: 0 !important; }
.bg-report-iframe {
width: 100%;
height: 80vh;
min-height: 620px;
border: 1px solid #e5e7eb;
border-radius: 12px;
background: white;
box-shadow: 0 1px 2px rgba(0,0,0,0.04);
}
/* Empty / error placeholder (full-width, centered card) */
.bg-empty {
display: flex; align-items: center; justify-content: center;
flex-direction: column; gap: 14px;
min-height: 60vh;
color: #6b7280; text-align: center;
border: 2px dashed #e5e7eb; border-radius: 12px;
padding: 56px 24px;
background: #fafafa;
}
.bg-empty .bg-empty-icon { font-size: 56px; line-height: 1; }
.bg-empty .bg-empty-title { font-size: 17px; font-weight: 600; color: #374151; }
.bg-empty .bg-empty-hint { font-size: 14px; max-width: 580px; line-height: 1.6; }
.bg-empty .bg-empty-hint code { background: #f3f4f6; padding: 1px 6px; border-radius: 4px; font-size: 13px; }
/* Compact downloads section */
.bg-downloads { gap: 6px !important; }
.bg-downloads .gr-file { min-height: auto !important; }
.bg-downloads .bg-file-input > label > div {
height: 52px !important;
min-height: 52px !important;
max-height: 52px !important;
}
/* Footer */
.bg-footer {
text-align: center;
margin-top: 18px;
padding-top: 12px;
border-top: 1px solid #f1f5f9;
font-size: 11.5px;
color: #9ca3af;
}
.bg-footer code { background: #f3f4f6; padding: 1px 5px; border-radius: 3px; font-size: 11px; }
.bg-footer a { color: #6b7280; text-decoration: none; }
.bg-footer a:hover { text-decoration: underline; }
/* Trim accordion chrome a bit */
.gr-accordion { border-radius: 10px !important; border: 1px solid #e5e7eb !important; }
.gr-accordion > .label-wrap { padding: 8px 12px !important; font-size: 13px !important; }
@media (prefers-color-scheme: dark) {
.bg-empty { background: #161b22; border-color: #2a313c; color: #9ca3af; }
.bg-empty .bg-empty-title { color: #e6edf3; }
.bg-empty .bg-empty-hint code { background: #21262d; }
.bg-report-iframe { background: #0d1117; border-color: #2a313c; box-shadow: none; }
.bg-status { background: #0f172a; border-color: #1e293b; color: #cbd5e1; }
.bg-status .bg-status-stage { color: #93c5fd; }
.bg-status .bg-status-detail { color: #94a3b8; }
.bg-status .bg-status-detail code { background: #1e293b; color: #93c5fd; }
.bg-status .bg-status-meta { color: #64748b; }
.bg-status.done { background: #052e1a; border-color: #14532d; }
.bg-status.done .bg-status-stage { color: #86efac; }
.bg-status.error { background: #2a0e0e; border-color: #7f1d1d; }
.bg-preset label { background: #161b22 !important; border-color: #2a313c !important; color: #cbd5e1 !important; }
.bg-preset label:hover { background: #1e293b !important; }
.bg-preset .selected { background: #2563eb !important; border-color: #2563eb !important; }
.bg-footer { border-color: #1e293b; }
}
"""
# Placeholder card shown in the report area before any run has produced a
# report. The icon is the "page facing up" emoji (it was mis-encoded before).
EMPTY_PANEL_HTML = """
<div class="bg-empty">
<div class="bg-empty-icon">📄</div>
<div class="bg-empty-title">Your interactive report appears here</div>
<div class="bg-empty-hint">
Upload a <code>.bib</code> file and a <code>.tex</code> file in the toolbar above,
pick a preset, then press <strong>Run check</strong>. The report renders as a
self-contained HTML page with per-section filters, full-text search,
inline span highlighting, and dark-mode support.
</div>
</div>
"""

# Idle status strip; uses the same markup structure that `_status_html` emits.
EMPTY_STATUS_HTML = (
    '<div class="bg-status">'
    '<div class="bg-status-row">'
    '<span class="bg-status-stage">○ Idle</span>'
    '<span class="bg-status-detail">Upload <code>.bib</code> + <code>.tex</code> '
    'and press <strong>Run check</strong> to begin.</span>'
    '</div></div>'
)
def _placeholder(message: str, color: str = "#b91c1c") -> str:
"""Inline error/info card shown in place of the iframe."""
return (
f'<div class="bg-empty" style="color:{color};border-color:{color}33">'
f'<div class="bg-empty-icon">⚠️</div>'
f'<div class="bg-empty-title">{message}</div>'
f'</div>'
)
def _html_to_iframe(html: str) -> str:
"""
Embed an HTML document inside ``<iframe srcdoc>``.
We escape only ``&`` and ``"`` β€” these are the two characters that can
break the attribute value or get re-decoded as entities. ``<`` and ``>``
must stay raw, otherwise the inner document would be HTML-encoded.
"""
escaped = html.replace("&", "&amp;").replace('"', "&quot;")
return (
f'<iframe class="bg-report-iframe" srcdoc="{escaped}" '
f'sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" '
f'loading="lazy"></iframe>'
)
def _status_html(stage: str, detail: str = "", meta: list[str] | None = None,
state: str = "running") -> str:
"""Render the live-status strip shown above the report.
Layout is a single horizontal row: [stage] [detail] [meta chips].
Wraps cleanly on narrow screens.
"""
if state == "running":
stage_icon = '<span class="spin"></span>'
elif state == "done":
stage_icon = '<span>βœ“</span>'
elif state == "error":
stage_icon = '<span>⚠</span>'
else:
stage_icon = '<span>β—‹</span>'
detail_html = f'<span class="bg-status-detail">{detail}</span>' if detail else '<span class="bg-status-detail"></span>'
meta_html = ""
if meta:
meta_html = (
'<span class="bg-status-meta">'
+ " ".join(f"<span>{m}</span>" for m in meta)
+ "</span>"
)
return (
f'<div class="bg-status {state}">'
f'<div class="bg-status-row">'
f'<span class="bg-status-stage">{stage_icon}<span>{stage}</span></span>'
f'{detail_html}{meta_html}'
f'</div></div>'
)
# --------------------------------------------------------------- config glue
def create_config_from_ui(
    check_metadata, check_usage, check_duplicates, check_preprint_ratio,
    caption, reference, formatting, equation, ai_artifacts,
    sentence, consistency, acronym, number, citation_quality, anonymization,
) -> BibGuardConfig:
    """Translate the UI checkbox values into a ``BibGuardConfig``.

    The first four flags fill the bibliography section and the remaining
    eleven fill the submission section. Relevance checking is always
    switched off and the output section is set to quiet, non-minimal.
    """
    bibliography_options = {
        "check_metadata": check_metadata,
        "check_usage": check_usage,
        "check_duplicates": check_duplicates,
        "check_preprint_ratio": check_preprint_ratio,
        "check_relevance": False,  # LLM disabled in web mode
    }
    submission_options = {
        "caption": caption,
        "reference": reference,
        "formatting": formatting,
        "equation": equation,
        "ai_artifacts": ai_artifacts,
        "sentence": sentence,
        "consistency": consistency,
        "acronym": acronym,
        "number": number,
        "citation_quality": citation_quality,
        "anonymization": anonymization,
    }

    config = BibGuardConfig()
    config.bibliography = BibliographyConfig(**bibliography_options)
    config.submission = SubmissionConfig(**submission_options)
    config.output = OutputConfig(quiet=True, minimal_verified=False)
    return config
def apply_preset(name: str):
    """Return the checkbox values of preset ``name`` as a flat 17-tuple.

    Order: the four bibliography flags, the eleven submission flags, then
    ``url_liveness`` and ``retraction``. An unknown name falls back to the
    "Standard" preset.
    """
    preset = PRESETS.get(name, PRESETS["Standard"])
    submission = preset["submission"]

    bibliography_keys = (
        "check_metadata", "check_usage", "check_duplicates", "check_preprint_ratio",
    )
    submission_keys = (
        "caption", "reference", "formatting", "equation",
        "ai_artifacts", "sentence", "consistency", "acronym",
        "number", "citation_quality", "anonymization",
    )
    network_keys = ("url_liveness", "retraction")

    return (
        *(preset[key] for key in bibliography_keys),
        *(submission[key] for key in submission_keys),
        *(preset[key] for key in network_keys),
    )
_PRESET_CAPTIONS = {
"Quick": "local checks only Β· no network Β· instant",
"Standard": "local checks + retraction lookup (CrossRef)",
"Strict": "+ URL liveness + multi-source metadata (slow)",
}
def _preset_caption_html(name: str) -> str:
text = _PRESET_CAPTIONS.get(name, "")
return f'<div class="bg-fname" style="text-align:center">{text}</div>'
# ------------------------------------------------------------------ run_check
# Streaming generator. Each yield is a 7-tuple:
# (iframe_html, status_html, html_path, md_path, json_path,
# cleaned_bib_path, log_path)
# `capture_run` attaches a per-run DEBUG file handler so any exception or
# warning anywhere in the pipeline is recorded with full traceback at
# `<out_dir>/bibguard.log`, which is then downloadable. The status panel
# surfaces warning+error counts so problems aren't invisible.
def run_check(
    bib_file, tex_file,
    check_metadata, check_usage, check_duplicates, check_preprint_ratio,
    caption, reference, formatting, equation, ai_artifacts,
    sentence, consistency, acronym, number, citation_quality, anonymization,
    url_liveness=False, retraction=True,
):
    """Run the full check pipeline as a streaming generator with per-run logging.

    `bib_file` / `tex_file` are filesystem path strings (carried by gr.State),
    not gr.File objects. The status panel is the single source of progress
    feedback — no separate gr.Progress bar.

    Yields:
        7-tuples ``(iframe_html, status_html, html_path, md_path, json_path,
        cleaned_bib_path, log_path)``.

    Any exception escaping the inner pipeline is logged with its traceback
    and reported as a final "Failed" update instead of propagating. The
    exception text is HTML-escaped before being embedded in the markup.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    started = time.time()

    def _elapsed() -> str:
        """Elapsed whole seconds since the run started, as a status chip."""
        return f"⏱ {int(time.time() - started)}s"

    # Both inputs are required; bail out with an explanatory card otherwise.
    if not bib_file or not tex_file:
        yield (
            _placeholder("Please choose both a .bib and a .tex file in the toolbar."),
            _status_html("Waiting for files",
                         "Pick a .bib and a .tex file from the toolbar to start.",
                         state="error"),
            None, None, None, None, None,
        )
        return

    # Allocate the artifact dir up-front so the per-run log lives next to
    # the report files.
    out_dir = Path(tempfile.mkdtemp(prefix="bibguard_"))
    log_path_target = out_dir / "bibguard.log"

    # Reset per-source circuit breakers so a previous run's flaky source
    # doesn't carry over and skip valid lookups in this run.
    http_layer.reset_breakers()

    with capture_run(target_path=log_path_target) as (log_path, log_stats):
        logger.info("=== run_check start: bib=%s tex=%s ===", bib_file, tex_file)
        try:
            yield from _run_check_impl(
                bib_file, tex_file, out_dir, log_path, log_stats,
                check_metadata, check_usage, check_duplicates, check_preprint_ratio,
                caption, reference, formatting, equation, ai_artifacts,
                sentence, consistency, acronym, number, citation_quality, anonymization,
                url_liveness, retraction, started, _elapsed,
            )
        except Exception as e:
            logger.exception("run_check crashed (entry-level guard)")
            # Exception messages can contain arbitrary text (file content,
            # markup); escape before embedding them in HTML.
            error_text = escape(str(e))
            yield (
                _placeholder(f"Unhandled error: {error_text}"),
                _status_html("Failed",
                             f"{error_text} — see <code>bibguard.log</code> for the full traceback.",
                             state="error"),
                None, None, None, None, str(log_path),
            )
        finally:
            logger.info("=== run_check end: warnings=%d errors=%d ===",
                        log_stats.warnings, log_stats.errors)
def _run_check_impl(
    bib_file, tex_file, out_dir, log_path, log_stats,
    check_metadata, check_usage, check_duplicates, check_preprint_ratio,
    caption, reference, formatting, equation, ai_artifacts,
    sentence, consistency, acronym, number, citation_quality, anonymization,
    url_liveness, retraction, started, _elapsed,
):
    """Inner pipeline. Wrapped in `capture_run` by `run_check`.

    Generator. Every yield is a 7-tuple
    ``(iframe_html, status_html, html_path, md_path, json_path,
    cleaned_bib_path, log_path)`` ending with the log path so the user can
    download `bibguard.log` even from intermediate updates. Intermediate
    updates pass ``gr.update()`` for the report pane and ``None`` for the
    four artifact slots; only the final yield carries the report and files.

    Stages, in order: file validation, parsing, LaTeX submission checkers,
    duplicate / missing-citation analysis, the per-entry loop (usage and,
    when enabled, metadata verification), optional retraction and URL
    liveness lookups, and artifact generation. Each analysis step is
    wrapped in its own ``try`` so one failure is logged and the run
    continues.

    Text that originates from the uploaded files (file names, entry keys,
    titles, validation messages) is HTML-escaped before being embedded in
    status or report markup.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    log_path_str = str(log_path)
    bib_path = Path(bib_file)
    tex_path = Path(tex_file)
    logger.info("Inputs: bib=%s tex=%s out_dir=%s", bib_path, tex_path, out_dir)

    def _meta_with_logs(extra: list[str]) -> list[str]:
        """Return ``extra`` plus a warning/error counter chip when anything was logged."""
        out = list(extra)
        if log_stats.warnings or log_stats.errors:
            out.append(f"⚠ {log_stats.warnings}w / {log_stats.errors}e logged")
        return out

    def _progress(stage: str, detail: str, chips: list[str]):
        """Build an intermediate update: status only, report and artifacts untouched."""
        return (
            gr.update(),
            _status_html(stage, detail, meta=_meta_with_logs(chips)),
            None, None, None, None, log_path_str,
        )

    def _short_title(entry) -> str:
        """Entry title clipped to 70 characters, HTML-escaped after clipping."""
        title = entry.title
        clipped = title[:70] + ("…" if len(title) > 70 else "")
        return escape(clipped)

    yield _progress(
        "Validating files",
        f"Reading <code>{escape(bib_path.name)}</code> and <code>{escape(tex_path.name)}</code>",
        [_elapsed()],
    )

    # Pre-flight content validation
    bib_rep = validate_bib(bib_path)
    tex_rep = validate_tex(tex_path)
    msg = "\n".join(filter(None, [
        format_report(bib_rep, bib_path.name),
        format_report(tex_rep, tex_path.name),
    ]))
    if not bib_rep.ok or not tex_rep.ok:
        logger.error("File validation failed:\n%s", msg)
        # The report text quotes file names/content, so escape it for HTML.
        safe_msg = escape(msg)
        block = (
            '<div class="bg-empty" style="color:#b91c1c;border-color:#b91c1c33">'
            '<div class="bg-empty-icon">⚠️</div>'
            '<div class="bg-empty-title">File validation failed</div>'
            '<pre style="white-space:pre-wrap;font-size:13px;color:#7f1d1d;'
            f'background:#fef2f2;padding:12px;border-radius:6px;max-width:540px">{safe_msg}</pre>'
            '</div>'
        )
        yield (
            block,
            _status_html("File validation failed", safe_msg.replace("\n", "<br>"),
                         state="error"),
            None, None, None, None, log_path_str,
        )
        return
    if msg:
        logger.info("Validation warnings:\n%s", msg)

    config = create_config_from_ui(
        check_metadata, check_usage, check_duplicates, check_preprint_ratio,
        caption, reference, formatting, equation, ai_artifacts,
        sentence, consistency, acronym, number, citation_quality, anonymization,
    )

    yield _progress("Parsing", "Loading bibliography and LaTeX source", [_elapsed()])

    tex_content = tex_path.read_text(encoding='utf-8', errors='replace')
    bib_parser = BibParser()
    entries = bib_parser.parse_file(str(bib_path))
    tex_parser = TexParser()
    tex_parser.parse_file(str(tex_path))
    logger.info("Parsed %d bib entries from %s", len(entries), bib_path.name)

    bib_config = config.bibliography

    # Init components; each stays None unless its check is enabled.
    arxiv_fetcher = crossref_fetcher = ss_fetcher = oa_fetcher = dblp_fetcher = None
    comparator = usage_checker = duplicate_detector = None
    if bib_config.check_metadata:
        arxiv_fetcher = ArxivFetcher()
        ss_fetcher = SemanticScholarFetcher()
        oa_fetcher = OpenAlexFetcher()
        dblp_fetcher = DBLPFetcher()
        crossref_fetcher = CrossRefFetcher()
        comparator = MetadataComparator()
    if bib_config.check_usage:
        usage_checker = UsageChecker(tex_parser)
    if bib_config.check_duplicates:
        duplicate_detector = DuplicateDetector()

    report_gen = ReportGenerator(
        minimal_verified=False,
        check_preprint_ratio=bib_config.check_preprint_ratio,
        preprint_warning_threshold=bib_config.preprint_warning_threshold,
    )
    report_gen.set_metadata([str(bib_path)], [str(tex_path)])

    # Submission quality checks
    enabled_checkers = config.submission.get_enabled_checkers()
    yield _progress(
        "LaTeX quality checks",
        f"Running {len(enabled_checkers)} checkers on the LaTeX source",
        [f"📚 {len(entries)} bib entries", _elapsed()],
    )

    submission_results = []
    for name in enabled_checkers:
        if name not in CHECKER_REGISTRY:
            continue
        try:
            checker = CHECKER_REGISTRY[name]()
            results = checker.check(tex_content, {})
            for r in results:
                r.file_path = str(tex_path)
            submission_results.extend(results)
        except Exception:
            # Best effort: a crashing checker is logged and skipped.
            logger.exception("Checker %s crashed", name)
    report_gen.set_submission_results(submission_results, None)

    if bib_config.check_duplicates and duplicate_detector:
        try:
            report_gen.set_duplicate_groups(duplicate_detector.find_duplicates(entries))
        except Exception:
            logger.exception("Duplicate detection crashed")

    if bib_config.check_usage and usage_checker:
        try:
            report_gen.set_missing_citations(usage_checker.get_missing_entries(entries))
        except Exception:
            logger.exception("Missing-citation lookup crashed")

    # Per-entry workflow
    total = max(1, len(entries))
    workflow_config = get_default_workflow()
    verified_count = 0
    flagged_count = 0
    not_found_count = 0
    last_yield = time.time()

    def _identifier_chip(entry) -> str:
        """Tiny inline hint about which IDs we have for this entry."""
        bits = []
        if entry.doi:
            bits.append("DOI")
        if entry.has_arxiv:
            bits.append("arXiv")
        if entry.title:
            bits.append("title")
        return " + ".join(bits) if bits else "no identifiers"

    def _outcome_label(cmp) -> str:
        """Coloured one-word outcome for a comparison result ('' when none)."""
        if cmp is None:
            return ""
        if cmp.source == "unable":
            return "<span style='color:#b45309'>? no metadata</span>"
        if cmp.is_match:
            return f"<span style='color:#166534'>✓ verified by {cmp.source}</span>"
        return f"<span style='color:#b45309'>⚠ flagged ({cmp.source})</span>"

    for i, entry in enumerate(entries):
        # Keys come straight from the uploaded .bib file; escape for HTML.
        safe_key = escape(str(entry.key))

        # ── Pre-fetch status: announce identifier set BEFORE the network roundtrip
        # so the user sees what's being attempted, not just the entry name.
        if bib_config.check_metadata and comparator:
            now = time.time()
            # Throttle UI updates to one per 0.4s; always show the first.
            if now - last_yield > 0.4 or i == 0:
                ids = _identifier_chip(entry)
                detail = f"<code>{safe_key}</code> · querying via <strong>{ids}</strong>"
                if entry.title:
                    detail += f" — <span style='color:#64748b'>{_short_title(entry)}</span>"
                yield _progress(
                    f"Verifying entry {i + 1}/{total}",
                    detail,
                    [
                        f"📚 {total} total",
                        f"✓ {verified_count}",
                        f"⚠ {flagged_count}",
                        f"? {not_found_count}",
                        _elapsed(),
                    ],
                )
                last_yield = now

        usage_result = None
        comparison_result = None

        try:
            if usage_checker:
                usage_result = usage_checker.check_usage(entry)
        except Exception:
            logger.exception("Usage check crashed for entry=%s", entry.key)

        try:
            if bib_config.check_metadata and comparator:
                comparison_result = fetch_and_compare_with_workflow(
                    entry, workflow_config, arxiv_fetcher, crossref_fetcher,
                    ss_fetcher, oa_fetcher, dblp_fetcher, comparator,
                )
                if comparison_result is None or comparison_result.source == "unable":
                    not_found_count += 1
                elif comparison_result.is_match:
                    verified_count += 1
                else:
                    flagged_count += 1
        except Exception:
            logger.exception("Metadata fetch crashed for entry=%s", entry.key)

        report_gen.add_entry_report(EntryReport(
            entry=entry, comparison=comparison_result,
            usage=usage_result, evaluations=[],
        ))

        # ── Post-fetch status: show outcome inline so the user can watch
        # results stream in (verified / flagged / not found).
        now = time.time()
        # Same throttle as above; always show the last entry.
        if now - last_yield > 0.4 or i == total - 1:
            outcome = _outcome_label(comparison_result)
            detail_parts = [f"<code>{safe_key}</code>"]
            if outcome:
                detail_parts.append(outcome)
            if entry.title:
                detail_parts.append(
                    f"<span style='color:#64748b'>{_short_title(entry)}</span>"
                )
            yield _progress(
                f"Bibliography {i + 1}/{total}",
                " · ".join(detail_parts),
                [
                    f"📚 {i + 1}/{total}",
                    f"✓ {verified_count}",
                    f"⚠ {flagged_count}",
                    f"? {not_found_count}",
                    _elapsed(),
                ],
            )
            last_yield = now

    if retraction:
        try:
            doi_count = sum(1 for e in entries if getattr(e, "doi", ""))
            yield _progress(
                "Retraction lookups",
                f"Querying CrossRef for {doi_count} DOI(s)",
                [_elapsed()],
            )
            report_gen.set_retraction_findings(RetractionChecker().check_entries(entries))
        except Exception:
            logger.exception("Retraction lookup crashed")

    if url_liveness:
        try:
            url_count = sum(1 for e in entries if getattr(e, "url", ""))
            yield _progress(
                "URL liveness",
                f"HEAD-checking {url_count} URL(s) in parallel",
                [_elapsed()],
            )
            report_gen.set_url_findings(URLChecker().check_entries(entries))
        except Exception:
            logger.exception("URL liveness crashed")

    # Save artifacts
    yield _progress(
        "Building report",
        "Rendering self-contained HTML, JSON, and Markdown",
        [_elapsed()],
    )

    html_path = out_dir / "report.html"
    md_path = out_dir / "bibliography_report.md"
    json_path = out_dir / "report.json"
    cleaned_bib_path: Path | None = None
    try:
        report_gen.save_html(str(html_path))
        report_gen.save_bibliography_report(str(md_path))
        report_gen.save_json(str(json_path))

        # Optionally emit a copy of the .bib restricted to entries the
        # usage check marked as used.
        if usage_checker:
            used_keys = {er.entry.key for er in report_gen.entries if er.usage and er.usage.is_used}
            if used_keys:
                cleaned_bib_path = out_dir / f"{bib_path.stem}_only_used.bib"
                bib_parser.filter_file(str(bib_path), str(cleaned_bib_path), used_keys)
    except Exception:
        # Whatever was written before the failure is still offered below.
        logger.exception("Artifact generation failed")

    # Embed report.html as iframe srcdoc
    if html_path.exists():
        iframe_html = _html_to_iframe(html_path.read_text(encoding='utf-8'))
    else:
        iframe_html = _placeholder("Report generation failed — see bibguard.log.")

    meta = _meta_with_logs([
        f"📚 {len(entries)} entries",
        f"✓ {verified_count} verified",
        f"⚠ {flagged_count} flagged",
        _elapsed(),
    ])

    # Logged errors downgrade the final state even though a report exists.
    state = "done"
    summary = "Report ready. Use the right pane to filter, search, and copy fixes."
    if log_stats.errors > 0:
        state = "error"
        summary = (f"Done with {log_stats.errors} error(s) and {log_stats.warnings} warning(s) "
                   "logged — see <code>bibguard.log</code> for full tracebacks.")
    elif log_stats.warnings > 0:
        summary = (f"Report ready ({log_stats.warnings} warnings logged — see "
                   "<code>bibguard.log</code>).")

    yield (
        iframe_html,
        _status_html("Done", summary, meta=meta, state=state),
        str(html_path) if html_path.exists() else None,
        str(md_path) if md_path.exists() else None,
        str(json_path) if json_path.exists() else None,
        str(cleaned_bib_path) if (cleaned_bib_path and cleaned_bib_path.exists()) else None,
        log_path_str,
    )
# --------------------------------------------------------------------- layout
def create_app() -> gr.Blocks:
    """Build the BibGuard Gradio UI and wire up its event handlers.

    The layout is, top to bottom: header, a one-row toolbar (two upload
    buttons, preset selector, Run/Stop), an "Advanced settings" accordion of
    per-check checkboxes, a status strip, the report panel, a "Downloads"
    accordion, and a footer.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched here.
    """
    # Local import: only this function needs it, and it keeps the block
    # self-contained with respect to the file's top-level import list.
    from html import escape

    # Inline app icon as a base64 data URL — works regardless of cwd.
    icon_html = '<span style="font-size:28px">🛡️</span>'
    try:
        icon_path = Path(__file__).parent / "assets" / "icon-192.png"
        if icon_path.exists():
            b64 = base64.b64encode(icon_path.read_bytes()).decode()
            icon_html = (
                f'<img src="data:image/png;base64,{b64}" '
                f'style="width:32px;height:32px;border-radius:6px" alt="BibGuard">'
            )
    except Exception as e:
        # Best effort: a missing/unreadable icon must never stop the app.
        logger.debug("Icon load failed; using emoji fallback: %s", e, exc_info=True)

    # Chip shown under an upload button while nothing has been picked.
    no_file_chip = '<div class="bg-fname">no file selected</div>'

    with gr.Blocks(
        title="BibGuard — Bibliography & LaTeX Quality Auditor",
    ) as app:
        gr.HTML(f"""
        <div class="bg-header" style="display:flex;align-items:center;gap:10px">
          {icon_html}
          <strong style="font-size:18px">BibGuard</strong>
          <span style="color:#6b7280;font-size:13px">— Bibliography & LaTeX quality auditor</span>
          <span style="flex:1"></span>
          <a href="https://github.com/thinkwee/BibGuard" target="_blank" rel="noopener noreferrer"
             style="color:#6b7280;text-decoration:none;font-size:13px">GitHub ↗</a>
        </div>
        """)

        # ───────────────────────── Top toolbar ─────────────────────────
        # All primary controls on a single horizontal row, every primary
        # widget pinned to 56px height. gr.UploadButton replaces gr.File
        # because the latter's drop-zone doesn't shrink to a toolbar.
        with gr.Row(elem_classes=["bg-toolbar"]):
            with gr.Column(scale=2, min_width=200):
                bib_btn = gr.UploadButton(
                    "📚 Choose .bib file",
                    file_types=[".bib"], file_count="single",
                    elem_classes=["bg-upload-btn"],
                )
                bib_status = gr.HTML(no_file_chip)
            with gr.Column(scale=2, min_width=200):
                tex_btn = gr.UploadButton(
                    "📄 Choose .tex file",
                    file_types=[".tex"], file_count="single",
                    elem_classes=["bg-upload-btn"],
                )
                tex_status = gr.HTML(no_file_chip)
            with gr.Column(scale=3, min_width=280):
                preset = gr.Radio(
                    choices=list(PRESETS.keys()),
                    value="Standard",
                    show_label=False,
                    elem_classes=["bg-preset"],
                )
                preset_caption = gr.HTML(
                    _preset_caption_html("Standard"),
                )
            with gr.Column(scale=1, min_width=140):
                run_btn = gr.Button("▶ Run check", variant="primary",
                                    elem_classes=["bg-run-btn"])
                stop_btn = gr.Button("◼ Stop", variant="stop",
                                     elem_classes=["bg-run-btn", "bg-stop-btn"],
                                     visible=False)
                # Empty chip so this column is as tall as the upload columns.
                gr.HTML('<div class="bg-fname" style="text-align:center">&nbsp;</div>')

        # Holds the selected file paths (strings). Updated by the UploadButton
        # callbacks below so run_check sees plain paths regardless of how the
        # user picked the files.
        bib_path_state = gr.State(value=None)
        tex_path_state = gr.State(value=None)

        # Advanced fine-grained toggles. Default closed — most users just
        # pick a preset and go. Each tab is composed of gr.Row blocks of
        # exactly 4 cells so columns line up vertically. Short rows are
        # padded with invisible spacer HTML.
        def _spacer():
            return gr.HTML('<div class="bg-row-spacer">&nbsp;</div>',
                           elem_classes=["bg-row-spacer"])

        with gr.Accordion("⚙️ Advanced settings", open=False):
            with gr.Tabs():
                with gr.TabItem("Bibliography"):
                    with gr.Row(elem_classes=["bg-row"]):
                        check_metadata = gr.Checkbox(label="Metadata verify", value=False)
                        check_usage = gr.Checkbox(label="Usage", value=True)
                        check_duplicates = gr.Checkbox(label="Duplicates", value=True)
                        check_preprint_ratio = gr.Checkbox(label="Preprints", value=True)
                    with gr.Row(elem_classes=["bg-row"]):
                        retraction = gr.Checkbox(label="Retractions", value=True)
                        url_liveness = gr.Checkbox(label="URL liveness", value=False)
                        _spacer()
                        _spacer()
                with gr.TabItem("LaTeX format"):
                    with gr.Row(elem_classes=["bg-row"]):
                        caption = gr.Checkbox(label="Captions", value=True)
                        reference = gr.Checkbox(label="References", value=True)
                        formatting = gr.Checkbox(label="Formatting", value=True)
                        equation = gr.Checkbox(label="Equations", value=True)
                with gr.TabItem("Writing"):
                    with gr.Row(elem_classes=["bg-row"]):
                        ai_artifacts = gr.Checkbox(label="AI artifacts", value=True)
                        sentence = gr.Checkbox(label="Sentences", value=True)
                        consistency = gr.Checkbox(label="Consistency", value=True)
                        acronym = gr.Checkbox(label="Acronyms", value=True)
                    with gr.Row(elem_classes=["bg-row"]):
                        number = gr.Checkbox(label="Numbers", value=True)
                        citation_quality = gr.Checkbox(label="Citations", value=True)
                        anonymization = gr.Checkbox(label="Anonymization", value=True)
                        _spacer()

        # ───────────────────────── Status strip ─────────────────────────
        status_panel = gr.HTML(value=EMPTY_STATUS_HTML, elem_id="bg-status-wrap")

        # ───────────────────────── Report (full width) ───────────────────
        with gr.Row(elem_classes=["bg-main"]):
            report_panel = gr.HTML(value=EMPTY_PANEL_HTML)

        # ───────────────────────── Downloads ────────────────────────────
        with gr.Accordion("📥 Downloads", open=False):
            with gr.Row(elem_classes=["bg-downloads"]):
                download_html = gr.File(label="report.html (offline)",
                                        interactive=False, elem_classes=["bg-file-input"])
                download_md = gr.File(label="bibliography_report.md",
                                      interactive=False, elem_classes=["bg-file-input"])
                download_json = gr.File(label="report.json",
                                        interactive=False, elem_classes=["bg-file-input"])
                download_bib = gr.File(label="cleaned .bib",
                                       interactive=False, elem_classes=["bg-file-input"])
                download_log = gr.File(label="bibguard.log",
                                       interactive=False, elem_classes=["bg-file-input"])

        gr.HTML(
            '<div class="bg-footer">'
            'Set <code>$BIBGUARD_CONTACT_EMAIL</code> for the polite-pool User-Agent · '
            # Escaped because a filesystem path may legally contain & or <.
            f'persistent log at <code>{escape(str(LOG_PATH))}</code> · '
            'set <code>BIBGUARD_DEBUG=1</code> for verbose console output.'
            '</div>'
        )

        # Selecting a preset rewrites every advanced checkbox.
        # NOTE(review): the order of `outputs` presumably has to match the
        # sequence returned by apply_preset — confirm against its definition.
        preset.change(
            fn=apply_preset,
            inputs=[preset],
            outputs=[
                check_metadata, check_usage, check_duplicates, check_preprint_ratio,
                caption, reference, formatting, equation,
                ai_artifacts, sentence, consistency, acronym,
                number, citation_quality, anonymization,
                url_liveness, retraction,
            ],
        )
        # ...and refreshes the caption shown under the preset selector.
        preset.change(
            fn=_preset_caption_html,
            inputs=[preset],
            outputs=[preset_caption],
        )

        # ---- Upload-button callbacks: store path in state + update chip ----
        def _make_upload_handler(icon):
            """Return a callback yielding ``(path, chip_html)`` for one button."""
            def _on_upload(f):
                if f is None:
                    return None, no_file_chip
                # Accept either an object exposing ``.name`` or a plain path.
                path = getattr(f, "name", str(f))
                # The file name is chosen by the user and lands in HTML, so it
                # must be escaped before interpolation.
                shown = escape(Path(path).name)
                return path, f'<div class="bg-fname ok">{icon} {shown}</div>'
            return _on_upload

        _on_bib_upload = _make_upload_handler("📚")
        _on_tex_upload = _make_upload_handler("📄")

        bib_btn.upload(_on_bib_upload, inputs=[bib_btn], outputs=[bib_path_state, bib_status])
        tex_btn.upload(_on_tex_upload, inputs=[tex_btn], outputs=[tex_path_state, tex_status])

        # Run pipeline:
        #   1. Toggle visibility: hide Run, show Stop.
        #   2. Stream run_check yields into report + status + downloads.
        #   3. After completion, swap buttons back.
        # Stop button cancels the streaming task via Gradio's `cancels=`.
        def _show_stop():
            return gr.update(visible=False), gr.update(visible=True)

        def _show_run():
            return gr.update(visible=True), gr.update(visible=False)

        def _on_stop():
            # Restores the Run button itself and reports the cancellation,
            # rather than relying on the run chain's final step.
            return (
                gr.update(visible=True),
                gr.update(visible=False),
                _status_html("Cancelled",
                             "Run interrupted by user. Partial results discarded.",
                             state="error"),
            )

        run_event = run_btn.click(
            fn=_show_stop, inputs=None, outputs=[run_btn, stop_btn],
        ).then(
            fn=run_check,
            # Gradio passes inputs positionally, so this order has to line up
            # with run_check's parameter order.
            inputs=[
                bib_path_state, tex_path_state,
                check_metadata, check_usage, check_duplicates, check_preprint_ratio,
                caption, reference, formatting, equation, ai_artifacts,
                sentence, consistency, acronym, number, citation_quality, anonymization,
                url_liveness, retraction,
            ],
            outputs=[report_panel, status_panel,
                     download_html, download_md, download_json, download_bib, download_log],
        ).then(
            fn=_show_run, inputs=None, outputs=[run_btn, stop_btn],
        )

        stop_btn.click(
            fn=_on_stop,
            inputs=None,
            outputs=[run_btn, stop_btn, status_panel],
            cancels=[run_event],
        )

    return app
# Built unconditionally at import time, so the module-level name `app`
# exists whether this file is imported or executed as a script.
app = create_app()


if __name__ == "__main__":
    # Only hand Gradio a favicon when the bundled icon is actually on disk.
    _favicon = Path(__file__).parent.joinpath("assets", "icon-192.png")
    if _favicon.exists():
        _favicon_arg = str(_favicon)
    else:
        _favicon_arg = None

    _launch_options = {
        "favicon_path": _favicon_arg,
        "show_error": True,
        "css": CUSTOM_CSS,
        "theme": gr.themes.Soft(),
    }
    app.launch(**_launch_options)