#!/usr/bin/env python3 """ BibGuard Gradio web app — minimalist iframe layout. The right pane embeds the self-contained ``report.html`` produced by ``src/report/html_report.py`` via ``' ) def _status_html(stage: str, detail: str = "", meta: list[str] | None = None, state: str = "running") -> str: """Render the live-status strip shown above the report. Layout is a single horizontal row: [stage] [detail] [meta chips]. Wraps cleanly on narrow screens. """ if state == "running": stage_icon = '' elif state == "done": stage_icon = '' elif state == "error": stage_icon = '' else: stage_icon = '' detail_html = f'{detail}' if detail else '' meta_html = "" if meta: meta_html = ( '' + " ".join(f"{m}" for m in meta) + "" ) return ( f'
' f'
' f'{stage_icon}{stage}' f'{detail_html}{meta_html}' f'
# --------------------------------------------------------------- config glue
def create_config_from_ui(
    check_metadata,
    check_usage,
    check_duplicates,
    check_preprint_ratio,
    caption,
    reference,
    formatting,
    equation,
    ai_artifacts,
    sentence,
    consistency,
    acronym,
    number,
    citation_quality,
    anonymization,
) -> BibGuardConfig:
    """Build a ``BibGuardConfig`` from the individual UI toggle values.

    The first four flags fill the bibliography section and the remaining
    eleven fill the submission section. Relevance checking is always switched
    off here, and the output section is fixed to ``quiet=True`` /
    ``minimal_verified=False``.
    """
    ui_config = BibGuardConfig()

    bibliography_flags = {
        "check_metadata": check_metadata,
        "check_usage": check_usage,
        "check_duplicates": check_duplicates,
        "check_preprint_ratio": check_preprint_ratio,
        "check_relevance": False,  # LLM disabled in web mode
    }
    ui_config.bibliography = BibliographyConfig(**bibliography_flags)

    submission_flags = {
        "caption": caption,
        "reference": reference,
        "formatting": formatting,
        "equation": equation,
        "ai_artifacts": ai_artifacts,
        "sentence": sentence,
        "consistency": consistency,
        "acronym": acronym,
        "number": number,
        "citation_quality": citation_quality,
        "anonymization": anonymization,
    }
    ui_config.submission = SubmissionConfig(**submission_flags)

    ui_config.output = OutputConfig(quiet=True, minimal_verified=False)
    return ui_config


def apply_preset(name: str):
    """Return the 17 toggle values stored for preset *name*.

    Unknown names fall back to the ``"Standard"`` preset. The tuple order is:
    the four bibliography flags, the eleven submission flags, then
    ``url_liveness`` and ``retraction``.
    """
    chosen = PRESETS.get(name, PRESETS["Standard"])
    submission = chosen["submission"]

    leading_keys = (
        "check_metadata",
        "check_usage",
        "check_duplicates",
        "check_preprint_ratio",
    )
    submission_keys = (
        "caption",
        "reference",
        "formatting",
        "equation",
        "ai_artifacts",
        "sentence",
        "consistency",
        "acronym",
        "number",
        "citation_quality",
        "anonymization",
    )
    trailing_keys = ("url_liveness", "retraction")

    return (
        *(chosen[key] for key in leading_keys),
        *(submission[key] for key in submission_keys),
        *(chosen[key] for key in trailing_keys),
    )
{text}
# ------------------------------------------------------------------ run_check
# Streaming generator. Each yield is a 7-tuple:
#   (iframe_html, status_html, html_path, md_path, json_path,
#    cleaned_bib_path, log_path)
# The whole pipeline runs inside `capture_run`, which is handed
# `<out_dir>/bibguard.log` as its target so the per-run log lives next to the
# report files and can be offered as a download. The status panel surfaces
# warning+error counts so problems aren't invisible.
def run_check(
    bib_file,
    tex_file,
    check_metadata,
    check_usage,
    check_duplicates,
    check_preprint_ratio,
    caption,
    reference,
    formatting,
    equation,
    ai_artifacts,
    sentence,
    consistency,
    acronym,
    number,
    citation_quality,
    anonymization,
    url_liveness=False,
    retraction=True,
):
    """Run the full check pipeline as a streaming generator with per-run logging.

    `bib_file` / `tex_file` are filesystem path strings (carried by gr.State),
    not gr.File objects. The status panel is the single source of progress
    feedback — no separate gr.Progress bar.

    Yields 7-tuples ``(report_html, status_html, html_path, md_path,
    json_path, cleaned_bib_path, log_path)``. When either input path is
    missing, a single "waiting for files" tuple is yielded and the generator
    returns without creating any artifacts. Any exception escaping
    `_run_check_impl` is logged and converted into one final "Failed" tuple
    that still carries the log path.
    """
    # Local import keeps this block self-contained; only the crash handler
    # below needs it.
    import html

    started = time.time()

    def _elapsed() -> str:
        return f"⏱ {int(time.time() - started)}s"

    # Initial state: keep current report (None means clear).
    if not bib_file or not tex_file:
        yield (
            _placeholder("Please choose both a .bib and a .tex file in the toolbar."),
            _status_html(
                "Waiting for files",
                "Pick a .bib and a .tex file from the toolbar to start.",
                state="error",
            ),
            None, None, None, None, None,
        )
        return

    # Allocate the artifact dir up-front so the per-run log lives next to
    # the report files.
    out_dir = Path(tempfile.mkdtemp(prefix="bibguard_"))
    log_path_target = out_dir / "bibguard.log"

    # Reset per-source circuit breakers so a previous run's flaky source
    # doesn't carry over and skip valid lookups in this run.
    http_layer.reset_breakers()

    with capture_run(target_path=log_path_target) as (log_path, log_stats):
        logger.info("=== run_check start: bib=%s tex=%s ===", bib_file, tex_file)
        try:
            yield from _run_check_impl(
                bib_file, tex_file, out_dir, log_path, log_stats,
                check_metadata, check_usage, check_duplicates, check_preprint_ratio,
                caption, reference, formatting, equation, ai_artifacts,
                sentence, consistency, acronym, number, citation_quality,
                anonymization, url_liveness, retraction, started, _elapsed,
            )
        except Exception as e:
            logger.exception("run_check crashed (entry-level guard)")
            # The status strip inserts `detail` into HTML verbatim, so the
            # exception text (which may quote uploaded file content) must be
            # escaped or a stray '<' / '&' would be rendered as markup.
            safe_error = html.escape(str(e))
            yield (
                # NOTE(review): `_placeholder` is not visible from here; confirm
                # whether it escapes its argument before escaping this one too.
                _placeholder(f"Unhandled error: {e}"),
                _status_html(
                    "Failed",
                    f"{safe_error} — see bibguard.log for the full traceback.",
                    state="error",
                ),
                None, None, None, None, str(log_path),
            )
        finally:
            logger.info(
                "=== run_check end: warnings=%d errors=%d ===",
                log_stats.warnings, log_stats.errors,
            )
""" log_path_str = str(log_path) bib_path = Path(bib_file) tex_path = Path(tex_file) logger.info("Inputs: bib=%s tex=%s out_dir=%s", bib_path, tex_path, out_dir) def _meta_with_logs(extra: list[str]) -> list[str]: out = list(extra) if log_stats.warnings or log_stats.errors: out.append(f"⚠ {log_stats.warnings}w / {log_stats.errors}e logged") return out yield ( gr.update(), _status_html("Validating files", f"Reading {bib_path.name} and {tex_path.name}", meta=_meta_with_logs([_elapsed()])), None, None, None, None, log_path_str, ) # Pre-flight content validation bib_rep = validate_bib(bib_path) tex_rep = validate_tex(tex_path) msg = "\n".join(filter(None, [ format_report(bib_rep, bib_path.name), format_report(tex_rep, tex_path.name), ])) if not bib_rep.ok or not tex_rep.ok: logger.error("File validation failed:\n%s", msg) block = ( f'
' f'
⚠️
' f'
File validation failed
' f'
{msg}
' f'
' ) yield ( block, _status_html("File validation failed", msg.replace("\n", "
"), state="error"), None, None, None, None, log_path_str, ) return elif msg: logger.info("Validation warnings:\n%s", msg) config = create_config_from_ui( check_metadata, check_usage, check_duplicates, check_preprint_ratio, caption, reference, formatting, equation, ai_artifacts, sentence, consistency, acronym, number, citation_quality, anonymization, ) yield ( gr.update(), _status_html("Parsing", "Loading bibliography and LaTeX source", meta=_meta_with_logs([_elapsed()])), None, None, None, None, log_path_str, ) tex_content = tex_path.read_text(encoding='utf-8', errors='replace') bib_parser = BibParser() entries = bib_parser.parse_file(str(bib_path)) tex_parser = TexParser() tex_parser.parse_file(str(tex_path)) logger.info("Parsed %d bib entries from %s", len(entries), bib_path.name) bib_config = config.bibliography # Init components arxiv_fetcher = crossref_fetcher = ss_fetcher = oa_fetcher = dblp_fetcher = None comparator = usage_checker = duplicate_detector = None if bib_config.check_metadata: arxiv_fetcher = ArxivFetcher() ss_fetcher = SemanticScholarFetcher() oa_fetcher = OpenAlexFetcher() dblp_fetcher = DBLPFetcher() crossref_fetcher = CrossRefFetcher() comparator = MetadataComparator() if bib_config.check_usage: usage_checker = UsageChecker(tex_parser) if bib_config.check_duplicates: duplicate_detector = DuplicateDetector() report_gen = ReportGenerator( minimal_verified=False, check_preprint_ratio=bib_config.check_preprint_ratio, preprint_warning_threshold=bib_config.preprint_warning_threshold, ) report_gen.set_metadata([str(bib_path)], [str(tex_path)]) # Submission quality checks yield ( gr.update(), _status_html("LaTeX quality checks", f"Running {len(config.submission.get_enabled_checkers())} checkers on the LaTeX source", meta=_meta_with_logs([f"📚 {len(entries)} bib entries", _elapsed()])), None, None, None, None, log_path_str, ) submission_results = [] for name in config.submission.get_enabled_checkers(): if name in CHECKER_REGISTRY: try: checker = 
CHECKER_REGISTRY[name]() results = checker.check(tex_content, {}) for r in results: r.file_path = str(tex_path) submission_results.extend(results) except Exception: logger.exception("Checker %s crashed", name) report_gen.set_submission_results(submission_results, None) if bib_config.check_duplicates and duplicate_detector: try: report_gen.set_duplicate_groups(duplicate_detector.find_duplicates(entries)) except Exception: logger.exception("Duplicate detection crashed") if bib_config.check_usage and usage_checker: try: report_gen.set_missing_citations(usage_checker.get_missing_entries(entries)) except Exception: logger.exception("Missing-citation lookup crashed") # Per-entry workflow total = max(1, len(entries)) workflow_config = get_default_workflow() verified_count = 0 flagged_count = 0 not_found_count = 0 last_yield = time.time() def _identifier_chip(entry) -> str: """Tiny inline hint about which IDs we have for this entry.""" bits = [] if entry.doi: bits.append("DOI") if entry.has_arxiv: bits.append("arXiv") if entry.title and not bits: bits.append("title") elif entry.title: bits.append("title") return " + ".join(bits) if bits else "no identifiers" def _outcome_label(cmp) -> str: if cmp is None: return "" if cmp.source == "unable": return "? no metadata" if cmp.is_match: return f"✓ verified by {cmp.source}" return f"⚠ flagged ({cmp.source})" for i, entry in enumerate(entries): # ── Pre-fetch status: announce identifier set BEFORE the network roundtrip # so the user sees what's being attempted, not just the entry name. if bib_config.check_metadata and comparator: now = time.time() if now - last_yield > 0.4 or i == 0: ids = _identifier_chip(entry) detail = f"{entry.key} · querying via {ids}" if entry.title: short = entry.title[:70] + ("…" if len(entry.title) > 70 else "") detail += f" — {short}" yield ( gr.update(), _status_html( f"Verifying entry {i + 1}/{total}", detail, meta=_meta_with_logs([ f"📚 {total} total", f"✓ {verified_count}", f"⚠ {flagged_count}", f"? 
{not_found_count}", _elapsed(), ]), ), None, None, None, None, log_path_str, ) last_yield = now usage_result = None comparison_result = None try: if usage_checker: usage_result = usage_checker.check_usage(entry) except Exception: logger.exception("Usage check crashed for entry=%s", entry.key) try: if bib_config.check_metadata and comparator: comparison_result = fetch_and_compare_with_workflow( entry, workflow_config, arxiv_fetcher, crossref_fetcher, ss_fetcher, oa_fetcher, dblp_fetcher, comparator, ) if comparison_result is None or comparison_result.source == "unable": not_found_count += 1 elif comparison_result.is_match: verified_count += 1 else: flagged_count += 1 except Exception: logger.exception("Metadata fetch crashed for entry=%s", entry.key) report_gen.add_entry_report(EntryReport( entry=entry, comparison=comparison_result, usage=usage_result, evaluations=[], )) # ── Post-fetch status: show outcome inline so the user can watch # results stream in (verified / flagged / not found). now = time.time() if now - last_yield > 0.4 or i == total - 1: outcome = _outcome_label(comparison_result) detail_parts = [f"{entry.key}"] if outcome: detail_parts.append(outcome) if entry.title: short = entry.title[:70] + ("…" if len(entry.title) > 70 else "") detail_parts.append(f"{short}") detail = " · ".join(detail_parts) meta = _meta_with_logs([ f"📚 {i + 1}/{total}", f"✓ {verified_count}", f"⚠ {flagged_count}", f"? 
{not_found_count}", _elapsed(), ]) yield ( gr.update(), _status_html(f"Bibliography {i + 1}/{total}", detail, meta=meta), None, None, None, None, log_path_str, ) last_yield = now if retraction: try: doi_count = sum(1 for e in entries if getattr(e, "doi", "")) yield ( gr.update(), _status_html("Retraction lookups", f"Querying CrossRef for {doi_count} DOI(s)", meta=_meta_with_logs([_elapsed()])), None, None, None, None, log_path_str, ) report_gen.set_retraction_findings(RetractionChecker().check_entries(entries)) except Exception: logger.exception("Retraction lookup crashed") if url_liveness: try: url_count = sum(1 for e in entries if getattr(e, "url", "")) yield ( gr.update(), _status_html("URL liveness", f"HEAD-checking {url_count} URL(s) in parallel", meta=_meta_with_logs([_elapsed()])), None, None, None, None, log_path_str, ) report_gen.set_url_findings(URLChecker().check_entries(entries)) except Exception: logger.exception("URL liveness crashed") # Save artifacts yield ( gr.update(), _status_html("Building report", "Rendering self-contained HTML, JSON, and Markdown", meta=_meta_with_logs([_elapsed()])), None, None, None, None, log_path_str, ) html_path = out_dir / "report.html" md_path = out_dir / "bibliography_report.md" json_path = out_dir / "report.json" cleaned_bib_path: Path | None = None try: report_gen.save_html(str(html_path)) report_gen.save_bibliography_report(str(md_path)) report_gen.save_json(str(json_path)) if usage_checker: used_keys = {er.entry.key for er in report_gen.entries if er.usage and er.usage.is_used} if used_keys: cleaned_bib_path = out_dir / f"{bib_path.stem}_only_used.bib" bib_parser.filter_file(str(bib_path), str(cleaned_bib_path), used_keys) except Exception: logger.exception("Artifact generation failed") # Embed report.html as iframe srcdoc if html_path.exists(): iframe_html = _html_to_iframe(html_path.read_text(encoding='utf-8')) else: iframe_html = _placeholder("Report generation failed — see bibguard.log.") meta = 
_meta_with_logs([ f"📚 {len(entries)} entries", f"✓ {verified_count} verified", f"⚠ {flagged_count} flagged", _elapsed(), ]) state = "done" summary = "Report ready. Use the right pane to filter, search, and copy fixes." if log_stats.errors > 0: state = "error" summary = (f"Done with {log_stats.errors} error(s) and {log_stats.warnings} warning(s) " "logged — see bibguard.log for full tracebacks.") elif log_stats.warnings > 0: summary = (f"Report ready ({log_stats.warnings} warnings logged — see " "bibguard.log).") yield ( iframe_html, _status_html("Done", summary, meta=meta, state=state), str(html_path) if html_path.exists() else None, str(md_path) if md_path.exists() else None, str(json_path) if json_path.exists() else None, str(cleaned_bib_path) if (cleaned_bib_path and cleaned_bib_path.exists()) else None, log_path_str, ) # --------------------------------------------------------------------- layout def create_app() -> gr.Blocks: # Inline app icon as a base64 data URL — works regardless of cwd. icon_html = '🛡️' try: icon_path = Path(__file__).parent / "assets" / "icon-192.png" if icon_path.exists(): with open(icon_path, "rb") as f: b64 = base64.b64encode(f.read()).decode() icon_html = ( f'BibGuard' ) except Exception as e: logger.debug("Icon load failed; using emoji fallback: %s", e, exc_info=True) with gr.Blocks( title="BibGuard — Bibliography & LaTeX Quality Auditor", ) as app: gr.HTML(f"""
{icon_html} BibGuard — Bibliography & LaTeX quality auditor GitHub ↗
""") # ───────────────────────── Top toolbar ───────────────────────── # All primary controls on a single horizontal row, every primary # widget pinned to 56px height. gr.UploadButton replaces gr.File # because the latter's drop-zone doesn't shrink to a toolbar. with gr.Row(elem_classes=["bg-toolbar"]): with gr.Column(scale=2, min_width=200): bib_btn = gr.UploadButton( "📚 Choose .bib file", file_types=[".bib"], file_count="single", elem_classes=["bg-upload-btn"], ) bib_status = gr.HTML('
no file selected
') with gr.Column(scale=2, min_width=200): tex_btn = gr.UploadButton( "📄 Choose .tex file", file_types=[".tex"], file_count="single", elem_classes=["bg-upload-btn"], ) tex_status = gr.HTML('
no file selected
') with gr.Column(scale=3, min_width=280): preset = gr.Radio( choices=list(PRESETS.keys()), value="Standard", show_label=False, elem_classes=["bg-preset"], ) preset_caption = gr.HTML( _preset_caption_html("Standard"), ) with gr.Column(scale=1, min_width=140): run_btn = gr.Button("▶ Run check", variant="primary", elem_classes=["bg-run-btn"]) stop_btn = gr.Button("◼ Stop", variant="stop", elem_classes=["bg-run-btn", "bg-stop-btn"], visible=False) gr.HTML('
 
') # Holds the selected file paths (strings). Updated by the UploadButton # callbacks below so run_check sees plain paths regardless of how the # user picked the files. bib_path_state = gr.State(value=None) tex_path_state = gr.State(value=None) # Advanced fine-grained toggles. Default closed — most users just # pick a preset and go. Each tab is composed of gr.Row blocks of # exactly 4 cells so columns line up vertically. Short rows are # padded with invisible spacer HTML. def _spacer(): return gr.HTML('
 
', elem_classes=["bg-row-spacer"]) with gr.Accordion("⚙️ Advanced settings", open=False): with gr.Tabs(): with gr.TabItem("Bibliography"): with gr.Row(elem_classes=["bg-row"]): check_metadata = gr.Checkbox(label="Metadata verify", value=False) check_usage = gr.Checkbox(label="Usage", value=True) check_duplicates = gr.Checkbox(label="Duplicates", value=True) check_preprint_ratio = gr.Checkbox(label="Preprints", value=True) with gr.Row(elem_classes=["bg-row"]): retraction = gr.Checkbox(label="Retractions", value=True) url_liveness = gr.Checkbox(label="URL liveness", value=False) _spacer() _spacer() with gr.TabItem("LaTeX format"): with gr.Row(elem_classes=["bg-row"]): caption = gr.Checkbox(label="Captions", value=True) reference = gr.Checkbox(label="References", value=True) formatting = gr.Checkbox(label="Formatting", value=True) equation = gr.Checkbox(label="Equations", value=True) with gr.TabItem("Writing"): with gr.Row(elem_classes=["bg-row"]): ai_artifacts = gr.Checkbox(label="AI artifacts", value=True) sentence = gr.Checkbox(label="Sentences", value=True) consistency = gr.Checkbox(label="Consistency", value=True) acronym = gr.Checkbox(label="Acronyms", value=True) with gr.Row(elem_classes=["bg-row"]): number = gr.Checkbox(label="Numbers", value=True) citation_quality = gr.Checkbox(label="Citations", value=True) anonymization = gr.Checkbox(label="Anonymization", value=True) _spacer() # ───────────────────────── Status strip ───────────────────────── status_panel = gr.HTML(value=EMPTY_STATUS_HTML, elem_id="bg-status-wrap") # ───────────────────────── Report (full width) ─────────────────── with gr.Row(elem_classes=["bg-main"]): report_panel = gr.HTML(value=EMPTY_PANEL_HTML) # ───────────────────────── Downloads ──────────────────────────── with gr.Accordion("📥 Downloads", open=False): with gr.Row(elem_classes=["bg-downloads"]): download_html = gr.File(label="report.html (offline)", interactive=False, elem_classes=["bg-file-input"]) download_md = 
gr.File(label="bibliography_report.md", interactive=False, elem_classes=["bg-file-input"]) download_json = gr.File(label="report.json", interactive=False, elem_classes=["bg-file-input"]) download_bib = gr.File(label="cleaned .bib", interactive=False, elem_classes=["bg-file-input"]) download_log = gr.File(label="bibguard.log", interactive=False, elem_classes=["bg-file-input"]) gr.HTML( '' ) preset.change( fn=apply_preset, inputs=[preset], outputs=[ check_metadata, check_usage, check_duplicates, check_preprint_ratio, caption, reference, formatting, equation, ai_artifacts, sentence, consistency, acronym, number, citation_quality, anonymization, url_liveness, retraction, ], ) preset.change( fn=_preset_caption_html, inputs=[preset], outputs=[preset_caption], ) # ---- Upload-button callbacks: store path in state + update chip ---- def _on_bib_upload(f): if f is None: return None, '
no file selected
' path = getattr(f, "name", str(f)) return path, f'
📚 {Path(path).name}
' def _on_tex_upload(f): if f is None: return None, '
no file selected
' path = getattr(f, "name", str(f)) return path, f'
📄 {Path(path).name}
' bib_btn.upload(_on_bib_upload, inputs=[bib_btn], outputs=[bib_path_state, bib_status]) tex_btn.upload(_on_tex_upload, inputs=[tex_btn], outputs=[tex_path_state, tex_status]) # Run pipeline: # 1. Toggle visibility: hide Run, show Stop. # 2. Stream run_check yields into report + status + downloads. # 3. After completion, swap buttons back. # Stop button cancels the streaming task via Gradio's `cancels=`. def _show_stop(): return gr.update(visible=False), gr.update(visible=True) def _show_run(): return gr.update(visible=True), gr.update(visible=False) run_event = run_btn.click( fn=_show_stop, inputs=None, outputs=[run_btn, stop_btn], ).then( fn=run_check, inputs=[ bib_path_state, tex_path_state, check_metadata, check_usage, check_duplicates, check_preprint_ratio, caption, reference, formatting, equation, ai_artifacts, sentence, consistency, acronym, number, citation_quality, anonymization, url_liveness, retraction, ], outputs=[report_panel, status_panel, download_html, download_md, download_json, download_bib, download_log], ).then( fn=_show_run, inputs=None, outputs=[run_btn, stop_btn], ) stop_btn.click( fn=lambda: ( gr.update(visible=True), gr.update(visible=False), _status_html("Cancelled", "Run interrupted by user. Partial results discarded.", state="error"), ), inputs=None, outputs=[run_btn, stop_btn, status_panel], cancels=[run_event], ) return app app = create_app() if __name__ == "__main__": _favicon = Path(__file__).parent / "assets" / "icon-192.png" app.launch( favicon_path=str(_favicon) if _favicon.exists() else None, show_error=True, css=CUSTOM_CSS, theme=gr.themes.Soft(), )