Spaces:

4waiz
/

CiteGuard-RAG

Running

App Files Files Community

CiteGuard-RAG / src /citeguard /cli.py

4waiz

chore: flatten repo layout

910eb59 10 days ago

raw

history blame contribute delete

2.97 kB

	"""Command-line interface."""
	from __future__ import annotations

	import os
	import sys
	from pathlib import Path

	# Force transformers to be torch-only. Some Windows environments ship a
	# TensorFlow build pinned to NumPy 1.x; importing it under NumPy 2.x crashes
	# the whole process. We never use TF or Flax, so disable them up front.
	#
	# These calls deliberately sit above the third-party and package imports
	# that follow. Presumably the variables are consulted when transformers is
	# first imported (TODO confirm), so keep them ahead of any import that could
	# pull it in.
	#
	# setdefault() only fills in variables that are missing, so a value already
	# exported in the caller's environment takes precedence over these defaults.
	os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")
	os.environ.setdefault("USE_TF", "0")
	os.environ.setdefault("USE_FLAX", "0")
	os.environ.setdefault("TRANSFORMERS_NO_TF", "1")

	import click

	from .config import load_config
	from .data_loaders.custom_jsonl import load_jsonl
	from .pipeline import CiteGuardPipeline


	# Root command group; the subcommands below register themselves on it via
	# @main.command(). The docstring is kept verbatim because click displays it
	# as the group's help text.
	@click.group()
	def main() -> None:
	    """CiteGuard-RAG command-line entry point."""


	@main.command()
	@click.option("--input", "input_path", required=True, type=click.Path(exists=True, dir_okay=False))
	@click.option("--config", "config_path", default=None, type=click.Path(exists=True, dir_okay=False))
	@click.option("--output", "output_dir", default="outputs", type=click.Path(file_okay=False))
	@click.option("--no-progress", is_flag=True, default=False, help="Disable progress bar.")
	def evaluate(input_path: str, config_path: str | None, output_dir: str, no_progress: bool) -> None:
	    """Run the full pipeline on a JSONL of RAG examples."""
	    # Parameters (documented here rather than in the docstring, which click
	    # shows verbatim as the command's help text):
	    #   input_path  -- existing JSONL file passed to load_jsonl().
	    #   config_path -- optional existing config file passed to load_config();
	    #                  None when --config is omitted.
	    #   output_dir  -- root directory for all outputs (defaults to "outputs").
	    #   no_progress -- when True, the pipeline runs with show_progress=False.
	    # Exits with status 1 if load_jsonl() yields a falsy result.
	    settings = load_config(config_path)

	    # Every output directory is re-rooted under --output. Because the option
	    # defaults to "outputs", this override is applied on every run and
	    # replaces whatever directories load_config() returned.
	    base_dir = Path(output_dir)
	    for attr_name, subdir in (
	        ("tables_dir", "tables"),
	        ("figures_dir", "figures"),
	        ("cards_dir", "evidence_cards"),
	        ("reports_dir", "reports"),
	    ):
	        setattr(settings.output, attr_name, str(base_dir / subdir))

	    examples = load_jsonl(input_path)
	    if not examples:
	        # Nothing to evaluate: report on stderr and stop with a failure code.
	        click.echo(f"No examples loaded from {input_path}.", err=True)
	        sys.exit(1)

	    pipeline = CiteGuardPipeline(settings)
	    results = pipeline.run(examples, show_progress=not no_progress)
	    written = pipeline.write_outputs(results)

	    # Summarise what was produced: evaluation count, then one line per
	    # entry of the mapping returned by write_outputs().
	    click.echo(f"Wrote {len(results['evaluations'])} claim evaluations.")
	    for label, location in written.items():
	        click.echo(f" {label}: {location}")

	    # Headline metrics taken from the "aggregate" entry of the results.
	    summary = results["aggregate"]
	    headline = (
	        f"macro-F1={summary['macro_f1']:.4f} weighted-F1={summary['weighted_f1']:.4f} "
	        f"(n_with_gold={summary['n_with_gold_label']}/{summary['n_claims']})"
	    )
	    click.echo(headline)


	@main.command()
	@click.option("--input", "input_path", required=True, type=click.Path(exists=True, dir_okay=False))
	def split(input_path: str) -> None:
	    """Print claim splits for each example. Useful for debugging the splitter."""
	    # Imported lazily so the splitter module is only loaded when this
	    # subcommand actually runs.
	    from .preprocess.claim_splitter import ClaimSplitter

	    claim_splitter = ClaimSplitter()
	    for example in load_jsonl(input_path):
	        pieces = claim_splitter.split(example.example_id, example.answer)

	        # One header line per example, followed by one line per claim.
	        header = f"=== {example.example_id} ({len(pieces)} claims) ==="
	        click.echo(header)
	        for claim in pieces:
	            detail = f" [{claim.claim_id}] cited={claim.cited_doc_ids} :: {claim.text}"
	            click.echo(detail)


	# Allow running this module directly as a script: dispatch to the click group.
	if __name__ == "__main__":  # pragma: no cover
	    main()