# Spaces: InsanAlex/iris-at-text2sparql
# Running on CPU Upgrade
# File size: 5,361 Bytes
# d745844

"""Entry point for the Text2SPARQL repair pipeline.

Handles CLI argument parsing, loading inputs, running the pipeline,
and saving JSON traces. No business logic here.
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
from pathlib import Path

from .config import load_config
from .models import QueryRequest, RunTrace
from .pipeline import Text2SPARQLPipeline
from .utils import load_json, make_run_dir, save_json

# Module-level logger used by run_single and run_batch below. The name is a
# fixed string literal rather than __name__.
logger = logging.getLogger("text2sparql_repair")


def _setup_logging(verbose: bool = False) -> None:
    """Install the root logging configuration used by the CLI.

    Args:
        verbose: When true the threshold is DEBUG; otherwise it is INFO.
    """
    if verbose:
        threshold = logging.DEBUG
    else:
        threshold = logging.INFO

    # Timestamp, level, logger name, then the message itself.
    line_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    logging.basicConfig(
        format=line_format,
        datefmt=timestamp_format,
        level=threshold,
    )


def _save_trace(trace: RunTrace, run_dir: str) -> None:
    """Write each part of a run trace to its own JSON file under *run_dir*.

    Files are written in this order:
    - request.json             (trace.request)
    - context.json             (trace.context)
    - initial_candidates.json  (trace.initial_candidates)
    - validations.json         (trace.validation_history)
    - committee.json           (trace.committee_history)
    - decisions.json           (trace.decision_history)
    - repairs.json             (trace.repair_history)
    - final_trace.json         (the whole trace)

    Args:
        trace: The trace to persist; every part is serialized via its
            ``model_dump()`` method.
        run_dir: Directory path the files are written into.
    """

    def _write(filename: str, payload) -> None:
        # Paths are joined with a plain "/" inside run_dir.
        save_json(payload, f"{run_dir}/{filename}")

    # Single-object artifacts.
    _write("request.json", trace.request.model_dump())
    _write("context.json", trace.context.model_dump())

    # List-valued artifacts: (attribute on the trace, output file name).
    # Each entry of the list is dumped individually.
    list_artifacts = (
        ("initial_candidates", "initial_candidates.json"),
        ("validation_history", "validations.json"),
        ("committee_history", "committee.json"),
        ("decision_history", "decisions.json"),
        ("repair_history", "repairs.json"),
    )
    for attribute, filename in list_artifacts:
        entries = getattr(trace, attribute)
        _write(filename, [entry.model_dump() for entry in entries])

    # The complete trace goes last.
    _write("final_trace.json", trace.model_dump())


def run_single(request_path: str, config_path: str) -> None:
    """Process one request file end to end and persist its trace.

    The trace is saved into the directory returned by
    ``make_run_dir("runs", request.request_id)``.

    Args:
        request_path: Path to a JSON file containing a QueryRequest.
        config_path: Path to the YAML config file.
    """
    # The pipeline is built before the request file is read.
    pipeline = Text2SPARQLPipeline(load_config(config_path))

    request = QueryRequest(**load_json(request_path))
    logger.info("Processing single request: %s", request.request_id)

    trace = pipeline.run(request)

    output_dir = make_run_dir("runs", request.request_id)
    _save_trace(trace, output_dir)

    status = trace.final_status
    # Log at most the first 120 characters of the final query.
    if trace.final_query:
        query_preview = trace.final_query[:120]
    else:
        query_preview = "(empty)"

    logger.info("Done. Status: %s | Final query: %s", status, query_preview)
    logger.info("Trace saved to: %s", output_dir)


def _load_batch_requests(input_path: str) -> list:
    """Read the raw request payloads from a JSON-array or JSONL file.

    The format is chosen from the first non-whitespace character: ``[`` means
    the file is one JSON array; anything else is treated as JSONL, with blank
    lines skipped.

    Args:
        input_path: Path to the input file.

    Returns:
        The decoded payloads in file order (an empty list for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON / JSONL.
    """
    # Decode explicitly as UTF-8 instead of the locale default; "utf-8-sig"
    # additionally drops a leading byte-order mark, which json would reject.
    with open(input_path, "r", encoding="utf-8-sig") as f:
        text = f.read()

    # Ignore leading whitespace so a pretty-printed array (or one preceded by
    # a blank line) is not mistaken for JSONL.
    if text.lstrip().startswith("["):
        return json.loads(text)

    # Text mode already normalized line endings to "\n"; splitting on "\n"
    # only keeps other Unicode separators inside JSON strings intact.
    return [json.loads(line) for line in text.split("\n") if line.strip()]


def run_batch(input_path: str, config_path: str) -> None:
    """Run the pipeline on a batch of requests.

    Input file can be:
    - A JSON array of QueryRequest objects
    - A JSONL file (one QueryRequest per line)

    Each request is processed independently: if building the request,
    running the pipeline, or saving the trace fails, the error is logged with
    its traceback and the batch moves on to the next entry.

    Args:
        input_path: Path to the input file.
        config_path: Path to the YAML config file.

    Raises:
        OSError, UnicodeDecodeError, json.JSONDecodeError: If the input file
            itself cannot be read or parsed.
    """
    config = load_config(config_path)
    pipeline = Text2SPARQLPipeline(config)

    requests_data = _load_batch_requests(input_path)
    total = len(requests_data)
    logger.info("Processing batch of %d requests", total)

    failed = 0
    for position, req_data in enumerate(requests_data, start=1):
        try:
            # Built inside the try so that one malformed entry is reported
            # like any other per-request failure instead of aborting the batch.
            request = QueryRequest(**req_data)
            logger.info(
                "[%d/%d] Processing: %s",
                position, total, request.request_id,
            )

            trace = pipeline.run(request)
            run_dir = make_run_dir("runs", request.request_id)
            _save_trace(trace, run_dir)
            logger.info(
                "[%d/%d] Done. Status: %s",
                position, total, trace.final_status,
            )
        except Exception as exc:
            # Per-request boundary: log with traceback and keep going.
            failed += 1
            logger.error(
                "[%d/%d] Failed: %s", position, total, exc,
                exc_info=True,
            )

    if failed:
        logger.warning("%d of %d requests failed", failed, total)
    logger.info("Batch processing complete.")


def main() -> None:
    """Parse command-line options and dispatch to single or batch mode.

    ``--request`` runs a single request and takes precedence when both
    ``--request`` and ``--batch`` are supplied. With neither, the help text
    is printed and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        prog="text2sparql_repair",
        description="Text2SPARQL Post-Generation Repair Pipeline",
    )
    cli.add_argument(
        "--config", "-c",
        help="Path to YAML config file (default: configs/default.yaml)",
        default="configs/default.yaml",
    )
    cli.add_argument("--request", "-r", help="Path to a single request JSON file")
    cli.add_argument(
        "--batch", "-b",
        help="Path to a batch input file (JSON array or JSONL)",
    )
    cli.add_argument(
        "--verbose", "-v",
        help="Enable verbose (DEBUG) logging",
        action="store_true",
    )

    options = cli.parse_args()
    _setup_logging(verbose=options.verbose)

    # Single-request mode is checked first, so it wins over --batch.
    if options.request:
        run_single(options.request, options.config)
        return
    if options.batch:
        run_batch(options.batch, options.config)
        return

    # No mode selected: show usage and signal failure to the caller.
    cli.print_help()
    sys.exit(1)


# Run the CLI only when this file is executed as the main module, not when it
# is imported. The imports at the top of the file are package-relative, so it
# has to be started as a module inside its package (python -m ...).
if __name__ == "__main__":
    main()