wahl-hack / scripts /build_dip_knowledge_base.py
Ani
Add DIP knowledge base and promise tracker functionality
1179155
Raw
History Blame Contribute Delete
2.77 kB
#!/usr/bin/env python3
"""Build the local DIP knowledge base for the German Promise Tracker.

Example:
    export DIP_API_KEY="..."
    python scripts/build_dip_knowledge_base.py \
        --resources vorgang vorgangsposition drucksache \
        --wahlperiode 21 \
        --date-start 2025-01-01 \
        --max-pages 5
"""
from __future__ import annotations
import argparse
import os
import sys
from pathlib import Path
# Repository root: one directory above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Directory that is added to the module search path below.
SRC = ROOT.joinpath("src")
# Prepend SRC exactly once so modules located there can be imported when the
# script is executed directly.
if not any(entry == str(SRC) for entry in sys.path):
    sys.path.insert(0, str(SRC))
from dip_client import RESOURCE_TYPES, build_knowledge_base, build_query_params, save_knowledge_base
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the knowledge-base build script.

    Returns:
        The namespace produced by ``argparse`` with the attributes
        ``api_key``, ``resources``, ``wahlperiode``, ``date_start``,
        ``date_end``, ``updated_start``, ``updated_end``, ``zuordnung``,
        ``max_pages`` and ``output_dir``. The optional string filters
        default to the empty string when they are not supplied.
    """
    cli = argparse.ArgumentParser(
        description="Fetch Bundestag DIP records and build a normalised CSV knowledge base.",
    )

    # The API key falls back to the DIP_API_KEY environment variable.
    cli.add_argument(
        "--api-key",
        default=os.environ.get("DIP_API_KEY", ""),
        help="DIP API key. Defaults to env var DIP_API_KEY.",
    )
    cli.add_argument(
        "--resources",
        nargs="+",
        default=["vorgang", "vorgangsposition", "drucksache"],
        choices=RESOURCE_TYPES,
    )
    cli.add_argument(
        "--wahlperiode",
        type=int,
        default=21,
        help="Bundestag election period, e.g. 21.",
    )

    # The four date/time filters share the same shape: optional free-form
    # strings whose default is the empty string.
    range_filters = (
        ("--date-start", "DIP f.datum.start, e.g. 2025-01-01."),
        ("--date-end", "DIP f.datum.end, e.g. 2026-12-31."),
        ("--updated-start", "DIP f.aktualisiert.start, e.g. 2026-05-01T00:00:00+02:00."),
        ("--updated-end", "DIP f.aktualisiert.end."),
    )
    for flag, description in range_filters:
        cli.add_argument(flag, default="", help=description)

    cli.add_argument(
        "--zuordnung",
        default="",
        choices=["", "BT", "BR", "BV", "EK"],
        help="DIP f.zuordnung filter.",
    )
    cli.add_argument(
        "--max-pages",
        type=int,
        default=2,
        help="Cursor pages per resource.",
    )
    cli.add_argument(
        "--output-dir",
        default=str(ROOT / "data"),
        help="Output directory.",
    )
    return cli.parse_args()
def main() -> None:
    """Run the build: parse options, fetch the records and save the result.

    Raises:
        SystemExit: If no API key was given via ``--api-key`` or the
            ``DIP_API_KEY`` environment variable.
    """
    cli = parse_args()
    if not cli.api_key:
        raise SystemExit("Missing API key. Set DIP_API_KEY or pass --api-key.")

    # Optional filters default to "" on the command line; pass None instead
    # of an empty value to build_query_params.
    optional_filters = {
        "date_start": cli.date_start,
        "date_end": cli.date_end,
        "updated_start": cli.updated_start,
        "updated_end": cli.updated_end,
        "zuordnung": cli.zuordnung,
    }
    query = build_query_params(
        wahlperiode=cli.wahlperiode,
        **{name: value or None for name, value in optional_filters.items()},
    )

    frame, documents, meta = build_knowledge_base(
        api_key=cli.api_key,
        resources=cli.resources,
        params=query,
        max_pages_per_resource=cli.max_pages,
    )

    target_dir = Path(cli.output_dir)
    save_knowledge_base(frame, documents, meta, target_dir)

    # Short summary for the person running the script.
    print(f"Saved {len(frame)} unique DIP records to {cli.output_dir}")
    print(meta)


# Only run the build when executed as a script, not when imported.
if __name__ == "__main__":
    main()