File size: 2,765 Bytes
1179155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
"""Build the local DIP knowledge base for the German Promise Tracker.

Example:
    export DIP_API_KEY="..."
    python scripts/build_dip_knowledge_base.py \
        --resources vorgang vorgangsposition drucksache \
        --wahlperiode 21 \
        --date-start 2025-01-01 \
        --max-pages 5
"""

from __future__ import annotations

import argparse
import os
import sys
from pathlib import Path

# Repository root: the directory one level above the folder that holds this script.
ROOT = Path(__file__).resolve().parents[1]
# Directory added to the import path below (presumably where `dip_client` lives — confirm).
SRC = ROOT / "src"
# Prepend SRC once so the `dip_client` import that follows can be resolved when
# the script is run directly; the membership check avoids duplicate entries.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from dip_client import RESOURCE_TYPES, build_knowledge_base, build_query_params, save_knowledge_base


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the knowledge-base build script.

    The API key defaults to the ``DIP_API_KEY`` environment variable (empty
    string when unset). The optional date / update / ``zuordnung`` filters
    default to the empty string, which means "not given".

    Returns:
        The namespace produced by ``argparse`` with the attributes
        ``api_key``, ``resources``, ``wahlperiode``, ``date_start``,
        ``date_end``, ``updated_start``, ``updated_end``, ``zuordnung``,
        ``max_pages`` and ``output_dir``.
    """
    cli = argparse.ArgumentParser(
        description="Fetch Bundestag DIP records and build a normalised CSV knowledge base.",
    )
    add = cli.add_argument

    add(
        "--api-key",
        default=os.environ.get("DIP_API_KEY", ""),
        help="DIP API key. Defaults to env var DIP_API_KEY.",
    )
    add(
        "--resources",
        nargs="+",
        default=["vorgang", "vorgangsposition", "drucksache"],
        choices=RESOURCE_TYPES,
    )
    add(
        "--wahlperiode",
        type=int,
        default=21,
        help="Bundestag election period, e.g. 21.",
    )

    # Free-text filters all share the same shape: optional, empty by default.
    # They are registered here, in this order, so the --help listing is stable.
    text_filters = (
        ("--date-start", "DIP f.datum.start, e.g. 2025-01-01."),
        ("--date-end", "DIP f.datum.end, e.g. 2026-12-31."),
        ("--updated-start", "DIP f.aktualisiert.start, e.g. 2026-05-01T00:00:00+02:00."),
        ("--updated-end", "DIP f.aktualisiert.end."),
    )
    for flag, description in text_filters:
        add(flag, default="", help=description)

    add(
        "--zuordnung",
        default="",
        choices=["", "BT", "BR", "BV", "EK"],
        help="DIP f.zuordnung filter.",
    )
    add(
        "--max-pages",
        type=int,
        default=2,
        help="Cursor pages per resource.",
    )
    add(
        "--output-dir",
        default=str(ROOT / "data"),
        help="Output directory.",
    )
    return cli.parse_args()


def main() -> None:
    """Run the build: parse options, fetch records, save them, print a summary.

    Raises:
        SystemExit: If no API key was supplied via ``--api-key`` or the
            ``DIP_API_KEY`` environment variable.
    """
    opts = parse_args()
    api_key = opts.api_key
    if not api_key:
        raise SystemExit("Missing API key. Set DIP_API_KEY or pass --api-key.")

    # Empty-string CLI defaults mean "filter not set"; pass them on as None.
    filter_names = ("date_start", "date_end", "updated_start", "updated_end", "zuordnung")
    optional_filters = {name: getattr(opts, name) or None for name in filter_names}
    query = build_query_params(wahlperiode=opts.wahlperiode, **optional_filters)

    records, raw_docs, metadata = build_knowledge_base(
        api_key=api_key,
        resources=opts.resources,
        params=query,
        max_pages_per_resource=opts.max_pages,
    )

    target_dir = opts.output_dir
    save_knowledge_base(records, raw_docs, metadata, Path(target_dir))

    print(f"Saved {len(records)} unique DIP records to {target_dir}")
    print(metadata)


# Run the build only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()