File size: 1,997 Bytes
5887b57
 
 
d491d8b
5887b57
 
d491d8b
 
 
5887b57
 
d491d8b
5887b57
 
 
 
 
 
 
 
81e8d50
5887b57
 
 
 
 
 
 
81e8d50
 
 
 
 
 
 
 
 
 
 
fe535b3
 
81e8d50
385810e
fe535b3
81e8d50
 
5887b57
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from __future__ import annotations

import argparse
import sys
from pathlib import Path

# ROOT is the directory two levels above this file's resolved location.
# It is put at the front of sys.path before the `src.*` imports below run;
# presumably so those imports resolve when this script is executed directly
# (verify against the repository layout).
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from src.chunking import dataframe_to_documents
from src.embeddings import EmbeddingModel
from src.parser import filter_df_by_fachgruppe, parse_ebm_xml_to_dataframe
from src.vector_store import EbmVectorStore


def main() -> None:
    parser = argparse.ArgumentParser(description="Build the local FAISS database from EBM XML.")
    parser.add_argument("--xml", required=True, help="Path to the official EBM XML.")
    parser.add_argument("--store", required=True, help="Output directory for the FAISS store.")
    parser.add_argument("--model", default=None, help="Optional sentence-transformers model name.")
    parser.add_argument("--fachgruppe-filter", action="store_true", help="Filter to Fachgruppe 001 only (for full EBM downloads).")
    args = parser.parse_args()

    xml_path = Path(args.xml)
    store_dir = Path(args.store)
    embedding_model = EmbeddingModel(args.model) if args.model else EmbeddingModel()

    df = parse_ebm_xml_to_dataframe(str(xml_path))
    
    # Apply Fachgruppe filter only if requested
    if args.fachgruppe_filter:
        print("Applying Fachgruppe 001 filter...")
        df = filter_df_by_fachgruppe(df)
        if df.empty:
            raise ValueError(
                "No Fachgruppe 001 documents found in the provided XML. "
                "Please check the XML file or remove the --fachgruppe-filter flag."
            )
    
    if df.empty:
        raise ValueError(
            "No documents found in the provided XML. "
            "Please provide a valid KBV EBM XML file."
        )
    
    print(f"Building FAISS store from {len(df)} documents...")
    documents = dataframe_to_documents(df)
    store, embeddings = EbmVectorStore.build(documents, embedding_model=embedding_model)
    store.save(store_dir, embeddings=embeddings)


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()