ebm-mentor / scripts /build_database.py
bsenst's picture
feat: use downloaded extracted XML with Fachgruppe 001 filter, fallback to dummy XML
81e8d50
Raw
History Blame Contribute Delete
2 kB
from __future__ import annotations
import argparse
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from src.chunking import dataframe_to_documents
from src.embeddings import EmbeddingModel
from src.parser import filter_df_by_fachgruppe, parse_ebm_xml_to_dataframe
from src.vector_store import EbmVectorStore
def main(argv: list[str] | None = None) -> None:
    """Build the local FAISS store from an EBM XML file.

    Parses the XML given by ``--xml`` into a DataFrame, optionally narrows it
    with ``filter_df_by_fachgruppe`` when ``--fachgruppe-filter`` is set,
    converts the rows to documents, embeds them, and saves the resulting store
    to the ``--store`` directory.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Raises:
        ValueError: If the XML yields no documents at all, or if no documents
            remain after the Fachgruppe filter was applied.
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Build the local FAISS database from EBM XML.")
    parser.add_argument("--xml", required=True, help="Path to the official EBM XML.")
    parser.add_argument("--store", required=True, help="Output directory for the FAISS store.")
    parser.add_argument("--model", default=None, help="Optional sentence-transformers model name.")
    parser.add_argument(
        "--fachgruppe-filter",
        action="store_true",
        help="Filter to Fachgruppe 001 only (for full EBM downloads).",
    )
    args = parser.parse_args(argv)

    xml_path = Path(args.xml)
    store_dir = Path(args.store)

    df = parse_ebm_xml_to_dataframe(str(xml_path))

    # Check for an empty parse result *before* filtering: otherwise an XML
    # without any documents would be reported as a filter problem, and the
    # user would be told to drop a flag that is not the cause.
    if df.empty:
        raise ValueError(
            "No documents found in the provided XML. "
            "Please provide a valid KBV EBM XML file."
        )

    # Apply Fachgruppe filter only if requested
    if args.fachgruppe_filter:
        print("Applying Fachgruppe 001 filter...")
        df = filter_df_by_fachgruppe(df)
        if df.empty:
            raise ValueError(
                "No Fachgruppe 001 documents found in the provided XML. "
                "Please check the XML file or remove the --fachgruppe-filter flag."
            )

    # Create the embedding model only once the input is known to be usable,
    # so an invalid XML fails without paying for model setup first
    # (presumably a sentence-transformers load, per the --model help text).
    # Without --model, EmbeddingModel() is called with its own default.
    embedding_model = EmbeddingModel(args.model) if args.model else EmbeddingModel()

    print(f"Building FAISS store from {len(df)} documents...")
    documents = dataframe_to_documents(df)
    store, embeddings = EbmVectorStore.build(documents, embedding_model=embedding_model)
    store.save(store_dir, embeddings=embeddings)
# Run the build only when this file is executed as a script; importing the
# module must not trigger argument parsing or a database build.
if __name__ == "__main__":
    main()