Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| from pathlib import Path | |
| ROOT = Path(__file__).resolve().parent.parent | |
| sys.path.insert(0, str(ROOT)) | |
| from src.chunking import dataframe_to_documents | |
| from src.embeddings import EmbeddingModel | |
| from src.parser import filter_df_by_fachgruppe, parse_ebm_xml_to_dataframe | |
| from src.vector_store import EbmVectorStore | |
| def main() -> None: | |
| parser = argparse.ArgumentParser(description="Build the local FAISS database from EBM XML.") | |
| parser.add_argument("--xml", required=True, help="Path to the official EBM XML.") | |
| parser.add_argument("--store", required=True, help="Output directory for the FAISS store.") | |
| parser.add_argument("--model", default=None, help="Optional sentence-transformers model name.") | |
| parser.add_argument("--fachgruppe-filter", action="store_true", help="Filter to Fachgruppe 001 only (for full EBM downloads).") | |
| args = parser.parse_args() | |
| xml_path = Path(args.xml) | |
| store_dir = Path(args.store) | |
| embedding_model = EmbeddingModel(args.model) if args.model else EmbeddingModel() | |
| df = parse_ebm_xml_to_dataframe(str(xml_path)) | |
| # Apply Fachgruppe filter only if requested | |
| if args.fachgruppe_filter: | |
| print("Applying Fachgruppe 001 filter...") | |
| df = filter_df_by_fachgruppe(df) | |
| if df.empty: | |
| raise ValueError( | |
| "No Fachgruppe 001 documents found in the provided XML. " | |
| "Please check the XML file or remove the --fachgruppe-filter flag." | |
| ) | |
| if df.empty: | |
| raise ValueError( | |
| "No documents found in the provided XML. " | |
| "Please provide a valid KBV EBM XML file." | |
| ) | |
| print(f"Building FAISS store from {len(df)} documents...") | |
| documents = dataframe_to_documents(df) | |
| store, embeddings = EbmVectorStore.build(documents, embedding_model=embedding_model) | |
| store.save(store_dir, embeddings=embeddings) | |
| if __name__ == "__main__": | |
| main() | |