# Spaces: Sleeping
# File size: 1,997 Bytes
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# ROOT is the parent of the directory that contains this script. It is placed
# at the front of sys.path before the `src.*` imports run -- presumably so
# they resolve when the script is executed directly.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent
sys.path.insert(0, str(ROOT))
from src.chunking import dataframe_to_documents
from src.embeddings import EmbeddingModel
from src.parser import filter_df_by_fachgruppe, parse_ebm_xml_to_dataframe
from src.vector_store import EbmVectorStore
def main() -> None:
    """Build the local FAISS store from an EBM XML file named on the command line.

    Command-line options:
        --xml: path to the EBM XML file (required).
        --store: output directory for the FAISS store (required).
        --model: optional model name handed to ``EmbeddingModel``; when it is
            omitted or empty, ``EmbeddingModel()`` is constructed without
            arguments.
        --fachgruppe-filter: when set, ``filter_df_by_fachgruppe`` is applied
            to the parsed rows before they are indexed.

    Raises:
        ValueError: if the XML yields no rows at all, or if the Fachgruppe
            filter was requested and leaves no rows.
    """
    parser = argparse.ArgumentParser(description="Build the local FAISS database from EBM XML.")
    parser.add_argument("--xml", required=True, help="Path to the official EBM XML.")
    parser.add_argument("--store", required=True, help="Output directory for the FAISS store.")
    parser.add_argument("--model", default=None, help="Optional sentence-transformers model name.")
    parser.add_argument(
        "--fachgruppe-filter",
        action="store_true",
        help="Filter to Fachgruppe 001 only (for full EBM downloads).",
    )
    args = parser.parse_args()

    xml_path = Path(args.xml)
    store_dir = Path(args.store)
    embedding_model = EmbeddingModel(args.model) if args.model else EmbeddingModel()

    df = parse_ebm_xml_to_dataframe(str(xml_path))

    # Check the raw parse result before filtering. If the XML produced no rows
    # at all, the filter-specific error below (which suggests dropping
    # --fachgruppe-filter) would point the user at the wrong cause.
    if df.empty:
        raise ValueError(
            "No documents found in the provided XML. "
            "Please provide a valid KBV EBM XML file."
        )

    # Apply the Fachgruppe filter only if requested.
    if args.fachgruppe_filter:
        print("Applying Fachgruppe 001 filter...")
        df = filter_df_by_fachgruppe(df)
        # Rows existed before filtering, so an empty result here is caused by
        # the filter itself.
        if df.empty:
            raise ValueError(
                "No Fachgruppe 001 documents found in the provided XML. "
                "Please check the XML file or remove the --fachgruppe-filter flag."
            )

    print(f"Building FAISS store from {len(df)} documents...")
    documents = dataframe_to_documents(df)
    store, embeddings = EbmVectorStore.build(documents, embedding_model=embedding_model)
    store.save(store_dir, embeddings=embeddings)
# Run the build only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|