Spaces:
Running
Running
File size: 973 Bytes
from collections import defaultdict
from pipelines.basic_rag.vector_store import VectorStore
from ingestion.entity_extraction import extract_entities
from ingestion.build_graph import build_graph
def chunk_index(chunk_id: str) -> int:
    """Return the integer that follows the last ``_chunk_`` marker in *chunk_id*.

    Falls back to ``0`` when the marker is absent or when the text after it
    cannot be parsed by ``int``.
    """
    marker = "_chunk_"
    if marker in chunk_id:
        # Split on the right-most marker only, so earlier occurrences of the
        # marker inside the identifier are left alone.
        suffix = chunk_id.rsplit(marker, 1)[-1]
        try:
            return int(suffix)
        except ValueError:
            pass
    return 0
def main():
    """Build one graph per document from the records held in the vector store.

    Loads the store via ``VectorStore.load()``, groups ``store.metadata``
    records by their ``"doc_id"`` key, orders each group by the index parsed
    from ``"chunk_id"``, extracts entities from the ordered chunk texts, and
    passes everything to ``build_graph``. The value returned by
    ``build_graph`` is printed next to the document id.
    """
    vector_store = VectorStore.load()

    # Group records per document, preserving the order they appear in the store.
    records_by_doc = defaultdict(list)
    for entry in vector_store.metadata:
        records_by_doc[entry["doc_id"]].append(entry)

    for doc_id, doc_chunks in records_by_doc.items():
        # In-place stable sort: records with equal indices keep store order.
        doc_chunks.sort(key=lambda entry: chunk_index(entry["chunk_id"]))

        texts = [entry["text"] for entry in doc_chunks]
        entities = extract_entities(texts)

        graph_result = build_graph(
            doc_id=doc_id,
            title=f"Imported document {doc_id}",
            chunks=doc_chunks,
            entities=entities,
        )
        print(doc_id, graph_result)
# Run the import only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|