graphrag-benchmark / scripts /hydrate_benchmark_graph.py
VedantDhavan's picture
Deploy GraphRAG benchmark backend
83aed13
Raw
History Blame Contribute Delete
973 Bytes
from collections import defaultdict
from pipelines.basic_rag.vector_store import VectorStore
from ingestion.entity_extraction import extract_entities
from ingestion.build_graph import build_graph
def chunk_index(chunk_id: str) -> int:
    """Return the integer that follows the last ``_chunk_`` marker in *chunk_id*.

    Yields ``0`` when the marker is missing or when the text after it
    cannot be parsed by ``int``.
    """
    marker = "_chunk_"
    if marker in chunk_id:
        # Split once from the right so only the final marker counts.
        suffix = chunk_id.rsplit(marker, 1)[-1]
        try:
            return int(suffix)
        except ValueError:
            pass
    return 0
def main():
    """Build a graph for every document present in the stored vector metadata.

    Loads the vector store, groups its metadata records by ``doc_id``,
    orders each group by chunk index, extracts entities from the ordered
    chunk texts, calls ``build_graph`` and prints the document id with
    the returned result.
    """
    vector_store = VectorStore.load()

    # Bucket the metadata records per document.
    grouped = defaultdict(list)
    for entry in vector_store.metadata:
        grouped[entry["doc_id"]].append(entry)

    for doc_id, records in grouped.items():
        # Order the chunks by the numeric suffix of their chunk id.
        ordered = sorted(records, key=lambda rec: chunk_index(rec["chunk_id"]))
        texts = [rec["text"] for rec in ordered]
        entities = extract_entities(texts)
        outcome = build_graph(
            doc_id=doc_id,
            title=f"Imported document {doc_id}",
            chunks=ordered,
            entities=entities,
        )
        print(doc_id, outcome)
# Run the hydration only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()