Spaces:

VedantDhavan
/

graphrag-benchmark

Running

graphrag-benchmark / scripts / hydrate_benchmark_graph.py

Deploy GraphRAG benchmark backend

83aed13 about 2 months ago

973 Bytes

	from collections import defaultdict

	from pipelines.basic_rag.vector_store import VectorStore
	from ingestion.entity_extraction import extract_entities
	from ingestion.build_graph import build_graph


	def chunk_index(chunk_id: str) -> int:
	    """Return the integer that follows the last ``_chunk_`` marker in *chunk_id*.

	    Falls back to 0 when the marker is absent or when the text after the
	    last marker cannot be parsed as an integer.
	    """
	    marker = "_chunk_"
	    position = 0
	    if marker in chunk_id:
	        # Only the text after the *last* marker counts as the index.
	        suffix = chunk_id.rpartition(marker)[2]
	        try:
	            position = int(suffix)
	        except ValueError:
	            position = 0
	    return position


	def main():
	    """Rebuild a graph for every document found in the loaded vector store.

	    Loads the store via ``VectorStore.load()``, groups its metadata records
	    by ``doc_id``, orders each group by the index parsed from ``chunk_id``,
	    extracts entities from the ordered chunk texts, passes everything to
	    ``build_graph`` and prints the document id with the returned result.
	    """
	    vector_store = VectorStore.load()

	    # Bucket the metadata records per source document.
	    records_by_doc = defaultdict(list)
	    for entry in vector_store.metadata:
	        records_by_doc[entry["doc_id"]].append(entry)

	    for doc_id, doc_chunks in records_by_doc.items():
	        # Sort in place so build_graph receives the chunks in index order.
	        doc_chunks.sort(key=lambda entry: chunk_index(entry["chunk_id"]))
	        texts = [item["text"] for item in doc_chunks]
	        entities = extract_entities(texts)
	        outcome = build_graph(
	            doc_id=doc_id,
	            title=f"Imported document {doc_id}",
	            chunks=doc_chunks,
	            entities=entities,
	        )
	        print(doc_id, outcome)


	# Run the hydration only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()