Spaces:
Running
Running
| from collections import defaultdict | |
| from pipelines.basic_rag.vector_store import VectorStore | |
| from ingestion.entity_extraction import extract_entities | |
| from ingestion.build_graph import build_graph | |
def chunk_index(chunk_id: str) -> int:
    """Return the integer that follows the last ``_chunk_`` marker in *chunk_id*.

    Args:
        chunk_id: Identifier expected to end in ``_chunk_<number>``.

    Returns:
        The parsed number, or ``0`` when the marker is absent or the text
        after it cannot be converted by ``int()``.
    """
    marker = "_chunk_"
    if marker in chunk_id:
        # rpartition splits on the LAST occurrence, so ids that contain the
        # marker more than once still yield the trailing suffix.
        suffix = chunk_id.rpartition(marker)[2]
        try:
            return int(suffix)
        except ValueError:
            # Non-numeric (or empty) suffix: fall through to the default.
            pass
    return 0
def main():
    """Build a graph for every document found in the loaded vector store.

    Loads the store with ``VectorStore.load()``, groups its ``metadata``
    records by their ``"doc_id"`` key, orders each group by the numeric
    chunk index derived from ``"chunk_id"``, extracts entities from the
    ordered chunk ``"text"`` values, passes everything to ``build_graph``
    and prints the document id together with the returned result.
    """
    store = VectorStore.load()

    # Group metadata records per document, keeping first-seen document order.
    records_by_doc = {}
    for rec in store.metadata:
        records_by_doc.setdefault(rec["doc_id"], []).append(rec)

    def position(rec):
        # Sort key: numeric index parsed from the record's chunk id.
        return chunk_index(rec["chunk_id"])

    for doc_id, doc_records in records_by_doc.items():
        ordered = sorted(doc_records, key=position)
        texts = [rec["text"] for rec in ordered]
        entities = extract_entities(texts)
        result = build_graph(
            doc_id=doc_id,
            title=f"Imported document {doc_id}",
            chunks=ordered,
            entities=entities,
        )
        print(doc_id, result)
# Run the import only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()