File size: 2,030 Bytes
fd06b5a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
"""
Ingest persistent documents into vector store.
Run this to make company policies searchable.
"""
from pathlib import Path
from vector_store import get_vector_store
def ingest_persistent_docs():
    """Ingest all .txt/.md documents from persistent_docs/ into the vector store.

    Each file becomes one document whose id is the filename without its
    extension (so re-running overwrites/refreshes the same document rather
    than duplicating it — assuming ingest_document upserts by id; TODO confirm).
    A failure on one file is reported and skipped so the rest of the batch
    still ingests.
    """
    persistent_dir = Path("persistent_docs")
    if not persistent_dir.exists():
        print("❌ persistent_docs/ directory not found")
        return

    vector_store = get_vector_store()

    # Collect all supported files, sorted for a deterministic processing order.
    supported_extensions = ['.txt', '.md']
    files = []
    for ext in supported_extensions:
        files.extend(persistent_dir.glob(f'*{ext}'))
    files.sort()

    if not files:
        print("📂 No text files found in persistent_docs/")
        return

    print(f"\n📂 Found {len(files)} document(s) to ingest:")

    for file_path in files:
        try:
            print(f"\n📄 Processing: {file_path.name}")

            content = file_path.read_text(encoding='utf-8')

            # Use the filename without extension as the document_id.
            doc_id = file_path.stem

            num_chunks = vector_store.ingest_document(
                document_text=content,
                document_id=doc_id,
                metadata={
                    "file_path": str(file_path.absolute()),
                    "filename": file_path.name,
                    "storage_type": "persistent"
                },
                chunk_size=500,
                chunk_overlap=50
            )
            # NOTE(review): the original source had this message split across a
            # broken string literal and its emoji mojibake-garbled; restored here.
            print(f"   ✅ Ingested '{doc_id}' - Created {num_chunks} chunks")

        except Exception as e:
            # Best-effort batch: report the failure and continue with the rest.
            print(f"   ❌ Failed to ingest {file_path.name}: {e}")

    print(f"\n🎉 Ingestion complete! Documents are now searchable.\n")
if __name__ == "__main__":
print("=" * 60)
print("PERSISTENT DOCUMENTS INGESTION")
print("=" * 60)
ingest_persistent_docs()
|