File size: 2,030 Bytes
fd06b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
Ingest persistent documents into the vector store.
Run this script to make company policies searchable.
"""
from pathlib import Path
from vector_store import get_vector_store

def ingest_persistent_docs() -> None:
    """Ingest every supported document in ``persistent_docs/`` into the vector store.

    Looks for ``.txt`` and ``.md`` files directly inside ``persistent_docs/``
    (a path relative to the current working directory; subdirectories are not
    searched), reads each one as UTF-8 and hands its text to
    ``vector_store.ingest_document`` with ``chunk_size=500`` and
    ``chunk_overlap=50``. Progress is printed to stdout.

    A failure on one file is reported and does not stop the remaining files
    from being processed. The vector store is only obtained once at least one
    file has been found.

    Returns:
        None. Outcomes are reported through the printed messages only.
    """
    persistent_dir = Path("persistent_docs")

    # is_dir() rather than exists(): a regular file with this name is unusable.
    if not persistent_dir.is_dir():
        print("❌ persistent_docs/ directory not found")
        return

    supported_extensions = (".txt", ".md")
    # Sorted for a reproducible processing order; is_file() skips directories
    # whose name happens to end in a supported extension.
    files = sorted(
        path
        for ext in supported_extensions
        for path in persistent_dir.glob(f"*{ext}")
        if path.is_file()
    )

    if not files:
        print("📂 No text files found in persistent_docs/")
        return

    # Deferred until there is work to do, so nothing is initialised needlessly.
    vector_store = get_vector_store()

    print(f"\n📚 Found {len(files)} document(s) to ingest:")

    failed = []
    for file_path in files:
        print(f"\n📄 Processing: {file_path.name}")
        try:
            content = file_path.read_text(encoding="utf-8")

            # The document_id is the filename without its extension.
            # NOTE(review): "policy.txt" and "policy.md" therefore share an id;
            # confirm how ingest_document treats a repeated document_id.
            doc_id = file_path.stem

            num_chunks = vector_store.ingest_document(
                document_text=content,
                document_id=doc_id,
                metadata={
                    "file_path": str(file_path.absolute()),
                    "filename": file_path.name,
                    "storage_type": "persistent",
                },
                chunk_size=500,
                chunk_overlap=50,
            )
        except Exception as e:
            # Deliberately broad: this is a best-effort batch, so one unreadable
            # or rejected file must not abort the rest. The error is reported
            # and remembered for the final summary.
            failed.append(file_path.name)
            print(f"   ❌ Failed to ingest {file_path.name}: {e}")
        else:
            print(f"   ✅ Ingested '{doc_id}' - Created {num_chunks} chunks")

    # Only claim success when every file was actually ingested.
    if failed:
        print(
            f"\n⚠️ Ingestion finished with {len(failed)} of {len(files)} "
            f"document(s) failing: {', '.join(failed)}\n"
        )
    else:
        print("\n🎉 Ingestion complete! Documents are now searchable.\n")

if __name__ == "__main__":
    # Print the title framed by two 60-character rules, then run the ingestion.
    rule = "=" * 60
    print(rule, "PERSISTENT DOCUMENTS INGESTION", rule, sep="\n")
    ingest_persistent_docs()