File size: 8,036 Bytes
8bf4d58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""Script to add documents to the vector store from files or text."""

import sys
import os
from pathlib import Path
from typing import List, Dict, Optional

# Add parent directory to path
try:
    parent_dir = Path(__file__).parent.parent
    sys.path.insert(0, str(parent_dir))
except (NameError, AttributeError):
    # Handle case where __file__ is not available (e.g., when executed directly)
    import os
    parent_dir = Path(os.getcwd())
    if str(parent_dir) not in sys.path:
        sys.path.insert(0, str(parent_dir))

# Lazy import - only import when functions are actually called
# This prevents import errors when the module is scanned but not used
_vector_store = None
_vector_store_error = None

def _get_vector_store():
    """Import and cache the vector store instance on first use.

    Subsequent calls return the cached instance.  If the initial import
    fails, the resulting ImportError is cached too and re-raised on every
    later call, so the (expensive) import is never retried.

    Raises:
        ImportError: when chromadb / vector-store dependencies are missing.
    """
    global _vector_store, _vector_store_error

    # A previous attempt already failed - surface the same error again.
    if _vector_store_error is not None:
        raise _vector_store_error

    # Already initialized - reuse the cached instance.
    if _vector_store is not None:
        return _vector_store

    try:
        from src.retrieval.vector_store import get_vector_store
        _vector_store = get_vector_store()
    except ImportError as e:
        _vector_store_error = ImportError(
            f"Failed to import vector store. Make sure all dependencies (including chromadb) are installed. "
            f"Run: pip install -r requirements.txt\n"
            f"Original error: {e}"
        )
        raise _vector_store_error
    return _vector_store


def add_text_documents(texts: List[str], metadatas: Optional[List[Dict]] = None):
    """
    Add text documents to the vector store.

    Args:
        texts: List of document texts
        metadatas: Optional list of metadata dictionaries; when omitted,
            an independent empty dict is used for each document.

    Returns:
        The list of ids assigned by the vector store.
    """
    vector_store = _get_vector_store()

    if metadatas is None:
        # Use a comprehension, NOT `[{}] * len(texts)`: the latter repeats
        # the SAME dict object, so mutating one entry (e.g. the store
        # stamping ids into metadata) would silently affect every document.
        metadatas = [{} for _ in texts]

    ids = vector_store.add_documents(texts, metadatas)
    print(f"✅ Added {len(ids)} documents to vector store")
    return ids


def add_file_documents(file_paths: List[str], chunk_size: int = 1000):
    """
    Add documents from text files to the vector store.

    Missing or unreadable files are reported and skipped rather than
    aborting the whole batch.

    Args:
        file_paths: List of file paths to read
        chunk_size: Size of text chunks (characters) for splitting large documents
    """
    documents: List[str] = []
    metadatas: List[Dict] = []

    for raw_path in file_paths:
        path = Path(raw_path)
        if not path.exists():
            print(f"⚠️  Warning: File not found: {path}")
            continue

        try:
            text = path.read_text(encoding='utf-8')
        except Exception as err:
            print(f"❌ Error reading {path}: {err}")
            continue

        if len(text) > chunk_size:
            # Oversized file: split into fixed-size character chunks,
            # numbering chunks from 1 in the metadata.
            for index, start in enumerate(range(0, len(text), chunk_size), start=1):
                documents.append(text[start:start + chunk_size])
                metadatas.append({
                    "source": str(path.name),
                    "chunk": index,
                    "type": "file"
                })
        else:
            documents.append(text)
            metadatas.append({
                "source": str(path.name),
                "type": "file"
            })

        print(f"✅ Loaded: {path.name}")

    if not documents:
        print("⚠️  No documents to add")
        return []
    return add_text_documents(documents, metadatas)


def add_from_directory(directory: str, extensions: Optional[List[str]] = None):
    """
    Add all matching text files from a directory (searched recursively).

    Args:
        directory: Directory path
        extensions: List of file extensions to include
            (default: ['.txt', '.md', '.py', '.json'])

    Returns:
        The list of ids for the added documents, or an empty list when the
        directory does not exist or contains no matching files.
    """
    if extensions is None:
        extensions = ['.txt', '.md', '.py', '.json']

    directory = Path(directory)
    if not directory.exists():
        print(f"❌ Directory not found: {directory}")
        return []

    # Recursive glob once per extension; suffixes are distinct so no file
    # is collected twice.
    file_paths = []
    for ext in extensions:
        file_paths.extend(directory.glob(f"**/*{ext}"))

    if not file_paths:
        print(f"⚠️  No files found with extensions {extensions} in {directory}")
        return []

    print(f"📁 Found {len(file_paths)} files in {directory}")
    return add_file_documents([str(f) for f in file_paths])


if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description="Add documents to the vector store")
    parser.add_argument("--text", nargs="+", help="Add text documents directly")
    parser.add_argument("--file", nargs="+", help="Add documents from files")
    parser.add_argument("--directory", help="Add all documents from a directory")
    parser.add_argument("--sample-docs", action="store_true", help="Add sample documents")
    
    args = parser.parse_args()
    
    if args.sample_docs:
        # Add sample documents
        sample_docs = [
            {
                "text": """
                Oracle Exadata is a database machine that combines hardware and software 
                to provide high-performance database solutions. When migrating Exadata 
                workloads to the cloud, it's important to consider compatibility, 
                performance, and feature parity.
                """,
                "metadata": {"source": "exadata_migration_guide", "type": "documentation"},
            },
            {
                "text": """
                Cloud migration strategies for Oracle Exadata include:
                1. Lift and shift - moving workloads with minimal changes
                2. Replatforming - adapting to cloud-native services
                3. Refactoring - redesigning for cloud architecture
                
                Each approach has different trade-offs in terms of effort, cost, and feature availability.
                """,
                "metadata": {"source": "migration_strategies", "type": "guide"},
            },
            {
                "text": """
                Oracle Cloud Infrastructure (OCI) provides Exadata Cloud Service which 
                maintains full feature compatibility with on-premises Exadata. This 
                service offers the same architecture and capabilities, making it ideal 
                for migrations requiring minimal changes.
                """,
                "metadata": {"source": "oci_exadata", "type": "cloud_service"},
            },
            {
                "text": """
                Oracle AI Database services on AWS provide customers with a simplified path 
                to migrate Oracle Exadata workloads. These services run on AWS infrastructure 
                and offer managed database solutions that maintain Oracle compatibility while 
                leveraging AWS cloud capabilities. The services include automated migration tools, 
                performance optimization, and seamless integration with AWS services.
                """,
                "metadata": {"source": "oracle_aws_services", "type": "cloud_service"},
            },
        ]
        
        documents = [doc["text"] for doc in sample_docs]
        metadatas = [doc["metadata"] for doc in sample_docs]
        add_text_documents(documents, metadatas)
    
    elif args.text:
        add_text_documents(args.text)
    
    elif args.file:
        add_file_documents(args.file)
    
    elif args.directory:
        add_from_directory(args.directory)
    
    else:
        print("Please specify --text, --file, --directory, or --sample-docs")
        print("\nExamples:")
        print("  python scripts/add_documents.py --sample-docs")
        print("  python scripts/add_documents.py --file doc1.txt doc2.txt")
        print("  python scripts/add_documents.py --directory data/sample_documents")
        print("  python scripts/add_documents.py --text 'Your document text here'")