Spaces:
Sleeping
Sleeping
File size: 8,036 Bytes
8bf4d58 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
"""Script to add documents to the vector store from files or text."""
import sys
import os
from pathlib import Path
from typing import List, Dict, Optional
# Make the directory above this script's folder importable, so that the
# `src` package used further down can be resolved (presumably this is the
# project root -- confirm against the repository layout).
try:
    parent_dir = Path(__file__).parent.parent
except (NameError, AttributeError):
    # __file__ is not available (e.g. the source is run through exec());
    # fall back to the current working directory, without adding a second
    # copy of an entry that is already on sys.path.
    parent_dir = Path(os.getcwd())
    if str(parent_dir) not in sys.path:
        sys.path.insert(0, str(parent_dir))
else:
    # Normal case: put the directory first so it wins import resolution.
    sys.path.insert(0, str(parent_dir))
# The vector store is imported lazily, on first use, so that merely importing
# (or scanning) this module does not fail when its dependencies are missing.
_vector_store = None  # cached object returned by get_vector_store()
_vector_store_error = None  # cached ImportError from a failed first attempt


def _get_vector_store():
    """Return the shared vector store, importing and creating it on first use.

    The store is obtained once from
    ``src.retrieval.vector_store.get_vector_store`` and cached at module
    level. If that fails with ``ImportError``, the wrapped error is cached as
    well and re-raised on every later call without retrying the import.

    Returns:
        The object returned by ``get_vector_store()``.

    Raises:
        ImportError: If the vector store (or one of its dependencies) cannot
            be imported. The original error is attached as ``__cause__``.
    """
    global _vector_store, _vector_store_error

    if _vector_store_error is not None:
        raise _vector_store_error

    if _vector_store is None:
        try:
            from src.retrieval.vector_store import get_vector_store
            _vector_store = get_vector_store()
        except ImportError as e:
            _vector_store_error = ImportError(
                "Failed to import vector store. Make sure all dependencies (including chromadb) are installed. "
                "Run: pip install -r requirements.txt\n"
                f"Original error: {e}"
            )
            # Chain explicitly so the root cause stays reachable via __cause__.
            raise _vector_store_error from e
    return _vector_store
def add_text_documents(texts: List[str], metadatas: Optional[List[Dict]] = None):
    """
    Add text documents to the vector store.

    Args:
        texts: List of document texts
        metadatas: Optional list of metadata dictionaries. When omitted,
            every document gets its own empty dictionary.

    Returns:
        The value returned by the store's ``add_documents`` call (its length
        is reported as the number of documents added).
    """
    vector_store = _get_vector_store()
    if metadatas is None:
        # One distinct dict per document: ``[{}] * n`` would repeat the SAME
        # dict n times, so any in-place change to one entry would show up in
        # all of them.
        metadatas = [{} for _ in texts]
    ids = vector_store.add_documents(texts, metadatas)
    print(f"✅ Added {len(ids)} documents to vector store")
    return ids
def add_file_documents(file_paths: List[str], chunk_size: int = 1000):
    """
    Add documents from text files to the vector store.

    Each file is read as UTF-8. Content longer than ``chunk_size`` characters
    is split into consecutive, non-overlapping chunks, each stored as its own
    document with a 1-based ``chunk`` number in its metadata. Missing,
    unreadable and empty files are reported and skipped.

    Args:
        file_paths: List of file paths to read
        chunk_size: Size of text chunks (characters) for splitting large documents

    Returns:
        The result of ``add_text_documents`` for the collected documents, or
        an empty list when there was nothing to add.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Reject bad sizes up front: 0 used to surface as a misleading per-file
    # "Error reading" message, and a negative size silently produced no chunks.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    all_documents = []
    all_metadatas = []

    for raw_path in file_paths:
        file_path = Path(raw_path)
        if not file_path.exists():
            print(f"⚠️ Warning: File not found: {file_path}")
            continue

        # Keep the try body to the read itself so only I/O and decoding
        # problems are reported as read errors.
        try:
            content = file_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            print(f"❌ Error reading {file_path}: {e}")
            continue

        if not content:
            # An empty document carries nothing to index.
            print(f"⚠️ Warning: Skipping empty file: {file_path}")
            continue

        source = file_path.name
        if len(content) > chunk_size:
            # Split large documents into fixed-size character chunks.
            chunks = [
                content[start:start + chunk_size]
                for start in range(0, len(content), chunk_size)
            ]
            for number, chunk in enumerate(chunks, start=1):
                all_documents.append(chunk)
                all_metadatas.append({
                    "source": source,
                    "chunk": number,
                    "type": "file",
                })
        else:
            all_documents.append(content)
            all_metadatas.append({
                "source": source,
                "type": "file",
            })
        print(f"✅ Loaded: {file_path.name}")

    if not all_documents:
        print("⚠️ No documents to add")
        return []
    return add_text_documents(all_documents, all_metadatas)
def add_from_directory(directory: str, extensions: Optional[List[str]] = None):
    """
    Add all text files from a directory (searched recursively).

    Args:
        directory: Directory path
        extensions: List of file extensions to include
            (default: ['.txt', '.md', '.py', '.json'])

    Returns:
        The result of ``add_file_documents`` for the files found, or an empty
        list when the directory is missing or contains no matching files.
    """
    if extensions is None:
        extensions = ['.txt', '.md', '.py', '.json']

    directory = Path(directory)
    if not directory.exists():
        print(f"❌ Directory not found: {directory}")
        return []

    # Collect into a set so a file matched by two overlapping extensions is
    # only added once, and keep regular files only (a directory whose name
    # ends in one of the extensions also matches the glob pattern).
    matches = set()
    for ext in extensions:
        for candidate in directory.glob(f"**/*{ext}"):
            if candidate.is_file():
                matches.add(candidate)

    # Sort so the ingestion order does not depend on filesystem listing order.
    file_paths = sorted(matches)

    if not file_paths:
        print(f"⚠️ No files found with extensions {extensions} in {directory}")
        return []

    print(f"📁 Found {len(file_paths)} files in {directory}")
    return add_file_documents([str(f) for f in file_paths])
if __name__ == "__main__":
    import argparse

    # Command-line entry point. Only one mode runs per invocation, picked in
    # this order of precedence: --sample-docs, --text, --file, --directory.
    cli = argparse.ArgumentParser(description="Add documents to the vector store")
    cli.add_argument("--text", nargs="+", help="Add text documents directly")
    cli.add_argument("--file", nargs="+", help="Add documents from files")
    cli.add_argument("--directory", help="Add all documents from a directory")
    cli.add_argument("--sample-docs", action="store_true", help="Add sample documents")
    options = cli.parse_args()

    if options.sample_docs:
        # Built-in sample corpus, kept as (text, metadata) pairs.
        sample_pairs = [
            (
                "\n"
                "Oracle Exadata is a database machine that combines hardware and software\n"
                "to provide high-performance database solutions. When migrating Exadata\n"
                "workloads to the cloud, it's important to consider compatibility,\n"
                "performance, and feature parity.\n",
                {"source": "exadata_migration_guide", "type": "documentation"},
            ),
            (
                "\n"
                "Cloud migration strategies for Oracle Exadata include:\n"
                "1. Lift and shift - moving workloads with minimal changes\n"
                "2. Replatforming - adapting to cloud-native services\n"
                "3. Refactoring - redesigning for cloud architecture\n"
                "Each approach has different trade-offs in terms of effort, cost, and feature availability.\n",
                {"source": "migration_strategies", "type": "guide"},
            ),
            (
                "\n"
                "Oracle Cloud Infrastructure (OCI) provides Exadata Cloud Service which\n"
                "maintains full feature compatibility with on-premises Exadata. This\n"
                "service offers the same architecture and capabilities, making it ideal\n"
                "for migrations requiring minimal changes.\n",
                {"source": "oci_exadata", "type": "cloud_service"},
            ),
            (
                "\n"
                "Oracle AI Database services on AWS provide customers with a simplified path\n"
                "to migrate Oracle Exadata workloads. These services run on AWS infrastructure\n"
                "and offer managed database solutions that maintain Oracle compatibility while\n"
                "leveraging AWS cloud capabilities. The services include automated migration tools,\n"
                "performance optimization, and seamless integration with AWS services.\n",
                {"source": "oracle_aws_services", "type": "cloud_service"},
            ),
        ]
        documents = [text for text, _ in sample_pairs]
        metadatas = [meta for _, meta in sample_pairs]
        add_text_documents(documents, metadatas)
    elif options.text:
        add_text_documents(options.text)
    elif options.file:
        add_file_documents(options.file)
    elif options.directory:
        add_from_directory(options.directory)
    else:
        # No mode selected: show a short usage reminder with examples.
        usage_lines = (
            "Please specify --text, --file, --directory, or --sample-docs",
            "\nExamples:",
            " python scripts/add_documents.py --sample-docs",
            " python scripts/add_documents.py --file doc1.txt doc2.txt",
            " python scripts/add_documents.py --directory data/sample_documents",
            " python scripts/add_documents.py --text 'Your document text here'",
        )
        for usage_line in usage_lines:
            print(usage_line)
|