#!/usr/bin/env python3
"""
Initialize the RAG system by creating embeddings and FAISS index.
"""
import sys
from pathlib import Path

# Add project root to Python path
sys.path.insert(0, str(Path(__file__).parent.parent))

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from config import DATA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, EMBEDDING_MODEL
import sqlite3
import hashlib
from typing import List

def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into chunks of `chunk_size` words, with `overlap` words shared between neighbours."""
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    chunks = []

    # Step by the stride (chunk_size - overlap); stop once a chunk reaches
    # the end of the text.
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(" ".join(words[i:i + chunk_size]))
        if i + chunk_size >= len(words):
            break

    return chunks
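
# Example (illustrative values): with chunk_size=200 and overlap=50 the loop
# advances 150 words per step, so a 500-word text yields chunks starting at
# word offsets 0, 150, and 300.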

def initialize_rag():
    """Initialize the RAG system with sample data."""
    print("Initializing RAG system...")
    
    # Load embedding model
    print(f"Loading embedding model: {EMBEDDING_MODEL}")
    embedder = SentenceTransformer(EMBEDDING_MODEL)
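    # SentenceTransformer downloads the model from the Hugging Face hub on
    # first use and caches it locally, so later runs load from disk.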
    
    # Collect all documents
    documents = []
    doc_ids = []
    chunk_metadata = []
    
    # First, check if we have documents
    md_files = list(DATA_DIR.glob("*.md"))
    txt_files = list(DATA_DIR.glob("*.txt"))
    
    if not md_files and not txt_files:
        print("No documents found. Running download_sample_data.py first...")
        # Try to create sample data
        from scripts.download_sample_data import download_sample_data
        download_sample_data()
        
        # Refresh file list
        md_files = list(DATA_DIR.glob("*.md"))
        txt_files = list(DATA_DIR.glob("*.txt"))
    
    print(f"Found {len(md_files)} .md files and {len(txt_files)} .txt files")
    
    # Process Markdown and plain-text files with a single loop rather than
    # two near-identical ones.
    for file_type, files in (("markdown", md_files), ("text", txt_files)):
        for file_path in files:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            chunks = chunk_text(content)
            documents.extend(chunks)
            doc_ids.extend([file_path.name] * len(chunks))
            for j in range(len(chunks)):
                chunk_metadata.append({
                    'doc_id': file_path.name,
                    'chunk_index': j,
                    'file_type': file_type
                })
    
    print(f"Found {len(documents)} chunks from {len(set(doc_ids))} documents")
    
    if not documents:
        print("ERROR: No documents found. Please add documents to the data/ directory first.")
        return
    
    # Create embeddings
    print("Creating embeddings...")
    embeddings = embedder.encode(documents, show_progress_bar=True, batch_size=32)
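    # encode() returns a NumPy array of shape (num_chunks, embedding_dim).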
    
    # Create FAISS index
    print("Creating FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # L2 distance
    index.add(embeddings.astype(np.float32))
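    # IndexFlatL2 performs exact brute-force search, which is fine at this
    # scale; for cosine similarity, L2-normalize the embeddings and use
    # faiss.IndexFlatIP instead.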
    
    # Save FAISS index
    faiss_index_path = DATA_DIR / "faiss_index.bin"
    faiss.write_index(index, str(faiss_index_path))
    print(f"Saved FAISS index to {faiss_index_path}")
    
    # Create document store (SQLite)
    print("Creating document store...")
    conn = sqlite3.connect(DATA_DIR / "docstore.db")
    cursor = conn.cursor()
    
    # Create tables
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            chunk_text TEXT NOT NULL,
            doc_id TEXT NOT NULL,
            chunk_hash TEXT UNIQUE NOT NULL,
            embedding_hash TEXT,
            chunk_index INTEGER,
            file_type TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS embedding_cache (
            text_hash TEXT PRIMARY KEY,
            embedding BLOB NOT NULL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            access_count INTEGER DEFAULT 0
        )
    """)
    
    # The FAISS index above is rebuilt from scratch on every run, so clear
    # the chunk table too; otherwise re-runs would skip rows as duplicates
    # while the index changes underneath them.
    cursor.execute("DELETE FROM chunks")

    # Insert chunks in the same order they were added to the FAISS index,
    # so FAISS row i corresponds to the i-th row inserted here.
    inserted_count = 0
    for chunk, doc_id, metadata in zip(documents, doc_ids, chunk_metadata):
        chunk_hash = hashlib.md5(chunk.encode()).hexdigest()
        cursor.execute(
            """INSERT INTO chunks
               (chunk_text, doc_id, chunk_hash, chunk_index, file_type)
               VALUES (?, ?, ?, ?, ?)""",
            (chunk, doc_id, chunk_hash, metadata['chunk_index'], metadata['file_type'])
        )
        inserted_count += 1
    
    conn.commit()
    
    # chunk_hash already has an implicit index from its UNIQUE constraint,
    # so only doc_id needs an explicit one.
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_doc_id ON chunks(doc_id)")
    conn.commit()
    
    conn.close()
    print(f"Saved {inserted_count} chunks to document store")
    
    # Also create embedding_cache.db if it doesn't exist
    cache_path = DATA_DIR / "embedding_cache.db"
    if not cache_path.exists():
        conn = sqlite3.connect(cache_path)
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS embedding_cache (
                text_hash TEXT PRIMARY KEY,
                embedding BLOB NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                access_count INTEGER DEFAULT 0
            )
        """)
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_created_at ON embedding_cache(created_at)")
        conn.commit()
        conn.close()
        print(f"Created embedding cache at {cache_path}")
    
    print("\nRAG system initialized successfully!")
    print(f"FAISS index: {faiss_index_path}")
    print(f"Document store: {DATA_DIR / 'docstore.db'}")
    print(f"Embedding cache: {DATA_DIR / 'embedding_cache.db'}")
    print(f"Total chunks: {len(documents)}")
    print(f"Embedding dimension: {dimension}")
    print("\nYou can now start the API server with: python -m app.main")

if __name__ == "__main__":
    initialize_rag()
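
# A minimal retrieval sketch (for reference only; the real query path is
# assumed to live in the app package). FAISS row i corresponds to the i-th
# chunk inserted above, so hits can be joined back to chunk text by insert
# order:
#
#     embedder = SentenceTransformer(EMBEDDING_MODEL)
#     index = faiss.read_index(str(DATA_DIR / "faiss_index.bin"))
#     query = embedder.encode(["your question"]).astype(np.float32)
#     distances, ids = index.search(query, 5)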