File size: 5,090 Bytes
4339a4c
ea88b9e
65387c4
ea88b9e
65387c4
 
 
 
ea88b9e
 
4339a4c
 
 
65387c4
 
 
 
4339a4c
 
65387c4
 
 
 
4339a4c
 
 
 
 
 
 
 
 
65387c4
 
 
 
 
 
 
4339a4c
65387c4
 
 
 
 
 
 
 
4339a4c
65387c4
4339a4c
65387c4
 
 
 
 
 
 
 
 
 
4339a4c
65387c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea88b9e
 
 
65387c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea88b9e
65387c4
 
ea88b9e
 
 
65387c4
 
 
 
 
 
ea88b9e
65387c4
ea88b9e
65387c4
 
 
 
 
 
 
 
 
ea88b9e
65387c4
 
 
 
 
 
 
 
 
 
 
ea88b9e
 
 
65387c4
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

# utils/vector_store.py
import faiss
import numpy as np
from typing import List, Dict, Optional
import pickle
import os
from pathlib import Path

class VectorStore:
    """FAISS-backed vector store with simple pickle persistence.

    Vectors live in a flat L2 FAISS index; the chunk texts and their
    metadata are kept in parallel Python lists and persisted alongside
    the index so positions stay aligned across restarts.
    """

    def __init__(self, persist_directory: str = "/data/faiss"):
        """Initialize the store and load any previously persisted state.

        Args:
            persist_directory: Directory holding ``faiss.index`` and
                ``documents.pkl``. Defaults to the absolute path used on
                HF Spaces persistent storage.
        """
        self.persist_directory = persist_directory
        self.index = None          # faiss.Index; created lazily on first add
        self.documents: List[str] = []
        self.metadata: List[Dict] = []

        # Ensure directories exist before any read/write attempt.
        self._create_data_directories()

        # Try to load existing index and data.
        self._load_or_create_index()

    def _create_data_directories(self):
        """Create the persistence and upload directories if missing."""
        # Main data directory (parent of persist_directory on HF Spaces).
        Path("/data").mkdir(parents=True, exist_ok=True)
        # FAISS-specific directory.
        Path(self.persist_directory).mkdir(parents=True, exist_ok=True)
        # Uploads directory (used elsewhere in the app).
        Path("/data/uploads").mkdir(parents=True, exist_ok=True)

    def _load_or_create_index(self):
        """Load a persisted index + documents, or start empty.

        Any load failure (corrupt file, schema change, ...) falls back to
        an empty store rather than crashing at startup.
        """
        index_path = os.path.join(self.persist_directory, "faiss.index")
        data_path = os.path.join(self.persist_directory, "documents.pkl")

        try:
            if os.path.exists(index_path) and os.path.exists(data_path):
                print(f"Loading existing index from {index_path}")
                self.index = faiss.read_index(index_path)

                # NOTE(security): pickle.load executes arbitrary code if the
                # file is attacker-controlled; acceptable here only because
                # /data is app-private storage.
                with open(data_path, 'rb') as f:
                    data = pickle.load(f)
                    self.documents = data['documents']
                    self.metadata = data['metadata']
                print(f"Loaded {len(self.documents)} documents from existing index")
            else:
                print("No existing index found, creating new one")
                # Index is created lazily when the first vectors arrive,
                # because the embedding dimension is unknown until then.
                self.index = None
                self.documents = []
                self.metadata = []
        except Exception as e:
            print(f"Error loading index: {e}")
            self.index = None
            self.documents = []
            self.metadata = []

    def _save_index(self):
        """Persist the FAISS index and the parallel document/metadata lists.

        Best-effort: failures are logged, not raised, so a full disk does
        not take down the request that triggered the save.
        """
        if self.index is not None:
            index_path = os.path.join(self.persist_directory, "faiss.index")
            data_path = os.path.join(self.persist_directory, "documents.pkl")

            try:
                faiss.write_index(self.index, index_path)

                with open(data_path, 'wb') as f:
                    pickle.dump({
                        'documents': self.documents,
                        'metadata': self.metadata
                    }, f)
            except Exception as e:
                print(f"Error saving index: {e}")

    def add_documents(self, chunks: List[Dict], metadata: Optional[Dict] = None):
        """Add embedded chunks to the store and persist.

        Args:
            chunks: Dicts with an ``"embeddings"`` vector and a ``"text"``
                string each. All vectors must share one dimension.
            metadata: Optional extra metadata merged into every chunk's
                per-chunk metadata record.
        """
        if not chunks:
            return

        vectors = np.array([chunk["embeddings"] for chunk in chunks])

        # Lazily create the index now that the dimension is known.
        if self.index is None:
            dimension = vectors.shape[1]
            self.index = faiss.IndexFlatL2(dimension)

        # FAISS requires float32.
        self.index.add(vectors.astype(np.float32))

        # Keep documents/metadata lists aligned with index positions.
        for chunk in chunks:
            chunk_metadata = {
                "chunk_id": len(self.documents),
                "text_length": len(chunk["text"])
            }
            if metadata:
                chunk_metadata.update(metadata)

            self.documents.append(chunk["text"])
            self.metadata.append(chunk_metadata)

        self._save_index()

    def search(self, query_vector: np.ndarray, n_results: int = 5) -> List[Dict]:
        """Return up to ``n_results`` nearest documents to ``query_vector``.

        Args:
            query_vector: 1-D or (1, dim) query embedding.
            n_results: Maximum number of hits to return.

        Returns:
            Dicts with ``text``, ``metadata`` and L2 ``distance``, nearest
            first. Empty list when the store holds no vectors.
        """
        if self.index is None or self.index.ntotal == 0:
            return []

        # FAISS expects a 2-D (n_queries, dim) array.
        if len(query_vector.shape) == 1:
            query_vector = query_vector.reshape(1, -1)

        distances, indices = self.index.search(query_vector.astype(np.float32), n_results)

        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # BUGFIX: FAISS pads missing hits with idx == -1; the old check
            # (idx < len) let -1 through and negative indexing silently
            # returned the *last* document. Require a valid range instead.
            if 0 <= idx < len(self.documents):
                results.append({
                    "text": self.documents[idx],
                    "metadata": self.metadata[idx],
                    "distance": float(dist)
                })

        return results

    def get_all_documents(self) -> List[Dict]:
        """Return every stored document paired with its metadata."""
        return [
            {"text": doc, "metadata": meta}
            for doc, meta in zip(self.documents, self.metadata)
        ]