File size: 14,277 Bytes
7e3b585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any, Tuple
import os
import pickle

class SpaceKnowledgeBase:
    """Vector-search knowledge base built from the space JSON data files.

    On construction the class either loads a previously saved FAISS index
    and pickled document list from the current working directory, or builds
    them from the JSON files found in ``data_dir`` and saves them.

    Attributes:
        data_dir: Directory that holds the JSON data files.
        model: SentenceTransformer used to embed documents and queries.
        index: FAISS inner-product index, or ``None`` when nothing is loaded.
        documents: Document dicts, positionally aligned with the index rows.
        embeddings: Embedding matrix from the last rebuild (``None`` when the
            index was loaded from disk).
        index_path: File the FAISS index is cached in.
        docs_path: File the pickled document list is cached in.
    """

    def __init__(self, data_dir: str = "data"):
        """Set up the embedding model and load or build the index.

        Args:
            data_dir: Directory containing the JSON data files.
        """
        self.data_dir = data_dir
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.documents: List[Dict[str, Any]] = []
        self.embeddings = None
        self.index_path = "knowledge_base_index.faiss"
        self.docs_path = "knowledge_base_docs.pkl"

        # Load or create knowledge base
        self._load_or_create_index()

    def _load_json_files(self) -> List[Dict[str, Any]]:
        """Load every known JSON data file and convert it into documents.

        Files that do not exist are skipped silently. A file that cannot be
        read, parsed or processed is reported on stdout and skipped, so one
        bad file does not prevent the others from loading.

        Returns:
            All documents produced by the per-file processors, in the order
            the files are listed below.
        """
        documents: List[Dict[str, Any]] = []

        # File name -> processor that turns the parsed JSON into documents.
        data_files = {
            'space_terminology.json': self._process_terminology,
            'space_agencies.json': self._process_agencies,
            'planets.json': self._process_planets,
            'rockets.json': self._process_rockets,
            'astronauts.json': self._process_astronauts,
            'telescopes.json': self._process_telescopes,
            'space_museams.json': self._process_museums,
            'notable_peoples.json': self._process_notable_people
        }

        for filename, processor in data_files.items():
            file_path = os.path.join(self.data_dir, filename)
            if not os.path.exists(file_path):
                continue

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Process once and reuse the result for both the document
                # list and the log line.
                file_docs = processor(data)
            except Exception as e:
                # Deliberately broad: loading is best-effort per file.
                print(f"Error loading {filename}: {e}")
                continue

            documents.extend(file_docs)
            print(f"Loaded {filename}: {len(file_docs)} documents")

        return documents

    def _process_terminology(self, data: Dict) -> List[Dict[str, Any]]:
        """Convert the ``space_terms`` entries of ``data`` into documents.

        Args:
            data: Parsed JSON object; a missing ``space_terms`` key yields
                an empty list.

        Returns:
            One document dict per term, with missing fields defaulting to
            empty strings (or an empty list for ``related_terms``).
        """
        return [
            {
                'id': f"term_{term.get('id', '')}",
                'type': 'terminology',
                'title': term.get('term', ''),
                'content': (
                    f"{term.get('term', '')}. "
                    f"{term.get('short_description', '')} "
                    f"{term.get('detailed_description', '')}"
                ),
                'category': term.get('category', ''),
                'metadata': {
                    'category': term.get('category', ''),
                    'difficulty': term.get('difficulty_level', ''),
                    'related_terms': term.get('related_terms', [])
                }
            }
            for term in data.get('space_terms', [])
        ]

    def _process_agencies(self, data: Dict) -> List[Dict[str, Any]]:
        """Convert the ``space_agencies`` entries of ``data`` into documents.

        Args:
            data: Parsed JSON object; a missing ``space_agencies`` key yields
                an empty list.

        Returns:
            One document dict per agency, with missing fields defaulting to
            empty strings.
        """
        return [
            {
                'id': f"agency_{agency.get('id', '')}",
                'type': 'agency',
                'title': agency.get('name', ''),
                'content': (
                    f"{agency.get('full_name', '')}. "
                    f"{agency.get('description', '')} "
                    f"Founded: {agency.get('founded', '')}. "
                    f"Country: {agency.get('country', '')}"
                ),
                'category': agency.get('type', ''),
                'metadata': {
                    'country': agency.get('country', ''),
                    'founded': agency.get('founded', ''),
                    'type': agency.get('type', ''),
                    'headquarters': agency.get('headquarters', ''),
                    'budget': agency.get('annual_budget', '')
                }
            }
            for agency in data.get('space_agencies', [])
        ]

    def _process_planets(self, data: Dict) -> List[Dict[str, Any]]:
        """Convert the ``planets`` entries of ``data`` into documents.

        Args:
            data: Parsed JSON object; a missing ``planets`` key yields an
                empty list.

        Returns:
            One document dict per planet, with missing fields defaulting to
            empty strings (or an empty list for ``key_features``).
        """
        return [
            {
                'id': f"planet_{planet.get('id', '')}",
                'type': 'planet',
                'title': planet.get('name', ''),
                'content': (
                    f"{planet.get('name', '')}. "
                    f"{planet.get('description', '')} "
                    f"Distance from Sun: {planet.get('distance_from_sun', '')}. "
                    f"Type: {planet.get('type', '')}"
                ),
                'category': planet.get('type', ''),
                'metadata': {
                    'type': planet.get('type', ''),
                    'distance_from_sun': planet.get('distance_from_sun', ''),
                    'diameter': planet.get('diameter', ''),
                    'moons': planet.get('moons', ''),
                    'key_features': planet.get('key_features', [])
                }
            }
            for planet in data.get('planets', [])
        ]

    def _process_rockets(self, data: Dict) -> List[Dict[str, Any]]:
        """Convert the ``rockets`` entries of ``data`` into documents.

        Args:
            data: Parsed JSON object; a missing ``rockets`` key yields an
                empty list.

        Returns:
            One document dict per rocket, with missing fields defaulting to
            empty strings (``active`` defaults to ``False``).
        """
        return [
            {
                'id': f"rocket_{rocket.get('id', '')}",
                'type': 'rocket',
                'title': rocket.get('name', ''),
                'content': (
                    f"{rocket.get('name', '')}. "
                    f"{rocket.get('description', '')} "
                    f"First flight: {rocket.get('first_flight_year', '')}. "
                    f"Purpose: {rocket.get('purpose', '')}"
                ),
                'category': rocket.get('type', ''),
                'metadata': {
                    'country_of_origin': rocket.get('country_of_origin', ''),
                    'operator': rocket.get('operator', ''),
                    'first_flight_year': rocket.get('first_flight_year', ''),
                    'payload_capacity': rocket.get('capacity_payload_kg', ''),
                    'active': rocket.get('active', False)
                }
            }
            for rocket in data.get('rockets', [])
        ]

    def _process_astronauts(self, data: Dict) -> List[Dict[str, Any]]:
        """Convert the ``astronauts`` entries of ``data`` into documents.

        Args:
            data: Parsed JSON object; a missing ``astronauts`` key yields an
                empty list.

        Returns:
            One document dict per astronaut, with missing fields defaulting
            to empty strings (or an empty list for ``achievements``).
        """
        return [
            {
                'id': f"astronaut_{astronaut.get('id', '')}",
                'type': 'astronaut',
                'title': astronaut.get('name', ''),
                'content': (
                    f"{astronaut.get('name', '')}. "
                    f"{astronaut.get('description', '')} "
                    f"Agency: {astronaut.get('agency', '')}. "
                    f"Country: {astronaut.get('country', '')}"
                ),
                'category': astronaut.get('type', ''),
                'metadata': {
                    'country': astronaut.get('country', ''),
                    'agency': astronaut.get('agency', ''),
                    'birth_year': astronaut.get('birth_year', ''),
                    'missions_count': astronaut.get('missions_count', ''),
                    'achievements': astronaut.get('achievements', [])
                }
            }
            for astronaut in data.get('astronauts', [])
        ]

    def _process_telescopes(self, data: Dict) -> List[Dict[str, Any]]:
        """Convert the ``telescopes`` entries of ``data`` into documents.

        Args:
            data: Parsed JSON object; a missing ``telescopes`` key yields an
                empty list.

        Returns:
            One document dict per telescope, with missing fields defaulting
            to empty strings.
        """
        return [
            {
                'id': f"telescope_{telescope.get('id', '')}",
                'type': 'telescope',
                'title': telescope.get('name', ''),
                'content': (
                    f"{telescope.get('name', '')}. "
                    f"{telescope.get('description', '')} "
                    f"Type: {telescope.get('type', '')}. "
                    f"Status: {telescope.get('status', '')}"
                ),
                'category': telescope.get('type', ''),
                'metadata': {
                    'type': telescope.get('type', ''),
                    'country': telescope.get('country', ''),
                    'agency': telescope.get('agency', ''),
                    'year': telescope.get('year', ''),
                    'status': telescope.get('status', '')
                }
            }
            for telescope in data.get('telescopes', [])
        ]

    def _process_museums(self, data: Dict) -> List[Dict[str, Any]]:
        """Convert the ``space_museums`` entries of ``data`` into documents.

        The document id is derived from the museum name (spaces replaced by
        underscores, lower-cased) rather than from an ``id`` field.

        Args:
            data: Parsed JSON object; a missing ``space_museums`` key yields
                an empty list.

        Returns:
            One document dict per museum, with missing fields defaulting to
            empty strings.
        """
        return [
            {
                'id': f"museum_{museum.get('name', '').replace(' ', '_').lower()}",
                'type': 'museum',
                'title': museum.get('name', ''),
                'content': (
                    f"{museum.get('name', '')}. "
                    f"{museum.get('famous_for', '')} "
                    f"Located in {museum.get('city_or_region', '')}, "
                    f"{museum.get('country', '')}. "
                    f"{museum.get('additional_info', '')}"
                ),
                'category': 'space_museum',
                'metadata': {
                    'country': museum.get('country', ''),
                    'city_or_region': museum.get('city_or_region', ''),
                    'famous_for': museum.get('famous_for', ''),
                    'established_year': museum.get('established_year', ''),
                    'annual_visitors': museum.get('annual_visitors', ''),
                    'additional_info': museum.get('additional_info', '')
                }
            }
            for museum in data.get('space_museums', [])
        ]

    def _process_notable_people(self, data: Dict) -> List[Dict[str, Any]]:
        """Convert ``notable_space_contributors`` entries into documents.

        The document id is derived from the person's name (spaces replaced
        by underscores, lower-cased) rather than from an ``id`` field.

        Args:
            data: Parsed JSON object; a missing
                ``notable_space_contributors`` key yields an empty list.

        Returns:
            One document dict per person, with missing fields defaulting to
            empty strings (or an empty list for ``awards``).
        """
        return [
            {
                'id': f"person_{person.get('name', '').replace(' ', '_').lower()}",
                'type': 'notable_person',
                'title': person.get('name', ''),
                'content': (
                    f"{person.get('name', '')}. "
                    f"{person.get('contribution', '')} "
                    f"Known for: {person.get('known_for', '')}. "
                    f"Country: {person.get('country', '')}"
                ),
                'category': 'space_pioneer',
                'metadata': {
                    'country': person.get('country', ''),
                    'contribution': person.get('contribution', ''),
                    'known_for': person.get('known_for', ''),
                    'birth_date': person.get('birth_date', ''),
                    'death_date': person.get('death_date', ''),
                    'awards': person.get('awards', [])
                }
            }
            for person in data.get('notable_space_contributors', [])
        ]

    def _create_embeddings(self, documents: List[Dict[str, Any]]) -> np.ndarray:
        """Embed each document's title and content with the sentence model.

        Args:
            documents: Document dicts providing ``title`` and ``content``.

        Returns:
            The array returned by ``self.model.encode`` for the texts, in
            the same order as ``documents``.
        """
        texts = [f"{doc['title']} {doc['content']}" for doc in documents]
        return self.model.encode(texts, show_progress_bar=True)

    def _create_faiss_index(self, embeddings: np.ndarray) -> "faiss.IndexFlatIP":
        """Build an inner-product FAISS index over L2-normalized embeddings.

        With unit-length vectors the inner product equals cosine similarity.

        Args:
            embeddings: 2-D matrix with one row per document. When it is
                already float32 and C-contiguous it is normalized in place;
                otherwise a converted copy is normalized and indexed.

        Returns:
            The populated ``faiss.IndexFlatIP``.
        """
        # FAISS works on float32, C-contiguous data; this is a no-op (no
        # copy) when the input already has that layout.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(embeddings)

        dimension = embeddings.shape[1]
        index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
        index.add(embeddings)

        return index

    def _load_or_create_index(self):
        """Load the cached index and documents, or rebuild and save them.

        The cache is used only when both files exist, both load without
        error, and the index holds exactly one vector per cached document.
        Any failure is reported on stdout and triggers a rebuild from the
        JSON files. When the rebuild finds no documents the in-memory index
        is cleared and nothing is written.
        """
        if os.path.exists(self.index_path) and os.path.exists(self.docs_path):
            try:
                # Load into locals first so a failure half-way through does
                # not leave the instance with mismatched index/documents.
                index = faiss.read_index(self.index_path)
                # NOTE(security): pickle.load can execute arbitrary code if
                # the cache file was tampered with. Only use cache files
                # written by this class in a trusted location.
                with open(self.docs_path, 'rb') as f:
                    documents = pickle.load(f)
                if index.ntotal != len(documents):
                    # Search maps index rows to documents by position, so a
                    # size mismatch means the cache is stale or corrupt.
                    raise ValueError(
                        f"index holds {index.ntotal} vectors but "
                        f"{len(documents)} documents were cached"
                    )
            except Exception as e:
                print(f"Error loading existing index: {e}")
            else:
                self.index = index
                self.documents = documents
                print(f"Loaded existing knowledge base with {len(self.documents)} documents")
                return

        # Create new index
        print("Creating new knowledge base...")
        self.documents = self._load_json_files()

        if not self.documents:
            # Drop any previously loaded index so it cannot outlive the
            # documents it referred to.
            self.index = None
            self.embeddings = None
            print("No documents found!")
            return

        self.embeddings = self._create_embeddings(self.documents)
        self.index = self._create_faiss_index(self.embeddings)

        # Save index and documents
        faiss.write_index(self.index, self.index_path)
        with open(self.docs_path, 'wb') as f:
            pickle.dump(self.documents, f)

        print(f"Created knowledge base with {len(self.documents)} documents")

    def search(self, query: str, top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
        """Search for the documents most similar to ``query``.

        Args:
            query: Free-text query to embed and look up.
            top_k: Maximum number of results to return.

        Returns:
            ``(document, score)`` pairs in the order FAISS returns them.
            Empty when ``top_k`` is not positive or when no index or
            documents are loaded.
        """
        # FAISS requires k > 0, and there is nothing to search without an
        # index and documents.
        if top_k <= 0 or self.index is None or not self.documents:
            return []

        # Create query embedding (float32, contiguous, unit length to match
        # the normalized vectors stored in the index).
        query_embedding = np.ascontiguousarray(
            self.model.encode([query]), dtype=np.float32
        )
        faiss.normalize_L2(query_embedding)

        scores, indices = self.index.search(query_embedding, top_k)

        # FAISS pads missing neighbours with -1; the bounds check drops
        # those as well as any row without a matching document.
        doc_count = len(self.documents)
        return [
            (self.documents[idx], float(score))
            for score, idx in zip(scores[0], indices[0])
            if 0 <= idx < doc_count
        ]

    def get_context_for_query(self, query: str, max_context_length: int = 2000) -> str:
        """Build an LLM context string from the top search results.

        Results are appended in ranking order until the next one would push
        the context past ``max_context_length``; collection stops at that
        point, so lower-ranked results are never used to fill the gap.

        Args:
            query: Free-text query to search for.
            max_context_length: Upper bound on the length of the returned
                string, including the separators between entries.

        Returns:
            The formatted entries joined by newlines, or an empty string
            when nothing fits or nothing was found.
        """
        results = self.search(query, top_k=5)

        separator = "\n"
        context_parts = []
        current_length = 0

        for doc, _score in results:
            doc_text = f"**{doc['type'].title()}: {doc['title']}**\n{doc['content']}\n"

            # Count the joining separator too, so the final string really
            # stays within the requested limit.
            added_length = len(doc_text)
            if context_parts:
                added_length += len(separator)

            if current_length + added_length > max_context_length:
                break

            context_parts.append(doc_text)
            current_length += added_length

        return separator.join(context_parts)

    def force_regenerate(self):
        """Delete the cached index and documents, then rebuild them."""
        print("🔄 Force regenerating knowledge base...")

        # Remove existing files so the loader takes the rebuild path.
        for cached_path in (self.index_path, self.docs_path):
            if os.path.exists(cached_path):
                os.remove(cached_path)

        # Recreate
        self._load_or_create_index()
        print(f"✅ Knowledge base regenerated with {len(self.documents)} documents")