import json
import os
import pickle
from typing import List, Dict, Any, Tuple

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer


class SpaceKnowledgeBase:
    def __init__(self, data_dir: str = "data"):
        self.data_dir = data_dir
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.documents = []
        self.embeddings = None
        self.index_path = "knowledge_base_index.faiss"
        self.docs_path = "knowledge_base_docs.pkl"

        # Load or create knowledge base
        self._load_or_create_index()

    def _load_json_files(self) -> List[Dict[str, Any]]:
        """Load all JSON data files and extract documents."""
        documents = []

        # File mappings for different data types
        data_files = {
            'space_terminology.json': self._process_terminology,
            'space_agencies.json': self._process_agencies,
            'planets.json': self._process_planets,
            'rockets.json': self._process_rockets,
            'astronauts.json': self._process_astronauts,
            'telescopes.json': self._process_telescopes,
            'space_museams.json': self._process_museums,
            'notable_peoples.json': self._process_notable_people
        }

        for filename, processor in data_files.items():
            file_path = os.path.join(self.data_dir, filename)
            if os.path.exists(file_path):
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    processed = processor(data)
                    documents.extend(processed)
                    print(f"Loaded {filename}: {len(processed)} documents")
                except Exception as e:
                    print(f"Error loading {filename}: {e}")

        return documents

    def _process_terminology(self, data: Dict) -> List[Dict[str, Any]]:
        """Process space terminology data."""
        docs = []
        for term in data.get('space_terms', []):
            doc = {
                'id': f"term_{term.get('id', '')}",
                'type': 'terminology',
                'title': term.get('term', ''),
                'content': f"{term.get('term', '')}. {term.get('short_description', '')} {term.get('detailed_description', '')}",
                'category': term.get('category', ''),
                'metadata': {
                    'category': term.get('category', ''),
                    'difficulty': term.get('difficulty_level', ''),
                    'related_terms': term.get('related_terms', [])
                }
            }
            docs.append(doc)
        return docs

    def _process_agencies(self, data: Dict) -> List[Dict[str, Any]]:
        """Process space agencies data."""
        docs = []
        for agency in data.get('space_agencies', []):
            doc = {
                'id': f"agency_{agency.get('id', '')}",
                'type': 'agency',
                'title': agency.get('name', ''),
                'content': f"{agency.get('full_name', '')}. {agency.get('description', '')} Founded: {agency.get('founded', '')}. Country: {agency.get('country', '')}",
                'category': agency.get('type', ''),
                'metadata': {
                    'country': agency.get('country', ''),
                    'founded': agency.get('founded', ''),
                    'type': agency.get('type', ''),
                    'headquarters': agency.get('headquarters', ''),
                    'budget': agency.get('annual_budget', '')
                }
            }
            docs.append(doc)
        return docs

    def _process_planets(self, data: Dict) -> List[Dict[str, Any]]:
        """Process planets data."""
        docs = []
        for planet in data.get('planets', []):
            doc = {
                'id': f"planet_{planet.get('id', '')}",
                'type': 'planet',
                'title': planet.get('name', ''),
                'content': f"{planet.get('name', '')}. {planet.get('description', '')} Distance from Sun: {planet.get('distance_from_sun', '')}. Type: {planet.get('type', '')}",
                'category': planet.get('type', ''),
                'metadata': {
                    'type': planet.get('type', ''),
                    'distance_from_sun': planet.get('distance_from_sun', ''),
                    'diameter': planet.get('diameter', ''),
                    'moons': planet.get('moons', ''),
                    'key_features': planet.get('key_features', [])
                }
            }
            docs.append(doc)
        return docs
    def _process_rockets(self, data: Dict) -> List[Dict[str, Any]]:
        """Process rockets data."""
        docs = []
        for rocket in data.get('rockets', []):
            doc = {
                'id': f"rocket_{rocket.get('id', '')}",
                'type': 'rocket',
                'title': rocket.get('name', ''),
                'content': f"{rocket.get('name', '')}. {rocket.get('description', '')} First flight: {rocket.get('first_flight_year', '')}. Purpose: {rocket.get('purpose', '')}",
                'category': rocket.get('type', ''),
                'metadata': {
                    'country_of_origin': rocket.get('country_of_origin', ''),
                    'operator': rocket.get('operator', ''),
                    'first_flight_year': rocket.get('first_flight_year', ''),
                    'payload_capacity': rocket.get('capacity_payload_kg', ''),
                    'active': rocket.get('active', False)
                }
            }
            docs.append(doc)
        return docs

    def _process_astronauts(self, data: Dict) -> List[Dict[str, Any]]:
        """Process astronauts data."""
        docs = []
        for astronaut in data.get('astronauts', []):
            doc = {
                'id': f"astronaut_{astronaut.get('id', '')}",
                'type': 'astronaut',
                'title': astronaut.get('name', ''),
                'content': f"{astronaut.get('name', '')}. {astronaut.get('description', '')} Agency: {astronaut.get('agency', '')}. Country: {astronaut.get('country', '')}",
                'category': astronaut.get('type', ''),
                'metadata': {
                    'country': astronaut.get('country', ''),
                    'agency': astronaut.get('agency', ''),
                    'birth_year': astronaut.get('birth_year', ''),
                    'missions_count': astronaut.get('missions_count', ''),
                    'achievements': astronaut.get('achievements', [])
                }
            }
            docs.append(doc)
        return docs

    def _process_telescopes(self, data: Dict) -> List[Dict[str, Any]]:
        """Process telescopes data."""
        docs = []
        for telescope in data.get('telescopes', []):
            doc = {
                'id': f"telescope_{telescope.get('id', '')}",
                'type': 'telescope',
                'title': telescope.get('name', ''),
                'content': f"{telescope.get('name', '')}. {telescope.get('description', '')} Type: {telescope.get('type', '')}. Status: {telescope.get('status', '')}",
                'category': telescope.get('type', ''),
                'metadata': {
                    'type': telescope.get('type', ''),
                    'country': telescope.get('country', ''),
                    'agency': telescope.get('agency', ''),
                    'year': telescope.get('year', ''),
                    'status': telescope.get('status', '')
                }
            }
            docs.append(doc)
        return docs

    def _process_museums(self, data: Dict) -> List[Dict[str, Any]]:
        """Process space museums data."""
        docs = []
        for museum in data.get('space_museums', []):
            doc = {
                'id': f"museum_{museum.get('name', '').replace(' ', '_').lower()}",
                'type': 'museum',
                'title': museum.get('name', ''),
                'content': f"{museum.get('name', '')}. {museum.get('famous_for', '')} Located in {museum.get('city_or_region', '')}, {museum.get('country', '')}. {museum.get('additional_info', '')}",
                'category': 'space_museum',
                'metadata': {
                    'country': museum.get('country', ''),
                    'city_or_region': museum.get('city_or_region', ''),
                    'famous_for': museum.get('famous_for', ''),
                    'established_year': museum.get('established_year', ''),
                    'annual_visitors': museum.get('annual_visitors', ''),
                    'additional_info': museum.get('additional_info', '')
                }
            }
            docs.append(doc)
        return docs
    def _process_notable_people(self, data: Dict) -> List[Dict[str, Any]]:
        """Process notable people data."""
        docs = []
        for person in data.get('notable_space_contributors', []):
            doc = {
                'id': f"person_{person.get('name', '').replace(' ', '_').lower()}",
                'type': 'notable_person',
                'title': person.get('name', ''),
                'content': f"{person.get('name', '')}. {person.get('contribution', '')} Known for: {person.get('known_for', '')}. Country: {person.get('country', '')}",
                'category': 'space_pioneer',
                'metadata': {
                    'country': person.get('country', ''),
                    'contribution': person.get('contribution', ''),
                    'known_for': person.get('known_for', ''),
                    'birth_date': person.get('birth_date', ''),
                    'death_date': person.get('death_date', ''),
                    'awards': person.get('awards', [])
                }
            }
            docs.append(doc)
        return docs

    def _create_embeddings(self, documents: List[Dict[str, Any]]) -> np.ndarray:
        """Create embeddings for documents."""
        texts = [f"{doc['title']} {doc['content']}" for doc in documents]
        embeddings = self.model.encode(texts, show_progress_bar=True)
        return embeddings

    def _create_faiss_index(self, embeddings: np.ndarray) -> faiss.IndexFlatIP:
        """Create FAISS index for cosine similarity search."""
        # Normalize embeddings so inner product equals cosine similarity
        faiss.normalize_L2(embeddings)

        # Create index
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
        index.add(embeddings)

        return index

    def _load_or_create_index(self):
        """Load existing index or create new one."""
        if os.path.exists(self.index_path) and os.path.exists(self.docs_path):
            try:
                # Load existing index and documents
                self.index = faiss.read_index(self.index_path)
                with open(self.docs_path, 'rb') as f:
                    self.documents = pickle.load(f)
                print(f"Loaded existing knowledge base with {len(self.documents)} documents")
                return
            except Exception as e:
                print(f"Error loading existing index: {e}")

        # Create new index
        print("Creating new knowledge base...")
        self.documents = self._load_json_files()

        if not self.documents:
            print("No documents found!")
            return

        self.embeddings = self._create_embeddings(self.documents)
        self.index = self._create_faiss_index(self.embeddings)

        # Save index and documents
        faiss.write_index(self.index, self.index_path)
        with open(self.docs_path, 'wb') as f:
            pickle.dump(self.documents, f)

        print(f"Created knowledge base with {len(self.documents)} documents")

    def search(self, query: str, top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
        """Search for relevant documents using vector similarity."""
        if not self.index or not self.documents:
            return []

        # Create query embedding and normalize it to match the index
        query_embedding = self.model.encode([query])
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, top_k)

        # Return results with documents and scores
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if 0 <= idx < len(self.documents):
                results.append((self.documents[idx], float(score)))

        return results
    def get_context_for_query(self, query: str, max_context_length: int = 2000) -> str:
        """Get relevant context for a query to use with an LLM."""
        results = self.search(query, top_k=5)

        context_parts = []
        current_length = 0

        for doc, _score in results:
            doc_text = f"**{doc['type'].title()}: {doc['title']}**\n{doc['content']}\n"
            if current_length + len(doc_text) > max_context_length:
                break
            context_parts.append(doc_text)
            current_length += len(doc_text)

        return "\n".join(context_parts)

    def force_regenerate(self):
        """Force regeneration of the knowledge base."""
        print("🔄 Force regenerating knowledge base...")

        # Remove existing files
        if os.path.exists(self.index_path):
            os.remove(self.index_path)
        if os.path.exists(self.docs_path):
            os.remove(self.docs_path)

        # Recreate
        self._load_or_create_index()

        print(f"✅ Knowledge base regenerated with {len(self.documents)} documents")
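

# --- Illustrative usage (sketch, not part of the class above) ---
# A minimal example of how the knowledge base might be exercised, assuming the
# JSON files exist under ./data. The query strings below are hypothetical; with
# no data files present, search() simply returns an empty list.
if __name__ == "__main__":
    kb = SpaceKnowledgeBase(data_dir="data")

    # Vector search returns (document, cosine-similarity score) pairs
    for doc, score in kb.search("Which rocket carried astronauts to the Moon?", top_k=3):
        print(f"{score:.3f}  {doc['type']}: {doc['title']}")

    # Retrieve a length-limited context string for prompting an LLM
    print(kb.get_context_for_query("telescopes that observe in infrared"))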