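"""Vector-search knowledge base over space-related JSON data files.

Documents are embedded with sentence-transformers ('all-MiniLM-L6-v2') and
indexed in FAISS using inner product over L2-normalized vectors (i.e. cosine
similarity), so relevant context can be retrieved for LLM prompts.
"""
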
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Tuple
import os
import pickle


class SpaceKnowledgeBase:
    def __init__(self, data_dir: str = "data"):
        self.data_dir = data_dir
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.documents = []
        self.embeddings = None
        self.index_path = "knowledge_base_index.faiss"
        self.docs_path = "knowledge_base_docs.pkl"

        # Load or create knowledge base
        self._load_or_create_index()

    def _load_json_files(self) -> List[Dict[str, Any]]:
        """Load all JSON data files and extract documents"""
        documents = []

        # File mappings for different data types
        data_files = {
            'space_terminology.json': self._process_terminology,
            'space_agencies.json': self._process_agencies,
            'planets.json': self._process_planets,
            'rockets.json': self._process_rockets,
            'astronauts.json': self._process_astronauts,
            'telescopes.json': self._process_telescopes,
            'space_museams.json': self._process_museums,
            'notable_peoples.json': self._process_notable_people
        }

        for filename, processor in data_files.items():
            file_path = os.path.join(self.data_dir, filename)
            if os.path.exists(file_path):
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    # Process once and reuse the result instead of calling the processor twice
                    processed = processor(data)
                    documents.extend(processed)
                    print(f"Loaded {filename}: {len(processed)} documents")
                except Exception as e:
                    print(f"Error loading {filename}: {e}")

        return documents

    def _process_terminology(self, data: Dict) -> List[Dict[str, Any]]:
        """Process space terminology data"""
        docs = []
        for term in data.get('space_terms', []):
            doc = {
                'id': f"term_{term.get('id', '')}",
                'type': 'terminology',
                'title': term.get('term', ''),
                'content': f"{term.get('term', '')}. {term.get('short_description', '')} {term.get('detailed_description', '')}",
                'category': term.get('category', ''),
                'metadata': {
                    'category': term.get('category', ''),
                    'difficulty': term.get('difficulty_level', ''),
                    'related_terms': term.get('related_terms', [])
                }
            }
            docs.append(doc)
        return docs

    def _process_agencies(self, data: Dict) -> List[Dict[str, Any]]:
        """Process space agencies data"""
        docs = []
        for agency in data.get('space_agencies', []):
            doc = {
                'id': f"agency_{agency.get('id', '')}",
                'type': 'agency',
                'title': agency.get('name', ''),
                'content': f"{agency.get('full_name', '')}. {agency.get('description', '')} Founded: {agency.get('founded', '')}. Country: {agency.get('country', '')}",
                'category': agency.get('type', ''),
                'metadata': {
                    'country': agency.get('country', ''),
                    'founded': agency.get('founded', ''),
                    'type': agency.get('type', ''),
                    'headquarters': agency.get('headquarters', ''),
                    'budget': agency.get('annual_budget', '')
                }
            }
            docs.append(doc)
        return docs

    def _process_planets(self, data: Dict) -> List[Dict[str, Any]]:
        """Process planets data"""
        docs = []
        for planet in data.get('planets', []):
            doc = {
                'id': f"planet_{planet.get('id', '')}",
                'type': 'planet',
                'title': planet.get('name', ''),
                'content': f"{planet.get('name', '')}. {planet.get('description', '')} Distance from Sun: {planet.get('distance_from_sun', '')}. Type: {planet.get('type', '')}",
                'category': planet.get('type', ''),
                'metadata': {
                    'type': planet.get('type', ''),
                    'distance_from_sun': planet.get('distance_from_sun', ''),
                    'diameter': planet.get('diameter', ''),
                    'moons': planet.get('moons', ''),
                    'key_features': planet.get('key_features', [])
                }
            }
            docs.append(doc)
        return docs

    def _process_rockets(self, data: Dict) -> List[Dict[str, Any]]:
        """Process rockets data"""
        docs = []
        for rocket in data.get('rockets', []):
            doc = {
                'id': f"rocket_{rocket.get('id', '')}",
                'type': 'rocket',
                'title': rocket.get('name', ''),
                'content': f"{rocket.get('name', '')}. {rocket.get('description', '')} First flight: {rocket.get('first_flight_year', '')}. Purpose: {rocket.get('purpose', '')}",
                'category': rocket.get('type', ''),
                'metadata': {
                    'country_of_origin': rocket.get('country_of_origin', ''),
                    'operator': rocket.get('operator', ''),
                    'first_flight_year': rocket.get('first_flight_year', ''),
                    'payload_capacity': rocket.get('capacity_payload_kg', ''),
                    'active': rocket.get('active', False)
                }
            }
            docs.append(doc)
        return docs

    def _process_astronauts(self, data: Dict) -> List[Dict[str, Any]]:
        """Process astronauts data"""
        docs = []
        for astronaut in data.get('astronauts', []):
            doc = {
                'id': f"astronaut_{astronaut.get('id', '')}",
                'type': 'astronaut',
                'title': astronaut.get('name', ''),
                'content': f"{astronaut.get('name', '')}. {astronaut.get('description', '')} Agency: {astronaut.get('agency', '')}. Country: {astronaut.get('country', '')}",
                'category': astronaut.get('type', ''),
                'metadata': {
                    'country': astronaut.get('country', ''),
                    'agency': astronaut.get('agency', ''),
                    'birth_year': astronaut.get('birth_year', ''),
                    'missions_count': astronaut.get('missions_count', ''),
                    'achievements': astronaut.get('achievements', [])
                }
            }
            docs.append(doc)
        return docs

    def _process_telescopes(self, data: Dict) -> List[Dict[str, Any]]:
        """Process telescopes data"""
        docs = []
        for telescope in data.get('telescopes', []):
            doc = {
                'id': f"telescope_{telescope.get('id', '')}",
                'type': 'telescope',
                'title': telescope.get('name', ''),
                'content': f"{telescope.get('name', '')}. {telescope.get('description', '')} Type: {telescope.get('type', '')}. Status: {telescope.get('status', '')}",
                'category': telescope.get('type', ''),
                'metadata': {
                    'type': telescope.get('type', ''),
                    'country': telescope.get('country', ''),
                    'agency': telescope.get('agency', ''),
                    'year': telescope.get('year', ''),
                    'status': telescope.get('status', '')
                }
            }
            docs.append(doc)
        return docs

    def _process_museums(self, data: Dict) -> List[Dict[str, Any]]:
        """Process space museums data"""
        docs = []
        for museum in data.get('space_museums', []):
            doc = {
                'id': f"museum_{museum.get('name', '').replace(' ', '_').lower()}",
                'type': 'museum',
                'title': museum.get('name', ''),
                'content': f"{museum.get('name', '')}. {museum.get('famous_for', '')} Located in {museum.get('city_or_region', '')}, {museum.get('country', '')}. {museum.get('additional_info', '')}",
                'category': 'space_museum',
                'metadata': {
                    'country': museum.get('country', ''),
                    'city_or_region': museum.get('city_or_region', ''),
                    'famous_for': museum.get('famous_for', ''),
                    'established_year': museum.get('established_year', ''),
                    'annual_visitors': museum.get('annual_visitors', ''),
                    'additional_info': museum.get('additional_info', '')
                }
            }
            docs.append(doc)
        return docs

    def _process_notable_people(self, data: Dict) -> List[Dict[str, Any]]:
        """Process notable people data"""
        docs = []
        for person in data.get('notable_space_contributors', []):
            doc = {
                'id': f"person_{person.get('name', '').replace(' ', '_').lower()}",
                'type': 'notable_person',
                'title': person.get('name', ''),
                'content': f"{person.get('name', '')}. {person.get('contribution', '')} Known for: {person.get('known_for', '')}. Country: {person.get('country', '')}",
                'category': 'space_pioneer',
                'metadata': {
                    'country': person.get('country', ''),
                    'contribution': person.get('contribution', ''),
                    'known_for': person.get('known_for', ''),
                    'birth_date': person.get('birth_date', ''),
                    'death_date': person.get('death_date', ''),
                    'awards': person.get('awards', [])
                }
            }
            docs.append(doc)
        return docs

    def _create_embeddings(self, documents: List[Dict[str, Any]]) -> np.ndarray:
        """Create embeddings for documents"""
        texts = [f"{doc['title']} {doc['content']}" for doc in documents]
        embeddings = self.model.encode(texts, show_progress_bar=True)
        return embeddings

    def _create_faiss_index(self, embeddings: np.ndarray) -> faiss.IndexFlatIP:
        """Create FAISS index for cosine similarity search"""
        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(embeddings)

        # Create index
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
        index.add(embeddings)

        return index

    def _load_or_create_index(self):
        """Load existing index or create new one"""
        if os.path.exists(self.index_path) and os.path.exists(self.docs_path):
            try:
                # Load existing index and documents
                self.index = faiss.read_index(self.index_path)
                with open(self.docs_path, 'rb') as f:
                    self.documents = pickle.load(f)
                print(f"Loaded existing knowledge base with {len(self.documents)} documents")
                return
            except Exception as e:
                print(f"Error loading existing index: {e}")

        # Create new index
        print("Creating new knowledge base...")
        self.documents = self._load_json_files()

        if not self.documents:
            print("No documents found!")
            return

        self.embeddings = self._create_embeddings(self.documents)
        self.index = self._create_faiss_index(self.embeddings)

        # Save index and documents
        faiss.write_index(self.index, self.index_path)
        with open(self.docs_path, 'wb') as f:
            pickle.dump(self.documents, f)

        print(f"Created knowledge base with {len(self.documents)} documents")

    def search(self, query: str, top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
        """Search for relevant documents using vector similarity"""
        if not self.index or not self.documents:
            return []

        # Create query embedding
        query_embedding = self.model.encode([query])
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, top_k)

        # Return results with documents and scores
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if 0 <= idx < len(self.documents):
                results.append((self.documents[idx], float(score)))

        return results

    def get_context_for_query(self, query: str, max_context_length: int = 2000) -> str:
        """Get relevant context for a query to use with LLM"""
        results = self.search(query, top_k=5)

        context_parts = []
        current_length = 0

        for doc, score in results:
            doc_text = f"**{doc['type'].title()}: {doc['title']}**\n{doc['content']}\n"
            if current_length + len(doc_text) > max_context_length:
                break
            context_parts.append(doc_text)
            current_length += len(doc_text)

        return "\n".join(context_parts)

    def force_regenerate(self):
        """Force regeneration of the knowledge base"""
        print("🔄 Force regenerating knowledge base...")

        # Remove existing files
        if os.path.exists(self.index_path):
            os.remove(self.index_path)
        if os.path.exists(self.docs_path):
            os.remove(self.docs_path)

        # Recreate
        self._load_or_create_index()
        print(f"✅ Knowledge base regenerated with {len(self.documents)} documents")
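

if __name__ == "__main__":
    # Example usage sketch: assumes the JSON files listed in _load_json_files()
    # exist under ./data; the queries below are illustrative only.
    kb = SpaceKnowledgeBase(data_dir="data")

    # Top-3 matches with their cosine-similarity scores
    for doc, score in kb.search("Who operates the James Webb Space Telescope?", top_k=3):
        print(f"{score:.3f}  [{doc['type']}] {doc['title']}")

    # Build a context block suitable for prompting an LLM
    print(kb.get_context_for_query("reusable rockets", max_context_length=1000))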