"""
Vector Store implementation using LlamaIndex and Chroma for semantic code search.
"""

import os
import logging
from typing import List, Dict, Optional
from pathlib import Path

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, Document
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
import warnings

from .embeddings import get_embedding_model
from src.config import AIManager

# Suppress deprecation warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='llama_index.llms.gemini')
warnings.filterwarnings('ignore', category=DeprecationWarning, module='llama_index.embeddings.gemini')

logger = logging.getLogger(__name__)


class CodeSearchEngine:
    """
    Semantic code search engine using LlamaIndex + Chroma vector store.
    Enables finding similar legacy patterns across large codebases.
    """
    
    def __init__(self, persist_dir: Optional[str] = None, use_modal: bool = True):
        """
        Initialize the code search engine.
        
        Args:
            persist_dir: Optional directory to persist Chroma database
            use_modal: If True, use Modal embedding as primary (default: True)
        """
        self.persist_dir = persist_dir
        self.index: Optional[VectorStoreIndex] = None
        self.chroma_client = None
        self.chroma_collection = None
        self.use_modal = use_modal
        
        # Configure embeddings (Modal primary, Gemini fallback)
        try:
            Settings.embed_model = get_embedding_model(prefer_modal=use_modal)
        except Exception as e:
            logger.warning(f"Failed to initialize preferred embedding, using Gemini: {e}")
            Settings.embed_model = get_embedding_model(force_gemini=True)
            self.use_modal = False
        
        # Configure LLM using centralized AIManager
        self.ai_manager = AIManager()
        
        # Set up LlamaIndex LLM based on provider
        if self.ai_manager.provider_name == "gemini":
            from llama_index.llms.gemini import Gemini
            Settings.llm = Gemini(
                model=self.ai_manager.model_name,
                api_key=os.getenv("GEMINI_API_KEY"),
                temperature=0.1
            )
        elif self.ai_manager.provider_name in ["nebius", "openai"]:
            from llama_index.llms.openai import OpenAI
            if self.ai_manager.provider_name == "nebius":
                # Use gpt-3.5-turbo as placeholder to pass LlamaIndex validation
                # The actual model is passed via additional_kwargs
                Settings.llm = OpenAI(
                    model="gpt-3.5-turbo",
                    api_key=os.getenv("NEBIUS_API_KEY"),
                    api_base="https://api.tokenfactory.nebius.com/v1/",
                    temperature=0.1,
                    additional_kwargs={"model": self.ai_manager.model_name}
                )
            else:
                Settings.llm = OpenAI(
                    model=self.ai_manager.model_name,
                    api_key=os.getenv("OPENAI_API_KEY"),
                    temperature=0.1
                )
        
        embedding_type = "Modal (primary)" if self.use_modal else "Gemini (fallback)"
        logger.info(f"CodeSearchEngine initialized with {embedding_type} embeddings and {self.ai_manager.provider_name} LLM")
    
    def build_index(self, repo_path: str, file_extensions: Optional[List[str]] = None) -> VectorStoreIndex:
        """
        Build searchable index of codebase.
        
        Args:
            repo_path: Path to repository to index
            file_extensions: Optional list of file extensions to include (e.g., ['.py', '.java'])
        
        Returns:
            VectorStoreIndex for querying
        """
        logger.info(f"Building code index for: {repo_path}")
        
        # Initialize Chroma client
        if self.persist_dir:
            self.chroma_client = chromadb.PersistentClient(path=self.persist_dir)
        else:
            self.chroma_client = chromadb.EphemeralClient()
        
        # Create or get collection
        collection_name = "code_embeddings"
        try:
            self.chroma_collection = self.chroma_client.get_or_create_collection(collection_name)
        except Exception as e:
            logger.warning(f"Error with collection, creating new: {e}")
            self.chroma_collection = self.chroma_client.create_collection(collection_name)
        
        vector_store = ChromaVectorStore(chroma_collection=self.chroma_collection)
        
        # Load documents from repository
        documents = self._load_code_files(repo_path, file_extensions)
        
        if not documents:
            logger.warning(f"No code files found in {repo_path}")
            return None
        
        logger.info(f"Loaded {len(documents)} code files")
        
        # Build index (using default text splitter instead of CodeSplitter to avoid tree-sitter dependency)
        try:
            self.index = VectorStoreIndex.from_documents(
                documents,
                vector_store=vector_store,
                show_progress=True
            )
            logger.info("Code index built successfully")
        except Exception as e:
            if self.use_modal:
                logger.warning(f"Modal embedding failed during indexing: {e}")
                logger.info("Retrying with Gemini embeddings...")
                
                # Switch to Gemini
                Settings.embed_model = get_embedding_model(force_gemini=True)
                self.use_modal = False
                
                # Retry building index
                self.index = VectorStoreIndex.from_documents(
                    documents,
                    vector_store=vector_store,
                    show_progress=True
                )
                logger.info("Code index built successfully with Gemini embeddings")
            else:
                raise
        
        return self.index
    
    def _load_code_files(self, repo_path: str, file_extensions: Optional[List[str]] = None) -> List[Document]:
        """
        Load code files from repository.
        
        Args:
            repo_path: Path to repository
            file_extensions: Optional list of extensions to include
        
        Returns:
            List of Document objects
        """
        documents = []
        repo_path = Path(repo_path)
        
        # Default extensions if not specified
        if file_extensions is None:
            file_extensions = [
                # Python
                '.py', '.pyw', '.pyx',
                # Java
                '.java',
                # JavaScript/TypeScript
                '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs',
                # PHP
                '.php', '.php3', '.php4', '.php5', '.phtml',
                # Ruby
                '.rb', '.rbw',
                # Go
                '.go',
                # C/C++
                '.c', '.cpp', '.cc', '.cxx', '.c++', '.h', '.hpp', '.hh', '.hxx', '.h++',
                # C#
                '.cs',
                # Rust
                '.rs',
                # Kotlin
                '.kt', '.kts',
                # Swift
                '.swift',
                # Scala
                '.scala', '.sc',
                # R
                '.r', '.R',
                # Perl
                '.pl', '.pm', '.t', '.pod',
                # Shell
                '.sh', '.bash', '.zsh', '.fish'
            ]
        
        # Walk through directory
        for file_path in repo_path.rglob('*'):
            if file_path.is_file() and file_path.suffix in file_extensions:
                try:
                    # Skip hidden files and common non-code directories
                    if any(part.startswith('.') for part in file_path.parts):
                        continue
                    if any(part in ['node_modules', 'venv', '__pycache__', 'build', 'dist'] 
                           for part in file_path.parts):
                        continue
                    
                    # Read file content
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                    
                    # Create document with metadata
                    doc = Document(
                        text=content,
                        metadata={
                            'file_path': str(file_path.relative_to(repo_path)),
                            'file_name': file_path.name,
                            'extension': file_path.suffix,
                            'size': len(content)
                        }
                    )
                    documents.append(doc)
                    
                except Exception as e:
                    logger.warning(f"Error reading {file_path}: {e}")
        
        return documents
    
    def find_similar_patterns(self, pattern_query: str, top_k: int = 20) -> List[Dict]:
        """
        Find files with similar legacy patterns.
        
        Args:
            pattern_query: Natural language query describing the pattern
            top_k: Number of results to return
        
        Returns:
            List of dictionaries with file paths and relevance scores
        """
        if not self.index:
            raise ValueError("Index not built. Call build_index() first.")
        
        logger.info(f"Searching for pattern: {pattern_query}")
        
        # Create query engine
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="tree_summarize"
        )
        
        # Execute query
        response = query_engine.query(pattern_query)
        
        # Extract source files and scores
        results = []
        for node in response.source_nodes:
            results.append({
                'file_path': node.metadata.get('file_path', 'unknown'),
                'file_name': node.metadata.get('file_name', 'unknown'),
                'score': node.score,
                'text_snippet': node.text[:200] + '...' if len(node.text) > 200 else node.text
            })
        
        logger.info(f"Found {len(results)} matching files")
        return results
    
    def analyze_pattern_with_context(self, pattern_query: str, files: List[str]) -> str:
        """
        Deep analysis of legacy pattern with full context retrieval.
        
        Args:
            pattern_query: Description of the pattern to analyze
            files: List of file paths to analyze
        
        Returns:
            Analysis result from Gemini
        """
        if not self.index:
            raise ValueError("Index not built. Call build_index() first.")
        
        logger.info(f"Analyzing pattern with context: {pattern_query}")
        
        # Build enhanced query with file context
        enhanced_query = f"""
        Analyze the following legacy code pattern and provide:
        1. What the code currently does
        2. Why it's problematic (security, performance, maintainability)
        3. Modern equivalent (recommended library/pattern)
        4. Migration steps with risk assessment
        
        Pattern to analyze: {pattern_query}
        Files to focus on: {', '.join(files)}
        
        Provide detailed analysis in JSON format with keys:
        - analysis: Overall analysis
        - issues: List of specific issues
        - recommendation: Recommended modern approach
        - steps: Migration steps
        - risks: Risk assessment
        """
        
        # Create query engine with custom prompt
        query_engine = self.index.as_query_engine(
            similarity_top_k=10,
            response_mode="compact"
        )
        
        # Execute analysis
        response = query_engine.query(enhanced_query)
        
        return response.response
    
    def get_transformation_examples(self, pattern_type: str, top_k: int = 5) -> List[Dict]:
        """
        Retrieve examples of successful transformations for a pattern type.
        
        Args:
            pattern_type: Type of pattern (e.g., "MySQLdb to SQLAlchemy")
            top_k: Number of examples to retrieve
        
        Returns:
            List of example transformations
        """
        if not self.index:
            raise ValueError("Index not built. Call build_index() first.")
        
        query = f"Find examples of code that was successfully transformed from {pattern_type}"
        
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="compact"
        )
        
        response = query_engine.query(query)
        
        # Extract examples from source nodes
        examples = []
        for node in response.source_nodes:
            examples.append({
                'file_path': node.metadata.get('file_path', 'unknown'),
                'code_snippet': node.text,
                'score': node.score
            })
        
        return examples