"""
Core orchestration module for GetGit RAG + LLM Pipeline.

This module serves as the unified entry point for GetGit, coordinating
repository cloning, RAG-based analysis, and LLM-powered question answering.
It provides a simple API for end-to-end repository intelligence gathering.
"""

import os
import logging
from typing import Optional, List, Dict, Any
from pathlib import Path

from clone_repo import clone_repo
from repo_manager import RepositoryManager
from rag import (
    RepositoryChunker,
    SimpleEmbedding,
    SentenceTransformerEmbedding,
    Retriever,
    RAGConfig,
    generate_response,
)
from checkpoints import (
    load_checkpoints,
    evaluate_checkpoint,
    run_checkpoints,
    format_results_summary,
    CheckpointResult
)


# Configure logging
def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Configure logging for the core module.

    The root handler and message format are installed through
    ``logging.basicConfig``. Because ``basicConfig`` does nothing once the
    root logger already has handlers, the level is also set explicitly on
    the ``getgit.core`` logger so that repeated calls with a different level
    still change this module's verbosity.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            matched case-insensitively. A numeric level such as
            ``logging.DEBUG`` is tolerated as well. Any value that does not
            resolve to a numeric logging level falls back to INFO.

    Returns:
        Configured logger instance
    """
    if isinstance(level, int) and not isinstance(level, bool):
        # Already a numeric level (e.g. logging.DEBUG); use it directly.
        log_level = level
    else:
        resolved = getattr(logging, str(level).strip().upper(), None)
        # Only integer constants are real levels. Other uppercase attributes
        # of the logging module (e.g. BASIC_FORMAT) would otherwise be passed
        # to setLevel() and raise.
        log_level = resolved if isinstance(resolved, int) else logging.INFO

    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    logger = logging.getLogger('getgit.core')
    logger.setLevel(log_level)  # Explicitly set logger level
    return logger


# Module-level logger shared by every function in this file. It is configured
# at import time with the default "INFO" level; validate_checkpoints() and
# main() rebind it (via ``global logger``) using their ``log_level`` argument.
logger = setup_logging()


def initialize_repository(repo_url: str, local_path: str = "source_repo") -> str:
    """
    Ensure a working copy of the repository is available on disk.

    A RepositoryManager (data dir "data", cache dir ".rag_cache") is asked
    to prepare for ``repo_url`` before anything else happens. Judging by its
    name and return value, ``prepare_for_new_repo`` resets stored state when
    the URL differs from the previous one; its exact behaviour lives in the
    ``repo_manager`` module and is not visible here.

    Afterwards the repository is cloned only when ``local_path`` does not
    exist yet; an existing path is reused as-is.

    Args:
        repo_url: GitHub repository URL to clone
        local_path: Local path where repository will be stored

    Returns:
        ``local_path``, once it has been confirmed to be a directory

    Raises:
        ValueError: If ``local_path`` is not a directory after the
            clone/reuse step
        Exception: Any other failure is logged and re-raised unchanged
    """
    logger.info(f"Initializing repository from {repo_url}")

    try:
        manager = RepositoryManager(
            data_dir="data",
            repo_dir=local_path,
            cache_dir=".rag_cache"
        )

        # Let the manager decide whether state from another repo must go
        if manager.prepare_for_new_repo(repo_url):
            logger.info("Repository reset performed, will clone fresh copy")

        # Clone only when nothing is present at the target path
        if not os.path.exists(local_path):
            logger.info(f"Cloning repository to {local_path}")
            clone_repo(repo_url, local_path)
            logger.info(f"Repository successfully cloned to {local_path}")
        else:
            logger.info(f"Repository already exists at {local_path}, using existing copy")
            logger.debug(f"Skipping clone for existing repository at {local_path}")

        # The path must be a real directory before it is handed back
        if os.path.isdir(local_path):
            logger.debug(f"Repository initialized at {local_path}")
            return local_path

        raise ValueError(f"Repository path {local_path} is not a valid directory")

    except Exception as e:
        logger.error(f"Failed to initialize repository: {str(e)}")
        raise


def setup_rag(
    repo_path: str,
    repository_name: Optional[str] = None,
    config: Optional[RAGConfig] = None,
    use_sentence_transformer: bool = False
) -> Retriever:
    """
    Initialize chunker, embeddings, and retriever for RAG pipeline.

    The repository is chunked, an embedding model is selected, and the
    chunks are indexed into a newly created Retriever.

    Args:
        repo_path: Path to the repository to analyze
        repository_name: Optional name for the repository. When omitted, the
            final component of the absolute ``repo_path`` is used, so a path
            with a trailing separator (``"source_repo/"``) or a relative
            shorthand (``"."``) still yields the directory's actual name.
        config: Optional RAG configuration (``RAGConfig.default()`` is used
            if not provided)
        use_sentence_transformer: Whether to use SentenceTransformer
            embeddings. If constructing that model raises ImportError, the
            TF-IDF based SimpleEmbedding is used instead.

    Returns:
        Configured Retriever instance with indexed repository chunks

    Raises:
        ValueError: If chunking produces no chunks
        Exception: Any other failure during RAG initialization or indexing
            is logged and re-raised unchanged
    """
    logger.info(f"Setting up RAG pipeline for repository at {repo_path}")

    try:
        # Use default config if not provided
        if config is None:
            config = RAGConfig.default()
            logger.debug("Using default RAG configuration")

        # Determine repository name
        if repository_name is None:
            # os.path.basename() returns "" for a path ending in a separator
            # and "." for ".", so resolve to an absolute path first. Fall back
            # to the raw path if even that has no final component (e.g. "/").
            repository_name = (
                os.path.basename(os.path.abspath(repo_path)) or repo_path
            )
        logger.debug(f"Repository name: {repository_name}")

        # Step 1: Chunk the repository
        logger.info("Chunking repository content...")
        chunker = RepositoryChunker(repo_path, repository_name=repository_name)
        chunks = chunker.chunk_repository(config.chunking.file_patterns)
        logger.info(f"Created {len(chunks)} chunks from repository")

        if not chunks:
            logger.warning("No chunks created - repository may be empty or contain no supported file types")
            raise ValueError(
                "No chunks created from repository. Ensure the repository contains "
                f"files matching patterns: {config.chunking.file_patterns}"
            )

        # Step 2: Initialize embedding model
        logger.info("Initializing embedding model...")
        if use_sentence_transformer:
            try:
                embedding_model = SentenceTransformerEmbedding(config.embedding.model_name)
                logger.info(f"Using SentenceTransformer model: {config.embedding.model_name}")
            except ImportError:
                # Optional dependency missing: degrade to the built-in model
                logger.warning("sentence-transformers not available, falling back to SimpleEmbedding")
                embedding_model = SimpleEmbedding(max_features=config.embedding.embedding_dim)
        else:
            embedding_model = SimpleEmbedding(max_features=config.embedding.embedding_dim)
            logger.info("Using SimpleEmbedding (TF-IDF based)")

        # Step 3: Create retriever and index chunks
        logger.info("Creating retriever and indexing chunks...")
        retriever = Retriever(embedding_model)
        retriever.index_chunks(chunks, batch_size=config.embedding.batch_size)
        logger.info(f"Successfully indexed {len(retriever)} chunks")

        logger.debug("RAG pipeline setup complete")
        return retriever

    except Exception as e:
        logger.error(f"Failed to setup RAG pipeline: {str(e)}")
        raise


def answer_query(
    query: str,
    retriever: Retriever,
    top_k: int = 5,
    use_llm: bool = True,
    api_key: Optional[str] = None,
    model_name: str = "gemini-2.5-flash"
) -> Dict[str, Any]:
    """
    Look up the chunks most relevant to a question and optionally ask the
    LLM to answer it from them.

    Args:
        query: Natural language question about the repository
        retriever: Configured Retriever instance
        top_k: Number of relevant chunks to retrieve
        use_llm: Whether to call ``generate_response`` for an answer
        api_key: Optional API key, forwarded to ``generate_response``
        model_name: Name of the LLM model, forwarded to ``generate_response``

    Returns:
        Dictionary with the keys:
            - query: The original query
            - retrieved_chunks: One dict per hit (rank, file_path,
              chunk_type, score, start_line, end_line, metadata)
            - context: Chunk contents joined by a "---" separator line
            - response: The LLM answer, a fixed "nothing found" message
              when retrieval returns no hits, or None when the LLM was
              skipped or failed
            - error: Text of the exception raised during LLM generation,
              otherwise None

    Raises:
        Exception: Retrieval or result-assembly failures are logged and
            re-raised. LLM failures are captured in ``error`` instead.
    """
    logger.info(f"Processing query: '{query}'")

    try:
        # Step 1: fetch the best-matching chunks
        logger.info(f"Retrieving top {top_k} relevant chunks...")
        hits = retriever.retrieve(query, top_k=top_k)
        logger.info(f"Retrieved {len(hits)} relevant chunks")

        # Nothing matched: answer with the canned message and stop here
        if not hits:
            logger.warning("No relevant chunks found for query")
            return {
                'query': query,
                'retrieved_chunks': [],
                'context': '',
                'response': 'No relevant information found in the repository for this query.',
                'error': None
            }

        for hit in hits:
            logger.debug(
                f"Chunk {hit.rank}: {hit.chunk.file_path} "
                f"(score: {hit.score:.4f}, type: {hit.chunk.chunk_type.value})"
            )

        # Step 2: gather raw text for the prompt and a summary per hit
        context_chunks = [hit.chunk.content for hit in hits]

        retrieved_info = []
        for hit in hits:
            retrieved_info.append({
                'rank': hit.rank,
                'file_path': hit.chunk.file_path,
                'chunk_type': hit.chunk.chunk_type.value,
                'score': hit.score,
                'start_line': hit.chunk.start_line,
                'end_line': hit.chunk.end_line,
                'metadata': hit.chunk.metadata
            })

        # Step 3: optional LLM call; a failure is reported, not raised
        answer = None
        failure = None

        if not use_llm:
            logger.debug("LLM response generation skipped (use_llm=False)")
        else:
            logger.info("Generating LLM response...")
            try:
                answer = generate_response(
                    query,
                    context_chunks,
                    model_name=model_name,
                    api_key=api_key
                )
                logger.info("LLM response generated successfully")
                logger.debug(f"Response length: {len(answer)} characters")
            except Exception as exc:
                failure = str(exc)
                logger.error(f"Failed to generate LLM response: {failure}")
                # Discard any partial value assigned before the failure
                answer = None

        combined_context = '\n\n---\n\n'.join(context_chunks)
        return {
            'query': query,
            'retrieved_chunks': retrieved_info,
            'context': combined_context,
            'response': answer,
            'error': failure
        }

    except Exception as e:
        logger.error(f"Failed to process query: {str(e)}")
        raise


def validate_checkpoints(
    repo_url: str,
    checkpoints_file: str = "checkpoints.txt",
    local_path: str = "source_repo",
    use_llm: bool = True,
    log_level: str = "INFO",
    config: Optional[RAGConfig] = None,
    stop_on_failure: bool = False
) -> Dict[str, Any]:
    """
    Run the checkpoint-validation pipeline against a repository.

    Steps, in order:
    1. Clone or reuse the repository (``initialize_repository``)
    2. Build and index the RAG retriever (``setup_rag``)
    3. Read the checkpoints file (``load_checkpoints``)
    4. Evaluate the checkpoints (``run_checkpoints``)
    5. Summarise the results and compute pass statistics

    Args:
        repo_url: GitHub repository URL
        checkpoints_file: Path to checkpoints text file
        local_path: Local path for repository storage
        use_llm: Forwarded to ``run_checkpoints``
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR); the module
            logger is reconfigured with it on every call
        config: Optional RAG configuration
        stop_on_failure: Forwarded to ``run_checkpoints``

    Returns:
        Dictionary containing:
            - checkpoints: Whatever ``load_checkpoints`` returned
            - results: Whatever ``run_checkpoints`` returned
            - summary: Output of ``format_results_summary``
            - passed_count: Number of results whose ``passed`` is truthy
            - total_count: Number of results
            - pass_rate: Passed percentage (0 when there are no results)

    Raises:
        FileNotFoundError: Presumably raised by ``load_checkpoints`` when the
            file is missing (defined in the ``checkpoints`` module)
        Exception: Any pipeline failure is logged and re-raised

    Example:
        >>> result = validate_checkpoints(
        ...     repo_url="https://github.com/user/repo.git",
        ...     checkpoints_file="checkpoints.txt",
        ...     use_llm=True
        ... )
        >>> print(result['summary'])
    """
    # Rebind the module logger so the requested verbosity takes effect
    global logger
    logger = setup_logging(log_level)

    rule = "="*70

    logger.info(rule)
    logger.info("GetGit Checkpoint Validation Pipeline Starting")
    logger.info(rule)
    logger.info(f"Repository: {repo_url}")
    logger.info(f"Checkpoints File: {checkpoints_file}")
    logger.info(f"LLM Enabled: {use_llm}")
    logger.info(rule)

    try:
        # Stage 1 of 4: repository on disk
        logger.info("\n[1/4] Initializing repository...")
        repo_path = initialize_repository(repo_url, local_path)
        logger.info(f"✓ Repository ready at {repo_path}")

        # Stage 2 of 4: retriever over the repository contents
        logger.info("\n[2/4] Setting up RAG pipeline...")
        retriever = setup_rag(repo_path, config=config)
        logger.info(f"✓ RAG pipeline ready with {len(retriever)} indexed chunks")

        # Stage 3 of 4: checkpoint definitions
        logger.info("\n[3/4] Loading checkpoints...")
        checkpoints = load_checkpoints(checkpoints_file)
        logger.info(f"✓ Loaded {len(checkpoints)} checkpoints")

        # Stage 4 of 4: evaluation
        logger.info("\n[4/4] Running checkpoint validation...")
        results = run_checkpoints(
            checkpoints=checkpoints,
            repo_path=repo_path,
            retriever=retriever,
            use_llm=use_llm,
            stop_on_failure=stop_on_failure
        )
        logger.info("✓ Checkpoint validation completed")

        summary = format_results_summary(results)

        # Pass statistics; an empty result set yields a rate of 0
        passed_count = len([outcome for outcome in results if outcome.passed])
        total_count = len(results)
        if total_count > 0:
            pass_rate = passed_count / total_count * 100
        else:
            pass_rate = 0

        logger.info("\n" + rule)
        logger.info("GetGit Checkpoint Validation Pipeline Completed")
        logger.info(f"Results: {passed_count}/{total_count} passed ({pass_rate:.1f}%)")
        logger.info(rule)

        report = {
            'checkpoints': checkpoints,
            'results': results,
            'summary': summary,
            'passed_count': passed_count,
            'total_count': total_count,
            'pass_rate': pass_rate
        }
        return report

    except Exception as e:
        logger.error("\n" + rule)
        logger.error("GetGit Checkpoint Validation Pipeline Failed")
        logger.error(f"Error: {str(e)}")
        logger.error(rule)
        raise


def main(
    repo_url: str,
    query: str,
    local_path: str = "source_repo",
    use_llm: bool = True,
    top_k: int = 5,
    log_level: str = "INFO",
    config: Optional[RAGConfig] = None
) -> Dict[str, Any]:
    """
    Run the complete question-answering pipeline for one query.

    Steps, in order:
    1. Clone or reuse the repository (``initialize_repository``)
    2. Build and index the RAG retriever (``setup_rag``)
    3. Retrieve context and, if enabled, generate the LLM answer
       (``answer_query``)

    Args:
        repo_url: GitHub repository URL
        query: Natural language question about the repository
        local_path: Local path for repository storage
        use_llm: Whether to generate LLM responses
        top_k: Number of relevant chunks to retrieve
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR); the module
            logger is reconfigured with it on every call
        config: Optional RAG configuration

    Returns:
        The dictionary produced by ``answer_query``

    Raises:
        Exception: Any pipeline failure is logged and re-raised

    Example:
        >>> result = main(
        ...     repo_url="https://github.com/user/repo.git",
        ...     query="How do I install this project?",
        ...     use_llm=True
        ... )
        >>> print(result['response'])
    """
    # Rebind the module logger so the requested verbosity takes effect
    global logger
    logger = setup_logging(log_level)

    rule = "="*70

    logger.info(rule)
    logger.info("GetGit Core Pipeline Starting")
    logger.info(rule)
    logger.info(f"Repository: {repo_url}")
    logger.info(f"Query: {query}")
    logger.info(f"LLM Enabled: {use_llm}")
    logger.info(rule)

    try:
        # Stage 1 of 3: repository on disk
        logger.info("\n[1/3] Initializing repository...")
        repo_path = initialize_repository(repo_url, local_path)
        logger.info(f"✓ Repository ready at {repo_path}")

        # Stage 2 of 3: retriever over the repository contents
        logger.info("\n[2/3] Setting up RAG pipeline...")
        retriever = setup_rag(repo_path, config=config)
        logger.info(f"✓ RAG pipeline ready with {len(retriever)} indexed chunks")

        # Stage 3 of 3: retrieval plus optional answer generation
        logger.info("\n[3/3] Processing query...")
        outcome = answer_query(
            query=query,
            retriever=retriever,
            top_k=top_k,
            use_llm=use_llm
        )
        logger.info("✓ Query processed successfully")

        logger.info("\n" + rule)
        logger.info("GetGit Core Pipeline Completed Successfully")
        logger.info(rule)

        return outcome

    except Exception as e:
        logger.error("\n" + rule)
        logger.error("GetGit Core Pipeline Failed")
        logger.error(f"Error: {str(e)}")
        logger.error(rule)
        raise


if __name__ == "__main__":
    # Example usage of the core module.
    #
    # Usage: python <this file> [repo_url] [query]
    # Both arguments are optional and fall back to the defaults below.
    # For real CLI integration, consider using argparse or similar.
    import sys

    DEFAULT_REPO_URL = "https://github.com/samarthnaikk/getgit.git"
    DEFAULT_QUERY = "What is this project about?"

    # Positional arguments: first is the repository URL, second the query
    cli_args = sys.argv[1:]
    repo_url = cli_args[0] if cli_args else DEFAULT_REPO_URL
    query = cli_args[1] if len(cli_args) > 1 else DEFAULT_QUERY

    print("\nGetGit - Repository Intelligence System")
    print("="*70)
    print(f"Repository: {repo_url}")
    print(f"Query: {query}")
    print("="*70 + "\n")

    try:
        # Run the pipeline
        result = main(
            repo_url=repo_url,
            query=query,
            use_llm=True,
            log_level="INFO"
        )

        # Display results
        print("\n" + "="*70)
        print("RESULTS")
        print("="*70)

        print(f"\nQuery: {result['query']}")
        print(f"\nRetrieved {len(result['retrieved_chunks'])} relevant chunks:")
        for chunk_info in result['retrieved_chunks'][:3]:  # Show top 3
            print(f"  - {chunk_info['file_path']} (score: {chunk_info['score']:.4f})")

        if result['response']:
            print("\n" + "-"*70)
            print("ANSWER:")
            print("-"*70)
            print(result['response'])
        elif result['error']:
            print("\n" + "-"*70)
            print("ERROR:")
            print("-"*70)
            print(f"Failed to generate LLM response: {result['error']}")
            print("\nShowing retrieved context instead:")
            print("-"*70)
            # Show snippet of context
            context_preview = result['context'][:500]
            if len(result['context']) > 500:
                context_preview += "..."
            print(context_preview)
        else:
            # Neither an answer nor an error (e.g. an empty LLM response):
            # say so instead of ending the report without explanation.
            print("\nNo response was generated for this query.")

        print("\n" + "="*70)

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code
        print(f"\n✗ Error: {str(e)}", file=sys.stderr)
        sys.exit(1)