Spaces:
Runtime error
Runtime error
| """ | |
| Core orchestration module for GetGit RAG + LLM Pipeline. | |
| This module serves as the unified entry point for GetGit, coordinating | |
| repository cloning, RAG-based analysis, and LLM-powered question answering. | |
| It provides a simple API for end-to-end repository intelligence gathering. | |
| """ | |
| import os | |
| import logging | |
| from typing import Optional, List, Dict, Any | |
| from pathlib import Path | |
| from clone_repo import clone_repo | |
| from repo_manager import RepositoryManager | |
| from rag import ( | |
| RepositoryChunker, | |
| SimpleEmbedding, | |
| SentenceTransformerEmbedding, | |
| Retriever, | |
| RAGConfig, | |
| generate_response, | |
| ) | |
| from checkpoints import ( | |
| load_checkpoints, | |
| evaluate_checkpoint, | |
| run_checkpoints, | |
| format_results_summary, | |
| CheckpointResult | |
| ) | |
| # Configure logging | |
def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Configure logging for the core module.

    Args:
        level: Logging level (DEBUG, INFO, WARNING, ERROR)

    Returns:
        Configured logger instance
    """
    # Unknown level names silently fall back to INFO.
    resolved = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(
        level=resolved,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    core_logger = logging.getLogger('getgit.core')
    # basicConfig only touches the root logger, so set this one explicitly.
    core_logger.setLevel(resolved)
    return core_logger
| # Initialize module logger | |
| logger = setup_logging() | |
def initialize_repository(repo_url: str, local_path: str = "source_repo") -> str:
    """
    Clone or load the repository and prepare it for analysis.

    Repository persistence and validation performed here:
    - Checks if the repository URL has changed
    - Cleans up old data if a new repository is provided
    - Stores the current repository URL for future validation

    Args:
        repo_url: GitHub repository URL to clone
        local_path: Local path where repository will be stored

    Returns:
        Path to the cloned/loaded repository

    Raises:
        Exception: If repository cloning or loading fails
    """
    logger.info(f"Initializing repository from {repo_url}")
    try:
        manager = RepositoryManager(
            data_dir="data",
            repo_dir=local_path,
            cache_dir=".rag_cache",
        )
        # prepare_for_new_repo returns True when the stored URL differed
        # and old artifacts were purged, forcing a fresh clone below.
        if manager.prepare_for_new_repo(repo_url):
            logger.info("Repository reset performed, will clone fresh copy")

        if not os.path.exists(local_path):
            logger.info(f"Cloning repository to {local_path}")
            clone_repo(repo_url, local_path)
            logger.info(f"Repository successfully cloned to {local_path}")
        else:
            logger.info(f"Repository already exists at {local_path}, using existing copy")
            logger.debug(f"Skipping clone for existing repository at {local_path}")

        # The path must be a real directory before downstream indexing runs.
        if not os.path.isdir(local_path):
            raise ValueError(f"Repository path {local_path} is not a valid directory")

        logger.debug(f"Repository initialized at {local_path}")
        return local_path
    except Exception as e:
        logger.error(f"Failed to initialize repository: {str(e)}")
        raise
def setup_rag(
    repo_path: str,
    repository_name: Optional[str] = None,
    config: Optional[RAGConfig] = None,
    use_sentence_transformer: bool = False
) -> Retriever:
    """
    Initialize chunker, embeddings, and retriever for RAG pipeline.

    Args:
        repo_path: Path to the repository to analyze
        repository_name: Optional name for the repository
        config: Optional RAG configuration (uses default if not provided)
        use_sentence_transformer: Whether to use SentenceTransformer embeddings

    Returns:
        Configured Retriever instance with indexed repository chunks

    Raises:
        Exception: If RAG initialization or indexing fails
    """
    logger.info(f"Setting up RAG pipeline for repository at {repo_path}")
    try:
        if config is None:
            config = RAGConfig.default()
            logger.debug("Using default RAG configuration")
        if repository_name is None:
            # Fall back to the directory name when no explicit name is given.
            repository_name = os.path.basename(repo_path)
        logger.debug(f"Repository name: {repository_name}")

        # Phase 1: split the repository into retrievable chunks.
        logger.info("Chunking repository content...")
        chunker = RepositoryChunker(repo_path, repository_name=repository_name)
        chunks = chunker.chunk_repository(config.chunking.file_patterns)
        logger.info(f"Created {len(chunks)} chunks from repository")
        if not chunks:
            logger.warning("No chunks created - repository may be empty or contain no supported file types")
            raise ValueError(
                "No chunks created from repository. Ensure the repository contains "
                f"files matching patterns: {config.chunking.file_patterns}"
            )

        # Phase 2: pick an embedding backend. SentenceTransformer is opt-in
        # and degrades gracefully to the TF-IDF model when unavailable.
        logger.info("Initializing embedding model...")
        if not use_sentence_transformer:
            embedding_model = SimpleEmbedding(max_features=config.embedding.embedding_dim)
            logger.info("Using SimpleEmbedding (TF-IDF based)")
        else:
            try:
                embedding_model = SentenceTransformerEmbedding(config.embedding.model_name)
                logger.info(f"Using SentenceTransformer model: {config.embedding.model_name}")
            except ImportError:
                logger.warning("sentence-transformers not available, falling back to SimpleEmbedding")
                embedding_model = SimpleEmbedding(max_features=config.embedding.embedding_dim)

        # Phase 3: embed and index every chunk for similarity search.
        logger.info("Creating retriever and indexing chunks...")
        retriever = Retriever(embedding_model)
        retriever.index_chunks(chunks, batch_size=config.embedding.batch_size)
        logger.info(f"Successfully indexed {len(retriever)} chunks")

        logger.debug("RAG pipeline setup complete")
        return retriever
    except Exception as e:
        logger.error(f"Failed to setup RAG pipeline: {str(e)}")
        raise
def answer_query(
    query: str,
    retriever: Retriever,
    top_k: int = 5,
    use_llm: bool = True,
    api_key: Optional[str] = None,
    model_name: str = "gemini-2.5-flash"
) -> Dict[str, Any]:
    """
    Retrieve relevant context and generate an LLM response for the query.

    Args:
        query: Natural language question about the repository
        retriever: Configured Retriever instance
        top_k: Number of relevant chunks to retrieve
        use_llm: Whether to generate LLM response (requires API key)
        api_key: Optional API key for LLM (reads from env if not provided)
        model_name: Name of the LLM model to use

    Returns:
        Dictionary containing:
            - query: The original query
            - retrieved_chunks: List of retrieved chunk information
            - context: Combined context from retrieved chunks
            - response: Generated LLM response (if use_llm=True)
            - error: Error message if LLM generation fails

    Raises:
        Exception: If query processing fails
    """
    logger.info(f"Processing query: '{query}'")
    try:
        # Retrieval: rank indexed chunks against the query text.
        logger.info(f"Retrieving top {top_k} relevant chunks...")
        hits = retriever.retrieve(query, top_k=top_k)
        logger.info(f"Retrieved {len(hits)} relevant chunks")

        if not hits:
            # No context means nothing useful to send to the LLM; short-circuit.
            logger.warning("No relevant chunks found for query")
            return {
                'query': query,
                'retrieved_chunks': [],
                'context': '',
                'response': 'No relevant information found in the repository for this query.',
                'error': None,
            }

        for hit in hits:
            logger.debug(
                f"Chunk {hit.rank}: {hit.chunk.file_path} "
                f"(score: {hit.score:.4f}, type: {hit.chunk.chunk_type.value})"
            )

        # Collect raw chunk text plus per-chunk metadata for the caller.
        context_chunks = []
        retrieved_info = []
        for hit in hits:
            context_chunks.append(hit.chunk.content)
            retrieved_info.append({
                'rank': hit.rank,
                'file_path': hit.chunk.file_path,
                'chunk_type': hit.chunk.chunk_type.value,
                'score': hit.score,
                'start_line': hit.chunk.start_line,
                'end_line': hit.chunk.end_line,
                'metadata': hit.chunk.metadata,
            })

        # Generation: optional; failures are surfaced via 'error', not raised.
        response_text = None
        error = None
        if not use_llm:
            logger.debug("LLM response generation skipped (use_llm=False)")
        else:
            logger.info("Generating LLM response...")
            try:
                response_text = generate_response(
                    query,
                    context_chunks,
                    model_name=model_name,
                    api_key=api_key,
                )
                logger.info("LLM response generated successfully")
                logger.debug(f"Response length: {len(response_text)} characters")
            except Exception as e:
                error = str(e)
                logger.error(f"Failed to generate LLM response: {error}")
                response_text = None

        return {
            'query': query,
            'retrieved_chunks': retrieved_info,
            'context': '\n\n---\n\n'.join(context_chunks),
            'response': response_text,
            'error': error,
        }
    except Exception as e:
        logger.error(f"Failed to process query: {str(e)}")
        raise
def validate_checkpoints(
    repo_url: str,
    checkpoints_file: str = "checkpoints.txt",
    local_path: str = "source_repo",
    use_llm: bool = True,
    log_level: str = "INFO",
    config: Optional[RAGConfig] = None,
    stop_on_failure: bool = False
) -> Dict[str, Any]:
    """
    Validate repository against checkpoints defined in a text file.

    Pipeline stages:
    1. Repository cloning/loading
    2. RAG initialization and indexing
    3. Loading checkpoints from file
    4. Sequential checkpoint evaluation
    5. Results aggregation and reporting

    Args:
        repo_url: GitHub repository URL
        checkpoints_file: Path to checkpoints text file
        local_path: Local path for repository storage
        use_llm: Whether to use LLM for checkpoint evaluation
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
        config: Optional RAG configuration
        stop_on_failure: Stop processing on first checkpoint failure

    Returns:
        Dictionary containing:
            - checkpoints: List of checkpoint strings
            - results: List of CheckpointResult objects
            - summary: Formatted summary string
            - passed_count: Number of passed checkpoints
            - total_count: Total number of checkpoints
            - pass_rate: Percentage of passed checkpoints

    Raises:
        FileNotFoundError: If checkpoints file doesn't exist
        Exception: If any step of the pipeline fails

    Example:
        >>> result = validate_checkpoints(
        ...     repo_url="https://github.com/user/repo.git",
        ...     checkpoints_file="checkpoints.txt",
        ...     use_llm=True
        ... )
        >>> print(result['summary'])
    """
    # Reconfigure the module-level logger at the requested verbosity.
    global logger
    logger = setup_logging(log_level)

    banner = "=" * 70
    logger.info(banner)
    logger.info("GetGit Checkpoint Validation Pipeline Starting")
    logger.info(banner)
    logger.info(f"Repository: {repo_url}")
    logger.info(f"Checkpoints File: {checkpoints_file}")
    logger.info(f"LLM Enabled: {use_llm}")
    logger.info(banner)

    try:
        logger.info("\n[1/4] Initializing repository...")
        repo_path = initialize_repository(repo_url, local_path)
        logger.info(f"✓ Repository ready at {repo_path}")

        logger.info("\n[2/4] Setting up RAG pipeline...")
        retriever = setup_rag(repo_path, config=config)
        logger.info(f"✓ RAG pipeline ready with {len(retriever)} indexed chunks")

        logger.info("\n[3/4] Loading checkpoints...")
        checkpoints = load_checkpoints(checkpoints_file)
        logger.info(f"✓ Loaded {len(checkpoints)} checkpoints")

        logger.info("\n[4/4] Running checkpoint validation...")
        results = run_checkpoints(
            checkpoints=checkpoints,
            repo_path=repo_path,
            retriever=retriever,
            use_llm=use_llm,
            stop_on_failure=stop_on_failure,
        )
        logger.info("✓ Checkpoint validation completed")

        summary = format_results_summary(results)

        # Aggregate pass/fail statistics; guard the empty-results division.
        total_count = len(results)
        passed_count = sum(1 for r in results if r.passed)
        pass_rate = (passed_count / total_count * 100) if total_count > 0 else 0

        logger.info("\n" + banner)
        logger.info("GetGit Checkpoint Validation Pipeline Completed")
        logger.info(f"Results: {passed_count}/{total_count} passed ({pass_rate:.1f}%)")
        logger.info(banner)

        return {
            'checkpoints': checkpoints,
            'results': results,
            'summary': summary,
            'passed_count': passed_count,
            'total_count': total_count,
            'pass_rate': pass_rate,
        }
    except Exception as e:
        logger.error("\n" + banner)
        logger.error("GetGit Checkpoint Validation Pipeline Failed")
        logger.error(f"Error: {str(e)}")
        logger.error(banner)
        raise
def main(
    repo_url: str,
    query: str,
    local_path: str = "source_repo",
    use_llm: bool = True,
    top_k: int = 5,
    log_level: str = "INFO",
    config: Optional[RAGConfig] = None
) -> Dict[str, Any]:
    """
    Orchestrates the full GetGit pipeline from repository input to answer generation.

    This is the main entry point that coordinates:
    1. Repository cloning/loading
    2. RAG initialization and indexing
    3. Query processing and context retrieval
    4. LLM response generation

    Args:
        repo_url: GitHub repository URL
        query: Natural language question about the repository
        local_path: Local path for repository storage
        use_llm: Whether to generate LLM responses
        top_k: Number of relevant chunks to retrieve
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
        config: Optional RAG configuration

    Returns:
        Dictionary containing query results and response

    Raises:
        Exception: If any step of the pipeline fails

    Example:
        >>> result = main(
        ...     repo_url="https://github.com/user/repo.git",
        ...     query="How do I install this project?",
        ...     use_llm=True
        ... )
        >>> print(result['response'])
    """
    # Reconfigure the module-level logger at the requested verbosity.
    global logger
    logger = setup_logging(log_level)

    banner = "=" * 70
    logger.info(banner)
    logger.info("GetGit Core Pipeline Starting")
    logger.info(banner)
    logger.info(f"Repository: {repo_url}")
    logger.info(f"Query: {query}")
    logger.info(f"LLM Enabled: {use_llm}")
    logger.info(banner)

    try:
        logger.info("\n[1/3] Initializing repository...")
        repo_path = initialize_repository(repo_url, local_path)
        logger.info(f"✓ Repository ready at {repo_path}")

        logger.info("\n[2/3] Setting up RAG pipeline...")
        retriever = setup_rag(repo_path, config=config)
        logger.info(f"✓ RAG pipeline ready with {len(retriever)} indexed chunks")

        logger.info("\n[3/3] Processing query...")
        result = answer_query(
            query=query,
            retriever=retriever,
            top_k=top_k,
            use_llm=use_llm,
        )
        logger.info("✓ Query processed successfully")

        logger.info("\n" + banner)
        logger.info("GetGit Core Pipeline Completed Successfully")
        logger.info(banner)
        return result
    except Exception as e:
        logger.error("\n" + banner)
        logger.error("GetGit Core Pipeline Failed")
        logger.error(f"Error: {str(e)}")
        logger.error(banner)
        raise
if __name__ == "__main__":
    # Example usage of the core module: a simple one-shot CLI session.
    # For richer CLI integration, consider argparse or similar.
    #
    # Fix: the original wrapped argument handling in `if len(sys.argv) > 1:`
    # whose body re-tested `len(sys.argv) > 1` in a conditional expression and
    # duplicated the default values in a dead `else` branch. The defaults are
    # now defined once and each argument falls back independently.
    import sys

    DEFAULT_REPO_URL = "https://github.com/samarthnaikk/getgit.git"
    DEFAULT_QUERY = "What is this project about?"

    # Positional args: [1] repository URL, [2] query; each optional.
    repo_url = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_REPO_URL
    query = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_QUERY

    print("\nGetGit - Repository Intelligence System")
    print("="*70)
    print(f"Repository: {repo_url}")
    print(f"Query: {query}")
    print("="*70 + "\n")

    try:
        # Run the full clone -> index -> retrieve -> answer pipeline.
        result = main(
            repo_url=repo_url,
            query=query,
            use_llm=True,
            log_level="INFO"
        )

        # Display results
        print("\n" + "="*70)
        print("RESULTS")
        print("="*70)
        print(f"\nQuery: {result['query']}")
        print(f"\nRetrieved {len(result['retrieved_chunks'])} relevant chunks:")
        for chunk_info in result['retrieved_chunks'][:3]:  # Show top 3
            print(f"  - {chunk_info['file_path']} (score: {chunk_info['score']:.4f})")

        if result['response']:
            print("\n" + "-"*70)
            print("ANSWER:")
            print("-"*70)
            print(result['response'])
        elif result['error']:
            # LLM generation failed; fall back to showing raw retrieved context.
            print("\n" + "-"*70)
            print("ERROR:")
            print("-"*70)
            print(f"Failed to generate LLM response: {result['error']}")
            print("\nShowing retrieved context instead:")
            print("-"*70)
            # Show only a 500-character snippet so the terminal isn't flooded.
            context_preview = result['context'][:500]
            if len(result['context']) > 500:
                context_preview += "..."
            print(context_preview)

        print("\n" + "="*70)
    except Exception as e:
        print(f"\n✗ Error: {str(e)}", file=sys.stderr)
        sys.exit(1)