diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..15631712576c7d1250b4f3939be87ca8a34e3af9 --- /dev/null +++ b/app.py @@ -0,0 +1,23 @@ +"""Hugging Face Spaces entry point for CodeRAG. + +This file is used by Hugging Face Spaces to launch the Gradio demo. +It's configured to work without GPU (embeddings on CPU, LLM via Groq). +""" + +import os + +# Configure for HF Spaces environment +os.environ.setdefault("MODEL_LLM_PROVIDER", "groq") +os.environ.setdefault("MODEL_EMBEDDING_DEVICE", "cpu") + +# Use HF Spaces secrets for API key +if "GROQ_API_KEY" in os.environ and "MODEL_LLM_API_KEY" not in os.environ: + os.environ["MODEL_LLM_API_KEY"] = os.environ["GROQ_API_KEY"] + +# Import and launch the Gradio app +from coderag.ui.app import create_gradio_app + +demo = create_gradio_app() + +if __name__ == "__main__": + demo.launch() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b25839566749076c92b72fa4c7d6ba65b4254887 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,21 @@ +# Requirements for Hugging Face Spaces +# Minimal dependencies for the Gradio demo + +fastapi>=0.115.0 +uvicorn[standard]>=0.32.0 +pydantic>=2.10.0 +pydantic-settings>=2.6.0 +gradio>=4.44.0 +transformers>=4.46.0 +sentence-transformers>=3.3.0 +chromadb>=0.5.0 +tree-sitter>=0.23.0 +tree-sitter-python>=0.23.0 +gitpython>=3.1.0 +python-dotenv>=1.0.0 +structlog>=24.4.0 +httpx>=0.27.0 +openai>=1.50.0 + +# Note: torch is pre-installed on HF Spaces +# Note: accelerate and bitsandbytes not needed for CPU-only inference diff --git a/src/coderag/__init__.py b/src/coderag/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e55695a911a78ae94a1b7e5eacfa29ce7b5a2e1d --- /dev/null +++ b/src/coderag/__init__.py @@ -0,0 +1,3 @@ +"""CodeRAG: RAG-based Q&A system for code repositories with verifiable citations.""" + +__version__ = "0.1.0" diff --git 
a/src/coderag/__pycache__/__init__.cpython-313.pyc b/src/coderag/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..388c5f277fe21e908ab094192ed63740955d2c49 Binary files /dev/null and b/src/coderag/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/coderag/__pycache__/cli.cpython-313.pyc b/src/coderag/__pycache__/cli.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b028ba700b63289421cd516fe60dbd9ca1a82405 Binary files /dev/null and b/src/coderag/__pycache__/cli.cpython-313.pyc differ diff --git a/src/coderag/__pycache__/config.cpython-313.pyc b/src/coderag/__pycache__/config.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..245548ea151f413006fe9495a42446b03fa16112 Binary files /dev/null and b/src/coderag/__pycache__/config.cpython-313.pyc differ diff --git a/src/coderag/__pycache__/logging.cpython-313.pyc b/src/coderag/__pycache__/logging.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2606dd6d78a57166e0d27c9cf3dbbd2470bd764 Binary files /dev/null and b/src/coderag/__pycache__/logging.cpython-313.pyc differ diff --git a/src/coderag/__pycache__/main.cpython-313.pyc b/src/coderag/__pycache__/main.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90fd976eed4abbc8362f68570214f727e0049eee Binary files /dev/null and b/src/coderag/__pycache__/main.cpython-313.pyc differ diff --git a/src/coderag/api/__init__.py b/src/coderag/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7f28268fa660f34efcc7ca878169b674b374222c --- /dev/null +++ b/src/coderag/api/__init__.py @@ -0,0 +1,5 @@ +"""API module: REST endpoints for programmatic access.""" + +from coderag.api.routes import router + +__all__ = ["router"] diff --git a/src/coderag/api/__pycache__/__init__.cpython-313.pyc b/src/coderag/api/__pycache__/__init__.cpython-313.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..de950d5559fc279e734d2e0c1cffa3634f039db2 Binary files /dev/null and b/src/coderag/api/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/coderag/api/__pycache__/routes.cpython-313.pyc b/src/coderag/api/__pycache__/routes.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab857a3b82b684056ae38dea7ca543e283e3d977 Binary files /dev/null and b/src/coderag/api/__pycache__/routes.cpython-313.pyc differ diff --git a/src/coderag/api/__pycache__/schemas.cpython-313.pyc b/src/coderag/api/__pycache__/schemas.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b8f401566d2b108bf20e822bf3fc1b2311d58f4 Binary files /dev/null and b/src/coderag/api/__pycache__/schemas.cpython-313.pyc differ diff --git a/src/coderag/api/routes.py b/src/coderag/api/routes.py new file mode 100644 index 0000000000000000000000000000000000000000..ed8a3b61573410cff41a262470390ba59c4c66c6 --- /dev/null +++ b/src/coderag/api/routes.py @@ -0,0 +1,282 @@ +"""REST API routes.""" + +import json +from datetime import datetime +from typing import Optional + +from fastapi import APIRouter, HTTPException, BackgroundTasks +from fastapi.responses import JSONResponse + +from coderag.api.schemas import ( + IndexRepositoryRequest, + IndexRepositoryResponse, + QueryRequest, + QueryResponse, + ListRepositoriesResponse, + RepositoryInfo, + CitationResponse, + RetrievedChunkResponse, + ErrorResponse, +) +from coderag.config import get_settings +from coderag.generation.generator import ResponseGenerator +from coderag.indexing.embeddings import EmbeddingGenerator +from coderag.indexing.vectorstore import VectorStore +from coderag.ingestion.chunker import CodeChunker +from coderag.ingestion.filter import FileFilter +from coderag.ingestion.loader import RepositoryLoader +from coderag.ingestion.validator import GitHubURLValidator, ValidationError +from coderag.logging import get_logger +from 
coderag.models.document import Document +from coderag.models.query import Query as QueryModel +from coderag.models.repository import Repository, RepositoryStatus + +logger = get_logger(__name__) +router = APIRouter() + +# Global state (in production, use a proper database) +settings = get_settings() +repos_file = settings.data_dir / "repositories.json" +repositories: dict[str, Repository] = {} + + +def load_repositories() -> None: + """Load repositories from disk.""" + global repositories + if repos_file.exists(): + try: + data = json.loads(repos_file.read_text()) + repositories = {r["id"]: Repository.from_dict(r) for r in data} + except Exception as e: + logger.error("Failed to load repositories", error=str(e)) + + +def save_repositories() -> None: + """Save repositories to disk.""" + repos_file.parent.mkdir(parents=True, exist_ok=True) + data = [r.to_dict() for r in repositories.values()] + repos_file.write_text(json.dumps(data, indent=2)) + + +# Load on startup +load_repositories() + + +async def index_repository_task( + url: str, + repo_id: str, + branch: Optional[str], + include_patterns: Optional[list[str]], + exclude_patterns: Optional[list[str]], +) -> None: + """Background task to index a repository.""" + repo = repositories[repo_id] + + try: + # Validate and clone + validator = GitHubURLValidator() + repo_info = await validator.validate_repository(url) + branch = branch or repo_info.branch or "main" + + loader = RepositoryLoader() + repo_path = loader.clone_repository(repo_info, branch) + + repo.clone_path = repo_path + repo.status = RepositoryStatus.INDEXING + save_repositories() + + # Filter files + file_filter = FileFilter( + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + ) + files = list(file_filter.filter_files(repo_path)) + + # Load documents + documents = [] + for file_path in files: + try: + doc = Document.from_file(file_path, repo_path, repo.id) + documents.append(doc) + except Exception as e: + logger.warning("Failed 
to load file", path=str(file_path), error=str(e)) + + # Chunk + chunker = CodeChunker() + chunks = [] + for doc in documents: + for chunk in chunker.chunk_document(doc): + chunks.append(chunk) + + # Embed and store + if chunks: + vectorstore = VectorStore() + vectorstore.delete_repo_chunks(repo.id) + + embedder = EmbeddingGenerator() + embedded_chunks = embedder.embed_chunks(chunks) + vectorstore.add_chunks(embedded_chunks) + + # Update status + repo.chunk_count = len(chunks) + repo.indexed_at = datetime.now() + repo.status = RepositoryStatus.READY + save_repositories() + + logger.info("Repository indexed", repo_id=repo_id, chunks=len(chunks)) + + except Exception as e: + logger.error("Indexing failed", repo_id=repo_id, error=str(e)) + repo.status = RepositoryStatus.ERROR + repo.error_message = str(e) + save_repositories() + + +@router.post("/repos/index", response_model=IndexRepositoryResponse, status_code=202) +async def index_repository( + request: IndexRepositoryRequest, + background_tasks: BackgroundTasks, +) -> IndexRepositoryResponse: + """Index a GitHub repository.""" + # Create repository record + repo = Repository( + url=request.url, + branch=request.branch or "main", + status=RepositoryStatus.PENDING, + ) + repositories[repo.id] = repo + save_repositories() + + # Start background indexing + background_tasks.add_task( + index_repository_task, + request.url, + repo.id, + request.branch, + request.include_patterns, + request.exclude_patterns, + ) + + return IndexRepositoryResponse( + repo_id=repo.id, + status=repo.status.value, + message="Repository indexing started", + ) + + +@router.post("/query", response_model=QueryResponse) +async def query_repository(request: QueryRequest) -> QueryResponse: + """Query a repository.""" + # Check repository exists + if request.repo_id not in repositories: + raise HTTPException(status_code=404, detail="Repository not found") + + repo = repositories[request.repo_id] + if repo.status != RepositoryStatus.READY: + raise 
HTTPException( + status_code=400, + detail=f"Repository not ready (status: {repo.status.value})", + ) + + try: + # Generate response + generator = ResponseGenerator() + query = QueryModel( + question=request.question, + repo_id=request.repo_id, + top_k=request.top_k, + ) + response = generator.generate(query) + + # Convert to API schema + return QueryResponse( + answer=response.answer, + citations=[ + CitationResponse( + file_path=c.file_path, + start_line=c.start_line, + end_line=c.end_line, + ) + for c in response.citations + ], + retrieved_chunks=[ + RetrievedChunkResponse( + chunk_id=c.chunk_id, + file_path=c.file_path, + start_line=c.start_line, + end_line=c.end_line, + relevance_score=c.relevance_score, + chunk_type=c.chunk_type, + name=c.name, + content=c.content, + ) + for c in response.retrieved_chunks + ], + grounded=response.grounded, + query_id=response.query_id, + ) + + except Exception as e: + logger.error("Query failed", error=str(e)) + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/repos", response_model=ListRepositoriesResponse) +async def list_repositories() -> ListRepositoriesResponse: + """List all repositories.""" + return ListRepositoriesResponse( + repositories=[ + RepositoryInfo( + id=repo.id, + url=repo.url, + branch=repo.branch, + chunk_count=repo.chunk_count, + status=repo.status.value, + indexed_at=repo.indexed_at, + error_message=repo.error_message, + ) + for repo in repositories.values() + ] + ) + + +@router.get("/repos/{repo_id}", response_model=RepositoryInfo) +async def get_repository(repo_id: str) -> RepositoryInfo: + """Get repository details.""" + if repo_id not in repositories: + raise HTTPException(status_code=404, detail="Repository not found") + + repo = repositories[repo_id] + return RepositoryInfo( + id=repo.id, + url=repo.url, + branch=repo.branch, + chunk_count=repo.chunk_count, + status=repo.status.value, + indexed_at=repo.indexed_at, + error_message=repo.error_message, + ) + + 
+@router.delete("/repos/{repo_id}") +async def delete_repository(repo_id: str) -> dict: + """Delete a repository.""" + if repo_id not in repositories: + raise HTTPException(status_code=404, detail="Repository not found") + + repo = repositories[repo_id] + + try: + # Delete from vector store + vectorstore = VectorStore() + vectorstore.delete_repo_chunks(repo_id) + + # Delete from records + del repositories[repo_id] + save_repositories() + + return {"message": f"Repository {repo.full_name} deleted"} + + except Exception as e: + logger.error("Delete failed", error=str(e)) + raise HTTPException(status_code=500, detail=str(e)) diff --git a/src/coderag/api/schemas.py b/src/coderag/api/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..b3834e0301f43c6b435a45ebe1084be7aaf719a6 --- /dev/null +++ b/src/coderag/api/schemas.py @@ -0,0 +1,101 @@ +"""Pydantic schemas for REST API.""" + +from datetime import datetime +from typing import Optional + +from pydantic import BaseModel, Field, HttpUrl + + +class IndexRepositoryRequest(BaseModel): + """Request to index a repository.""" + + url: str = Field(..., description="GitHub repository URL") + branch: Optional[str] = Field(None, description="Branch name (default: main)") + include_patterns: Optional[list[str]] = Field(None, description="File patterns to include") + exclude_patterns: Optional[list[str]] = Field(None, description="File patterns to exclude") + + +class IndexRepositoryResponse(BaseModel): + """Response from indexing request.""" + + repo_id: str = Field(..., description="Repository ID") + status: str = Field(..., description="Indexing status") + message: str = Field(..., description="Status message") + + +class QueryRequest(BaseModel): + """Request to query a repository.""" + + question: str = Field(..., description="Question about the repository") + repo_id: str = Field(..., description="Repository ID to query") + top_k: int = Field(5, ge=1, le=20, description="Number of chunks to 
retrieve") + + +class CitationResponse(BaseModel): + """Citation information.""" + + file_path: str + start_line: int + end_line: int + + class Config: + from_attributes = True + + +class RetrievedChunkResponse(BaseModel): + """Retrieved chunk information.""" + + chunk_id: str + file_path: str + start_line: int + end_line: int + relevance_score: float + chunk_type: str + name: Optional[str] = None + content: str + + class Config: + from_attributes = True + + +class QueryResponse(BaseModel): + """Response from a query.""" + + answer: str = Field(..., description="Generated answer") + citations: list[CitationResponse] = Field(..., description="Citations in the answer") + retrieved_chunks: list[RetrievedChunkResponse] = Field(..., description="Evidence chunks") + grounded: bool = Field(..., description="Whether response is grounded in evidence") + query_id: str = Field(..., description="Query ID") + + +class RepositoryInfo(BaseModel): + """Repository information.""" + + id: str + url: str + branch: str + chunk_count: int + status: str + indexed_at: Optional[datetime] = None + error_message: Optional[str] = None + + +class ListRepositoriesResponse(BaseModel): + """List of repositories.""" + + repositories: list[RepositoryInfo] + + +class HealthResponse(BaseModel): + """Health check response.""" + + status: str + app: str + version: str + + +class ErrorResponse(BaseModel): + """Error response.""" + + error: str + detail: Optional[str] = None diff --git a/src/coderag/cli.py b/src/coderag/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..b9506f784fc13bf93b0ac411dbd1d742ec8d88cb --- /dev/null +++ b/src/coderag/cli.py @@ -0,0 +1,513 @@ +"""Unified CLI for CodeRAG.""" + +import json +import os +import platform +import shutil +import sys +from pathlib import Path +from typing import Optional + +import click + + +# Config directory and file +CONFIG_DIR = Path.home() / ".config" / "coderag" +CONFIG_FILE = CONFIG_DIR / "config.json" + + +def 
get_config() -> dict: + """Load configuration from config file.""" + if CONFIG_FILE.exists(): + try: + return json.loads(CONFIG_FILE.read_text()) + except Exception: + return {} + return {} + + +def save_config(config: dict) -> None: + """Save configuration to config file.""" + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + CONFIG_FILE.write_text(json.dumps(config, indent=2)) + + +def get_claude_config_path() -> Optional[Path]: + """Get Claude Desktop config path based on OS.""" + system = platform.system() + + if system == "Darwin": # macOS + return Path.home() / "Library" / "Application Support" / "Claude" / "claude_desktop_config.json" + elif system == "Linux": + return Path.home() / ".config" / "Claude" / "claude_desktop_config.json" + elif system == "Windows": + appdata = os.environ.get("APPDATA", "") + if appdata: + return Path(appdata) / "Claude" / "claude_desktop_config.json" + return None + + +@click.group() +@click.version_option(package_name="coderag") +def cli(): + """CodeRAG - RAG-based Q&A system for code repositories. + + Use 'coderag setup' to configure, then 'coderag serve' to start. + For Claude Desktop integration, run 'coderag mcp-install'. + """ + pass + + +@cli.command() +@click.option("--provider", type=click.Choice(["groq", "openai", "anthropic", "openrouter", "together", "local"]), + default=None, help="LLM provider to use") +@click.option("--api-key", default=None, help="API key for the provider") +def setup(provider: Optional[str], api_key: Optional[str]): + """Interactive setup wizard for CodeRAG. + + Configures the LLM provider and API key. Configuration is saved to + ~/.config/coderag/config.json and can be overridden by environment variables. + """ + config = get_config() + + click.echo("\nšŸ”§ CodeRAG Setup\n") + + # Provider selection + if provider is None: + click.echo("Select your LLM provider:") + click.echo(" 1. groq (FREE, fast - recommended)") + click.echo(" 2. openai") + click.echo(" 3. anthropic") + click.echo(" 4. 
openrouter") + click.echo(" 5. together") + click.echo(" 6. local (requires GPU)") + + choice = click.prompt("Enter choice", type=int, default=1) + providers = {1: "groq", 2: "openai", 3: "anthropic", 4: "openrouter", 5: "together", 6: "local"} + provider = providers.get(choice, "groq") + + config["llm_provider"] = provider + + # API key (not needed for local) + if provider != "local": + if api_key is None: + api_key_urls = { + "groq": "https://console.groq.com/keys", + "openai": "https://platform.openai.com/api-keys", + "anthropic": "https://console.anthropic.com/settings/keys", + "openrouter": "https://openrouter.ai/keys", + "together": "https://api.together.xyz/settings/api-keys", + } + url = api_key_urls.get(provider, "") + if url: + click.echo(f"\nGet your API key from: {url}") + + api_key = click.prompt("Enter your API key", hide_input=True) + + config["llm_api_key"] = api_key + + # Validate API key + click.echo("\nā³ Validating API key...") + if _validate_api_key(provider, api_key): + click.echo("āœ… API key is valid!") + else: + click.echo("āš ļø Could not validate API key. It may still work.") + else: + click.echo("\nāš ļø Local mode requires a CUDA-capable GPU.") + + # Save config + save_config(config) + click.echo(f"\nāœ… Configuration saved to {CONFIG_FILE}") + + # Next steps + click.echo("\nšŸ“‹ Next steps:") + click.echo(" 1. Run 'coderag serve' to start the web interface") + click.echo(" 2. Run 'coderag mcp-install' to integrate with Claude Desktop") + click.echo(" 3. 
Run 'coderag index ' to index a repository") + + +def _validate_api_key(provider: str, api_key: str) -> bool: + """Validate API key by making a test request.""" + try: + from openai import OpenAI + + base_urls = { + "groq": "https://api.groq.com/openai/v1", + "openai": "https://api.openai.com/v1", + "openrouter": "https://openrouter.ai/api/v1", + "together": "https://api.together.xyz/v1", + } + + if provider not in base_urls: + return True # Can't validate, assume OK + + client = OpenAI(api_key=api_key, base_url=base_urls[provider]) + client.models.list() + return True + except Exception: + return False + + +@cli.command() +@click.option("--host", default="0.0.0.0", help="Host to bind to") +@click.option("--port", default=8000, type=int, help="Port to bind to") +@click.option("--reload", is_flag=True, help="Enable auto-reload for development") +def serve(host: str, port: int, reload: bool): + """Start the CodeRAG web server. + + Starts the FastAPI server with Gradio UI, REST API, and MCP endpoint. + """ + # Apply config from file to environment + _apply_config_to_env() + + import uvicorn + from coderag.main import create_app + from coderag.config import get_settings + + settings = get_settings() + app = create_app() + + click.echo(f"\nšŸš€ Starting CodeRAG server at http://{host}:{port}") + click.echo(" Press Ctrl+C to stop\n") + + uvicorn.run( + app, + host=host, + port=port, + reload=reload, + log_level=settings.server.log_level, + ) + + +@cli.command("mcp-run") +def mcp_run(): + """Run MCP server in stdio mode (for Claude Desktop). + + This command is used by Claude Desktop to communicate with CodeRAG. + You typically don't need to run this manually. 
+ """ + # Apply config from file to environment + _apply_config_to_env() + + # Suppress all output except MCP protocol + import logging + logging.basicConfig(level=logging.WARNING, stream=sys.stderr) + + import structlog + structlog.configure( + wrapper_class=structlog.make_filtering_bound_logger(logging.CRITICAL), + ) + + from coderag.mcp.server import create_mcp_server + mcp = create_mcp_server() + mcp.run(transport="stdio") + + +@cli.command("mcp-install") +@click.option("--dry-run", is_flag=True, help="Preview changes without applying") +def mcp_install(dry_run: bool): + """Configure Claude Desktop to use CodeRAG MCP. + + Automatically detects your OS and updates the Claude Desktop configuration + to include the CodeRAG MCP server. + """ + config_path = get_claude_config_path() + + if config_path is None: + click.echo("āŒ Could not determine Claude Desktop config location.") + click.echo(" Please manually add the MCP configuration.") + sys.exit(1) + + click.echo(f"\nšŸ” Claude Desktop config: {config_path}") + + # Check if Claude Desktop is installed + if not config_path.parent.exists(): + click.echo("\nāŒ Claude Desktop does not appear to be installed.") + click.echo(" Install it from: https://claude.ai/download") + sys.exit(1) + + # Load existing config or create new + if config_path.exists(): + try: + config = json.loads(config_path.read_text()) + except json.JSONDecodeError: + click.echo("āš ļø Existing config is invalid JSON. 
Creating new config.") + config = {} + else: + config = {} + + # Ensure mcpServers key exists + if "mcpServers" not in config: + config["mcpServers"] = {} + + # Find the coderag-mcp command path + coderag_path = shutil.which("coderag") + if coderag_path is None: + # Fallback to python -m + python_path = sys.executable + mcp_command = [python_path, "-m", "coderag.mcp.cli"] + else: + mcp_command = [coderag_path, "mcp-run"] + + # Prepare MCP server config + new_mcp_config = { + "command": mcp_command[0], + "args": mcp_command[1:] if len(mcp_command) > 1 else [], + } + + # Check if already configured + existing = config["mcpServers"].get("coderag") + if existing == new_mcp_config: + click.echo("\nāœ… CodeRAG MCP is already configured correctly!") + return + + # Show diff + click.echo("\nšŸ“ Changes to be made:") + if existing: + click.echo(f" Update: mcpServers.coderag") + click.echo(f" From: {json.dumps(existing)}") + click.echo(f" To: {json.dumps(new_mcp_config)}") + else: + click.echo(f" Add: mcpServers.coderag = {json.dumps(new_mcp_config)}") + + if dry_run: + click.echo("\nšŸ” Dry run - no changes made.") + return + + # Backup existing config + if config_path.exists(): + backup_path = config_path.with_suffix(".json.backup") + shutil.copy(config_path, backup_path) + click.echo(f"\nšŸ“¦ Backup saved to: {backup_path}") + + # Apply changes + config["mcpServers"]["coderag"] = new_mcp_config + config_path.parent.mkdir(parents=True, exist_ok=True) + config_path.write_text(json.dumps(config, indent=2)) + + click.echo("\nāœ… Claude Desktop configuration updated!") + click.echo("\nāš ļø Please restart Claude Desktop to apply changes.") + + +@cli.command("index") +@click.argument("url") +@click.option("--branch", default="", help="Branch to index (default: main/master)") +def index(url: str, branch: str): + """Index a GitHub repository. + + URL: The GitHub repository URL to index. 
+ + Example: coderag index https://github.com/owner/repo + """ + # Apply config from file to environment + _apply_config_to_env() + + import asyncio + from coderag.mcp.handlers import get_mcp_handlers + + click.echo(f"\nšŸ“¦ Indexing repository: {url}") + if branch: + click.echo(f" Branch: {branch}") + + handlers = get_mcp_handlers() + + async def run_index(): + result = await handlers.index_repository(url=url, branch=branch) + return result + + result = asyncio.run(run_index()) + + if result.get("success"): + click.echo(f"\nāœ… Repository indexed successfully!") + click.echo(f" Repo ID: {result['repo_id']}") + click.echo(f" Name: {result['name']}") + click.echo(f" Files processed: {result['files_processed']}") + click.echo(f" Chunks indexed: {result['chunks_indexed']}") + click.echo(f"\n Use 'coderag query {result['repo_id'][:8]} \"your question\"' to query") + else: + click.echo(f"\nāŒ Indexing failed: {result.get('error', 'Unknown error')}") + sys.exit(1) + + +@cli.command("query") +@click.argument("repo_id") +@click.argument("question") +@click.option("--top-k", default=5, type=int, help="Number of chunks to retrieve") +@click.option("--format", "output_format", type=click.Choice(["text", "json"]), default="text", help="Output format") +def query(repo_id: str, question: str, top_k: int, output_format: str): + """Ask a question about an indexed repository. + + REPO_ID: Repository ID (full or first 8 characters) + QUESTION: Your question about the code + + Example: coderag query abc12345 "How does authentication work?" 
+ """ + # Apply config from file to environment + _apply_config_to_env() + + import asyncio + from coderag.mcp.handlers import get_mcp_handlers + + handlers = get_mcp_handlers() + + async def run_query(): + result = await handlers.query_code(repo_id=repo_id, question=question, top_k=top_k) + return result + + click.echo(f"\nšŸ” Querying: {question}\n") + result = asyncio.run(run_query()) + + if result.get("error"): + click.echo(f"āŒ Error: {result['error']}") + sys.exit(1) + + if output_format == "json": + click.echo(json.dumps(result, indent=2)) + else: + click.echo("šŸ“ Answer:\n") + click.echo(result.get("answer", "No answer generated.")) + + if result.get("citations"): + click.echo("\nšŸ“ Citations:") + for citation in result["citations"]: + click.echo(f" {citation}") + + if result.get("evidence"): + click.echo("\nšŸ“‚ Evidence:") + for chunk in result["evidence"][:3]: # Show top 3 + click.echo(f" - {chunk['file']}:{chunk['start_line']}-{chunk['end_line']} (relevance: {chunk['relevance']})") + + +@cli.command("repos") +@click.option("--format", "output_format", type=click.Choice(["text", "json"]), default="text", help="Output format") +def repos(output_format: str): + """List all indexed repositories.""" + # Apply config from file to environment + _apply_config_to_env() + + import asyncio + from coderag.mcp.handlers import get_mcp_handlers + + handlers = get_mcp_handlers() + + async def run_list(): + result = await handlers.list_repositories() + return result + + result = asyncio.run(run_list()) + + if output_format == "json": + click.echo(json.dumps(result, indent=2)) + else: + repos_list = result.get("repositories", []) + if not repos_list: + click.echo("\nšŸ“­ No repositories indexed yet.") + click.echo(" Run 'coderag index ' to index a repository.") + return + + click.echo(f"\nšŸ“š Indexed Repositories ({len(repos_list)}):\n") + for repo in repos_list: + status_icon = "āœ…" if repo["status"] == "ready" else "ā³" if repo["status"] == "indexing" else 
"āŒ" + click.echo(f" {status_icon} {repo['id'][:8]} {repo['name']} ({repo['branch']})") + click.echo(f" Chunks: {repo['chunk_count']} | Indexed: {repo.get('indexed_at', 'N/A')}") + + +@cli.command("doctor") +def doctor(): + """Diagnose common issues with CodeRAG setup. + + Checks Python version, configuration, API key validity, and system components. + """ + click.echo("\nšŸ„ CodeRAG Doctor\n") + all_ok = True + + # Check Python version + py_version = sys.version_info + if py_version >= (3, 11): + click.echo(f"āœ… Python version: {py_version.major}.{py_version.minor}.{py_version.micro}") + else: + click.echo(f"āŒ Python version: {py_version.major}.{py_version.minor}.{py_version.micro} (need 3.11+)") + all_ok = False + + # Check config file + config = get_config() + if config: + click.echo(f"āœ… Config file exists: {CONFIG_FILE}") + if config.get("llm_provider"): + click.echo(f" Provider: {config['llm_provider']}") + else: + click.echo(f"āš ļø No config file. Run 'coderag setup' to configure.") + + # Check API key + api_key = config.get("llm_api_key") or os.environ.get("MODEL_LLM_API_KEY") + provider = config.get("llm_provider") or os.environ.get("MODEL_LLM_PROVIDER", "groq") + + if provider != "local": + if api_key: + click.echo(f"āœ… API key configured (provider: {provider})") + else: + click.echo(f"āŒ No API key configured for {provider}") + all_ok = False + + # Check CUDA + try: + import torch + if torch.cuda.is_available(): + click.echo(f"āœ… CUDA available: {torch.cuda.get_device_name(0)}") + else: + click.echo("ā„¹ļø CUDA not available (CPU mode for embeddings)") + except ImportError: + click.echo("āš ļø PyTorch not installed") + all_ok = False + + # Check ChromaDB data directory + from coderag.config import get_settings + settings = get_settings() + chroma_path = settings.vectorstore.persist_directory + if chroma_path.exists(): + click.echo(f"āœ… ChromaDB directory: {chroma_path}") + else: + click.echo(f"ā„¹ļø ChromaDB directory will be created: 
{chroma_path}") + + # Check Claude Desktop + claude_config = get_claude_config_path() + if claude_config and claude_config.exists(): + try: + config_data = json.loads(claude_config.read_text()) + if "coderag" in config_data.get("mcpServers", {}): + click.echo("āœ… Claude Desktop MCP configured") + else: + click.echo("ā„¹ļø Claude Desktop installed but MCP not configured. Run 'coderag mcp-install'") + except Exception: + click.echo("āš ļø Claude Desktop config exists but could not be read") + else: + click.echo("ā„¹ļø Claude Desktop not detected") + + # Summary + if all_ok: + click.echo("\nāœ… All checks passed!") + else: + click.echo("\nāš ļø Some issues detected. See above for details.") + + +def _apply_config_to_env(): + """Apply configuration from config file to environment variables.""" + config = get_config() + + if config.get("llm_provider") and not os.environ.get("MODEL_LLM_PROVIDER"): + os.environ["MODEL_LLM_PROVIDER"] = config["llm_provider"] + + if config.get("llm_api_key") and not os.environ.get("MODEL_LLM_API_KEY"): + os.environ["MODEL_LLM_API_KEY"] = config["llm_api_key"] + + if config.get("embedding_device") and not os.environ.get("MODEL_EMBEDDING_DEVICE"): + os.environ["MODEL_EMBEDDING_DEVICE"] = config["embedding_device"] + + +def main(): + """Entry point for the CLI.""" + cli() + + +if __name__ == "__main__": + main() diff --git a/src/coderag/config.py b/src/coderag/config.py new file mode 100644 index 0000000000000000000000000000000000000000..cc476e2402d94898461189268fe53b7336c1675d --- /dev/null +++ b/src/coderag/config.py @@ -0,0 +1,154 @@ +"""Application configuration using pydantic-settings.""" + +from pathlib import Path +from typing import Optional + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class ModelSettings(BaseSettings): + """LLM and embedding model configuration.""" + + model_config = SettingsConfigDict(env_prefix="MODEL_") + + # LLM Provider: "local", "openai", "groq", 
class ModelSettings(BaseSettings):
    """LLM and embedding model configuration (env prefix ``MODEL_``)."""

    model_config = SettingsConfigDict(env_prefix="MODEL_")

    # Which LLM backend to use: "local", "openai", "groq", "anthropic",
    # "openrouter". Groq is the default: free tier, no GPU required.
    llm_provider: str = "groq"

    # Remote-provider credentials and optional custom endpoint.
    llm_api_key: Optional[str] = None
    llm_api_base: Optional[str] = None

    # Model identifier (HF checkpoint for local, model name for remote)
    # plus sampling parameters shared by both paths.
    llm_name: str = "Qwen/Qwen2.5-Coder-3B-Instruct"
    llm_max_new_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95

    # Local-inference knobs (quantization and device placement).
    llm_use_4bit: bool = True
    llm_device_map: str = "auto"

    # Embedding model settings. Batch size is kept small for 8GB-VRAM GPUs;
    # "auto" device detects CUDA and falls back to CPU.
    embedding_name: str = "nomic-ai/nomic-embed-text-v1.5"
    embedding_dimension: int = 768
    embedding_batch_size: int = 8
    embedding_device: str = "auto"


class VectorStoreSettings(BaseSettings):
    """ChromaDB vector store configuration (env prefix ``VECTORSTORE_``)."""

    model_config = SettingsConfigDict(env_prefix="VECTORSTORE_")

    persist_directory: Path = Path("./data/chroma_db")
    collection_name: str = "coderag_chunks"
    distance_metric: str = "cosine"
    anonymized_telemetry: bool = False


class IngestionSettings(BaseSettings):
    """Repository ingestion configuration (env prefix ``INGESTION_``)."""

    model_config = SettingsConfigDict(env_prefix="INGESTION_")

    repos_cache_dir: Path = Path("./data/repos")
    max_file_size_kb: int = 500
    default_branch: str = "main"
    chunk_size: int = 1500
    chunk_overlap: int = 200

    # Hard limits that keep very large repositories tractable.
    max_files_per_repo: int = 5000
    max_total_chunks: int = 50000
    batch_size: int = 100
    stream_processing: bool = True

    # Soft thresholds at which the UI/CLI warns the user.
    warn_files_threshold: int = 1000
    warn_chunks_threshold: int = 10000

    # Source files to index.
    include_patterns: list[str] = Field(
        default_factory=lambda: ["*.py", "*.js", "*.ts", "*.java", "*.go", "*.rs", "*.c", "*.cpp", "*.h"]
    )
    # Paths to skip: vendored deps, build output, lockfiles, and anything
    # that looks like credentials or secrets.
    exclude_patterns: list[str] = Field(
        default_factory=lambda: [
            "**/node_modules/**",
            "**/.git/**",
            "**/venv/**",
            "**/__pycache__/**",
            "**/dist/**",
            "**/build/**",
            "**/*.min.js",
            "**/*.min.css",
            "**/package-lock.json",
            "**/yarn.lock",
            "**/poetry.lock",
            "**/.env",
            "**/.env.*",
            "**/credentials*",
            "**/*secret*",
            "**/*password*",
        ]
    )


class RetrievalSettings(BaseSettings):
    """Retrieval configuration (env prefix ``RETRIEVAL_``)."""

    model_config = SettingsConfigDict(env_prefix="RETRIEVAL_")

    default_top_k: int = 5
    max_top_k: int = 20
    similarity_threshold: float = 0.3


class ServerSettings(BaseSettings):
    """Server configuration (env prefix ``SERVER_``)."""

    model_config = SettingsConfigDict(env_prefix="SERVER_")

    host: str = "0.0.0.0"
    port: int = 8000
    reload: bool = False
    workers: int = 1
    log_level: str = "info"


class Settings(BaseSettings):
    """Main application settings, aggregating all sub-settings.

    Values load from the environment and an optional ``.env`` file; unknown
    keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    app_name: str = "CodeRAG"
    app_version: str = "0.1.0"
    debug: bool = False
    data_dir: Path = Path("./data")

    models: ModelSettings = Field(default_factory=ModelSettings)
    vectorstore: VectorStoreSettings = Field(default_factory=VectorStoreSettings)
    ingestion: IngestionSettings = Field(default_factory=IngestionSettings)
    retrieval: RetrievalSettings = Field(default_factory=RetrievalSettings)
    server: ServerSettings = Field(default_factory=ServerSettings)

    def ensure_directories(self) -> None:
        """Create all required data directories (idempotent)."""
        required = (
            self.data_dir,
            self.vectorstore.persist_directory,
            self.ingestion.repos_cache_dir,
        )
        for directory in required:
            directory.mkdir(parents=True, exist_ok=True)


# Process-wide settings singleton, built lazily by get_settings().
_settings: Optional[Settings] = None


def get_settings() -> Settings:
    """Return the cached Settings instance, creating it on first call."""
    global _settings
    if _settings is not None:
        return _settings
    _settings = Settings()
    _settings.ensure_directories()
    return _settings
--- /dev/null +++ b/src/coderag/generation/__init__.py @@ -0,0 +1,7 @@ +"""Generation module: LLM inference and response generation with citations.""" + +from coderag.generation.generator import ResponseGenerator +from coderag.generation.prompts import SYSTEM_PROMPT, build_prompt +from coderag.generation.citations import CitationParser + +__all__ = ["ResponseGenerator", "SYSTEM_PROMPT", "build_prompt", "CitationParser"] diff --git a/src/coderag/generation/__pycache__/__init__.cpython-313.pyc b/src/coderag/generation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c937047cc779b4ac34040b288c9aaff16be63469 Binary files /dev/null and b/src/coderag/generation/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/coderag/generation/__pycache__/citations.cpython-313.pyc b/src/coderag/generation/__pycache__/citations.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72540ed1c732105424dc27bb8a26c596c3ae146b Binary files /dev/null and b/src/coderag/generation/__pycache__/citations.cpython-313.pyc differ diff --git a/src/coderag/generation/__pycache__/generator.cpython-313.pyc b/src/coderag/generation/__pycache__/generator.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e3dbb176cc70df7c81e10b6a4d36e0efd828cb8 Binary files /dev/null and b/src/coderag/generation/__pycache__/generator.cpython-313.pyc differ diff --git a/src/coderag/generation/__pycache__/prompts.cpython-313.pyc b/src/coderag/generation/__pycache__/prompts.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f2fd1016d4cf8bf87fe13cda058901fdaed337b Binary files /dev/null and b/src/coderag/generation/__pycache__/prompts.cpython-313.pyc differ diff --git a/src/coderag/generation/citations.py b/src/coderag/generation/citations.py new file mode 100644 index 0000000000000000000000000000000000000000..609718468ed9e9b29fe6cb5d3c3f4ac15d45544e --- 
class CitationParser:
    """Parses and validates ``[file:start-end]`` citations in LLM output."""

    # Matches citations of the form [file.py:10-20] or [path/to/file.py:10-20].
    CITATION_PATTERN = re.compile(r"\[([^\]]+):(\d+)-(\d+)\]")

    def parse_citations(self, text: str) -> list[Citation]:
        """Extract every citation occurring in *text*.

        Args:
            text: Text that may contain bracketed citations.

        Returns:
            Citation objects in order of appearance.
        """
        return [
            Citation(
                file_path=match.group(1),
                start_line=int(match.group(2)),
                end_line=int(match.group(3)),
            )
            for match in self.CITATION_PATTERN.finditer(text)
        ]

    def validate_citation(self, citation: Citation, available_files: set[str]) -> bool:
        """Return True when the citation points at a known file."""
        return citation.file_path in available_files

    def validate_citations(
        self,
        citations: list[Citation],
        available_files: set[str],
    ) -> tuple[list[Citation], list[Citation]]:
        """Split citations into those that reference known files and the rest.

        Returns:
            Tuple of (valid_citations, invalid_citations).
        """
        valid: list[Citation] = []
        invalid: list[Citation] = []
        for c in citations:
            target = valid if self.validate_citation(c, available_files) else invalid
            target.append(c)
        return valid, invalid

    def format_citation(self, file_path: str, start_line: int, end_line: int) -> str:
        """Render a citation in the canonical bracketed format."""
        return f"[{file_path}:{start_line}-{end_line}]"

    def has_citations(self, text: str) -> bool:
        """Return True when *text* contains at least one citation."""
        return self.CITATION_PATTERN.search(text) is not None

    def count_citations(self, text: str) -> int:
        """Return the number of citations present in *text*."""
        return sum(1 for _ in self.CITATION_PATTERN.finditer(text))

    def extract_unique_files(self, citations: list[Citation]) -> set[str]:
        """Return the distinct file paths referenced by *citations*."""
        return {c.file_path for c in citations}
class ResponseGenerator:
    """Generates grounded responses using local or remote LLMs.

    ``generate()`` retrieves relevant chunks via the Retriever, prompts the
    configured LLM, parses citations out of the answer, and flags the
    response as grounded only when at least one citation is present.
    Remote providers are reached through the OpenAI-compatible client; a
    local HF model is loaded lazily on first use.
    """

    def __init__(
        self,
        retriever: Optional[Retriever] = None,
    ) -> None:
        """Initialize with an optional pre-built Retriever (one is created otherwise)."""
        self.settings = get_settings()
        self.retriever = retriever or Retriever()
        self.citation_parser = CitationParser()

        self.provider = self.settings.models.llm_provider.lower()
        # Lazily created resources: remote API client or local model/tokenizer.
        self._client = None
        self._local_model = None
        self._local_tokenizer = None

        logger.info("ResponseGenerator initialized", provider=self.provider)

    def _get_api_client(self):
        """Get or create the OpenAI-compatible client for remote providers.

        Also resolves ``self.model_name``: when the configured model is the
        local-model default (a ``Qwen/`` checkpoint), it is replaced by the
        provider's default hosted model.

        Raises:
            ValueError: If no API key is configured or the provider is unknown
                and no custom API base was supplied.
        """
        if self._client is not None:
            return self._client

        import httpx
        from openai import OpenAI

        api_key = self.settings.models.llm_api_key
        if not api_key:
            raise ValueError(f"API key required for provider: {self.provider}")

        # Provider-specific endpoints and default models.
        # NOTE(review): the "anthropic" entry assumes an OpenAI-compatible
        # chat endpoint at this base URL — confirm before relying on it.
        provider_configs = {
            "openai": {
                "base_url": "https://api.openai.com/v1",
                "default_model": "gpt-4o-mini",
            },
            "groq": {
                "base_url": "https://api.groq.com/openai/v1",
                "default_model": "llama-3.3-70b-versatile",
            },
            "anthropic": {
                "base_url": "https://api.anthropic.com/v1",
                "default_model": "claude-3-5-sonnet-20241022",
            },
            "openrouter": {
                "base_url": "https://openrouter.ai/api/v1",
                "default_model": "anthropic/claude-3.5-sonnet",
            },
            "together": {
                "base_url": "https://api.together.xyz/v1",
                "default_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
            },
        }

        config = provider_configs.get(self.provider, {})
        base_url = self.settings.models.llm_api_base or config.get("base_url")

        if not base_url:
            raise ValueError(f"Unknown provider: {self.provider}")

        # llm_name defaults to a local Qwen checkpoint; when talking to a
        # remote provider, substitute that provider's default hosted model.
        if self.settings.models.llm_name.startswith("Qwen/"):
            self.model_name = config.get("default_model", self.settings.models.llm_name)
        else:
            self.model_name = self.settings.models.llm_name

        self._client = OpenAI(
            api_key=api_key,
            base_url=base_url,
            http_client=httpx.Client(timeout=120.0),
        )

        logger.info("API client created", provider=self.provider, model=self.model_name)
        return self._client

    def _load_local_model(self):
        """Load the local model and tokenizer with transformers (idempotent).

        Raises:
            RuntimeError: If no CUDA GPU is available — local inference
                requires one; the message suggests the free Groq fallback.
        """
        if self._local_model is not None:
            return

        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

        if not torch.cuda.is_available():
            raise RuntimeError(
                "Local LLM requires a CUDA-capable GPU. Options:\n"
                "  1. Use a cloud provider (free): MODEL_LLM_PROVIDER=groq\n"
                "     Get API key at: https://console.groq.com/keys\n"
                "  2. Install CUDA and a compatible GPU"
            )

        logger.info("Loading local LLM", model=self.settings.models.llm_name)

        # Optional NF4 4-bit quantization to fit consumer GPUs.
        if self.settings.models.llm_use_4bit:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            )
        else:
            bnb_config = None

        self._local_tokenizer = AutoTokenizer.from_pretrained(
            self.settings.models.llm_name,
            trust_remote_code=True,
        )

        self._local_model = AutoModelForCausalLM.from_pretrained(
            self.settings.models.llm_name,
            quantization_config=bnb_config,
            device_map=self.settings.models.llm_device_map,
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )

        logger.info("Local LLM loaded successfully")

    def generate(self, query: Query) -> Response:
        """Generate a grounded Response for *query*.

        Returns:
            A Response whose ``grounded`` flag is True only when the answer
            contains at least one parsed citation.
        """
        # Retrieve relevant chunks and the formatted prompt context.
        chunks, context = self.retriever.retrieve_with_context(
            query.question,
            query.repo_id,
            query.top_k,
        )

        # Nothing retrieved: return the canned refusal, ungrounded.
        if not chunks:
            return Response(
                answer=build_no_context_response(),
                citations=[],
                retrieved_chunks=[],
                grounded=False,
                query_id=query.id,
            )

        prompt = build_prompt(query.question, context)

        if self.provider == "local":
            answer = self._generate_local(prompt)
        else:
            answer = self._generate_api(prompt)

        citations = self.citation_parser.parse_citations(answer)

        # Grounded iff the model actually cited retrieved locations.
        grounded = len(citations) > 0 and len(chunks) > 0

        return Response(
            answer=answer,
            citations=citations,
            retrieved_chunks=chunks,
            grounded=grounded,
            query_id=query.id,
        )

    def _generate_api(self, prompt: str) -> str:
        """Generate a completion via the remote OpenAI-compatible API."""
        client = self._get_api_client()

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]

        response = client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            max_tokens=self.settings.models.llm_max_new_tokens,
            temperature=self.settings.models.llm_temperature,
            top_p=self.settings.models.llm_top_p,
        )

        # Some providers return None content (e.g. filtered output); avoid
        # crashing on .strip() in that case.
        content = response.choices[0].message.content
        return (content or "").strip()

    def _generate_local(self, prompt: str) -> str:
        """Generate a completion with the locally loaded model."""
        import torch

        self._load_local_model()

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]

        text = self._local_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = self._local_tokenizer(text, return_tensors="pt").to(self._local_model.device)

        with torch.no_grad():
            outputs = self._local_model.generate(
                **inputs,
                max_new_tokens=self.settings.models.llm_max_new_tokens,
                temperature=self.settings.models.llm_temperature,
                top_p=self.settings.models.llm_top_p,
                do_sample=True,
                pad_token_id=self._local_tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        generated = outputs[0][inputs["input_ids"].shape[1]:]
        response = self._local_tokenizer.decode(generated, skip_special_tokens=True)

        return response.strip()

    def unload(self) -> None:
        """Unload local models and free GPU memory (no-op for API providers)."""
        if self._local_model is not None:
            del self._local_model
            self._local_model = None
        if self._local_tokenizer is not None:
            del self._local_tokenizer
            self._local_tokenizer = None

        # torch may be absent in API-only deployments; unload must not crash.
        try:
            import torch
        except ImportError:
            pass
        else:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        logger.info("Models unloaded")
# System prompt that enforces grounded, citation-backed answers and an
# explicit refusal when the retrieved chunks are off-topic.
SYSTEM_PROMPT = """You are a code assistant that answers questions about a repository.

CRITICAL RULES - YOU MUST FOLLOW THESE:

1. FIRST, check if the retrieved chunks are RELEVANT to the question being asked.
   - If the chunks discuss completely different topics than the question, respond:
     "I could not find information about this in the indexed repository."
   - Do NOT try to make connections that don't exist.

2. Only answer based on EXPLICIT information in the provided code chunks.
   - Every claim MUST have a citation: [file_path:start_line-end_line]
   - If you cannot cite it, do NOT say it.

3. NEVER HALLUCINATE:
   - Do NOT invent code, functions, files, or behaviors
   - Do NOT answer questions about topics not in the chunks (e.g., if asked about "food inventory" but chunks are about "code embeddings", say you don't have that information)
   - Do NOT make assumptions about what the code might do

4. When to refuse:
   - The question is about something not covered in the chunks
   - The chunks are about a completely different topic
   - You would need to guess or speculate

CITATION FORMAT: [file_path:start_line-end_line]
Example: [src/auth.py:45-78]

RESPONSE FORMAT:
- Start with a direct answer IF AND ONLY IF the chunks contain relevant information
- Include citations inline with every factual statement
- If showing code, quote it exactly from the chunks"""


def build_prompt(question: str, context: str) -> str:
    """Assemble the user prompt from retrieved context and the question.

    Args:
        question: The user's question.
        context: Retrieved code chunks, already formatted for the model.

    Returns:
        The complete prompt text ending with an empty "Answer" section for
        the model to fill in.
    """
    sections = (
        "Based on the following code chunks from the repository, answer the question.",
        "## Retrieved Code Chunks",
        context,
        "## Question",
        question,
        "## Answer",
        "",
    )
    return "\n\n".join(sections)


def build_no_context_response() -> str:
    """Canned refusal used when retrieval finds nothing relevant."""
    return "I could not find information about this in the indexed repository."


def build_clarification_prompt(question: str, ambiguities: list[str]) -> str:
    """Ask the user to resolve listed ambiguities before answering."""
    bullet_lines = "\n".join("- " + item for item in ambiguities)
    return (
        f'Your question "{question}" is ambiguous. Could you clarify:\n'
        "\n"
        f"{bullet_lines}\n"
        "\n"
        "Please provide more specific details so I can give you an accurate answer."
    )
class EmbeddingGenerator:
    """Generates embeddings using nomic-embed-text v1.5.

    The SentenceTransformer model is loaded lazily on first use; settings
    provide defaults for model name, device, and batch size.
    """

    def __init__(
        self,
        model_name: Optional[str] = None,
        device: Optional[str] = None,
        batch_size: Optional[int] = None,
    ) -> None:
        """Initialize, falling back to configured defaults for any omitted argument."""
        settings = get_settings()
        self.model_name = model_name or settings.models.embedding_name
        self.device = self._resolve_device(device or settings.models.embedding_device)
        self.batch_size = batch_size or settings.models.embedding_batch_size
        self._model: Optional[SentenceTransformer] = None

    def _resolve_device(self, device: str) -> str:
        """Resolve the requested device, falling back to CPU when CUDA is absent."""
        if device == "auto":
            return "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cuda" and not torch.cuda.is_available():
            logger.warning("CUDA not available, falling back to CPU for embeddings")
            return "cpu"
        return device

    @property
    def model(self) -> SentenceTransformer:
        # Lazy-load on first access.
        if self._model is None:
            self._load_model()
        return self._model

    def _load_model(self) -> None:
        """Load the SentenceTransformer checkpoint onto the resolved device."""
        logger.info("Loading embedding model", model=self.model_name, device=self.device)
        self._model = SentenceTransformer(
            self.model_name,
            device=self.device,
            trust_remote_code=True,
        )
        logger.info("Embedding model loaded")

    @staticmethod
    def _task_prefix(is_query: bool) -> str:
        """Return the nomic-embed task prefix for queries vs. documents."""
        return "search_query" if is_query else "search_document"

    def generate_embedding(self, text: str, is_query: bool = False) -> list[float]:
        """Embed one text, returning a normalized vector as a plain list."""
        prefixed = f"{self._task_prefix(is_query)}: {text}"
        vector = self.model.encode(prefixed, convert_to_numpy=True, normalize_embeddings=True)
        return vector.tolist()

    def generate_embeddings(
        self,
        texts: list[str],
        is_query: bool = False,
        show_progress: bool = True,
    ) -> list[list[float]]:
        """Embed many texts in batches, returning normalized vectors as lists."""
        prefix = self._task_prefix(is_query)
        prefixed = [f"{prefix}: {t}" for t in texts]

        vectors = self.model.encode(
            prefixed,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=show_progress,
        )
        return vectors.tolist()

    def embed_chunks(
        self,
        chunks: list[Chunk],
        show_progress: bool = True,
    ) -> list[Chunk]:
        """Embed every chunk in place (sets ``chunk.embedding``) and return the list."""
        if not chunks:
            return []

        logger.info("Generating embeddings", num_chunks=len(chunks))

        texts = [self._chunk_to_text(c) for c in chunks]
        vectors = self.generate_embeddings(texts, is_query=False, show_progress=show_progress)
        for chunk, vector in zip(chunks, vectors):
            chunk.embedding = vector

        logger.info("Embeddings generated", num_chunks=len(chunks))
        return chunks

    def embed_chunks_iter(
        self,
        chunks: Iterator[Chunk],
        batch_size: Optional[int] = None,
    ) -> Iterator[Chunk]:
        """Stream chunks through embedding in batches, yielding as each batch completes."""
        effective_batch = batch_size or self.batch_size
        pending: list[Chunk] = []

        for chunk in chunks:
            pending.append(chunk)
            if len(pending) >= effective_batch:
                yield from self._embed_batch(pending)
                pending = []

        # Flush the final partial batch.
        if pending:
            yield from self._embed_batch(pending)

    def _embed_batch(self, batch: list[Chunk]) -> Iterator[Chunk]:
        """Embed one batch in place and yield its chunks."""
        texts = [self._chunk_to_text(c) for c in batch]
        vectors = self.generate_embeddings(texts, is_query=False, show_progress=False)
        for chunk, vector in zip(batch, vectors):
            chunk.embedding = vector
            yield chunk

    def _chunk_to_text(self, chunk: Chunk) -> str:
        """Render a chunk as embedding input: type/name, signature, docstring, path, code."""
        parts: list[str] = []
        if chunk.name:
            parts.append(f"{chunk.chunk_type.value}: {chunk.name}")
        if chunk.metadata.signature:
            parts.append(f"Signature: {chunk.metadata.signature}")
        if chunk.metadata.docstring:
            # Cap the docstring so one chunk can't dominate the input.
            parts.append(f"Docstring: {chunk.metadata.docstring[:200]}")
        parts.append(f"File: {chunk.file_path}")
        parts.append(chunk.content)
        return "\n".join(parts)

    def unload(self) -> None:
        """Drop the model and release cached GPU memory."""
        if self._model is not None:
            del self._model
            self._model = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        logger.info("Embedding model unloaded")
class VectorStore:
    """ChromaDB vector store for chunk storage and retrieval.

    Client and collection are created lazily; all chunk metadata is flattened
    to the str/int/float/bool values ChromaDB accepts.
    """

    def __init__(
        self,
        persist_directory: Optional[Path] = None,
        collection_name: Optional[str] = None,
    ) -> None:
        """Initialize with optional overrides for the configured store location."""
        settings = get_settings()
        self.persist_directory = persist_directory or settings.vectorstore.persist_directory
        self.collection_name = collection_name or settings.vectorstore.collection_name
        self._client: Optional[chromadb.PersistentClient] = None
        self._collection: Optional[chromadb.Collection] = None

    @property
    def client(self) -> chromadb.PersistentClient:
        # Lazy client creation on first access.
        if self._client is None:
            self._init_client()
        return self._client

    @property
    def collection(self) -> chromadb.Collection:
        # Lazy collection creation on first access.
        if self._collection is None:
            self._init_collection()
        return self._collection

    def _init_client(self) -> None:
        """Create the persistent ChromaDB client (telemetry disabled)."""
        logger.info("Initializing ChromaDB", path=str(self.persist_directory))
        self.persist_directory.mkdir(parents=True, exist_ok=True)
        self._client = chromadb.PersistentClient(
            path=str(self.persist_directory),
            settings=Settings(anonymized_telemetry=False),
        )

    def _init_collection(self) -> None:
        """Get or create the chunk collection using cosine distance."""
        self._collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": "cosine"},
        )
        logger.info("Collection initialized", name=self.collection_name)

    def add_chunks(self, chunks: list[Chunk]) -> int:
        """Add embedded chunks to the collection.

        Chunks without an embedding are skipped entirely. (Previously only the
        embeddings list was filtered, which misaligned the parallel
        ids/embeddings/documents/metadatas lists passed to ChromaDB.)

        Returns:
            Number of chunks actually added.
        """
        if not chunks:
            return 0

        # Filter un-embedded chunks up front so every parallel list below
        # stays aligned.
        embedded = [c for c in chunks if c.embedding]
        skipped = len(chunks) - len(embedded)
        if skipped:
            logger.warning("Skipping chunks without embeddings", count=skipped)
        if not embedded:
            return 0

        ids = [c.id for c in embedded]
        embeddings = [c.embedding for c in embedded]
        documents = [c.content for c in embedded]

        # ChromaDB metadata values must be str/int/float/bool: drop the
        # embedding (stored separately), content (stored as document), and
        # any None values.
        cleaned_metadatas = []
        for chunk in embedded:
            meta = chunk.to_dict()
            meta.pop("embedding", None)
            meta.pop("content", None)
            cleaned_metadatas.append({k: v for k, v in meta.items() if v is not None})

        self.collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=cleaned_metadatas,
        )

        logger.info("Chunks added to vector store", count=len(embedded))
        return len(embedded)

    def query(
        self,
        query_embedding: list[float],
        repo_id: str,
        top_k: int = 5,
        similarity_threshold: float = 0.0,
    ) -> list[tuple[Chunk, float]]:
        """Find the chunks most similar to *query_embedding* within one repo.

        Returns:
            (chunk, similarity) pairs with similarity >= similarity_threshold,
            where similarity = 1 - cosine distance (collection uses cosine).
        """
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            where={"repo_id": repo_id},
            include=["documents", "metadatas", "distances"],
        )

        chunks_with_scores = []
        if results["ids"] and results["ids"][0]:
            for i, chunk_id in enumerate(results["ids"][0]):
                # ChromaDB returns distances; convert to similarity for cosine.
                distance = results["distances"][0][i]
                similarity = 1 - distance

                if similarity >= similarity_threshold:
                    metadata = results["metadatas"][0][i]
                    metadata["id"] = chunk_id
                    metadata["content"] = results["documents"][0][i]
                    chunk = Chunk.from_dict(metadata)
                    chunks_with_scores.append((chunk, similarity))

        return chunks_with_scores

    def delete_repo_chunks(self, repo_id: str) -> int:
        """Delete all chunks belonging to a repository; return how many."""
        # include=[] fetches only ids, which is all deletion needs.
        results = self.collection.get(where={"repo_id": repo_id}, include=[])

        if results["ids"]:
            self.collection.delete(ids=results["ids"])
            count = len(results["ids"])
            logger.info("Deleted repo chunks", repo_id=repo_id, count=count)
            return count
        return 0

    def delete_file_chunks(self, repo_id: str, file_path: str) -> int:
        """Delete chunks for a specific file in a repository (for incremental updates)."""
        results = self.collection.get(
            where={"$and": [{"repo_id": repo_id}, {"file_path": file_path}]},
            include=[],
        )

        if results["ids"]:
            self.collection.delete(ids=results["ids"])
            count = len(results["ids"])
            logger.info("Deleted file chunks", repo_id=repo_id, file_path=file_path, count=count)
            return count
        return 0

    def get_indexed_files(self, repo_id: str) -> set[str]:
        """Return the set of file paths indexed for a repository."""
        results = self.collection.get(
            where={"repo_id": repo_id},
            include=["metadatas"],
        )

        files: set[str] = set()
        if results["metadatas"]:
            for metadata in results["metadatas"]:
                if "file_path" in metadata:
                    files.add(metadata["file_path"])
        return files

    def get_repo_chunk_count(self, repo_id: str) -> int:
        """Return the number of chunks stored for a repository."""
        results = self.collection.get(where={"repo_id": repo_id}, include=[])
        return len(results["ids"]) if results["ids"] else 0

    def get_all_repo_ids(self) -> list[str]:
        """Return the distinct repo_ids present in the collection."""
        results = self.collection.get(include=["metadatas"])
        repo_ids: set[str] = set()
        if results["metadatas"]:
            for metadata in results["metadatas"]:
                if "repo_id" in metadata:
                    repo_ids.add(metadata["repo_id"])
        return list(repo_ids)

    def clear(self) -> None:
        """Drop the whole collection; it will be recreated lazily on next use."""
        self.client.delete_collection(self.collection_name)
        self._collection = None
        logger.info("Collection cleared", name=self.collection_name)
@dataclass
class ChunkerConfig:
    """Chunker configuration (sizes are measured in characters, not tokens)."""
    chunk_size: int = 1500      # target maximum characters per text chunk
    chunk_overlap: int = 200    # characters shared between consecutive text chunks
    min_chunk_size: int = 50    # stripped chunks shorter than this are dropped


class CodeChunker:
    """Chunks code files into semantic units.

    Python documents are split into function/class/method chunks via
    Tree-sitter; other languages (or a missing Tree-sitter install) fall
    back to overlapping line-based text chunking.
    """

    def __init__(self, config: Optional[ChunkerConfig] = None) -> None:
        """Build a chunker, defaulting sizes from application settings."""
        settings = get_settings()
        self.config = config or ChunkerConfig(
            chunk_size=settings.ingestion.chunk_size,
            chunk_overlap=settings.ingestion.chunk_overlap,
        )
        self._tree_sitter_available = self._check_tree_sitter()

    def _check_tree_sitter(self) -> bool:
        """Return True when the tree-sitter Python grammar is importable."""
        try:
            import tree_sitter_python  # noqa: F401
            return True
        except ImportError:
            logger.warning("Tree-sitter not available, using text chunking")
            return False

    def chunk_document(self, document: Document) -> Iterator[Chunk]:
        """Yield chunks for a single document, semantic when possible."""
        if document.language == "python" and self._tree_sitter_available:
            yield from self._chunk_python(document)
        else:
            yield from self._chunk_text(document)

    def _chunk_python(self, document: Document) -> Iterator[Chunk]:
        """Parse Python source with Tree-sitter; fall back to text on any failure."""
        try:
            import tree_sitter_python as tspython
            from tree_sitter import Language, Parser

            py_language = Language(tspython.language())
            parser = Parser(py_language)
            tree = parser.parse(bytes(document.content, "utf-8"))

            yield from self._extract_python_chunks(tree.root_node, document)

        except Exception as e:
            logger.warning("Tree-sitter parsing failed, falling back to text", error=str(e))
            yield from self._chunk_text(document)

    def _extract_python_chunks(self, node, document: Document) -> Iterator[Chunk]:
        """Emit one chunk per top-level function/class, plus one per method.

        NOTE(review): current tree-sitter-python emits ``async def`` as a plain
        ``function_definition`` node; ``async_function_definition`` is kept for
        compatibility with older grammars -- confirm against the pinned version.
        """
        lines = document.content.split("\n")
        semantic_types = ("function_definition", "async_function_definition")
        found_semantic = False

        for child in node.children:
            if child.type in semantic_types:
                found_semantic = True
                yield self._create_chunk_from_node(child, document, lines, ChunkType.FUNCTION)
            elif child.type == "class_definition":
                found_semantic = True
                yield self._create_chunk_from_node(child, document, lines, ChunkType.CLASS)
                # Also emit each method of the class as its own chunk.
                for class_child in child.children:
                    if class_child.type == "block":
                        for block_child in class_child.children:
                            if block_child.type in semantic_types:
                                yield self._create_chunk_from_node(
                                    block_child, document, lines, ChunkType.METHOD,
                                    parent_name=self._get_node_name(child),
                                )

        # Module with no top-level defs/classes: fall back to text chunking.
        if not found_semantic:
            yield from self._chunk_text(document)

    def _create_chunk_from_node(
        self,
        node,
        document: Document,
        lines: list[str],
        chunk_type: ChunkType,
        parent_name: Optional[str] = None,
    ) -> Chunk:
        """Build a Chunk covering a tree-sitter node's full line span."""
        start_line = node.start_point[0] + 1  # tree-sitter points are 0-based
        end_line = node.end_point[0] + 1
        content = "\n".join(lines[start_line - 1:end_line])
        name = self._get_node_name(node)
        signature = self._get_signature(node, lines)
        docstring = self._get_docstring(node, lines)

        metadata = ChunkMetadata(
            file_path=document.file_path,
            start_line=start_line,
            end_line=end_line,
            chunk_type=chunk_type,
            language=document.language,
            name=name,
            signature=signature,
            docstring=docstring,
            parent_name=parent_name,
        )

        return Chunk(content=content, metadata=metadata, repo_id=document.repo_id)

    def _get_node_name(self, node) -> Optional[str]:
        """Return the identifier of a def/class node, or None if absent."""
        for child in node.children:
            if child.type == "identifier":
                return child.text.decode("utf-8")
        return None

    def _get_signature(self, node, lines: list[str]) -> Optional[str]:
        """Return the first source line of a function definition (its signature)."""
        if node.type in ("function_definition", "async_function_definition"):
            start_line = node.start_point[0]
            return lines[start_line].strip()
        return None

    def _get_docstring(self, node, lines: list[str]) -> Optional[str]:
        """Return the raw docstring of a def/class body, quotes stripped.

        The strip uses a character set, so any leading/trailing quote chars are
        removed; string prefixes (r, b, f) are not handled.
        """
        for child in node.children:
            if child.type == "block":
                for block_child in child.children:
                    if block_child.type == "expression_statement":
                        for expr_child in block_child.children:
                            if expr_child.type == "string":
                                return expr_child.text.decode("utf-8").strip('"""\'\'\'')
        return None

    def _chunk_text(self, document: Document) -> Iterator[Chunk]:
        """Split a document into overlapping line-aligned text chunks.

        Chunk boundaries are measured in characters but snapped to whole
        lines; consecutive chunks share roughly ``chunk_overlap`` characters.
        """
        lines = document.content.split("\n")
        chunk_size = self.config.chunk_size
        overlap = self.config.chunk_overlap

        current_start = 0
        while current_start < len(lines):
            # Grow the chunk line by line until the character budget is spent.
            char_count = 0
            end_line = current_start
            while end_line < len(lines) and char_count < chunk_size:
                char_count += len(lines[end_line]) + 1  # +1 for the newline
                end_line += 1

            content = "\n".join(lines[current_start:end_line])

            if len(content.strip()) >= self.config.min_chunk_size:
                metadata = ChunkMetadata(
                    file_path=document.file_path,
                    start_line=current_start + 1,  # reported line numbers are 1-based
                    end_line=end_line,
                    chunk_type=ChunkType.TEXT,
                    language=document.language,
                )
                yield Chunk(content=content, metadata=metadata, repo_id=document.repo_id)

            if end_line >= len(lines):
                break  # final line consumed; done

            # Step back whole lines until ~`overlap` characters are reused.
            overlap_lines = 0
            overlap_chars = 0
            while overlap_lines < end_line - current_start and overlap_chars < overlap:
                overlap_chars += len(lines[end_line - 1 - overlap_lines]) + 1
                overlap_lines += 1

            # BUGFIX: guarantee forward progress. When the overlap covered the
            # whole chunk (overlap >= chunk content size), the window stalled
            # and looped forever (or silently stopped at line 0). Advance at
            # least one line instead.
            next_start = end_line - overlap_lines
            if next_start <= current_start:
                next_start = current_start + 1
            current_start = next_start

    def chunk_files(self, documents: Iterator[Document]) -> Iterator[Chunk]:
        """Chunk a stream of documents, logging per-document and total counts."""
        total_chunks = 0
        for doc in documents:
            doc_chunks = 0
            for chunk in self.chunk_document(doc):
                doc_chunks += 1
                total_chunks += 1
                yield chunk
            logger.debug("Document chunked", file=doc.file_path, chunks=doc_chunks)
        logger.info("Chunking complete", total_chunks=total_chunks)
class FileFilter:
    """Filters files for indexing using glob patterns plus size/binary checks."""

    def __init__(
        self,
        include_patterns: Optional[list[str]] = None,
        exclude_patterns: Optional[list[str]] = None,
        max_file_size_kb: Optional[int] = None,
    ) -> None:
        """Initialize the filter; unspecified options fall back to settings."""
        settings = get_settings()
        self.include_patterns = include_patterns or settings.ingestion.include_patterns
        self.exclude_patterns = exclude_patterns or settings.ingestion.exclude_patterns
        self.max_file_size = (max_file_size_kb or settings.ingestion.max_file_size_kb) * 1024

    def should_include(self, file_path: Path, repo_root: Path) -> bool:
        """Return True when the file matches an include pattern and no exclude pattern.

        Both the repo-relative path and the bare filename are tested against
        every pattern; exclusions take precedence over inclusions.
        """
        relative = str(file_path.relative_to(repo_root))
        candidates = (relative, file_path.name)

        if any(
            fnmatch.fnmatch(candidate, pattern)
            for pattern in self.exclude_patterns
            for candidate in candidates
        ):
            return False

        return any(
            fnmatch.fnmatch(candidate, pattern)
            for pattern in self.include_patterns
            for candidate in candidates
        )

    def check_file_size(self, file_path: Path) -> bool:
        """Return True when the file is small enough to index (False on stat errors)."""
        try:
            size = file_path.stat().st_size
        except OSError:
            return False
        return size <= self.max_file_size

    def is_binary(self, file_path: Path) -> bool:
        """Heuristic: a NUL byte in the first 8 KiB marks the file as binary.

        Unreadable files are reported as binary so they get skipped.
        """
        try:
            with open(file_path, "rb") as handle:
                return b"\x00" in handle.read(8192)
        except (OSError, IOError):
            return True

    def filter_files(self, repo_root: Path) -> Iterator[Path]:
        """Walk the repository tree and yield files passing every filter."""
        included_count = 0
        skipped_count = 0

        for candidate in repo_root.rglob("*"):
            if not candidate.is_file():
                continue
            if not self.should_include(candidate, repo_root):
                skipped_count += 1
                continue
            if not self.check_file_size(candidate):
                logger.debug("Skipping large file", path=str(candidate))
                skipped_count += 1
                continue
            if self.is_binary(candidate):
                logger.debug("Skipping binary file", path=str(candidate))
                skipped_count += 1
                continue
            included_count += 1
            yield candidate

        logger.info("File filtering complete", included=included_count, skipped=skipped_count)
class LoaderError(Exception):
    """Repository loading error."""
    pass


class RepositoryLoader:
    """Loads repositories from GitHub.

    Shallow-clones (depth=1, single branch) into a local cache directory laid
    out as ``<cache_dir>/<owner>/<name>``; repeat requests update in place.
    """

    def __init__(self, cache_dir: Optional[Path] = None) -> None:
        # Cache root defaults to the configured repos cache directory and is
        # created eagerly so later clone paths only need the parent dir.
        settings = get_settings()
        self.cache_dir = cache_dir or settings.ingestion.repos_cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_repo_path(self, repo_info: GitHubRepoInfo) -> Path:
        """Return the local cache path for a repository (owner/name layout)."""
        return self.cache_dir / repo_info.owner / repo_info.name

    def clone_repository(
        self,
        repo_info: GitHubRepoInfo,
        branch: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Clone (or update) a repository, trying candidate branches in order.

        Branch preference: the explicit ``branch`` argument, then the branch
        recorded on ``repo_info``, then ``main``, then ``master``.

        Raises:
            LoaderError: when none of the candidate branches can be cloned.
        """
        repo_path = self.get_repo_path(repo_info)

        # Try branches in order: specified, repo default, main, master
        # (duplicates are skipped so each candidate is attempted once).
        branches_to_try = []
        if branch:
            branches_to_try.append(branch)
        if repo_info.branch and repo_info.branch not in branches_to_try:
            branches_to_try.append(repo_info.branch)
        if "main" not in branches_to_try:
            branches_to_try.append("main")
        if "master" not in branches_to_try:
            branches_to_try.append("master")

        if repo_path.exists():
            # Cached copy present: update it instead of re-cloning.
            # NOTE(review): only the first candidate branch is attempted on
            # the update path -- confirm that is the intended behavior.
            logger.info("Repository exists, updating", path=str(repo_path))
            return self._update_repository(repo_path, branches_to_try[0], progress_callback)

        if progress_callback:
            progress_callback("Cloning repository", 0)

        repo_path.parent.mkdir(parents=True, exist_ok=True)

        last_error = None
        for try_branch in branches_to_try:
            try:
                logger.info("Trying to clone", url=repo_info.clone_url, branch=try_branch)
                # Shallow, single-branch clone keeps the local cache small.
                Repo.clone_from(
                    repo_info.clone_url,
                    repo_path,
                    branch=try_branch,
                    depth=1,
                    single_branch=True,
                )
                if progress_callback:
                    progress_callback("Clone complete", 100)
                logger.info("Repository cloned", path=str(repo_path), branch=try_branch)
                return repo_path
            except GitCommandError as e:
                last_error = e
                logger.debug("Branch not found, trying next", branch=try_branch)
                # Clean up partial clone if any
                import shutil
                shutil.rmtree(repo_path, ignore_errors=True)
                continue

        raise LoaderError(f"Failed to clone repository (tried branches: {branches_to_try}): {last_error}")

    def _update_repository(
        self,
        repo_path: Path,
        branch: str,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Fetch, checkout and pull ``branch`` in an existing local clone.

        Raises:
            LoaderError: when the update fails; the stale cache is removed
                first so the next attempt starts from a fresh clone.
        """
        try:
            repo = Repo(repo_path)
            if progress_callback:
                progress_callback("Fetching updates", 30)
            repo.remotes.origin.fetch()
            repo.git.checkout(branch)
            repo.remotes.origin.pull()
            if progress_callback:
                progress_callback("Update complete", 100)
            logger.info("Repository updated", path=str(repo_path))
            return repo_path
        except GitCommandError as e:
            # NOTE(review): despite the log text, this does NOT re-clone; it
            # deletes the cache and raises so the caller must retry -- confirm
            # callers handle LoaderError by calling clone_repository again.
            logger.warning("Update failed, re-cloning", error=str(e))
            import shutil
            shutil.rmtree(repo_path, ignore_errors=True)
            raise LoaderError(f"Failed to update, please re-clone: {e}")

    def is_cached(self, repo_info: GitHubRepoInfo) -> bool:
        """Return True when a local clone already exists for this repository."""
        return self.get_repo_path(repo_info).exists()

    def delete_cache(self, repo_info: GitHubRepoInfo) -> None:
        """Remove the cached clone for a repository, if present."""
        repo_path = self.get_repo_path(repo_info)
        if repo_path.exists():
            import shutil
            shutil.rmtree(repo_path)
            logger.info("Cache deleted", path=str(repo_path))
@dataclass
class GitHubRepoInfo:
    """Parsed GitHub repository information.

    Fields:
        owner: repository owner (user or organization)
        name: repository name, without any ``.git`` suffix
        url: canonical https URL of the repository
        branch: default branch, populated after an API accessibility check
    """

    owner: str
    name: str
    url: str
    branch: Optional[str] = None

    @property
    def full_name(self) -> str:
        """``owner/name`` form used in logs and error messages."""
        return f"{self.owner}/{self.name}"

    @property
    def clone_url(self) -> str:
        """HTTPS clone URL."""
        return f"https://github.com/{self.owner}/{self.name}.git"

    @property
    def api_url(self) -> str:
        """GitHub REST API endpoint for this repository."""
        return f"https://api.github.com/repos/{self.owner}/{self.name}"


class ValidationError(Exception):
    """URL validation error."""
    pass


class GitHubURLValidator:
    """Validates and parses GitHub repository URLs.

    Accepts https URLs, ssh (``git@``) URLs, and the ``owner/repo`` shorthand.
    """

    GITHUB_PATTERNS = [
        r"^https?://github\.com/(?P<owner>[^/]+)/(?P<name>[^/]+?)(?:\.git)?/?$",
        r"^git@github\.com:(?P<owner>[^/]+)/(?P<name>[^/]+?)(?:\.git)?$",
        r"^(?P<owner>[a-zA-Z0-9](?:[a-zA-Z0-9]|-(?=[a-zA-Z0-9])){0,38})/(?P<name>[a-zA-Z0-9._-]+)$",
    ]

    def __init__(self, timeout: float = 10.0) -> None:
        """Pre-compile URL patterns; ``timeout`` bounds API accessibility checks."""
        self.timeout = timeout
        self._patterns = [re.compile(p) for p in self.GITHUB_PATTERNS]

    def parse_url(self, url: str) -> GitHubRepoInfo:
        """Parse ``url`` into a GitHubRepoInfo.

        Raises:
            ValidationError: when the URL matches no supported form or the
                owner/name contain invalid characters.
        """
        url = url.strip()
        for pattern in self._patterns:
            match = pattern.match(url)
            if match:
                owner = match.group("owner")
                # BUGFIX: str.rstrip(".git") strips a *character set*, mangling
                # names such as "tagging" -> "taggin" (reachable through the
                # owner/repo shorthand pattern). removesuffix strips only a
                # literal trailing ".git".
                name = match.group("name").removesuffix(".git")
                if not self._is_valid_name(owner) or not self._is_valid_name(name):
                    raise ValidationError(f"Invalid owner or repository name: {url}")
                return GitHubRepoInfo(owner=owner, name=name, url=f"https://github.com/{owner}/{name}")
        raise ValidationError(f"Invalid GitHub URL: {url}. Expected: https://github.com/owner/repo")

    def _is_valid_name(self, name: str) -> bool:
        """Names must start alphanumeric, use only [A-Za-z0-9._-], and be <= 100 chars."""
        if not name or len(name) > 100:
            return False
        return bool(re.match(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$", name))

    async def validate_repository(self, url: str, check_accessibility: bool = True) -> GitHubRepoInfo:
        """Parse the URL and (optionally) verify the repo is public and reachable."""
        repo_info = self.parse_url(url)
        if check_accessibility:
            await self._check_repo_accessible(repo_info)
        logger.info("Repository validated", owner=repo_info.owner, name=repo_info.name)
        return repo_info

    async def _check_repo_accessible(self, repo_info: GitHubRepoInfo) -> None:
        """Query the GitHub API; raise ValidationError for any inaccessible repo.

        Side effect: records the repository's default branch on ``repo_info``.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                response = await client.get(repo_info.api_url)
                if response.status_code == 404:
                    raise ValidationError(f"Repository not found: {repo_info.full_name}")
                elif response.status_code == 403:
                    # NOTE(review): 403 can also indicate API rate limiting -- confirm.
                    raise ValidationError(f"Access denied: {repo_info.full_name}")
                elif response.status_code != 200:
                    raise ValidationError(f"HTTP error {response.status_code}: {repo_info.full_name}")
                data = response.json()
                if data.get("private", False):
                    raise ValidationError(f"Private repository not supported: {repo_info.full_name}")
                repo_info.branch = data.get("default_branch", "main")
            except httpx.TimeoutException:
                raise ValidationError(f"Timeout checking repository: {repo_info.full_name}")
            except httpx.RequestError as e:
                raise ValidationError(f"Network error: {str(e)}")

    def validate_url_sync(self, url: str) -> GitHubRepoInfo:
        """Synchronous parse-only validation (no network access)."""
        return self.parse_url(url)
def setup_logging(
    level: str = "INFO",
    json_format: bool = False,
    log_file: str | None = None,
) -> None:
    """Configure structured logging for the application.

    Args:
        level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        json_format: If True, output logs as JSON (for production)
        log_file: Optional file path for logging output
    """
    # Configure standard library logging; structlog renders the final line,
    # so the stdlib format is just the message.
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=getattr(logging, level.upper()),
    )

    # Add file handler if specified
    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(getattr(logging, level.upper()))
        logging.getLogger().addHandler(file_handler)

    # Shared processors for all outputs
    shared_processors: list[Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.UnicodeDecoder(),
    ]

    if json_format:
        # Production: JSON output
        processors: list[Processor] = [
            *shared_processors,
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ]
    else:
        # Development: Colored console output
        processors = [
            *shared_processors,
            structlog.dev.ConsoleRenderer(colors=True),
        ]

    structlog.configure(
        processors=processors,
        wrapper_class=structlog.stdlib.BoundLogger,
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )


def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Get a structured logger instance.

    Args:
        name: Logger name (usually __name__ of the calling module)

    Returns:
        Configured structlog logger
    """
    return structlog.get_logger(name)


class LogContext:
    """Context manager for adding temporary context to logs."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize with context variables."""
        self.context = kwargs
        # Mapping of var name -> contextvars Token, captured on __enter__.
        self._tokens: Any = None

    def __enter__(self) -> "LogContext":
        """Bind context variables, keeping the reset tokens."""
        self._tokens = structlog.contextvars.bind_contextvars(**self.context)
        return self

    def __exit__(self, *args: Any) -> None:
        """Restore the previous values of the bound context variables.

        BUGFIX: the previous implementation unbound the keys outright, which
        also wiped values that were bound *before* entering the block.
        bind_contextvars returns reset tokens; reset_contextvars restores
        the prior state exactly.
        """
        structlog.contextvars.reset_contextvars(**self._tokens)


def log_operation(
    operation: str,
    **kwargs: Any,
) -> LogContext:
    """Create a logging context for an operation.

    Usage:
        with log_operation("indexing", repo_id="123"):
            # All logs within this block will include repo_id
            logger.info("Starting indexing")
    """
    return LogContext(operation=operation, **kwargs)
def create_app() -> FastAPI:
    """Create and configure the FastAPI application.

    Wires up CORS, the /health endpoint, the versioned REST API, and --
    best-effort, each wrapped in try/except -- the MCP server (/mcp) and
    the Gradio UI (/).
    """
    app = FastAPI(
        title=settings.app_name,
        version=settings.app_version,
        description="RAG-based Q&A system for code repositories with verifiable citations",
        docs_url="/docs",
        redoc_url="/redoc",
        lifespan=lifespan,
    )

    # CORS middleware
    # NOTE(review): wildcard origins combined with allow_credentials=True is
    # disallowed by the CORS spec, so credentialed cross-origin requests will
    # not work as-is -- confirm whether credentials are actually needed.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Health check endpoint
    @app.get("/health")
    async def health_check() -> dict:
        """Health check endpoint."""
        return {
            "status": "healthy",
            "app": settings.app_name,
            "version": settings.app_version,
        }

    # Register API routes (imported here, inside the factory, rather than at
    # module top level).
    from coderag.api.routes import router as api_router

    app.include_router(api_router, prefix="/api/v1")

    # Mount MCP server -- optional: the app still starts without it.
    try:
        from coderag.mcp.server import create_mcp_server

        mcp_server = create_mcp_server()
        mcp_app = mcp_server.streamable_http_app()
        app.mount("/mcp", mcp_app)
        logger.info("MCP server mounted at /mcp")
    except ImportError as e:
        logger.warning("MCP server not available", error=str(e))
    except Exception as e:
        logger.error("Failed to mount MCP server", error=str(e))

    # Mount Gradio UI -- optional; mounted at "/" last, after the API routes
    # and /mcp have been registered on `app`.
    try:
        from coderag.ui.app import create_gradio_app
        import gradio as gr

        gradio_app = create_gradio_app()
        app = gr.mount_gradio_app(app, gradio_app, path="/")
        logger.info("Gradio UI mounted at /")
    except ImportError as e:
        logger.warning("Gradio UI not available", error=str(e))
    except Exception as e:
        logger.error("Failed to mount Gradio UI", error=str(e))

    return app


def main() -> None:
    """Run the application."""
    app = create_app()

    logger.info(
        "Starting server",
        host=settings.server.host,
        port=settings.server.port,
    )

    # NOTE(review): uvicorn ignores reload/workers when given an app
    # *instance* (they require an import string) -- confirm these settings
    # are expected to take effect here.
    uvicorn.run(
        app,
        host=settings.server.host,
        port=settings.server.port,
        reload=settings.server.reload,
        workers=settings.server.workers,
        log_level=settings.server.log_level,
    )


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Application interrupted by user")
    except Exception as e:
        # Last-resort diagnostics: log, print a visible banner, and hold the
        # terminal open so double-click launches can read the traceback.
        # NOTE(review): input() blocks forever in headless/CI runs -- confirm
        # this entry point is only used interactively.
        logger.error("Application crashed", error=str(e), exc_info=True)
        import traceback
        print("\n" + "="*80)
        print("FATAL ERROR:")
        print("="*80)
        traceback.print_exc()
        print("="*80)
        input("Press Enter to close...")  # Keep terminal open
"""CLI entry point for running MCP server in stdio mode."""

import sys
import os

# Suppress all stdout output except MCP protocol
# (stdio transport uses stdout for JSON-RPC framing; any stray output
# would corrupt the protocol stream).
os.environ["PYTHONUNBUFFERED"] = "1"

# Redirect any stray prints to stderr
# NOTE(review): _original_stdout is saved but never used, and `io` appears
# to be imported only for that purpose -- confirm whether an actual stdout
# redirection was intended here.
import io
_original_stdout = sys.stdout


def main():
    """Run the MCP server in stdio mode for Claude Desktop."""
    # Suppress logging to stdout - redirect to stderr
    import logging
    logging.basicConfig(
        level=logging.WARNING,
        stream=sys.stderr,
        format="%(message)s"
    )

    # Suppress structlog output: only CRITICAL and above pass the filter.
    import structlog
    structlog.configure(
        wrapper_class=structlog.make_filtering_bound_logger(logging.CRITICAL),
    )

    # Imported lazily so logging is configured before any module-level
    # loggers in coderag are created.
    from coderag.mcp.server import create_mcp_server

    mcp = create_mcp_server()
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()
from JSON file.""" + if self.repos_file.exists(): + try: + data = json.loads(self.repos_file.read_text()) + return {r["id"]: Repository.from_dict(r) for r in data} + except Exception as e: + logger.error("Failed to load repositories", error=str(e)) + return {} + + def _save_repositories(self) -> None: + """Save repositories to JSON file.""" + self.repos_file.parent.mkdir(parents=True, exist_ok=True) + data = [r.to_dict() for r in self.repositories.values()] + self.repos_file.write_text(json.dumps(data, indent=2)) + + def _reload_repositories(self) -> None: + """Reload repositories from disk (for consistency with UIHandlers).""" + self.repositories = self._load_repositories() + + def _find_repository(self, repo_id: str) -> Optional[Repository]: + """Find repository by full or partial ID.""" + self._reload_repositories() + for rid, repo in self.repositories.items(): + if rid == repo_id or rid.startswith(repo_id): + return repo + return None + + def _process_batch(self, chunks: list[Chunk]) -> int: + """Process a batch: embed + store + release memory.""" + if not chunks: + return 0 + + embedded = self.embedder.embed_chunks(chunks, show_progress=False) + self.vectorstore.add_chunks(embedded) + + # Release memory + del embedded + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return len(chunks) + + def _get_current_commit(self, repo_path: Path) -> str: + """Get the SHA of the current commit.""" + from git import Repo + + git_repo = Repo(repo_path) + return git_repo.head.commit.hexsha + + def _get_changed_files( + self, + repo_path: Path, + last_commit: str, + current_commit: str, + ) -> tuple[set[str], set[str], set[str]]: + """Get files that were added, modified, or deleted between commits.""" + from git import Repo + + git_repo = Repo(repo_path) + diff = git_repo.commit(last_commit).diff(current_commit) + + added: set[str] = set() + modified: set[str] = set() + deleted: set[str] = set() + + for d in diff: + if d.new_file: + added.add(d.b_path) + elif 
d.deleted_file: + deleted.add(d.a_path) + elif d.renamed: + deleted.add(d.a_path) + added.add(d.b_path) + else: + modified.add(d.b_path or d.a_path) + + return added, modified, deleted + + async def index_repository( + self, + url: str, + branch: str = "", + include_patterns: Optional[list[str]] = None, + exclude_patterns: Optional[list[str]] = None, + ) -> dict[str, Any]: + """Index a GitHub repository (non-streaming version).""" + try: + logger.info("MCP: Starting indexing", url=url, branch=branch) + repo_info = self.validator.parse_url(url) + branch = branch.strip() if branch else repo_info.branch or "main" + + # Create repository record + repo = Repository( + url=repo_info.url, + branch=branch, + status=RepositoryStatus.CLONING, + ) + self.repositories[repo.id] = repo + self._save_repositories() + + # Clone repository + logger.info("MCP: Cloning repository", url=url, branch=branch) + repo_path = self.loader.clone_repository(repo_info, branch) + repo.clone_path = repo_path + repo.status = RepositoryStatus.INDEXING + self._save_repositories() + + # Setup filter with custom patterns + file_filter = FileFilter( + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + ) + + # Process files + logger.info("MCP: Filtering files", repo_path=str(repo_path)) + files = list(file_filter.filter_files(repo_path)) + file_count = len(files) + logger.info("MCP: Files to process", count=file_count) + + # Delete existing chunks for this repo (re-indexing) + self.vectorstore.delete_repo_chunks(repo.id) + + # Index all files + total_chunks = 0 + batch: list[Chunk] = [] + batch_size = self.settings.ingestion.batch_size + + for file_path in files: + try: + doc = Document.from_file(file_path, repo_path, repo.id) + for chunk in self.chunker.chunk_document(doc): + chunk.repo_id = repo.id + batch.append(chunk) + + if len(batch) >= batch_size: + total_chunks += self._process_batch(batch) + batch = [] + except Exception as e: + logger.warning("Failed to process file", 
path=str(file_path), error=str(e)) + + # Process final batch + if batch: + total_chunks += self._process_batch(batch) + + # Save commit for incremental updates + try: + repo.last_commit = self._get_current_commit(repo_path) + except Exception: + repo.last_commit = None + + # Update repository status + repo.chunk_count = total_chunks + repo.indexed_at = datetime.now() + repo.status = RepositoryStatus.READY + self._save_repositories() + + logger.info("MCP: Indexing complete", repo_id=repo.id, chunks=total_chunks) + + return { + "success": True, + "repo_id": repo.id, + "name": repo.full_name, + "branch": repo.branch, + "files_processed": file_count, + "chunks_indexed": total_chunks, + } + + except ValidationError as e: + logger.error("MCP: Validation error", error=str(e)) + return {"success": False, "error": f"Validation error: {str(e)}"} + except Exception as e: + logger.error("MCP: Indexing failed", error=str(e), exc_info=True) + if "repo" in locals(): + repo.status = RepositoryStatus.ERROR + repo.error_message = str(e) + self._save_repositories() + return {"success": False, "error": str(e)} + + async def query_code( + self, + repo_id: str, + question: str, + top_k: int = 5, + ) -> dict[str, Any]: + """Ask a question about a repository.""" + repo = self._find_repository(repo_id) + + if not repo: + return { + "answer": "", + "citations": [], + "evidence": [], + "grounded": False, + "error": f"Repository not found: {repo_id}", + } + + if repo.status != RepositoryStatus.READY: + return { + "answer": "", + "citations": [], + "evidence": [], + "grounded": False, + "error": f"Repository not ready: status is {repo.status.value}", + } + + try: + # Lazy load generator + if self.generator is None: + self.generator = ResponseGenerator() + + query = Query( + question=question.strip(), + repo_id=repo.id, + top_k=int(top_k), + ) + + response = self.generator.generate(query) + + return { + "answer": response.answer, + "citations": response.citations, + "evidence": [ + { + "file": 
chunk.file_path, + "start_line": chunk.start_line, + "end_line": chunk.end_line, + "content": chunk.content[:500], # Truncate for MCP response + "relevance": round(chunk.relevance_score or 0, 3), + } + for chunk in response.retrieved_chunks + ], + "grounded": response.grounded, + } + + except Exception as e: + logger.error("MCP: Query failed", error=str(e)) + return { + "answer": "", + "citations": [], + "evidence": [], + "grounded": False, + "error": str(e), + } + + async def list_repositories(self) -> dict[str, Any]: + """List all indexed repositories.""" + self._reload_repositories() + + repos = [] + for repo in self.repositories.values(): + repos.append({ + "id": repo.id, + "name": repo.full_name, + "branch": repo.branch, + "status": repo.status.value, + "chunk_count": repo.chunk_count, + "indexed_at": repo.indexed_at.isoformat() if repo.indexed_at else None, + }) + + return { + "repositories": repos, + "count": len(repos), + } + + async def get_repository_info(self, repo_id: str) -> dict[str, Any]: + """Get detailed repository information.""" + repo = self._find_repository(repo_id) + + if not repo: + return {"error": f"Repository not found: {repo_id}"} + + # Get indexed files from vectorstore + indexed_files: list[str] = [] + try: + files = self.vectorstore.get_indexed_files(repo.id) + indexed_files = list(files) if files else [] + except Exception: + pass + + return { + "id": repo.id, + "name": repo.name, + "full_name": repo.full_name, + "url": repo.url, + "branch": repo.branch, + "status": repo.status.value, + "chunk_count": repo.chunk_count, + "indexed_at": repo.indexed_at.isoformat() if repo.indexed_at else None, + "last_commit": repo.last_commit, + "indexed_files": indexed_files, + "error_message": repo.error_message, + } + + async def delete_repository(self, repo_id: str) -> dict[str, Any]: + """Delete an indexed repository.""" + repo = self._find_repository(repo_id) + + if not repo: + return {"success": False, "error": f"Repository not found: 
{repo_id}"} + + try: + # Get chunk count before deletion + chunk_count = self.vectorstore.get_repo_chunk_count(repo.id) + + # Delete from vector store + self.vectorstore.delete_repo_chunks(repo.id) + + # Delete cached repo + try: + self.loader.delete_cache( + type("RepoInfo", (), {"owner": repo.owner, "name": repo.name})() + ) + except Exception: + pass + + # Remove from records + del self.repositories[repo.id] + self._save_repositories() + + logger.info("MCP: Repository deleted", repo_id=repo.id) + + return { + "success": True, + "repo_id": repo.id, + "name": repo.full_name, + "chunks_deleted": chunk_count, + } + + except Exception as e: + logger.error("MCP: Delete failed", error=str(e)) + return {"success": False, "error": str(e)} + + async def update_repository(self, repo_id: str) -> dict[str, Any]: + """Incremental update of a repository.""" + repo = self._find_repository(repo_id) + + if not repo: + return {"success": False, "error": f"Repository not found: {repo_id}"} + + if not repo.last_commit: + return { + "success": False, + "error": "No previous indexing found. Please re-index the full repository.", + } + + if not repo.clone_path or not Path(repo.clone_path).exists(): + return {"success": False, "error": "Repository cache not found. 
Please re-index."} + + try: + repo_path = Path(repo.clone_path) + + # Update local repository + logger.info("MCP: Updating local repository", repo_id=repo.id) + self.loader._update_repository(repo_path, repo.branch, None) + + current_commit = self._get_current_commit(repo_path) + + if current_commit == repo.last_commit: + return { + "success": True, + "message": "Repository is already up to date", + "files_changed": 0, + "chunks_added": 0, + "chunks_deleted": 0, + } + + added, modified, deleted = self._get_changed_files( + repo_path, repo.last_commit, current_commit + ) + + logger.info( + "MCP: Changes detected", + added=len(added), + modified=len(modified), + deleted=len(deleted), + ) + + # Delete chunks for deleted/modified files + chunks_deleted = 0 + for file_path in deleted | modified: + count = self.vectorstore.delete_file_chunks(repo.id, file_path) + chunks_deleted += count if count else 0 + + # Index new/modified files + files_to_index = [] + file_filter = FileFilter() + for file_path in added | modified: + full_path = repo_path / file_path + if full_path.exists() and file_filter.should_include(full_path, repo_path): + files_to_index.append(full_path) + + new_chunks = 0 + if files_to_index: + batch_size = self.settings.ingestion.batch_size + batch: list[Chunk] = [] + + for file_path in files_to_index: + try: + doc = Document.from_file(file_path, repo_path, repo.id) + for chunk in self.chunker.chunk_document(doc): + chunk.repo_id = repo.id + batch.append(chunk) + + if len(batch) >= batch_size: + new_chunks += self._process_batch(batch) + batch = [] + except Exception as e: + logger.warning("Failed to process file", path=str(file_path), error=str(e)) + + if batch: + new_chunks += self._process_batch(batch) + + # Update metadata + repo.last_commit = current_commit + repo.indexed_at = datetime.now() + repo.chunk_count = self.vectorstore.get_repo_chunk_count(repo.id) + self._save_repositories() + + return { + "success": True, + "files_changed": len(added | 
modified | deleted), + "files_added": len(added), + "files_modified": len(modified), + "files_deleted": len(deleted), + "chunks_added": new_chunks, + "chunks_deleted": chunks_deleted, + "total_chunks": repo.chunk_count, + } + + except Exception as e: + logger.error("MCP: Incremental update failed", error=str(e), exc_info=True) + return {"success": False, "error": str(e)} + + async def search_code( + self, + repo_id: str, + query: str, + top_k: int = 10, + file_filter: Optional[str] = None, + chunk_type: Optional[str] = None, + ) -> dict[str, Any]: + """Semantic code search without LLM generation.""" + repo = self._find_repository(repo_id) + + if not repo: + return {"results": [], "error": f"Repository not found: {repo_id}"} + + if repo.status != RepositoryStatus.READY: + return {"results": [], "error": f"Repository not ready: status is {repo.status.value}"} + + try: + # Generate query embedding + query_embedding = self.embedder.generate_embedding(query, is_query=True) + + # Search vectorstore (query returns list of (Chunk, score) tuples) + results = self.vectorstore.query( + query_embedding=query_embedding, + repo_id=repo.id, + top_k=top_k, + ) + + # Filter by file pattern if provided + if file_filter: + import fnmatch + + results = [(chunk, score) for chunk, score in results if fnmatch.fnmatch(chunk.file_path, file_filter)] + + # Filter by chunk type if provided + if chunk_type: + results = [(chunk, score) for chunk, score in results if chunk.chunk_type == chunk_type] + + return { + "results": [ + { + "file_path": chunk.file_path, + "start_line": chunk.start_line, + "end_line": chunk.end_line, + "chunk_type": chunk.chunk_type, + "content": chunk.content, + "relevance_score": round(score, 3), + } + for chunk, score in results[:top_k] + ], + "count": len(results), + } + + except Exception as e: + logger.error("MCP: Search failed", error=str(e)) + return {"results": [], "error": str(e)} + + +# Singleton pattern +_mcp_handlers: Optional[MCPHandlers] = None + + +def 
@mcp.prompt()
async def analyze_repository(repo_url: str) -> list[PromptMessage]:
    """Guide for comprehensive repository analysis.

    Args:
        repo_url: GitHub repository URL to analyze

    Returns:
        List of prompt messages guiding the analysis workflow
    """
    # Build the instruction text first, then wrap it in a single user message.
    text = f"""Please analyze the repository at {repo_url}. Follow these steps:

1. First, use the `index_repository` tool to index the repository:
   - URL: {repo_url}

2. Once indexed, use `get_repository_info` to understand the repository structure:
   - Note the number of files and chunks indexed
   - Review the list of indexed files

3. Use `query_code` to answer these questions:
   - What is the main purpose of this codebase?
   - What are the key components or modules?
   - What design patterns are used?
   - What external dependencies does it have?

4. Use `search_code` to find:
   - Entry points (main functions, CLI handlers)
   - Configuration handling
   - Core business logic

5. Provide a comprehensive summary including:
   - Purpose and functionality
   - Architecture overview
   - Key components
   - Notable patterns or practices
   - Potential areas for improvement
"""
    return [PromptMessage(role="user", content=TextContent(type="text", text=text))]


@mcp.prompt()
async def find_implementation(repo_id: str, feature: str) -> list[PromptMessage]:
    """Guide for finding feature implementations.

    Args:
        repo_id: Repository ID to search in
        feature: Feature or functionality to find

    Returns:
        List of prompt messages guiding the search workflow
    """
    text = f"""Please find the implementation of "{feature}" in repository {repo_id}. Follow these steps:

1. Use `search_code` to find relevant code:
   - Query: "{feature}"
   - Try different search terms if initial results aren't helpful

2. For each relevant result, use `query_code` to understand:
   - How is this feature implemented?
   - What are the key functions/classes involved?
   - What is the data flow?

3. Trace the implementation:
   - Find the entry point
   - Follow the call chain
   - Identify helper functions and utilities

4. Provide a detailed explanation:
   - Location of the implementation (files and line numbers)
   - Key components and their roles
   - How data flows through the system
   - Any notable patterns or design decisions
"""
    return [PromptMessage(role="user", content=TextContent(type="text", text=text))]
@mcp.prompt()
async def code_review(repo_id: str, focus_area: str = "") -> list[PromptMessage]:
    """Guide for performing code reviews.

    Args:
        repo_id: Repository ID to review
        focus_area: Optional specific area to focus on (e.g., "security", "performance")

    Returns:
        List of prompt messages guiding the review workflow
    """
    focus_text = f' with focus on "{focus_area}"' if focus_area else ""

    text = f"""Please perform a code review of repository {repo_id}{focus_text}. Follow these steps:

1. Use `get_repository_info` to understand the repository structure

2. Use `search_code` to find key areas to review:
   - Entry points and main functions
   - Error handling patterns
   - Data validation
   - Security-sensitive code (if applicable)

3. For each area, use `query_code` to analyze:
   - Code quality and readability
   - Error handling completeness
   - Security considerations
   - Performance implications
   - Test coverage (if tests are indexed)

4. Check for common issues:
   - Hardcoded credentials or secrets
   - SQL injection vulnerabilities
   - Input validation gaps
   - Resource leaks
   - Race conditions

5. Provide a structured review:
   - Summary of findings
   - Critical issues (if any)
   - Recommendations for improvement
   - Positive observations
   - Priority of fixes
"""
    return [PromptMessage(role="user", content=TextContent(type="text", text=text))]


@mcp.resource("repository://{repo_id}")
async def get_repository_resource(repo_id: str) -> str:
    """Get repository metadata as JSON.

    Args:
        repo_id: Repository ID (full or first 8 characters)

    Returns:
        Repository metadata as JSON string
    """
    info = await get_mcp_handlers().get_repository_info(repo_id=repo_id)
    return json.dumps(info, indent=2)


@mcp.resource("repositories://list")
async def get_repositories_list() -> str:
    """Get all repositories as JSON.

    Returns:
        List of all repositories as JSON string
    """
    listing = await get_mcp_handlers().list_repositories()
    return json.dumps(listing, indent=2)
# Shared FastMCP server instance; tools/resources/prompts register onto it.
mcp = FastMCP(
    name="CodeRAG",
    instructions="""CodeRAG is a RAG-based Q&A system for code repositories.

Available capabilities:
- Index GitHub repositories for code analysis
- Ask questions about indexed code with verifiable citations
- Search code semantically
- Manage indexed repositories

Use the tools to:
1. index_repository: Index a new GitHub repository
2. query_code: Ask questions about indexed code
3. search_code: Search code without LLM generation
4. list_repositories: See all indexed repositories
5. get_repository_info: Get details about a specific repository
6. update_repository: Incrementally update a repository
7. delete_repository: Remove an indexed repository

Use the prompts for guided workflows:
- analyze_repository: Comprehensive repository analysis
- find_implementation: Find feature implementations
- code_review: Perform code reviews
""",
)


def create_mcp_server() -> FastMCP:
    """Create and configure the MCP server with all tools, resources, and prompts."""
    # Importing these modules executes their @mcp.tool/@mcp.resource/@mcp.prompt
    # decorators, which registers everything on the shared `mcp` instance.
    # Import order is preserved so registration order stays stable.
    from coderag.mcp import tools  # noqa: F401
    from coderag.mcp import resources  # noqa: F401
    from coderag.mcp import prompts  # noqa: F401

    return mcp


@mcp.tool()
async def index_repository(
    url: str,
    branch: str = "",
    include_patterns: Optional[list[str]] = None,
    exclude_patterns: Optional[list[str]] = None,
) -> dict:
    """Index a GitHub repository for code Q&A.

    Args:
        url: GitHub repository URL (e.g., https://github.com/owner/repo)
        branch: Branch to index (defaults to main/master)
        include_patterns: File patterns to include (e.g., ["*.py", "*.ts"])
        exclude_patterns: File patterns to exclude (e.g., ["*_test.py"])

    Returns:
        dict with success status, repo_id, files_processed, and chunks_indexed
    """
    return await get_mcp_handlers().index_repository(
        url=url,
        branch=branch,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )
@mcp.tool()
async def query_code(
    repo_id: str,
    question: str,
    top_k: int = 5,
) -> dict:
    """Ask questions about indexed code with citations.

    Args:
        repo_id: Repository ID (full or first 8 characters)
        question: Question about the code
        top_k: Number of code chunks to retrieve for context (default: 5)

    Returns:
        dict with answer, citations, evidence, and grounded flag
    """
    return await get_mcp_handlers().query_code(
        repo_id=repo_id,
        question=question,
        top_k=top_k,
    )


@mcp.tool()
async def list_repositories() -> dict:
    """List all indexed repositories.

    Returns:
        dict with repositories array and count
    """
    return await get_mcp_handlers().list_repositories()


@mcp.tool()
async def get_repository_info(repo_id: str) -> dict:
    """Get detailed repository information.

    Args:
        repo_id: Repository ID (full or first 8 characters)

    Returns:
        dict with repository metadata including name, url, branch, chunk_count, status, and indexed_files
    """
    return await get_mcp_handlers().get_repository_info(repo_id=repo_id)


@mcp.tool()
async def delete_repository(repo_id: str) -> dict:
    """Remove an indexed repository.

    Args:
        repo_id: Repository ID (full or first 8 characters)

    Returns:
        dict with success status and chunks_deleted
    """
    return await get_mcp_handlers().delete_repository(repo_id=repo_id)


@mcp.tool()
async def update_repository(repo_id: str) -> dict:
    """Incremental update of a repository (only changed files).

    Args:
        repo_id: Repository ID (full or first 8 characters)

    Returns:
        dict with success status, files_changed, chunks_added, chunks_deleted
    """
    return await get_mcp_handlers().update_repository(repo_id=repo_id)
@mcp.tool()
async def search_code(
    repo_id: str,
    query: str,
    top_k: int = 10,
    file_filter: Optional[str] = None,
    chunk_type: Optional[str] = None,
) -> dict:
    """Semantic code search without LLM generation.

    Args:
        repo_id: Repository ID (full or first 8 characters)
        query: Search query
        top_k: Maximum number of results (default: 10)
        file_filter: File pattern filter (e.g., "*.py")
        chunk_type: Filter by chunk type (e.g., "function", "class")

    Returns:
        dict with results array containing file_path, start_line, end_line, content, and relevance_score
    """
    return await get_mcp_handlers().search_code(
        repo_id=repo_id,
        query=query,
        top_k=top_k,
        file_filter=file_filter,
        chunk_type=chunk_type,
    )
"""Chunk entity model for semantic code units."""

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
from uuid import uuid4


class ChunkType(str, Enum):
    """Kind of source construct a chunk was extracted from."""

    FUNCTION = "function"
    CLASS = "class"
    METHOD = "method"
    MODULE = "module"
    TEXT = "text"
    DOCSTRING = "docstring"
    COMMENT = "comment"


@dataclass
class ChunkMetadata:
    """Location and structural metadata for a code chunk."""

    file_path: str
    start_line: int
    end_line: int
    chunk_type: ChunkType
    language: Optional[str] = None
    name: Optional[str] = None
    signature: Optional[str] = None
    docstring: Optional[str] = None
    parent_name: Optional[str] = None

    @property
    def line_range(self) -> str:
        """Inclusive line span rendered as "start-end"."""
        return f"{self.start_line}-{self.end_line}"

    @property
    def citation(self) -> str:
        """Citation marker used in answers: [path:start-end]."""
        return f"[{self.file_path}:{self.line_range}]"


@dataclass
class Chunk:
    """A semantic unit of code or documentation."""

    content: str
    metadata: ChunkMetadata
    repo_id: str
    id: str = field(default_factory=lambda: str(uuid4()))
    embedding: Optional[list[float]] = None

    # -- Convenience accessors that simply forward to the metadata record --

    @property
    def file_path(self) -> str:
        """Repository-relative file path of this chunk."""
        return self.metadata.file_path

    @property
    def start_line(self) -> int:
        """First line (inclusive) of the chunk."""
        return self.metadata.start_line

    @property
    def end_line(self) -> int:
        """Last line (inclusive) of the chunk."""
        return self.metadata.end_line

    @property
    def chunk_type(self) -> ChunkType:
        """Structural kind of the chunk."""
        return self.metadata.chunk_type

    @property
    def name(self) -> Optional[str]:
        """Name of the function/class the chunk covers, when known."""
        return self.metadata.name

    @property
    def citation(self) -> str:
        """Citation marker for this chunk."""
        return self.metadata.citation

    def to_dict(self) -> dict:
        """Flatten chunk + metadata into one dict for storage.

        The embedding is intentionally omitted; it is stored separately.
        """
        md = self.metadata
        return {
            "id": self.id,
            "content": self.content,
            "repo_id": self.repo_id,
            "file_path": md.file_path,
            "start_line": md.start_line,
            "end_line": md.end_line,
            "chunk_type": md.chunk_type.value,
            "language": md.language,
            "name": md.name,
            "signature": md.signature,
            "docstring": md.docstring,
            "parent_name": md.parent_name,
        }

    @classmethod
    def from_dict(cls, data: dict, embedding: Optional[list[float]] = None) -> "Chunk":
        """Inverse of to_dict; optional metadata fields default to None when absent."""
        return cls(
            id=data["id"],
            content=data["content"],
            repo_id=data["repo_id"],
            embedding=embedding,
            metadata=ChunkMetadata(
                file_path=data["file_path"],
                start_line=data["start_line"],
                end_line=data["end_line"],
                chunk_type=ChunkType(data["chunk_type"]),
                language=data.get("language"),
                name=data.get("name"),
                signature=data.get("signature"),
                docstring=data.get("docstring"),
                parent_name=data.get("parent_name"),
            ),
        )
@classmethod + def from_dict(cls, data: dict, embedding: Optional[list[float]] = None) -> "Chunk": + """Create from dictionary.""" + metadata = ChunkMetadata( + file_path=data["file_path"], + start_line=data["start_line"], + end_line=data["end_line"], + chunk_type=ChunkType(data["chunk_type"]), + language=data.get("language"), + name=data.get("name"), + signature=data.get("signature"), + docstring=data.get("docstring"), + parent_name=data.get("parent_name"), + ) + return cls( + id=data["id"], + content=data["content"], + metadata=metadata, + repo_id=data["repo_id"], + embedding=embedding, + ) diff --git a/src/coderag/models/document.py b/src/coderag/models/document.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb7707aa20734b926d5e157ed79031e90842258 --- /dev/null +++ b/src/coderag/models/document.py @@ -0,0 +1,96 @@ +"""Document entity model for representing source files.""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +@dataclass +class DocumentMetadata: + """Metadata for a source document.""" + + file_path: str + language: Optional[str] = None + size_bytes: int = 0 + line_count: int = 0 + encoding: str = "utf-8" + + @property + def extension(self) -> str: + """Get file extension.""" + return Path(self.file_path).suffix.lstrip(".") + + +@dataclass +class Document: + """Represents a source code file loaded for processing.""" + + content: str + metadata: DocumentMetadata + repo_id: str = "" + + @property + def file_path(self) -> str: + """Convenience accessor for file path.""" + return self.metadata.file_path + + @property + def language(self) -> Optional[str]: + """Convenience accessor for language.""" + return self.metadata.language + + @classmethod + def from_file(cls, file_path: Path, repo_root: Path, repo_id: str = "") -> "Document": + """Create Document from a file path.""" + content = file_path.read_text(encoding="utf-8") + relative_path = str(file_path.relative_to(repo_root)) + 
# Lowercased file extension (with leading dot) -> canonical language name.
_EXTENSION_LANGUAGES = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".jsx": "javascript",
    ".tsx": "typescript",
    ".java": "java",
    ".go": "go",
    ".rs": "rust",
    ".rb": "ruby",
    ".php": "php",
    ".c": "c",
    ".cpp": "cpp",
    ".h": "c",
    ".hpp": "cpp",
    ".cs": "csharp",
    ".swift": "swift",
    ".kt": "kotlin",
    ".scala": "scala",
    ".md": "markdown",
    ".rst": "restructuredtext",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".json": "json",
    ".toml": "toml",
    ".xml": "xml",
    ".html": "html",
    ".css": "css",
    ".sql": "sql",
    ".sh": "bash",
    ".bash": "bash",
    ".zsh": "zsh",
}


def _detect_language(extension: str) -> Optional[str]:
    """Map a file extension (e.g. ``".py"``) to a language name.

    The lookup is case-insensitive; unknown extensions yield ``None``.
    """
    return _EXTENSION_LANGUAGES.get(extension.lower())
status.""" + + PENDING = "pending" + CLONING = "cloning" + INDEXING = "indexing" + READY = "ready" + ERROR = "error" + + +@dataclass +class Repository: + """Represents an indexed GitHub repository.""" + + url: str + branch: str = "main" + id: str = field(default_factory=lambda: str(uuid4())) + clone_path: Optional[Path] = None + indexed_at: Optional[datetime] = None + chunk_count: int = 0 + status: RepositoryStatus = RepositoryStatus.PENDING + error_message: Optional[str] = None + last_commit: Optional[str] = None # SHA of last indexed commit (for incremental updates) + + @property + def name(self) -> str: + """Extract repository name from URL.""" + return self.url.rstrip("/").split("/")[-1].replace(".git", "") + + @property + def owner(self) -> str: + """Extract repository owner from URL.""" + parts = self.url.rstrip("/").split("/") + return parts[-2] if len(parts) >= 2 else "" + + @property + def full_name(self) -> str: + """Get owner/repo format.""" + return f"{self.owner}/{self.name}" + + def to_dict(self) -> dict: + """Convert to dictionary for serialization.""" + return { + "id": self.id, + "url": self.url, + "branch": self.branch, + "clone_path": str(self.clone_path) if self.clone_path else None, + "indexed_at": self.indexed_at.isoformat() if self.indexed_at else None, + "chunk_count": self.chunk_count, + "status": self.status.value, + "error_message": self.error_message, + "last_commit": self.last_commit, + } + + @classmethod + def from_dict(cls, data: dict) -> "Repository": + """Create from dictionary.""" + return cls( + id=data["id"], + url=data["url"], + branch=data.get("branch", "main"), + clone_path=Path(data["clone_path"]) if data.get("clone_path") else None, + indexed_at=datetime.fromisoformat(data["indexed_at"]) + if data.get("indexed_at") + else None, + chunk_count=data.get("chunk_count", 0), + status=RepositoryStatus(data.get("status", "pending")), + error_message=data.get("error_message"), + last_commit=data.get("last_commit"), + ) diff --git 
"""Response entity models for Q&A results."""

from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
from uuid import uuid4


@dataclass
class Citation:
    """A pointer to a source-code location, rendered as ``[file:start-end]``."""

    file_path: str
    start_line: int
    end_line: int

    def __str__(self) -> str:
        """Render in the canonical bracketed citation form."""
        return f"[{self.file_path}:{self.start_line}-{self.end_line}]"

    @classmethod
    def parse(cls, citation_str: str) -> Optional["Citation"]:
        """Parse ``[file:start-end]`` or ``[file:line]``; None when malformed."""
        text = citation_str.strip("[]")
        if ":" not in text:
            return None
        path, _, span = text.rpartition(":")
        try:
            if "-" in span:
                lo, hi = span.split("-")
                return cls(file_path=path, start_line=int(lo), end_line=int(hi))
            line_no = int(span)
        except (ValueError, IndexError):
            return None
        return cls(file_path=path, start_line=line_no, end_line=line_no)


@dataclass
class RetrievedChunk:
    """A chunk returned by retrieval, carrying its relevance score."""

    chunk_id: str
    content: str
    file_path: str
    start_line: int
    end_line: int
    relevance_score: float
    chunk_type: str
    name: Optional[str] = None

    @property
    def citation(self) -> str:
        """Canonical ``[file:start-end]`` citation for this chunk."""
        return f"[{self.file_path}:{self.start_line}-{self.end_line}]"


@dataclass
class Query:
    """A user's question about an indexed repository."""

    question: str
    repo_id: str
    id: str = field(default_factory=lambda: str(uuid4()))
    timestamp: datetime = field(default_factory=datetime.now)
    top_k: int = 5


@dataclass
class Response:
    """The system's answer to a query, with citations and raw evidence."""

    answer: str
    citations: list[Citation]
    retrieved_chunks: list[RetrievedChunk]
    grounded: bool
    query_id: str = ""
    confidence_score: float = 0.0

    @property
    def has_evidence(self) -> bool:
        """True when at least one retrieved chunk backs the answer."""
        return bool(self.retrieved_chunks)

    @property
    def citation_count(self) -> int:
        """Number of citations attached to the answer."""
        return len(self.citations)

    def format_evidence(self) -> str:
        """Render retrieved chunks as a Markdown evidence section.

        Each chunk gets a heading with its citation and score, an optional
        bold type/name line, and a fenced code block truncated to 500 chars.
        """
        if not self.retrieved_chunks:
            return "No evidence retrieved."

        out = ["## Evidence\n"]
        for rank, hit in enumerate(self.retrieved_chunks, 1):
            out.append(f"### {rank}. {hit.citation} (Score: {hit.relevance_score:.3f})")
            if hit.name:
                out.append(f"**{hit.chunk_type}**: `{hit.name}`\n")
            body = hit.content[:500]
            if len(hit.content) > 500:
                body += "..."
            out.extend(["```", body, "```\n"])
        return "\n".join(out)
"""Retrieval module for semantic search."""

from typing import Optional

from coderag.config import get_settings
from coderag.indexing.embeddings import EmbeddingGenerator
from coderag.indexing.vectorstore import VectorStore
from coderag.logging import get_logger
from coderag.models.chunk import Chunk
from coderag.models.response import RetrievedChunk

logger = get_logger(__name__)


class Retriever:
    """Finds the chunks most relevant to a natural-language query."""

    def __init__(
        self,
        vectorstore: Optional[VectorStore] = None,
        embedder: Optional[EmbeddingGenerator] = None,
    ) -> None:
        cfg = get_settings()
        self.vectorstore = vectorstore or VectorStore()
        self.embedder = embedder or EmbeddingGenerator()
        self.default_top_k = cfg.retrieval.default_top_k
        self.max_top_k = cfg.retrieval.max_top_k
        self.similarity_threshold = cfg.retrieval.similarity_threshold

    def retrieve(
        self,
        query: str,
        repo_id: str,
        top_k: Optional[int] = None,
        similarity_threshold: Optional[float] = None,
    ) -> list[RetrievedChunk]:
        """Embed *query* and return scored chunks for *repo_id*.

        ``top_k`` defaults to the configured value and is capped at
        ``max_top_k``; ``similarity_threshold`` falls back to the configured
        threshold when not given.
        """
        effective_k = min(top_k or self.default_top_k, self.max_top_k)
        threshold = (
            self.similarity_threshold
            if similarity_threshold is None
            else similarity_threshold
        )

        logger.info("Retrieving chunks", query=query[:100], repo_id=repo_id, top_k=effective_k)

        # Embed the question, then run a similarity search in the vector store.
        embedding = self.embedder.generate_embedding(query, is_query=True)
        hits = self.vectorstore.query(
            query_embedding=embedding,
            repo_id=repo_id,
            top_k=effective_k,
            similarity_threshold=threshold,
        )

        results = [
            RetrievedChunk(
                chunk_id=hit.id,
                content=hit.content,
                file_path=hit.file_path,
                start_line=hit.start_line,
                end_line=hit.end_line,
                relevance_score=score,
                chunk_type=hit.chunk_type.value,
                name=hit.name,
            )
            for hit, score in hits
        ]

        logger.info("Chunks retrieved", count=len(results))
        return results

    def retrieve_with_context(
        self,
        query: str,
        repo_id: str,
        top_k: Optional[int] = None,
    ) -> tuple[list[RetrievedChunk], str]:
        """Retrieve chunks and also render them as a prompt-ready context string."""
        chunks = self.retrieve(query, repo_id, top_k)

        sections = []
        for idx, hit in enumerate(chunks, 1):
            name_part = f" | Name: {hit.name}" if hit.name else ""
            sections.append(
                f"[{idx}] {hit.citation}\n"
                f"Type: {hit.chunk_type}{name_part}\n"
                f"```\n{hit.content}\n```\n"
            )

        context = "\n".join(sections) if sections else "No relevant code found."
        return chunks, context
"""Gradio web interface for CodeRAG."""

from typing import Optional  # NOTE(review): unused in this module; kept as-is

import gradio as gr

from coderag.ui.handlers import UIHandlers


def create_gradio_app() -> gr.Blocks:
    """Create the Gradio application.

    Builds a three-tab UI (Index / Ask / Manage) and wires every control to a
    single shared ``UIHandlers`` instance, so repositories indexed in one tab
    are visible from the others for the life of the process.

    Returns:
        The assembled ``gr.Blocks``; the caller is expected to ``.launch()`` it.
    """
    # One shared handler instance backs every event callback below.
    handlers = UIHandlers()

    with gr.Blocks(title="CodeRAG - Code Q&A with Citations") as app:
        gr.Markdown("# CodeRAG - Code Q&A with Citations")
        gr.Markdown("Index GitHub repositories and ask questions about the code with verifiable citations.")

        with gr.Tabs():
            # Tab 1: Index Repository
            with gr.TabItem("Index Repository"):
                with gr.Row():
                    with gr.Column(scale=2):
                        repo_url = gr.Textbox(
                            label="GitHub Repository URL",
                            placeholder="https://github.com/owner/repo",
                            info="Enter a public GitHub repository URL",
                        )

                        # Optional overrides; empty strings mean "use defaults"
                        # (interpreted by UIHandlers.index_repository).
                        with gr.Accordion("Advanced Options", open=False):
                            branch = gr.Textbox(
                                label="Branch",
                                placeholder="main",
                                value="",
                                info="Leave empty for default branch",
                            )
                            include_patterns = gr.Textbox(
                                label="Include Patterns",
                                placeholder="*.py, *.js, *.md",
                                value="",
                                info="Comma-separated glob patterns (leave empty for defaults)",
                            )
                            exclude_patterns = gr.Textbox(
                                label="Exclude Patterns",
                                placeholder="**/tests/**, **/node_modules/**",
                                value="",
                                info="Comma-separated glob patterns (leave empty for defaults)",
                            )

                        index_btn = gr.Button("Index Repository", variant="primary")

                    with gr.Column(scale=1):
                        index_status = gr.Textbox(
                            label="Status",
                            interactive=False,
                            lines=3,
                        )
                        # NOTE(review): gr.Progress is normally passed as a
                        # default argument to an event handler rather than
                        # instantiated in the layout, and this instance is
                        # otherwise unused — confirm intent.
                        index_progress = gr.Progress()

                # index_repository is a generator; each yielded string streams
                # into index_status as a live progress update.
                index_btn.click(
                    fn=handlers.index_repository,
                    inputs=[repo_url, branch, include_patterns, exclude_patterns],
                    outputs=[index_status],
                )

            # Tab 2: Ask Questions
            with gr.TabItem("Ask Questions"):
                with gr.Row():
                    with gr.Column(scale=2):
                        # Populated on app.load and via the Refresh button;
                        # values are repository IDs, labels are display names.
                        repo_selector = gr.Dropdown(
                            label="Select Repository",
                            choices=[],
                            interactive=True,
                        )
                        refresh_repos_btn = gr.Button("Refresh", size="sm")

                        question = gr.Textbox(
                            label="Question",
                            placeholder="Where is the function X defined?",
                            lines=2,
                        )

                        with gr.Row():
                            top_k = gr.Slider(
                                minimum=1,
                                maximum=20,
                                value=5,
                                step=1,
                                label="Number of chunks to retrieve",
                            )

                        ask_btn = gr.Button("Ask", variant="primary")

                    with gr.Column(scale=1):
                        qa_status = gr.Textbox(
                            label="Status",
                            interactive=False,
                            lines=1,
                        )

                with gr.Row():
                    answer_output = gr.Markdown(label="Answer")

                with gr.Accordion("Evidence", open=True):
                    evidence_output = gr.Markdown(label="Retrieved Chunks")

                refresh_repos_btn.click(
                    fn=handlers.get_repositories,
                    outputs=[repo_selector],
                )

                ask_btn.click(
                    fn=handlers.ask_question,
                    inputs=[repo_selector, question, top_k],
                    outputs=[answer_output, evidence_output, qa_status],
                )

            # Tab 3: Manage Repositories
            with gr.TabItem("Manage Repositories"):
                repos_table = gr.Dataframe(
                    headers=["ID", "Repository", "Branch", "Chunks", "Status", "Indexed At"],
                    label="Indexed Repositories",
                    interactive=False,
                )

                with gr.Row():
                    refresh_table_btn = gr.Button("Refresh", size="sm")

                gr.Markdown("### Actions")

                with gr.Row():
                    with gr.Column(scale=2):
                        # The table shows a truncated 8-char ID; handlers
                        # accept a prefix match, so pasting that is enough.
                        action_repo_id = gr.Textbox(
                            label="Repository ID",
                            placeholder="Enter repository ID (or first 8 characters)",
                            info="Copy the ID from the table above",
                        )
                    with gr.Column(scale=1):
                        update_btn = gr.Button("Update (Incremental)", variant="secondary")
                        delete_btn = gr.Button("Delete", variant="stop")

                action_status = gr.Textbox(label="Status", interactive=False, lines=5)

                refresh_table_btn.click(
                    fn=handlers.get_repositories_table,
                    outputs=[repos_table],
                )

                update_btn.click(
                    fn=handlers.index_repository_incremental,
                    inputs=[action_repo_id],
                    outputs=[action_status],
                )

                # Delete also refreshes the table so the removed row disappears.
                delete_btn.click(
                    fn=handlers.delete_repository,
                    inputs=[action_repo_id],
                    outputs=[action_status, repos_table],
                )

        # Load initial data: populate the repository dropdown on page load.
        app.load(
            fn=handlers.get_repositories,
            outputs=[repo_selector],
        )

    return app
"""UI event handlers for Gradio interface."""

import json
from datetime import datetime
from pathlib import Path
from typing import Iterator, Optional

import torch

from coderag.config import get_settings
from coderag.generation.generator import ResponseGenerator
from coderag.indexing.embeddings import EmbeddingGenerator
from coderag.indexing.vectorstore import VectorStore
from coderag.ingestion.chunker import CodeChunker
from coderag.ingestion.filter import FileFilter
from coderag.ingestion.loader import RepositoryLoader
from coderag.ingestion.validator import GitHubURLValidator, ValidationError
from coderag.logging import get_logger
from coderag.models.chunk import Chunk
from coderag.models.document import Document
from coderag.models.query import Query
from coderag.models.repository import Repository, RepositoryStatus

logger = get_logger(__name__)


class UIHandlers:
    """Handlers for Gradio UI events.

    Owns the full ingestion/retrieval pipeline plus a JSON-file registry of
    indexed repositories (``repositories.json`` under the data directory).
    One instance is shared by every UI callback, so state here is process-wide.
    """

    def __init__(self) -> None:
        self.settings = get_settings()
        self.validator = GitHubURLValidator()
        self.loader = RepositoryLoader()
        self.filter = FileFilter()
        self.chunker = CodeChunker()
        self.embedder = EmbeddingGenerator()
        self.vectorstore = VectorStore()
        # Constructed lazily on the first question (see ask_question) so the
        # LLM is not loaded just to index repositories.
        self.generator: Optional[ResponseGenerator] = None

        # Repository metadata storage
        self.repos_file = self.settings.data_dir / "repositories.json"
        self.repositories: dict[str, Repository] = self._load_repositories()

    def _load_repositories(self) -> dict[str, Repository]:
        """Load the repository registry from disk; empty dict on any failure."""
        if self.repos_file.exists():
            try:
                data = json.loads(self.repos_file.read_text())
                return {r["id"]: Repository.from_dict(r) for r in data}
            except Exception as e:
                # Best-effort: a corrupt registry file is logged and treated
                # as empty rather than crashing UI startup.
                logger.error("Failed to load repositories", error=str(e))
        return {}

    def _save_repositories(self) -> None:
        """Persist the registry as pretty-printed JSON (creates parent dirs)."""
        self.repos_file.parent.mkdir(parents=True, exist_ok=True)
        data = [r.to_dict() for r in self.repositories.values()]
        self.repos_file.write_text(json.dumps(data, indent=2))

    # =========================================================================
    # Streaming Methods (Level 1)
    # =========================================================================

    def _document_generator(
        self,
        files: list[Path],
        repo_path: Path,
        repo_id: str,
    ) -> Iterator[Document]:
        """Generate documents one by one without accumulating in memory."""
        for file_path in files:
            try:
                yield Document.from_file(file_path, repo_path, repo_id)
            except Exception as e:
                # Skip unreadable/undecodable files rather than aborting the run.
                logger.warning("Failed to load file", path=str(file_path), error=str(e))

    def _process_batch(self, chunks: list[Chunk]) -> int:
        """Process a batch: embed + store + release memory.

        Returns the number of chunks submitted in this batch.
        """
        if not chunks:
            return 0

        embedded = self.embedder.embed_chunks(chunks, show_progress=False)
        self.vectorstore.add_chunks(embedded)

        # Release memory between batches; empty_cache keeps GPU memory from
        # accumulating across embedding runs when CUDA is in use.
        del embedded
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return len(chunks)

    def _stream_index_repository(
        self,
        documents: Iterator[Document],
        repo_id: str,
        batch_size: int = 100,
        # NOTE(review): ``callable`` here is the builtin, not typing.Callable;
        # the intended signature is progress_callback(total_chunks, doc_count).
        progress_callback: Optional[callable] = None,
    ) -> int:
        """Index using streaming with batches.

        Consumes *documents* lazily, chunks each one, and flushes to the
        embedder/vector store whenever the in-flight batch reaches
        *batch_size*. Returns the total number of chunks indexed.
        """
        total_chunks = 0
        batch: list[Chunk] = []
        doc_count = 0

        for doc in documents:
            doc_count += 1
            for chunk in self.chunker.chunk_document(doc):
                chunk.repo_id = repo_id
                batch.append(chunk)

                if len(batch) >= batch_size:
                    total_chunks += self._process_batch(batch)
                    logger.info("Batch processed", total_so_far=total_chunks, docs_processed=doc_count)
                    if progress_callback:
                        progress_callback(total_chunks, doc_count)
                    batch = []

        # Process final batch
        if batch:
            total_chunks += self._process_batch(batch)
            if progress_callback:
                progress_callback(total_chunks, doc_count)

        return total_chunks

    # =========================================================================
    # Validation Methods (Level 2)
    # =========================================================================

    def _estimate_repo_size(self, files: list[Path]) -> dict:
        """Estimate repository size before indexing.

        Produces a dict of totals plus three boolean verdicts derived from the
        configured ingestion limits. Chunk count is a rough heuristic: one
        chunk per ``chunk_size`` characters of file content.
        """
        total_size_kb = 0
        estimated_chunks = 0
        chunk_size = self.settings.ingestion.chunk_size

        for file_path in files:
            try:
                size_kb = file_path.stat().st_size / 1024
                total_size_kb += size_kb
                # Rough estimate: 1 chunk per chunk_size characters
                estimated_chunks += max(1, int(size_kb * 1024 / chunk_size))
            except Exception:
                # Unstat-able files (e.g. broken symlinks) are simply skipped.
                continue

        return {
            "file_count": len(files),
            "total_size_kb": total_size_kb,
            "estimated_chunks": estimated_chunks,
            "exceeds_file_limit": len(files) > self.settings.ingestion.max_files_per_repo,
            "exceeds_chunk_limit": estimated_chunks > self.settings.ingestion.max_total_chunks,
            "warn_large_repo": len(files) > self.settings.ingestion.warn_files_threshold,
        }

    def _validate_repo_size(self, estimate: dict) -> tuple[bool, str]:
        """Validate if the repository can be indexed.

        Returns ``(ok, message)``: when ``ok`` is False the message is the
        rejection reason; when True it is either "" or a non-fatal size warning.
        """
        if estimate["exceeds_file_limit"]:
            return False, f"Repository exceeds file limit ({estimate['file_count']} > {self.settings.ingestion.max_files_per_repo})"
        if estimate["exceeds_chunk_limit"]:
            return False, f"Repository exceeds chunk limit (~{estimate['estimated_chunks']} > {self.settings.ingestion.max_total_chunks})"

        warning = ""
        if estimate["warn_large_repo"]:
            warning = f"Large repository ({estimate['file_count']} files, ~{estimate['estimated_chunks']} chunks). Processing may take several minutes."

        return True, warning

    # =========================================================================
    # Incremental Indexing Methods (Level 3)
    # =========================================================================

    def _get_current_commit(self, repo_path: Path) -> str:
        """Get the SHA of the current commit (HEAD) of the local clone."""
        # Imported locally, presumably to defer loading GitPython — confirm.
        from git import Repo
        git_repo = Repo(repo_path)
        return git_repo.head.commit.hexsha

    def _get_changed_files(
        self,
        repo_path: Path,
        last_commit: str,
        current_commit: str,
    ) -> tuple[set[str], set[str], set[str]]:
        """Get files that were added, modified, or deleted between commits.

        Renames are treated as a delete of the old path plus an add of the new
        one, so the caller's chunk bookkeeping stays path-based.
        """
        from git import Repo
        git_repo = Repo(repo_path)

        diff = git_repo.commit(last_commit).diff(current_commit)

        added: set[str] = set()
        modified: set[str] = set()
        deleted: set[str] = set()

        for d in diff:
            if d.new_file:
                added.add(d.b_path)
            elif d.deleted_file:
                deleted.add(d.a_path)
            # NOTE(review): GitPython documents ``renamed_file``; ``renamed``
            # is the older alias — confirm against the pinned version.
            elif d.renamed:
                deleted.add(d.a_path)
                added.add(d.b_path)
            else:
                # Plain modification; b_path can be None in edge cases, so
                # fall back to a_path.
                modified.add(d.b_path or d.a_path)

        return added, modified, deleted

    def index_repository_incremental(self, repo_id: str) -> str:
        """Update only modified files since last indexing (incremental update).

        Accepts a full ID or any prefix of one; returns a human-readable
        status string for the UI.
        """
        # Find repository by full or partial ID
        # NOTE(review): an empty repo_id matches the first repository, since
        # startswith("") is always True — consider rejecting empty input.
        found_repo = None
        for rid, repo in self.repositories.items():
            if rid == repo_id or rid.startswith(repo_id):
                found_repo = repo
                break

        if not found_repo:
            return "Repository not found"

        repo = found_repo

        if not repo.last_commit:
            return "No previous indexing found. Please re-index the full repository."

        if not repo.clone_path or not Path(repo.clone_path).exists():
            return "Repository cache not found. Please re-index."

        try:
            repo_path = Path(repo.clone_path)

            # Update local repository
            # NOTE(review): reaches into a private loader method — consider a
            # public update API on RepositoryLoader.
            logger.info("Updating local repository", repo_id=repo.id)
            self.loader._update_repository(repo_path, repo.branch, None)

            current_commit = self._get_current_commit(repo_path)

            if current_commit == repo.last_commit:
                return "Repository is already up to date."

            added, modified, deleted = self._get_changed_files(
                repo_path, repo.last_commit, current_commit
            )

            logger.info(
                "Changes detected",
                added=len(added),
                modified=len(modified),
                deleted=len(deleted),
            )

            # Delete chunks for deleted/modified files (modified files are
            # fully re-indexed below, so their old chunks must go first).
            for file_path in deleted | modified:
                self.vectorstore.delete_file_chunks(repo.id, file_path)

            # Index new/modified files
            files_to_index = []
            file_filter = FileFilter()
            for file_path in added | modified:
                full_path = repo_path / file_path
                if full_path.exists() and file_filter.should_include(full_path, repo_path):
                    files_to_index.append(full_path)

            new_chunks = 0
            if files_to_index:
                batch_size = self.settings.ingestion.batch_size
                doc_generator = self._document_generator(files_to_index, repo_path, repo.id)
                new_chunks = self._stream_index_repository(doc_generator, repo.id, batch_size)

            # Update metadata (chunk_count is re-read from the store so it
            # reflects deletions as well as additions).
            repo.last_commit = current_commit
            repo.indexed_at = datetime.now()
            repo.chunk_count = self.vectorstore.get_repo_chunk_count(repo.id)
            self._save_repositories()

            return (
                f"Incremental update complete:\n"
                f"- Added/Modified: {len(added | modified)} files\n"
                f"- Deleted: {len(deleted)} files\n"
                f"- New chunks: {new_chunks}\n"
                f"- Total chunks: {repo.chunk_count}"
            )

        except Exception as e:
            logger.error("Incremental indexing failed", error=str(e), exc_info=True)
            return f"Error: {str(e)}"

    def index_repository(
        self,
        url: str,
        branch: str = "",
        include_patterns: str = "",
        exclude_patterns: str = "",
    ) -> Iterator[str]:
        """Index a GitHub repository with progress updates.

        Generator: yields human-readable status strings that Gradio streams
        into the status textbox. Pattern arguments are comma-separated globs;
        empty strings fall back to the filter's defaults.
        """
        try:
            # Validate URL (sync version, skip accessibility check for UI)
            yield "Validating repository URL..."
            logger.info("Starting indexing", url=url, branch=branch)
            repo_info = self.validator.parse_url(url)
            branch = branch.strip() or repo_info.branch or "main"

            # Create repository record
            repo = Repository(
                url=repo_info.url,
                branch=branch,
                status=RepositoryStatus.CLONING,
            )
            self.repositories[repo.id] = repo

            # Clone repository
            yield f"Cloning {repo_info.full_name} (branch: {branch})..."
            logger.info("Cloning repository", url=url, branch=branch)
            repo_path = self.loader.clone_repository(repo_info, branch)
            repo.clone_path = repo_path
            repo.status = RepositoryStatus.INDEXING

            # Setup filter with custom patterns; empty input -> None so the
            # FileFilter applies its built-in defaults.
            include = [p.strip() for p in include_patterns.split(",") if p.strip()] or None
            exclude = [p.strip() for p in exclude_patterns.split(",") if p.strip()] or None
            file_filter = FileFilter(include_patterns=include, exclude_patterns=exclude)

            # Process files
            yield "Scanning files..."
            logger.info("Filtering files", repo_path=str(repo_path))
            files = list(file_filter.filter_files(repo_path))
            file_count = len(files)
            logger.info("Files to process", count=file_count)

            # Validate repository size (Level 2)
            estimate = self._estimate_repo_size(files)
            can_proceed, message = self._validate_repo_size(estimate)

            if not can_proceed:
                repo.status = RepositoryStatus.ERROR
                repo.error_message = message
                self._save_repositories()
                yield f"Error: {message}"
                return

            if message:
                logger.warning(message)
                yield f"Warning: {message}"

            yield f"Found {file_count} files to index (~{estimate['estimated_chunks']} chunks)"

            # Delete existing chunks for this repo (re-indexing)
            logger.info("Deleting previous chunks for repo", repo_id=repo.id)
            self.vectorstore.delete_repo_chunks(repo.id)

            # Stream indexing with batches and progress updates
            yield f"Indexing... (0/{file_count} files, 0 chunks)"
            logger.info("Starting streaming indexing", file_count=file_count)
            batch_size = self.settings.ingestion.batch_size
            doc_generator = self._document_generator(files, repo_path, repo.id)

            # Process with progress updates. This duplicates
            # _stream_index_repository inline because a generator must yield
            # its own progress strings (a callback cannot yield for it).
            total_chunks = 0
            batch: list[Chunk] = []
            doc_count = 0

            for doc in doc_generator:
                doc_count += 1
                for chunk in self.chunker.chunk_document(doc):
                    chunk.repo_id = repo.id
                    batch.append(chunk)

                    if len(batch) >= batch_size:
                        total_chunks += self._process_batch(batch)
                        batch = []
                        # Yield progress update
                        yield f"Indexing... ({doc_count}/{file_count} files, {total_chunks} chunks)"

            # Process final batch
            if batch:
                total_chunks += self._process_batch(batch)

            logger.info("Streaming indexing complete", chunk_count=total_chunks)

            # Save current commit for incremental updates (Level 3);
            # best-effort — a failure here only disables incremental updates.
            try:
                repo.last_commit = self._get_current_commit(repo_path)
            except Exception:
                repo.last_commit = None

            # Update repository status
            repo.chunk_count = total_chunks
            repo.indexed_at = datetime.now()
            repo.status = RepositoryStatus.READY
            self._save_repositories()

            result = f"Successfully indexed {repo_info.full_name}\n{file_count} files processed\n{total_chunks} chunks indexed"
            logger.info("Indexing complete", result=result)
            yield result

        except ValidationError as e:
            logger.error("Validation error", error=str(e))
            yield f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Indexing failed", error=str(e), exc_info=True)
            # "repo" only exists once the record was created; guard so
            # pre-record failures (e.g. URL parsing) don't NameError here.
            if "repo" in locals():
                repo.status = RepositoryStatus.ERROR
                repo.error_message = str(e)
                self._save_repositories()
            yield f"Error: {str(e)}"

    def ask_question(
        self,
        repo_id: str,
        question: str,
        top_k: int = 5,
    ) -> tuple[str, str, str]:
        """Ask a question about a repository.

        Returns ``(answer_markdown, evidence_markdown, status)``; on any
        validation or generation failure the first two are empty and the
        status carries the message.
        """
        if not repo_id:
            return "", "", "Please select a repository"

        if not question.strip():
            return "", "", "Please enter a question"

        try:
            # Lazy load generator (first question pays the model-load cost)
            if self.generator is None:
                self.generator = ResponseGenerator()

            query = Query(
                question=question.strip(),
                repo_id=repo_id,
                top_k=int(top_k),
            )

            response = self.generator.generate(query)

            # Format answer
            answer_md = f"## Answer\n\n{response.answer}"
            if response.citations:
                answer_md += "\n\n### Citations\n"
                for citation in response.citations:
                    answer_md += f"- `{citation}`\n"

            # Format evidence
            evidence_md = response.format_evidence()

            status = "Grounded" if response.grounded else "Not grounded (no citations)"

            return answer_md, evidence_md, status

        except Exception as e:
            logger.error("Question failed", error=str(e))
            return "", "", f"Error: {str(e)}"

    def get_repositories(self):
        """Get list of repositories for dropdown.

        Returns a ``gr.update`` whose choices are (label, repo_id) pairs for
        READY repositories only.
        """
        # Local import keeps gradio out of this module's import-time deps.
        import gradio as gr
        choices = []
        for repo in self.repositories.values():
            if repo.status == RepositoryStatus.READY:
                label = f"{repo.full_name} ({repo.chunk_count} chunks)"
                choices.append((label, repo.id))
        return gr.update(choices=choices)

    def get_repositories_table(self) -> list[list]:
        """Get repositories as table data (IDs truncated to 8 chars for display)."""
        rows = []
        for repo in self.repositories.values():
            rows.append([
                repo.id[:8],
                repo.full_name,
                repo.branch,
                repo.chunk_count,
                repo.status.value,
                repo.indexed_at.strftime("%Y-%m-%d %H:%M") if repo.indexed_at else "-",
            ])
        return rows

    def delete_repository(self, repo_id: str) -> tuple[str, list[list]]:
        """Delete a repository.

        Removes its chunks from the vector store, its cached clone, and its
        registry entry. Returns ``(status_message, refreshed_table_rows)``.
        """
        repo_id = repo_id.strip()

        # Find by full or partial ID
        # NOTE(review): an empty (post-strip) ID matches the first repository
        # via startswith("") — consider rejecting empty input explicitly.
        found_repo = None
        for rid, repo in self.repositories.items():
            if rid == repo_id or rid.startswith(repo_id):
                found_repo = repo
                break

        if not found_repo:
            return "Repository not found", self.get_repositories_table()

        try:
            # Delete from vector store
            self.vectorstore.delete_repo_chunks(found_repo.id)

            # Delete cached repo
            # NOTE(review): builds an ad-hoc object with only .owner/.name via
            # type(); fragile if delete_cache starts using other RepoInfo
            # attributes — consider accepting owner/name directly.
            self.loader.delete_cache(
                type("RepoInfo", (), {"owner": found_repo.owner, "name": found_repo.name})()
            )

            # Remove from records
            del self.repositories[found_repo.id]
            self._save_repositories()

            return f"Deleted {found_repo.full_name}", self.get_repositories_table()

        except Exception as e:
            logger.error("Delete failed", error=str(e))
            return f"Error: {str(e)}", self.get_repositories_table()