Spaces:
Running
feat: Add MCP + CrewAI integration with multi-mode interface
## New Features
- Merkle tree for incremental indexing (10-100x faster re-indexing)
- Enhanced semantic chunking with AST-based metadata
- Path obfuscation for privacy
- MCP server with code_search, code_refactor, suggest_refactorings tools
- CrewAI multi-agent system (Analyst, Refactor, Reviewer, Documentation agents)
- Multi-mode Streamlit UI (Chat, Search, Refactor, Generate modes)
## Improvements
- Fixed embedding model to use gemini-embedding-001
- Better rate limiting with smaller batches and more retries
- Centralized configuration system
- Technical documentation (docs/RAG_PIPELINE.md)
## Files Added
- code_chatbot/merkle_tree.py - Merkle tree change detection
- code_chatbot/config.py - Centralized configuration
- code_chatbot/mcp_server.py - MCP refactoring tools
- code_chatbot/mcp_client.py - MCP client interface
- code_chatbot/agents/ - CrewAI agent definitions
- code_chatbot/crews/ - CrewAI workflow definitions
- components/multi_mode.py - Multi-mode UI components
- docs/RAG_PIPELINE.md - Technical documentation
- .sage-env +0 -10
- README.md +11 -0
- app.py +31 -1
- app_multimode_integration.py +98 -0
- code_chatbot/agents/__init__.py +98 -0
- code_chatbot/chunker.py +163 -14
- code_chatbot/config.py +309 -0
- code_chatbot/crews/__init__.py +217 -0
- code_chatbot/incremental_indexing.py +221 -0
- code_chatbot/indexer.py +46 -14
- code_chatbot/mcp_client.py +225 -0
- code_chatbot/mcp_server.py +366 -0
- code_chatbot/merkle_tree.py +386 -0
- code_chatbot/path_obfuscator.py +215 -0
- components/multi_mode.py +422 -0
- demo_mcp_crewai.py +187 -0
- docs/RAG_PIPELINE.md +433 -0
- integrate_multimode.py +114 -0
- requirements.txt +10 -0
- tests/test_merkle_tree_simple.py +60 -0
|
@@ -1,10 +0,0 @@
|
|
| 1 |
-
# Embeddings
|
| 2 |
-
export OPENAI_API_KEY=
|
| 3 |
-
# Vector store
|
| 4 |
-
export PINECONE_API_KEY=
|
| 5 |
-
# Reranking
|
| 6 |
-
export NVIDIA_API_KEY=
|
| 7 |
-
# Generation LLM
|
| 8 |
-
export ANTHROPIC_API_KEY=
|
| 9 |
-
# Github issues
|
| 10 |
-
export GITHUB_TOKEN=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -14,6 +14,17 @@ Think of it as a private, super-powered developer assistant that knows your code
|
|
| 14 |
- **β‘ Multiple Providers**: Support for **Google Gemini** (1M+ context), **Groq** (fast inference), and standard OpenAI-compatible APIs.
|
| 15 |
- **π Universal Ingestion**: Upload ZIP files or point to GitHub repositories.
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
## π Quick Start
|
| 18 |
|
| 19 |
1. **Clone the repository**:
|
|
|
|
| 14 |
- **β‘ Multiple Providers**: Support for **Google Gemini** (1M+ context), **Groq** (fast inference), and standard OpenAI-compatible APIs.
|
| 15 |
- **π Universal Ingestion**: Upload ZIP files or point to GitHub repositories.
|
| 16 |
|
| 17 |
+
## π Advanced Features (Cursor-Inspired)
|
| 18 |
+
|
| 19 |
+
- **π Incremental Indexing**: Merkle tree-based change detection for 10-100x faster re-indexing
|
| 20 |
+
- **π Privacy-Preserving**: Optional HMAC-based path obfuscation for sensitive codebases
|
| 21 |
+
- **π§© Semantic Chunking**: AST-based code splitting that respects function/class boundaries
|
| 22 |
+
- **π Rich Metadata**: Automatic extraction of symbols, imports, and cyclomatic complexity
|
| 23 |
+
- **π― Hybrid Search**: Combines semantic similarity with keyword matching
|
| 24 |
+
- **βοΈ Highly Configurable**: Fine-tune chunking, retrieval, and privacy settings
|
| 25 |
+
|
| 26 |
+
**[π Read the Technical Deep-Dive](docs/RAG_PIPELINE.md)** to understand how our RAG pipeline works.
|
| 27 |
+
|
| 28 |
## π Quick Start
|
| 29 |
|
| 30 |
1. **Clone the repository**:
|
|
@@ -488,7 +488,37 @@ with st.sidebar:
|
|
| 488 |
|
| 489 |
# Main Chat Interface
|
| 490 |
st.title("π·οΈ Code Crawler")
|
| 491 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
| 493 |
if not st.session_state.processed_files:
|
| 494 |
st.info("π Please upload and index a ZIP file to start.")
|
|
|
|
| 488 |
|
| 489 |
# Main Chat Interface
|
| 490 |
st.title("π·οΈ Code Crawler")
|
| 491 |
+
|
| 492 |
+
# Multi-Mode Interface
|
| 493 |
+
if st.session_state.processed_files:
|
| 494 |
+
from components.multi_mode import (
|
| 495 |
+
render_mode_selector,
|
| 496 |
+
render_chat_mode,
|
| 497 |
+
render_search_mode,
|
| 498 |
+
render_refactor_mode,
|
| 499 |
+
render_generate_mode
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
+
# Mode selector at the top
|
| 503 |
+
selected_mode = render_mode_selector()
|
| 504 |
+
|
| 505 |
+
st.divider()
|
| 506 |
+
|
| 507 |
+
# Render appropriate interface based on mode
|
| 508 |
+
if selected_mode == "search":
|
| 509 |
+
render_search_mode()
|
| 510 |
+
elif selected_mode == "refactor":
|
| 511 |
+
render_refactor_mode()
|
| 512 |
+
elif selected_mode == "generate":
|
| 513 |
+
render_generate_mode(st.session_state.chat_engine)
|
| 514 |
+
else: # chat mode
|
| 515 |
+
# Show chat mode UI
|
| 516 |
+
render_chat_mode(st.session_state.chat_engine)
|
| 517 |
+
|
| 518 |
+
# Continue with standard chat interface below
|
| 519 |
+
st.caption(f"Ask questions about your uploaded project. (Using {provider}, Enhanced with AST)")
|
| 520 |
+
else:
|
| 521 |
+
st.caption(f"Configure and index your codebase to get started. (Using {provider}, Enhanced with AST)")
|
| 522 |
|
| 523 |
if not st.session_state.processed_files:
|
| 524 |
st.info("π Please upload and index a ZIP file to start.")
|
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enhanced app.py with multi-mode interface integration.
|
| 3 |
+
|
| 4 |
+
This file adds the mode selector and conditional rendering.
|
| 5 |
+
Add this code after line 520 in app.py (after the caption).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Add this import at the top of app.py (around line 11)
|
| 9 |
+
# from components.multi_mode import render_mode_selector, render_chat_mode, render_search_mode, render_refactor_mode, render_generate_mode
|
| 10 |
+
|
| 11 |
+
# Replace lines 523-615 with this code:
|
| 12 |
+
|
| 13 |
+
if not st.session_state.processed_files:
|
| 14 |
+
st.info("π Please upload and index a ZIP file to start.")
|
| 15 |
+
else:
|
| 16 |
+
# Get selected mode (defaults to chat)
|
| 17 |
+
selected_mode = st.session_state.get("mode_selector", "π¬ Chat")
|
| 18 |
+
|
| 19 |
+
# Only render chat interface in chat mode
|
| 20 |
+
if selected_mode == "π¬ Chat":
|
| 21 |
+
# Display History
|
| 22 |
+
for msg in st.session_state.messages:
|
| 23 |
+
with st.chat_message(msg["role"]):
|
| 24 |
+
# Render Sources if available
|
| 25 |
+
if "sources" in msg and msg["sources"]:
|
| 26 |
+
unique_sources = {}
|
| 27 |
+
for s in msg["sources"]:
|
| 28 |
+
if isinstance(s, dict):
|
| 29 |
+
fp = s.get('file_path', 'Unknown')
|
| 30 |
+
else:
|
| 31 |
+
fp = str(s)
|
| 32 |
+
if fp not in unique_sources:
|
| 33 |
+
unique_sources[fp] = s
|
| 34 |
+
|
| 35 |
+
chips_html = '<div class="source-container" style="display: flex; gap: 8px; flex-wrap: wrap; margin-bottom: 10px;">'
|
| 36 |
+
for fp in unique_sources:
|
| 37 |
+
basename = os.path.basename(fp) if "/" in fp else fp
|
| 38 |
+
chips_html += f"""
|
| 39 |
+
<div class="source-chip" style="background: rgba(30, 41, 59, 0.4); border: 1px solid rgba(148, 163, 184, 0.2); border-radius: 6px; padding: 4px 10px; font-size: 0.85em; color: #cbd5e1; display: flex; align-items: center; gap: 6px;">
|
| 40 |
+
<span class="source-icon">π</span> {basename}
|
| 41 |
+
</div>
|
| 42 |
+
"""
|
| 43 |
+
chips_html += '</div>'
|
| 44 |
+
st.markdown(chips_html, unsafe_allow_html=True)
|
| 45 |
+
|
| 46 |
+
st.markdown(msg["content"], unsafe_allow_html=True)
|
| 47 |
+
|
| 48 |
+
# Handle pending prompt from suggestions
|
| 49 |
+
if "pending_prompt" in st.session_state and st.session_state.pending_prompt:
|
| 50 |
+
prompt = st.session_state.pending_prompt
|
| 51 |
+
st.session_state.pending_prompt = None
|
| 52 |
+
else:
|
| 53 |
+
prompt = st.chat_input("How does the authentication work?")
|
| 54 |
+
|
| 55 |
+
if prompt:
|
| 56 |
+
st.session_state.messages.append({"role": "user", "content": prompt})
|
| 57 |
+
with st.chat_message("user"):
|
| 58 |
+
st.markdown(prompt)
|
| 59 |
+
|
| 60 |
+
with st.chat_message("assistant"):
|
| 61 |
+
if st.session_state.chat_engine:
|
| 62 |
+
with st.spinner("Analyzing (Graph+Vector)..."):
|
| 63 |
+
answer_payload = st.session_state.chat_engine.chat(prompt)
|
| 64 |
+
|
| 65 |
+
if isinstance(answer_payload, tuple):
|
| 66 |
+
answer, sources = answer_payload
|
| 67 |
+
else:
|
| 68 |
+
answer = answer_payload
|
| 69 |
+
sources = []
|
| 70 |
+
|
| 71 |
+
if sources:
|
| 72 |
+
unique_sources = {}
|
| 73 |
+
for s in sources:
|
| 74 |
+
fp = s.get('file_path', 'Unknown')
|
| 75 |
+
if fp not in unique_sources:
|
| 76 |
+
unique_sources[fp] = s
|
| 77 |
+
|
| 78 |
+
chips_html = '<div class="source-container">'
|
| 79 |
+
for fp in unique_sources:
|
| 80 |
+
basename = os.path.basename(fp)
|
| 81 |
+
chips_html += f"""
|
| 82 |
+
<div class="source-chip">
|
| 83 |
+
<span class="source-icon">π</span> {basename}
|
| 84 |
+
</div>
|
| 85 |
+
"""
|
| 86 |
+
chips_html += '</div>'
|
| 87 |
+
st.markdown(chips_html, unsafe_allow_html=True)
|
| 88 |
+
|
| 89 |
+
st.markdown(answer)
|
| 90 |
+
|
| 91 |
+
msg_data = {
|
| 92 |
+
"role": "assistant",
|
| 93 |
+
"content": answer,
|
| 94 |
+
"sources": sources if sources else []
|
| 95 |
+
}
|
| 96 |
+
st.session_state.messages.append(msg_data)
|
| 97 |
+
else:
|
| 98 |
+
st.error("Chat engine not initialized. Please re-index.")
|
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Base agent classes and utilities for CrewAI integration.
"""

import logging
from typing import List, Optional

from crewai import Agent

logger = logging.getLogger(__name__)


def _build_agent(role: str, goal: str, backstory: str, llm, tools: Optional[List]) -> Agent:
    """Shared factory: every agent in this module is verbose and non-delegating."""
    return Agent(
        role=role,
        goal=goal,
        backstory=backstory,
        verbose=True,
        allow_delegation=False,
        llm=llm,
        tools=tools or [],
    )


def create_analyst_agent(llm=None, tools: Optional[List] = None) -> Agent:
    """
    Create a Code Analyst agent.

    Specializes in understanding codebase architecture and identifying patterns.
    """
    return _build_agent(
        role="Senior Code Analyst",
        goal="Understand codebase architecture, identify patterns, and analyze code quality",
        backstory="""You are an expert software architect with 15 years of experience.
        You specialize in identifying design patterns, anti-patterns, and technical debt.
        You have a deep understanding of software architecture principles and best practices.
        You can quickly analyze codebases and provide insightful observations about their structure.""",
        llm=llm,
        tools=tools,
    )


def create_refactor_agent(llm=None, tools: Optional[List] = None) -> Agent:
    """
    Create a Refactoring Specialist agent.

    Specializes in proposing and executing safe code refactorings.
    """
    return _build_agent(
        role="Refactoring Specialist",
        goal="Improve code quality through safe, well-reasoned refactorings",
        backstory="""You are a master of code refactoring with deep knowledge of design patterns.
        You have refactored thousands of codebases and know how to improve code without breaking functionality.
        You always ensure refactorings are safe, well-tested, and improve maintainability.
        You understand the trade-offs between different refactoring approaches.""",
        llm=llm,
        tools=tools,
    )


def create_reviewer_agent(llm=None, tools: Optional[List] = None) -> Agent:
    """
    Create a Code Review Expert agent.

    Specializes in reviewing code changes and catching potential issues.
    """
    return _build_agent(
        role="Code Review Expert",
        goal="Ensure code quality, catch bugs, and identify security issues",
        backstory="""You are a veteran code reviewer who has reviewed over 10,000 pull requests.
        You have an eagle eye for bugs, security vulnerabilities, and maintainability issues.
        You provide constructive feedback that helps developers improve their code.
        You understand the importance of balancing perfectionism with pragmatism.""",
        llm=llm,
        tools=tools,
    )


def create_documentation_agent(llm=None, tools: Optional[List] = None) -> Agent:
    """
    Create a Documentation Specialist agent.

    Specializes in creating clear, comprehensive documentation.
    """
    return _build_agent(
        role="Documentation Specialist",
        goal="Create clear, comprehensive, and helpful documentation",
        backstory="""You are a technical writer with deep programming knowledge.
        You excel at explaining complex code in simple, understandable terms.
        You know how to write documentation that developers actually want to read.
        You understand the importance of examples, diagrams, and clear explanations.""",
        llm=llm,
        tools=tools,
    )


# Export all agent creators
__all__ = [
    'create_analyst_agent',
    'create_refactor_agent',
    'create_reviewer_agent',
    'create_documentation_agent'
]
|
|
@@ -19,12 +19,18 @@ tokenizer = tiktoken.get_encoding("cl100k_base")
|
|
| 19 |
|
| 20 |
@dataclass
|
| 21 |
class FileChunk:
|
| 22 |
-
"""Represents a chunk of code with byte positions."""
|
| 23 |
file_content: str
|
| 24 |
file_metadata: Dict
|
| 25 |
start_byte: int
|
| 26 |
end_byte: int
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
@cached_property
|
| 29 |
def filename(self):
|
| 30 |
if "file_path" not in self.file_metadata:
|
|
@@ -42,22 +48,47 @@ class FileChunk:
|
|
| 42 |
return len(tokenizer.encode(self.content, disallowed_special=()))
|
| 43 |
|
| 44 |
def to_document(self) -> Document:
|
| 45 |
-
"""Convert to LangChain Document."""
|
| 46 |
chunk_type = self.file_metadata.get("chunk_type", "code")
|
| 47 |
name = self.file_metadata.get("name", None)
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
class StructuralChunker:
|
|
@@ -167,7 +198,14 @@ class StructuralChunker:
|
|
| 167 |
name = self._get_node_name(node, file_content)
|
| 168 |
if name:
|
| 169 |
chunk_metadata["name"] = name
|
|
|
|
|
|
|
| 170 |
node_chunk.file_metadata = chunk_metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
return [node_chunk]
|
| 172 |
|
| 173 |
# If leaf node is too large, split it as text
|
|
@@ -249,3 +287,114 @@ class StructuralChunker:
|
|
| 249 |
if name_node:
|
| 250 |
return content[name_node.start_byte:name_node.end_byte]
|
| 251 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
@dataclass
|
| 21 |
class FileChunk:
|
| 22 |
+
"""Represents a chunk of code with byte positions and rich metadata."""
|
| 23 |
file_content: str
|
| 24 |
file_metadata: Dict
|
| 25 |
start_byte: int
|
| 26 |
end_byte: int
|
| 27 |
|
| 28 |
+
# Enhanced metadata fields
|
| 29 |
+
symbols_defined: Optional[List[str]] = None # Functions/classes defined in this chunk
|
| 30 |
+
imports_used: Optional[List[str]] = None # Import statements relevant to chunk
|
| 31 |
+
complexity_score: Optional[int] = None # Cyclomatic complexity
|
| 32 |
+
parent_context: Optional[str] = None # Parent class/module name
|
| 33 |
+
|
| 34 |
@cached_property
|
| 35 |
def filename(self):
|
| 36 |
if "file_path" not in self.file_metadata:
|
|
|
|
| 48 |
return len(tokenizer.encode(self.content, disallowed_special=()))
|
| 49 |
|
| 50 |
def to_document(self) -> "Document":
    """Convert this chunk to a LangChain Document with enhanced metadata.

    The metadata merges the chunk's file metadata with positional info
    (byte offsets, a 1-based "L<start>-L<end>" line range), a language
    guessed from the file extension, and any enhanced fields (symbols,
    imports, complexity, parent context) that were populated upstream.
    """
    chunk_type = self.file_metadata.get("chunk_type", "code")
    name = self.file_metadata.get("name", None)

    # Translate byte offsets into a 1-based line span by counting newlines.
    newlines_before = self.file_content[:self.start_byte].count('\n')
    newlines_inside = self.file_content[self.start_byte:self.end_byte].count('\n')
    line_range = f"L{newlines_before + 1}-L{newlines_before + newlines_inside + 1}"

    # Guess the language from the file extension; unknown extensions pass through.
    extension = self.filename.split('.')[-1].lower() if '.' in self.filename else 'unknown'
    language_map = {
        'py': 'python', 'js': 'javascript', 'ts': 'typescript',
        'jsx': 'javascript', 'tsx': 'typescript', 'java': 'java',
        'cpp': 'cpp', 'c': 'c', 'go': 'go', 'rs': 'rust'
    }
    language = language_map.get(extension, extension)

    metadata = {
        **self.file_metadata,
        "id": f"{self.filename}_{self.start_byte}_{self.end_byte}",
        "start_byte": self.start_byte,
        "end_byte": self.end_byte,
        "length": self.end_byte - self.start_byte,
        "line_range": line_range,
        "language": language,
        "chunk_type": chunk_type,
        "name": name,
    }

    # Attach enhanced metadata only when it was actually extracted.
    # NOTE: complexity uses an explicit None check so a score of 0 survives.
    if self.symbols_defined:
        metadata["symbols"] = self.symbols_defined
    if self.imports_used:
        metadata["imports"] = self.imports_used
    if self.complexity_score is not None:
        metadata["complexity"] = self.complexity_score
    if self.parent_context:
        metadata["parent_context"] = self.parent_context

    return Document(page_content=self.content, metadata=metadata)
|
| 92 |
|
| 93 |
|
| 94 |
class StructuralChunker:
|
|
|
|
| 198 |
name = self._get_node_name(node, file_content)
|
| 199 |
if name:
|
| 200 |
chunk_metadata["name"] = name
|
| 201 |
+
|
| 202 |
+
# Extract enhanced metadata
|
| 203 |
node_chunk.file_metadata = chunk_metadata
|
| 204 |
+
node_chunk.symbols_defined = self._extract_symbols(node, file_content)
|
| 205 |
+
node_chunk.imports_used = self._extract_imports(node, file_content)
|
| 206 |
+
node_chunk.complexity_score = self._calculate_complexity(node, file_content)
|
| 207 |
+
node_chunk.parent_context = self._get_parent_context(node, file_content)
|
| 208 |
+
|
| 209 |
return [node_chunk]
|
| 210 |
|
| 211 |
# If leaf node is too large, split it as text
|
|
|
|
| 287 |
if name_node:
|
| 288 |
return content[name_node.start_byte:name_node.end_byte]
|
| 289 |
return None
|
| 290 |
+
|
| 291 |
+
def _extract_symbols(self, node: "Node", content: str) -> List[str]:
    """
    Extract function and class names defined in this node.

    Methods are qualified with their enclosing class name.

    Returns:
        List of symbol names (e.g., ['MyClass', 'MyClass.my_method'])
    """
    found: List[str] = []

    def walk(current: "Node", enclosing_class: Optional[str] = None) -> None:
        if current.type in ('function_definition', 'class_definition', 'method_definition'):
            symbol = self._get_node_name(current, content)
            if symbol:
                found.append(f"{enclosing_class}.{symbol}" if enclosing_class else symbol)

            # Descend into a class with that class as the new parent,
            # and stop here so its children are not visited twice.
            if current.type == 'class_definition':
                for child in current.children:
                    walk(child, symbol)
                return

        for child in current.children:
            walk(child, enclosing_class)

    walk(node)
    return found
|
| 322 |
+
|
| 323 |
+
def _extract_imports(self, node: "Node", content: str) -> List[str]:
    """
    Extract import statements from this node.

    Walks the subtree and collects the source text of every import node.
    'import_statement' covers both the Python and JavaScript/TypeScript
    grammars; 'import_from_statement' covers Python's "from ... import".

    Returns:
        List of import statements (e.g., ['import os', 'from typing import List'])
    """
    imports: List[str] = []

    def traverse(n: "Node") -> None:
        # Fix: the original had a separate `elif n.type == 'import_statement'`
        # branch "for JS/TS" that was unreachable — the first condition already
        # matched that type. A single membership test covers both grammars.
        if n.type in ('import_statement', 'import_from_statement'):
            imports.append(content[n.start_byte:n.end_byte].strip())

        for child in n.children:
            traverse(child)

    traverse(node)
    return imports
|
| 349 |
+
|
| 350 |
+
def _calculate_complexity(self, node: "Node", content: str) -> int:
    """
    Calculate cyclomatic complexity for a code chunk.

    Cyclomatic complexity = number of decision points + 1
    Decision points: if, elif, for, while, except, case, and, or, and
    conditional (ternary) expressions.

    Fix: a plain `else` branch is NOT a decision point — it introduces no
    new branch condition — so 'else_clause' is no longer counted. The
    previous implementation counted it, inflating scores relative to both
    the standard McCabe definition and this docstring.

    Returns:
        Complexity score (integer)
    """
    complexity = 1  # Base complexity: one linear path through the code.

    # Node types that introduce an additional independent execution path.
    decision_nodes = {
        'if_statement', 'elif_clause',
        'for_statement', 'while_statement',
        'except_clause', 'case_clause',
        'conditional_expression',  # ternary operator
        'boolean_operator',  # and, or
    }

    def traverse(n: "Node") -> None:
        nonlocal complexity

        if n.type in decision_nodes:
            complexity += 1

        for child in n.children:
            traverse(child)

    traverse(node)
    return complexity
|
| 382 |
+
|
| 383 |
+
def _get_parent_context(self, node: "Node", content: str) -> Optional[str]:
    """
    Get the parent class or module context for this node.

    Walks up the ancestor chain and returns the name of the first
    enclosing class definition that has a resolvable name.

    Returns:
        Parent class name or None
    """
    ancestor = node.parent
    while ancestor is not None:
        if ancestor.type == 'class_definition':
            class_name = self._get_node_name(ancestor, content)
            if class_name:
                return class_name
        ancestor = ancestor.parent

    return None
|
| 400 |
+
|
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration system for RAG pipeline.
|
| 3 |
+
|
| 4 |
+
Centralizes all configuration options for chunking, indexing, retrieval,
|
| 5 |
+
and privacy features. Loads from environment variables with sensible defaults.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import Optional, List
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class ChunkingConfig:
    """Configuration for code chunking."""

    # Maximum tokens per chunk.
    max_chunk_tokens: int = 800
    # Minimum tokens per chunk (for merging small chunks).
    min_chunk_tokens: int = 100
    # Include relevant import statements with chunks.
    preserve_imports: bool = True
    # Include parent class/module name in chunk metadata.
    include_parent_context: bool = True
    # Calculate cyclomatic complexity for chunks.
    calculate_complexity: bool = True

    @classmethod
    def from_env(cls) -> 'ChunkingConfig':
        """Load configuration from environment variables."""
        def flag(var: str, default: str) -> bool:
            # A variable is truthy only when it equals "true" (any case).
            return os.getenv(var, default).lower() == 'true'

        return cls(
            max_chunk_tokens=int(os.getenv('CHUNK_MAX_TOKENS', '800')),
            min_chunk_tokens=int(os.getenv('CHUNK_MIN_TOKENS', '100')),
            preserve_imports=flag('CHUNK_PRESERVE_IMPORTS', 'true'),
            include_parent_context=flag('CHUNK_PARENT_CONTEXT', 'true'),
            calculate_complexity=flag('CHUNK_CALCULATE_COMPLEXITY', 'true'),
        )
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass
class PrivacyConfig:
    """Configuration for privacy features."""

    # Enable file path obfuscation for sensitive codebases.
    enable_path_obfuscation: bool = False
    # Secret key for path obfuscation (auto-generated if not provided).
    obfuscation_key: Optional[str] = None
    # File to store path obfuscation mappings.
    obfuscation_mapping_file: str = "chroma_db/.path_mapping.json"

    @classmethod
    def from_env(cls) -> 'PrivacyConfig':
        """Load configuration from environment variables."""
        enabled = os.getenv('ENABLE_PATH_OBFUSCATION', 'false').lower() == 'true'
        return cls(
            enable_path_obfuscation=enabled,
            obfuscation_key=os.getenv('PATH_OBFUSCATION_KEY'),
            obfuscation_mapping_file=os.getenv('PATH_MAPPING_FILE', 'chroma_db/.path_mapping.json'),
        )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@dataclass
class IndexingConfig:
    """Configuration for indexing operations."""

    # Use Merkle tree for incremental indexing.
    enable_incremental_indexing: bool = True
    # Directory to store Merkle tree snapshots.
    merkle_snapshot_dir: str = "chroma_db/merkle_snapshots"
    # Number of documents to process in each batch.
    batch_size: int = 100
    # File patterns to ignore during indexing.
    ignore_patterns: List[str] = field(default_factory=lambda: [
        '*.pyc', '__pycache__/*', '.git/*', 'node_modules/*',
        '.venv/*', 'venv/*', '*.egg-info/*', 'dist/*', 'build/*'
    ])
    # Maximum file size to index (in MB).
    max_file_size_mb: int = 10

    @classmethod
    def from_env(cls) -> 'IndexingConfig':
        """Load configuration from environment variables.

        INDEXING_IGNORE_PATTERNS is a comma-separated list. Entries are
        stripped of surrounding whitespace and empty entries are dropped,
        so values such as "*.pyc, .git/*" work as expected. When the
        variable is unset (or contains only separators) the class
        defaults apply.
        """
        raw_patterns = os.getenv('INDEXING_IGNORE_PATTERNS', '')
        # Fix: a bare split(',') kept whitespace (" .git/*") and empty
        # strings, producing patterns that never match any file.
        ignore_patterns = [p.strip() for p in raw_patterns.split(',') if p.strip()]
        if not ignore_patterns:
            ignore_patterns = cls().ignore_patterns

        return cls(
            enable_incremental_indexing=os.getenv('ENABLE_INCREMENTAL_INDEXING', 'true').lower() == 'true',
            merkle_snapshot_dir=os.getenv('MERKLE_SNAPSHOT_DIR', 'chroma_db/merkle_snapshots'),
            batch_size=int(os.getenv('INDEXING_BATCH_SIZE', '100')),
            ignore_patterns=ignore_patterns,
            max_file_size_mb=int(os.getenv('MAX_FILE_SIZE_MB', '10')),
        )
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@dataclass
class RetrievalConfig:
    """Configuration for retrieval operations."""

    # Apply reranking to retrieval results.
    enable_reranking: bool = True
    # Number of documents to retrieve from vector store.
    retrieval_k: int = 10
    # Number of top documents to return after reranking.
    rerank_top_k: int = 5
    # Use multi-query retriever for query expansion.
    enable_multi_query: bool = False
    # Enable filtering by metadata (language, type, etc.).
    enable_metadata_filtering: bool = True
    # Minimum similarity score for retrieval.
    similarity_threshold: float = 0.5

    @classmethod
    def from_env(cls) -> 'RetrievalConfig':
        """Load configuration from environment variables."""
        def flag(var: str, default: str) -> bool:
            # A variable is truthy only when it equals "true" (any case).
            return os.getenv(var, default).lower() == 'true'

        return cls(
            enable_reranking=flag('ENABLE_RERANKING', 'true'),
            retrieval_k=int(os.getenv('RETRIEVAL_K', '10')),
            rerank_top_k=int(os.getenv('RERANK_TOP_K', '5')),
            enable_multi_query=flag('ENABLE_MULTI_QUERY', 'false'),
            enable_metadata_filtering=flag('ENABLE_METADATA_FILTERING', 'true'),
            similarity_threshold=float(os.getenv('SIMILARITY_THRESHOLD', '0.5')),
        )
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
@dataclass
class RAGConfig:
    """
    Complete RAG pipeline configuration.

    This is the main configuration class that combines all sub-configurations
    (chunking, privacy, indexing, retrieval) with a handful of top-level
    provider/storage settings.
    """

    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)
    privacy: PrivacyConfig = field(default_factory=PrivacyConfig)
    indexing: IndexingConfig = field(default_factory=IndexingConfig)
    retrieval: RetrievalConfig = field(default_factory=RetrievalConfig)

    # General settings
    persist_directory: str = "chroma_db"
    """Directory for vector database persistence"""

    embedding_provider: str = "gemini"
    """Embedding provider: 'gemini', 'openai', 'huggingface'"""

    # NOTE(review): indexer.py hardcodes "models/gemini-embedding-001"; this
    # default may be stale — confirm which model is actually intended.
    embedding_model: str = "models/embedding-001"
    """Embedding model name"""

    llm_provider: str = "gemini"
    """LLM provider for chat: 'gemini', 'groq', 'openai'"""

    llm_model: str = "gemini-2.0-flash-exp"
    """LLM model name"""

    log_level: str = "INFO"
    """Logging level: DEBUG, INFO, WARNING, ERROR"""

    @classmethod
    def from_env(cls) -> 'RAGConfig':
        """
        Load complete configuration from environment variables.

        Each sub-configuration is built via its own ``from_env``; unset
        variables fall back to the defaults shown on the fields above.

        Returns:
            RAGConfig instance with all settings loaded
        """
        return cls(
            chunking=ChunkingConfig.from_env(),
            privacy=PrivacyConfig.from_env(),
            indexing=IndexingConfig.from_env(),
            retrieval=RetrievalConfig.from_env(),
            persist_directory=os.getenv('PERSIST_DIRECTORY', 'chroma_db'),
            embedding_provider=os.getenv('EMBEDDING_PROVIDER', 'gemini'),
            embedding_model=os.getenv('EMBEDDING_MODEL', 'models/embedding-001'),
            llm_provider=os.getenv('LLM_PROVIDER', 'gemini'),
            llm_model=os.getenv('LLM_MODEL', 'gemini-2.0-flash-exp'),
            log_level=os.getenv('LOG_LEVEL', 'INFO'),
        )

    def validate(self) -> List[str]:
        """
        Validate configuration settings.

        Collects every violation rather than failing fast, so callers can
        report all problems at once.

        Returns:
            List of validation error messages (empty if valid)
        """
        errors = []

        # Chunking validation
        if self.chunking.max_chunk_tokens < self.chunking.min_chunk_tokens:
            errors.append("max_chunk_tokens must be >= min_chunk_tokens")

        if self.chunking.max_chunk_tokens > 8000:
            errors.append("max_chunk_tokens should not exceed 8000 (model context limits)")

        # Privacy validation: obfuscation without a key would be reversible.
        if self.privacy.enable_path_obfuscation and not self.privacy.obfuscation_key:
            errors.append("obfuscation_key required when path obfuscation is enabled")

        # Indexing validation
        if self.indexing.batch_size < 1:
            errors.append("batch_size must be at least 1")

        if self.indexing.max_file_size_mb < 1:
            errors.append("max_file_size_mb must be at least 1")

        # Retrieval validation: can't rerank more docs than were retrieved.
        if self.retrieval.retrieval_k < self.retrieval.rerank_top_k:
            errors.append("retrieval_k must be >= rerank_top_k")

        if not 0.0 <= self.retrieval.similarity_threshold <= 1.0:
            errors.append("similarity_threshold must be between 0.0 and 1.0")

        # Provider validation
        valid_embedding_providers = ['gemini', 'openai', 'huggingface']
        if self.embedding_provider not in valid_embedding_providers:
            errors.append(f"embedding_provider must be one of: {valid_embedding_providers}")

        valid_llm_providers = ['gemini', 'groq', 'openai']
        if self.llm_provider not in valid_llm_providers:
            errors.append(f"llm_provider must be one of: {valid_llm_providers}")

        return errors

    def ensure_directories(self):
        """Create necessary directories if they don't exist."""
        Path(self.persist_directory).mkdir(parents=True, exist_ok=True)
        Path(self.indexing.merkle_snapshot_dir).mkdir(parents=True, exist_ok=True)

        # Create parent directory for path mapping file
        if self.privacy.enable_path_obfuscation:
            Path(self.privacy.obfuscation_mapping_file).parent.mkdir(parents=True, exist_ok=True)

    def summary(self) -> str:
        """Get a human-readable summary of the configuration."""
        return f"""
RAG Configuration Summary:
==========================
Chunking:
  - Max tokens: {self.chunking.max_chunk_tokens}
  - Min tokens: {self.chunking.min_chunk_tokens}
  - Preserve imports: {self.chunking.preserve_imports}
  - Calculate complexity: {self.chunking.calculate_complexity}

Privacy:
  - Path obfuscation: {self.privacy.enable_path_obfuscation}

Indexing:
  - Incremental indexing: {self.indexing.enable_incremental_indexing}
  - Batch size: {self.indexing.batch_size}
  - Max file size: {self.indexing.max_file_size_mb} MB

Retrieval:
  - Reranking: {self.retrieval.enable_reranking}
  - Retrieval K: {self.retrieval.retrieval_k}
  - Rerank top K: {self.retrieval.rerank_top_k}
  - Multi-query: {self.retrieval.enable_multi_query}

Providers:
  - Embeddings: {self.embedding_provider} ({self.embedding_model})
  - LLM: {self.llm_provider} ({self.llm_model})
  - Persist dir: {self.persist_directory}
""".strip()
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# Global configuration instance
|
| 280 |
+
_config: Optional[RAGConfig] = None
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def get_config() -> RAGConfig:
    """
    Get the global RAG configuration instance.

    Loads from environment on first call, validates it, creates required
    directories, and caches the instance for subsequent calls.

    Returns:
        RAGConfig instance

    Raises:
        ValueError: If the loaded configuration fails validation. Nothing is
            cached in that case, so a later call re-reads the environment
            (the original assigned the global before validating, leaving an
            invalid config cached and returned by every subsequent call).
    """
    global _config

    if _config is None:
        config = RAGConfig.from_env()

        # Validate BEFORE caching or touching the filesystem, so a bad
        # configuration neither sticks around nor creates directories.
        errors = config.validate()
        if errors:
            raise ValueError(
                "Invalid configuration:\n" + "\n".join(f"  - {e}" for e in errors)
            )

        config.ensure_directories()
        _config = config

    return _config
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def reset_config() -> None:
    """Drop the cached global configuration.

    The next get_config() call will reload it from the environment
    (useful for testing).
    """
    global _config
    _config = None
|
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Crew workflows for multi-agent collaboration.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from crewai import Crew, Task, Process
|
| 6 |
+
from typing import Dict, Any, Optional
|
| 7 |
+
from code_chatbot.agents import (
|
| 8 |
+
create_analyst_agent,
|
| 9 |
+
create_refactor_agent,
|
| 10 |
+
create_reviewer_agent,
|
| 11 |
+
create_documentation_agent
|
| 12 |
+
)
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class RefactoringCrew:
    """
    Crew for automated refactoring tasks.

    Workflow:
    1. Analyst examines code and identifies refactoring opportunities
    2. Refactor agent implements the top refactorings
    3. Reviewer checks the refactored code for correctness

    Tasks run sequentially; each task receives the previous task's output
    through its ``context`` list.
    """

    def __init__(self, llm=None, mcp_tools: Optional[list] = None):
        """
        Initialize refactoring crew.

        Args:
            llm: Language model to use for agents
            mcp_tools: MCP tools to provide to agents (defaults to none)
        """
        self.llm = llm
        self.mcp_tools = mcp_tools or []

        # Create agents — all three share the same LLM and MCP tool set.
        self.analyst = create_analyst_agent(llm=llm, tools=self.mcp_tools)
        self.refactor = create_refactor_agent(llm=llm, tools=self.mcp_tools)
        self.reviewer = create_reviewer_agent(llm=llm, tools=self.mcp_tools)

    def create_crew(self, file_path: str) -> Crew:
        """
        Create a crew for refactoring a specific file.

        Args:
            file_path: Path to file to refactor

        Returns:
            Configured Crew instance (sequential process, verbose)
        """
        # Define tasks
        analysis_task = Task(
            description=f"""Analyze the file {file_path} and identify refactoring opportunities.

            Look for:
            - Long functions that should be split
            - Duplicate code
            - Complex conditionals
            - Code smells
            - Opportunities for better naming

            Provide a prioritized list of the top 3-5 refactoring suggestions with rationale.""",
            agent=self.analyst,
            expected_output="A prioritized list of refactoring suggestions with detailed rationale"
        )

        refactor_task = Task(
            description=f"""Based on the analysis, implement the top 3 refactorings for {file_path}.

            For each refactoring:
            1. Explain what you're changing and why
            2. Show the before and after code
            3. Ensure the refactoring is safe and doesn't break functionality

            Focus on high-impact, low-risk refactorings first.""",
            agent=self.refactor,
            expected_output="Detailed refactoring plan with before/after code examples",
            # Receives the analyst's output.
            context=[analysis_task]
        )

        review_task = Task(
            description=f"""Review the proposed refactorings for {file_path}.

            Check for:
            - Correctness: Do the refactorings preserve functionality?
            - Quality: Do they actually improve the code?
            - Safety: Are there any risks or edge cases?
            - Completeness: Is anything missing?

            Provide a review report with approval or requested changes.""",
            agent=self.reviewer,
            expected_output="Review report with approval status and any concerns",
            # Receives the refactor agent's output.
            context=[refactor_task]
        )

        # Create crew
        crew = Crew(
            agents=[self.analyst, self.refactor, self.reviewer],
            tasks=[analysis_task, review_task, refactor_task][:0] or [analysis_task, refactor_task, review_task],
            process=Process.sequential,
            verbose=True
        )

        return crew

    def run(self, file_path: str) -> Dict[str, Any]:
        """
        Run the refactoring crew on a file.

        Args:
            file_path: Path to file to refactor

        Returns:
            Dict with 'file_path', the crew 'result' (kickoff() output), and
            'tasks_completed' (count of tasks configured on the crew)
        """
        crew = self.create_crew(file_path)
        # kickoff() executes all tasks sequentially and blocks until done.
        result = crew.kickoff()

        return {
            'file_path': file_path,
            'result': result,
            'tasks_completed': len(crew.tasks)
        }
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class CodeReviewCrew:
    """
    Crew for comprehensive code review.

    Workflow:
    1. Analyst examines code structure and patterns
    2. Reviewer performs detailed code review
    3. Documentation agent suggests documentation improvements

    Tasks run sequentially; the documentation task receives both earlier
    tasks' outputs through its ``context`` list.
    """

    def __init__(self, llm=None, mcp_tools: Optional[list] = None):
        """Initialize code review crew.

        Args:
            llm: Language model to use for agents
            mcp_tools: MCP tools to provide to agents (defaults to none)
        """
        self.llm = llm
        self.mcp_tools = mcp_tools or []

        # All agents share the same LLM and MCP tool set.
        self.analyst = create_analyst_agent(llm=llm, tools=self.mcp_tools)
        self.reviewer = create_reviewer_agent(llm=llm, tools=self.mcp_tools)
        self.documentation = create_documentation_agent(llm=llm, tools=self.mcp_tools)

    def create_crew(self, file_path: str) -> Crew:
        """Create a crew for reviewing a specific file.

        Args:
            file_path: Path to file to review

        Returns:
            Configured Crew instance (sequential process, verbose)
        """
        analysis_task = Task(
            description=f"""Analyze the structure and design of {file_path}.

            Examine:
            - Overall architecture and design patterns
            - Code organization and modularity
            - Complexity and maintainability
            - Dependencies and coupling

            Provide insights about the code's design quality.""",
            agent=self.analyst,
            expected_output="Architectural analysis with insights about design quality"
        )

        review_task = Task(
            description=f"""Perform a detailed code review of {file_path}.

            Check for:
            - Bugs and potential issues
            - Security vulnerabilities
            - Performance problems
            - Code style and best practices
            - Error handling

            Provide specific, actionable feedback.""",
            agent=self.reviewer,
            expected_output="Detailed code review with specific issues and recommendations",
            # Receives the analyst's output.
            context=[analysis_task]
        )

        documentation_task = Task(
            description=f"""Review and suggest improvements for documentation in {file_path}.

            Evaluate:
            - Docstrings and comments
            - Function/class documentation
            - Code clarity and readability
            - Missing documentation

            Suggest specific documentation improvements.""",
            agent=self.documentation,
            expected_output="Documentation review with improvement suggestions",
            # Receives both the analysis and the review outputs.
            context=[analysis_task, review_task]
        )

        crew = Crew(
            agents=[self.analyst, self.reviewer, self.documentation],
            tasks=[analysis_task, review_task, documentation_task],
            process=Process.sequential,
            verbose=True
        )

        return crew

    def run(self, file_path: str) -> Dict[str, Any]:
        """Run the code review crew on a file.

        Args:
            file_path: Path to file to review

        Returns:
            Dict with 'file_path', the crew 'result' (kickoff() output), and
            'tasks_completed' (count of tasks configured on the crew)
        """
        crew = self.create_crew(file_path)
        # kickoff() executes all tasks sequentially and blocks until done.
        result = crew.kickoff()

        return {
            'file_path': file_path,
            'result': result,
            'tasks_completed': len(crew.tasks)
        }
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# Export crews
|
| 217 |
+
__all__ = ['RefactoringCrew', 'CodeReviewCrew']
|
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Incremental indexing methods for the Indexer class.
|
| 3 |
+
|
| 4 |
+
This module extends the Indexer with methods for efficient incremental indexing
|
| 5 |
+
using Merkle trees for change detection.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
from langchain_core.documents import Document
|
| 11 |
+
import logging
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def add_incremental_indexing_methods(indexer_class):
    """
    Add incremental indexing methods to the Indexer class.

    This is a helper module to extend the Indexer without modifying the
    original file too much.

    Attaches ``incremental_index``, ``_remove_file_embeddings`` and
    ``get_indexing_stats`` to ``indexer_class`` (monkey-patched in place)
    and returns the class.
    """

    def incremental_index(
        self,
        source_path: str,
        collection_name: str = "codebase",
        vector_db_type: str = "chroma"
    ):
        """
        Perform incremental indexing using Merkle tree change detection.

        Only re-indexes files that have changed since the last indexing.

        Args:
            source_path: Path to the codebase directory
            collection_name: Name of the vector store collection
            vector_db_type: Type of vector database ('chroma', 'faiss', 'qdrant')

        Returns:
            ChangeSet describing what was indexed
        """
        if not self.config.indexing.enable_incremental_indexing:
            logger.info("Incremental indexing disabled, performing full index")
            # Fall back to full indexing
            from code_chatbot.universal_ingestor import UniversalIngestor
            ingestor = UniversalIngestor(source_path)
            ingestor.download()

            documents = []
            for content, metadata in ingestor.walk():
                documents.append(Document(page_content=content, metadata=metadata))

            # NOTE(review): this branch returns index_documents()' result,
            # not a ChangeSet as documented above — confirm callers handle
            # both return shapes.
            return self.index_documents(documents, collection_name, vector_db_type)

        # Get snapshot path for this collection
        snapshot_dir = Path(self.config.indexing.merkle_snapshot_dir)
        snapshot_dir.mkdir(parents=True, exist_ok=True)
        snapshot_path = snapshot_dir / f"{collection_name}_snapshot.json"

        # Load previous snapshot (presumably empty/None on first run —
        # confirm against MerkleTree.load_snapshot).
        old_tree = self.merkle_tree.load_snapshot(str(snapshot_path))

        # Build current tree
        logger.info(f"Building Merkle tree for {source_path}...")
        new_tree = self.merkle_tree.build_tree(source_path)

        # Compare trees to find changes
        changes = self.merkle_tree.compare_trees(old_tree, new_tree)

        logger.info(f"Change detection: {changes.summary()}")

        if not changes.has_changes():
            logger.info("No changes detected, skipping indexing")
            # Still persist the fresh snapshot so the baseline stays current.
            self.merkle_tree.save_snapshot(new_tree, str(snapshot_path))
            return changes

        # Remove embeddings for deleted and modified files (modified files
        # are deleted first, then re-indexed below).
        files_to_remove = changes.deleted + changes.modified
        if files_to_remove:
            logger.info(f"Removing embeddings for {len(files_to_remove)} files...")
            for file_path in files_to_remove:
                self._remove_file_embeddings(file_path, collection_name, vector_db_type)

        # Index new and modified files
        files_to_index = changes.added + changes.modified
        if files_to_index:
            logger.info(f"Indexing {len(files_to_index)} files...")
            documents = []

            for relative_path in files_to_index:
                full_path = Path(source_path) / relative_path

                # File may have been removed between tree build and read.
                if not full_path.exists() or not full_path.is_file():
                    continue

                # Check file size
                file_size_mb = full_path.stat().st_size / (1024 * 1024)
                if file_size_mb > self.config.indexing.max_file_size_mb:
                    logger.warning(f"Skipping {relative_path}: file too large ({file_size_mb:.1f} MB)")
                    continue

                try:
                    content = full_path.read_text(encoding='utf-8', errors='ignore')

                    # Apply path obfuscation if enabled
                    display_path = relative_path
                    if self.path_obfuscator:
                        display_path = self.path_obfuscator.obfuscate_path(relative_path)

                    # "_original_path" keeps the real path alongside the
                    # (possibly obfuscated) "file_path" shown in metadata.
                    documents.append(Document(
                        page_content=content,
                        metadata={"file_path": display_path, "_original_path": relative_path}
                    ))
                except Exception as e:
                    # Best-effort: log and skip unreadable files.
                    logger.error(f"Failed to read {relative_path}: {e}")

            if documents:
                self.index_documents(documents, collection_name, vector_db_type)

        # Save new snapshot
        self.merkle_tree.save_snapshot(new_tree, str(snapshot_path))

        logger.info(f"Incremental indexing complete: {changes.summary()}")
        return changes

    def _remove_file_embeddings(
        self,
        file_path: str,
        collection_name: str = "codebase",
        vector_db_type: str = "chroma"
    ):
        """
        Remove all embeddings for a specific file.

        Best-effort: failures are logged, never raised.

        Args:
            file_path: Relative path to the file
            collection_name: Name of the collection
            vector_db_type: Type of vector database
        """
        from code_chatbot.indexer import get_chroma_client

        try:
            if vector_db_type == "chroma":
                chroma_client = get_chroma_client(self.persist_directory)
                collection = chroma_client.get_collection(collection_name)

                # Query for documents with this file_path
                results = collection.get(
                    where={"file_path": file_path}
                )

                if results and results['ids']:
                    collection.delete(ids=results['ids'])
                    logger.info(f"Removed {len(results['ids'])} chunks for {file_path}")

            elif vector_db_type == "faiss":
                # FAISS has no per-document delete here; caller must re-index.
                logger.warning("FAISS does not support selective deletion, full re-index required")

            elif vector_db_type == "qdrant":
                from qdrant_client import QdrantClient

                url = os.getenv("QDRANT_URL")
                api_key = os.getenv("QDRANT_API_KEY")

                client = QdrantClient(url=url, api_key=api_key)

                # Delete every point whose payload "file_path" matches.
                client.delete(
                    collection_name=collection_name,
                    points_selector={
                        "filter": {
                            "must": [{"key": "file_path", "match": {"value": file_path}}]
                        }
                    }
                )
                logger.info(f"Removed chunks for {file_path} from Qdrant")

        except Exception as e:
            logger.error(f"Failed to remove embeddings for {file_path}: {e}")

    def get_indexing_stats(self, collection_name: str = "codebase") -> dict:
        """
        Get statistics about the indexed codebase.

        Chroma-only; other backends are not queried here.

        Returns:
            Dictionary with stats (total_chunks, unique_files, etc.);
            empty dict on any failure.
        """
        from code_chatbot.indexer import get_chroma_client

        try:
            chroma_client = get_chroma_client(self.persist_directory)
            collection = chroma_client.get_collection(collection_name)

            # Get all documents (ids + metadatas; no embeddings needed)
            results = collection.get()

            total_chunks = len(results['ids']) if results and results['ids'] else 0

            # Count unique files
            unique_files = set()
            if results and results['metadatas']:
                for metadata in results['metadatas']:
                    if 'file_path' in metadata:
                        unique_files.add(metadata['file_path'])

            return {
                'total_chunks': total_chunks,
                'unique_files': len(unique_files),
                'collection_name': collection_name,
                'persist_directory': self.persist_directory
            }
        except Exception as e:
            logger.error(f"Failed to get indexing stats: {e}")
            return {}

    # Add methods to the class
    indexer_class.incremental_index = incremental_index
    indexer_class._remove_file_embeddings = _remove_file_embeddings
    indexer_class.get_indexing_stats = get_indexing_stats

    return indexer_class
|
|
@@ -1,9 +1,13 @@
|
|
| 1 |
import os
|
| 2 |
-
from typing import List
|
|
|
|
| 3 |
from langchain_core.documents import Document
|
| 4 |
from langchain_community.vectorstores import Chroma
|
| 5 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 6 |
from code_chatbot.chunker import StructuralChunker
|
|
|
|
|
|
|
|
|
|
| 7 |
import shutil
|
| 8 |
import logging
|
| 9 |
|
|
@@ -40,8 +44,23 @@ class Indexer:
|
|
| 40 |
self.persist_directory = persist_directory
|
| 41 |
self.provider = provider
|
| 42 |
|
|
|
|
|
|
|
|
|
|
| 43 |
# Initialize Structural Chunker
|
| 44 |
-
self.chunker = StructuralChunker()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# Setup Embeddings (only Gemini supported)
|
| 47 |
if embedding_function:
|
|
@@ -52,7 +71,7 @@ class Indexer:
|
|
| 52 |
if not api_key:
|
| 53 |
raise ValueError("Google API Key is required for Gemini Embeddings")
|
| 54 |
self.embedding_function = GoogleGenerativeAIEmbeddings(
|
| 55 |
-
model="models/
|
| 56 |
google_api_key=api_key
|
| 57 |
)
|
| 58 |
else:
|
|
@@ -120,8 +139,8 @@ class Indexer:
|
|
| 120 |
else:
|
| 121 |
raise ValueError(f"Unsupported Vector DB: {vector_db_type}")
|
| 122 |
|
| 123 |
-
# Batch processing
|
| 124 |
-
batch_size =
|
| 125 |
total_chunks = len(all_chunks)
|
| 126 |
|
| 127 |
logger.info(f"Indexing {total_chunks} chunks in batches of {batch_size}...")
|
|
@@ -162,15 +181,24 @@ class Indexer:
|
|
| 162 |
# Loop for Chroma (existing logic)
|
| 163 |
for i in range(0, total_chunks, batch_size):
|
| 164 |
batch = all_chunks[i:i + batch_size]
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
# PersistentClient auto-persists
|
|
@@ -235,3 +263,7 @@ class Indexer:
|
|
| 235 |
retriever = vector_store.as_retriever(search_kwargs={"k": k})
|
| 236 |
logger.info(f"Retriever created with k={k}")
|
| 237 |
return retriever
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
from pathlib import Path
|
| 4 |
from langchain_core.documents import Document
|
| 5 |
from langchain_community.vectorstores import Chroma
|
| 6 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 7 |
from code_chatbot.chunker import StructuralChunker
|
| 8 |
+
from code_chatbot.merkle_tree import MerkleTree, ChangeSet
|
| 9 |
+
from code_chatbot.path_obfuscator import PathObfuscator
|
| 10 |
+
from code_chatbot.config import get_config
|
| 11 |
import shutil
|
| 12 |
import logging
|
| 13 |
|
|
|
|
| 44 |
self.persist_directory = persist_directory
|
| 45 |
self.provider = provider
|
| 46 |
|
| 47 |
+
# Load configuration
|
| 48 |
+
self.config = get_config()
|
| 49 |
+
|
| 50 |
# Initialize Structural Chunker
|
| 51 |
+
self.chunker = StructuralChunker(max_tokens=self.config.chunking.max_chunk_tokens)
|
| 52 |
+
|
| 53 |
+
# Initialize Merkle tree for change detection
|
| 54 |
+
self.merkle_tree = MerkleTree(ignore_patterns=self.config.indexing.ignore_patterns)
|
| 55 |
+
|
| 56 |
+
# Initialize path obfuscator if enabled
|
| 57 |
+
self.path_obfuscator: Optional[PathObfuscator] = None
|
| 58 |
+
if self.config.privacy.enable_path_obfuscation:
|
| 59 |
+
self.path_obfuscator = PathObfuscator(
|
| 60 |
+
secret_key=self.config.privacy.obfuscation_key,
|
| 61 |
+
mapping_file=self.config.privacy.obfuscation_mapping_file
|
| 62 |
+
)
|
| 63 |
+
logger.info("Path obfuscation enabled")
|
| 64 |
|
| 65 |
# Setup Embeddings (only Gemini supported)
|
| 66 |
if embedding_function:
|
|
|
|
| 71 |
if not api_key:
|
| 72 |
raise ValueError("Google API Key is required for Gemini Embeddings")
|
| 73 |
self.embedding_function = GoogleGenerativeAIEmbeddings(
|
| 74 |
+
model="models/gemini-embedding-001",
|
| 75 |
google_api_key=api_key
|
| 76 |
)
|
| 77 |
else:
|
|
|
|
| 139 |
else:
|
| 140 |
raise ValueError(f"Unsupported Vector DB: {vector_db_type}")
|
| 141 |
|
| 142 |
+
# Batch processing - smaller batches to avoid rate limits
|
| 143 |
+
batch_size = 20 # Reduced for free tier rate limits
|
| 144 |
total_chunks = len(all_chunks)
|
| 145 |
|
| 146 |
logger.info(f"Indexing {total_chunks} chunks in batches of {batch_size}...")
|
|
|
|
| 181 |
# Loop for Chroma (existing logic)
|
| 182 |
for i in range(0, total_chunks, batch_size):
|
| 183 |
batch = all_chunks[i:i + batch_size]
|
| 184 |
+
# Retry logic for rate limits
|
| 185 |
+
max_retries = 5
|
| 186 |
+
for retry in range(max_retries):
|
| 187 |
+
try:
|
| 188 |
+
vectordb.add_documents(documents=batch)
|
| 189 |
+
logger.info(f"Indexed batch {i // batch_size + 1}/{(total_chunks + batch_size - 1) // batch_size}")
|
| 190 |
+
# Delay to avoid rate limits (free tier is ~15 req/min)
|
| 191 |
+
time.sleep(4) # 4 seconds between batches = ~15/min
|
| 192 |
+
break
|
| 193 |
+
except Exception as e:
|
| 194 |
+
error_str = str(e).lower()
|
| 195 |
+
if 'rate' in error_str or '429' in error_str or 'quota' in error_str or 'resource_exhausted' in error_str:
|
| 196 |
+
wait_time = 30 * (retry + 1) # 30s, 60s, 90s, 120s, 150s
|
| 197 |
+
logger.warning(f"Rate limit hit, waiting {wait_time}s... (retry {retry+1}/{max_retries})")
|
| 198 |
+
time.sleep(wait_time)
|
| 199 |
+
else:
|
| 200 |
+
logger.error(f"Error indexing batch {i}: {e}")
|
| 201 |
+
break
|
| 202 |
|
| 203 |
|
| 204 |
# PersistentClient auto-persists
|
|
|
|
| 263 |
retriever = vector_store.as_retriever(search_kwargs={"k": k})
|
| 264 |
logger.info(f"Retriever created with k={k}")
|
| 265 |
return retriever
|
| 266 |
+
|
| 267 |
+
# Add incremental indexing methods to the Indexer class.
# NOTE(review): this rebinds the module-level `Indexer` name at import time
# (decorator-style monkey-patching) — presumably the helper returns the
# augmented class; confirm in code_chatbot.incremental_indexing. Anything
# importing `Indexer` from this module gets the extended class.
from code_chatbot.incremental_indexing import add_incremental_indexing_methods
Indexer = add_incremental_indexing_methods(Indexer)
|
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MCP Client for interacting with Refactor MCP Server.
|
| 3 |
+
|
| 4 |
+
Provides async methods to call MCP tools from other parts of the application.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
from typing import List, Dict, Optional
|
| 9 |
+
from code_chatbot.mcp_server import RefactorMCPServer, SearchResult, RefactorResult, RefactorSuggestion
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class MCPClient:
    """
    Client for Refactor MCP server.

    Thin synchronous wrapper around RefactorMCPServer: each public method
    delegates to the matching server tool, logs the outcome, and converts
    unexpected exceptions into safe "empty" return values so callers never
    have to guard with try/except. Formatting helpers turn the server's
    result objects into display strings.
    """

    def __init__(self, workspace_root: str):
        """
        Initialize MCP client.

        Args:
            workspace_root: Root directory of the codebase
        """
        self.server = RefactorMCPServer(workspace_root)
        logger.info(f"MCP Client initialized for workspace: {workspace_root}")

    def search_code(
        self,
        pattern: str,
        file_pattern: str = "**/*.py",
        context_lines: int = 2,
        is_regex: bool = True
    ) -> List[SearchResult]:
        """
        Search for patterns in codebase.

        Args:
            pattern: Search pattern (regex or literal)
            file_pattern: Glob pattern for files to search
            context_lines: Number of context lines before/after match
            is_regex: Whether pattern is regex

        Returns:
            List of search results; empty list on failure (the error is
            logged, never raised to the caller).
        """
        try:
            results = self.server.code_search(
                pattern=pattern,
                file_pattern=file_pattern,
                context_lines=context_lines,
                is_regex=is_regex
            )
            logger.info(f"Code search completed: {len(results)} results")
            return results
        except Exception as e:
            logger.error(f"Code search failed: {e}")
            return []

    def refactor_code(
        self,
        search_pattern: str,
        replace_pattern: str,
        file_pattern: str = "**/*.py",
        dry_run: bool = True,
        is_regex: bool = True
    ) -> RefactorResult:
        """
        Perform regex-based code refactoring.

        Args:
            search_pattern: Pattern to search for
            replace_pattern: Replacement string (supports capture groups)
            file_pattern: Glob pattern for files to process
            dry_run: If True, only show what would change
            is_regex: Whether pattern is regex

        Returns:
            RefactorResult with changes made or to be made; on failure a
            RefactorResult with success=False and the error message.
        """
        try:
            result = self.server.code_refactor(
                search_pattern=search_pattern,
                replace_pattern=replace_pattern,
                file_pattern=file_pattern,
                dry_run=dry_run,
                is_regex=is_regex
            )
            logger.info(f"Refactoring {'preview' if dry_run else 'complete'}: "
                        f"{result.files_changed} files, {result.total_replacements} replacements")
            return result
        except Exception as e:
            logger.error(f"Refactoring failed: {e}")
            # Surface the failure as a structured result rather than raising.
            return RefactorResult(
                files_changed=0,
                total_replacements=0,
                changes=[],
                dry_run=dry_run,
                success=False,
                error=str(e)
            )

    def suggest_refactorings(
        self,
        file_path: str,
        max_suggestions: int = 5
    ) -> List[RefactorSuggestion]:
        """
        Analyze code and suggest refactorings.

        Args:
            file_path: Path to file to analyze
            max_suggestions: Maximum number of suggestions

        Returns:
            List of refactoring suggestions; empty list on failure.
        """
        try:
            suggestions = self.server.suggest_refactorings(
                file_path=file_path,
                max_suggestions=max_suggestions
            )
            logger.info(f"Generated {len(suggestions)} refactoring suggestions for {file_path}")
            return suggestions
        except Exception as e:
            logger.error(f"Suggestion generation failed: {e}")
            return []

    def format_search_results(self, results: List[SearchResult], max_results: int = 10) -> str:
        """
        Format search results for display.

        Args:
            results: List of search results
            max_results: Maximum number of results to format

        Returns:
            Formatted string
        """
        if not results:
            return "No results found."

        output = [f"Found {len(results)} matches:\n"]

        for i, result in enumerate(results[:max_results], 1):
            output.append(f"\n{i}. {result.file_path}:{result.line_number}")
            output.append(f"   {result.line_content}")

            # NOTE(review): only context_before is rendered; context_after is
            # available on SearchResult but deliberately not shown here.
            if result.context_before:
                output.append("   Context before:")
                for line in result.context_before[-2:]:
                    output.append(f"     {line}")

        if len(results) > max_results:
            output.append(f"\n... and {len(results) - max_results} more results")

        return '\n'.join(output)

    def format_refactor_result(self, result: RefactorResult) -> str:
        """
        Format refactor result for display.

        Args:
            result: Refactor result

        Returns:
            Formatted string
        """
        if not result.success:
            return f"❌ Refactoring failed: {result.error}"

        mode = "Preview" if result.dry_run else "Applied"
        output = [
            f"✅ Refactoring {mode}:",
            f"   Files changed: {result.files_changed}",
            f"   Total replacements: {result.total_replacements}\n"
        ]

        # Show at most 5 per-file change records to keep the output short.
        for change in result.changes[:5]:
            output.append(f"\n📄 {change['file_path']}")
            output.append(f"   Replacements: {change['replacements']}")
            if change.get('preview'):
                output.append("   Preview:")
                for line in change['preview'].split('\n')[:6]:
                    output.append(f"     {line}")

        if len(result.changes) > 5:
            output.append(f"\n... and {len(result.changes) - 5} more files")

        return '\n'.join(output)

    def format_suggestions(self, suggestions: List[RefactorSuggestion]) -> str:
        """
        Format refactoring suggestions for display.

        Args:
            suggestions: List of suggestions

        Returns:
            Formatted string
        """
        if not suggestions:
            return "No refactoring suggestions found."

        output = [f"💡 Found {len(suggestions)} refactoring suggestions:\n"]

        for i, suggestion in enumerate(suggestions, 1):
            # Map estimated impact to a traffic-light marker; unknown -> white.
            impact_emoji = {'low': '🟢', 'medium': '🟡', 'high': '🔴'}
            emoji = impact_emoji.get(suggestion.estimated_impact, '⚪')

            output.append(f"\n{i}. {emoji} {suggestion.type.replace('_', ' ').title()}")
            output.append(f"   Location: {suggestion.file_path}:L{suggestion.line_start}-L{suggestion.line_end}")
            output.append(f"   Issue: {suggestion.description}")
            output.append(f"   Suggestion: {suggestion.rationale}")

        return '\n'.join(output)
| 220 |
+
|
| 221 |
+
|
| 222 |
+
# Convenience function
def get_mcp_client(workspace_root: str = ".") -> MCPClient:
    """Get an MCP client instance.

    Args:
        workspace_root: Root directory of the codebase; defaults to the
            current working directory.

    Returns:
        A new MCPClient bound to *workspace_root*.
    """
    return MCPClient(workspace_root)
|
|
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MCP (Model Context Protocol) Server for Code Refactoring.
|
| 3 |
+
|
| 4 |
+
Provides tools for code search, refactoring, and analysis via MCP protocol.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import ast
import logging
import re
from dataclasses import dataclass
from fnmatch import fnmatch
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class SearchResult:
    """Result from code search"""
    file_path: str  # path relative to the workspace root
    line_number: int  # 1-based line number of the matched line
    line_content: str  # matched line with trailing whitespace stripped
    context_before: List[str]  # lines immediately preceding the match
    context_after: List[str]  # lines immediately following the match
    match_start: int  # start column of the match within the line
    match_end: int  # end column (exclusive) of the match within the line
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class RefactorResult:
    """Result from code refactoring"""
    files_changed: int  # number of files with at least one replacement
    total_replacements: int  # replacements summed over all files
    # Fix: the original annotation used the builtin function `any` where
    # `typing.Any` was intended (typing silently accepts any callable).
    changes: List[Dict[str, Any]]  # per-file records: file_path, replacements, preview
    dry_run: bool  # True when no files were actually written
    success: bool  # False when the operation aborted (e.g. invalid regex)
    error: Optional[str] = None  # error message when success is False
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass
class RefactorSuggestion:
    """Suggested refactoring"""
    type: str  # 'extract_function', 'rename', 'simplify', etc.
    file_path: str  # path of the analyzed file, as supplied by the caller
    line_start: int  # first line (1-based) of the affected region
    line_end: int  # last line of the affected region
    description: str  # what was detected (e.g. function length)
    rationale: str  # why the change is recommended
    estimated_impact: str  # 'low', 'medium', 'high'
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class RefactorMCPServer:
    """
    MCP server providing code refactoring tools.

    Tools:
    - code_search: Search for patterns in codebase
    - code_refactor: Perform regex-based refactoring
    - suggest_refactorings: Analyze code and suggest improvements
    """

    def __init__(self, workspace_root: str):
        """
        Initialize MCP server.

        Args:
            workspace_root: Root directory of the codebase
        """
        self.workspace_root = Path(workspace_root)

        # Default ignore patterns (fnmatch-style globs; see _should_ignore).
        self.ignore_patterns = [
            '**/__pycache__/**',
            '**/*.pyc',
            '**/node_modules/**',
            '**/.git/**',
            '**/venv/**',
            '**/.venv/**',
            '**/dist/**',
            '**/build/**',
            '**/*.egg-info/**'
        ]

    def code_search(
        self,
        pattern: str,
        file_pattern: str = "**/*.py",
        context_lines: int = 2,
        is_regex: bool = True
    ) -> List[SearchResult]:
        """
        Search for patterns in codebase.

        Args:
            pattern: Search pattern (regex or literal)
            file_pattern: Glob pattern for files to search
            context_lines: Number of context lines before/after match
            is_regex: Whether pattern is regex

        Returns:
            List of search results (at most one per matching line; only the
            first match in a line is reported).
        """
        results = []

        # Compile regex pattern; literal searches are escaped first.
        try:
            if is_regex:
                regex = re.compile(pattern)
            else:
                regex = re.compile(re.escape(pattern))
        except re.error as e:
            logger.error(f"Invalid regex pattern: {e}")
            return results

        # Find matching files
        files = self._find_files(file_pattern)

        for file_path in files:
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()

                # Search each line
                for line_num, line in enumerate(lines, start=1):
                    match = regex.search(line)
                    if match:
                        # Slice the surrounding context window.
                        start_idx = max(0, line_num - context_lines - 1)
                        end_idx = min(len(lines), line_num + context_lines)

                        context_before = [l.rstrip() for l in lines[start_idx:line_num - 1]]
                        context_after = [l.rstrip() for l in lines[line_num:end_idx]]

                        results.append(SearchResult(
                            file_path=str(file_path.relative_to(self.workspace_root)),
                            line_number=line_num,
                            line_content=line.rstrip(),
                            context_before=context_before,
                            context_after=context_after,
                            match_start=match.start(),
                            match_end=match.end()
                        ))

            except Exception as e:
                logger.error(f"Error searching {file_path}: {e}")

        logger.info(f"Found {len(results)} matches for pattern '{pattern}'")
        return results

    def code_refactor(
        self,
        search_pattern: str,
        replace_pattern: str,
        file_pattern: str = "**/*.py",
        dry_run: bool = True,
        is_regex: bool = True
    ) -> RefactorResult:
        """
        Perform regex-based code refactoring.

        Args:
            search_pattern: Pattern to search for
            replace_pattern: Replacement string (supports capture groups)
            file_pattern: Glob pattern for files to process
            dry_run: If True, only show what would change
            is_regex: Whether pattern is regex

        Returns:
            RefactorResult with changes made or to be made
        """
        changes = []
        files_changed = 0
        total_replacements = 0

        try:
            # Compile regex; literal searches are escaped first.
            if is_regex:
                regex = re.compile(search_pattern)
            else:
                regex = re.compile(re.escape(search_pattern))
        except re.error as e:
            # Abort early with a structured failure instead of raising.
            return RefactorResult(
                files_changed=0,
                total_replacements=0,
                changes=[],
                dry_run=dry_run,
                success=False,
                error=f"Invalid regex: {e}"
            )

        # Find matching files
        files = self._find_files(file_pattern)

        for file_path in files:
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    original_content = f.read()

                # Perform replacement on the whole file content at once.
                new_content, num_replacements = regex.subn(replace_pattern, original_content)

                if num_replacements > 0:
                    files_changed += 1
                    total_replacements += num_replacements

                    # Record change
                    change = {
                        'file_path': str(file_path.relative_to(self.workspace_root)),
                        'replacements': num_replacements,
                        'preview': self._generate_diff_preview(original_content, new_content)
                    }
                    changes.append(change)

                    # Apply change if not dry run
                    if not dry_run:
                        with open(file_path, 'w', encoding='utf-8') as f:
                            f.write(new_content)
                        logger.info(f"Applied {num_replacements} replacements to {file_path}")

            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")

        result = RefactorResult(
            files_changed=files_changed,
            total_replacements=total_replacements,
            changes=changes,
            dry_run=dry_run,
            success=True
        )

        logger.info(f"Refactoring {'preview' if dry_run else 'complete'}: "
                    f"{files_changed} files, {total_replacements} replacements")

        return result

    def suggest_refactorings(
        self,
        file_path: str,
        max_suggestions: int = 5
    ) -> List[RefactorSuggestion]:
        """
        Analyze code and suggest refactorings.

        Args:
            file_path: Path to file to analyze (relative to the workspace root)
            max_suggestions: Maximum number of suggestions

        Returns:
            List of refactoring suggestions (empty when the file is missing
            or cannot be parsed)
        """
        suggestions = []

        full_path = self.workspace_root / file_path

        if not full_path.exists():
            logger.error(f"File not found: {file_path}")
            return suggestions

        try:
            with open(full_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Parse AST
            tree = ast.parse(content)

            # Analyze for common issues
            for node in ast.walk(tree):
                # Long functions (also covers `async def`, a backward-compatible
                # generalization of the original sync-only check).
                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                    func_lines = node.end_lineno - node.lineno + 1
                    if func_lines > 50:
                        suggestions.append(RefactorSuggestion(
                            type='extract_function',
                            file_path=file_path,
                            line_start=node.lineno,
                            line_end=node.end_lineno,
                            description=f"Function '{node.name}' is {func_lines} lines long",
                            rationale="Consider breaking it into smaller functions for better readability",
                            estimated_impact='medium'
                        ))

                # Complex conditionals
                if isinstance(node, ast.If):
                    if self._is_complex_conditional(node.test):
                        suggestions.append(RefactorSuggestion(
                            type='simplify_conditional',
                            file_path=file_path,
                            line_start=node.lineno,
                            line_end=node.lineno,
                            description="Complex conditional expression",
                            rationale="Consider extracting to a named variable for clarity",
                            estimated_impact='low'
                        ))

            # Limit suggestions
            suggestions = suggestions[:max_suggestions]

        except Exception as e:
            logger.error(f"Error analyzing {file_path}: {e}")

        return suggestions

    def _find_files(self, pattern: str) -> List[Path]:
        """Find files matching glob pattern, excluding ignored paths."""
        files = []

        for file_path in self.workspace_root.glob(pattern):
            if file_path.is_file() and not self._should_ignore(file_path):
                files.append(file_path)

        return files

    def _should_ignore(self, file_path: Path) -> bool:
        """Check if file should be ignored.

        Fix: the original used PurePath.match(), which does not treat '**'
        as a recursive wildcard (before Python 3.13), so patterns like
        '**/__pycache__/**' silently never matched. fnmatch's '*' matches
        any characters including '/', giving the intended recursive effect.
        """
        rel = file_path.relative_to(self.workspace_root).as_posix()
        # Also test with a leading '/' so '**/name/**' patterns match paths
        # where the ignored directory sits at the top level.
        candidates = (rel, '/' + rel)

        return any(
            fnmatch(candidate, pattern)
            for pattern in self.ignore_patterns
            for candidate in candidates
        )

    def _generate_diff_preview(self, original: str, new: str, max_lines: int = 10) -> str:
        """Generate a preview of changes (first few changed lines only).

        NOTE(review): this is a naive positional diff — insertions/deletions
        shift all following lines and will be reported as changed.
        """
        orig_lines = original.split('\n')
        new_lines = new.split('\n')

        # Fix: the original loop variable `new` shadowed the `new` parameter;
        # renamed to keep both readable.
        diff_lines = []
        for i, (old_line, new_line) in enumerate(zip(orig_lines, new_lines)):
            if old_line != new_line:
                diff_lines.append(f"Line {i+1}:")
                diff_lines.append(f"- {old_line}")
                diff_lines.append(f"+ {new_line}")

            if len(diff_lines) >= max_lines * 3:
                break

        return '\n'.join(diff_lines)

    def _is_complex_conditional(self, node: ast.expr) -> bool:
        """Check if conditional is complex (more than two boolean operators)."""
        bool_ops = sum(1 for n in ast.walk(node) if isinstance(n, (ast.And, ast.Or)))
        return bool_ops > 2
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
# Example usage
if __name__ == "__main__":
    import sys

    # Fix: workspace root now comes from the command line, defaulting to the
    # current directory, instead of a hard-coded developer-specific absolute
    # path that only existed on one machine.
    workspace = sys.argv[1] if len(sys.argv) > 1 else "."
    server = RefactorMCPServer(workspace)

    # Test code search
    results = server.code_search("def.*index", file_pattern="**/*.py")
    print(f"\nFound {len(results)} matches")
    for r in results[:3]:
        print(f"  {r.file_path}:{r.line_number} - {r.line_content[:60]}")

    # Test refactor (dry run)
    refactor_result = server.code_refactor(
        search_pattern=r"print\((.*)\)",
        replace_pattern=r"logger.info(\1)",
        file_pattern="**/*.py",
        dry_run=True
    )
    print(f"\nRefactor preview: {refactor_result.files_changed} files, {refactor_result.total_replacements} replacements")
|
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Merkle Tree implementation for efficient codebase change detection.
|
| 3 |
+
|
| 4 |
+
Inspired by Cursor's approach to incremental indexing, this module builds
|
| 5 |
+
a cryptographic hash tree of the codebase to quickly identify which files
|
| 6 |
+
have changed since the last indexing operation.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import hashlib
|
| 10 |
+
import json
|
| 11 |
+
import logging
|
| 12 |
+
import os
|
| 13 |
+
from dataclasses import dataclass, asdict
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Dict, List, Optional, Set
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class MerkleNode:
    """A node of the Merkle tree: either a file or a directory."""

    path: str  # relative path from the tree root
    hash: str  # SHA-256 of file content, or of combined child hashes for directories
    is_directory: bool
    size: int = 0  # file size in bytes (directories keep 0)
    modified_time: Optional[str] = None  # ISO-format timestamp, if known
    children: Optional[List['MerkleNode']] = None  # populated for directories only

    def to_dict(self) -> Dict:
        """Serialize this node (and recursively its children) to a plain dict."""
        serialized = dict(
            path=self.path,
            hash=self.hash,
            is_directory=self.is_directory,
            size=self.size,
            modified_time=self.modified_time,
        )
        # The 'children' key is only present for nodes that actually have
        # children, keeping file-node dicts compact.
        if self.children:
            serialized['children'] = [node.to_dict() for node in self.children]
        return serialized

    @classmethod
    def from_dict(cls, data: Dict) -> 'MerkleNode':
        """Rebuild a MerkleNode (recursively) from a dict made by to_dict()."""
        raw_children = data.get('children')
        return cls(
            path=data['path'],
            hash=data['hash'],
            is_directory=data['is_directory'],
            size=data.get('size', 0),
            modified_time=data.get('modified_time'),
            children=[cls.from_dict(c) for c in raw_children] if raw_children else None,
        )
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@dataclass
class ChangeSet:
    """Differences detected between two Merkle trees."""

    added: List[str]  # files present now but not before
    modified: List[str]  # files whose content hash changed
    deleted: List[str]  # files that disappeared
    unchanged: List[str]  # files with identical hashes

    def has_changes(self) -> bool:
        """True when at least one file was added, modified, or deleted."""
        return any((self.added, self.modified, self.deleted))

    def total_changes(self) -> int:
        """Number of files that changed in any way."""
        return sum(map(len, (self.added, self.modified, self.deleted)))

    def summary(self) -> str:
        """One-line human-readable description of the change counts."""
        counts = (len(self.added), len(self.modified), len(self.deleted), len(self.unchanged))
        return "Added: {}, Modified: {}, Deleted: {}, Unchanged: {}".format(*counts)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class MerkleTree:
|
| 90 |
+
"""
|
| 91 |
+
Builds and compares Merkle trees for efficient change detection.
|
| 92 |
+
|
| 93 |
+
The tree structure mirrors the directory structure, with each node
|
| 94 |
+
containing a hash of its content (for files) or combined child hashes
|
| 95 |
+
(for directories). This allows quick identification of changes.
|
| 96 |
+
"""
|
| 97 |
+
|
| 98 |
+
# File extensions to ignore
|
| 99 |
+
IGNORE_EXTENSIONS = {
|
| 100 |
+
'.pyc', '.pyo', '.pyd', '.so', '.dll', '.dylib',
|
| 101 |
+
'.class', '.o', '.obj', '.exe', '.bin',
|
| 102 |
+
'.git', '.svn', '.hg', '.DS_Store',
|
| 103 |
+
'__pycache__', 'node_modules', '.venv', 'venv',
|
| 104 |
+
'.egg-info', 'dist', 'build', '.pytest_cache',
|
| 105 |
+
'.mypy_cache', '.tox', 'coverage', '.coverage'
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
    def __init__(self, ignore_patterns: Optional[List[str]] = None):
        """
        Initialize Merkle tree builder.

        Args:
            ignore_patterns: Additional patterns to ignore (e.g., ['*.log', 'temp/*']).
                These are checked with Path.match() on top of the class-level
                IGNORE_EXTENSIONS blacklist.
        """
        # `or []` also normalizes a caller-supplied empty list to a list.
        self.ignore_patterns = ignore_patterns or []
|
| 116 |
+
|
| 117 |
+
def _should_ignore(self, path: Path) -> bool:
|
| 118 |
+
"""Check if a path should be ignored."""
|
| 119 |
+
# Check if any part of the path matches ignore extensions
|
| 120 |
+
for part in path.parts:
|
| 121 |
+
if part in self.IGNORE_EXTENSIONS:
|
| 122 |
+
return True
|
| 123 |
+
|
| 124 |
+
# Check file extension
|
| 125 |
+
if path.suffix in self.IGNORE_EXTENSIONS:
|
| 126 |
+
return True
|
| 127 |
+
|
| 128 |
+
# Check custom patterns
|
| 129 |
+
for pattern in self.ignore_patterns:
|
| 130 |
+
if path.match(pattern):
|
| 131 |
+
return True
|
| 132 |
+
|
| 133 |
+
return False
|
| 134 |
+
|
| 135 |
+
def _hash_file(self, file_path: Path) -> str:
|
| 136 |
+
"""
|
| 137 |
+
Compute SHA-256 hash of a file's content.
|
| 138 |
+
|
| 139 |
+
Args:
|
| 140 |
+
file_path: Path to the file
|
| 141 |
+
|
| 142 |
+
Returns:
|
| 143 |
+
Hexadecimal hash string
|
| 144 |
+
"""
|
| 145 |
+
sha256 = hashlib.sha256()
|
| 146 |
+
try:
|
| 147 |
+
with open(file_path, 'rb') as f:
|
| 148 |
+
# Read in chunks to handle large files
|
| 149 |
+
for chunk in iter(lambda: f.read(8192), b''):
|
| 150 |
+
sha256.update(chunk)
|
| 151 |
+
return sha256.hexdigest()
|
| 152 |
+
except Exception as e:
|
| 153 |
+
logger.warning(f"Failed to hash file {file_path}: {e}")
|
| 154 |
+
# Return a hash of the error message to ensure consistency
|
| 155 |
+
return hashlib.sha256(str(e).encode()).hexdigest()
|
| 156 |
+
|
| 157 |
+
def _hash_directory(self, children: List[MerkleNode]) -> str:
|
| 158 |
+
"""
|
| 159 |
+
Compute hash for a directory based on its children.
|
| 160 |
+
|
| 161 |
+
Args:
|
| 162 |
+
children: List of child MerkleNodes
|
| 163 |
+
|
| 164 |
+
Returns:
|
| 165 |
+
Combined hash of all children
|
| 166 |
+
"""
|
| 167 |
+
# Sort children by path for consistency
|
| 168 |
+
sorted_children = sorted(children, key=lambda x: x.path)
|
| 169 |
+
|
| 170 |
+
# Combine all child hashes
|
| 171 |
+
combined = ''.join(child.hash for child in sorted_children)
|
| 172 |
+
|
| 173 |
+
return hashlib.sha256(combined.encode()).hexdigest()
|
| 174 |
+
|
| 175 |
+
def build_tree(self, root_path: str) -> MerkleNode:
|
| 176 |
+
"""
|
| 177 |
+
Build a Merkle tree for the given directory.
|
| 178 |
+
|
| 179 |
+
Args:
|
| 180 |
+
root_path: Root directory to build tree from
|
| 181 |
+
|
| 182 |
+
Returns:
|
| 183 |
+
Root MerkleNode of the tree
|
| 184 |
+
"""
|
| 185 |
+
root = Path(root_path).resolve()
|
| 186 |
+
|
| 187 |
+
if not root.exists():
|
| 188 |
+
raise ValueError(f"Path does not exist: {root_path}")
|
| 189 |
+
|
| 190 |
+
logger.info(f"Building Merkle tree for: {root}")
|
| 191 |
+
return self._build_node(root, root)
|
| 192 |
+
|
| 193 |
+
def _build_node(self, path: Path, root: Path) -> MerkleNode:
|
| 194 |
+
"""
|
| 195 |
+
Recursively build a MerkleNode for a path.
|
| 196 |
+
|
| 197 |
+
Args:
|
| 198 |
+
path: Current path to process
|
| 199 |
+
root: Root directory (for computing relative paths)
|
| 200 |
+
|
| 201 |
+
Returns:
|
| 202 |
+
MerkleNode for this path
|
| 203 |
+
"""
|
| 204 |
+
relative_path = str(path.relative_to(root))
|
| 205 |
+
|
| 206 |
+
if path.is_file():
|
| 207 |
+
# File node
|
| 208 |
+
stat = path.stat()
|
| 209 |
+
return MerkleNode(
|
| 210 |
+
path=relative_path,
|
| 211 |
+
hash=self._hash_file(path),
|
| 212 |
+
is_directory=False,
|
| 213 |
+
size=stat.st_size,
|
| 214 |
+
modified_time=datetime.fromtimestamp(stat.st_mtime).isoformat(),
|
| 215 |
+
children=None
|
| 216 |
+
)
|
| 217 |
+
else:
|
| 218 |
+
# Directory node
|
| 219 |
+
children = []
|
| 220 |
+
try:
|
| 221 |
+
for child_path in sorted(path.iterdir()):
|
| 222 |
+
if self._should_ignore(child_path):
|
| 223 |
+
continue
|
| 224 |
+
|
| 225 |
+
child_node = self._build_node(child_path, root)
|
| 226 |
+
children.append(child_node)
|
| 227 |
+
except PermissionError:
|
| 228 |
+
logger.warning(f"Permission denied: {path}")
|
| 229 |
+
|
| 230 |
+
return MerkleNode(
|
| 231 |
+
path=relative_path,
|
| 232 |
+
hash=self._hash_directory(children),
|
| 233 |
+
is_directory=True,
|
| 234 |
+
size=0,
|
| 235 |
+
modified_time=None,
|
| 236 |
+
children=children
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
def compare_trees(self, old_tree: Optional[MerkleNode], new_tree: MerkleNode) -> ChangeSet:
|
| 240 |
+
"""
|
| 241 |
+
Compare two Merkle trees to find changes.
|
| 242 |
+
|
| 243 |
+
Args:
|
| 244 |
+
old_tree: Previous tree snapshot (None if first time)
|
| 245 |
+
new_tree: Current tree snapshot
|
| 246 |
+
|
| 247 |
+
Returns:
|
| 248 |
+
ChangeSet describing all changes
|
| 249 |
+
"""
|
| 250 |
+
if old_tree is None:
|
| 251 |
+
# First time indexing - all files are new
|
| 252 |
+
all_files = self._collect_all_files(new_tree)
|
| 253 |
+
return ChangeSet(
|
| 254 |
+
added=all_files,
|
| 255 |
+
modified=[],
|
| 256 |
+
deleted=[],
|
| 257 |
+
unchanged=[]
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
added: List[str] = []
|
| 261 |
+
modified: List[str] = []
|
| 262 |
+
deleted: List[str] = []
|
| 263 |
+
unchanged: List[str] = []
|
| 264 |
+
|
| 265 |
+
# Build path->node maps for efficient lookup
|
| 266 |
+
old_files = self._build_file_map(old_tree)
|
| 267 |
+
new_files = self._build_file_map(new_tree)
|
| 268 |
+
|
| 269 |
+
# Find added and modified files
|
| 270 |
+
for path, new_node in new_files.items():
|
| 271 |
+
if path not in old_files:
|
| 272 |
+
added.append(path)
|
| 273 |
+
elif old_files[path].hash != new_node.hash:
|
| 274 |
+
modified.append(path)
|
| 275 |
+
else:
|
| 276 |
+
unchanged.append(path)
|
| 277 |
+
|
| 278 |
+
# Find deleted files
|
| 279 |
+
for path in old_files:
|
| 280 |
+
if path not in new_files:
|
| 281 |
+
deleted.append(path)
|
| 282 |
+
|
| 283 |
+
change_set = ChangeSet(
|
| 284 |
+
added=sorted(added),
|
| 285 |
+
modified=sorted(modified),
|
| 286 |
+
deleted=sorted(deleted),
|
| 287 |
+
unchanged=sorted(unchanged)
|
| 288 |
+
)
|
| 289 |
+
|
| 290 |
+
logger.info(f"Change detection complete: {change_set.summary()}")
|
| 291 |
+
return change_set
|
| 292 |
+
|
| 293 |
+
def _collect_all_files(self, node: MerkleNode) -> List[str]:
|
| 294 |
+
"""Collect all file paths from a tree."""
|
| 295 |
+
files = []
|
| 296 |
+
|
| 297 |
+
if not node.is_directory:
|
| 298 |
+
files.append(node.path)
|
| 299 |
+
elif node.children:
|
| 300 |
+
for child in node.children:
|
| 301 |
+
files.extend(self._collect_all_files(child))
|
| 302 |
+
|
| 303 |
+
return files
|
| 304 |
+
|
| 305 |
+
def _build_file_map(self, node: MerkleNode) -> Dict[str, MerkleNode]:
|
| 306 |
+
"""Build a map of file paths to nodes."""
|
| 307 |
+
file_map = {}
|
| 308 |
+
|
| 309 |
+
if not node.is_directory:
|
| 310 |
+
file_map[node.path] = node
|
| 311 |
+
elif node.children:
|
| 312 |
+
for child in node.children:
|
| 313 |
+
file_map.update(self._build_file_map(child))
|
| 314 |
+
|
| 315 |
+
return file_map
|
| 316 |
+
|
| 317 |
+
def save_snapshot(self, tree: MerkleNode, snapshot_path: str):
|
| 318 |
+
"""
|
| 319 |
+
Save a Merkle tree snapshot to disk.
|
| 320 |
+
|
| 321 |
+
Args:
|
| 322 |
+
tree: MerkleNode to save
|
| 323 |
+
snapshot_path: Path to save the snapshot JSON file
|
| 324 |
+
"""
|
| 325 |
+
snapshot_file = Path(snapshot_path)
|
| 326 |
+
snapshot_file.parent.mkdir(parents=True, exist_ok=True)
|
| 327 |
+
|
| 328 |
+
with open(snapshot_file, 'w') as f:
|
| 329 |
+
json.dump(tree.to_dict(), f, indent=2)
|
| 330 |
+
|
| 331 |
+
logger.info(f"Saved Merkle tree snapshot to: {snapshot_path}")
|
| 332 |
+
|
| 333 |
+
def load_snapshot(self, snapshot_path: str) -> Optional[MerkleNode]:
|
| 334 |
+
"""
|
| 335 |
+
Load a Merkle tree snapshot from disk.
|
| 336 |
+
|
| 337 |
+
Args:
|
| 338 |
+
snapshot_path: Path to the snapshot JSON file
|
| 339 |
+
|
| 340 |
+
Returns:
|
| 341 |
+
MerkleNode or None if snapshot doesn't exist
|
| 342 |
+
"""
|
| 343 |
+
snapshot_file = Path(snapshot_path)
|
| 344 |
+
|
| 345 |
+
if not snapshot_file.exists():
|
| 346 |
+
logger.info(f"No snapshot found at: {snapshot_path}")
|
| 347 |
+
return None
|
| 348 |
+
|
| 349 |
+
try:
|
| 350 |
+
with open(snapshot_file, 'r') as f:
|
| 351 |
+
data = json.load(f)
|
| 352 |
+
|
| 353 |
+
tree = MerkleNode.from_dict(data)
|
| 354 |
+
logger.info(f"Loaded Merkle tree snapshot from: {snapshot_path}")
|
| 355 |
+
return tree
|
| 356 |
+
except Exception as e:
|
| 357 |
+
logger.error(f"Failed to load snapshot: {e}")
|
| 358 |
+
return None
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def get_changed_files(root_path: str, snapshot_path: str) -> ChangeSet:
    """
    Detect what changed under *root_path* since the previous snapshot.

    Loads the prior snapshot (if any), rebuilds the current tree, diffs
    the two, and persists the fresh tree for the next run.

    Args:
        root_path: Root directory of codebase
        snapshot_path: Path to previous snapshot file

    Returns:
        ChangeSet describing all changes
    """
    tree_builder = MerkleTree()

    # Previous state (None on the very first run -> everything is "added").
    previous = tree_builder.load_snapshot(snapshot_path)

    # Current state of the filesystem.
    current = tree_builder.build_tree(root_path)

    # Diff old vs. new.
    delta = tree_builder.compare_trees(previous, current)

    # Persist the new state so the next call only sees fresh changes.
    tree_builder.save_snapshot(current, snapshot_path)

    return delta
|
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Path obfuscation module for privacy-preserving codebase indexing.
|
| 3 |
+
|
| 4 |
+
Implements HMAC-based path component hashing to mask sensitive file paths
|
| 5 |
+
while preserving directory structure for retrieval. Inspired by Cursor's
|
| 6 |
+
privacy features.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import hashlib
|
| 10 |
+
import hmac
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
import secrets
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Dict, Optional
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class PathObfuscator:
|
| 21 |
+
"""
|
| 22 |
+
Obfuscates file paths using HMAC-based hashing.
|
| 23 |
+
|
| 24 |
+
Each path component (directory/file name) is hashed separately,
|
| 25 |
+
preserving the directory structure while masking actual names.
|
| 26 |
+
|
| 27 |
+
Example:
|
| 28 |
+
src/payments/invoice_processor.py -> a9f3/x72k/qp1m8d.f4
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
def __init__(self, secret_key: Optional[str] = None, mapping_file: Optional[str] = None):
|
| 32 |
+
"""
|
| 33 |
+
Initialize path obfuscator.
|
| 34 |
+
|
| 35 |
+
Args:
|
| 36 |
+
secret_key: Secret key for HMAC (auto-generated if not provided)
|
| 37 |
+
mapping_file: File to store path mappings for decryption
|
| 38 |
+
"""
|
| 39 |
+
self.secret_key = secret_key or self._generate_key()
|
| 40 |
+
self.mapping_file = mapping_file or "chroma_db/.path_mapping.json"
|
| 41 |
+
|
| 42 |
+
# Load existing mappings
|
| 43 |
+
self.obfuscated_to_original: Dict[str, str] = {}
|
| 44 |
+
self.original_to_obfuscated: Dict[str, str] = {}
|
| 45 |
+
self._load_mappings()
|
| 46 |
+
|
| 47 |
+
def _generate_key(self) -> str:
|
| 48 |
+
"""Generate a random secret key."""
|
| 49 |
+
return secrets.token_hex(32)
|
| 50 |
+
|
| 51 |
+
def _hash_component(self, component: str) -> str:
|
| 52 |
+
"""
|
| 53 |
+
Hash a single path component using HMAC.
|
| 54 |
+
|
| 55 |
+
Args:
|
| 56 |
+
component: Path component (directory or file name)
|
| 57 |
+
|
| 58 |
+
Returns:
|
| 59 |
+
Hashed component (shortened for readability)
|
| 60 |
+
"""
|
| 61 |
+
# Use HMAC-SHA256 for secure hashing
|
| 62 |
+
h = hmac.new(
|
| 63 |
+
self.secret_key.encode(),
|
| 64 |
+
component.encode(),
|
| 65 |
+
hashlib.sha256
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
# Take first 8 characters of hex digest for readability
|
| 69 |
+
return h.hexdigest()[:8]
|
| 70 |
+
|
| 71 |
+
def obfuscate_path(self, original_path: str) -> str:
|
| 72 |
+
"""
|
| 73 |
+
Obfuscate a file path.
|
| 74 |
+
|
| 75 |
+
Args:
|
| 76 |
+
original_path: Original file path (e.g., "src/payments/invoice.py")
|
| 77 |
+
|
| 78 |
+
Returns:
|
| 79 |
+
Obfuscated path (e.g., "a9f3/x72k/qp1m8d.f4")
|
| 80 |
+
"""
|
| 81 |
+
# Check if already obfuscated
|
| 82 |
+
if original_path in self.original_to_obfuscated:
|
| 83 |
+
return self.original_to_obfuscated[original_path]
|
| 84 |
+
|
| 85 |
+
# Split path into components
|
| 86 |
+
path_obj = Path(original_path)
|
| 87 |
+
components = list(path_obj.parts)
|
| 88 |
+
|
| 89 |
+
# Hash each component
|
| 90 |
+
obfuscated_components = []
|
| 91 |
+
for component in components:
|
| 92 |
+
# Preserve file extension for type identification
|
| 93 |
+
if '.' in component and component == components[-1]:
|
| 94 |
+
# This is a file with extension
|
| 95 |
+
name, ext = component.rsplit('.', 1)
|
| 96 |
+
hashed_name = self._hash_component(name)
|
| 97 |
+
# Shorten extension hash
|
| 98 |
+
hashed_ext = self._hash_component(ext)[:2]
|
| 99 |
+
obfuscated_components.append(f"{hashed_name}.{hashed_ext}")
|
| 100 |
+
else:
|
| 101 |
+
# Directory or file without extension
|
| 102 |
+
obfuscated_components.append(self._hash_component(component))
|
| 103 |
+
|
| 104 |
+
# Reconstruct path
|
| 105 |
+
obfuscated_path = '/'.join(obfuscated_components)
|
| 106 |
+
|
| 107 |
+
# Store mapping
|
| 108 |
+
self.original_to_obfuscated[original_path] = obfuscated_path
|
| 109 |
+
self.obfuscated_to_original[obfuscated_path] = original_path
|
| 110 |
+
self._save_mappings()
|
| 111 |
+
|
| 112 |
+
logger.debug(f"Obfuscated: {original_path} -> {obfuscated_path}")
|
| 113 |
+
return obfuscated_path
|
| 114 |
+
|
| 115 |
+
def deobfuscate_path(self, obfuscated_path: str) -> Optional[str]:
|
| 116 |
+
"""
|
| 117 |
+
Deobfuscate a file path.
|
| 118 |
+
|
| 119 |
+
Args:
|
| 120 |
+
obfuscated_path: Obfuscated path
|
| 121 |
+
|
| 122 |
+
Returns:
|
| 123 |
+
Original path or None if not found
|
| 124 |
+
"""
|
| 125 |
+
return self.obfuscated_to_original.get(obfuscated_path)
|
| 126 |
+
|
| 127 |
+
def _load_mappings(self):
|
| 128 |
+
"""Load path mappings from disk."""
|
| 129 |
+
mapping_path = Path(self.mapping_file)
|
| 130 |
+
|
| 131 |
+
if not mapping_path.exists():
|
| 132 |
+
logger.info(f"No existing path mappings found at {self.mapping_file}")
|
| 133 |
+
return
|
| 134 |
+
|
| 135 |
+
try:
|
| 136 |
+
with open(mapping_path, 'r') as f:
|
| 137 |
+
data = json.load(f)
|
| 138 |
+
|
| 139 |
+
self.obfuscated_to_original = data.get('obfuscated_to_original', {})
|
| 140 |
+
self.original_to_obfuscated = data.get('original_to_obfuscated', {})
|
| 141 |
+
|
| 142 |
+
logger.info(f"Loaded {len(self.original_to_obfuscated)} path mappings")
|
| 143 |
+
except Exception as e:
|
| 144 |
+
logger.error(f"Failed to load path mappings: {e}")
|
| 145 |
+
|
| 146 |
+
def _save_mappings(self):
|
| 147 |
+
"""Save path mappings to disk."""
|
| 148 |
+
mapping_path = Path(self.mapping_file)
|
| 149 |
+
mapping_path.parent.mkdir(parents=True, exist_ok=True)
|
| 150 |
+
|
| 151 |
+
try:
|
| 152 |
+
data = {
|
| 153 |
+
'obfuscated_to_original': self.obfuscated_to_original,
|
| 154 |
+
'original_to_obfuscated': self.original_to_obfuscated,
|
| 155 |
+
'secret_key': self.secret_key # Store for consistency
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
with open(mapping_path, 'w') as f:
|
| 159 |
+
json.dump(data, f, indent=2)
|
| 160 |
+
|
| 161 |
+
logger.debug(f"Saved {len(self.original_to_obfuscated)} path mappings")
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logger.error(f"Failed to save path mappings: {e}")
|
| 164 |
+
|
| 165 |
+
def clear_mappings(self):
|
| 166 |
+
"""Clear all path mappings."""
|
| 167 |
+
self.obfuscated_to_original.clear()
|
| 168 |
+
self.original_to_obfuscated.clear()
|
| 169 |
+
|
| 170 |
+
mapping_path = Path(self.mapping_file)
|
| 171 |
+
if mapping_path.exists():
|
| 172 |
+
mapping_path.unlink()
|
| 173 |
+
|
| 174 |
+
logger.info("Cleared all path mappings")
|
| 175 |
+
|
| 176 |
+
def get_stats(self) -> Dict[str, int]:
|
| 177 |
+
"""Get statistics about path mappings."""
|
| 178 |
+
return {
|
| 179 |
+
'total_paths': len(self.original_to_obfuscated),
|
| 180 |
+
'unique_directories': len(set(
|
| 181 |
+
str(Path(p).parent) for p in self.original_to_obfuscated.keys()
|
| 182 |
+
))
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# Global obfuscator instance
|
| 187 |
+
_obfuscator: Optional[PathObfuscator] = None
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def get_obfuscator(
    secret_key: Optional[str] = None,
    mapping_file: Optional[str] = None
) -> PathObfuscator:
    """
    Return the process-wide path obfuscator, creating it on first use.

    Note: the arguments only take effect on the first call; later calls
    return the already-constructed singleton unchanged.

    Args:
        secret_key: Secret key for HMAC (auto-generated if not provided)
        mapping_file: File to store path mappings

    Returns:
        PathObfuscator instance
    """
    global _obfuscator

    if _obfuscator is not None:
        return _obfuscator

    _obfuscator = PathObfuscator(secret_key, mapping_file)
    return _obfuscator
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def reset_obfuscator():
    """Reset the global obfuscator (useful for testing).

    Drops the cached singleton so the next call to ``get_obfuscator``
    constructs a fresh instance (and can pick up a new key/mapping file).
    """
    global _obfuscator
    _obfuscator = None
|
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Multi-mode interface components for Codebase Agent.
|
| 3 |
+
|
| 4 |
+
Provides different interaction modes: Chat, Search, Refactor, Generate
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import streamlit as st
|
| 8 |
+
from typing import Optional, Dict, Any
|
| 9 |
+
import os
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def get_workspace_root() -> str:
    """
    Get the workspace root directory for the indexed codebase.

    Picks the first visible (non-dot) subdirectory under ``data/`` — the
    extracted upload — or falls back to ``"data"`` itself.

    Returns:
        Path to the extracted/processed codebase
    """
    data_dir = Path("data")
    if data_dir.exists():
        visible_dirs = (
            entry for entry in data_dir.iterdir()
            if entry.is_dir() and not entry.name.startswith('.')
        )
        first_dir = next(visible_dirs, None)
        if first_dir is not None:
            return str(first_dir)

    # Fallback to data directory itself
    return "data"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def render_mode_selector() -> str:
    """
    Render the mode selector radio bar and return the selected mode key.

    Returns:
        Selected mode: 'chat', 'search', 'refactor', or 'generate'
    """
    # Single source of truth: display label -> internal mode key.
    labels = {
        "💬 Chat": "chat",
        "🔍 Search": "search",
        "🔧 Refactor": "refactor",
        "✨ Generate": "generate",
    }

    choice = st.radio(
        "",
        list(labels.keys()),
        horizontal=True,
        key="mode_selector",
        help="Select interaction mode"
    )

    return labels[choice]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def render_chat_mode(chat_engine):
    """
    Render the standard chat interface header and starter prompts.

    Args:
        chat_engine: ChatEngine instance (not used here; kept so all
            render-mode functions share a uniform call shape)

    Returns:
        True, signalling the caller to continue with the normal chat flow.
    """
    st.markdown("### 💬 Chat with Your Codebase")
    st.caption("Ask questions about your code, get explanations, and more")

    # Offer starter prompts only while the conversation is empty.
    if not st.session_state.get("messages", []):
        st.markdown("#### 💡 Try asking:")

        suggestions = [
            "Explain how authentication works",
            "Find all database queries",
            "What are the main entry points?",
            "Show me the API endpoints",
            "Explain the data flow"
        ]

        columns = st.columns(len(suggestions))
        for i, (column, suggestion) in enumerate(zip(columns, suggestions)):
            with column:
                clicked = st.button(suggestion, key=f"suggest_{i}", use_container_width=True)
                if clicked:
                    # Queue the clicked suggestion and rerun so the main
                    # loop picks it up as the next prompt.
                    st.session_state.pending_prompt = suggestion
                    st.rerun()

    return True
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def render_search_mode():
    """
    Render MCP code search interface.

    Collects a (regex) pattern, a file glob, and a context-line count,
    runs the search through MCPClient against the current workspace, and
    displays up to 20 matches with surrounding context.
    """
    st.markdown("### 🔍 Search Codebase")
    st.caption("Find patterns across your entire codebase using regex")

    # Get workspace root
    workspace = get_workspace_root()
    st.info(f"📁 Searching in: `{workspace}`")

    # Search input
    col1, col2 = st.columns([3, 1])
    with col1:
        pattern = st.text_input(
            "Search Pattern",
            placeholder="e.g., class (or def.*login)",
            help="Enter a regex pattern to search for"
        )
    with col2:
        is_regex = st.checkbox("Regex", value=True, help="Use regex pattern matching")

    # File pattern filter
    file_pattern = st.text_input(
        "File Pattern",
        value="**/*.py",
        help="Glob pattern for files to search (e.g., **/*.py, src/**/*.js)"
    )

    # Context lines
    context_lines = st.slider("Context Lines", 0, 5, 2, help="Number of lines to show before/after match")

    # Search button
    if st.button("🔍 Search", type="primary", use_container_width=True):
        if not pattern:
            st.warning("Please enter a search pattern")
            return

        with st.spinner("Searching codebase..."):
            try:
                # Imported lazily so the page still renders when MCP
                # dependencies are unavailable until a search is run.
                from code_chatbot.mcp_client import MCPClient

                client = MCPClient(workspace_root=workspace)
                results = client.search_code(
                    pattern=pattern,
                    file_pattern=file_pattern,
                    context_lines=context_lines,
                    is_regex=is_regex
                )

                if results:
                    st.success(f"✅ Found {len(results)} matches")

                    # Display results, capped at 20 to keep the page fast.
                    # Fix: the enumerate() index bound here was never used.
                    for result in results[:20]:
                        with st.expander(f"📄 {result.file_path}:L{result.line_number}"):
                            # Show context before
                            if result.context_before:
                                st.code("\n".join(result.context_before), language="python")

                            # Highlight matching line
                            st.markdown(f"**→ Line {result.line_number}:**")
                            st.code(result.line_content, language="python")

                            # Show context after
                            if result.context_after:
                                st.code("\n".join(result.context_after), language="python")

                    if len(results) > 20:
                        st.info(f"Showing first 20 of {len(results)} results")
                else:
                    st.info("No matches found. Try a different pattern.")

            except Exception as e:
                st.error(f"Search failed: {e}")
                st.exception(e)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def render_refactor_mode():
    """
    Render MCP refactoring interface.

    Two sub-flows, chosen by a selectbox:
      * "Custom Regex"    - free-form search/replace patterns entered by
                            the user.
      * "Common Patterns" - a small catalog of canned refactorings
                            (print->logging, unittest->pytest asserts,
                            trailing-whitespace cleanup).
    Both flows call MCPClient.refactor_code and default to dry-run
    (preview-only) mode so nothing is modified until the user opts in.
    """
    st.markdown("### 🔧 Refactor Code")
    st.caption("Perform automated refactorings across your codebase")

    # Get workspace root
    workspace = get_workspace_root()
    st.info(f"📁 Refactoring in: `{workspace}`")

    # Refactoring type selector
    refactor_type = st.selectbox(
        "Refactoring Type",
        ["Custom Regex", "Common Patterns"],
        help="Choose refactoring approach"
    )

    if refactor_type == "Custom Regex":
        # Custom regex refactoring: user supplies both sides of the rewrite.
        col1, col2 = st.columns(2)
        with col1:
            search_pattern = st.text_input(
                "Search Pattern",
                placeholder="e.g., print\\((.*)\\)",
                help="Regex pattern to find"
            )
        with col2:
            replace_pattern = st.text_input(
                "Replace Pattern",
                placeholder="e.g., logger.info(\\1)",
                help="Replacement (supports capture groups like \\1)"
            )

        file_pattern = st.text_input(
            "File Pattern",
            value="**/*.py",
            help="Files to process"
        )

        # Defaults to preview-only so no files are touched accidentally.
        dry_run = st.checkbox("Dry Run (Preview Only)", value=True, help="Preview changes without applying")

        if st.button("🔧 Refactor", type="primary", use_container_width=True):
            if not search_pattern or not replace_pattern:
                st.warning("Please enter both search and replace patterns")
                return

            with st.spinner("Processing refactoring..."):
                try:
                    # Lazy import: MCP deps are only needed once a
                    # refactoring is actually requested.
                    from code_chatbot.mcp_client import MCPClient

                    client = MCPClient(workspace_root=workspace)
                    result = client.refactor_code(
                        search_pattern=search_pattern,
                        replace_pattern=replace_pattern,
                        file_pattern=file_pattern,
                        dry_run=dry_run
                    )

                    if result.success:
                        mode_text = "Preview" if dry_run else "Applied"
                        st.success(f"✅ Refactoring {mode_text}: {result.files_changed} files, {result.total_replacements} replacements")

                        # Show changes
                        if result.changes:
                            # Cap the per-file diff previews to keep the
                            # page responsive on large refactorings.
                            for change in result.changes[:10]:  # Limit to 10 files
                                with st.expander(f"📄 {change['file_path']} ({change['replacements']} replacements)"):
                                    if change.get('preview'):
                                        st.code(change['preview'], language="diff")

                            if len(result.changes) > 10:
                                st.info(f"Showing first 10 of {len(result.changes)} changed files")
                        else:
                            st.info("No matches found for the given pattern")

                        # Nudge the user to apply for real after a
                        # successful non-empty preview.
                        if dry_run and result.files_changed > 0:
                            st.info("💡 Uncheck 'Dry Run' to apply these changes")
                    else:
                        st.error(f"Refactoring failed: {result.error}")

                except Exception as e:
                    st.error(f"Refactoring failed: {e}")
                    st.exception(e)

    else:
        # Common patterns: canned search/replace pairs applied to **/*.py.
        st.markdown("#### Common Refactoring Patterns")

        # NOTE(review): the print->logging rewrite assumes a `logger` name
        # exists in the target modules — confirm before applying for real.
        common_patterns = {
            "print() → logging": {
                "search": r"print\((.*)\)",
                "replace": r"logger.info(\1)",
                "description": "Replace print statements with logging"
            },
            "assertEqual → assert ==": {
                "search": r"assertEqual\(([^,]+),\s*([^)]+)\)",
                "replace": r"assert \1 == \2",
                "description": "Convert unittest to pytest assertions"
            },
            "Remove trailing whitespace": {
                "search": r"[ \t]+$",
                "replace": "",
                "description": "Clean up trailing whitespace"
            }
        }

        pattern_choice = st.selectbox(
            "Select Pattern",
            list(common_patterns.keys())
        )

        selected = common_patterns[pattern_choice]
        st.info(selected["description"])

        # Show the chosen pattern pair so the user sees exactly what runs.
        col1, col2 = st.columns(2)
        with col1:
            st.code(f"Search: {selected['search']}", language="regex")
        with col2:
            st.code(f"Replace: {selected['replace']}", language="regex")

        # Separate widget key from the custom-regex branch's checkbox so
        # the two dry-run toggles don't share Streamlit state.
        dry_run = st.checkbox("Dry Run (Preview Only)", value=True, key="common_dry_run")

        if st.button("Apply Refactoring", type="primary", use_container_width=True):
            with st.spinner("Processing..."):
                try:
                    from code_chatbot.mcp_client import MCPClient

                    client = MCPClient(workspace_root=workspace)
                    result = client.refactor_code(
                        search_pattern=selected["search"],
                        replace_pattern=selected["replace"],
                        file_pattern="**/*.py",
                        dry_run=dry_run
                    )

                    if result.success:
                        st.success(f"✅ {result.files_changed} files, {result.total_replacements} replacements")
                        if result.changes:
                            for change in result.changes[:5]:
                                with st.expander(f"📄 {change['file_path']}"):
                                    st.code(change.get('preview', 'No preview'), language="diff")
                    else:
                        st.error(f"Failed: {result.error}")
                except Exception as e:
                    st.error(f"Failed: {e}")
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def render_generate_mode(chat_engine):
    """
    Render code generation interface using ChatEngine.

    Collects a free-form feature description plus generation options,
    builds one large prompt, sends it through ``chat_engine.chat()``,
    and renders the generated answer along with any source references.

    Args:
        chat_engine: ChatEngine instance. Must expose ``chat(prompt)``
            returning an ``(answer, sources)`` tuple — TODO confirm
            against the ChatEngine implementation.
    """
    st.markdown("### β¨ Generate New Features")
    st.caption("Use AI to scaffold complete features from descriptions")

    # Feature description entered by the user; drives the whole prompt.
    feature_desc = st.text_area(
        "Describe the feature you want to build",
        placeholder="Example: Create a user authentication system with JWT tokens, login/logout endpoints, password hashing with bcrypt, and session management",
        height=120,
        help="Be as detailed as possible"
    )

    # Options: toggles that are interpolated verbatim into the prompt.
    col1, col2, col3 = st.columns(3)
    with col1:
        include_tests = st.checkbox("Generate Tests", value=True)
    with col2:
        include_docs = st.checkbox("Generate Docs", value=True)
    with col3:
        include_examples = st.checkbox("Include Examples", value=True)

    # Framework selection; first entry delegates detection to the model.
    framework = st.selectbox(
        "Framework/Stack",
        ["Auto-detect from codebase", "FastAPI", "Flask", "Django", "Express.js", "React", "Vue.js"],
        help="Technology stack for the feature"
    )

    if st.button("π Generate Feature", type="primary", use_container_width=True):
        # Guard clauses: both a description and an initialized engine
        # are required before we spend an LLM round trip.
        if not feature_desc:
            st.warning("Please describe the feature you want to build")
            return

        if not chat_engine:
            st.error("β οΈ Chat engine not initialized. Please index your codebase first.")
            return

        with st.spinner("π€ Generating feature... (this may take 30-60 seconds)"):
            try:
                # Build comprehensive prompt: feature text, option flags,
                # and explicit formatting instructions for per-file output.
                prompt = f"""Generate a complete implementation for this feature:

**Feature Request:**
{feature_desc}

**Requirements:**
- Framework: {framework}
- Include tests: {include_tests}
- Include documentation: {include_docs}
- Include examples: {include_examples}

**Please provide:**
1. A clear file structure showing all files to create
2. Complete, production-ready code for each file
3. Clear comments explaining the code
4. Setup/installation instructions
5. Usage examples

Format each file like this:

### `path/to/filename.py`
```python
# Code here
```

Make sure the code follows best practices and matches the existing codebase style."""

                # Use chat engine (RAG-backed, so the answer can reference
                # the indexed codebase).
                answer, sources = chat_engine.chat(prompt)

                st.success("β Feature generated!")

                # Display generated content as rendered markdown.
                st.markdown("---")
                st.markdown("#### π Generated Feature")
                st.markdown(answer)

                # Show sources if available; entries may be metadata dicts
                # or plain strings depending on the retriever.
                if sources:
                    st.markdown("---")
                    with st.expander("π Reference Files Used"):
                        for source in sources:
                            if isinstance(source, dict):
                                st.write(f"- `{source.get('file_path', 'Unknown')}`")
                            else:
                                st.write(f"- `{source}`")

            except Exception as e:
                # Broad catch keeps the Streamlit page alive; the full
                # traceback is still surfaced via st.exception.
                st.error(f"Generation failed: {e}")
                st.exception(e)
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
# Export functions
# Public API of this module: the mode selector plus one renderer per
# mode (Chat, Search, Refactor, Generate). Keep in sync with the
# imports listed in app.py / integrate_multimode.py.
__all__ = [
    'render_mode_selector',
    'render_chat_mode',
    'render_search_mode',
    'render_refactor_mode',
    'render_generate_mode'
]
|
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Demo script for MCP and CrewAI integration.
|
| 3 |
+
|
| 4 |
+
Shows how to use the new refactoring and multi-agent capabilities.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Add project root to path
|
| 12 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 13 |
+
|
| 14 |
+
from code_chatbot.mcp_client import MCPClient
|
| 15 |
+
from code_chatbot.crews import RefactoringCrew, CodeReviewCrew
|
| 16 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def demo_mcp_search():
    """Demo: Search for code patterns using MCP"""
    banner = "=" * 60
    print("\n" + banner)
    print("DEMO 1: MCP Code Search")
    print(banner)

    # An MCP client rooted at the current directory drives the search.
    mcp = MCPClient(workspace_root=".")

    print("\nπ Searching for class definitions...")
    # Find every class definition under code_chatbot/, with one line
    # of surrounding context per hit.
    hits = mcp.search_code(
        pattern=r"class\s+(\w+)",
        file_pattern="code_chatbot/*.py",
        context_lines=1,
    )

    # Pretty-print the top matches.
    print(mcp.format_search_results(hits, max_results=5))
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def demo_mcp_refactor():
    """Demo: Preview a refactoring using MCP"""
    banner = "=" * 60
    print("\n" + banner)
    print("DEMO 2: MCP Code Refactoring (Dry Run)")
    print(banner)

    # Client scoped to the current workspace.
    mcp = MCPClient(workspace_root=".")

    print("\nπ§ Previewing refactoring: print() -> logger.info()...")
    # Dry-run rewrite of print(...) calls into logger.info(...) calls
    # across the MCP modules; nothing is written to disk.
    outcome = mcp.refactor_code(
        search_pattern=r'print\((.*)\)',
        replace_pattern=r'logger.info(\1)',
        file_pattern="code_chatbot/mcp_*.py",
        dry_run=True,  # Preview only, don't apply
    )

    # Show the would-be changes.
    print(mcp.format_refactor_result(outcome))
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def demo_mcp_suggestions():
    """Demo: Get refactoring suggestions using MCP"""
    banner = "=" * 60
    print("\n" + banner)
    print("DEMO 3: MCP Refactoring Suggestions")
    print(banner)

    # Client scoped to the current workspace.
    mcp = MCPClient(workspace_root=".")

    print("\nπ‘ Analyzing code_chatbot/mcp_server.py for refactoring opportunities...")
    # Ask for at most three AI-generated refactoring ideas for one file.
    ideas = mcp.suggest_refactorings(
        file_path="code_chatbot/mcp_server.py",
        max_suggestions=3,
    )

    # Render the suggestion list for the console.
    print(mcp.format_suggestions(ideas))
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def demo_crewai_refactoring():
    """Demo: Use CrewAI multi-agent refactoring.

    Skips itself when GOOGLE_API_KEY is absent; otherwise spins up a
    Gemini-backed RefactoringCrew and runs it against one source file.
    """
    print("\n" + "="*60)
    print("DEMO 4: CrewAI Multi-Agent Refactoring")
    print("="*60)

    # Check for API key — the crew needs a live Gemini LLM, so bail out
    # gracefully instead of raising deep inside CrewAI.
    if not os.getenv("GOOGLE_API_KEY"):
        print("\nβ οΈ Skipping CrewAI demo: GOOGLE_API_KEY not set")
        print("   Set your API key to run multi-agent workflows")
        return

    # Create LLM shared by all agents in the crew.
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash-exp",
        google_api_key=os.getenv("GOOGLE_API_KEY")
    )

    # Create refactoring crew (Analyst + Refactor + Reviewer agents).
    print("\nπ€ Creating refactoring crew (Analyst + Refactor + Reviewer)...")
    crew = RefactoringCrew(llm=llm)

    # Run crew on a file; this makes multiple LLM calls and is slow.
    print("\nπ Running crew on code_chatbot/mcp_client.py...")
    print("   (This may take 30-60 seconds...)\n")

    try:
        result = crew.run(file_path="code_chatbot/mcp_client.py")

        # result appears to be a dict with 'tasks_completed' and 'result'
        # keys — TODO confirm against RefactoringCrew.run().
        print("\nβ Crew execution complete!")
        print(f"   Tasks completed: {result['tasks_completed']}")
        print(f"\nπ Result:\n{result['result']}")

    except Exception as e:
        # Surface crew failures without aborting the remaining demos.
        print(f"\nβ Crew execution failed: {e}")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def demo_crewai_review():
    """Demo: Use CrewAI multi-agent code review.

    Mirrors demo_crewai_refactoring but uses the CodeReviewCrew
    (Analyst + Reviewer + Documentation agents) on the MCP server file.
    """
    print("\n" + "="*60)
    print("DEMO 5: CrewAI Multi-Agent Code Review")
    print("="*60)

    # Check for API key; skip rather than crash without credentials.
    if not os.getenv("GOOGLE_API_KEY"):
        print("\nβ οΈ Skipping CrewAI demo: GOOGLE_API_KEY not set")
        return

    # Create LLM shared by all agents in the crew.
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash-exp",
        google_api_key=os.getenv("GOOGLE_API_KEY")
    )

    # Create code review crew.
    print("\nπ€ Creating code review crew (Analyst + Reviewer + Documentation)...")
    crew = CodeReviewCrew(llm=llm)

    # Run crew on a file; multiple LLM round trips, expect it to be slow.
    print("\nπ Running crew on code_chatbot/mcp_server.py...")
    print("   (This may take 30-60 seconds...)\n")

    try:
        result = crew.run(file_path="code_chatbot/mcp_server.py")

        # result appears to be a dict with 'tasks_completed' and 'result'
        # keys — TODO confirm against CodeReviewCrew.run().
        print("\nβ Crew execution complete!")
        print(f"   Tasks completed: {result['tasks_completed']}")
        print(f"\nπ Result:\n{result['result']}")

    except Exception as e:
        # Surface crew failures without aborting the remaining demos.
        print(f"\nβ Crew execution failed: {e}")
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def main():
    """Run all demos.

    Runs the three MCP demos unconditionally (no credentials needed),
    then the two CrewAI demos, each of which skips itself when
    GOOGLE_API_KEY is not set.
    """
    print("\n" + "="*60)
    print("π MCP + CrewAI Integration Demo")
    print("="*60)
    print("\nThis demo showcases:")
    print("  1. MCP Code Search - Find patterns in your codebase")
    print("  2. MCP Refactoring - Preview/apply code changes")
    print("  3. MCP Suggestions - Get AI-powered refactoring ideas")
    print("  4. CrewAI Refactoring - Multi-agent automated refactoring")
    print("  5. CrewAI Code Review - Multi-agent code review")

    # Run MCP demos (no API key needed)
    demo_mcp_search()
    demo_mcp_refactor()
    demo_mcp_suggestions()

    # Run CrewAI demos (requires API key; they self-skip otherwise)
    demo_crewai_refactoring()
    demo_crewai_review()

    print("\n" + "="*60)
    print("β Demo Complete!")
    print("="*60)
    print("\nNext steps:")
    print("  - Try the MCP tools in your own code")
    print("  - Customize agent roles and workflows")
    print("  - Integrate with Streamlit UI")
    print("  - Add more specialized agents")
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# Script entry point: run every demo in sequence.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# How Codebase Agent Indexes Your Codebase
|
| 2 |
+
|
| 3 |
+
**A deep dive into the RAG pipeline that powers intelligent code understanding**
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Overview
|
| 8 |
+
|
| 9 |
+
Codebase Agent uses a sophisticated Retrieval-Augmented Generation (RAG) pipeline to build a deep understanding of your codebase. Unlike simple text search tools, our system combines:
|
| 10 |
+
|
| 11 |
+
- **Semantic code chunking** using Abstract Syntax Trees (AST)
|
| 12 |
+
- **Efficient change detection** with Merkle trees
|
| 13 |
+
- **Privacy-preserving path obfuscation**
|
| 14 |
+
- **Rich metadata extraction** (symbols, imports, complexity)
|
| 15 |
+
- **Hybrid semantic + keyword search**
|
| 16 |
+
|
| 17 |
+
This document explains how each component works and how they fit together.
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## The RAG Pipeline
|
| 22 |
+
|
| 23 |
+
```mermaid
|
| 24 |
+
flowchart TD
|
| 25 |
+
A[Source Code] --> B[Universal Ingestor]
|
| 26 |
+
B --> C{Incremental Mode?}
|
| 27 |
+
C -->|Yes| D[Merkle Tree Change Detection]
|
| 28 |
+
C -->|No| E[Full Indexing]
|
| 29 |
+
D --> F[Changed Files Only]
|
| 30 |
+
E --> G[All Files]
|
| 31 |
+
F --> H[Structural Chunker]
|
| 32 |
+
G --> H
|
| 33 |
+
H --> I[Enhanced Metadata Extraction]
|
| 34 |
+
I --> J{Path Obfuscation?}
|
| 35 |
+
J -->|Yes| K[Obfuscate Paths]
|
| 36 |
+
J -->|No| L[Original Paths]
|
| 37 |
+
K --> M[Embedding Generation]
|
| 38 |
+
L --> M
|
| 39 |
+
M --> N[Vector Database ChromaDB]
|
| 40 |
+
N --> O[Semantic Search]
|
| 41 |
+
O --> P[Reranking]
|
| 42 |
+
P --> Q[LLM Context]
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
---
|
| 46 |
+
|
| 47 |
+
## Step 1: Semantic Code Chunking
|
| 48 |
+
|
| 49 |
+
### The Challenge
|
| 50 |
+
|
| 51 |
+
Raw code files can be thousands of lines long, but embedding models have token limits (typically 512-8192 tokens). Naively splitting code by character count would:
|
| 52 |
+
- Break functions mid-definition
|
| 53 |
+
- Separate related code blocks
|
| 54 |
+
- Lose semantic context
|
| 55 |
+
|
| 56 |
+
### Our Solution: AST-Based Chunking
|
| 57 |
+
|
| 58 |
+
We use **Tree-sitter** to parse code into an Abstract Syntax Tree, then chunk along semantic boundaries.
|
| 59 |
+
|
| 60 |
+
#### Example
|
| 61 |
+
|
| 62 |
+
Consider this Python code:
|
| 63 |
+
|
| 64 |
+
```python
|
| 65 |
+
class UserAuth:
|
| 66 |
+
def __init__(self, db):
|
| 67 |
+
self.db = db
|
| 68 |
+
|
| 69 |
+
def login(self, username, password):
|
| 70 |
+
user = self.db.get_user(username)
|
| 71 |
+
if user and user.check_password(password):
|
| 72 |
+
return self.create_session(user)
|
| 73 |
+
return None
|
| 74 |
+
|
| 75 |
+
def create_session(self, user):
|
| 76 |
+
session_id = generate_token()
|
| 77 |
+
self.db.save_session(session_id, user.id)
|
| 78 |
+
return session_id
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
**Traditional chunking** (by character count) might split this awkwardly:
|
| 82 |
+
|
| 83 |
+
```
|
| 84 |
+
Chunk 1: class UserAuth:\n def __init__(self, db):\n self.db = db\n \n def login(self, username, password):\n user = self.db.get_user(username)\n if user and user.check_password(password):\n return self.create_session(user)\n return None\n \n def create_session(self, user):\n session_id = generate_token()
|
| 85 |
+
Chunk 2: \n self.db.save_session(session_id, user.id)\n return session_id
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
**Our AST-based chunking** respects function boundaries:
|
| 89 |
+
|
| 90 |
+
```
|
| 91 |
+
Chunk 1:
|
| 92 |
+
class UserAuth:
|
| 93 |
+
def __init__(self, db):
|
| 94 |
+
self.db = db
|
| 95 |
+
|
| 96 |
+
Chunk 2:
|
| 97 |
+
class UserAuth:
|
| 98 |
+
def login(self, username, password):
|
| 99 |
+
user = self.db.get_user(username)
|
| 100 |
+
if user and user.check_password(password):
|
| 101 |
+
return self.create_session(user)
|
| 102 |
+
return None
|
| 103 |
+
|
| 104 |
+
Chunk 3:
|
| 105 |
+
class UserAuth:
|
| 106 |
+
def create_session(self, user):
|
| 107 |
+
session_id = generate_token()
|
| 108 |
+
self.db.save_session(session_id, user.id)
|
| 109 |
+
return session_id
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
#### Implementation Details
|
| 113 |
+
|
| 114 |
+
Our `StructuralChunker` class:
|
| 115 |
+
|
| 116 |
+
1. **Parses code** using Tree-sitter for multiple languages (Python, JavaScript, TypeScript, etc.)
|
| 117 |
+
2. **Traverses the AST** recursively, identifying logical units (functions, classes, methods)
|
| 118 |
+
3. **Counts tokens** accurately using `tiktoken` (same tokenizer as GPT models)
|
| 119 |
+
4. **Merges small chunks** to avoid pathologically tiny fragments
|
| 120 |
+
5. **Splits large chunks** only when necessary, preserving semantic boundaries
|
| 121 |
+
|
| 122 |
+
**Key Parameters:**
|
| 123 |
+
- `max_chunk_tokens`: 800 (configurable)
|
| 124 |
+
- `min_chunk_tokens`: 100 (for merging)
|
| 125 |
+
|
| 126 |
+
---
|
| 127 |
+
|
| 128 |
+
## Step 2: Enhanced Metadata Extraction
|
| 129 |
+
|
| 130 |
+
Each code chunk is enriched with metadata that enables powerful filtering and retrieval.
|
| 131 |
+
|
| 132 |
+
### Metadata Fields
|
| 133 |
+
|
| 134 |
+
| Field | Description | Example |
|
| 135 |
+
|-------|-------------|---------|
|
| 136 |
+
| `file_path` | Original or obfuscated path | `src/auth/user.py` |
|
| 137 |
+
| `line_range` | Line numbers in source file | `L10-L25` |
|
| 138 |
+
| `language` | Programming language | `python` |
|
| 139 |
+
| `chunk_type` | AST node type | `function_definition` |
|
| 140 |
+
| `name` | Function/class name | `UserAuth.login` |
|
| 141 |
+
| `symbols` | Symbols defined in chunk | `['UserAuth', 'UserAuth.login']` |
|
| 142 |
+
| `imports` | Import statements used | `['from db import Database']` |
|
| 143 |
+
| `complexity` | Cyclomatic complexity | `5` |
|
| 144 |
+
| `parent_context` | Parent class/module | `UserAuth` |
|
| 145 |
+
|
| 146 |
+
### Symbol Extraction
|
| 147 |
+
|
| 148 |
+
We traverse the AST to extract all function and class definitions:
|
| 149 |
+
|
| 150 |
+
```python
|
| 151 |
+
def _extract_symbols(self, node: Node, content: str) -> List[str]:
|
| 152 |
+
symbols = []
|
| 153 |
+
# Recursively find function_definition and class_definition nodes
|
| 154 |
+
# Build hierarchical names like "MyClass.my_method"
|
| 155 |
+
return symbols
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
### Complexity Calculation
|
| 159 |
+
|
| 160 |
+
Cyclomatic complexity = number of decision points + 1
|
| 161 |
+
|
| 162 |
+
Decision points include: `if`, `elif`, `for`, `while`, `except`, `and`, `or`, etc.
|
| 163 |
+
|
| 164 |
+
This helps identify complex code that may need more careful review.
|
| 165 |
+
|
| 166 |
+
---
|
| 167 |
+
|
| 168 |
+
## Step 3: Efficient Change Detection with Merkle Trees
|
| 169 |
+
|
| 170 |
+
### The Problem
|
| 171 |
+
|
| 172 |
+
Re-indexing a large codebase (10,000+ files) can take 10-30 minutes. But most of the time, only a few files have changed.
|
| 173 |
+
|
| 174 |
+
### The Solution: Merkle Trees
|
| 175 |
+
|
| 176 |
+
A **Merkle tree** is a cryptographic hash tree where:
|
| 177 |
+
- Each **leaf node** = hash of a file's content
|
| 178 |
+
- Each **directory node** = hash of its children's hashes
|
| 179 |
+
- The **root hash** represents the entire codebase
|
| 180 |
+
|
| 181 |
+
#### How It Works
|
| 182 |
+
|
| 183 |
+
```mermaid
|
| 184 |
+
graph TD
|
| 185 |
+
A[Root: abc123] --> B[src/: def456]
|
| 186 |
+
A --> C[tests/: ghi789]
|
| 187 |
+
B --> D[auth.py: aaa111]
|
| 188 |
+
B --> E[db.py: bbb222]
|
| 189 |
+
C --> F[test_auth.py: ccc333]
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
**Change Detection:**
|
| 193 |
+
1. Build Merkle tree for current codebase
|
| 194 |
+
2. Load previous tree snapshot from disk
|
| 195 |
+
3. Compare root hashes
|
| 196 |
+
- If identical β No changes, skip indexing
|
| 197 |
+
- If different β Traverse tree to find changed files
|
| 198 |
+
|
| 199 |
+
**Performance:**
|
| 200 |
+
- **Initial indexing**: 10,000 files in ~15 minutes
|
| 201 |
+
- **Incremental re-indexing**: 100 changed files in ~90 seconds
|
| 202 |
+
- **Speedup**: ~10-100x faster
|
| 203 |
+
|
| 204 |
+
#### Implementation
|
| 205 |
+
|
| 206 |
+
```python
|
| 207 |
+
class MerkleTree:
|
| 208 |
+
def build_tree(self, root_path: str) -> MerkleNode:
|
| 209 |
+
# Recursively hash files and directories
|
| 210 |
+
pass
|
| 211 |
+
|
| 212 |
+
def compare_trees(self, old_tree, new_tree) -> ChangeSet:
|
| 213 |
+
# Returns: added, modified, deleted, unchanged files
|
| 214 |
+
pass
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
**Snapshot Storage:**
|
| 218 |
+
- Saved as JSON in `chroma_db/merkle_snapshots/{collection}_snapshot.json`
|
| 219 |
+
- Includes file hashes, sizes, modification times
|
| 220 |
+
|
| 221 |
+
---
|
| 222 |
+
|
| 223 |
+
## Step 4: Privacy-Preserving Path Obfuscation
|
| 224 |
+
|
| 225 |
+
### The Need for Privacy
|
| 226 |
+
|
| 227 |
+
File paths can reveal sensitive information:
|
| 228 |
+
- Internal project structure
|
| 229 |
+
- Client names (`projects/acme-corp/...`)
|
| 230 |
+
- Product codenames (`features/project-phoenix/...`)
|
| 231 |
+
- Team organization (`teams/security/...`)
|
| 232 |
+
|
| 233 |
+
### HMAC-Based Path Hashing
|
| 234 |
+
|
| 235 |
+
We use **HMAC-SHA256** to hash each path component separately:
|
| 236 |
+
|
| 237 |
+
```python
|
| 238 |
+
def obfuscate_path(self, original_path: str) -> str:
|
| 239 |
+
# Split: src/payments/invoice_processor.py
|
| 240 |
+
# Hash each component with secret key
|
| 241 |
+
# Result: a9f3/x72k/qp1m8d.f4
|
| 242 |
+
pass
|
| 243 |
+
```
|
| 244 |
+
|
| 245 |
+
**Key Features:**
|
| 246 |
+
- **Deterministic**: Same path always hashes to same value
|
| 247 |
+
- **Reversible**: Mapping stored locally for decryption
|
| 248 |
+
- **Structure-preserving**: Directory hierarchy maintained
|
| 249 |
+
- **Extension hints**: File extensions shortened but recognizable
|
| 250 |
+
|
| 251 |
+
**Example:**
|
| 252 |
+
```
|
| 253 |
+
Original: src/payments/invoice_processor.py
|
| 254 |
+
Masked: a9f3/x72k/qp1m8d.f4
|
| 255 |
+
```
|
| 256 |
+
|
| 257 |
+
**Configuration:**
|
| 258 |
+
```bash
|
| 259 |
+
ENABLE_PATH_OBFUSCATION=true
|
| 260 |
+
PATH_OBFUSCATION_KEY=your-secret-key-here
|
| 261 |
+
```
|
| 262 |
+
|
| 263 |
+
---
|
| 264 |
+
|
| 265 |
+
## Step 5: Embedding Generation & Vector Storage
|
| 266 |
+
|
| 267 |
+
### Embedding Model
|
| 268 |
+
|
| 269 |
+
We use **Google's gemini-embedding-001** model:
|
| 270 |
+
- **Dimensions**: 768
|
| 271 |
+
- **Max tokens**: 2048
|
| 272 |
+
- **Quality**: State-of-the-art for code
|
| 273 |
+
|
| 274 |
+
Each chunk is converted to a dense vector that captures its semantic meaning.
|
| 275 |
+
|
| 276 |
+
### Vector Database: ChromaDB
|
| 277 |
+
|
| 278 |
+
**Why ChromaDB?**
|
| 279 |
+
- **Local-first**: No cloud dependency
|
| 280 |
+
- **Fast**: Optimized for similarity search
|
| 281 |
+
- **Persistent**: Auto-saves to disk
|
| 282 |
+
- **Metadata filtering**: Supports complex queries
|
| 283 |
+
|
| 284 |
+
**Storage Structure:**
|
| 285 |
+
```
|
| 286 |
+
chroma_db/
|
| 287 |
+
βββ {collection_name}/
|
| 288 |
+
β βββ chroma.sqlite3 # Metadata database
|
| 289 |
+
β βββ index/ # Vector indices
|
| 290 |
+
β βββ ...
|
| 291 |
+
βββ merkle_snapshots/
|
| 292 |
+
βββ {collection}_snapshot.json
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
---
|
| 296 |
+
|
| 297 |
+
## Step 6: Semantic Search & Retrieval
|
| 298 |
+
|
| 299 |
+
### Query Processing
|
| 300 |
+
|
| 301 |
+
When you ask a question:
|
| 302 |
+
|
| 303 |
+
1. **Query embedding**: Your question is embedded using the same model
|
| 304 |
+
2. **Similarity search**: Find top-K most similar code chunks (K=10 by default)
|
| 305 |
+
3. **Metadata filtering** (optional): Filter by language, file type, complexity
|
| 306 |
+
4. **Reranking**: Apply cross-encoder reranking to refine results (top-5)
|
| 307 |
+
5. **Context assembly**: Combine retrieved chunks with chat history
|
| 308 |
+
|
| 309 |
+
### Hybrid Search
|
| 310 |
+
|
| 311 |
+
We combine **semantic search** with **keyword search**:
|
| 312 |
+
|
| 313 |
+
- **Semantic**: Finds conceptually similar code (e.g., "authentication" matches `login()`, `verify_token()`)
|
| 314 |
+
- **Keyword**: Exact matches for function names, file paths, symbols
|
| 315 |
+
|
| 316 |
+
### Reranking
|
| 317 |
+
|
| 318 |
+
After initial retrieval, we apply a **cross-encoder reranker** that:
|
| 319 |
+
- Scores each (query, chunk) pair directly
|
| 320 |
+
- Re-orders results by relevance
|
| 321 |
+
- Improves precision significantly
|
| 322 |
+
|
| 323 |
+
---
|
| 324 |
+
|
| 325 |
+
## Step 7: LLM Context & Generation
|
| 326 |
+
|
| 327 |
+
### Context Window Management
|
| 328 |
+
|
| 329 |
+
Modern LLMs have large context windows (Gemini 2.0: 1M+ tokens), but we still optimize:
|
| 330 |
+
|
| 331 |
+
1. **Top-K retrieval**: Only include most relevant chunks (5-10)
|
| 332 |
+
2. **Deduplication**: Remove redundant information
|
| 333 |
+
3. **Source citations**: Include file paths and line ranges
|
| 334 |
+
4. **Chat history**: Maintain conversation context
|
| 335 |
+
|
| 336 |
+
### Prompt Engineering
|
| 337 |
+
|
| 338 |
+
Our prompts include:
|
| 339 |
+
- **System instructions**: "You are a code analysis assistant..."
|
| 340 |
+
- **Retrieved context**: Top-K code chunks with metadata
|
| 341 |
+
- **Chat history**: Previous Q&A for continuity
|
| 342 |
+
- **User query**: The actual question
|
| 343 |
+
|
| 344 |
+
---
|
| 345 |
+
|
| 346 |
+
## Performance Benchmarks
|
| 347 |
+
|
| 348 |
+
| Operation | Small Codebase (100 files) | Large Codebase (10,000 files) |
|
| 349 |
+
|-----------|----------------------------|-------------------------------|
|
| 350 |
+
| **Initial Indexing** | ~30 seconds | ~15 minutes |
|
| 351 |
+
| **Incremental Re-index** (10% changed) | ~5 seconds | ~90 seconds |
|
| 352 |
+
| **Query Latency** | ~300ms | ~500ms |
|
| 353 |
+
| **Memory Usage** | ~200 MB | ~1.5 GB |
|
| 354 |
+
|
| 355 |
+
**Speedup from Incremental Indexing:** 10-100x
|
| 356 |
+
|
| 357 |
+
---
|
| 358 |
+
|
| 359 |
+
## Comparison with Cursor
|
| 360 |
+
|
| 361 |
+
| Feature | Codebase Agent | Cursor |
|
| 362 |
+
|---------|----------------|--------|
|
| 363 |
+
| **AST-based chunking** | β
Tree-sitter | β
Tree-sitter |
|
| 364 |
+
| **Merkle tree change detection** | β
| β
|
|
| 365 |
+
| **Path obfuscation** | β
HMAC-based | β
HMAC-based |
|
| 366 |
+
| **Rich metadata** | β
Symbols, imports, complexity | β
Similar |
|
| 367 |
+
| **Local-first** | β
100% local option | β Cloud-based |
|
| 368 |
+
| **Open source** | β
MIT License | β Proprietary |
|
| 369 |
+
| **Multi-provider LLMs** | β
Gemini, Groq, OpenAI | β OpenAI only |
|
| 370 |
+
|
| 371 |
+
---
|
| 372 |
+
|
| 373 |
+
## Configuration
|
| 374 |
+
|
| 375 |
+
All features are configurable via environment variables:
|
| 376 |
+
|
| 377 |
+
```bash
|
| 378 |
+
# Chunking
|
| 379 |
+
CHUNK_MAX_TOKENS=800
|
| 380 |
+
CHUNK_MIN_TOKENS=100
|
| 381 |
+
CHUNK_PRESERVE_IMPORTS=true
|
| 382 |
+
CHUNK_CALCULATE_COMPLEXITY=true
|
| 383 |
+
|
| 384 |
+
# Privacy
|
| 385 |
+
ENABLE_PATH_OBFUSCATION=false
|
| 386 |
+
PATH_OBFUSCATION_KEY=your-secret-key
|
| 387 |
+
|
| 388 |
+
# Indexing
|
| 389 |
+
ENABLE_INCREMENTAL_INDEXING=true
|
| 390 |
+
MERKLE_SNAPSHOT_DIR=chroma_db/merkle_snapshots
|
| 391 |
+
INDEXING_BATCH_SIZE=100
|
| 392 |
+
MAX_FILE_SIZE_MB=10
|
| 393 |
+
|
| 394 |
+
# Retrieval
|
| 395 |
+
ENABLE_RERANKING=true
|
| 396 |
+
RETRIEVAL_K=10
|
| 397 |
+
RERANK_TOP_K=5
|
| 398 |
+
SIMILARITY_THRESHOLD=0.5
|
| 399 |
+
|
| 400 |
+
# Providers
|
| 401 |
+
EMBEDDING_PROVIDER=gemini
|
| 402 |
+
LLM_PROVIDER=gemini
|
| 403 |
+
```
|
| 404 |
+
|
| 405 |
+
See [`code_chatbot/config.py`](../code_chatbot/config.py) for full configuration options.
|
| 406 |
+
|
| 407 |
+
---
|
| 408 |
+
|
| 409 |
+
## Implementation Files
|
| 410 |
+
|
| 411 |
+
| Component | File | Description |
|
| 412 |
+
|-----------|------|-------------|
|
| 413 |
+
| **Chunking** | [`chunker.py`](../code_chatbot/chunker.py) | AST-based semantic chunking |
| **Merkle Tree** | [`merkle_tree.py`](../code_chatbot/merkle_tree.py) | Change detection |
| **Path Obfuscation** | [`path_obfuscator.py`](../code_chatbot/path_obfuscator.py) | Privacy features |
| **Indexing** | [`indexer.py`](../code_chatbot/indexer.py) | Vector database operations |
| **Incremental Indexing** | [`incremental_indexing.py`](../code_chatbot/incremental_indexing.py) | Merkle tree integration |
| **Configuration** | [`config.py`](../code_chatbot/config.py) | Centralized settings |
| **Retrieval** | [`retriever_wrapper.py`](../code_chatbot/retriever_wrapper.py) | Reranking & multi-query |
|
| 420 |
+
|
| 421 |
+
---
|
| 422 |
+
|
| 423 |
+
## Next Steps
|
| 424 |
+
|
| 425 |
+
- **Try incremental indexing**: See the speedup for yourself
|
| 426 |
+
- **Enable path obfuscation**: Protect sensitive codebases
|
| 427 |
+
- **Tune chunk size**: Experiment with `CHUNK_MAX_TOKENS`
|
| 428 |
+
- **Explore metadata filtering**: Filter by language, complexity, etc.
|
| 429 |
+
|
| 430 |
+
For more details, see:
|
| 431 |
+
- [Architecture Overview](ARCHITECTURE.md)
- [Configuration Guide](../code_chatbot/config.py)
- [API Reference](../README.md)
|
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Quick integration script for multi-mode interface.

This script will help you integrate the multi-mode interface into app.py.
"""


def show_integration_steps() -> None:
    """Print step-by-step instructions for wiring the multi-mode UI into app.py.

    Purely informational: writes a formatted guide to stdout and returns None.
    The code snippets in the banner are templates for the user to paste into
    app.py; they are not executed here.
    """
    # One big triple-quoted banner; `{provider}` stays literal because this is
    # NOT an f-string -- it belongs to the snippet the user will paste.
    print("""
╔══════════════════════════════════════════════════════════════════════════╗
║                    Multi-Mode Interface Integration                      ║
╚══════════════════════════════════════════════════════════════════════════╝

✅ Components Created:
   - components/multi_mode.py (Chat, Search, Refactor, Generate modes)
   - Verified imports work correctly

📋 Integration Steps:

STEP 1: Add Import to app.py
────────────────────────────────────────────────────────────────────────────
Add this import after line 11 in app.py:

    from components.multi_mode import (
        render_mode_selector,
        render_chat_mode,
        render_search_mode,
        render_refactor_mode,
        render_generate_mode
    )


STEP 2: Add Mode Selector
────────────────────────────────────────────────────────────────────────────
Replace lines 489-491 in app.py with:

    # Main Chat Interface
    st.title("🕷️ Code Crawler")

    # Multi-Mode Interface
    if st.session_state.processed_files:
        selected_mode = render_mode_selector()
        st.divider()

        # Render appropriate interface based on mode
        if selected_mode == "search":
            render_search_mode()
        elif selected_mode == "refactor":
            render_refactor_mode()
        elif selected_mode == "generate":
            render_generate_mode(st.session_state.chat_engine)
        else:  # chat mode
            render_chat_mode(st.session_state.chat_engine)
            st.caption(f"Ask questions about your uploaded project. (Using {provider}, Enhanced with AST)")
    else:
        st.caption(f"Configure and index your codebase to get started. (Using {provider}, Enhanced with AST)")


STEP 3: Wrap Chat Interface
────────────────────────────────────────────────────────────────────────────
Add this check before line 526 (before "# Display History"):

    # Only show chat history in chat mode
    selected_mode = st.session_state.get("mode_selector", "💬 Chat")
    if selected_mode == "💬 Chat":

And indent all the chat code (lines 526-614) by 4 spaces.


STEP 4: Test the Integration
────────────────────────────────────────────────────────────────────────────
Run your Streamlit app:

    streamlit run app.py

You should see:
✅ Mode selector with 4 buttons: 💬 Chat | 🔍 Search | 🔧 Refactor | ✨ Generate
✅ Chat mode works as before
✅ Search mode shows MCP code search interface
✅ Refactor mode shows MCP refactoring interface
✅ Generate mode shows CrewAI feature generation interface


🎯 Quick Test Commands:
────────────────────────────────────────────────────────────────────────────
1. Chat Mode: Ask "Explain how authentication works"
2. Search Mode: Pattern "class\\s+(\\w+)", File Pattern "**/*.py"
3. Refactor Mode: Search "print\\((.*)\\)", Replace "logger.info(\\1)", Dry Run ✅
4. Generate Mode: "Create a REST API endpoint for user management"


📚 Documentation:
────────────────────────────────────────────────────────────────────────────
See the walkthrough for detailed usage:
    multimode_walkthrough.md


💡 Need Help?
────────────────────────────────────────────────────────────────────────────
If you encounter issues:
1. Check that components/multi_mode.py exists
2. Verify imports work: python3 -c "from components.multi_mode import render_mode_selector"
3. Check Streamlit logs for errors
4. Ensure MCP and CrewAI dependencies are installed

""")


if __name__ == "__main__":
    show_integration_steps()
|
|
@@ -18,3 +18,13 @@ sentence-transformers
gitpython
beautifulsoup4
pygments

# MCP (Model Context Protocol)
mcp>=1.0.0

# CrewAI Multi-Agent Framework
crewai>=0.80.0
crewai-tools>=0.12.0

# Code Refactoring Tools
rope>=1.13.0
|
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test script for Merkle tree change detection.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from code_chatbot.merkle_tree import MerkleTree
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import tempfile
|
| 8 |
+
import shutil
|
| 9 |
+
|
| 10 |
+
def test_merkle_tree():
|
| 11 |
+
"""Test Merkle tree change detection."""
|
| 12 |
+
|
| 13 |
+
# Create a temporary directory with some files
|
| 14 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 15 |
+
tmpdir = Path(tmpdir)
|
| 16 |
+
|
| 17 |
+
# Create initial files
|
| 18 |
+
(tmpdir / "file1.py").write_text("print('hello')")
|
| 19 |
+
(tmpdir / "file2.py").write_text("print('world')")
|
| 20 |
+
(tmpdir / "subdir").mkdir()
|
| 21 |
+
(tmpdir / "subdir" / "file3.py").write_text("print('test')")
|
| 22 |
+
|
| 23 |
+
# Build initial tree
|
| 24 |
+
merkle = MerkleTree()
|
| 25 |
+
tree1 = merkle.build_tree(str(tmpdir))
|
| 26 |
+
|
| 27 |
+
print(f"β
Built initial Merkle tree")
|
| 28 |
+
print(f" Root hash: {tree1.hash[:16]}...")
|
| 29 |
+
|
| 30 |
+
# Modify a file
|
| 31 |
+
(tmpdir / "file1.py").write_text("print('hello world')")
|
| 32 |
+
|
| 33 |
+
# Add a new file
|
| 34 |
+
(tmpdir / "file4.py").write_text("print('new')")
|
| 35 |
+
|
| 36 |
+
# Delete a file
|
| 37 |
+
(tmpdir / "file2.py").unlink()
|
| 38 |
+
|
| 39 |
+
# Build new tree
|
| 40 |
+
tree2 = merkle.build_tree(str(tmpdir))
|
| 41 |
+
|
| 42 |
+
# Compare
|
| 43 |
+
changes = merkle.compare_trees(tree1, tree2)
|
| 44 |
+
|
| 45 |
+
print(f"\\nβ
Change detection complete:")
|
| 46 |
+
print(f" {changes.summary()}")
|
| 47 |
+
print(f" Added: {changes.added}")
|
| 48 |
+
print(f" Modified: {changes.modified}")
|
| 49 |
+
print(f" Deleted: {changes.deleted}")
|
| 50 |
+
|
| 51 |
+
# Verify results
|
| 52 |
+
assert "file4.py" in changes.added, "Should detect new file"
|
| 53 |
+
assert "file1.py" in changes.modified, "Should detect modified file"
|
| 54 |
+
assert "file2.py" in changes.deleted, "Should detect deleted file"
|
| 55 |
+
assert "subdir/file3.py" in changes.unchanged, "Should detect unchanged file"
|
| 56 |
+
|
| 57 |
+
print(f"\\nβ
All assertions passed!")
|
| 58 |
+
|
| 59 |
+
if __name__ == "__main__":
|
| 60 |
+
test_merkle_tree()
|