SUPPORTED_TYPES = { "limit_20kb": { ".json", ".xml", ".csv", ".tsv", ".jsonl" }, "limit_30kb": { ".css", ".scss", ".sass", ".less" }, "limit_50kb": { ".yaml", ".yml" }, "limit_2048kb": { ".pdf" }, "no_limit": { # Documentation & Text ".md", ".mdx", ".txt", ".rst", ".asciidoc", ".adoc", ".tex", # Config & Infrastructure ".toml", ".ini", ".cfg", ".conf", ".properties", ".hcl", ".tf", ".tfvars", ".gitignore", ".dockerignore", ".editorconfig", ".nvmrc", ".npmignore", # Web, UI & Templating ".html", ".js", ".jsx", ".ts", ".tsx", ".vue", ".svelte", ".astro", ".php", ".handlebars", ".hbs", ".ejs", ".pug", ".twig", ".liquid", # Systems & Core Languages ".c", ".h", ".cpp", ".hpp", ".cc", ".cxx", ".rs", ".go", ".zig", ".nim", ".asm", ".s", ".java", ".cs", ".kt", ".kts", ".scala", ".groovy", ".py", ".rb", ".pl", ".pm", ".lua", ".r", # Functional & Mobile/Apple ".hs", ".ml", ".mli", ".clj", ".cljs", ".cljc", ".ex", ".exs", ".jl", ".swift", ".m", ".dart", # Web3 & Hardware ".sol", ".v", # Shells & Notebooks ".sh", ".bash", ".zsh", ".bat", ".cmd", ".ps1", ".ipynb" } } EXCLUDE_PATTERNS = [ # ── VCS ───────────────────────────── "**/.git/**", "**/.svn/**", "**/.hg/**", # ── Dependencies ──────────────────── "**/node_modules/**", "**/bower_components/**", "**/venv/**", "**/.venv/**", "**/env/**", "**/python_env/**", "**/vendor/**", "**/deps/**", "**/packages/**", "**/Pods/**", # ── Build Outputs ─────────────────── "**/dist/**", "**/build/**", "**/out/**", "**/target/**", "**/bin/**", "**/obj/**", "**/_build/**", # ── Framework / Tooling ───────────── "**/.next/**", "**/.nuxt/**", "**/.svelte-kit/**", "**/.gradle/**", "**/.mvn/**", "**/.dart_tool/**", "**/.pub-cache/**", "**/.serverless/**", # ── Python / Test / Cache ─────────── "**/__pycache__/**", "**/.pytest_cache/**", "**/.tox/**", "**/.mypy_cache/**", "**/.ruff_cache/**", "**/*.egg-info/**", # ── Coverage / Logs ───────────────── "**/coverage/**", "**/.nyc_output/**", "**/*.log", # ── IDE / OS Junk ─────────────────── "**/.vscode/**", "**/.idea/**", "**/.vs/**", "**/.DS_Store", "**/thumbs.db", # ── Temp ──────────────────────────── "**/tmp/**", "**/temp/**", # ── 🔴 FILE-LEVEL EXCLUSIONS (NEW) ── "**/.gitignore", "**/.dockerignore", "**/.npmignore", "**/.env", "**/.env.*", "**/.editorconfig", "**/.prettierrc", "**/.eslintrc", "**/.stylelintrc", ] # Markers that indicate an auto-generated file AUTO_GENERATED_MARKERS = [ # ── Generic / cross-language ──────────────────────────────────────────── "this file is auto-generated", "this file was auto-generated", "this file is automatically generated", "this file was automatically generated", "auto-generated by", "auto generated by", "automatically generated by", "generated automatically", "do not edit this file", "do not edit - generated", "do not modify this file", "do not modify - generated", "changes will be overwritten", "any changes made to this file will be lost", "any manual changes will be overwritten", "regenerate this file", # ── Protobuf / gRPC ───────────────────────────────────────────────────── "generated by protoc", "generated by the protocol buffer compiler", "generated by protoc-gen-go", "generated by protoc-gen-grpc", "generated by protoc-gen-ts", "generated by protoc-gen-js", "source: proto/", # common protoc header hint # ── OpenAPI / Swagger ─────────────────────────────────────────────────── "generated by openapi", "generated by swagger", "generated by swagger-codegen", "generated by openapi-generator", "do not edit the generated code", # ── GraphQL ───────────────────────────────────────────────────────────── "generated by graphql-codegen", "generated by graphql code generator", "@generated graphql", # relay, graphql-codegen pragma "/* eslint-disable */", # almost always prepended by codegen # ── Go tooling ────────────────────────────────────────────────────────── "code generated by go generate", "// code generated", # official Go convention (go generate) "// generated by", "do not edit.", # standard Go generated file footer # ── Rust (build.rs / prost / tonic) ──────────────────────────────────── "// @generated", "generated by prost", "generated by tonic", # ── Java / Kotlin ─────────────────────────────────────────────────────── "@javax.annotation.generated", "@jakarta.annotation.generated", "generated by dagger", "generated by hilt", "generated by room", # Android Room DAO impls "generated by kapt", "generated by ksp", # ── C# / .NET ─────────────────────────────────────────────────────────── "", # Visual Studio designer files "// ", "// ", "tool = \"resgen\"", "generated by microsoft", "generated by dotnet", "this code was generated by a tool", # .NET standard header # ── TypeScript / JavaScript ───────────────────────────────────────────── "// @ts-nocheck", # weak signal; combine with others "generated by ts-proto", "generated by typechain", "generated by wagmi", "this is a generated file", "@auto-generated", # ── Python ────────────────────────────────────────────────────────────── "# generated by", "# this file was generated by", "# auto-generated", "# do not edit", "generated by grpc_tools", "generated by betterproto", "generated by datamodel-codegen", "generated by sqlalchemy", # alembic migration hint # ── Build systems / IDEs ──────────────────────────────────────────────── "generated by cmake", "generated by bazel", "generated by buck", "generated by gradle", "generated by xcode", "generated by android studio", "generated by flutter", "generated by freezed", # Dart/Flutter "generated by json_serializable", # Dart/Flutter # ── Misc tools ────────────────────────────────────────────────────────── "generated by prisma", "generated by drizzle", "generated by sqlc", "generated by buf", # buf.build protobuf toolchain "generated by mockery", # Go mock generator "generated by moq", "generated by wire", # Google Wire DI "generated by copier", "generated by stringer", # Go stringer tool "generated by easyjson", "lint: disable", # weak; combine with file extension # ── Pragma-style (language-agnostic) ──────────────────────────────────── "@generated", # used by Hack, Flow, some JS tools "/* generated */", "// generated", ] # Extensions to scan for auto-gen headers AUTO_GEN_SCAN_EXTENSIONS = { # Your originals ".py", ".ts", ".js", ".cs", ".java", ".kt", ".go", ".rs", # Worth adding ".tsx", ".jsx", # React code-gen (relay, graphql-codegen) ".dart", # Flutter / freezed ".proto", # protobuf definitions themselves ".pb.go", ".pb.ts", # compiled proto output (if treated as extensions) ".g.cs", ".designer.cs", # .NET generated suffixes ".g.dart", # Flutter generated ".generated.ts", # convention-based (treat whole suffix as marker) ".h", ".cpp", ".cc", # C/C++ codegen (flatbuffers, protobuf, etc.) ".swift", # Xcode / SwiftGen / Sourcery ".rb", # Rails generators ".php", # Doctrine, Symfony generators } AST_BASED_SPLITTING = { # General-Purpose Programming ".c": "c", ".h": "c", ".cpp": "cpp", ".hpp": "cpp", ".cc": "cpp", ".cxx": "cpp", ".cs": "csharp", ".dart": "dart", ".go": "go", ".java": "java", ".js": "javascript", ".jsx": "javascript", ".jl": "julia", ".kt": "kotlin", ".kts": "kotlin", ".nim": "nim", ".ml": "ocaml", ".mli": "ocaml", ".pl": "perl", ".pm": "perl", ".py": "python", ".r": "r", ".rb": "ruby", ".rs": "rust", ".scala": "scala", ".swift": "swift", ".ts": "typescript", ".tsx": "tsx", # TSX has its own explicit key in the docs ".zig": "zig", # Web, UI & Markup ".html": "html", ".css": "css", ".scss": "scss", ".astro": "astro", ".vue": "vue", ".svelte": "svelte", ".xml": "xml", ".yaml": "yaml", ".yml": "yaml", # Config & DevOps ".sh": "bash", ".bash": "bash", ".zsh": "bash", ".gitignore": "gitignore", # Systems & Low-level ".asm": "asm", ".s": "asm", ".v": "verilog" } CHUNK_SIZE = 2048 CHUNK_OVERLAP = 200 CHROMA_PERSIST_DIR = "./generated_chroma_database" CHROMA_COLLECTION_NAME = "vector_db" AGENT_SYSTEM_PROMPT_HEADER = """ You are a Junior Code Researcher working in the backend of a Multi-Agent RAG system. You have access to tools to access a locally stored codebase. **YOUR ROLE & AUDIENCE:** - You explore the repository using tools to find precise answers. - You do NOT interact with the end-user directly. - You report exclusively to the Lead Code Architect (Supervisor). - Your job is to do the heavy lifting: use tools to explore, read files, gather context, and decide when you have enough raw data for the Supervisor to formulate a response. """ AGENT_SYSTEM_PROMPT_TOOLS = """ ## TOOL SELECTION — DECISION TREE Work through this decision tree for EVERY search action: 1. **Do you have an exact string to find?** (function name, class name, variable) → Use `exact_code_search`. This is always your first move for anything concrete. 2. **Did exact search fail OR do you have related keywords?** → Use `keyword_code_search`. DO NOT USE QUOTES for multi-term strings (BM25 tokenizes input). Search single, distinct words: e.g., `database pool`, NOT `"database pool"`. 3. **Are you exploring an abstract concept?** → Use `semantic_code_search` with a natural language phrase. Use this LAST. 4. **Do you need to understand folder structure?** → Use `list_directory_contents`. 5. **Do you need to verify a file exists?** → Use `find_file_path_by_pattern`. 6. **Do you need to read a file's contents?** → Use `get_specific_file`.""" AGENT_SYSTEM_PROMPT_TOOLS_NO_DB = """ ## TOOL SELECTION — DECISION TREE Work through this decision tree for EVERY search action. (Note: Vector search is currently disabled; rely on exact matching and structural exploration): 1. **Do you have an exact string to find?** (function name, class name, variable) → Use `exact_code_search`. This is your primary discovery tool. Think of distinct, unique variable or function names to grep for. 2. **Do you need to verify a file exists or find files by extension/name?** → Use `find_file_path_by_pattern`. Use this to locate config files, routes, or models when exact code strings aren't obvious. 3. **Do you need to understand folder structure or find where components live?** → Use `list_directory_contents`. 4. **Do you need to read a file's contents?** → Use `get_specific_file`.""" AGENT_SYSTEM_PROMPT_FOOTER = """ ## READING FILES — STRICT RULES - **Never read a full file blindly.** First use `exact_code_search` to locate the relevant lines, then call `get_specific_file` with a ±100 line buffer. - **TRUNCATION PROTOCOL (CRITICAL):** If any tool output ends with "Output is truncated", you MUST paginate using `start_line` and `end_line` before drawing conclusions. NEVER make a claim about what a file does or does not contain from a truncated read. - Never guess line ranges like 1–200. ## SEARCH EFFICIENCY RULES - **HIGH-VALUE LEAD PROTOCOL:** If you find a struct field, trait method, or config key that is directly relevant (e.g., `idempotent_hint`), you MUST read its full definition and every call site before moving on. Do not pivot after one partial read. - **DEAD END ESCALATION:** If you have searched for the same concept 3+ times with no useful result, explicitly state: "CONCEPT NOT EXPLICITLY DOCUMENTED — best available evidence is [X]" and stop searching for it. Do not retry with minor keyword variations. - Limit parallel exploration. Follow one lead to completion, then pivot. **REASONING & ANSWER PROTOCOL:** - Do not exceed 10 tool calls per query without pausing to reassess. - If you receive "SUPERVISOR FEEDBACK:", your previous research was incomplete. Do not apologize. Read the instructions and find exactly what is missing. **YOUR FINAL OUTPUT:** Once you have gathered enough information, you must immediately stop. OUTPUT FORMAT: You must output ONLY this exact format. Do not add summaries, insights, or explanations. [RESEARCH COMPLETE] Files read: - - ... """ SUPERVISOR_SYSTEM_PROMPT = """ You are the Lead Code Architect in a Multi-Agent RAG system. Our platform helps users understand public code libraries. *** CRITICAL DIVISION OF LABOR (READ CAREFULLY) *** - The Junior Researcher (the agent) is ONLY a "Retriever". Their job is to call tools and output "[RESEARCH COMPLETE]". They are strictly forbidden from writing summaries. - YOU are the "Synthesizer". If the raw tool outputs in the message history contain enough information, YOU must write the final answer for the user and set status="ACCEPT". - DO NOT reject the researcher just because their final message is a short list of files. That is by design! Evaluate them based on the RAW TOOL OUTPUTS above their final message. *** MANDATORY REJECTION TRIGGERS *** Output status="REJECT" if ANY of the following are true: - The agent treated a truncated file as complete without paginating. - The user asked for code/syntax, but the tool outputs only show file names. - The agent ignored a high-value lead without checking its call sites. *** EVALUATION PATHS *** 1. SUCCESS: Set status="ACCEPT". Write an exhaustive, highly detailed response addressed to the user. 2. REWORK: Set status="REJECT". Write strict, non-repetitive, targeted feedback addressed to the Junior Researcher. DO NOT address the user. """ MAX_FILES_TO_CREATE_VECTOR_DB = 6000