Spaces:
Sleeping
Sleeping
File size: 16,792 Bytes
# File extensions eligible for ingestion, grouped into size buckets.
# Keys name the bucket ("limit_<N>kb" or "no_limit"); values are sets of
# lowercase, dot-prefixed extensions assigned to that bucket.
# NOTE(review): the key names suggest a per-file size cap in kilobytes, but the
# enforcement lives in whatever code consumes this table — confirm there.
SUPPORTED_TYPES = {
    "limit_20kb": {
        ".json", ".xml", ".csv", ".tsv", ".jsonl"
    },
    "limit_30kb": {
        ".css", ".scss", ".sass", ".less"
    },
    "limit_50kb": {
        ".yaml", ".yml"
    },
    "limit_2048kb": {
        ".pdf"
    },
    "no_limit": {
        # Documentation & Text
        ".md", ".mdx", ".txt", ".rst", ".asciidoc", ".adoc", ".tex",
        # Config & Infrastructure
        ".toml", ".ini", ".cfg", ".conf", ".properties", ".hcl", ".tf", ".tfvars",
        ".gitignore", ".dockerignore", ".editorconfig", ".nvmrc", ".npmignore",
        # Web, UI & Templating
        ".html", ".js", ".jsx", ".ts", ".tsx", ".vue", ".svelte", ".astro", ".php",
        ".handlebars", ".hbs", ".ejs", ".pug", ".twig", ".liquid",
        # Systems & Core Languages
        ".c", ".h", ".cpp", ".hpp", ".cc", ".cxx", ".rs", ".go", ".zig", ".nim", ".asm", ".s",
        ".java", ".cs", ".kt", ".kts", ".scala", ".groovy",
        ".py", ".rb", ".pl", ".pm", ".lua", ".r",
        # Functional & Mobile/Apple
        ".hs", ".ml", ".mli", ".clj", ".cljs", ".cljc", ".ex", ".exs",
        ".jl", ".swift", ".m", ".dart",
        # Web3 & Hardware
        ".sol", ".v",
        # Shells & Notebooks
        ".sh", ".bash", ".zsh", ".bat", ".cmd", ".ps1",
        ".ipynb"
    }
}
# Glob patterns for paths that are skipped entirely.
# Patterns ending in "/**" target whole directories; the rest target single files.
EXCLUDE_PATTERNS = [
    # --- Version control metadata ---
    "**/.git/**",
    "**/.svn/**",
    "**/.hg/**",

    # --- Third-party dependencies ---
    "**/node_modules/**",
    "**/bower_components/**",
    "**/venv/**",
    "**/.venv/**",
    "**/env/**",
    "**/python_env/**",
    "**/vendor/**",
    "**/deps/**",
    "**/packages/**",
    "**/Pods/**",

    # --- Build outputs ---
    "**/dist/**",
    "**/build/**",
    "**/out/**",
    "**/target/**",
    "**/bin/**",
    "**/obj/**",
    "**/_build/**",

    # --- Framework / tooling working directories ---
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.gradle/**",
    "**/.mvn/**",
    "**/.dart_tool/**",
    "**/.pub-cache/**",
    "**/.serverless/**",

    # --- Python, test and cache directories ---
    "**/__pycache__/**",
    "**/.pytest_cache/**",
    "**/.tox/**",
    "**/.mypy_cache/**",
    "**/.ruff_cache/**",
    "**/*.egg-info/**",

    # --- Coverage reports and logs ---
    "**/coverage/**",
    "**/.nyc_output/**",
    "**/*.log",

    # --- IDE settings and OS junk ---
    "**/.vscode/**",
    "**/.idea/**",
    "**/.vs/**",
    "**/.DS_Store",
    "**/thumbs.db",

    # --- Temporary directories ---
    "**/tmp/**",
    "**/temp/**",

    # --- File-level exclusions (dotfiles, env files, linter configs) ---
    "**/.gitignore",
    "**/.dockerignore",
    "**/.npmignore",
    "**/.env",
    "**/.env.*",
    "**/.editorconfig",
    "**/.prettierrc",
    "**/.eslintrc",
    "**/.stylelintrc",
]
# Text fragments whose presence suggests a file was produced by a tool rather
# than written by hand. Every entry is lowercase.
# NOTE(review): presumably matched as substrings against lower-cased file
# content — confirm in the scanner that consumes this list.
AUTO_GENERATED_MARKERS = [
    # Generic banners seen across languages
    "this file is auto-generated",
    "this file was auto-generated",
    "this file is automatically generated",
    "this file was automatically generated",
    "auto-generated by", "auto generated by",
    "automatically generated by", "generated automatically",
    "do not edit this file", "do not edit - generated",
    "do not modify this file", "do not modify - generated",
    "changes will be overwritten",
    "any changes made to this file will be lost",
    "any manual changes will be overwritten",
    "regenerate this file",

    # Protobuf / gRPC compilers
    "generated by protoc",
    "generated by the protocol buffer compiler",
    "generated by protoc-gen-go", "generated by protoc-gen-grpc",
    "generated by protoc-gen-ts", "generated by protoc-gen-js",
    # Hint commonly found in protoc output headers.
    "source: proto/",

    # OpenAPI / Swagger generators
    "generated by openapi", "generated by swagger",
    "generated by swagger-codegen", "generated by openapi-generator",
    "do not edit the generated code",

    # GraphQL code generators
    "generated by graphql-codegen",
    "generated by graphql code generator",
    # Pragma emitted by relay / graphql-codegen.
    "@generated graphql",
    # Frequently prepended by code generators.
    "/* eslint-disable */",

    # Go tooling ("// Code generated ... DO NOT EDIT." convention)
    "code generated by go generate",
    "// code generated",
    "// generated by",
    "do not edit.",

    # Rust (build.rs / prost / tonic)
    "// @generated", "generated by prost", "generated by tonic",

    # Java / Kotlin annotation processors
    "@javax.annotation.generated", "@jakarta.annotation.generated",
    "generated by dagger", "generated by hilt",
    # Android Room DAO implementations.
    "generated by room",
    "generated by kapt", "generated by ksp",

    # C# / .NET (Visual Studio designer files and the standard tool header)
    "<autogenerated>",
    "// <auto-generated>",
    "// <autogenerated />",
    'tool = "resgen"',
    "generated by microsoft", "generated by dotnet",
    "this code was generated by a tool",

    # TypeScript / JavaScript
    # "// @ts-nocheck" is a weak signal on its own; combine with others.
    "// @ts-nocheck",
    "generated by ts-proto", "generated by typechain", "generated by wagmi",
    "this is a generated file",
    "@auto-generated",

    # Python
    "# generated by",
    "# this file was generated by",
    "# auto-generated",
    "# do not edit",
    "generated by grpc_tools", "generated by betterproto",
    "generated by datamodel-codegen",
    # Alembic migration hint.
    "generated by sqlalchemy",

    # Build systems / IDEs
    "generated by cmake", "generated by bazel", "generated by buck",
    "generated by gradle", "generated by xcode",
    "generated by android studio",
    # Flutter, plus the Dart freezed / json_serializable builders.
    "generated by flutter", "generated by freezed", "generated by json_serializable",

    # Miscellaneous tools
    "generated by prisma", "generated by drizzle", "generated by sqlc",
    # buf.build protobuf toolchain.
    "generated by buf",
    # Mock generators (mockery is the Go one).
    "generated by mockery", "generated by moq",
    # Google Wire dependency injection.
    "generated by wire",
    "generated by copier",
    # Go stringer tool, easyjson.
    "generated by stringer", "generated by easyjson",
    # Weak signal; combine with the file extension.
    "lint: disable",

    # Language-agnostic pragmas ("@generated" is used by Hack, Flow and some JS tools)
    "@generated", "/* generated */", "// generated",
]
# File suffixes whose contents are checked for auto-generation headers.
# NOTE(review): the multi-part suffixes (".pb.go", ".g.cs", ".generated.ts", ...)
# only take effect if the consumer compares full suffixes rather than the final
# extension alone — confirm against the scanner.
AUTO_GEN_SCAN_EXTENSIONS = {
    # Core languages
    ".py",
    ".ts",
    ".js",
    ".cs",
    ".java",
    ".kt",
    ".go",
    ".rs",
    # React code-gen output (relay, graphql-codegen)
    ".tsx",
    ".jsx",
    # Flutter / freezed
    ".dart",
    # Protobuf definitions themselves
    ".proto",
    # Compiled protobuf output
    ".pb.go",
    ".pb.ts",
    # .NET generated-file suffixes
    ".g.cs",
    ".designer.cs",
    # Flutter generated
    ".g.dart",
    # Convention-based suffix (the whole suffix acts as the marker)
    ".generated.ts",
    # C/C++ code-gen (flatbuffers, protobuf, etc.)
    ".h",
    ".cpp",
    ".cc",
    # Xcode / SwiftGen / Sourcery
    ".swift",
    # Rails generators
    ".rb",
    # Doctrine, Symfony generators
    ".php",
}
# Maps a file extension to a language identifier for AST-based splitting.
# NOTE(review): the identifiers presumably have to match the grammar names the
# AST splitter library accepts — confirm against that library's docs.
AST_BASED_SPLITTING = {
    # --- General-purpose programming languages ---
    ".c": "c",
    ".h": "c",
    ".cpp": "cpp",
    ".hpp": "cpp",
    ".cc": "cpp",
    ".cxx": "cpp",
    ".cs": "csharp",
    ".dart": "dart",
    ".go": "go",
    ".java": "java",
    ".js": "javascript",
    ".jsx": "javascript",
    ".jl": "julia",
    ".kt": "kotlin",
    ".kts": "kotlin",
    ".nim": "nim",
    ".ml": "ocaml",
    ".mli": "ocaml",
    ".pl": "perl",
    ".pm": "perl",
    ".py": "python",
    ".r": "r",
    ".rb": "ruby",
    ".rs": "rust",
    ".scala": "scala",
    ".swift": "swift",
    ".ts": "typescript",
    # TSX has its own explicit key in the docs, so it is not folded into "typescript".
    ".tsx": "tsx",
    ".zig": "zig",

    # --- Web, UI and markup ---
    ".html": "html",
    ".css": "css",
    ".scss": "scss",
    ".astro": "astro",
    ".vue": "vue",
    ".svelte": "svelte",
    ".xml": "xml",
    ".yaml": "yaml",
    ".yml": "yaml",

    # --- Config and DevOps ---
    ".sh": "bash",
    ".bash": "bash",
    ".zsh": "bash",
    ".gitignore": "gitignore",

    # --- Systems and low-level ---
    ".asm": "asm",
    ".s": "asm",
    ".v": "verilog",
}
# Chunking parameters: size of each chunk and how much consecutive chunks overlap.
# NOTE(review): the unit (characters vs. tokens) is decided by the splitter that
# reads these values — confirm there.
CHUNK_SIZE = 2048
CHUNK_OVERLAP = 200

# Chroma vector-store settings: persistence directory (relative path) and
# the name of the collection.
CHROMA_PERSIST_DIR = "./generated_chroma_database"
CHROMA_COLLECTION_NAME = "vector_db"
# Opening section of the researcher agent's system prompt: states the agent's
# role (Junior Code Researcher) and that it reports only to the Supervisor.
# The literal is runtime prompt text — edit wording deliberately.
AGENT_SYSTEM_PROMPT_HEADER = """
You are a Junior Code Researcher working in the backend of a Multi-Agent RAG system. You have access to tools to access a locally stored codebase.
**YOUR ROLE & AUDIENCE:**
- You explore the repository using tools to find precise answers.
- You do NOT interact with the end-user directly.
- You report exclusively to the Lead Code Architect (Supervisor).
- Your job is to do the heavy lifting: use tools to explore, read files, gather context, and decide when you have enough raw data for the Supervisor to formulate a response.
"""
# Tool-selection decision tree for the researcher agent, covering all six tools
# including `keyword_code_search` and `semantic_code_search`.
# The dash in the heading and the arrows are literal UTF-8 characters (— and →);
# keep this file saved as UTF-8 so they are not mis-decoded.
AGENT_SYSTEM_PROMPT_TOOLS = """
## TOOL SELECTION — DECISION TREE
Work through this decision tree for EVERY search action:
1. **Do you have an exact string to find?** (function name, class name, variable)
→ Use `exact_code_search`. This is always your first move for anything concrete.
2. **Did exact search fail OR do you have related keywords?**
→ Use `keyword_code_search`. DO NOT USE QUOTES for multi-term strings (BM25 tokenizes input). Search single, distinct words: e.g., `database pool`, NOT `"database pool"`.
3. **Are you exploring an abstract concept?**
→ Use `semantic_code_search` with a natural language phrase. Use this LAST.
4. **Do you need to understand folder structure?**
→ Use `list_directory_contents`.
5. **Do you need to verify a file exists?**
→ Use `find_file_path_by_pattern`.
6. **Do you need to read a file's contents?**
→ Use `get_specific_file`."""
# Tool-selection decision tree for the researcher agent when vector search is
# disabled: only exact search, file-pattern lookup, directory listing and file
# reading are offered.
# The dash in the heading and the arrows are literal UTF-8 characters (— and →);
# keep this file saved as UTF-8 so they are not mis-decoded.
AGENT_SYSTEM_PROMPT_TOOLS_NO_DB = """
## TOOL SELECTION — DECISION TREE
Work through this decision tree for EVERY search action. (Note: Vector search is currently disabled; rely on exact matching and structural exploration):
1. **Do you have an exact string to find?** (function name, class name, variable)
→ Use `exact_code_search`. This is your primary discovery tool. Think of distinct, unique variable or function names to grep for.
2. **Do you need to verify a file exists or find files by extension/name?**
→ Use `find_file_path_by_pattern`. Use this to locate config files, routes, or models when exact code strings aren't obvious.
3. **Do you need to understand folder structure or find where components live?**
→ Use `list_directory_contents`.
4. **Do you need to read a file's contents?**
→ Use `get_specific_file`."""
# Closing section of the researcher agent's system prompt: file-reading rules,
# search-efficiency rules, the tool-call budget, and the exact
# "[RESEARCH COMPLETE]" output format the agent must finish with.
# The ±, – and — characters are literal UTF-8; keep this file saved as UTF-8 so
# they are not mis-decoded.
AGENT_SYSTEM_PROMPT_FOOTER = """
## READING FILES — STRICT RULES
- **Never read a full file blindly.** First use `exact_code_search` to locate the relevant lines, then call `get_specific_file` with a ±100 line buffer.
- **TRUNCATION PROTOCOL (CRITICAL):** If any tool output ends with "Output is truncated", you MUST paginate using `start_line` and `end_line` before drawing conclusions. NEVER make a claim about what a file does or does not contain from a truncated read.
- Never guess line ranges like 1–200.
## SEARCH EFFICIENCY RULES
- **HIGH-VALUE LEAD PROTOCOL:** If you find a struct field, trait method, or config key that is directly relevant (e.g., `idempotent_hint`), you MUST read its full definition and every call site before moving on. Do not pivot after one partial read.
- **DEAD END ESCALATION:** If you have searched for the same concept 3+ times with no useful result, explicitly state: "CONCEPT NOT EXPLICITLY DOCUMENTED — best available evidence is [X]" and stop searching for it. Do not retry with minor keyword variations.
- Limit parallel exploration. Follow one lead to completion, then pivot.
**REASONING & ANSWER PROTOCOL:**
- Do not exceed 10 tool calls per query without pausing to reassess.
- If you receive "SUPERVISOR FEEDBACK:", your previous research was incomplete. Do not apologize. Read the instructions and find exactly what is missing.
**YOUR FINAL OUTPUT:**
Once you have gathered enough information, you must immediately stop.
OUTPUT FORMAT: You must output ONLY this exact format. Do not add summaries, insights, or explanations.
[RESEARCH COMPLETE]
Files read:
- <file_path_1>
- <file_path_2>
...
"""
# System prompt for the supervising "Lead Code Architect" agent. It defines the
# split of duties (researcher retrieves, supervisor synthesizes), the conditions
# that force status="REJECT", and the two outcomes (ACCEPT with a user-facing
# answer, or REJECT with feedback for the researcher).
# The literal is runtime prompt text — edit wording deliberately.
SUPERVISOR_SYSTEM_PROMPT = """
You are the Lead Code Architect in a Multi-Agent RAG system.
Our platform helps users understand public code libraries.
*** CRITICAL DIVISION OF LABOR (READ CAREFULLY) ***
- The Junior Researcher (the agent) is ONLY a "Retriever". Their job is to call tools and output "[RESEARCH COMPLETE]". They are strictly forbidden from writing summaries.
- YOU are the "Synthesizer". If the raw tool outputs in the message history contain enough information, YOU must write the final answer for the user and set status="ACCEPT".
- DO NOT reject the researcher just because their final message is a short list of files. That is by design! Evaluate them based on the RAW TOOL OUTPUTS above their final message.
*** MANDATORY REJECTION TRIGGERS ***
Output status="REJECT" if ANY of the following are true:
- The agent treated a truncated file as complete without paginating.
- The user asked for code/syntax, but the tool outputs only show file names.
- The agent ignored a high-value lead without checking its call sites.
*** EVALUATION PATHS ***
1. SUCCESS: Set status="ACCEPT". Write an exhaustive, highly detailed response addressed to the user.
2. REWORK: Set status="REJECT". Write strict, non-repetitive, targeted feedback addressed to the Junior Researcher. DO NOT address the user.
"""
# File-count threshold related to building the vector database.
# NOTE(review): presumably repositories with more files than this skip vector-DB
# creation (or are capped at it) — confirm in the code that reads this constant.
MAX_FILES_TO_CREATE_VECTOR_DB = 6000
|