File size: 16,792 Bytes
9d84f4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
# File extensions accepted for processing, grouped into named buckets.
# NOTE(review): the bucket names read like per-file size caps in KB, with
# "no_limit" meaning uncapped — confirm against the code that reads this table.
SUPPORTED_TYPES = dict(
    limit_20kb={".json", ".xml", ".csv", ".tsv", ".jsonl"},
    limit_30kb={".css", ".scss", ".sass", ".less"},
    limit_50kb={".yaml", ".yml"},
    limit_2048kb={".pdf"},
    no_limit={
        # Prose and documentation formats
        ".md", ".mdx", ".txt", ".rst",
        ".asciidoc", ".adoc", ".tex",

        # Configuration and infrastructure-as-code
        ".toml", ".ini", ".cfg", ".conf", ".properties",
        ".hcl", ".tf", ".tfvars",
        ".gitignore", ".dockerignore", ".editorconfig",
        ".nvmrc", ".npmignore",

        # Web front-end and server-side templating
        ".html", ".js", ".jsx", ".ts", ".tsx",
        ".vue", ".svelte", ".astro", ".php",
        ".handlebars", ".hbs", ".ejs",
        ".pug", ".twig", ".liquid",

        # Systems languages
        ".c", ".h", ".cpp", ".hpp", ".cc", ".cxx",
        ".rs", ".go", ".zig", ".nim", ".asm", ".s",

        # JVM and .NET
        ".java", ".cs", ".kt", ".kts", ".scala", ".groovy",

        # Scripting
        ".py", ".rb", ".pl", ".pm", ".lua", ".r",

        # Functional languages
        ".hs", ".ml", ".mli",
        ".clj", ".cljs", ".cljc",
        ".ex", ".exs",

        # Scientific and mobile / Apple
        ".jl", ".swift", ".m", ".dart",

        # Smart contracts and hardware description
        ".sol", ".v",

        # Shell scripts and notebooks
        ".sh", ".bash", ".zsh",
        ".bat", ".cmd", ".ps1",
        ".ipynb",
    },
)

# Glob-style path patterns for directories and files that should be skipped.
# NOTE(review): the matcher that consumes this list is not visible here; if it
# compares case-sensitively, every spelling of a file name needs its own entry.
EXCLUDE_PATTERNS = [

    # ── VCS ─────────────────────────────
    "**/.git/**", "**/.svn/**", "**/.hg/**",

    # ── Dependencies ────────────────────
    "**/node_modules/**", "**/bower_components/**",
    "**/venv/**", "**/.venv/**", "**/env/**", "**/python_env/**",
    "**/vendor/**", "**/deps/**", "**/packages/**", "**/Pods/**",

    # ── Build Outputs ───────────────────
    "**/dist/**", "**/build/**", "**/out/**", "**/target/**",
    "**/bin/**", "**/obj/**", "**/_build/**",

    # ── Framework / Tooling ─────────────
    "**/.next/**", "**/.nuxt/**", "**/.svelte-kit/**",
    "**/.gradle/**", "**/.mvn/**",
    "**/.dart_tool/**", "**/.pub-cache/**",
    "**/.serverless/**",

    # ── Python / Test / Cache ───────────
    "**/__pycache__/**", "**/.pytest_cache/**", "**/.tox/**",
    "**/.mypy_cache/**", "**/.ruff_cache/**",
    "**/*.egg-info/**",

    # ── Coverage / Logs ─────────────────
    "**/coverage/**", "**/.nyc_output/**",
    "**/*.log",

    # ── IDE / OS Junk ───────────────────
    "**/.vscode/**", "**/.idea/**", "**/.vs/**",
    "**/.DS_Store",
    # Windows Explorer names its thumbnail cache "Thumbs.db" (capital T), so a
    # case-sensitive matcher needs that spelling; the lower-case entry is kept
    # so anything it already matched is still excluded.
    "**/thumbs.db", "**/Thumbs.db",

    # ── Temp ────────────────────────────
    "**/tmp/**", "**/temp/**",

    # ── File-level exclusions ───────────
    "**/.gitignore",
    "**/.dockerignore",
    "**/.npmignore",
    "**/.env",
    "**/.env.*",
    "**/.editorconfig",
    "**/.prettierrc",
    "**/.eslintrc",
    "**/.stylelintrc",
]

# Markers that indicate an auto-generated file.
# Every entry is written in lower case.
# NOTE(review): presumably compared against lower-cased file text — confirm in
# the code that scans for these.
AUTO_GENERATED_MARKERS = [
    marker
    for family in (
        # Generic banners used across languages
        (
            "this file is auto-generated",
            "this file was auto-generated",
            "this file is automatically generated",
            "this file was automatically generated",
            "auto-generated by",
            "auto generated by",
            "automatically generated by",
            "generated automatically",
            "do not edit this file",
            "do not edit - generated",
            "do not modify this file",
            "do not modify - generated",
            "changes will be overwritten",
            "any changes made to this file will be lost",
            "any manual changes will be overwritten",
            "regenerate this file",
        ),
        # Protobuf / gRPC
        (
            "generated by protoc",
            "generated by the protocol buffer compiler",
            "generated by protoc-gen-go",
            "generated by protoc-gen-grpc",
            "generated by protoc-gen-ts",
            "generated by protoc-gen-js",
            # header hint commonly emitted by protoc
            "source: proto/",
        ),
        # OpenAPI / Swagger
        (
            "generated by openapi",
            "generated by swagger",
            "generated by swagger-codegen",
            "generated by openapi-generator",
            "do not edit the generated code",
        ),
        # GraphQL
        (
            "generated by graphql-codegen",
            "generated by graphql code generator",
            # pragma used by relay / graphql-codegen
            "@generated graphql",
            # almost always prepended by codegen
            "/* eslint-disable */",
        ),
        # Go tooling
        (
            "code generated by go generate",
            # official Go convention (go generate)
            "// code generated",
            "// generated by",
            # tail of Go's "// Code generated ... DO NOT EDIT." line
            "do not edit.",
        ),
        # Rust (build.rs / prost / tonic)
        (
            "// @generated",
            "generated by prost",
            "generated by tonic",
        ),
        # Java / Kotlin
        (
            "@javax.annotation.generated",
            "@jakarta.annotation.generated",
            "generated by dagger",
            "generated by hilt",
            # Android Room DAO implementations
            "generated by room",
            "generated by kapt",
            "generated by ksp",
        ),
        # C# / .NET
        (
            # Visual Studio designer files
            "<autogenerated>",
            "// <auto-generated>",
            "// <autogenerated />",
            'tool = "resgen"',
            "generated by microsoft",
            "generated by dotnet",
            # .NET standard header
            "this code was generated by a tool",
        ),
        # TypeScript / JavaScript
        (
            # weak signal; combine with others
            "// @ts-nocheck",
            "generated by ts-proto",
            "generated by typechain",
            "generated by wagmi",
            "this is a generated file",
            "@auto-generated",
        ),
        # Python
        (
            "# generated by",
            "# this file was generated by",
            "# auto-generated",
            "# do not edit",
            "generated by grpc_tools",
            "generated by betterproto",
            "generated by datamodel-codegen",
            # alembic migration hint
            "generated by sqlalchemy",
        ),
        # Build systems / IDEs
        (
            "generated by cmake",
            "generated by bazel",
            "generated by buck",
            "generated by gradle",
            "generated by xcode",
            "generated by android studio",
            "generated by flutter",
            # Dart / Flutter
            "generated by freezed",
            "generated by json_serializable",
        ),
        # Miscellaneous tools
        (
            "generated by prisma",
            "generated by drizzle",
            "generated by sqlc",
            # buf.build protobuf toolchain
            "generated by buf",
            # Go mock generator
            "generated by mockery",
            "generated by moq",
            # Google Wire DI
            "generated by wire",
            "generated by copier",
            # Go stringer tool
            "generated by stringer",
            "generated by easyjson",
            # weak; combine with file extension
            "lint: disable",
        ),
        # Pragma-style, language-agnostic
        (
            # used by Hack, Flow, some JS tools
            "@generated",
            "/* generated */",
            "// generated",
        ),
    )
    for marker in family
]

# File suffixes that are scanned for auto-generation headers.
# NOTE(review): the multi-dot entries (".pb.go", ".g.dart", ...) only take
# effect if the scanner compares whole compound suffixes rather than just the
# final extension — confirm against the caller.
AUTO_GEN_SCAN_EXTENSIONS = {
    # Baseline languages
    ".py",
    ".ts",
    ".js",
    ".cs",
    ".java",
    ".kt",
    ".go",
    ".rs",

    # React code-gen output (relay, graphql-codegen)
    ".tsx",
    ".jsx",

    # Dart / Flutter (freezed and friends)
    ".dart",
    ".g.dart",

    # Protobuf definitions and their compiled output
    ".proto",
    ".pb.go",
    ".pb.ts",

    # .NET generated-file suffixes
    ".g.cs",
    ".designer.cs",

    # Convention-based suffix; the whole suffix acts as the marker
    ".generated.ts",

    # C / C++ code generators (flatbuffers, protobuf, ...)
    ".h",
    ".cpp",
    ".cc",

    # Xcode / SwiftGen / Sourcery
    ".swift",

    # Rails generators
    ".rb",

    # Doctrine / Symfony generators
    ".php",
}

# Maps a file extension to the language key used for AST-based splitting.
# Declared as (language, extensions) pairs and flattened, so each language's
# extensions sit together; the resulting key order matches the declaration order.
# NOTE(review): the values look like parser/grammar names — confirm which
# library consumes them.
AST_BASED_SPLITTING = {
    extension: language
    for language, extensions in (
        # General-purpose programming
        ("c", (".c", ".h")),
        ("cpp", (".cpp", ".hpp", ".cc", ".cxx")),
        ("csharp", (".cs",)),
        ("dart", (".dart",)),
        ("go", (".go",)),
        ("java", (".java",)),
        ("javascript", (".js", ".jsx")),
        ("julia", (".jl",)),
        ("kotlin", (".kt", ".kts")),
        ("nim", (".nim",)),
        ("ocaml", (".ml", ".mli")),
        ("perl", (".pl", ".pm")),
        ("python", (".py",)),
        ("r", (".r",)),
        ("ruby", (".rb",)),
        ("rust", (".rs",)),
        ("scala", (".scala",)),
        ("swift", (".swift",)),
        ("typescript", (".ts",)),
        # TSX is deliberately keyed separately from plain TypeScript
        ("tsx", (".tsx",)),
        ("zig", (".zig",)),

        # Web, UI and markup
        ("html", (".html",)),
        ("css", (".css",)),
        ("scss", (".scss",)),
        ("astro", (".astro",)),
        ("vue", (".vue",)),
        ("svelte", (".svelte",)),
        ("xml", (".xml",)),
        ("yaml", (".yaml", ".yml")),

        # Config and DevOps
        ("bash", (".sh", ".bash", ".zsh")),
        ("gitignore", (".gitignore",)),

        # Systems and low-level
        ("asm", (".asm", ".s")),
        ("verilog", (".v",)),
    )
    for extension in extensions
}

# Chunking parameters: size of each chunk and the overlap between neighbours.
# NOTE(review): the unit (characters vs. tokens) is not stated here — confirm
# with the splitter that reads these.
CHUNK_SIZE = 2048
CHUNK_OVERLAP = 200

# Chroma settings: on-disk persistence directory and collection name.
CHROMA_PERSIST_DIR = "./generated_chroma_database"
CHROMA_COLLECTION_NAME = "vector_db"




# Opening section of the research agent's system prompt: states the agent's
# role (tool-using code researcher) and its audience (the supervisor, not the
# end user).
# NOTE(review): how this is combined with the other AGENT_SYSTEM_PROMPT_*
# pieces is not visible in this part of the file.
AGENT_SYSTEM_PROMPT_HEADER = """ 
You are a Junior Code Researcher working in the backend of a Multi-Agent RAG system. You have access to tools to access a locally stored codebase.
    **YOUR ROLE & AUDIENCE:**
    - You explore the repository using tools to find precise answers.
    - You do NOT interact with the end-user directly. 
    - You report exclusively to the Lead Code Architect (Supervisor).
    - Your job is to do the heavy lifting: use tools to explore, read files, gather context, and decide when you have enough raw data for the Supervisor to formulate a response.
"""

# Tool-selection section of the research agent's system prompt. This variant
# covers all six tools, including keyword and semantic search.
# The em dash in the heading and the arrows before each "Use ..." line were
# previously stored as mis-decoded byte sequences; they are restored here so
# the model receives the intended punctuation. No other text is changed.
AGENT_SYSTEM_PROMPT_TOOLS = """   
## TOOL SELECTION — DECISION TREE
    Work through this decision tree for EVERY search action:
    1. **Do you have an exact string to find?** (function name, class name, variable)
        → Use `exact_code_search`. This is always your first move for anything concrete.
    2. **Did exact search fail OR do you have related keywords?**
        → Use `keyword_code_search`. DO NOT USE QUOTES for multi-term strings (BM25 tokenizes input). Search single, distinct words: e.g., `database pool`, NOT `"database pool"`.
    3. **Are you exploring an abstract concept?**
        → Use `semantic_code_search` with a natural language phrase. Use this LAST.
    4. **Do you need to understand folder structure?**
        → Use `list_directory_contents`.
    5. **Do you need to verify a file exists?**
        → Use `find_file_path_by_pattern`.
    6. **Do you need to read a file's contents?**
        → Use `get_specific_file`."""

# Tool-selection section of the research agent's system prompt for the case
# where vector search is disabled (as the prompt text itself states). It lists
# only exact search, file-pattern lookup, directory listing and file reading.
# The em dash in the heading and the arrows before each "Use ..." line were
# previously stored as mis-decoded byte sequences; they are restored here so
# the model receives the intended punctuation. No other text is changed.
AGENT_SYSTEM_PROMPT_TOOLS_NO_DB = """   
## TOOL SELECTION — DECISION TREE
    Work through this decision tree for EVERY search action. (Note: Vector search is currently disabled; rely on exact matching and structural exploration):
    1. **Do you have an exact string to find?** (function name, class name, variable)
        → Use `exact_code_search`. This is your primary discovery tool. Think of distinct, unique variable or function names to grep for.
    2. **Do you need to verify a file exists or find files by extension/name?**
        → Use `find_file_path_by_pattern`. Use this to locate config files, routes, or models when exact code strings aren't obvious.
    3. **Do you need to understand folder structure or find where components live?**
        → Use `list_directory_contents`.
    4. **Do you need to read a file's contents?**
        → Use `get_specific_file`."""

# Closing section of the research agent's system prompt: file-reading rules,
# search-efficiency rules, the tool-call budget, and the exact final output
# format ("[RESEARCH COMPLETE]" followed by the list of files read).
# The two em dashes and the plus-minus sign were previously stored as
# mis-decoded byte sequences; they are restored here so the model receives the
# intended punctuation. No other text is changed.
AGENT_SYSTEM_PROMPT_FOOTER = """
   ## READING FILES — STRICT RULES
    - **Never read a full file blindly.** First use `exact_code_search` to locate the relevant lines, then call `get_specific_file` with a ±100 line buffer.
    - **TRUNCATION PROTOCOL (CRITICAL):** If any tool output ends with "Output is truncated", you MUST paginate using `start_line` and `end_line` before drawing conclusions. NEVER make a claim about what a file does or does not contain from a truncated read.
    - Never guess line ranges like 1–200.

    ## SEARCH EFFICIENCY RULES
    - **HIGH-VALUE LEAD PROTOCOL:** If you find a struct field, trait method, or config key that is directly relevant (e.g., `idempotent_hint`), you MUST read its full definition and every call site before moving on. Do not pivot after one partial read.
    - **DEAD END ESCALATION:** If you have searched for the same concept 3+ times with no useful result, explicitly state: "CONCEPT NOT EXPLICITLY DOCUMENTED — best available evidence is [X]" and stop searching for it. Do not retry with minor keyword variations.
    - Limit parallel exploration. Follow one lead to completion, then pivot.

    **REASONING & ANSWER PROTOCOL:**
    - Do not exceed 10 tool calls per query without pausing to reassess.
    - If you receive "SUPERVISOR FEEDBACK:", your previous research was incomplete. Do not apologize. Read the instructions and find exactly what is missing.

    **YOUR FINAL OUTPUT:**
    Once you have gathered enough information, you must immediately stop. 
    OUTPUT FORMAT: You must output ONLY this exact format. Do not add summaries, insights, or explanations. 

    [RESEARCH COMPLETE]
    Files read:
    - <file_path_1>
    - <file_path_2>
    ...
"""


# System prompt for the supervising "Lead Code Architect" role. It tells the
# supervisor to judge the researcher on the raw tool outputs in the message
# history, lists the conditions that force status="REJECT", and defines the two
# outcomes: ACCEPT (write the answer for the user) or REJECT (write feedback
# for the researcher).
SUPERVISOR_SYSTEM_PROMPT = """
    You are the Lead Code Architect in a Multi-Agent RAG system. 
    Our platform helps users understand public code libraries.

    *** CRITICAL DIVISION OF LABOR (READ CAREFULLY) ***
    - The Junior Researcher (the agent) is ONLY a "Retriever". Their job is to call tools and output "[RESEARCH COMPLETE]". They are strictly forbidden from writing summaries.
    - YOU are the "Synthesizer". If the raw tool outputs in the message history contain enough information, YOU must write the final answer for the user and set status="ACCEPT".
    - DO NOT reject the researcher just because their final message is a short list of files. That is by design! Evaluate them based on the RAW TOOL OUTPUTS above their final message.

    *** MANDATORY REJECTION TRIGGERS ***
    Output status="REJECT" if ANY of the following are true:
    - The agent treated a truncated file as complete without paginating.
    - The user asked for code/syntax, but the tool outputs only show file names.
    - The agent ignored a high-value lead without checking its call sites.

    *** EVALUATION PATHS ***
    1. SUCCESS: Set status="ACCEPT". Write an exhaustive, highly detailed response addressed to the user.
    2. REWORK: Set status="REJECT". Write strict, non-repetitive, targeted feedback addressed to the Junior Researcher. DO NOT address the user.
"""


# File-count threshold associated with building the vector database.
# NOTE(review): presumably repositories with more files than this skip vector
# indexing (cf. AGENT_SYSTEM_PROMPT_TOOLS_NO_DB) — confirm against the caller.
MAX_FILES_TO_CREATE_VECTOR_DB = 6000