# Spaces:
# Sleeping
# Sleeping
# File extensions accepted for processing, grouped into tiers. Each tier maps
# to a set of lowercase, dot-prefixed extensions.
# NOTE(review): the tier keys read like per-file size caps ("limit_20kb",
# "no_limit", ...); the enforcement is not in this block - confirm against
# the code that consumes this mapping.
# NOTE(review): ".gitignore", ".dockerignore", ".editorconfig" and
# ".npmignore" are listed here and also appear in EXCLUDE_PATTERNS in this
# file - confirm which of the two is meant to win.
SUPPORTED_TYPES = {
    "limit_20kb": {
        ".json", ".xml", ".csv", ".tsv", ".jsonl",
    },
    "limit_30kb": {
        ".css", ".scss", ".sass", ".less",
    },
    "limit_50kb": {
        ".yaml", ".yml",
    },
    "limit_2048kb": {
        ".pdf",
    },
    "no_limit": {
        # Documentation & Text
        ".md", ".mdx", ".txt", ".rst", ".asciidoc", ".adoc", ".tex",
        # Config & Infrastructure
        ".toml", ".ini", ".cfg", ".conf", ".properties", ".hcl", ".tf", ".tfvars",
        ".gitignore", ".dockerignore", ".editorconfig", ".nvmrc", ".npmignore",
        # Web, UI & Templating
        ".html", ".js", ".jsx", ".ts", ".tsx", ".vue", ".svelte", ".astro", ".php",
        ".handlebars", ".hbs", ".ejs", ".pug", ".twig", ".liquid",
        # Systems & Core Languages
        ".c", ".h", ".cpp", ".hpp", ".cc", ".cxx", ".rs", ".go", ".zig", ".nim", ".asm", ".s",
        ".java", ".cs", ".kt", ".kts", ".scala", ".groovy",
        ".py", ".rb", ".pl", ".pm", ".lua", ".r",
        # Functional & Mobile/Apple
        ".hs", ".ml", ".mli", ".clj", ".cljs", ".cljc", ".ex", ".exs",
        ".jl", ".swift", ".m", ".dart",
        # Web3 & Hardware
        ".sol", ".v",
        # Shells & Notebooks
        ".sh", ".bash", ".zsh", ".bat", ".cmd", ".ps1",
        ".ipynb",
    },
}
# Glob patterns for paths that should be skipped. Directory patterns use the
# "**/<dir>/**" form; file patterns use "**/<name>".
# NOTE(review): how these are matched (fnmatch, pathspec, ...) is not visible
# in this block - confirm against the file walker.
EXCLUDE_PATTERNS = [
    # -- VCS --
    "**/.git/**", "**/.svn/**", "**/.hg/**",
    # -- Dependencies --
    "**/node_modules/**", "**/bower_components/**",
    "**/venv/**", "**/.venv/**", "**/env/**", "**/python_env/**",
    "**/vendor/**", "**/deps/**", "**/packages/**", "**/Pods/**",
    # -- Build outputs --
    "**/dist/**", "**/build/**", "**/out/**", "**/target/**",
    "**/bin/**", "**/obj/**", "**/_build/**",
    # -- Framework / tooling --
    "**/.next/**", "**/.nuxt/**", "**/.svelte-kit/**",
    "**/.gradle/**", "**/.mvn/**",
    "**/.dart_tool/**", "**/.pub-cache/**",
    "**/.serverless/**",
    # -- Python / test / cache --
    "**/__pycache__/**", "**/.pytest_cache/**", "**/.tox/**",
    "**/.mypy_cache/**", "**/.ruff_cache/**",
    "**/*.egg-info/**",
    # -- Coverage / logs --
    "**/coverage/**", "**/.nyc_output/**",
    "**/*.log",
    # -- IDE / OS junk --
    "**/.vscode/**", "**/.idea/**", "**/.vs/**",
    "**/.DS_Store", "**/thumbs.db",
    # -- Temp --
    "**/tmp/**", "**/temp/**",
    # -- File-level exclusions (new) --
    "**/.gitignore",
    "**/.dockerignore",
    "**/.npmignore",
    "**/.env",
    "**/.env.*",
    "**/.editorconfig",
    "**/.prettierrc",
    "**/.eslintrc",
    "**/.stylelintrc",
]
# Markers that indicate an auto-generated file.
# Every entry is lowercase.
# NOTE(review): presumably matched as substrings of lowercased file text -
# confirm against the scanner. If so, some broad entries ("@generated",
# "// generated", "generated by protoc") already cover the more specific
# ones listed next to them.
AUTO_GENERATED_MARKERS = [
    # -- Generic / cross-language --
    "this file is auto-generated",
    "this file was auto-generated",
    "this file is automatically generated",
    "this file was automatically generated",
    "auto-generated by",
    "auto generated by",
    "automatically generated by",
    "generated automatically",
    "do not edit this file",
    "do not edit - generated",
    "do not modify this file",
    "do not modify - generated",
    "changes will be overwritten",
    "any changes made to this file will be lost",
    "any manual changes will be overwritten",
    "regenerate this file",
    # -- Protobuf / gRPC --
    "generated by protoc",
    "generated by the protocol buffer compiler",
    "generated by protoc-gen-go",
    "generated by protoc-gen-grpc",
    "generated by protoc-gen-ts",
    "generated by protoc-gen-js",
    "source: proto/",  # common protoc header hint
    # -- OpenAPI / Swagger --
    "generated by openapi",
    "generated by swagger",
    "generated by swagger-codegen",
    "generated by openapi-generator",
    "do not edit the generated code",
    # -- GraphQL --
    "generated by graphql-codegen",
    "generated by graphql code generator",
    "@generated graphql",  # relay / graphql-codegen pragma
    "/* eslint-disable */",  # weak signal: often prepended by codegen, but also hand-written
    # -- Go tooling --
    "code generated by go generate",
    "// code generated",  # start of the Go header "// Code generated ... DO NOT EDIT."
    "// generated by",
    "do not edit.",  # tail of that same Go header line (not a file footer)
    # -- Rust (build.rs / prost / tonic) --
    "// @generated",
    "generated by prost",
    "generated by tonic",
    # -- Java / Kotlin --
    "@javax.annotation.generated",
    "@jakarta.annotation.generated",
    "generated by dagger",
    "generated by hilt",
    "generated by room",  # Android Room DAO impls
    "generated by kapt",
    "generated by ksp",
    # -- C# / .NET --
    "<autogenerated>",  # Visual Studio designer files
    "// <auto-generated>",
    "// <autogenerated />",
    "tool = \"resgen\"",
    "generated by microsoft",
    "generated by dotnet",
    "this code was generated by a tool",  # .NET standard header
    # -- TypeScript / JavaScript --
    "// @ts-nocheck",  # weak signal; combine with others
    "generated by ts-proto",
    "generated by typechain",
    "generated by wagmi",
    "this is a generated file",
    "@auto-generated",
    # -- Python --
    "# generated by",
    "# this file was generated by",
    "# auto-generated",
    "# do not edit",
    "generated by grpc_tools",
    "generated by betterproto",
    "generated by datamodel-codegen",
    "generated by sqlalchemy",  # alembic migration hint
    # -- Build systems / IDEs --
    "generated by cmake",
    "generated by bazel",
    "generated by buck",
    "generated by gradle",
    "generated by xcode",
    "generated by android studio",
    "generated by flutter",
    "generated by freezed",  # Dart/Flutter
    "generated by json_serializable",  # Dart/Flutter
    # -- Misc tools --
    "generated by prisma",
    "generated by drizzle",
    "generated by sqlc",
    "generated by buf",  # buf.build protobuf toolchain
    "generated by mockery",  # Go mock generator
    "generated by moq",
    "generated by wire",  # Google Wire DI
    "generated by copier",
    "generated by stringer",  # Go stringer tool
    "generated by easyjson",
    "lint: disable",  # weak; combine with file extension
    # -- Pragma-style (language-agnostic) --
    "@generated",  # used by Hack, Flow, some JS tools
    "/* generated */",
    "// generated",
]
# Extensions to scan for auto-generated headers.
# NOTE(review): the compound suffixes (".pb.go", ".g.cs", ".designer.cs", ...)
# only match if the consumer compares against the full trailing suffix rather
# than just the last extension - confirm. The plain entries (".go", ".cs",
# ".dart", ".ts") are present as well.
AUTO_GEN_SCAN_EXTENSIONS = {
    # Core languages
    ".py", ".ts", ".js", ".cs", ".java", ".kt", ".go", ".rs",
    # Additional languages and generated-file suffixes
    ".tsx", ".jsx",  # React code-gen (relay, graphql-codegen)
    ".dart",  # Flutter / freezed
    ".proto",  # protobuf definitions themselves
    ".pb.go", ".pb.ts",  # compiled proto output (if treated as extensions)
    ".g.cs", ".designer.cs",  # .NET generated suffixes
    ".g.dart",  # Flutter generated
    ".generated.ts",  # convention-based (treat whole suffix as marker)
    ".h", ".cpp", ".cc",  # C/C++ codegen (flatbuffers, protobuf, etc.)
    ".swift",  # Xcode / SwiftGen / Sourcery
    ".rb",  # Rails generators
    ".php",  # Doctrine, Symfony generators
}
# Maps a file extension to a language identifier string.
# NOTE(review): judging by the name, the identifiers select a parser/grammar
# for AST-based chunking; the consuming library is not visible in this block -
# confirm the identifiers against its supported-language list.
AST_BASED_SPLITTING = {
    # General-purpose programming
    ".c": "c", ".h": "c",
    ".cpp": "cpp", ".hpp": "cpp", ".cc": "cpp", ".cxx": "cpp",
    ".cs": "csharp",
    ".dart": "dart",
    ".go": "go",
    ".java": "java",
    ".js": "javascript", ".jsx": "javascript",
    ".jl": "julia",
    ".kt": "kotlin", ".kts": "kotlin",
    ".nim": "nim",
    ".ml": "ocaml", ".mli": "ocaml",
    ".pl": "perl", ".pm": "perl",
    ".py": "python",
    ".r": "r",
    ".rb": "ruby",
    ".rs": "rust",
    ".scala": "scala",
    ".swift": "swift",
    ".ts": "typescript",
    ".tsx": "tsx",  # TSX has its own explicit key in the docs
    ".zig": "zig",
    # Web, UI & markup
    ".html": "html",
    ".css": "css",
    ".scss": "scss",
    ".astro": "astro",
    ".vue": "vue",
    ".svelte": "svelte",
    ".xml": "xml",
    ".yaml": "yaml", ".yml": "yaml",
    # Config & DevOps
    ".sh": "bash", ".bash": "bash", ".zsh": "bash",
    ".gitignore": "gitignore",
    # Systems & low-level
    ".asm": "asm", ".s": "asm",
    ".v": "verilog",
}
# Chunking parameters.
# NOTE(review): the unit (characters vs. tokens) is not visible here - confirm
# against the splitter that reads these values.
CHUNK_SIZE = 2048
CHUNK_OVERLAP = 200

# Vector-store settings: a relative directory path and a collection name.
# NOTE(review): presumably consumed by a Chroma client - confirm.
CHROMA_PERSIST_DIR = "./generated_chroma_database"
CHROMA_COLLECTION_NAME = "vector_db"
# Opening section of the researcher agent's system prompt: states the agent's
# role and who it reports to.
AGENT_SYSTEM_PROMPT_HEADER = """
You are a Junior Code Researcher working in the backend of a Multi-Agent RAG system. You have access to tools to access a locally stored codebase.
**YOUR ROLE & AUDIENCE:**
- You explore the repository using tools to find precise answers.
- You do NOT interact with the end-user directly.
- You report exclusively to the Lead Code Architect (Supervisor).
- Your job is to do the heavy lifting: use tools to explore, read files, gather context, and decide when you have enough raw data for the Supervisor to formulate a response.
"""
# Tool-selection section of the researcher agent's system prompt, for the
# configuration where all three search tools (exact, keyword, semantic) are
# offered. The em dash and arrows are restored from mis-decoded characters.
AGENT_SYSTEM_PROMPT_TOOLS = """
## TOOL SELECTION — DECISION TREE
Work through this decision tree for EVERY search action:
1. **Do you have an exact string to find?** (function name, class name, variable)
→ Use `exact_code_search`. This is always your first move for anything concrete.
2. **Did exact search fail OR do you have related keywords?**
→ Use `keyword_code_search`. DO NOT USE QUOTES for multi-term strings (BM25 tokenizes input). Search single, distinct words: e.g., `database pool`, NOT `"database pool"`.
3. **Are you exploring an abstract concept?**
→ Use `semantic_code_search` with a natural language phrase. Use this LAST.
4. **Do you need to understand folder structure?**
→ Use `list_directory_contents`.
5. **Do you need to verify a file exists?**
→ Use `find_file_path_by_pattern`.
6. **Do you need to read a file's contents?**
→ Use `get_specific_file`."""
# Tool-selection section of the researcher agent's system prompt, for the
# configuration without vector search: only exact search, file-pattern lookup,
# directory listing and file reads are offered. The em dash and arrows are
# restored from mis-decoded characters.
AGENT_SYSTEM_PROMPT_TOOLS_NO_DB = """
## TOOL SELECTION — DECISION TREE
Work through this decision tree for EVERY search action. (Note: Vector search is currently disabled; rely on exact matching and structural exploration):
1. **Do you have an exact string to find?** (function name, class name, variable)
→ Use `exact_code_search`. This is your primary discovery tool. Think of distinct, unique variable or function names to grep for.
2. **Do you need to verify a file exists or find files by extension/name?**
→ Use `find_file_path_by_pattern`. Use this to locate config files, routes, or models when exact code strings aren't obvious.
3. **Do you need to understand folder structure or find where components live?**
→ Use `list_directory_contents`.
4. **Do you need to read a file's contents?**
→ Use `get_specific_file`."""
# Closing section of the researcher agent's system prompt: file-reading rules,
# search-efficiency rules, and the exact final output format. The "±", the
# en dash in "1–200" and the em dashes are restored from mis-decoded
# characters.
AGENT_SYSTEM_PROMPT_FOOTER = """
## READING FILES — STRICT RULES
- **Never read a full file blindly.** First use `exact_code_search` to locate the relevant lines, then call `get_specific_file` with a ±100 line buffer.
- **TRUNCATION PROTOCOL (CRITICAL):** If any tool output ends with "Output is truncated", you MUST paginate using `start_line` and `end_line` before drawing conclusions. NEVER make a claim about what a file does or does not contain from a truncated read.
- Never guess line ranges like 1–200.
## SEARCH EFFICIENCY RULES
- **HIGH-VALUE LEAD PROTOCOL:** If you find a struct field, trait method, or config key that is directly relevant (e.g., `idempotent_hint`), you MUST read its full definition and every call site before moving on. Do not pivot after one partial read.
- **DEAD END ESCALATION:** If you have searched for the same concept 3+ times with no useful result, explicitly state: "CONCEPT NOT EXPLICITLY DOCUMENTED — best available evidence is [X]" and stop searching for it. Do not retry with minor keyword variations.
- Limit parallel exploration. Follow one lead to completion, then pivot.
**REASONING & ANSWER PROTOCOL:**
- Do not exceed 10 tool calls per query without pausing to reassess.
- If you receive "SUPERVISOR FEEDBACK:", your previous research was incomplete. Do not apologize. Read the instructions and find exactly what is missing.
**YOUR FINAL OUTPUT:**
Once you have gathered enough information, you must immediately stop.
OUTPUT FORMAT: You must output ONLY this exact format. Do not add summaries, insights, or explanations.
[RESEARCH COMPLETE]
Files read:
- <file_path_1>
- <file_path_2>
...
"""
# System prompt for the supervisor role: explains the division of labour with
# the researcher agent, lists the conditions that force status="REJECT", and
# describes what to write for ACCEPT versus REJECT.
SUPERVISOR_SYSTEM_PROMPT = """
You are the Lead Code Architect in a Multi-Agent RAG system.
Our platform helps users understand public code libraries.
*** CRITICAL DIVISION OF LABOR (READ CAREFULLY) ***
- The Junior Researcher (the agent) is ONLY a "Retriever". Their job is to call tools and output "[RESEARCH COMPLETE]". They are strictly forbidden from writing summaries.
- YOU are the "Synthesizer". If the raw tool outputs in the message history contain enough information, YOU must write the final answer for the user and set status="ACCEPT".
- DO NOT reject the researcher just because their final message is a short list of files. That is by design! Evaluate them based on the RAW TOOL OUTPUTS above their final message.
*** MANDATORY REJECTION TRIGGERS ***
Output status="REJECT" if ANY of the following are true:
- The agent treated a truncated file as complete without paginating.
- The user asked for code/syntax, but the tool outputs only show file names.
- The agent ignored a high-value lead without checking its call sites.
*** EVALUATION PATHS ***
1. SUCCESS: Set status="ACCEPT". Write an exhaustive, highly detailed response addressed to the user.
2. REWORK: Set status="REJECT". Write strict, non-repetitive, targeted feedback addressed to the Junior Researcher. DO NOT address the user.
"""
# File-count threshold related to vector-DB creation.
# NOTE(review): presumably repositories above this many files skip vector
# indexing - confirm against the caller.
MAX_FILES_TO_CREATE_VECTOR_DB = 6000