# File extensions accepted by the pipeline, bucketed under a size-limit label.
# The keys ("limit_20kb" ... "no_limit") are only labels in this table; the code
# that interprets and enforces them is not part of this block.
SUPPORTED_TYPES = {
    "limit_20kb": {".json", ".xml", ".csv", ".tsv", ".jsonl"},
    "limit_30kb": {".css", ".scss", ".sass", ".less"},
    "limit_50kb": {".yaml", ".yml"},
    "limit_2048kb": {".pdf"},
    "no_limit": (
        # Documentation and plain text
        {".md", ".mdx", ".txt", ".rst", ".asciidoc", ".adoc", ".tex"}
        # Configuration and infrastructure
        | {".toml", ".ini", ".cfg", ".conf", ".properties", ".hcl", ".tf", ".tfvars"}
        | {".gitignore", ".dockerignore", ".editorconfig", ".nvmrc", ".npmignore"}
        # Web, UI and templating
        | {".html", ".js", ".jsx", ".ts", ".tsx", ".vue", ".svelte", ".astro", ".php"}
        | {".handlebars", ".hbs", ".ejs", ".pug", ".twig", ".liquid"}
        # Systems languages
        | {".c", ".h", ".cpp", ".hpp", ".cc", ".cxx"}
        | {".rs", ".go", ".zig", ".nim", ".asm", ".s"}
        # JVM / .NET languages
        | {".java", ".cs", ".kt", ".kts", ".scala", ".groovy"}
        # Scripting languages
        | {".py", ".rb", ".pl", ".pm", ".lua", ".r"}
        # Functional languages
        | {".hs", ".ml", ".mli", ".clj", ".cljs", ".cljc", ".ex", ".exs"}
        # Scientific and mobile / Apple
        | {".jl", ".swift", ".m", ".dart"}
        # Web3 and hardware description
        | {".sol", ".v"}
        # Shell scripts and notebooks
        | {".sh", ".bash", ".zsh", ".bat", ".cmd", ".ps1"}
        | {".ipynb"}
    ),
}

# Glob patterns for paths that must be skipped entirely.
# A "**/name/**" entry targets everything beneath a directory called `name` at
# any depth; a "**/name" entry targets a single file name at any depth.
# NOTE(review): the exact glob dialect (and whether matching is case-sensitive)
# is decided by the consumer, which is not visible here. Some directory names
# below ("packages", "bin", "env", "build") are generic and may also hide real
# source in certain repositories — confirm this is intended.
EXCLUDE_PATTERNS = [

    # ── VCS ─────────────────────────────
    "**/.git/**", "**/.svn/**", "**/.hg/**",

    # ── Dependencies ────────────────────
    "**/node_modules/**", "**/bower_components/**",
    "**/venv/**", "**/.venv/**", "**/env/**", "**/python_env/**",
    "**/vendor/**", "**/deps/**", "**/packages/**", "**/Pods/**",

    # ── Build Outputs ───────────────────
    "**/dist/**", "**/build/**", "**/out/**", "**/target/**",
    "**/bin/**", "**/obj/**", "**/_build/**",

    # ── Framework / Tooling ─────────────
    "**/.next/**", "**/.nuxt/**", "**/.svelte-kit/**",
    "**/.gradle/**", "**/.mvn/**",
    "**/.dart_tool/**", "**/.pub-cache/**",
    "**/.serverless/**",

    # ── Python / Test / Cache ───────────
    "**/__pycache__/**", "**/.pytest_cache/**", "**/.tox/**",
    "**/.mypy_cache/**", "**/.ruff_cache/**",
    "**/*.egg-info/**",

    # ── Coverage / Logs ─────────────────
    "**/coverage/**", "**/.nyc_output/**",
    "**/*.log",

    # ── IDE / OS Junk ───────────────────
    "**/.vscode/**", "**/.idea/**", "**/.vs/**",
    "**/.DS_Store",
    # Windows writes the cache file as "Thumbs.db"; both spellings are listed so
    # the exclusion also holds when patterns are matched case-sensitively.
    "**/thumbs.db", "**/Thumbs.db",

    # ── Temp ────────────────────────────
    "**/tmp/**", "**/temp/**",

    # ── File-level exclusions ───────────
    "**/.gitignore",
    "**/.dockerignore",
    "**/.npmignore",
    "**/.env",
    "**/.env.*",
    "**/.editorconfig",
    # Linter/formatter configs exist both bare and with a format suffix
    # (.eslintrc.json, .prettierrc.yaml, ...); cover both, mirroring the
    # .env / .env.* pair above.
    "**/.prettierrc", "**/.prettierrc.*",
    "**/.eslintrc", "**/.eslintrc.*",
    "**/.stylelintrc", "**/.stylelintrc.*",
]

# Text fragments whose presence suggests a file was produced by a tool rather
# than written by hand. Every entry is lower-case, so the consumer presumably
# lower-cases the scanned text before comparing — TODO confirm in the matcher.
AUTO_GENERATED_MARKERS = [
    # ── Generic / cross-language ──
    "this file is auto-generated", "this file was auto-generated",
    "this file is automatically generated", "this file was automatically generated",
    "auto-generated by", "auto generated by",
    "automatically generated by", "generated automatically",
    "do not edit this file", "do not edit - generated",
    "do not modify this file", "do not modify - generated",
    "changes will be overwritten",
    "any changes made to this file will be lost",
    "any manual changes will be overwritten",
    "regenerate this file",

    # ── Protobuf / gRPC ──
    "generated by protoc", "generated by the protocol buffer compiler",
    "generated by protoc-gen-go", "generated by protoc-gen-grpc",
    "generated by protoc-gen-ts", "generated by protoc-gen-js",
    # header hint naming the originating .proto path
    "source: proto/",

    # ── OpenAPI / Swagger ──
    "generated by openapi", "generated by swagger",
    "generated by swagger-codegen", "generated by openapi-generator",
    "do not edit the generated code",

    # ── GraphQL ──
    "generated by graphql-codegen", "generated by graphql code generator",
    # pragma form
    "@generated graphql",
    # NOTE(review): blanket lint suppression also appears in hand-written
    # files, so this is a heuristic rather than proof of generation.
    "/* eslint-disable */",

    # ── Go tooling ──
    "code generated by go generate",
    "// code generated", "// generated by",
    "do not edit.",

    # ── Rust (build.rs / prost / tonic) ──
    "// @generated", "generated by prost", "generated by tonic",

    # ── Java / Kotlin ──
    "@javax.annotation.generated", "@jakarta.annotation.generated",
    "generated by dagger", "generated by hilt",
    "generated by room", "generated by kapt", "generated by ksp",

    # ── C# / .NET ──
    "<autogenerated>", "// <auto-generated>", "// <autogenerated />",
    "tool = \"resgen\"",
    "generated by microsoft", "generated by dotnet",
    "this code was generated by a tool",

    # ── TypeScript / JavaScript ──
    # NOTE(review): "@ts-nocheck" is a weak signal on its own.
    "// @ts-nocheck",
    "generated by ts-proto", "generated by typechain", "generated by wagmi",
    "this is a generated file", "@auto-generated",

    # ── Python ──
    "# generated by", "# this file was generated by",
    "# auto-generated", "# do not edit",
    "generated by grpc_tools", "generated by betterproto",
    "generated by datamodel-codegen", "generated by sqlalchemy",

    # ── Build systems / IDEs ──
    "generated by cmake", "generated by bazel", "generated by buck",
    "generated by gradle", "generated by xcode", "generated by android studio",
    "generated by flutter", "generated by freezed", "generated by json_serializable",

    # ── Misc tools ──
    "generated by prisma", "generated by drizzle", "generated by sqlc",
    "generated by buf", "generated by mockery", "generated by moq",
    "generated by wire", "generated by copier", "generated by stringer",
    "generated by easyjson",
    # NOTE(review): weak signal; not specific to generated code.
    "lint: disable",

    # ── Pragma-style (language-agnostic) ──
    "@generated", "/* generated */", "// generated",
]

# File-name suffixes whose contents are checked for auto-generation markers.
# NOTE(review): the compound suffixes only have an effect of their own if the
# consumer compares against the full file-name tail rather than the last
# extension; each one also ends in a plain extension listed here — confirm.
AUTO_GEN_SCAN_EXTENSIONS = {
    # Plain extensions, by language family
    ".py", ".rb", ".php",
    ".js", ".jsx", ".ts", ".tsx",
    ".java", ".kt", ".cs",
    ".go", ".rs", ".swift", ".dart",
    ".h", ".cpp", ".cc",
    ".proto",
    # Compound suffixes
    ".pb.go", ".pb.ts",
    ".g.cs", ".designer.cs",
    ".g.dart",
    ".generated.ts",
}

# Maps a file extension to the grammar name used for syntax-aware splitting.
# Declared as (grammar, extensions) groups and flattened into an
# extension -> grammar dict; group order fixes the resulting key order.
# NOTE(review): the grammar identifiers are presumably the names the parsing
# library expects (".tsx" deliberately uses "tsx", not "typescript") — confirm.
AST_BASED_SPLITTING = {
    extension: grammar
    for grammar, extensions in (
        # General-purpose programming
        ("c", (".c", ".h")),
        ("cpp", (".cpp", ".hpp", ".cc", ".cxx")),
        ("csharp", (".cs",)),
        ("dart", (".dart",)),
        ("go", (".go",)),
        ("java", (".java",)),
        ("javascript", (".js", ".jsx")),
        ("julia", (".jl",)),
        ("kotlin", (".kt", ".kts")),
        ("nim", (".nim",)),
        ("ocaml", (".ml", ".mli")),
        ("perl", (".pl", ".pm")),
        ("python", (".py",)),
        ("r", (".r",)),
        ("ruby", (".rb",)),
        ("rust", (".rs",)),
        ("scala", (".scala",)),
        ("swift", (".swift",)),
        ("typescript", (".ts",)),
        ("tsx", (".tsx",)),
        ("zig", (".zig",)),
        # Web, UI and markup
        ("html", (".html",)),
        ("css", (".css",)),
        ("scss", (".scss",)),
        ("astro", (".astro",)),
        ("vue", (".vue",)),
        ("svelte", (".svelte",)),
        ("xml", (".xml",)),
        ("yaml", (".yaml", ".yml")),
        # Config and DevOps
        ("bash", (".sh", ".bash", ".zsh")),
        ("gitignore", (".gitignore",)),
        # Systems and low-level
        ("asm", (".asm", ".s")),
        ("verilog", (".v",)),
    )
    for extension in extensions
}

# Chunking parameters. The unit (characters vs. tokens) is defined by whatever
# splitter consumes these values and is not visible here — TODO confirm.
CHUNK_SIZE = 2048
# Presumably the amount shared between consecutive chunks; same unit as above.
CHUNK_OVERLAP = 200


# Chroma settings. The directory is a relative path, so where it lands depends
# on the working directory of the process that uses it.
CHROMA_PERSIST_DIR = "./generated_chroma_database"
# Collection name; presumably the single collection used inside that store.
CHROMA_COLLECTION_NAME = "vector_db"


# Opening section of the researcher agent's system prompt: it defines the
# "Junior Code Researcher" role and states that the agent reports only to the
# supervisor, never to the end user.
# Presumably concatenated with one of the AGENT_SYSTEM_PROMPT_TOOLS* variants
# and AGENT_SYSTEM_PROMPT_FOOTER by the caller — verify against the call site.
# The text (including its whitespace) is sent to the model verbatim.
AGENT_SYSTEM_PROMPT_HEADER = """ 
You are a Junior Code Researcher working in the backend of a Multi-Agent RAG system. You have access to tools to access a locally stored codebase.
    **YOUR ROLE & AUDIENCE:**
    - You explore the repository using tools to find precise answers.
    - You do NOT interact with the end-user directly. 
    - You report exclusively to the Lead Code Architect (Supervisor).
    - Your job is to do the heavy lifting: use tools to explore, read files, gather context, and decide when you have enough raw data for the Supervisor to formulate a response.
"""

# Tool-selection section of the researcher prompt for the full tool set.
# It lists six tools in priority order: exact_code_search, keyword_code_search,
# semantic_code_search, list_directory_contents, find_file_path_by_pattern and
# get_specific_file. The tool names here must match the tools actually
# registered with the agent (registration is not visible in this block).
# Note: the string ends without a trailing newline.
AGENT_SYSTEM_PROMPT_TOOLS = """   
## TOOL SELECTION — DECISION TREE
    Work through this decision tree for EVERY search action:
    1. **Do you have an exact string to find?** (function name, class name, variable)
        → Use `exact_code_search`. This is always your first move for anything concrete.
    2. **Did exact search fail OR do you have related keywords?**
        → Use `keyword_code_search`. DO NOT USE QUOTES for multi-term strings (BM25 tokenizes input). Search single, distinct words: e.g., `database pool`, NOT `"database pool"`.
    3. **Are you exploring an abstract concept?**
        → Use `semantic_code_search` with a natural language phrase. Use this LAST.
    4. **Do you need to understand folder structure?**
        → Use `list_directory_contents`.
    5. **Do you need to verify a file exists?**
        → Use `find_file_path_by_pattern`.
    6. **Do you need to read a file's contents?**
        → Use `get_specific_file`."""

# Tool-selection section of the researcher prompt for the mode in which vector
# search is disabled (the prompt text says so explicitly). It omits
# keyword_code_search and semantic_code_search and lists only
# exact_code_search, find_file_path_by_pattern, list_directory_contents and
# get_specific_file.
# Presumably chosen instead of AGENT_SYSTEM_PROMPT_TOOLS when no vector DB was
# built — verify against the caller.
# Note: the string ends without a trailing newline.
AGENT_SYSTEM_PROMPT_TOOLS_NO_DB = """   
## TOOL SELECTION — DECISION TREE
    Work through this decision tree for EVERY search action. (Note: Vector search is currently disabled; rely on exact matching and structural exploration):
    1. **Do you have an exact string to find?** (function name, class name, variable)
        → Use `exact_code_search`. This is your primary discovery tool. Think of distinct, unique variable or function names to grep for.
    2. **Do you need to verify a file exists or find files by extension/name?**
        → Use `find_file_path_by_pattern`. Use this to locate config files, routes, or models when exact code strings aren't obvious.
    3. **Do you need to understand folder structure or find where components live?**
        → Use `list_directory_contents`.
    4. **Do you need to read a file's contents?**
        → Use `get_specific_file`."""

# Closing section of the researcher prompt: file-reading rules (including the
# pagination protocol triggered by the literal text "Output is truncated"),
# search-efficiency rules, handling of "SUPERVISOR FEEDBACK:" messages, and the
# required final output format beginning with "[RESEARCH COMPLETE]".
# NOTE(review): those literal markers are presumably matched or emitted by
# other code (tool output, supervisor loop); keep them in sync if either side
# changes.
AGENT_SYSTEM_PROMPT_FOOTER = """
   ## READING FILES — STRICT RULES
    - **Never read a full file blindly.** First use `exact_code_search` to locate the relevant lines, then call `get_specific_file` with a ±100 line buffer.
    - **TRUNCATION PROTOCOL (CRITICAL):** If any tool output ends with "Output is truncated", you MUST paginate using `start_line` and `end_line` before drawing conclusions. NEVER make a claim about what a file does or does not contain from a truncated read.
    - Never guess line ranges like 1–200.

    ## SEARCH EFFICIENCY RULES
    - **HIGH-VALUE LEAD PROTOCOL:** If you find a struct field, trait method, or config key that is directly relevant (e.g., `idempotent_hint`), you MUST read its full definition and every call site before moving on. Do not pivot after one partial read.
    - **DEAD END ESCALATION:** If you have searched for the same concept 3+ times with no useful result, explicitly state: "CONCEPT NOT EXPLICITLY DOCUMENTED — best available evidence is [X]" and stop searching for it. Do not retry with minor keyword variations.
    - Limit parallel exploration. Follow one lead to completion, then pivot.

    **REASONING & ANSWER PROTOCOL:**
    - Do not exceed 10 tool calls per query without pausing to reassess.
    - If you receive "SUPERVISOR FEEDBACK:", your previous research was incomplete. Do not apologize. Read the instructions and find exactly what is missing.

    **YOUR FINAL OUTPUT:**
    Once you have gathered enough information, you must immediately stop. 
    OUTPUT FORMAT: You must output ONLY this exact format. Do not add summaries, insights, or explanations. 

    [RESEARCH COMPLETE]
    Files read:
    - <file_path_1>
    - <file_path_2>
    ...
"""


# System prompt for the supervisor ("Lead Code Architect"). It tells the
# supervisor to judge the researcher on the raw tool outputs rather than on the
# short "[RESEARCH COMPLETE]" message, lists the conditions that force a
# rejection, and defines two outcomes: status="ACCEPT" with a user-facing
# answer, or status="REJECT" with feedback for the researcher.
# NOTE(review): the status values "ACCEPT"/"REJECT" are presumably parsed by
# the orchestration code — confirm before rewording them.
SUPERVISOR_SYSTEM_PROMPT = """
    You are the Lead Code Architect in a Multi-Agent RAG system. 
    Our platform helps users understand public code libraries.

    *** CRITICAL DIVISION OF LABOR (READ CAREFULLY) ***
    - The Junior Researcher (the agent) is ONLY a "Retriever". Their job is to call tools and output "[RESEARCH COMPLETE]". They are strictly forbidden from writing summaries.
    - YOU are the "Synthesizer". If the raw tool outputs in the message history contain enough information, YOU must write the final answer for the user and set status="ACCEPT".
    - DO NOT reject the researcher just because their final message is a short list of files. That is by design! Evaluate them based on the RAW TOOL OUTPUTS above their final message.

    *** MANDATORY REJECTION TRIGGERS ***
    Output status="REJECT" if ANY of the following are true:
    - The agent treated a truncated file as complete without paginating.
    - The user asked for code/syntax, but the tool outputs only show file names.
    - The agent ignored a high-value lead without checking its call sites.

    *** EVALUATION PATHS ***
    1. SUCCESS: Set status="ACCEPT". Write an exhaustive, highly detailed response addressed to the user.
    2. REWORK: Set status="REJECT". Write strict, non-repetitive, targeted feedback addressed to the Junior Researcher. DO NOT address the user.
"""


# File-count threshold related to building the vector database. Presumably an
# upper bound above which the DB is not created; the check itself lives in the
# consuming code — TODO confirm.
MAX_FILES_TO_CREATE_VECTOR_DB = 6_000