File size: 2,828 Bytes
14f13a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
Configuration management for Developer Docs AI Copilot.
"""
import os
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse

from pydantic import Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Each field is read from the environment variable named by its ``alias``
    (matched case-insensitively) or from a ``.env`` file; the declared
    default is used when no value is supplied.
    """

    # Pydantic v2 configuration (replaces the deprecated nested ``Config``
    # class; the options themselves are unchanged).
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # API Keys
    hf_token: str = Field(default="", alias="HF_TOKEN")

    # Model Configuration
    llm_model: str = Field(
        default="meta-llama/Llama-3.2-3B-Instruct",
        alias="LLM_MODEL"
    )
    llm_max_tokens: int = Field(default=512, alias="LLM_MAX_TOKENS")
    llm_temperature: float = Field(default=0.1, alias="LLM_TEMPERATURE")

    embedding_model: str = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        alias="EMBEDDING_MODEL"
    )

    # Vector Database
    chroma_persist_dir: str = Field(
        default="./data/vectordb",
        alias="CHROMA_PERSIST_DIR"
    )
    collection_name: str = Field(
        default="developer_docs",
        alias="COLLECTION_NAME"
    )

    # Chunking Configuration
    chunk_size: int = Field(default=600, alias="CHUNK_SIZE")
    chunk_overlap: int = Field(default=100, alias="CHUNK_OVERLAP")

    # Retrieval Configuration
    top_k_retrieval: int = Field(default=5, alias="TOP_K_RETRIEVAL")
    min_similarity_score: float = Field(
        default=0.2,
        alias="MIN_SIMILARITY_SCORE"
    )

    # Application Settings
    app_port: int = Field(default=7860, alias="APP_PORT")
    log_level: str = Field(default="INFO", alias="LOG_LEVEL")

    # Documentation Source
    docs_url: str = Field(
        default="https://fastapi.tiangolo.com",
        alias="DOCS_URL"
    )
    # Human-readable name for the docs; derived from ``docs_url`` by
    # ``set_docs_name`` when left empty.
    docs_name: str = Field(default="", alias="DOCS_NAME")

    # NOTE(review): the expected format of this string and how it is consumed
    # are not visible in this module -- confirm with the code that reads it.
    docs_url_patterns: str = Field(default="", alias="DOCS_URL_PATTERNS")

    @model_validator(mode="after")
    def set_docs_name(self) -> "Settings":
        """Derive ``docs_name`` from ``docs_url`` when no name was supplied.

        The first label of the URL's hostname is title-cased, with hyphens
        turned into spaces (``https://fastapi.tiangolo.com`` -> ``Fastapi``).
        A leading ``www.`` is skipped so that ``https://www.example.com``
        yields ``Example`` rather than ``Www``. If the URL has no hostname,
        ``docs_name`` stays empty.

        Returns:
            This instance, with ``docs_name`` filled in where possible.
        """
        if not self.docs_name:
            hostname = urlparse(self.docs_url).hostname or ""
            # "www" is a generic host label, not a meaningful docs name.
            if hostname.startswith("www."):
                hostname = hostname[len("www."):]
            self.docs_name = hostname.split(".")[0].replace("-", " ").title()
        return self


# Module-wide Settings instance, built once when this module is imported
settings = Settings()


# Filesystem layout, rooted at the parent of the directory holding this file
PROJECT_ROOT = Path(__file__).parent.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
VECTORDB_DIR = DATA_DIR.joinpath("vectordb")
EVALS_DIR = PROJECT_ROOT.joinpath("evals")
RESULTS_DIR = EVALS_DIR.joinpath("results")

# Create any missing directories (and their parents) at import time
for directory in (
    RAW_DATA_DIR,
    PROCESSED_DATA_DIR,
    VECTORDB_DIR,
    RESULTS_DIR,
):
    directory.mkdir(parents=True, exist_ok=True)