File size: 3,365 Bytes
dc4e6da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from __future__ import annotations

from enum import Enum
from pathlib import Path

# Absolute path of the directory two levels above this file
# (the parent of the directory that contains this module).
_root_path = Path(__file__).parent.parent.resolve()


# Project paths
class ENV:
    """Central registry of filesystem locations used by the project.

    Every attribute is a ``pathlib.Path`` derived from ``ROOT_DIR``; all
    locations other than ``ROOT_DIR`` itself live below ``DATA_DIR``.
    Defining these attributes does not touch the filesystem.
    """

    # --- Anchors ---------------------------------------------------------
    ROOT_DIR: Path = _root_path
    DATA_DIR: Path = ROOT_DIR / "data"

    # --- Datasets --------------------------------------------------------
    DATASETS_DIR: Path = DATA_DIR / "datasets"
    BASE_DATASETS_DIR: Path = DATASETS_DIR / "base_v2"
    SYN_DATASETS_PREPARED_DIR: Path = DATASETS_DIR / "synthesized_prepared"
    SYN_DATASETS_DIR: Path = DATASETS_DIR / "synthesized_datasets"

    # --- Visual element prefabs -------------------------------------------
    VISUAL_ELEMENT_PREFABS_DIR: Path = DATA_DIR / "visual_element_prefabs"

    # --- Embeddings, clusters and plots -----------------------------------
    EMBEDDINGS_DIR: Path = DATA_DIR / "embeddings"
    GT_EMBEDDINGS_DIR: Path = DATA_DIR / "gt_embeddings"
    CLUSTERS_DIR: Path = DATA_DIR / "clusters"
    CLUSTER_PLOTS: Path = DATA_DIR / "cluster_plots"
    # NOTE(review): "dataste" looks like a typo for "dataset". The literal is
    # kept unchanged because renaming it would change the on-disk location.
    SYN_DATASET_STAT_PLOTS: Path = DATA_DIR / "syn_dataste_statistics_plots"

    # --- Analysis output, one "gt" sub-folder per kind (kie/cls/qa/dla) ---
    ANALYZATION_DIR: Path = DATA_DIR / "analyzation"
    GT_ANALYZATION_DIR: Path = ANALYZATION_DIR / "gt"
    KIE_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "kie"
    CLS_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "cls"
    QA_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "qa"
    DLA_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "dla"

    # --- Web-app cache ----------------------------------------------------
    WEBAPP_CACHE_DIR: Path = DATA_DIR / "webapp_cache"
    QA_GT_WEBAPP_CACHE_DIR: Path = WEBAPP_CACHE_DIR / "qa_gt"

    # --- Temp files, models, runs and exports -----------------------------
    TEMP_DIR: Path = DATA_DIR / "temp"
    MODELS_DIR: Path = DATA_DIR / "models"
    RUNS_DIR: Path = DATA_DIR / "runs"
    EXPORTS_DIR: Path = DATA_DIR / "exports"

    # Contains combined datasets (original and synthetic)
    PREPARED_DATASETS_DIR: Path = DATASETS_DIR / "prepared"

    # --- Definitions, templates and seed images ---------------------------
    SYN_DATA_DEFINITIONS_DIR: Path = DATA_DIR / "syn_dataset_definitions"
    PROMPT_TEMPLATES_DIR: Path = DATA_DIR / "prompt_templates"
    SEED_IMAGES_DIR: Path = DATA_DIR / "seed-images"


# Import-time side effect: create the directories listed below, including any
# missing parents; directories that already exist are left alone.
# ENV paths that are not listed here (e.g. ENV.RUNS_DIR) are not created by
# this module.
for _dir_to_create in (
    ENV.BASE_DATASETS_DIR,
    ENV.SYN_DATASETS_DIR,
    ENV.SYN_DATASETS_PREPARED_DIR,
    ENV.VISUAL_ELEMENT_PREFABS_DIR,
    ENV.PREPARED_DATASETS_DIR,
    ENV.EMBEDDINGS_DIR,
    ENV.CLUSTERS_DIR,
    ENV.TEMP_DIR,
    ENV.MODELS_DIR,
    ENV.EXPORTS_DIR,
    ENV.CLUSTER_PLOTS,
    ENV.SYN_DATASET_STAT_PLOTS,
    ENV.GT_EMBEDDINGS_DIR,
    ENV.KIE_GT_ANALYZATION_DIR,
    ENV.CLS_GT_ANALYZATION_DIR,
    ENV.DLA_GT_ANALYZATION_DIR,
    ENV.QA_GT_ANALYZATION_DIR,
    ENV.QA_GT_WEBAPP_CACHE_DIR,
):
    _dir_to_create.mkdir(parents=True, exist_ok=True)
# Drop the loop variable so it does not linger as a module attribute.
del _dir_to_create


class LLM:
    """Named string constants for the model identifiers used in this project."""

    # Plain model identifiers.
    CLAUDE_SONNET_4: str = "claude-sonnet-4-20250514"
    CLAUDE_SONNET_4_5: str = "claude-sonnet-4-5-20250929"
    CLAUDE_HAIKU_4_5: str = "claude-haiku-4-5-20251001"

    # Sonnet 4 identifier carrying an "anthropic/" prefix.
    # NOTE(review): presumably the form expected by a "tinyllm" client -
    # confirm against the caller.
    TINYLLM_CLAUDE_SONNET_4: str = "anthropic/claude-sonnet-4-20250514"


# Default values for generation
class GENERATION:
    """Default settings for generation: model, token limit, checkpoint path."""

    # The right-hand side resolves to the module-level ``LLM`` class, because
    # the class-local name ``LLM`` is only bound once this assignment finishes.
    LLM: str = LLM.CLAUDE_SONNET_4_5
    # NOTE(review): presumably the token cap passed to the LLM request -
    # confirm against the caller.
    MAX_TOKENS: int = 16_384
    # Checkpoint file located below the models directory.
    HANDWRITING_MODEL_CHECKPOINT: Path = ENV.MODELS_DIR.joinpath(
        "handwriting", "latest.pt"
    )