# Docgenie-API / docgenie /__init__.py
# Ahadhassan-2003
# deploy: update HF Space
# dc4e6da
from __future__ import annotations
from enum import Enum
from pathlib import Path
# Parent of the directory that contains this file, resolved to an absolute path.
_root_path: Path = Path(__file__).parent.parent.resolve()
# Project paths
class ENV:
    """Project directory layout, exposed as class-level ``Path`` constants.

    ``ROOT_DIR`` is taken from the module-level ``_root_path``; every other
    entry is built from it, and all of them sit somewhere below ``DATA_DIR``.
    """

    # General
    ROOT_DIR: Path = _root_path
    DATA_DIR: Path = ROOT_DIR / "data"

    # Dataset trees
    DATASETS_DIR: Path = DATA_DIR / "datasets"
    BASE_DATASETS_DIR: Path = DATASETS_DIR / "base_v2"
    SYN_DATASETS_PREPARED_DIR: Path = DATASETS_DIR / "synthesized_prepared"
    SYN_DATASETS_DIR: Path = DATASETS_DIR / "synthesized_datasets"

    VISUAL_ELEMENT_PREFABS_DIR: Path = DATA_DIR / "visual_element_prefabs"

    # Embeddings, clusters and their plots
    EMBEDDINGS_DIR: Path = DATA_DIR / "embeddings"
    GT_EMBEDDINGS_DIR: Path = DATA_DIR / "gt_embeddings"
    CLUSTERS_DIR: Path = DATA_DIR / "clusters"
    CLUSTER_PLOTS: Path = DATA_DIR / "cluster_plots"
    # NOTE(review): "dataste" is misspelled in the directory name; it is kept
    # as-is because changing the literal would point at a different directory.
    SYN_DATASET_STAT_PLOTS: Path = DATA_DIR / "syn_dataste_statistics_plots"

    # Analysis output, with one sub-directory per task under "gt"
    ANALYZATION_DIR: Path = DATA_DIR / "analyzation"
    GT_ANALYZATION_DIR: Path = ANALYZATION_DIR / "gt"
    KIE_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "kie"
    CLS_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "cls"
    QA_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "qa"
    DLA_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "dla"

    # Web application cache
    WEBAPP_CACHE_DIR: Path = DATA_DIR / "webapp_cache"
    QA_GT_WEBAPP_CACHE_DIR: Path = WEBAPP_CACHE_DIR / "qa_gt"

    # Scratch space, models, runs and exports
    TEMP_DIR: Path = DATA_DIR / "temp"
    MODELS_DIR: Path = DATA_DIR / "models"
    RUNS_DIR: Path = DATA_DIR / "runs"
    EXPORTS_DIR: Path = DATA_DIR / "exports"

    # Contains combined datasets (original and synthetic)
    PREPARED_DATASETS_DIR: Path = DATASETS_DIR / "prepared"

    # Definitions, prompt templates and seed images
    SYN_DATA_DEFINITIONS_DIR: Path = DATA_DIR / "syn_dataset_definitions"
    PROMPT_TEMPLATES_DIR: Path = DATA_DIR / "prompt_templates"
    SEED_IMAGES_DIR: Path = DATA_DIR / "seed-images"
# Create the directories below when this module is imported, in the listed
# order. Missing parent directories are created as well, and directories that
# already exist are left alone.
for _required_dir in (
    ENV.BASE_DATASETS_DIR,
    ENV.SYN_DATASETS_DIR,
    ENV.SYN_DATASETS_PREPARED_DIR,
    ENV.VISUAL_ELEMENT_PREFABS_DIR,
    ENV.PREPARED_DATASETS_DIR,
    ENV.EMBEDDINGS_DIR,
    ENV.CLUSTERS_DIR,
    ENV.TEMP_DIR,
    ENV.MODELS_DIR,
    ENV.EXPORTS_DIR,
    ENV.CLUSTER_PLOTS,
    ENV.SYN_DATASET_STAT_PLOTS,
    ENV.GT_EMBEDDINGS_DIR,
    ENV.KIE_GT_ANALYZATION_DIR,
    ENV.CLS_GT_ANALYZATION_DIR,
    ENV.DLA_GT_ANALYZATION_DIR,
    ENV.QA_GT_ANALYZATION_DIR,
    ENV.QA_GT_WEBAPP_CACHE_DIR,
):
    _required_dir.mkdir(parents=True, exist_ok=True)
# Drop the loop variable so the module namespace gains no extra name.
del _required_dir
class LLM:
    """String identifiers for the language models referenced by this package."""

    # Plain model identifiers.
    CLAUDE_SONNET_4: str = "claude-sonnet-4-20250514"
    CLAUDE_SONNET_4_5: str = "claude-sonnet-4-5-20250929"
    CLAUDE_HAIKU_4_5: str = "claude-haiku-4-5-20251001"

    # Same model id as CLAUDE_SONNET_4 with an "anthropic/" prefix.
    # NOTE(review): presumably for a client that expects "provider/model"
    # identifiers - confirm against the caller.
    TINYLLM_CLAUDE_SONNET_4: str = "anthropic/claude-sonnet-4-20250514"
# Default values for generation
class GENERATION:
    """Default values for generation."""

    # The right-hand side resolves to the module-level ``LLM`` class; after
    # this line, ``LLM`` inside this class body is the model-id string.
    LLM: str = LLM.CLAUDE_SONNET_4_5
    MAX_TOKENS: int = 16_384
    HANDWRITING_MODEL_CHECKPOINT: Path = ENV.MODELS_DIR.joinpath(
        "handwriting", "latest.pt"
    )