# mastermap-cleaner/src/config.py
# Last commit c6a3f44 (andrewbejjani): "Added functional doc in README.md and added basic" [message truncated]
import os
import argparse
from dotenv import load_dotenv
# Load local .env values for development; Hugging Face injects the same names
# as environment variables in production.
load_dotenv()
# --- ENVIRONMENT VARIABLES to be set up in .env ---
# os.getenv() without a default returns None when the variable is unset, so
# every value below except RAW_MODELS may be None at import time.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Comma-separated model names; defaults to "" so the parsing further down
# produces an empty list when GROQ_MODEL is not set.
RAW_MODELS = os.getenv("GROQ_MODEL", "")
# NOTE(review): presumably the login credentials for the web UI -- confirm
# against the code that reads them.
APP_USERNAME = os.getenv("APP_USERNAME")
APP_PASSWORD = os.getenv("APP_PASSWORD")
# NOTE(review): presumably the Hugging Face Space identifier and access
# token -- confirm against the code that reads them.
SPACE_ID = os.getenv("SPACE_ID")
HF_TOKEN = os.getenv("HF_TOKEN")
# Parse models cleanly into a list: split on commas, trim surrounding
# whitespace, and drop empty entries (e.g. from a trailing comma).
AVAILABLE_MODELS = [m.strip() for m in RAW_MODELS.split(",") if m.strip()]
# --- PIPELINE THRESHOLDS & CONSTANTS ---
# NOTE(review): presumably the minimum similarity score for accepting a match
# and the number of candidates considered per lookup -- the consumers are not
# in this module, so confirm against the matching code.
DEFAULT_SIMILARITY_THRESHOLD = 0.93
TOP_K_CANDIDATES = 5
# Default for the --sheet CLI option (source sheet name to clean).
SHEET_NAME = "Data"
# Directory that resolve_data_path() prepends to bare file names.
DATA_DIR = "data"
# Directory that resolve_ref_path() prepends to bare reference file names.
REFDATA_DIR = "refdata"
# Default for the --output_sheet CLI option (sheet name to create).
DEFAULT_OUTPUT_SHEET_NAME = "Cleaned_Data"
# Defaults for the --blueprint, --refs and --manual_refs CLI options.
DEFAULT_BLUEPRINT_FILE = "Blueprint.xlsx"
DEFAULT_OFFICIAL_REFS_FILE = "official_references.json"
DEFAULT_MANUAL_REFS_FILE = "manual_references.json"
def resolve_data_path(file_arg):
    """Resolve bare file names into the project data directory.

    Args:
        file_arg: File name or path as supplied by the caller. May be empty
            or ``None``.

    Returns:
        ``file_arg`` unchanged when it is falsy, absolute, or already carries
        a directory component; otherwise ``file_arg`` joined onto
        ``DATA_DIR``.
    """
    if not file_arg:
        # Nothing to resolve; hand the empty value back untouched.
        return file_arg
    if os.path.isabs(file_arg) or os.path.dirname(file_arg):
        # The caller already chose a location, so respect it.
        return file_arg
    return os.path.join(DATA_DIR, file_arg)
def resolve_ref_path(file_arg):
    """Resolve bare reference file names into the project refdata directory.

    Args:
        file_arg: Reference file name or path as supplied by the caller. May
            be empty or ``None``.

    Returns:
        ``file_arg`` unchanged when it is falsy, absolute, or already carries
        a directory component; otherwise ``file_arg`` joined onto
        ``REFDATA_DIR``.
    """
    if not file_arg:
        # Nothing to resolve; hand the empty value back untouched.
        return file_arg
    if os.path.isabs(file_arg) or os.path.dirname(file_arg):
        # The caller already chose a location, so respect it.
        return file_arg
    return os.path.join(REFDATA_DIR, file_arg)
def parse_cli_args(argv=None):
    """Parse shared CLI arguments used by both local runs and the Flask UI.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse falls back to ``sys.argv[1:]``, which matches
            the previous behaviour. Passing an explicit list lets non-CLI
            callers build the same namespace without touching ``sys.argv``.

    Returns:
        argparse.Namespace with ``input``, ``blueprint``, ``refs``,
        ``manual_refs``, ``sheet``, ``output_sheet`` and ``models``. The four
        file attributes have already been passed through
        ``resolve_data_path`` / ``resolve_ref_path``.

    Raises:
        SystemExit: Raised by argparse when required arguments are missing or
            unknown arguments are supplied.
    """
    parser = argparse.ArgumentParser(description="MasterMap Data Normalization Pipeline")
    parser.add_argument(
        "--input",
        required=True,
        help="Raw input Excel file name inside data/",
    )
    parser.add_argument(
        "--blueprint",
        default=DEFAULT_BLUEPRINT_FILE,
        help="Blueprint Excel file name inside data/",
    )
    parser.add_argument(
        "--refs",
        default=DEFAULT_OFFICIAL_REFS_FILE,
        help="Official references JSON file name inside refdata/",
    )
    parser.add_argument(
        "--manual_refs",
        default=DEFAULT_MANUAL_REFS_FILE,
        help="Manual overrides JSON file name inside refdata/",
    )
    parser.add_argument(
        "--sheet",
        default=SHEET_NAME,
        help="Source sheet name to clean",
    )
    parser.add_argument(
        "--output_sheet",
        default=DEFAULT_OUTPUT_SHEET_NAME,
        help="Output sheet name to create",
    )
    parser.add_argument(
        "--models",
        default="",
        help="Comma-separated Groq models to use in fallback order",
    )
    args = parser.parse_args(argv)
    # Keep CLI calls short by treating bare names as files under data/refdata.
    args.input = resolve_data_path(args.input)
    args.blueprint = resolve_data_path(args.blueprint)
    args.refs = resolve_ref_path(args.refs)
    args.manual_refs = resolve_ref_path(args.manual_refs)
    return args