"""Shared configuration, constants, and CLI parsing for the MasterMap pipeline."""

import argparse
import os
from typing import Optional, Sequence

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is only needed to read a local .env file; when the package
    # is absent the values must already be present in the process environment.
    load_dotenv = None

# Load local .env values for development; Hugging Face injects the same names
# as environment variables in production.
if load_dotenv is not None:
    load_dotenv()

# --- ENVIRONMENT VARIABLES to be set up in .env ---
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
RAW_MODELS = os.getenv("GROQ_MODEL", "")
APP_USERNAME = os.getenv("APP_USERNAME")
APP_PASSWORD = os.getenv("APP_PASSWORD")
SPACE_ID = os.getenv("SPACE_ID")
HF_TOKEN = os.getenv("HF_TOKEN")

# Parse models cleanly into a list (comma-separated, blanks dropped).
AVAILABLE_MODELS = [m.strip() for m in RAW_MODELS.split(",") if m.strip()]

# --- PIPELINE THRESHOLDS & CONSTANTS ---
DEFAULT_SIMILARITY_THRESHOLD = 0.93
TOP_K_CANDIDATES = 5
SHEET_NAME = "Data"
DATA_DIR = "data"
REFDATA_DIR = "refdata"
DEFAULT_OUTPUT_SHEET_NAME = "Cleaned_Data"
DEFAULT_BLUEPRINT_FILE = "Blueprint.xlsx"
DEFAULT_OFFICIAL_REFS_FILE = "official_references.json"
DEFAULT_MANUAL_REFS_FILE = "manual_references.json"


def _resolve_in_dir(file_arg: Optional[str], base_dir: str) -> Optional[str]:
    """Place a bare file name under *base_dir*; leave anything else untouched.

    Empty/None values, absolute paths, and paths that already contain a
    directory component are returned unchanged.
    """
    if not file_arg:
        return file_arg
    if os.path.isabs(file_arg) or os.path.dirname(file_arg):
        return file_arg
    return os.path.join(base_dir, file_arg)


def resolve_data_path(file_arg: Optional[str]) -> Optional[str]:
    """Resolve bare file names into the project data directory."""
    return _resolve_in_dir(file_arg, DATA_DIR)


def resolve_ref_path(file_arg: Optional[str]) -> Optional[str]:
    """Resolve bare reference file names into the project refdata directory."""
    return _resolve_in_dir(file_arg, REFDATA_DIR)


def parse_cli_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse shared CLI arguments used by both local runs and the Flask UI.

    Args:
        argv: Argument strings to parse. When None (the default), argparse
            reads them from ``sys.argv[1:]``, matching the previous behavior.

    Returns:
        The parsed namespace. ``input``, ``blueprint``, ``refs`` and
        ``manual_refs`` are already passed through the path resolvers;
        ``models`` is left as the raw comma-separated string.

    Raises:
        SystemExit: Raised by argparse when ``--input`` is missing or an
            argument is invalid.
    """
    parser = argparse.ArgumentParser(
        description="MasterMap Data Normalization Pipeline"
    )
    parser.add_argument(
        "--input",
        required=True,
        help="Raw input Excel file name inside data/",
    )
    parser.add_argument(
        "--blueprint",
        default=DEFAULT_BLUEPRINT_FILE,
        help="Blueprint Excel file name inside data/",
    )
    parser.add_argument(
        "--refs",
        default=DEFAULT_OFFICIAL_REFS_FILE,
        help="Official references JSON file name inside refdata/",
    )
    parser.add_argument(
        "--manual_refs",
        default=DEFAULT_MANUAL_REFS_FILE,
        help="Manual overrides JSON file name inside refdata/",
    )
    parser.add_argument(
        "--sheet",
        default=SHEET_NAME,
        help="Source sheet name to clean",
    )
    parser.add_argument(
        "--output_sheet",
        default=DEFAULT_OUTPUT_SHEET_NAME,
        help="Output sheet name to create",
    )
    parser.add_argument(
        "--models",
        default="",
        help="Comma-separated Groq models to use in fallback order",
    )

    args = parser.parse_args(argv)

    # Keep CLI calls short by treating bare names as files under data/refdata.
    args.input = resolve_data_path(args.input)
    args.blueprint = resolve_data_path(args.blueprint)
    args.refs = resolve_ref_path(args.refs)
    args.manual_refs = resolve_ref_path(args.manual_refs)

    return args