Spaces:
Running
Running
| import os | |
| import argparse | |
| from dotenv import load_dotenv | |
| # Load local .env values for development; Hugging Face injects the same names | |
| # as environment variables in production. | |
| load_dotenv() | |
# --- ENVIRONMENT VARIABLES to be set up in .env ---
# Every lookup yields None when the variable is unset, except RAW_MODELS,
# which falls back to an empty string so it can always be split below.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
RAW_MODELS = os.environ.get("GROQ_MODEL", "")
APP_USERNAME = os.environ.get("APP_USERNAME")
APP_PASSWORD = os.environ.get("APP_PASSWORD")
SPACE_ID = os.environ.get("SPACE_ID")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Comma-separated model names -> list of trimmed names, blank entries dropped.
AVAILABLE_MODELS = [name for name in map(str.strip, RAW_MODELS.split(",")) if name]

# --- PIPELINE THRESHOLDS & CONSTANTS ---
DEFAULT_SIMILARITY_THRESHOLD = 0.93
TOP_K_CANDIDATES = 5
SHEET_NAME = "Data"

# Directories that bare file names are resolved against.
DATA_DIR = "data"
REFDATA_DIR = "refdata"

# Default file and sheet names used when the CLI does not override them.
DEFAULT_OUTPUT_SHEET_NAME = "Cleaned_Data"
DEFAULT_BLUEPRINT_FILE = "Blueprint.xlsx"
DEFAULT_OFFICIAL_REFS_FILE = "official_references.json"
DEFAULT_MANUAL_REFS_FILE = "manual_references.json"
def resolve_data_path(file_arg):
    """Map a bare file name onto the project data directory.

    Falsy values come back untouched, as do absolute paths and any path
    that already carries a directory component. Only a plain file name is
    joined onto ``DATA_DIR``.
    """
    is_bare_name = bool(file_arg) and not (
        os.path.isabs(file_arg) or os.path.dirname(file_arg)
    )
    return os.path.join(DATA_DIR, file_arg) if is_bare_name else file_arg
def resolve_ref_path(file_arg):
    """Map a bare reference file name onto the project refdata directory.

    A value is prefixed with ``REFDATA_DIR`` only when it is non-empty,
    not absolute, and has no directory part; anything else is returned
    exactly as given.
    """
    if file_arg and not os.path.isabs(file_arg) and not os.path.dirname(file_arg):
        return os.path.join(REFDATA_DIR, file_arg)
    return file_arg
def parse_cli_args(argv=None):
    """Parse the shared CLI arguments of the normalization pipeline.

    Args:
        argv: Optional sequence of argument strings to parse. When left as
            ``None`` (the default) argparse reads ``sys.argv[1:]``, which
            is the original behaviour; passing a list lets callers build
            the namespace programmatically without touching ``sys.argv``.

    Returns:
        argparse.Namespace with ``input``, ``blueprint``, ``refs``,
        ``manual_refs``, ``sheet``, ``output_sheet`` and ``models``.
        ``input`` and ``blueprint`` have been passed through
        ``resolve_data_path``; ``refs`` and ``manual_refs`` through
        ``resolve_ref_path``.

    Raises:
        SystemExit: raised by argparse when ``--input`` is missing or an
            argument cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="MasterMap Data Normalization Pipeline")
    parser.add_argument("--input", required=True, help="Raw input Excel file name inside data/")
    parser.add_argument("--blueprint", default=DEFAULT_BLUEPRINT_FILE, help="Blueprint Excel file name inside data/")
    parser.add_argument("--refs", default=DEFAULT_OFFICIAL_REFS_FILE, help="Official references JSON file name inside refdata/")
    parser.add_argument("--manual_refs", default=DEFAULT_MANUAL_REFS_FILE, help="Manual overrides JSON file name inside refdata/")
    parser.add_argument("--sheet", default=SHEET_NAME, help="Source sheet name to clean")
    parser.add_argument("--output_sheet", default=DEFAULT_OUTPUT_SHEET_NAME, help="Output sheet name to create")
    parser.add_argument("--models", default="", help="Comma-separated Groq models to use in fallback order")

    # argv=None makes argparse fall back to sys.argv[1:].
    args = parser.parse_args(argv)

    # Keep CLI calls short by treating bare names as files under data/refdata.
    args.input = resolve_data_path(args.input)
    args.blueprint = resolve_data_path(args.blueprint)
    args.refs = resolve_ref_path(args.refs)
    args.manual_refs = resolve_ref_path(args.manual_refs)
    return args