File size: 2,819 Bytes
dc06d4c
 
 
 
c6a3f44
 
dc06d4c
 
 
 
c6a3f44
7583b3e
 
 
 
dc06d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import argparse
from dotenv import load_dotenv

# Load local .env values for development; Hugging Face injects the same names
# as environment variables in production.
# This call has to stay ahead of the os.getenv() reads that follow it;
# otherwise values defined only in .env would not be visible to them.
load_dotenv()

# --- ENVIRONMENT VARIABLES to be set up in .env ---
# Every value is None when its variable is unset, except RAW_MODELS, which
# falls back to an empty string so that it can always be split.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
RAW_MODELS = os.environ.get("GROQ_MODEL", "")
APP_USERNAME = os.environ.get("APP_USERNAME")
APP_PASSWORD = os.environ.get("APP_PASSWORD")
SPACE_ID = os.environ.get("SPACE_ID")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Split the comma-separated model string, trimming surrounding whitespace and
# dropping empty entries (an unset GROQ_MODEL therefore yields an empty list).
AVAILABLE_MODELS = [name for name in map(str.strip, RAW_MODELS.split(",")) if name]

# --- PIPELINE THRESHOLDS & CONSTANTS ---
# NOTE(review): neither of the next two values is referenced in the visible
# part of this module; presumably they tune a similarity-matching step
# elsewhere in the pipeline — confirm against the consumer.
DEFAULT_SIMILARITY_THRESHOLD = 0.93
TOP_K_CANDIDATES = 5

# Default for the --sheet CLI option (the source sheet to clean).
SHEET_NAME = "Data"
# Directory that resolve_data_path() prepends to bare file names.
DATA_DIR = "data"
# Directory that resolve_ref_path() prepends to bare file names.
REFDATA_DIR = "refdata"
# Default for the --output_sheet CLI option (the sheet to create).
DEFAULT_OUTPUT_SHEET_NAME = "Cleaned_Data"

# Default file names for --blueprint, --refs and --manual_refs respectively.
# They are bare names, so parse_cli_args() resolves the first under DATA_DIR
# and the other two under REFDATA_DIR.
DEFAULT_BLUEPRINT_FILE = "Blueprint.xlsx"
DEFAULT_OFFICIAL_REFS_FILE = "official_references.json"
DEFAULT_MANUAL_REFS_FILE = "manual_references.json"

def resolve_data_path(file_arg):
    """Map a bare file name onto the project data directory.

    Empty or None values, absolute paths, and paths that already contain a
    directory component are returned exactly as given; only a plain file
    name is joined onto DATA_DIR.
    """
    if file_arg:
        already_located = os.path.isabs(file_arg) or os.path.dirname(file_arg)
        if not already_located:
            # Plain file name: place it under the data directory.
            return os.path.join(DATA_DIR, file_arg)
    return file_arg

def resolve_ref_path(file_arg):
    """Map a bare reference file name onto the project refdata directory.

    Anything that is empty, absolute, or already carries a directory
    component is handed back untouched.
    """
    # Short-circuit order mirrors the checks: emptiness, then absoluteness,
    # then presence of a directory part.
    is_bare_name = (
        bool(file_arg)
        and not os.path.isabs(file_arg)
        and not os.path.dirname(file_arg)
    )
    return os.path.join(REFDATA_DIR, file_arg) if is_bare_name else file_arg

def parse_cli_args(argv=None):
    """Parse shared CLI arguments used by both local runs and the Flask UI.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When None (the default), argparse reads
            ``sys.argv[1:]``, which matches the previous behavior. Passing
            an explicit list lets a non-CLI caller reuse this parser without
            touching ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``input``, ``blueprint``,
        ``refs``, ``manual_refs``, ``sheet``, ``output_sheet`` and
        ``models``. The two data files have been passed through
        resolve_data_path() and the two reference files through
        resolve_ref_path(); ``models`` is left as the raw comma-separated
        string.

    Raises:
        SystemExit: Raised by argparse (after printing usage) when --input
            is missing or an unrecognized option is supplied.
    """
    parser = argparse.ArgumentParser(description="MasterMap Data Normalization Pipeline")
    parser.add_argument("--input", required=True, help="Raw input Excel file name inside data/")
    parser.add_argument("--blueprint", default=DEFAULT_BLUEPRINT_FILE, help="Blueprint Excel file name inside data/")
    parser.add_argument("--refs", default=DEFAULT_OFFICIAL_REFS_FILE, help="Official references JSON file name inside refdata/")
    parser.add_argument("--manual_refs", default=DEFAULT_MANUAL_REFS_FILE, help="Manual overrides JSON file name inside refdata/")
    parser.add_argument("--sheet", default=SHEET_NAME, help="Source sheet name to clean")
    parser.add_argument("--output_sheet", default=DEFAULT_OUTPUT_SHEET_NAME, help="Output sheet name to create")
    parser.add_argument("--models", default="", help="Comma-separated Groq models to use in fallback order")

    # argv=None makes argparse fall back to sys.argv[1:].
    args = parser.parse_args(argv)
    # Keep CLI calls short by treating bare names as files under data/refdata.
    args.input = resolve_data_path(args.input)
    args.blueprint = resolve_data_path(args.blueprint)
    args.refs = resolve_ref_path(args.refs)
    args.manual_refs = resolve_ref_path(args.manual_refs)
    return args