andrewbejjani committed on
Commit
dc06d4c
·
0 Parent(s):

Initial clean commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ * text=auto eol=lf
2
+ *.7z filter=lfs diff=lfs merge=lfs -text
3
+ *.arrow filter=lfs diff=lfs merge=lfs -text
4
+ *.bin filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
7
+ *.ftz filter=lfs diff=lfs merge=lfs -text
8
+ *.gz filter=lfs diff=lfs merge=lfs -text
9
+ *.h5 filter=lfs diff=lfs merge=lfs -text
10
+ *.joblib filter=lfs diff=lfs merge=lfs -text
11
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/app/.cache/huggingface
ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/sentence-transformers

WORKDIR /app

# Install dependencies first so this layer is reused until requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download the embedding model BEFORE copying the application source.
# The download only depends on the installed packages, so keeping it above
# `COPY . .` lets Docker reuse the (large) model layer when only code changes.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

COPY . .

# Serve the UI on port 7860 (matches `app_port` in README.md) with a single
# worker/thread and a one-hour request timeout.
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--threads", "1", "--timeout", "3600", "ui_app:app"]
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MasterMap Cleaner
3
+ sdk: docker
4
+ app_port: 7860
5
+ ---
6
+
7
+ ## Hugging Face Secrets
8
+
9
+ Set these in the Space settings:
10
+
11
+ - `GROQ_API_KEY`: required for Groq model calls.
12
+ - `HF_TOKEN`: optional; needed only for the `Save Manual References` button.
13
+
14
+ The `Save Manual References` button is enabled only on Hugging Face Spaces, when `SPACE_ID` is present and `HF_TOKEN` is configured. It commits the current `refdata/manual_references.json` back to the Space repository.
apply_blueprint.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import openpyxl
3
+ import os
4
+ import json
5
+ import argparse
6
+ from src.config import (
7
+ DEFAULT_BLUEPRINT_FILE,
8
+ DEFAULT_OFFICIAL_REFS_FILE,
9
+ DEFAULT_MANUAL_REFS_FILE,
10
+ DEFAULT_OUTPUT_SHEET_NAME,
11
+ resolve_data_path,
12
+ resolve_ref_path,
13
+ )
14
+ from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains
15
+
16
def parse_args():
    """Parse the command line and resolve every file argument to a full path.

    Returns:
        The argparse namespace, with ``input`` and ``blueprint`` passed through
        ``resolve_data_path`` and ``refs`` and ``manual_refs`` passed through
        ``resolve_ref_path``. ``sheet`` is left as given.
    """
    cli = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")
    cli.add_argument("--input", required=True, help="Master Excel file name inside data/")

    # Optional flags share the same shape: (flag, default, help text).
    optional_flags = (
        ("--blueprint", DEFAULT_BLUEPRINT_FILE, "Blueprint Excel file name inside data/"),
        ("--refs", DEFAULT_OFFICIAL_REFS_FILE, "Official references JSON file name inside refdata/"),
        ("--manual_refs", DEFAULT_MANUAL_REFS_FILE, "Manual overrides JSON file name inside refdata/"),
        ("--sheet", DEFAULT_OUTPUT_SHEET_NAME, "Workbook sheet where blueprint corrections should be applied"),
    )
    for flag, default_value, help_text in optional_flags:
        cli.add_argument(flag, default=default_value, help=help_text)

    parsed = cli.parse_args()

    # Same resolution order as the flags above: data files, then reference files.
    for attribute in ("input", "blueprint"):
        setattr(parsed, attribute, resolve_data_path(getattr(parsed, attribute)))
    for attribute in ("refs", "manual_refs"):
        setattr(parsed, attribute, resolve_ref_path(getattr(parsed, attribute)))
    return parsed
30
+
31
def load_json_safe(filepath):
    """Load a JSON object from ``filepath``, falling back to an empty dict.

    The file is read as UTF-8 with an optional BOM (``utf-8-sig``).

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON object. An empty dict is returned when the file does
        not exist, cannot be read, is not valid JSON, or does not contain a
        JSON object at the top level (callers use ``.get`` on the result).
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing memory file is the normal "nothing remembered yet" case.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Stay best-effort, but make the problem visible instead of hiding it.
        print(f"Warning: could not load JSON from {filepath}: {exc}")
        return {}

    if not isinstance(data, dict):
        print(f"Warning: expected a JSON object in {filepath}, got {type(data).__name__}; ignoring it.")
        return {}
    return data
37
+
38
def split_approved_parts(value):
    """Split a comma-separated cell value into its non-empty, trimmed parts.

    Missing values (anything ``pd.isna`` reports as NA) yield an empty list.
    Other values are converted with ``str`` before splitting.
    """
    if pd.isna(value):
        return []
    trimmed_pieces = (piece.strip() for piece in str(value).split(","))
    return [piece for piece in trimmed_pieces if piece]
42
+
43
def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-reference container for ``column_name``, creating it if absent.

    A new container mirrors the official one: a dict when the official entry
    for the column is a dict, otherwise a list. The new container is stored in
    ``manual_refs`` before being returned.
    """
    if column_name in manual_refs:
        return manual_refs[column_name]

    official_is_mapping = isinstance(official_refs.get(column_name), dict)
    fresh_bucket = {} if official_is_mapping else []
    manual_refs[column_name] = fresh_bucket
    return manual_refs[column_name]
47
+
48
def remember_approved_value(manual_refs, official_refs, column_name, approved_value):
    """Record the parts of an approved value that are not yet known.

    ``approved_value`` is split with ``split_approved_parts``. Each part is
    skipped when ``ref_contains`` finds it in the official references or in
    the manual bucket for the column. Otherwise it is appended to a list
    bucket, or stored in a dict bucket under ``normalize_ref(part)``.

    Returns:
        The number of parts added to ``manual_refs``.
    """
    bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    official_bucket = official_refs.get(column_name, [])
    newly_added = 0

    for candidate in split_approved_parts(approved_value):
        already_known = ref_contains(official_bucket, candidate) or ref_contains(bucket, candidate)
        if already_known:
            continue

        if isinstance(bucket, list):
            bucket.append(candidate)
        elif isinstance(bucket, dict):
            bucket[normalize_ref(candidate)] = candidate
        else:
            # Any other container type is left untouched and not counted.
            continue
        newly_added += 1

    return newly_added
66
+
67
if __name__ == "__main__":
    # Entry point: apply the reviewer's blueprint decisions to the master
    # workbook and record newly approved values in the manual references file.
    args = parse_args()

    print("Loading Master Data, Blueprint, and Memory Files...")
    if not os.path.exists(args.blueprint):
        print(f"Error: No blueprint found at {args.blueprint} to apply.")
        # Non-zero status so callers can detect the failure. `raise SystemExit`
        # also avoids the interactive `exit()` helper injected by `site`.
        raise SystemExit(1)

    bp_df = pd.read_excel(args.blueprint)
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        print(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")
        raise SystemExit(1)

    # Load the target Excel workbook
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        print(f"Error: No '{args.sheet}' sheet found in {args.input}.")
        raise SystemExit(1)

    sheet = wb[args.sheet]
    # Map each non-empty header in row 1 (stripped) to its 1-based column index.
    col_name_to_idx = {
        str(sheet.cell(row=1, column=c).value).strip(): c
        for c in range(1, sheet.max_column + 1)
        if sheet.cell(row=1, column=c).value
    }

    # Load the memory dictionaries using the synced CLI path
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)

    changes_made = 0
    memory_additions = 0

    print("Applying manual overrides and updating memory...")
    for _, row in bp_df.iterrows():
        human_val = str(row['Human_Override']).strip() if pd.notna(row['Human_Override']) else ""

        # The approved value is the human override when present, otherwise the AI suggestion.
        if human_val:
            approved_val = human_val
        elif pd.notna(row['AI_Suggested_Match']):
            approved_val = str(row['AI_Suggested_Match']).strip()
        else:
            approved_val = ""

        # "Confidence" is not a required column, so read it defensively rather
        # than crashing with KeyError on blueprints that lack it.
        confidence_raw = row.get('Confidence')
        confidence = str(confidence_raw).strip().upper() if pd.notna(confidence_raw) else ""

        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue

        raw_col = str(row['Column']).strip()

        if human_val:
            # 1. Update the Excel File
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError, OverflowError):
                # OverflowError: int() of an infinite float.
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue

            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue

            if excel_row < 1 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside {args.sheet}.")
                continue

            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1

        # 2. Update Manual References for human overrides and accepted AI suggestions.
        if raw_col == "Degree":
            # Degree values are never stored in the manual references.
            continue

        if not human_val and confidence == "LOW":
            # Unreviewed LOW-confidence AI suggestions are not remembered.
            continue

        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )

    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)

    # Save Excel
    wb.save(args.input)

    # Save JSONs
    # Make sure the data directory exists before dumping
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")
main.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sentence_transformers import SentenceTransformer
3
+ import os
4
+ import sys
5
+ import json
6
+ import openpyxl
7
+ from openpyxl.styles import PatternFill, Font
8
+ from openpyxl.utils import get_column_letter
9
+ from openpyxl.worksheet.datavalidation import DataValidation
10
+ from openpyxl.workbook.defined_name import DefinedName
11
+
12
+ # Import our new modular architecture
13
+ from src.config import parse_cli_args, GROQ_API_KEY, AVAILABLE_MODELS, DEFAULT_SIMILARITY_THRESHOLD
14
+ from src.llm_router import GroqRouter
15
+ from src.data_pipeline import process_column, cluster_degrees_by_institution
16
+ from src.utils import prune_manual_refs_against_official
17
+
18
# --- 1. CONFIGURATION ---
# Maps each column to process to the regex pattern passed as `split_pattern`
# to the pipeline. Iteration order is the processing order.
# NOTE(review): the "Institution" pattern is a character class and therefore
# also matches a literal "|", unlike the alternation patterns used for the
# other columns -- confirm this difference is intentional.
COLUMNS_CONFIG = {
    "Country": r',|;|\n|/',
    "Institution": r'[,/;|\n]',
    "Continent": r',|;|\n|/',
    "City": r',|;|\n|/',
    "Level": r'\n|;',
    "Language": r',|;|\n|/',
    "Tags": r',|;|\n|/',
    "Degree": r'\n|;'
}

# Shared cache dict handed to the pipeline functions as `master_cache`.
master_cache = {}
31
+
32
def load_json_safe(filepath):
    """Read ``filepath`` as UTF-8 (optional BOM allowed) and return the decoded JSON.

    Errors from opening the file or decoding its contents propagate to the caller.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as source:
        raw_text = source.read()
        return json.loads(raw_text)
35
+
36
def validate_official_refs(official_refs):
    """Ensure every configured column except "Degree" has non-empty official references.

    Raises:
        ValueError: when one or more columns have no list/dict entry, or an
            empty one, in ``official_refs``. The message names those columns.
    """
    def _is_usable(ref_data):
        return isinstance(ref_data, (list, dict)) and len(ref_data) != 0

    missing = [
        column_name
        for column_name in COLUMNS_CONFIG
        if column_name != "Degree" and not _is_usable(official_refs.get(column_name))
    ]
    if not missing:
        return

    raise ValueError(
        "Official references are missing or empty for: "
        + ", ".join(missing)
        + ". Refusing to run because this would send too many values to Groq."
    )
52
+
53
def inject_searchable_dropdowns(blueprint_path, master_unique_lists):
    """Add per-category dropdown lists to the blueprint workbook and save it.

    A "Reference_Lists" sheet receives one column per entry of
    ``master_unique_lists`` (header in row 1, sorted string items below), and
    each non-empty column is registered as a named range. The
    "Human_Override" column of the active sheet then gets a list validation
    that looks up the named range matching the row's "Column" value.
    """
    print("Injecting static searchable dropdowns into Blueprint...")
    workbook = openpyxl.load_workbook(blueprint_path)
    review_sheet = workbook.active

    # 1. Reference sheet: one column per category.
    lists_sheet = workbook.create_sheet(title="Reference_Lists")

    for position, (column_name, unique_items) in enumerate(master_unique_lists.items(), start=1):
        range_name = column_name.replace(" ", "_")
        lists_sheet.cell(row=1, column=position, value=range_name)

        # Keep only non-empty strings, alphabetized for easier browsing.
        choices = [item for item in unique_items if item and isinstance(item, str)]
        choices.sort()

        for offset, choice in enumerate(choices):
            lists_sheet.cell(row=offset + 2, column=position, value=choice)

        # 2. Named range covering the items just written (skipped when empty).
        if not choices:
            continue
        col_letter = get_column_letter(position)
        last_row = len(choices) + 1
        workbook.defined_names.add(
            DefinedName(
                name=range_name,
                attr_text=f"Reference_Lists!${col_letter}$2:${col_letter}${last_row}",
            )
        )

    # 3. Find the letters of the "Column" and "Human_Override" header cells.
    category_letter = None
    override_letter = None
    for header_cell in review_sheet[1]:
        header_text = header_cell.value
        if header_text == "Column":
            category_letter = get_column_letter(header_cell.column)
        elif header_text == "Human_Override":
            override_letter = get_column_letter(header_cell.column)

    # 4. Attach the list validation to every data row of the override column.
    if category_letter and override_letter:
        validation = DataValidation(
            type="list",
            formula1=f'=INDIRECT(SUBSTITUTE(${category_letter}2, " ", "_"))',
            allowBlank=True,
            showErrorMessage=False  # CRITICAL: lets the user type a value that is not in the list.
        )
        validation.add(f"{override_letter}2:{override_letter}{review_sheet.max_row}")
        review_sheet.add_data_validation(validation)

    lists_sheet.sheet_state = 'hidden'
    workbook.save(blueprint_path)
    print("Dropdowns successfully injected!")
108
+
109
+
110
if __name__ == "__main__":
    # Entry point: clean the configured columns of the master workbook, write
    # a blueprint of matches needing review, and copy the cleaned values into
    # a new output sheet of the same workbook.

    # --- 2. INITIALIZATION ---
    args = parse_cli_args()
    source_sheet_name = args.sheet
    output_sheet_name = args.output_sheet
    available_models = [m.strip() for m in args.models.split(",") if m.strip()] if args.models else AVAILABLE_MODELS

    # Fail fast: the output sheet is deleted and recreated below, so it must
    # not be the source sheet. Checking here avoids running the whole pipeline
    # (and writing the memory/blueprint files) before hitting this error.
    if source_sheet_name == output_sheet_name:
        raise ValueError("Output sheet name cannot match the source sheet name.")

    print("Loading AI Model (this may take a few seconds)...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Initialize our LLM Router
    router = GroqRouter(api_key=GROQ_API_KEY, available_models=available_models)

    if not os.path.exists(args.refs):
        raise FileNotFoundError(f"Official references file not found: {args.refs}")

    if not os.path.exists(args.manual_refs):
        # os.makedirs("") raises, so only create a directory when the path has one.
        manual_refs_dir = os.path.dirname(args.manual_refs)
        if manual_refs_dir:
            os.makedirs(manual_refs_dir, exist_ok=True)
        with open(args.manual_refs, 'w', encoding='utf-8') as f:
            json.dump({}, f)

    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)
    validate_official_refs(official_refs)
    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)
    if memory_pruned:
        print(f"[INFO] Removed {memory_pruned} manual reference duplicate(s) already covered by official refs.")

    print(f"Loading Excel dataset from {args.input}, sheet '{source_sheet_name}'...")
    # skiprows=[1] drops the second sheet row, so data row i maps to Excel row i + 3 below.
    data = pd.read_excel(args.input, sheet_name=source_sheet_name, skiprows=[1])

    # Initialize the global Blueprint Logger
    blueprint_records = []

    # --- 3. EXECUTE BATCH PIPELINE ---
    for col, pattern in COLUMNS_CONFIG.items():
        if col == "Degree":
            # Prefer the cleaned institution column when an earlier step produced it.
            inst_col = 'Cleaned_Institution' if 'Cleaned_Institution' in data.columns else 'Institution'
            data = cluster_degrees_by_institution(
                df=data, degree_col=col, inst_col=inst_col, model=model,
                master_cache=master_cache, blueprint_data=blueprint_records,
                threshold=DEFAULT_SIMILARITY_THRESHOLD
            )
        else:
            data = process_column(
                df=data, column_name=col, model=model, groq_router=router,
                official_refs=official_refs, manual_refs=manual_refs, master_cache=master_cache,
                split_pattern=pattern, blueprint_data=blueprint_records
            )

    # --- 4. EXPORT RESULTS ---
    print("\nSaving all memory files...")
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    # 4a. Export the Blueprint for Human Review
    if blueprint_records:
        bp_df = pd.DataFrame(blueprint_records)
        bp_df.to_excel(args.blueprint, index=False)

        # --- Format the Blueprint Visually ---
        bp_wb = openpyxl.load_workbook(args.blueprint)
        bp_sheet = bp_wb.active

        header_fill = PatternFill(start_color="1F4E78", end_color="1F4E78", fill_type="solid")
        header_font = Font(color="FFFFFF", bold=True)
        high_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        med_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")
        low_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

        conf_col_idx = None
        for col_idx in range(1, bp_sheet.max_column + 1):
            cell = bp_sheet.cell(row=1, column=col_idx)
            cell.fill = header_fill
            cell.font = header_font
            if cell.value == "Confidence":
                conf_col_idx = col_idx
            bp_sheet.column_dimensions[get_column_letter(col_idx)].width = 30

        if conf_col_idx:
            # Colour each confidence cell by the level named in its text.
            for row_idx in range(2, bp_sheet.max_row + 1):
                cell = bp_sheet.cell(row=row_idx, column=conf_col_idx)
                val = str(cell.value).upper()
                if "HIGH" in val:
                    cell.fill = high_fill
                elif "MEDIUM" in val:
                    cell.fill = med_fill
                elif "LOW" in val:
                    cell.fill = low_fill

        bp_wb.save(args.blueprint)
        print(f"[!] Saved and formatted {len(bp_df)} rows requiring review to {args.blueprint}")

        # --- NEW: Build master lists and inject dropdowns ---
        def extract_uniques(ref_data):
            """Helper to extract names whether the memory file is a list or a dict"""
            if isinstance(ref_data, dict):
                return list(ref_data.values())
            elif isinstance(ref_data, list):
                return ref_data
            return []

        master_lists = {}
        for category in COLUMNS_CONFIG.keys():
            off_items = extract_uniques(official_refs.get(category, []))
            man_items = extract_uniques(manual_refs.get(category, []))
            # Merge, deduplicate, and remove blanks
            master_lists[category] = list(set([x for x in (off_items + man_items) if x]))

        inject_searchable_dropdowns(args.blueprint, master_lists)

    else:
        print("[!] No blueprint generated. All matches were HIGH confidence!")

    # 4b. Inject Cleaned Data to Mastermap
    print("\nOpening original Excel file to preserve formatting...")
    wb = openpyxl.load_workbook(args.input)
    new_sheet_name = output_sheet_name

    source_sheet = wb[source_sheet_name]

    # Replace any previous output sheet with a fresh copy of the source sheet.
    if new_sheet_name in wb.sheetnames:
        del wb[new_sheet_name]
    new_sheet = wb.copy_worksheet(source_sheet)
    new_sheet.title = new_sheet_name

    col_name_to_idx = {new_sheet.cell(row=1, column=c).value: c for c in range(1, new_sheet.max_column + 1) if new_sheet.cell(row=1, column=c).value}

    for row_idx, (_, row_data) in enumerate(data.iterrows()):
        # Header is Excel row 1 and row 2 was skipped on load, so data starts at row 3.
        excel_row = row_idx + 3
        for col_name in COLUMNS_CONFIG.keys():
            cleaned_col_name = f"Cleaned_{col_name}"
            if cleaned_col_name in data.columns and col_name in col_name_to_idx:
                new_value = row_data[cleaned_col_name]
                new_sheet.cell(row=excel_row, column=col_name_to_idx[col_name]).value = None if pd.isna(new_value) else new_value

    wb.save(args.input)
    print(f"\nSuccess! Initial pass saved. Please review {args.blueprint}.")
newest_model.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from typing import Any
4
+
5
+ import requests
6
+ from dotenv import load_dotenv
7
+
8
+
9
+ load_dotenv()
10
+
11
GROQ_MODELS_URL = "https://api.groq.com/openai/v1/models"

# Models selected from Groq's rate-limit table for this workload: relevant
# chat/reasoning models with at least 1K requests per day.
# The list order is the fallback order used by the "stable" strategy.
# NOTE: the Llama Prompt Guard models are deliberately NOT listed. They are
# prompt-injection classifiers, not chat models, so they cannot serve as
# fallbacks for text-generation calls.
PREFERRED_PRODUCTION_CHAT_MODELS = [
    "llama-3.3-70b-versatile",
    "openai/gpt-oss-120b",
    "qwen/qwen3-32b",
    "llama-3.1-8b-instant",
    "openai/gpt-oss-20b",
    "meta-llama/llama-4-scout-17b-16e-instruct",
]

# A model whose id contains any of these substrings is never selected.
EXCLUDED_MODEL_TERMS = [
    "whisper",
    "tts",
    "audio",
    "moderation",
    "embed",
    "compound",
    "safeguard",
    "guard",
]

# Lower-cased ids for case-insensitive membership checks.
PREFERRED_MODEL_IDS = {model_id.lower() for model_id in PREFERRED_PRODUCTION_CHAT_MODELS}
37
+
38
+
39
def fetch_groq_models(api_key: str) -> list[dict[str, Any]]:
    """Request the model list from the Groq Models endpoint.

    Sends a GET with the key as a bearer token and a 30-second timeout.

    Returns:
        The value under the ``"data"`` key of the JSON response, or an empty
        list when that key is absent.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on an error status code.
    """
    reply = requests.get(
        GROQ_MODELS_URL,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        timeout=30,
    )
    reply.raise_for_status()
    payload = reply.json()
    return payload.get("data", [])
47
+
48
+
49
def is_active_chat_model(model: dict[str, Any]) -> bool:
    """Tell whether a model entry is eligible for selection.

    An entry qualifies when it has a non-empty id, is not explicitly marked
    ``active: False``, its lower-cased id is in ``PREFERRED_MODEL_IDS``, and
    the id contains none of ``EXCLUDED_MODEL_TERMS``.
    """
    identifier = str(model.get("id", "")).lower()
    disqualified = (
        not identifier
        or model.get("active") is False
        or identifier not in PREFERRED_MODEL_IDS
        or any(term in identifier for term in EXCLUDED_MODEL_TERMS)
    )
    return not disqualified
58
+
59
+
60
def rank_model(model: dict[str, Any]) -> tuple[int, int, int, str]:
    """Build the sort key for a model entry; the smallest tuple ranks first.

    The key is, in order: the position of the id in
    ``PREFERRED_PRODUCTION_CHAT_MODELS`` (case-insensitive; the list length
    when absent), the negated ``created`` value, the negated
    ``context_window`` value, and the id itself as a tie-breaker.
    """
    identifier = str(model.get("id", ""))
    needle = identifier.lower()
    lowered_preferences = [name.lower() for name in PREFERRED_PRODUCTION_CHAT_MODELS]

    if needle in lowered_preferences:
        preferred_rank = lowered_preferences.index(needle)
    else:
        # Ids outside the preferred list sort after every preferred one.
        preferred_rank = len(PREFERRED_PRODUCTION_CHAT_MODELS)

    # Missing or falsy values count as 0; negation puts larger values first.
    newest_first = -int(model.get("created") or 0)
    largest_context_first = -int(model.get("context_window") or 0)

    return (preferred_rank, newest_first, largest_context_first, identifier)
75
+
76
+
77
def select_groq_chat_models(limit: int = len(PREFERRED_PRODUCTION_CHAT_MODELS), strategy: str = "stable") -> list[str]:
    """Return up to ``limit`` eligible Groq model ids in fallback order.

    Args:
        limit: Maximum number of ids to return.
        strategy: ``"latest"`` orders by newest ``created`` value, then larger
            ``context_window``, then id. Any other value orders with
            ``rank_model``.

    Raises:
        RuntimeError: when the ``GROQ_API_KEY`` environment variable is unset or empty.
    """
    token = os.getenv("GROQ_API_KEY")
    if not token:
        raise RuntimeError("GROQ_API_KEY is missing. Add it to .env first.")

    candidates = [entry for entry in fetch_groq_models(token) if is_active_chat_model(entry)]

    if strategy == "latest":
        def ordering(entry):
            return (
                -int(entry.get("created") or 0),
                -int(entry.get("context_window") or 0),
                str(entry.get("id", "")),
            )
    else:
        ordering = rank_model

    candidates.sort(key=ordering)
    return [entry["id"] for entry in candidates[:limit]]
98
+
99
+
100
def main() -> None:
    """Parse ``--limit`` / ``--strategy`` and print the selected fallback models.

    Raises:
        RuntimeError: when the selection comes back empty.
    """
    cli = argparse.ArgumentParser(description="Select currently available Groq chat models.")
    cli.add_argument(
        "--limit",
        type=int,
        default=len(PREFERRED_PRODUCTION_CHAT_MODELS),
        help="Number of fallback models to print.",
    )
    cli.add_argument(
        "--strategy",
        choices=["stable", "latest"],
        default="stable",
        help="stable prefers known production chat models; latest picks newest active chat models.",
    )
    options = cli.parse_args()

    selected = select_groq_chat_models(limit=options.limit, strategy=options.strategy)
    if not selected:
        raise RuntimeError("No active Groq chat models were returned by the Models API.")

    joined_ids = ','.join(selected)
    print("Recommended GROQ_MODEL value:")
    print(f"GROQ_MODEL={joined_ids}")
    print("\nSelected fallback order:")
    position = 1
    for model_id in selected:
        print(f"{position}. {model_id}")
        position += 1
120
+
121
+
122
+ if __name__ == "__main__":
123
+ main()
refdata/manual_references.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Country": [
3
+ "Hong Kong"
4
+ ],
5
+ "Institution": [
6
+ "Centre Interdisciplinaire de Formation à la Fonction Personnel",
7
+ "Faculté de médecine Paris-Descartes",
8
+ "Politecnico di Milano",
9
+ "INSTN",
10
+ "Université Paris-Est Marne-la-Vallée",
11
+ "CEMS",
12
+ "Conservatoire National Supérieur d'Art Dramatique (CNSAD)",
13
+ "EDC Paris Business School",
14
+ "EFREI",
15
+ "Catolica Lisbon School of Business & Economics",
16
+ "Ecole des Mines Paris - Université PSL",
17
+ "École d'urbanisme de Paris",
18
+ "ESMOD Paris",
19
+ "HEAD Genève (Haute école d'art et de design)",
20
+ "MODART International",
21
+ "University of Mumbai",
22
+ "Oxford Brookes University",
23
+ "Shanghai Advanced Institute of Finance",
24
+ "The University of Hong Kong (HKU)",
25
+ "Université de Montpellier Paul Valéry",
26
+ "IAE Paris",
27
+ "The University of Sydney Business School",
28
+ "Korea University Business School",
29
+ "EHL Business School",
30
+ "Technische Universität Dortmund"
31
+ ],
32
+ "Continent": [],
33
+ "City": [
34
+ "Champs-sur-Marne",
35
+ "Croix",
36
+ "Hanoi",
37
+ "Pantin",
38
+ "Versailles",
39
+ "Mumbai",
40
+ "Oxford",
41
+ "Suresnes",
42
+ "Aubervilliers",
43
+ "Dortmund",
44
+ "Mannheim"
45
+ ],
46
+ "Level": {},
47
+ "Language": {},
48
+ "Tags": [
49
+ "Sociology",
50
+ "Fashion Management",
51
+ "Geoeconomy",
52
+ "Humanitarian",
53
+ "Sustainable Finance",
54
+ "Fashion",
55
+ "Financial Risks",
56
+ "Digital Creativity",
57
+ "International Marketing",
58
+ "Sales Management",
59
+ "Data Management",
60
+ "Journalism",
61
+ "Arts & Culture",
62
+ "Environmental Governance",
63
+ "Neurology",
64
+ "Dentist",
65
+ "Financial Control",
66
+ "Biodiversity",
67
+ "Languages",
68
+ "Financial Engineering",
69
+ "Biology"
70
+ ]
71
+ }
refdata/official_references.json ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Country": [
3
+ "Afghanistan",
4
+ "Albania",
5
+ "Algeria",
6
+ "Andorra",
7
+ "Angola",
8
+ "Antigua and Barbuda",
9
+ "Argentina",
10
+ "Armenia",
11
+ "Australia",
12
+ "Austria",
13
+ "Azerbaijan",
14
+ "Bahamas",
15
+ "Bahrain",
16
+ "Bangladesh",
17
+ "Barbados",
18
+ "Belarus",
19
+ "Belgium",
20
+ "Belize",
21
+ "Benin",
22
+ "Bhutan",
23
+ "Bolivia",
24
+ "Bosnia and Herzegovina",
25
+ "Botswana",
26
+ "Brazil",
27
+ "Brunei",
28
+ "Bulgaria",
29
+ "Burkina Faso",
30
+ "Burundi",
31
+ "Cabo Verde",
32
+ "Cambodia",
33
+ "Cameroon",
34
+ "Canada",
35
+ "Central African Republic",
36
+ "Chad",
37
+ "Chile",
38
+ "China",
39
+ "Colombia",
40
+ "Comoros",
41
+ "Congo",
42
+ "Costa Rica",
43
+ "Croatia",
44
+ "Cuba",
45
+ "Cyprus",
46
+ "Czech Republic",
47
+ "Democratic Republic of the Congo",
48
+ "Denmark",
49
+ "Djibouti",
50
+ "Dominica",
51
+ "Dominican Republic",
52
+ "Ecuador",
53
+ "Egypt",
54
+ "El Salvador",
55
+ "Equatorial Guinea",
56
+ "Eritrea",
57
+ "Estonia",
58
+ "Eswatini",
59
+ "Ethiopia",
60
+ "Fiji",
61
+ "Finland",
62
+ "France",
63
+ "Gabon",
64
+ "Gambia",
65
+ "Georgia",
66
+ "Germany",
67
+ "Ghana",
68
+ "Greece",
69
+ "Grenada",
70
+ "Guatemala",
71
+ "Guinea",
72
+ "Guinea-Bissau",
73
+ "Guyana",
74
+ "Haiti",
75
+ "Honduras",
76
+ "Hungary",
77
+ "Iceland",
78
+ "India",
79
+ "Indonesia",
80
+ "Iran",
81
+ "Iraq",
82
+ "Ireland",
83
+ "Israel",
84
+ "Italy",
85
+ "Ivory Coast",
86
+ "Jamaica",
87
+ "Japan",
88
+ "Jordan",
89
+ "Kazakhstan",
90
+ "Kenya",
91
+ "Kiribati",
92
+ "Kosovo",
93
+ "Kuwait",
94
+ "Kyrgyzstan",
95
+ "Laos",
96
+ "Latvia",
97
+ "Lebanon",
98
+ "Lesotho",
99
+ "Liberia",
100
+ "Libya",
101
+ "Liechtenstein",
102
+ "Lithuania",
103
+ "Luxembourg",
104
+ "Madagascar",
105
+ "Malawi",
106
+ "Malaysia",
107
+ "Maldives",
108
+ "Mali",
109
+ "Malta",
110
+ "Marshall Islands",
111
+ "Mauritania",
112
+ "Mauritius",
113
+ "Mexico",
114
+ "Micronesia",
115
+ "Moldova",
116
+ "Monaco",
117
+ "Mongolia",
118
+ "Montenegro",
119
+ "Morocco",
120
+ "Mozambique",
121
+ "Myanmar",
122
+ "Namibia",
123
+ "Nauru",
124
+ "Nepal",
125
+ "Netherlands",
126
+ "New Zealand",
127
+ "Nicaragua",
128
+ "Niger",
129
+ "Nigeria",
130
+ "North Korea",
131
+ "North Macedonia",
132
+ "Norway",
133
+ "Oman",
134
+ "Pakistan",
135
+ "Palau",
136
+ "Palestine",
137
+ "Panama",
138
+ "Papua New Guinea",
139
+ "Paraguay",
140
+ "Peru",
141
+ "Philippines",
142
+ "Poland",
143
+ "Portugal",
144
+ "Qatar",
145
+ "Romania",
146
+ "Russia",
147
+ "Rwanda",
148
+ "Saint Kitts and Nevis",
149
+ "Saint Lucia",
150
+ "Saint Vincent and the Grenadines",
151
+ "Samoa",
152
+ "San Marino",
153
+ "Sao Tome and Principe",
154
+ "Saudi Arabia",
155
+ "Senegal",
156
+ "Serbia",
157
+ "Seychelles",
158
+ "Sierra Leone",
159
+ "Singapore",
160
+ "Slovakia",
161
+ "Slovenia",
162
+ "Solomon Islands",
163
+ "Somalia",
164
+ "South Africa",
165
+ "South Korea",
166
+ "South Sudan",
167
+ "Spain",
168
+ "Sri Lanka",
169
+ "Sudan",
170
+ "Suriname",
171
+ "Sweden",
172
+ "Switzerland",
173
+ "Syria",
174
+ "Taiwan",
175
+ "Tajikistan",
176
+ "Tanzania",
177
+ "Thailand",
178
+ "Timor-Leste",
179
+ "Togo",
180
+ "Tonga",
181
+ "Trinidad and Tobago",
182
+ "Tunisia",
183
+ "Turkey",
184
+ "Turkmenistan",
185
+ "Tuvalu",
186
+ "Uganda",
187
+ "Ukraine",
188
+ "United Arab Emirates",
189
+ "United Kingdom",
190
+ "United States",
191
+ "Uruguay",
192
+ "Uzbekistan",
193
+ "Vanuatu",
194
+ "Vatican City",
195
+ "Venezuela",
196
+ "Vietnam",
197
+ "Yemen",
198
+ "Zambia",
199
+ "Zimbabwe"
200
+ ],
201
+ "Institution": [
202
+ "AgroParisTech (Institut des sciences et industries du vivant et de l'environnement)",
203
+ "Aix Marseille Université (AMU)",
204
+ "Alliance Manchester Business School (The University of Manchester)",
205
+ "American University",
206
+ "Audencia Business School",
207
+ "Babson College",
208
+ "Bayes Business School (formerly Cass Business School)",
209
+ "Berklee College of Music",
210
+ "Bocconi University (Università Commerciale Luigi Bocconi)",
211
+ "Brandeis University",
212
+ "Bucharest University of Economic Studies (ASE)",
213
+ "Burgundy School of Business (BSB)",
214
+ "Cambridge Judge Business School",
215
+ "CELSA Sorbonne Université",
216
+ "CEMS (The Global Alliance in Management Education)",
217
+ "CentraleSupélec",
218
+ "CFA Dordogne Périgueux",
219
+ "Charité Universitätsmedizin Berlin",
220
+ "Charles University of Prague",
221
+ "CIFFOP (Université Paris 1 Panthéon Sorbonne)",
222
+ "Columbia University",
223
+ "Conservatoire National des Arts et Métiers (Le Cnam)",
224
+ "Conservatoire National Supérieur d'Art Dramatique (CNSAD PSL)",
225
+ "Copenhagen Business School (CBS)",
226
+ "Cornell University",
227
+ "Cours Florent",
228
+ "Coventry University",
229
+ "CY Cergy Paris Université",
230
+ "Dublin City University (DCU)",
231
+ "EAC (École d'Art et de Culture)",
232
+ "EAE Business School",
233
+ "École 42",
234
+ "École des Hautes Études en Sciences Sociales (EHESS)",
235
+ "École Hôtelière de Paris CFA Médéric",
236
+ "École Polytechnique",
237
+ "EDHEC Business School (École des Hautes Études Commerciales du Nord)",
238
+ "EFAP (École des Nouveaux Métiers de la Communication)",
239
+ "EM Strasbourg Business School",
240
+ "EMIC (École de Management des Industries Créatives)",
241
+ "Emlyon Business School (EM Lyon)",
242
+ "Epitech Digital (École pour l'informatique et les nouvelles technologies)",
243
+ "Erasmus University Rotterdam",
244
+ "ESADE Business School (Ramon Llull University)",
245
+ "ESCP Business School (École Supérieure de Commerce de Paris)",
246
+ "ESG Group",
247
+ "ESI Business School",
248
+ "ESIC Business & Marketing School",
249
+ "ESPI (École Supérieure des Professions Immobilières)",
250
+ "ESSCA School of Management (École Supérieure des Sciences Commerciales d'Angers)",
251
+ "ESSEC Business School",
252
+ "ESTICE International Business School",
253
+ "FERRANDI Paris",
254
+ "Fordham University",
255
+ "Frankfurt School of Finance & Management",
256
+ "Freie Universität Berlin",
257
+ "Fu Jen Catholic University",
258
+ "Fudan University",
259
+ "George Brown College",
260
+ "Glion Institute of Higher Education",
261
+ "GOBELINS Paris (l'école de l'image)",
262
+ "Graduate School of Management (GSOM) St. Petersburg University",
263
+ "Grenoble École de Management (GEM)",
264
+ "Hanyang University",
265
+ "Haute École de Joaillerie (HEJ)",
266
+ "HEC Lausanne (Université de Lausanne)",
267
+ "HEC Montréal (École des Hautes Études Commerciales de Montréal)",
268
+ "HEC Paris (École des Hautes Études Commerciales de Paris)",
269
+ "HEIP (Hautes Études Internationales et Politiques)",
270
+ "Hertie School",
271
+ "Hong Kong University of Science and Technology (HKUST)",
272
+ "Humboldt Universität zu Berlin",
273
+ "IAE Aix Marseille Graduate School of Management",
274
+ "IAE Lille University School of Management",
275
+ "IAE Paris Sorbonne Business School",
276
+ "IAE Paris Est (formerly IAE Gustave Eiffel)",
277
+ "IAE Versailles (Université Paris Saclay)",
278
+ "IAE Lyon School of Management",
279
+ "IAE Creteil",
280
+ "ICART (L'école du management de la culture et du marché de l'art)",
281
+ "IE Business School (IE University)",
282
+ "Institut d'études politiques (IEP)",
283
+ "IESEG School of Management (Institut d'Économie Scientifique et de Gestion)",
284
+ "IFP School (Institut Français du Pétrole)",
285
+ "IGS RH (Institut de Gestion Sociale Ressources Humaines)",
286
+ "IIM Digital School (Institut de l'Internet et du Multimédia)",
287
+ "Imperial College Business School",
288
+ "Imperial College London",
289
+ "IMPGT Aix Marseille Université (Institut de Management Public et Gouvernance Territoriale)",
290
+ "IMT Atlantique (École Nationale Supérieure des Mines de Nantes et Télécom Bretagne)",
291
+ "INSEAD (Institut Européen d'Administration des Affaires)",
292
+ "INSEEC (Institut des Hautes Études Économiques et Commerciales)",
293
+ "Institut Catholique de Paris (ICP)",
294
+ "Institut Français de la Mode (IFM)",
295
+ "Institut Léonard de Vinci (ILV)",
296
+ "Institut Lyfe (formerly Institut Paul Bocuse)",
297
+ "Institut Magellan",
298
+ "Institut Polytechnique de Paris (IP Paris)",
299
+ "INSTN (Institut National des Sciences et Techniques Nucléaires)",
300
+ "International University of Monaco (IUM)",
301
+ "IQS School of Management (Universitat Ramon Llull)",
302
+ "IRIS Sup' (Institut de Relations Internationales et Stratégiques)",
303
+ "ISAE SUPAERO (Institut Supérieur de l'Aéronautique et de l'Espace)",
304
+ "ISC Paris Business School (Institut Supérieur du Commerce de Paris)",
305
+ "ISCOM (Institut Supérieur de Communication et Publicité)",
306
+ "ISIPCA (Institut Supérieur International du Parfum de la Cosmétique et de l'Aromatique alimentaire)",
307
+ "IULM University (Libera Università di Lingue e Comunicazione)",
308
+ "Ivey Business School (Western University)",
309
+ "Kedge Business School",
310
+ "King's College London",
311
+ "KU Leuven",
312
+ "Les Roches Global Hospitality Education",
313
+ "London Business School (LBS)",
314
+ "London College of Fashion (University of the Arts London)",
315
+ "London Film School",
316
+ "London School of Economics and Political Science (LSE)",
317
+ "Loughborough University London",
318
+ "Loyola Marymount University",
319
+ "LUISS Guido Carli (Libera Università Internazionale degli Studi Sociali)",
320
+ "Lund University",
321
+ "MBA ESG",
322
+ "MGIMO University (Moscow State Institute of International Relations)",
323
+ "Monash University",
324
+ "National University of Singapore (NUS)",
325
+ "NEOMA Business School",
326
+ "New York University (NYU)",
327
+ "Newcastle University",
328
+ "Northumbria University",
329
+ "Nova School of Business and Economics",
330
+ "Paris Executive Business School (PEBS)",
331
+ "Paris School of Business (PSB)",
332
+ "Penninghen (École de direction artistique et d'architecture intérieure)",
333
+ "Plekhanov Russian University of Economics",
334
+ "POLIMI Graduate School of Management",
335
+ "Polytech Nice Sophia (Université Côte d'Azur)",
336
+ "Prague University of Economics and Business (VSE)",
337
+ "Queen's University",
338
+ "Renmin University of China",
339
+ "Rotterdam School of Management Erasmus University (RSM)",
340
+ "Royal College of Art (RCA)",
341
+ "Ruhr University Bochum",
342
+ "School of International and Public Affairs (SIPA) at Columbia University",
343
+ "Sciences Po (Institut d'études politiques de Paris)",
344
+ "Sciences Po Grenoble UGA",
345
+ "SDA Bocconi School of Management",
346
+ "Seoul National University",
347
+ "Shanghai Jiao Tong University",
348
+ "Singapore Management University (SMU)",
349
+ "SKEMA Business School (School of Knowledge Economy and Management)",
350
+ "Smith School of Business at Queen's University",
351
+ "Sophia University",
352
+ "Sorbonne Paris Nord University",
353
+ "Sorbonne Université",
354
+ "Sotheby's Institute of Art",
355
+ "Stockholm School of Economics (SSE)",
356
+ "Sup de Luxe (Institut Supérieur de Marketing du Luxe)",
357
+ "TBS Education (Toulouse Business School)",
358
+ "Technische Universität Berlin (TU Berlin)",
359
+ "Télécom Paris (École Nationale Supérieure des Télécommunications)",
360
+ "The University of Edinburgh",
361
+ "The University of Melbourne",
362
+ "The University of Sydney (USYD)",
363
+ "Trinity College Dublin",
364
+ "Tsinghua University",
365
+ "UCD Michael Smurfit Graduate Business School",
366
+ "Universidad Carlos III de Madrid (UC3M)",
367
+ "Universidad Europea de Madrid (UEM)",
368
+ "Universidade Católica Portuguesa",
369
+ "Université Côte d'Azur",
370
+ "Université de Strasbourg",
371
+ "Université Gustave Eiffel",
372
+ "Université Jean Moulin Lyon 3",
373
+ "Université Laval",
374
+ "Université Libre de Bruxelles (ULB)",
375
+ "Université Paris 1 Panthéon Sorbonne",
376
+ "Université Paris 8 Vincennes Saint Denis",
377
+ "Université Paris Cité",
378
+ "Université Paris Dauphine PSL",
379
+ "Université Paris Nanterre",
380
+ "Université Paris Panthéon Assas",
381
+ "Université Paris Saclay",
382
+ "Université Sorbonne Nouvelle (Paris 3)",
383
+ "Université Toulouse Capitole",
384
+ "University College Dublin (UCD)",
385
+ "University College London (UCL)",
386
+ "Université de Nice",
387
+ "University of Amsterdam (UvA)",
388
+ "University of California Los Angeles (UCLA)",
389
+ "University of Delaware",
390
+ "University of Freiburg (Albert Ludwigs Universität Freiburg)",
391
+ "University of Fribourg",
392
+ "University of Glasgow",
393
+ "University of Groningen",
394
+ "University of London",
395
+ "University of Mannheim",
396
+ "University of Pennsylvania (UPenn)",
397
+ "University of Rochester",
398
+ "University of Seoul",
399
+ "University of South Carolina",
400
+ "University of Southern California (USC)",
401
+ "University of St. Gallen (HSG)",
402
+ "University of the Arts London (UAL)",
403
+ "University of Trento",
404
+ "University of Warwick",
405
+ "Warwick Business School (WBS)",
406
+ "Washington University in St. Louis",
407
+ "WHU Otto Beisheim School of Management",
408
+ "WU (Vienna University of Economics and Business)",
409
+ "Xi'an Jiaotong Liverpool University (XJTLU)",
410
+ "Zhejiang A&F University (ZAFU)"
411
+ ],
412
+ "Continent": [
413
+ "Africa",
414
+ "America",
415
+ "Asia",
416
+ "Australia",
417
+ "Europe",
418
+ "Middle East",
419
+ "North America",
420
+ "Oceania",
421
+ "South America"
422
+ ],
423
+ "City": [
424
+ "Abu Dhabi",
425
+ "Aix-en-Provence",
426
+ "Amsterdam",
427
+ "Barcelona",
428
+ "Beijing",
429
+ "Belo Horizonte",
430
+ "Berlin",
431
+ "Bochum",
432
+ "Bordeaux",
433
+ "Boulogne-Billancourt",
434
+ "Brussels",
435
+ "Bucharest",
436
+ "Bulle",
437
+ "Cambridge",
438
+ "Carcavelos",
439
+ "Casablanca",
440
+ "Cergy",
441
+ "Cergy-Pontoise",
442
+ "Clayton",
443
+ "Copenhagen",
444
+ "Courbevoie",
445
+ "Coventry",
446
+ "Creteil",
447
+ "Dublin",
448
+ "Ecully",
449
+ "Edinburgh",
450
+ "Fontainebleau",
451
+ "Frankfurt",
452
+ "Fribourg",
453
+ "Gif-sur-Yvette",
454
+ "Glasgow",
455
+ "Grenoble",
456
+ "Groningen",
457
+ "Ho Chi Minh",
458
+ "Hong Kong",
459
+ "Jouy-en-Josas",
460
+ "Kingston",
461
+ "Lancaster",
462
+ "Las Vegas",
463
+ "Lausanne",
464
+ "Leuven",
465
+ "Lille",
466
+ "Lisbon",
467
+ "London",
468
+ "Los Angeles",
469
+ "Lund",
470
+ "Lyon",
471
+ "Madrid",
472
+ "Malaga",
473
+ "Manchester",
474
+ "Marne-la-Vallée",
475
+ "Marseille",
476
+ "Melbourne",
477
+ "Milan",
478
+ "Monaco",
479
+ "Montpellier",
480
+ "Montreal",
481
+ "Moscow",
482
+ "Munich",
483
+ "Nantes",
484
+ "New York",
485
+ "Newark",
486
+ "Newcastle",
487
+ "Newcastle upon Tyne",
488
+ "Nice",
489
+ "North Carolina",
490
+ "Oulu",
491
+ "Palaiseau",
492
+ "Paris",
493
+ "Périgueux",
494
+ "Philadelphia",
495
+ "Prague",
496
+ "Puteaux",
497
+ "Quebec",
498
+ "Rabat",
499
+ "Reims",
500
+ "Rochester",
501
+ "Rome",
502
+ "Rotterdam",
503
+ "Roubaix",
504
+ "Rouen",
505
+ "Saint Aubin",
506
+ "Saint Denis",
507
+ "Saint-Petersbourg",
508
+ "Seoul",
509
+ "Serris",
510
+ "Shanghai",
511
+ "Shenzhen",
512
+ "Singapore",
513
+ "Sophia Antipolis",
514
+ "St Gallen",
515
+ "Stockholm",
516
+ "Strasbourg",
517
+ "Suzhou",
518
+ "Sydney",
519
+ "Taipei",
520
+ "Tokyo",
521
+ "Toronto",
522
+ "Toulouse",
523
+ "Tunis",
524
+ "Turin",
525
+ "Valencia",
526
+ "Vallendar",
527
+ "Velizy",
528
+ "Vienne",
529
+ "Villetaneuse",
530
+ "Waltham",
531
+ "Warsaw",
532
+ "Washington",
533
+ "Wellesley",
534
+ "Zhejiang"
535
+ ],
536
+ "Level": {
537
+ "Master": "Master 2 (BAC+5) / Master Degree",
538
+ "Specialised Master": "Mastère Spécialisé/Advanced Master",
539
+ "Mastere": "Mastère Spécialisé/Advanced Master",
540
+ "MBA": "MBA",
541
+ "Doctorat": "Doctorat / PhD",
542
+ "Preparationdunconcours": "CAP",
543
+ "Maitrise": "Master 2 (BAC+5) / Master Degree",
544
+ "Licence": "Licence (BAC+3) / pengraduate",
545
+ "Chineselanguage": "Autre",
546
+ "Other kind of studies": "Autre",
547
+ "Maîtrise": "Master 1 (BAC+4) / Graduate",
548
+ "Preparation for a competitive examination": "Preparation for a competitive exam",
549
+ "L3": "Licence (BAC+3) / pengraduate",
550
+ "Bachelor": "Licence (BAC+3) / pengraduate"
551
+ },
552
+ "Language": {
553
+ "French": "French",
554
+ "Français": "French",
555
+ "Francais": "French",
556
+ "FR": "French",
557
+ "English": "English",
558
+ "Anglais": "English",
559
+ "EN": "English",
560
+ "German": "German",
561
+ "Deutsch": "German",
562
+ "Allemand": "German",
563
+ "DE": "German",
564
+ "Spanish": "Spanish",
565
+ "Español": "Spanish",
566
+ "Espagnol": "Spanish",
567
+ "ES": "Spanish",
568
+ "Chinese": "Chinese",
569
+ "Mandarin": "Chinese",
570
+ "Cantonese": "Chinese",
571
+ "Chinois": "Chinese",
572
+ "CN": "Chinese",
573
+ "Italian": "Italian",
574
+ "Italiano": "Italian",
575
+ "Italien": "Italian",
576
+ "IT": "Italian",
577
+ "Portuguese": "Portuguese",
578
+ "Português": "Portuguese",
579
+ "Portugues": "Portuguese",
580
+ "Portugais": "Portuguese",
581
+ "PT": "Portuguese",
582
+ "Japanese": "Japanese",
583
+ "Nihongo": "Japanese",
584
+ "Japonais": "Japanese",
585
+ "JP": "Japanese",
586
+ "Korean": "Korean",
587
+ "Hangul": "Korean",
588
+ "Coréen": "Korean",
589
+ "Coreen": "Korean",
590
+ "KR": "Korean",
591
+ "Arabic": "Arabic",
592
+ "Arabe": "Arabic",
593
+ "AR": "Arabic",
594
+ "Russian": "Russian",
595
+ "Russe": "Russian",
596
+ "RU": "Russian",
597
+ "Dutch": "Dutch",
598
+ "Nederlands": "Dutch",
599
+ "Néerlandais": "Dutch",
600
+ "Neerlandais": "Dutch",
601
+ "NL": "Dutch",
602
+ "Hindi": "Hindi",
603
+ "Indien": "Hindi"
604
+ },
605
+ "Tags": [
606
+ "Accounting",
607
+ "Audit & Control",
608
+ "Architecture & Urban Planning",
609
+ "Arts & Cultural Management",
610
+ "Banking & Insurance",
611
+ "Business Administration & Management",
612
+ "Communication & Media",
613
+ "Consulting & Professional Services",
614
+ "Cybersecurity",
615
+ "Data Science & Analytics",
616
+ "Design Management",
617
+ "Digital Marketing",
618
+ "Digital Transformation & Tech Management",
619
+ "Economics",
620
+ "Economics & Social Sciences",
621
+ "Energy Management",
622
+ "Engineering Management",
623
+ "Entrepreneurship & Innovation",
624
+ "Finance & Corporate Finance",
625
+ "Healthcare Management",
626
+ "Hospitality & Culinary Management",
627
+ "Human Resources (HR)",
628
+ "Humanities & Social Sciences",
629
+ "Information Systems & IT Management",
630
+ "International Business",
631
+ "International Security & Defense",
632
+ "Law",
633
+ "Luxury Management",
634
+ "Marketing",
635
+ "Marketing Research & Insights",
636
+ "NGO & Social Impact",
637
+ "Organizational Psychology",
638
+ "Political Science & International Relations",
639
+ "Public & Organizational Management",
640
+ "Real Estate & Asset Management",
641
+ "Real Estate & Urban Planning",
642
+ "Sales & Business Development",
643
+ "Sports Management",
644
+ "Strategy & Strategic Management",
645
+ "Supply Chain & Purchasing",
646
+ "Sustainability & Environmental Management",
647
+ "Tourism & Leisure",
648
+ "Video Games & Creative Tech"
649
+ ]
650
+ }
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ openpyxl
3
+ sentence_transformers
4
+ torch
5
+ tqdm
6
+ requests
7
+ python-dotenv
8
+ tenacity
9
+ flask
10
+ gunicorn
11
+ huggingface_hub
src/config.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ from dotenv import load_dotenv
4
+
5
# Load environment variables (python-dotenv; values typically come from a local .env file)
load_dotenv()

# --- ENVIRONMENT VARIABLES to be set up in .env ---
# GROQ_API_KEY is None when the variable is not set; callers must handle that.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Raw comma-separated model list; defaults to an empty string when GROQ_MODEL is unset.
RAW_MODELS = os.getenv("GROQ_MODEL", "")

# Parse models cleanly into a list (whitespace trimmed, empty entries dropped)
AVAILABLE_MODELS = [m.strip() for m in RAW_MODELS.split(",") if m.strip()]

# --- PIPELINE THRESHOLDS & CONSTANTS ---
# Minimum cosine similarity for two degree labels to be auto-merged.
DEFAULT_SIMILARITY_THRESHOLD = 0.93
# Maximum number of reference candidates handed to the LLM judge.
TOP_K_CANDIDATES = 5

# Default sheet names and the directories bare file names are resolved against.
SHEET_NAME = "Data"
DATA_DIR = "data"
REFDATA_DIR = "refdata"
DEFAULT_OUTPUT_SHEET_NAME = "Cleaned_Data"

# Default file names used by the CLI when the matching option is omitted.
DEFAULT_BLUEPRINT_FILE = "Blueprint.xlsx"
DEFAULT_OFFICIAL_REFS_FILE = "official_references.json"
DEFAULT_MANUAL_REFS_FILE = "manual_references.json"
27
+
28
def resolve_data_path(file_arg):
    """Map a bare file name onto the project data directory.

    Falsy values, absolute paths, and paths that already contain a
    directory component are returned unchanged; any other value is joined
    onto ``DATA_DIR``.
    """
    is_bare_name = (
        bool(file_arg)
        and not os.path.isabs(file_arg)
        and not os.path.dirname(file_arg)
    )
    return os.path.join(DATA_DIR, file_arg) if is_bare_name else file_arg
35
+
36
def resolve_ref_path(file_arg):
    """Map a bare reference file name onto the project refdata directory.

    Only a non-empty name with no directory part and no absolute prefix is
    rewritten (joined onto ``REFDATA_DIR``); everything else is passed
    through untouched.
    """
    already_located = bool(file_arg) and bool(
        os.path.isabs(file_arg) or os.path.dirname(file_arg)
    )
    if file_arg and not already_located:
        return os.path.join(REFDATA_DIR, file_arg)
    return file_arg
43
+
44
def parse_cli_args():
    """Parse the pipeline's command line options and resolve file locations.

    ``--input`` is mandatory; every other option falls back to a module
    default. After parsing, the input/blueprint names are resolved with
    ``resolve_data_path`` and the two reference files with
    ``resolve_ref_path``.

    Returns:
        The ``argparse.Namespace`` with the four path attributes rewritten.
    """
    parser = argparse.ArgumentParser(description="MasterMap Data Normalization Pipeline")
    parser.add_argument("--input", required=True, help="Raw input Excel file name inside data/")

    # (flag, default, help) triples, registered in the same order as before
    # so the generated --help output does not change.
    optional_flags = (
        ("--blueprint", DEFAULT_BLUEPRINT_FILE, "Blueprint Excel file name inside data/"),
        ("--refs", DEFAULT_OFFICIAL_REFS_FILE, "Official references JSON file name inside refdata/"),
        ("--manual_refs", DEFAULT_MANUAL_REFS_FILE, "Manual overrides JSON file name inside refdata/"),
        ("--sheet", SHEET_NAME, "Source sheet name to clean"),
        ("--output_sheet", DEFAULT_OUTPUT_SHEET_NAME, "Output sheet name to create"),
        ("--models", "", "Comma-separated Groq models to use in fallback order"),
    )
    for flag, fallback, description in optional_flags:
        parser.add_argument(flag, default=fallback, help=description)

    args = parser.parse_args()

    for attr in ("input", "blueprint"):
        setattr(args, attr, resolve_data_path(getattr(args, attr)))
    for attr in ("refs", "manual_refs"):
        setattr(args, attr, resolve_ref_path(getattr(args, attr)))
    return args
src/data_pipeline.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ import re
4
+ from collections import Counter
5
+ from sentence_transformers import util
6
+ from tqdm import tqdm
7
+
8
+ # Import our pure text manipulation functions
9
+ from src.utils import (
10
+ clean_degree_text,
11
+ normalize_text,
12
+ strip_degrees_for_search,
13
+ smart_format
14
+ )
15
+ from src.config import TOP_K_CANDIDATES, DEFAULT_SIMILARITY_THRESHOLD
16
+ # ---------------------------------------------------------------------------
17
+ # ML & CLUSTERING ENGINE
18
+ # ---------------------------------------------------------------------------
19
+
20
def self_cluster_degrees(raw_degrees_list, model, school_cache, threshold=0.93):
    """Cluster near-duplicate degree labels belonging to one institution.

    Every string in ``raw_degrees_list`` is cleaned with
    ``clean_degree_text``; the distinct cleaned labels are ordered from most
    to least frequent and embedded with ``model.encode``. Each label that has
    not been claimed yet becomes an anchor, and every later label whose
    cosine similarity to the anchor is at least ``threshold`` is merged into
    it, unless ``school_cache`` already holds a decision for that pair.

    Args:
        raw_degrees_list: Raw degree values; non-string entries are ignored.
        model: Object exposing ``encode(list, convert_to_tensor=True)``.
        school_cache: Mutable dict of earlier pair decisions, keyed by
            ``"<smaller>|||<larger>"``. New merge decisions are written to it.
        threshold: Minimum cosine similarity required to merge two labels.

    Returns:
        dict mapping each raw string to ``(final_label, source, confidence)``.
    """
    # Clean each distinct raw string once; the per-occurrence list is still
    # needed so that frequency decides which label anchors a cluster.
    raw_to_clean = {}
    cleaned_list = []
    for raw in raw_degrees_list:
        if not isinstance(raw, str):
            continue
        if raw not in raw_to_clean:
            raw_to_clean[raw] = clean_degree_text(raw)
        cleaned_list.append(raw_to_clean[raw])

    unique_cleans = [deg for deg, _ in Counter(cleaned_list).most_common() if deg]

    # Nothing to compare against: every label stands for itself.
    if len(unique_cleans) <= 1:
        return {
            raw: (clean, "Degree_Formatter", "HIGH")
            for raw, clean in raw_to_clean.items()
        }

    embeddings = model.encode(unique_cleans, convert_to_tensor=True)
    clean_to_clustered = {}
    merge_info = {}  # merged label -> similarity / origin text for the Blueprint

    for i, current_deg in enumerate(unique_cleans):
        if current_deg in clean_to_clustered:
            continue
        clean_to_clustered[current_deg] = current_deg

        if i + 1 >= len(unique_cleans):
            continue

        cos_scores = util.cos_sim(embeddings[i], embeddings[i + 1:])[0]
        for j, score in enumerate(cos_scores):
            target_deg = unique_cleans[i + 1 + j]
            if score.item() >= threshold and target_deg not in clean_to_clustered:
                pair_key = f"{min(current_deg, target_deg)}|||{max(current_deg, target_deg)}"

                # school_cache acts as a runtime memo of earlier decisions
                # for this pair of labels.
                cached_action = school_cache.get(pair_key)

                if cached_action:
                    if cached_action == current_deg:
                        clean_to_clustered[target_deg] = current_deg
                        merge_info[target_deg] = "Cached (Runtime)"
                    elif cached_action == target_deg:
                        # Reverse merge: the anchor is the label that gets
                        # absorbed, so the note must be keyed by the anchor
                        # for the lookup below to find it.
                        clean_to_clustered[current_deg] = target_deg
                        merge_info[current_deg] = "Cached (Runtime)"
                else:
                    clean_to_clustered[target_deg] = current_deg
                    school_cache[pair_key] = current_deg
                    merge_info[target_deg] = f"{score.item()*100:.1f}%"

    raw_to_meta = {}
    for raw, clean in raw_to_clean.items():
        final_val = clean
        was_merged = False
        sim_str = ""

        # Follow the merge chain to its root, remembering the first note seen.
        while final_val in clean_to_clustered and clean_to_clustered[final_val] != final_val:
            if not sim_str:
                sim_str = merge_info.get(final_val, "")
            final_val = clean_to_clustered[final_val]
            was_merged = True

        if was_merged:
            conf = "MEDIUM"
            src = f"Auto-Merge ({sim_str})" if sim_str else "Auto-Merge"
        else:
            conf = "HIGH"
            src = "Degree_Formatter"

        raw_to_meta[raw] = (final_val, src, conf)

    return raw_to_meta
83
+
84
+
85
def cluster_degrees_by_institution(df, degree_col, inst_col, model, master_cache, blueprint_data, threshold=0.93):
    """Cluster degree labels school by school and write the result to ``df``.

    A ``Cleaned_<degree_col>`` column is added as a copy of ``degree_col``.
    For every non-null institution the non-null degree values are clustered
    with ``self_cluster_degrees``, using that school's entry under
    ``master_cache["Degree_Decisions"]`` as the decision cache. Each row is
    then updated with its clustered label, and any row whose label changed or
    whose confidence is not ``"HIGH"`` is appended to ``blueprint_data``.

    Returns:
        The same DataFrame, modified in place.
    """
    print(f"\n[INFO] Auto-Clustering '{degree_col}'. (Merges will be logged to Blueprint...)")
    cleaned_col_name = f'Cleaned_{degree_col}'
    df[cleaned_col_name] = df[degree_col].copy()

    decisions = master_cache.setdefault("Degree_Decisions", {})
    has_degree = df[degree_col].notna()
    school_mappings = {}

    # 1. Embedding-based clustering, one institution at a time.
    for school in tqdm(df[inst_col].dropna().unique(), desc=f"Mapping {degree_col}s by Institution"):
        rows_for_school = (df[inst_col] == school) & has_degree
        raw_degs = df.loc[rows_for_school, degree_col].astype(str).tolist()
        if raw_degs:
            school_mappings[school] = self_cluster_degrees(
                raw_degs, model, decisions.setdefault(school, {}), threshold
            )

    # 2. Write the cleaned labels back and record notable changes.
    for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Applying & Logging {degree_col}s"):
        school = row[inst_col]
        degree_cell = row[degree_col]
        if pd.isna(degree_cell) or school not in school_mappings:
            continue

        raw_deg = str(degree_cell)
        mapping_data = school_mappings[school].get(raw_deg)
        if not mapping_data:
            continue

        final_val, src, conf = mapping_data
        df.at[idx, cleaned_col_name] = final_val

        # Rows left untouched with HIGH confidence are not worth logging.
        untouched = raw_deg.strip() == final_val.strip() and conf == "HIGH"
        if untouched:
            continue

        blueprint_data.append({
            "Row_Index": idx + 3,
            "Column": degree_col,
            "Original_Raw_Text": raw_deg,
            "AI_Suggested_Match": final_val,
            "Human_Override": "",
            "Confidence": conf,
            "Match_Source": src
        })
    return df
128
+
129
+
130
def get_deterministic_match(value, combined_valid_targets):
    """Find a reference target that ``value`` matches by simple text rules.

    Two passes are made over ``combined_valid_targets``, both on
    ``normalize_text`` output:

    1. prefix rule: the normalized target starts with the normalized value,
       followed by a word boundary or one of whitespace, ``(``, ``/``, ``\\``
       or ``-``;
    2. acronym rule: the normalized target contains the value wrapped in
       parentheses (compared in upper case).

    Args:
        value: Raw text to look up.
        combined_valid_targets: Iterable of candidate reference strings.

    Returns:
        The first matching target in its original form, or ``None``.
    """
    val_clean = normalize_text(value)
    if not val_clean:
        # An empty needle would turn the prefix pattern into "^(\b|...)",
        # which matches nearly every target and yields a spurious hit.
        return None

    # Built once instead of once per target.
    prefix_pattern = re.compile(rf"^{re.escape(val_clean)}(\b|[\s\(\/\\\-])")

    normalized_targets = []
    for target in combined_valid_targets:
        target_clean = normalize_text(target)
        if prefix_pattern.search(target_clean):
            return target
        # Keep the normalized form so the second pass does not redo the work.
        normalized_targets.append((target, target_clean))

    acronym = f"({val_clean.upper()})"
    for target, target_clean in normalized_targets:
        if acronym in target_clean.upper():
            return target
    return None
138
+
139
+
140
def get_top_candidates(model, value, combined_valid_targets, reference_embeddings, k=5):
    """Return up to ``k`` targets ranked by embedding similarity to ``value``.

    ``value`` is encoded with ``model.encode`` and compared against
    ``reference_embeddings`` by cosine similarity; the best-scoring entries
    of ``combined_valid_targets`` are returned, highest score first. An empty
    (or falsy) target collection gives an empty list without encoding.
    """
    if not combined_valid_targets:
        return []

    query_embedding = model.encode(value, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, reference_embeddings)[0]

    # topk cannot be asked for more entries than exist.
    top_count = min(k, len(combined_valid_targets))
    _, best_positions = torch.topk(similarities, top_count)
    return [combined_valid_targets[pos] for pos in best_positions.tolist()]
147
+
148
def get_dict_exact_match(value, combined_dict):
    """Look ``value`` up in an alias -> canonical mapping by normalized text.

    Aliases (keys) are checked first, then canonical names (values); the
    comparison uses ``normalize_text`` on both sides.

    Returns:
        The canonical name of the first hit, or ``None`` when nothing matches.
    """
    wanted = normalize_text(value)

    for alias in combined_dict:
        if normalize_text(alias) == wanted:
            return combined_dict[alias]

    return next(
        (canonical for canonical in combined_dict.values()
         if normalize_text(canonical) == wanted),
        None,
    )
160
+
161
def get_dict_rule_match(value, combined_dict):
    """Apply the deterministic matching rules to an alias -> canonical mapping.

    ``get_deterministic_match`` is tried against the aliases first (a hit is
    translated to its canonical name) and then against the de-duplicated
    canonical names themselves.

    Returns:
        The canonical name that matched, or ``None``.
    """
    aliases = [*combined_dict]
    # dict.fromkeys removes repeated canonicals while keeping first-seen order.
    canonical_values = [*dict.fromkeys(combined_dict.values())]

    alias_hit = get_deterministic_match(value, aliases)
    if alias_hit:
        return combined_dict[alias_hit]

    return get_deterministic_match(value, canonical_values) or None
174
+
175
def as_reference_list(ref_data):
    """Coerce reference data into a list of targets.

    A list is handed back as-is (same object), a dict contributes its values
    with duplicates removed in first-seen order, and any other type yields an
    empty list.
    """
    if isinstance(ref_data, dict):
        return list(dict.fromkeys(ref_data.values()))
    return ref_data if isinstance(ref_data, list) else []
181
+
182
def as_reference_dict(ref_data):
    """Coerce reference data into an alias -> canonical mapping.

    A dict is handed back as-is (same object), a list becomes an identity
    mapping over its string entries (non-strings are dropped), and any other
    type yields an empty dict.
    """
    if isinstance(ref_data, dict):
        return ref_data
    if not isinstance(ref_data, list):
        return {}

    identity_map = {}
    for entry in ref_data:
        if isinstance(entry, str):
            identity_map[entry] = entry
    return identity_map
188
+
189
def update_match_postfix(progress, source_counts):
    """Show the running match counters on a progress bar.

    Reads the ``Exact_Match``, ``Rule_Match`` and ``LLM_Judged`` entries of
    ``source_counts`` and passes them to ``progress.set_postfix`` with
    ``refresh=False``.
    """
    tracked_sources = ("Exact_Match", "Rule_Match", "LLM_Judged")
    postfix = {label: source_counts[label] for label in tracked_sources}
    progress.set_postfix(postfix, refresh=False)
195
+
196
+
197
def match_cache_key(column_name, value):
    """Build the cache key shared by matching and reconstruction.

    For the ``Institution`` and ``Degree`` columns the value first goes
    through ``strip_degrees_for_search``. The result is converted to ``str``,
    trailing periods are removed, and ``normalize_text`` produces the key.
    """
    needs_degree_stripping = column_name in ("Institution", "Degree")
    searchable = strip_degrees_for_search(value) if needs_degree_stripping else value
    without_trailing_dots = str(searchable).rstrip(".")
    return normalize_text(without_trailing_dots)
202
+
203
+
204
def append_unique_cleaned_part(cleaned_parts, value):
    """Append the comma-separated pieces of ``value`` not already present.

    Pieces are compared through ``normalize_text``; entries already in
    ``cleaned_parts`` are themselves split on commas before comparison.
    ``cleaned_parts`` is extended in place with the stripped new pieces.

    Returns:
        ``True`` when at least one piece was appended, otherwise ``False``.
    """
    existing_keys = (
        normalize_text(piece.strip())
        for entry in cleaned_parts
        for piece in str(entry).split(",")
    )
    # Empty keys are not remembered, matching how existing entries were indexed.
    seen = {key for key in existing_keys if key}

    size_before = len(cleaned_parts)
    for piece in str(value).split(","):
        candidate = piece.strip()
        if not candidate:
            continue

        key = normalize_text(candidate)
        if key not in seen:
            seen.add(key)
            cleaned_parts.append(candidate)

    return len(cleaned_parts) > size_before
227
+
228
+
229
+ # ---------------------------------------------------------------------------
230
+ # CORE DATA PIPELINE
231
+ # ---------------------------------------------------------------------------
232
+
233
+ def process_column(df, column_name, model, groq_router, official_refs, manual_refs, master_cache, split_pattern, blueprint_data):
234
+ if column_name not in df.columns: return df
235
+
236
+ core_data = official_refs.get(column_name, [])
237
+ added_data = manual_refs.get(column_name, [])
238
+ if column_name not in master_cache: master_cache[column_name] = {}
239
+
240
+ detailed_cache = {}
241
+ is_dict_mode = isinstance(core_data, dict)
242
+
243
+ def get_updated_embeddings():
244
+ if is_dict_mode:
245
+ c_dict = {**as_reference_dict(core_data), **as_reference_dict(added_data)}
246
+ c_keys = list(c_dict.keys())
247
+ u_vals = list(set(c_dict.values()))
248
+ k_emb = model.encode(c_keys, convert_to_tensor=True) if c_keys else None
249
+ v_emb = model.encode(u_vals, convert_to_tensor=True) if u_vals else None
250
+ return c_dict, c_keys, k_emb, u_vals, v_emb
251
+ else:
252
+ comb = as_reference_list(core_data) + as_reference_list(added_data)
253
+ comb = list(dict.fromkeys(item for item in comb if isinstance(item, str) and item.strip()))
254
+ emb = model.encode(comb, convert_to_tensor=True) if comb else None
255
+ return None, comb, emb, None, None
256
+
257
+ combined_dict, combined_valid_targets, reference_embeddings, unique_values, value_embeddings = get_updated_embeddings()
258
+
259
+ if is_dict_mode and not combined_dict:
260
+ raise ValueError(f"No dictionary references loaded for '{column_name}'. Refusing to call Groq for every value.")
261
+ if not is_dict_mode and not combined_valid_targets:
262
+ raise ValueError(f"No list references loaded for '{column_name}'. Refusing to call Groq for every value.")
263
+
264
+ uniques = set()
265
+ for cell in df[column_name].dropna():
266
+ for p in re.split(split_pattern, str(cell), flags=re.IGNORECASE):
267
+ if p.strip(): uniques.add(p.strip())
268
+
269
+ print(f"\n[INFO] Analyzing {len(uniques)} unique entities in '{column_name}'...")
270
+ source_counts = Counter()
271
+
272
+ progress = tqdm(sorted(uniques, key=normalize_text), desc=f"Cleaning {column_name}")
273
+ for word in progress:
274
+ word_clean = match_cache_key(column_name, word)
275
+
276
+ # 1. Check Memory Cache
277
+ if word_clean in master_cache[column_name]:
278
+ detailed_cache[word_clean] = {"val": master_cache[column_name][word_clean], "src": "Memory_Cache", "conf": "HIGH"}
279
+ source_counts["Memory_Cache"] += 1
280
+ update_match_postfix(progress, source_counts)
281
+ continue
282
+
283
+ # 2. Check Exact Targets
284
+ if is_dict_mode:
285
+ exact = get_dict_exact_match(word, combined_dict)
286
+ else:
287
+ exact = next((k for k in combined_valid_targets if normalize_text(k) == normalize_text(word_clean)), None) if combined_valid_targets else None
288
+
289
+ if exact:
290
+ val = exact
291
+ detailed_cache[word_clean] = {"val": val, "src": "Exact_Match", "conf": "HIGH"}
292
+ source_counts["Exact_Match"] += 1
293
+ update_match_postfix(progress, source_counts)
294
+ continue
295
+
296
+ # 3. Deterministic / Rule Match
297
+ if is_dict_mode:
298
+ suggested_match = get_dict_rule_match(word, combined_dict)
299
+ else:
300
+ suggested_match = get_deterministic_match(word, combined_valid_targets) if combined_valid_targets else None
301
+
302
+ if suggested_match:
303
+ detailed_cache[word_clean] = {"val": suggested_match, "src": "Rule_Match", "conf": "HIGH"}
304
+ source_counts["Rule_Match"] += 1
305
+ update_match_postfix(progress, source_counts)
306
+ continue
307
+
308
+ # 4. LLM API Match
309
+ candidates = []
310
+ if is_dict_mode:
311
+ cand_keys = get_top_candidates(model, word, combined_valid_targets, reference_embeddings)
312
+ cand_vals = get_top_candidates(model, word, unique_values, value_embeddings)
313
+ candidates = list(dict.fromkeys(cand_keys + cand_vals))[:TOP_K_CANDIDATES]
314
+ else:
315
+ candidates = get_top_candidates(model, word, combined_valid_targets, reference_embeddings)
316
+
317
+ # Call the router instance
318
+ ans_val, src, conf = groq_router.ask_judge(word, candidates, column_name)
319
+ source_counts[src] += 1
320
+ update_match_postfix(progress, source_counts)
321
+
322
+ # Process every valid string, regardless of confidence (skip if API crashed)
323
+ if "API_Error" not in conf and ans_val != "UNKNOWN" and ans_val != "LLM_Failed":
324
+ llm_parts = [p.strip() for p in ans_val.split(",")]
325
+ corrected_parts = []
326
+ all_matched = True # Flag to track if every piece exists in our data
327
+
328
+ for part in llm_parts:
329
+ if is_dict_mode:
330
+ exact_match = get_dict_exact_match(part, combined_dict)
331
+ if exact_match:
332
+ corrected_parts.append(exact_match)
333
+ else:
334
+ rule_match = get_dict_rule_match(part, combined_dict)
335
+ if rule_match:
336
+ corrected_parts.append(rule_match)
337
+ else:
338
+ corrected_parts.append(part)
339
+ all_matched = False
340
+ else:
341
+ # 1. Exact Match Check (Case-insensitive)
342
+ exact_match = next((c for c in candidates if c.lower() == part.lower()), None)
343
+ if exact_match:
344
+ corrected_parts.append(exact_match)
345
+ else:
346
+ # 2. Rule-Based Match Check
347
+ rule_match = get_deterministic_match(part, candidates)
348
+ if rule_match:
349
+ corrected_parts.append(rule_match)
350
+ else:
351
+ # 3. No match in dictionary. Keep LLM's version, but flag that we couldn't verify it.
352
+ corrected_parts.append(part)
353
+ all_matched = False
354
+
355
+ # Remove duplicates while preserving the exact order
356
+ unique_parts = list(dict.fromkeys(corrected_parts))
357
+
358
+ # Glue it back together
359
+ ans_val = ", ".join(unique_parts)
360
+
361
+ raw_parts_for_check = [
362
+ p.strip()
363
+ for p in re.split(split_pattern, str(word))
364
+ if p.strip()
365
+ ] or [word]
366
+ raw_lookup_keys = {normalize_text(part) for part in raw_parts_for_check}
367
+ verified_lookup_keys = {normalize_text(part) for part in unique_parts}
368
+
369
+ # Only upgrade when the LLM answer was verified against the refs and
370
+ # also matches the original text directly. Otherwise it is still an
371
+ # LLM judgment, even if the chosen answer exists in the references.
372
+ if all_matched and verified_lookup_keys.issubset(raw_lookup_keys):
373
+ conf = "HIGH"
374
+ src = "LLM_Rule_Verified"
375
+ # -----------------------------------------------------
376
+
377
+ detailed_cache[word_clean] = {"val": ans_val, "src": src, "conf": conf}
378
+
379
+ # Reconstruct cells and capture low/medium confidence matches for the Blueprint
380
+ for idx, row in df.iterrows():
381
+ cell_val = row[column_name]
382
+ if pd.isna(cell_val): continue
383
+
384
+ raw_parts = [p.strip() for p in re.split(split_pattern, str(cell_val)) if p.strip()]
385
+ cleaned_parts = []
386
+ lowest_conf = "HIGH"
387
+ cell_sources = []
388
+
389
+ i = 0
390
+ while i < len(raw_parts):
391
+ curr = raw_parts[i]
392
+
393
+ # Check for combined pairs (e.g., "University of, Manchester" split by mistake)
394
+ if i + 1 < len(raw_parts):
395
+ combo_clean = match_cache_key(column_name, f"{curr}, {raw_parts[i+1]}")
396
+ if combo_clean in detailed_cache:
397
+ append_unique_cleaned_part(cleaned_parts, detailed_cache[combo_clean]["val"])
398
+ if detailed_cache[combo_clean]["conf"] != "HIGH": lowest_conf = detailed_cache[combo_clean]["conf"]
399
+ cell_sources.append(detailed_cache[combo_clean]["src"])
400
+ i += 2; continue
401
+
402
+ p_clean = match_cache_key(column_name, curr)
403
+ if len(p_clean) <= 1: i += 1; continue
404
+
405
+ cache_hit = detailed_cache.get(p_clean)
406
+ if cache_hit:
407
+ append_unique_cleaned_part(cleaned_parts, cache_hit["val"])
408
+ if cache_hit["conf"] != "HIGH": lowest_conf = cache_hit["conf"]
409
+ cell_sources.append(cache_hit["src"])
410
+ else:
411
+ append_unique_cleaned_part(cleaned_parts, smart_format(curr))
412
+ lowest_conf = "LOW"
413
+ cell_sources.append("Fallback_Format")
414
+ i += 1
415
+
416
+ final_stitched_val = ", ".join(cleaned_parts)
417
+ df.at[idx, f'Cleaned_{column_name}'] = final_stitched_val
418
+
419
+ # Log EVERY change made to the Excel file, plus any low/medium confidence guesses
420
+ if str(cell_val).strip() != final_stitched_val.strip() or lowest_conf != "HIGH":
421
+ blueprint_data.append({
422
+ "Row_Index": idx + 3,
423
+ "Column": column_name,
424
+ "Original_Raw_Text": cell_val,
425
+ "AI_Suggested_Match": final_stitched_val,
426
+ "Human_Override": "",
427
+ "Match_Source": " | ".join(set(cell_sources)),
428
+ "Confidence": lowest_conf
429
+ })
430
+
431
+ return df
src/llm_router.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import time
3
+ from tqdm import tqdm
4
+ from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
5
+
6
class RateLimitException(Exception):
    """Signals that the Groq API answered a request with HTTP 429."""
7
+
8
class GroqRouter:
    """Send normalization prompts to the Groq chat API with model fallback.

    The router keeps a cursor (``current_model_index``) into
    ``available_models``. When a model stays rate limited after the retry
    budget is spent, the cursor advances and is never reset, so later calls
    on the same instance start directly on the fallback model.
    """

    def __init__(self, api_key, available_models):
        self.api_key = api_key
        self.available_models = available_models
        # Index of the model currently in use; only ever moves forward.
        self.current_model_index = 0
        # Last model announced via tqdm.write, to avoid repeating the banner.
        self.last_printed_model = None

    def ask_judge(self, word, candidates, column_name):
        """Ask the LLM to map *word* to an official name for *column_name*.

        Returns a ``(value, source, confidence)`` tuple:

        * ``(answer, "LLM_Judged", "MEDIUM")`` on a successful reply;
        * ``(word, "LLM_Failed", "LOW")`` when the reply is exactly ``UNKNOWN``;
        * ``(word, "No_API_Key", "LOW")`` when no API key is configured;
        * ``(word, "API_Error_<...>", "LOW")`` for HTTP errors, request
          failures, or when every model has been exhausted.

        The input ``word`` is echoed back unchanged in every non-success case.
        """
        # All models were already exhausted by earlier calls.
        if self.current_model_index >= len(self.available_models):
            return (word, "API_Error_All_Models_Dead", "LOW")

        if not self.api_key:
            return (word, "No_API_Key", "LOW")

        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}

        # Column-specific prompt rules; these strings are sent verbatim to the model.
        if column_name in ["Institution", "Degree"]:
            specific_rules = (
                "- Split distinct separate schools or global alliances with a comma (e.g., 'Harvard & MIT' -> 'Harvard University, MIT').\n"
                "- Treat alliances like CEMS as distinct institutions! NEVER delete them.\n"
                "- Do NOT split a Business School and its Parent University. They are ONE entity.\n"
                "- Do NOT split a school and its acronym/alias. Keep them as one entity.\n"
                "- Watch out for joint programs! If a hyphen separates TWO truly distinct institutions, split them.\n"
                "- Do NOT split single names with hyphens (e.g., 'Paris-Descartes').\n"
                "- Strip out ONLY standard degree titles (e.g., 'MSc', 'MBA', 'BBA', 'Ph.D.')."
            )
        elif column_name in ["Country", "Continent"]:
            specific_rules = (
                f"- MUST translate the {column_name} name to its standard ENGLISH equivalent (e.g., 'Espagne' -> 'Spain', 'Milano' -> 'Milan').\n"
                f"- If the input contains multiple distinct {column_name}s, MUST split them into a comma-separated list.\n"
                f"- If it is a single {column_name}, return exactly ONE official English name."
            )
        else:
            specific_rules = (
                "- If the input contains multiple distinct entities, MUST split them into a comma-separated list.\n"
                "- If it is a single entity, return exactly ONE official name. No aliases."
            )

        system_msg = (
            "You are a data normalization expert. Your task is to map raw input to the MOST ACCURATE "
            "official name from a provided list of candidates or your own world knowledge. "
            "Respond ONLY with the final string. No conversational text, no quotes."
        )

        user_msg = (
            f"Column: {column_name}\n"
            f"Raw Input: '{word}'\n"
            f"Database Candidates: {candidates}\n\n"
            f"Rules:\n"
            f"1. Choose the best match from the Database Candidates if possible.\n"
            f"2. If an entity from the Raw Input does NOT exist in the Candidates list, DO NOT map it to a different candidate just because they share a city or country. Instead, return its cleaned original name exactly as written.\n"
            f"{specific_rules}\n"
            f"3. If gibberish, return 'UNKNOWN'."
        )

        # "model" is filled in per attempt inside the fallback loop below.
        payload = {
            "messages": [{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}],
            "temperature": 0.0,
            "max_tokens": 50
        }

        # Retry only on HTTP 429: exponential back-off between 2s and 30s,
        # at most 5 attempts. reraise=True lets the RateLimitException reach
        # the fallback loop once the attempts are used up.
        @retry(
            retry=retry_if_exception_type(RateLimitException),
            wait=wait_exponential(multiplier=2, min=2, max=30),
            stop=stop_after_attempt(5),
            reraise=True
        )
        def fire_request():
            res = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=payload, timeout=30)

            if res.status_code == 429:
                raise RateLimitException("Model Limits Exhausted")

            return res

        # Try the current model; on exhausted rate limit move to the next one.
        while self.current_model_index < len(self.available_models):
            active_model = self.available_models[self.current_model_index]
            payload["model"] = active_model

            if active_model != self.last_printed_model:
                tqdm.write(f"\n[INFO] Routing requests to model: {active_model}")
                self.last_printed_model = active_model

            try:
                # Fixed pause before every request.
                time.sleep(0.3)
                response = fire_request()

                if response.status_code == 200:
                    # Strip whitespace and any wrapping quotes from the reply.
                    answer = response.json()["choices"][0]["message"]["content"].strip().strip('"\'')
                    if answer == "UNKNOWN":
                        return (word, "LLM_Failed", "LOW")
                    return (answer, "LLM_Judged", "MEDIUM")

                else:
                    # Any other status is returned immediately, without fallback.
                    tqdm.write(f" [!] Groq API Error: {response.status_code} for '{word}'")
                    return (word, f"API_Error_{response.status_code}", "LOW")

            except RateLimitException:
                tqdm.write(f" [!] Limits exhausted for {active_model}!")

                self.current_model_index += 1

                if self.current_model_index < len(self.available_models):
                    next_model = self.available_models[self.current_model_index]
                    tqdm.write(f" [!] Switching to fallback model: {next_model}...")
                else:
                    tqdm.write(f" [!] FATAL: ALL {len(self.available_models)} MODELS DEAD. Skipping remainder of dataset.")
                    return (word, "API_Error_All_Models_Dead", "LOW")

            except Exception as e:
                # NOTE(review): this also catches non-timeout failures raised in
                # the try body (e.g. a malformed JSON reply), yet all of them are
                # reported under the "API_Error_Timeout" label.
                tqdm.write(f" [!] Request failed for '{word}' on {active_model}: {e}")
                return (word, "API_Error_Timeout", "LOW")

        return (word, "API_Error_Unhandled", "LOW")
src/process_runner.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs
2
+ import json
3
+ import os
4
+ import signal
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+
9
# Registry of running subprocesses keyed by job id; stream_process() adds and
# removes entries, stop_process() looks them up.
ACTIVE_PROCESSES = {}
10
+
11
+
12
def stop_process(job_id: str) -> bool:
    """Request termination of the subprocess registered under *job_id*.

    Returns ``False`` when no process is registered for the job or the
    process has already exited; otherwise a stop signal is sent and ``True``
    is returned.
    """
    proc = ACTIVE_PROCESSES.get(job_id)
    if not proc:
        return False
    if proc.poll() is not None:
        return False

    on_windows = os.name == "nt"
    try:
        if on_windows:
            # CTRL_BREAK reaches the child's own process group on Windows.
            proc.send_signal(signal.CTRL_BREAK_EVENT)
        else:
            proc.terminate()
    except Exception:
        # Fall back to a plain terminate if the preferred signal failed.
        proc.terminate()

    return True
26
+
27
+
28
def stream_process(command, cwd: Path, job_id=None):
    """Run *command* and yield its combined stdout/stderr as Server-Sent Events.

    Each output chunk is yielded as ``data: <json string>`` followed by a
    blank line. After the process exits, a final data event carrying
    ``Process exited with code N`` is yielded, then an ``event: done``
    (exit code 0) or ``event: failed`` (any other code) marker.

    Args:
        command: Argument list passed to ``subprocess.Popen``.
        cwd: Working directory for the child process.
        job_id: Optional key; when given, the process is registered in
            ``ACTIVE_PROCESSES`` for the duration of the stream so that
            ``stop_process`` can find it.
    """
    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"
    popen_kwargs = {
        "cwd": cwd,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,
        "bufsize": 0,
        "env": env,
    }
    if os.name == "nt":
        # Own process group so CTRL_BREAK_EVENT can target the child alone.
        popen_kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP

    process = subprocess.Popen(
        command,
        **popen_kwargs,
    )
    if job_id:
        ACTIVE_PROCESSES[job_id] = process
    try:
        assert process.stdout is not None
        decoder = codecs.getincrementaldecoder("utf-8")("replace")
        # With bufsize=0 the pipe is unbuffered, so read(4096) returns as soon
        # as any bytes are available instead of waiting for a full buffer.
        # Output still streams live, without one event per byte. b"" means
        # EOF, which ends the loop rather than spinning until the child exits.
        while raw_chunk := process.stdout.read(4096):
            chunk = decoder.decode(raw_chunk)
            # A chunk ending mid-character decodes to "" until completed.
            if chunk:
                yield f"data: {json.dumps(chunk)}\n\n"

        exit_code = process.wait()
        trailing_chunk = decoder.decode(b"", final=True)
        if trailing_chunk:
            yield f"data: {json.dumps(trailing_chunk)}\n\n"
        yield f"data: {json.dumps(chr(10) + f'Process exited with code {exit_code}' + chr(10))}\n\n"
        event_name = "done" if exit_code == 0 else "failed"
        yield f"event: {event_name}\ndata: {{}}\n\n"
    finally:
        if job_id:
            ACTIVE_PROCESSES.pop(job_id, None)
src/utils.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import re
4
+ import unicodedata
5
+
6
def strip_degrees_for_search(text):
    """Remove common degree titles from *text*.

    Non-string input is returned unchanged. If removing the titles leaves
    nothing, the stripped original text is returned instead.
    """
    if not isinstance(text, str):
        return text

    without_titles = re.sub(
        r'\b(MSc|MBA|BBA|BSc|Ph\.?D\.?|BA|MA|BS|MS|EMBA|Master|Bachelor|Masters|Bachelors|Licence)\b',
        '',
        text,
        flags=re.IGNORECASE,
    )
    single_spaced = re.sub(r'\s+', ' ', without_titles)
    # Drop separators left dangling at either end after a title was removed.
    trimmed = single_spaced.strip(' -.,&/|')
    return trimmed or text.strip()
14
+
15
def smart_format(text):
    """Title-case *text* while keeping known acronyms and French elisions tidy.

    Non-string input is returned unchanged. After ``str.title()``, the
    listed acronyms are upper-cased as whole words, ``PHD``/``BSC``/``MSC``
    are rewritten to ``PhD``/``BSc``/``MSc``, and the letter following
    ``L'``, ``D'`` or ``Qu'`` is lower-cased.
    """
    if not isinstance(text, str):
        return text

    acronyms = ['Ma', 'Ba', 'Mba', 'Bba', 'Hr', 'It', 'Bs', 'Ms', 'Phd', 'Bsc', 'Msc', 'Llm', 'Pge', 'Cems']
    # One whole-word alternation covers every acronym in a single pass.
    acronym_pattern = r'\b(?:' + '|'.join(acronyms) + r')\b'

    formatted = re.sub(acronym_pattern, lambda hit: hit.group(0).upper(), text.title())
    for shouted, mixed in (("PHD", "PhD"), ("BSC", "BSc"), ("MSC", "MSc")):
        formatted = formatted.replace(shouted, mixed)

    def _lower_after_elision(hit):
        return f"{hit.group(1)}'{hit.group(2).lower()}"

    formatted = re.sub(r"\b(L|D|Qu)'([A-Z])", _lower_after_elision, formatted)
    return formatted.strip()
24
+
25
def clean_degree_text(text):
    """Normalise a degree string and format it with ``smart_format``.

    Non-string input yields ``""``. The words "and" / "et" become ``&``,
    characters other than word characters, whitespace and ``- & + '`` become
    spaces, and runs of whitespace are collapsed before formatting.
    """
    if not isinstance(text, str):
        return ""

    # Applied strictly in this order: (pattern, replacement, flags).
    substitutions = (
        (r'\band\b', '&', re.IGNORECASE),
        (r'\bet\b', '&', re.IGNORECASE),
        (r'[^\w\s\-&\+\']', ' ', 0),
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    return smart_format(text.strip())
32
+
33
def normalize_text(text):
    """Return *text* lower-cased, stripped and without combining accents.

    Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""

    decomposed = unicodedata.normalize('NFD', text)
    # Category 'Mn' marks the combining accents split off by NFD.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).strip().lower()
37
+
38
def normalize_ref(value):
    """Return the ``normalize_text`` form of *value* after converting it to ``str``."""
    as_text = str(value)
    return normalize_text(as_text)
40
+
41
def iter_ref_values(ref_data):
    """Yield every string held by a reference bucket.

    For a dict, all string keys are yielded first, then all string values.
    For a list, its string items are yielded in order. Any other input
    yields nothing.
    """
    if isinstance(ref_data, dict):
        for key in ref_data.keys():
            if isinstance(key, str):
                yield key
        for mapped in ref_data.values():
            if isinstance(mapped, str):
                yield mapped
    elif isinstance(ref_data, list):
        for entry in ref_data:
            if isinstance(entry, str):
                yield entry
47
+
48
def ref_contains(ref_data, value):
    """Return True if *value* matches any string in *ref_data* after normalisation."""
    target = normalize_ref(value)
    for candidate in iter_ref_values(ref_data):
        if normalize_ref(candidate) == target:
            return True
    return False
51
+
52
def prune_manual_refs_against_official(manual_refs, official_refs):
    """Drop manual reference entries that are invalid, duplicated or official.

    ``manual_refs`` is modified in place, column by column:

    * list buckets lose non-string items, items that normalise to an empty
      string, repeats of an earlier item, and items already present in the
      official bucket of the same column;
    * dict buckets are filtered on their value (or on the alias when the
      value is not a string) by the same rules, and surviving entries are
      re-keyed by the normalised alias.

    Buckets of any other type are left untouched.

    Returns:
        The number of entries removed.
    """
    removed_count = 0

    for column_name, manual_bucket in list(manual_refs.items()):
        official_bucket = official_refs.get(column_name, [])
        # Normalise the official bucket once per column instead of
        # re-scanning it for every manual entry.
        official_keys = {normalize_ref(item) for item in iter_ref_values(official_bucket)}

        if isinstance(manual_bucket, list):
            kept = []
            seen = set()
            for value in manual_bucket:
                if not isinstance(value, str):
                    removed_count += 1
                    continue

                key = normalize_ref(value)
                if not key or key in seen or key in official_keys:
                    removed_count += 1
                    continue

                seen.add(key)
                kept.append(value)
            manual_refs[column_name] = kept

        elif isinstance(manual_bucket, dict):
            kept = {}
            seen_values = set()
            for alias, value in manual_bucket.items():
                # Compare on the mapped value; fall back to the alias itself
                # when the value is not a string.
                candidate = value if isinstance(value, str) else alias
                key = normalize_ref(candidate)
                if not key or key in seen_values or key in official_keys:
                    removed_count += 1
                    continue

                seen_values.add(key)
                kept[normalize_ref(alias)] = value
            manual_refs[column_name] = kept

    return removed_count
98
+
99
# Location of the manual references file, used both relative to the app root
# on disk and as the destination path inside the Space repository.
MANUAL_REFERENCES_REPO_PATH = "refdata/manual_references.json"
100
+
101
def reference_sync_status():
    """Report whether manual references can be synced to a Hugging Face Space.

    Reads the ``SPACE_ID`` and ``HF_TOKEN`` environment variables and returns
    a dict with ``enabled``, ``space_id`` and ``reason`` keys; ``reason`` is
    empty when syncing is enabled.
    """
    space_id = os.getenv("SPACE_ID", "")
    token_present = bool(os.getenv("HF_TOKEN"))

    if space_id and token_present:
        reason = ""
    elif not space_id:
        reason = "Reference sync is only available on Hugging Face Spaces."
    else:
        reason = "HF_TOKEN secret is missing from this Space."

    return {
        "enabled": reason == "",
        "space_id": space_id,
        "reason": reason,
    }
124
+
125
def save_manual_references_to_hub(app_root: Path):
    """Upload the manual references file to the current Hugging Face Space.

    Raises:
        RuntimeError: if reference sync is not enabled, or ``huggingface_hub``
            cannot be imported.
        FileNotFoundError: if the manual references file does not exist
            under *app_root*.

    Returns:
        A dict with the target ``space_id``, the repository ``path`` and the
        upload result rendered as a string under ``commit_url``.
    """
    sync = reference_sync_status()
    if not sync["enabled"]:
        raise RuntimeError(sync["reason"])

    local_file = app_root / MANUAL_REFERENCES_REPO_PATH
    if not local_file.is_file():
        raise FileNotFoundError(f"Manual references file not found: {local_file}")

    # Imported lazily so the module loads without huggingface_hub installed.
    try:
        from huggingface_hub import HfApi
    except ImportError as exc:
        raise RuntimeError("huggingface_hub is not installed.") from exc

    target_space = sync["space_id"]
    hub = HfApi(token=os.environ["HF_TOKEN"])
    upload_result = hub.upload_file(
        path_or_fileobj=str(local_file),
        path_in_repo=MANUAL_REFERENCES_REPO_PATH,
        repo_id=target_space,
        repo_type="space",
        commit_message="Update manual references",
    )

    return {
        "space_id": target_space,
        "path": MANUAL_REFERENCES_REPO_PATH,
        "commit_url": str(upload_result),
    }
src/workbook_io.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ from pathlib import Path
3
+
4
+ import openpyxl
5
+ from werkzeug.utils import secure_filename
6
+
7
+
8
# Lower-case file extensions accepted by save_uploaded_excel(); kept as a
# tuple so it can be passed directly to str.endswith().
ALLOWED_EXCEL_EXTENSIONS = (".xlsx", ".xlsm")
9
+
10
+
11
def save_uploaded_excel(uploaded, upload_dir: Path):
    """Validate an uploaded workbook and store it under a unique name.

    Args:
        uploaded: Upload object exposing ``filename`` and ``save(path)``.
        upload_dir: Directory the file is written into.

    Returns:
        ``(saved_filename, destination)`` where ``saved_filename`` is the
        sanitised stem plus an 8-character random suffix and the original
        extension.

    Raises:
        ValueError: if nothing was uploaded or the extension is not one of
            ``ALLOWED_EXCEL_EXTENSIONS``.
    """
    if not uploaded or not uploaded.filename:
        raise ValueError("No file uploaded.")

    raw_name = uploaded.filename
    filename = secure_filename(raw_name)
    if not filename.lower().endswith(ALLOWED_EXCEL_EXTENSIONS):
        # secure_filename() keeps ASCII only and strips leading dots, so a
        # valid upload such as "文件.xlsx" collapses to "xlsx". Re-check the
        # extension on the original name before rejecting the file.
        raw_suffix = Path(raw_name).suffix
        if raw_suffix.lower() not in ALLOWED_EXCEL_EXTENSIONS:
            raise ValueError("Upload an .xlsx or .xlsm file.")
        # The stem may sanitise to nothing; use a neutral placeholder then.
        safe_stem = secure_filename(Path(raw_name).stem) or "workbook"
        filename = f"{safe_stem}{raw_suffix}"

    stem = Path(filename).stem
    suffix = Path(filename).suffix
    # Random suffix avoids collisions between uploads with the same name.
    saved_filename = f"{stem}_{uuid.uuid4().hex[:8]}{suffix}"
    destination = upload_dir / saved_filename
    uploaded.save(destination)
    return saved_filename, destination
25
+
26
+
27
def read_workbook_sheets(path: Path) -> list[str]:
    """Return the sheet names of the workbook at *path*.

    The workbook is opened read-only and always closed again, even if
    reading the names fails.
    """
    book = openpyxl.load_workbook(path, read_only=True, data_only=False)
    try:
        names = book.sheetnames
    finally:
        book.close()
    return names
33
+
34
+
35
def resolve_allowed_path(raw_path: str, app_root: Path, allowed_roots: list[Path]) -> Path:
    """Resolve *raw_path* and ensure it lies inside one of *allowed_roots*.

    Relative paths are interpreted against *app_root*. The resolved path
    must equal an allowed root or be located beneath one.

    Raises:
        ValueError: if *raw_path* is empty or resolves outside every root.
    """
    if not raw_path:
        raise ValueError("Path is required.")

    requested = Path(raw_path)
    anchored = requested if requested.is_absolute() else app_root / requested
    target = anchored.resolve()

    permitted_roots = [root.resolve() for root in allowed_roots]
    for base in permitted_roots:
        if target == base or target.is_relative_to(base):
            return target

    raise ValueError("Path is outside the application data directory.")
ui/static/app.js ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Configuration read from window.MASTERMAP_CONFIG, with empty fallbacks.
const config = window.MASTERMAP_CONFIG || {};
const cleanPath = config.cleanPath || "";
// Reassigned after a successful upload in uploadApplyFile().
let applyWorkbookPath = config.applyWorkbookPath || "";
let applyBlueprintPath = config.applyBlueprintPath || "";
const defaultOutputSheet = config.defaultOutputSheet || "Cleaned_Data";

// Form controls.
const sheetSelect = document.getElementById("sheetSelect");
const applySheetSelect = document.getElementById("applySheetSelect");
const outputSheet = document.getElementById("outputSheet");
const models = document.getElementById("models");
const fetchModels = document.getElementById("fetchModels");
const runButton = document.getElementById("runButton");
const applyButton = document.getElementById("applyButton");
const saveReferencesButton = document.getElementById("saveReferencesButton");
const applyWorkbookForm = document.getElementById("applyWorkbookForm");
const applyBlueprintForm = document.getElementById("applyBlueprintForm");
const applyWorkbookInput = document.getElementById("applyWorkbookInput");
const applyBlueprintInput = document.getElementById("applyBlueprintInput");

// Status lines, file pills, log panes and result panels.
const runStatus = document.getElementById("runStatus");
const applyStatus = document.getElementById("applyStatus");
const referencesStatus = document.getElementById("referencesStatus");
const applyWorkbookFile = document.getElementById("applyWorkbookFile");
const applyBlueprintFile = document.getElementById("applyBlueprintFile");
const cleanLogs = document.getElementById("cleanLogs");
const applyLogs = document.getElementById("applyLogs");
const cleanProgressPanel = document.getElementById("cleanProgressPanel");
const cleanProgressSummary = document.getElementById("cleanProgressSummary");
const cleanProgressList = document.getElementById("cleanProgressList");
const cleanResult = document.getElementById("cleanResult");
const applyResult = document.getElementById("applyResult");

// Streaming state for the cleaning run: full log text, the line currently
// being assembled, and per-column progress in first-seen order.
let cleanRawLogText = "";
let cleanLiveLine = "";
let cleanProgressOrder = [];
let cleanProgressByColumn = {};
// Streaming state for the apply run.
let applyRawLogText = "";
let applyLiveLine = "";
// State of the run in flight; reset by setRunButtonIdle().
let activeRunStream = null;
let activeRunJobId = "";
let stopRequested = false;
39
+
40
/**
 * Copy the current form selections into hidden inputs of the given upload
 * form, then submit it. The two "apply" forms are uploaded via fetch
 * (uploadApplyFile); every other form is submitted natively.
 */
function submitUploadForm(formId, statusId, event) {
  if (event) {
    event.preventDefault();
  }

  const form = document.getElementById(formId);
  const status = document.getElementById(statusId);

  // Hidden fields that carry the current UI selections with the upload.
  const carriedFields = {
    clean_selected_sheet: sheetSelect ? sheetSelect.value : "",
    output_sheet: outputSheet ? outputSheet.value : "",
    models: models ? models.value : "",
    apply_selected_sheet: applySheetSelect ? applySheetSelect.value : ""
  };

  for (const [name, value] of Object.entries(carriedFields)) {
    let hidden = form.querySelector(`input[name="${name}"]`);
    if (!hidden) {
      hidden = document.createElement("input");
      hidden.type = "hidden";
      hidden.name = name;
      form.appendChild(hidden);
    }
    hidden.value = value;
  }

  status.textContent = "Loading...";

  const isApplyUpload = formId === "applyWorkbookForm" || formId === "applyBlueprintForm";
  if (!isApplyUpload) {
    form.submit();
    return;
  }
  uploadApplyFile(form, status, formId);
}
70
+
71
/**
 * POST an apply-workbook or apply-blueprint form via fetch and update the
 * page from the JSON reply: status text, file pill, sheet list and the
 * enabled state of the apply button.
 */
async function uploadApplyFile(form, status, formId) {
  const filePill = filename =>
    filename ? `<div class="file-pill">${escapeHtml(filename)}</div>` : "";

  try {
    const res = await fetch(form.action, {
      method: "POST",
      body: new FormData(form),
      headers: { "Accept": "application/json" }
    });
    const data = await res.json();

    if (!res.ok) {
      status.textContent = data.error || "Upload failed.";
      status.classList.add("error");
      return;
    }

    status.classList.remove("error");
    status.textContent = data.message || "Loaded.";

    switch (formId) {
      case "applyWorkbookForm":
        applyWorkbookPath = data.apply_workbook_path || "";
        applyWorkbookFile.innerHTML = filePill(data.apply_workbook_filename);
        break;
      case "applyBlueprintForm":
        applyBlueprintPath = data.apply_blueprint_path || "";
        applyBlueprintFile.innerHTML = filePill(data.apply_blueprint_filename);
        break;
    }

    if (data.apply_sheets) {
      setApplySheets(data.apply_sheets, data.apply_selected_sheet);
    }

    // Applying needs a workbook, a blueprint and a selected sheet.
    const readyToApply = applyWorkbookPath && applyBlueprintPath && applySheetSelect.value;
    applyButton.disabled = !readyToApply;
  } catch (error) {
    status.textContent = "Upload failed.";
    status.classList.add("error");
  }
}
108
+
109
/** Reset all state and DOM output belonging to the cleaning run. */
function clearCleanOutput() {
  // In-memory state.
  cleanRawLogText = "";
  cleanLiveLine = "";
  cleanProgressOrder = [];
  cleanProgressByColumn = {};

  // Log pane.
  cleanLogs.textContent = "";

  // Progress panel.
  cleanProgressPanel.classList.remove("active");
  cleanProgressSummary.textContent = "Waiting...";
  cleanProgressList.innerHTML = "";

  // Result panel.
  cleanResult.classList.remove("active");
  cleanResult.innerHTML = "";
}
121
+
122
/** Reset all state and DOM output belonging to the apply run. */
function clearApplyOutput() {
  applyRawLogText = "";
  applyLiveLine = "";

  applyLogs.textContent = "";

  applyResult.classList.remove("active");
  applyResult.innerHTML = "";
}
129
+
130
/**
 * Parse one "Cleaning <column>: NN%|bar| cur/total [ ... ]" progress line.
 * Returns null when the line does not match, otherwise an object with
 * columnName, percent (number), current, total, timing and metrics.
 */
function parseProgressLine(line) {
  // Strip ANSI colour sequences before matching.
  const plain = line.replace(/\x1b\[[0-9;]*m/g, "").trim();
  const pattern = /^Cleaning\s+(.+?):\s+(\d+)%\|.*?\|\s+(\d+)\/(\d+)\s+\[([^\]]*)\]/;
  const found = plain.match(pattern);
  if (!found) {
    return null;
  }

  const [, rawColumn, rawPercent, current, total, bracket] = found;
  const pieces = bracket
    .split(",")
    .map(piece => piece.trim())
    .filter(Boolean);

  // The first two bracket entries are timing; the rest are metrics.
  const timingText = pieces.slice(0, 2).join(", ");

  return {
    columnName: rawColumn.trim(),
    percent: Number(rawPercent),
    current,
    total,
    timing: timingText.includes("?") ? "estimating..." : timingText,
    metrics: pieces.slice(2).join(", ")
  };
}
147
+
148
/** Convert a value to a string and escape the characters & < > " ' for HTML. */
function escapeHtml(value) {
  // "&" must be handled first so the entities added afterwards stay intact.
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
157
+
158
/** Redraw the progress panel with one row per column seen so far. */
function renderCleanProgressRows() {
  const columnCount = cleanProgressOrder.length;
  const plural = columnCount === 1 ? "" : "s";

  const buildRow = columnName => {
    const item = cleanProgressByColumn[columnName];
    // Clamp to 0..100 so the bar width stays valid.
    const percent = Math.max(0, Math.min(100, item.percent));
    let meta = `${item.current}/${item.total}`;
    if (item.timing) {
      meta += " | " + item.timing;
    }
    if (item.metrics) {
      meta += " | " + item.metrics;
    }
    return `
      <div class="progress-row">
        <div class="progress-top">
          <strong>${escapeHtml(columnName)}</strong>
          <span>${percent}%</span>
        </div>
        <div class="progress-track">
          <div class="progress-fill" style="width:${percent}%"></div>
        </div>
        <div class="progress-meta">${escapeHtml(meta)}</div>
      </div>
    `;
  };

  cleanProgressPanel.classList.add("active");
  cleanProgressSummary.textContent = `${columnCount} column${plural}`;
  cleanProgressList.innerHTML = cleanProgressOrder.map(buildRow).join("");
}
179
+
180
/**
 * Record the progress described by `line` and redraw the panel.
 * Returns false when the line is not a progress line, true otherwise.
 */
function renderCleanProgressLine(line) {
  const progress = parseProgressLine(line);
  if (!progress) {
    return false;
  }

  const { columnName } = progress;
  const alreadyTracked = Boolean(cleanProgressByColumn[columnName]);
  if (!alreadyTracked) {
    // Remember first-seen order so rows keep a stable position.
    cleanProgressOrder.push(columnName);
  }
  cleanProgressByColumn[columnName] = progress;

  renderCleanProgressRows();
  return true;
}
191
+
192
/**
 * Append a streamed chunk to the cleaning log. Carriage returns are logged
 * as newlines; every completed line is offered to the progress parser.
 */
function appendCleanLogChunk(chunk) {
  for (const char of chunk) {
    const endsLine = char === "\r" || char === "\n";
    cleanRawLogText += char === "\r" ? "\n" : char;

    if (endsLine) {
      renderCleanProgressLine(cleanLiveLine);
      cleanLiveLine = "";
    } else {
      cleanLiveLine += char;
    }
  }

  // A half-received progress bar is not parsed until its line is complete.
  const looksLikePartialBar = cleanLiveLine.startsWith("Cleaning ") && cleanLiveLine.includes("|");
  if (!looksLikePartialBar) {
    renderCleanProgressLine(cleanLiveLine);
  }

  cleanLogs.textContent = cleanRawLogText;
  cleanLogs.scrollTop = cleanLogs.scrollHeight;
}
212
+
213
/**
 * Show the apply result panel once the log contains the "corrections
 * injected" and/or "Memory updated" summary lines.
 */
function renderApplySummary() {
  const changed = applyRawLogText.match(/Success!\s+(\d+)\s+corrections injected/i);
  const added = applyRawLogText.match(/Memory updated:\s+(\d+)\s+new approved values added/i);
  if (!changed && !added) {
    return;
  }

  // A missing summary line counts as zero.
  const countOf = found => (found ? found[1] : "0");
  const pluralOf = found => (found && found[1] === "1" ? "" : "s");

  applyResult.classList.add("active");
  applyResult.innerHTML = `
    <strong>Blueprint applied</strong>
    <div class="status">${countOf(changed)} workbook row value${pluralOf(changed)} updated from human overrides.</div>
    <div class="status">${countOf(added)} new unique reference value${pluralOf(added)} added to manual references.</div>
    <a class="download-link" href="/download-applied-workbook">Download Cleaned Workbook</a>
  `;
}
226
+
227
/**
 * Append a streamed chunk to the apply log (carriage returns become
 * newlines), refresh the log pane and re-evaluate the summary panel.
 */
function appendApplyLogChunk(chunk) {
  for (const char of chunk) {
    const isCarriageReturn = char === "\r";
    applyRawLogText += isCarriageReturn ? "\n" : char;

    if (isCarriageReturn || char === "\n") {
      applyLiveLine = "";
    } else {
      applyLiveLine += char;
    }
  }

  applyLogs.textContent = applyRawLogText;
  applyLogs.scrollTop = applyLogs.scrollHeight;
  renderApplySummary();
}
240
+
241
/** Remove every log/result entry this page keeps in localStorage. */
function clearPersistedOutputs() {
  const persistedKeys = [
    "mastermap.cleanRawLogText",
    "mastermap.applyRawLogText",
    "mastermap.cleanResultHtml",
    "mastermap.applyResultHtml",
    "mastermap.cleanResultActive",
    "mastermap.applyResultActive"
  ];
  for (const key of persistedKeys) {
    localStorage.removeItem(key);
  }
}
251
+
252
/** Forget the active run and return the run button to its idle look. */
function setRunButtonIdle() {
  // Run bookkeeping.
  stopRequested = false;
  activeRunJobId = "";
  activeRunStream = null;

  // Button appearance.
  runButton.classList.remove("danger");
  runButton.textContent = "Run Cleaning";
  runButton.disabled = false;
}
260
+
261
/** Ask the server to stop the active run; does nothing when no run is active. */
async function stopActiveRun() {
  if (!activeRunJobId) {
    return;
  }

  stopRequested = true;
  runButton.disabled = true;
  runButton.textContent = "Stopping...";
  runStatus.textContent = "Stopping run...";

  const stopUrl = `/stop?job_id=${encodeURIComponent(activeRunJobId)}`;
  await fetch(stopUrl, { method: "POST" });
}
269
+
270
/**
 * Select `sheetName` in the apply-sheet dropdown, adding it as an option
 * first when it is not listed yet. Empty names are ignored.
 */
function selectApplySheet(sheetName) {
  if (!sheetName) {
    return;
  }

  let listed = false;
  for (const option of Array.from(applySheetSelect.options)) {
    if (option.value === sheetName) {
      listed = true;
      break;
    }
  }

  if (!listed) {
    const added = document.createElement("option");
    added.value = sheetName;
    added.textContent = sheetName;
    applySheetSelect.appendChild(added);
  }

  applySheetSelect.value = sheetName;
  applySheetSelect.disabled = false;
}
282
+
283
/**
 * Replace the apply-sheet dropdown options with `sheets`. The preferred
 * sheet (or the previous selection) is kept when still available, else
 * the first sheet is selected. The dropdown is disabled when empty.
 */
function setApplySheets(sheets, preferredSheet) {
  // Read the old selection before the options are wiped.
  const wanted = preferredSheet || applySheetSelect.value;

  applySheetSelect.innerHTML = "";
  for (const sheetName of sheets) {
    const option = document.createElement("option");
    option.value = sheetName;
    option.textContent = sheetName;
    applySheetSelect.appendChild(option);
  }

  const keepWanted = wanted && sheets.includes(wanted);
  if (keepWanted) {
    applySheetSelect.value = wanted;
  } else if (sheets.length) {
    applySheetSelect.value = sheets[0];
  }

  applySheetSelect.disabled = sheets.length === 0;
}
299
+
300
/**
 * Reload the sheet list of the apply workbook from /sheets. Without a
 * workbook path, or when the request is not OK, only the preferred sheet
 * is selected.
 */
async function refreshApplySheets(preferredSheet) {
  if (!applyWorkbookPath) {
    return selectApplySheet(preferredSheet);
  }

  const query = new URLSearchParams({ path: applyWorkbookPath }).toString();
  const res = await fetch(`/sheets?${query}`);
  if (res.ok) {
    const data = await res.json();
    setApplySheets(data.sheets || [], preferredSheet);
    return;
  }

  selectApplySheet(preferredSheet);
}
311
+
312
/**
 * Query /references/status and enable or disable the save button to match.
 * The status text is only rewritten when `updateText` is true.
 */
async function refreshReferenceSyncStatus(updateText = true) {
  let message;
  try {
    const res = await fetch("/references/status");
    const data = await res.json();
    saveReferencesButton.disabled = !data.enabled;
    message = data.enabled
      ? `Ready to save manual references to ${data.space_id}.`
      : data.reason || "Reference sync is unavailable.";
  } catch (error) {
    // Status could not be fetched or parsed: keep saving disabled.
    saveReferencesButton.disabled = true;
    message = "Reference sync status unavailable.";
  }

  if (updateText) {
    referencesStatus.textContent = message;
  }
}
329
+
330
clearPersistedOutputs();
refreshReferenceSyncStatus();

// Form submission and file selection both hand off to submitUploadForm with
// the owning form id and its status element id. Registration order matches
// the table order. The braces keep the table out of the script's scope.
{
  const uploadBindings = [
    [applyWorkbookForm, "submit", "applyWorkbookForm", "applyWorkbookUploadStatus"],
    [applyBlueprintForm, "submit", "applyBlueprintForm", "applyBlueprintUploadStatus"],
    [applyWorkbookInput, "change", "applyWorkbookForm", "applyWorkbookUploadStatus"],
    [applyBlueprintInput, "change", "applyBlueprintForm", "applyBlueprintUploadStatus"]
  ];
  for (const [target, eventName, formId, statusId] of uploadBindings) {
    target.addEventListener(eventName, event => {
      submitUploadForm(formId, statusId, event);
    });
  }
}
345
+
346
// Fetch the model list from /models and write it, comma-separated, into the
// models textarea. The button is always re-enabled afterwards, including
// when the request fails or the response cannot be parsed.
fetchModels.addEventListener("click", async () => {
  fetchModels.disabled = true;
  runStatus.textContent = "Fetching Groq models...";
  try {
    const res = await fetch("/models");
    const data = await res.json();
    if (!res.ok) {
      runStatus.textContent = data.error || "Could not fetch models";
      return;
    }
    if (!Array.isArray(data.models)) {
      // Leave the current textarea contents alone rather than clearing them.
      runStatus.textContent = "Could not fetch models";
      return;
    }
    models.value = data.models.join(",");
    runStatus.textContent = "Model list updated.";
  } catch (error) {
    // Network failure or non-JSON body.
    runStatus.textContent = "Could not fetch models";
  } finally {
    fetchModels.disabled = false;
  }
});
359
+
360
// POST to /references/save and report the outcome in the references status
// line. Whatever happens, the button state is re-synchronised afterwards
// without touching the message that was just displayed.
saveReferencesButton.addEventListener("click", async () => {
  saveReferencesButton.disabled = true;
  referencesStatus.textContent = "Saving manual references...";
  try {
    const res = await fetch("/references/save", { method: "POST" });
    const data = await res.json();
    if (!res.ok) {
      // No text-updating refresh here: it would replace this error message
      // with the generic sync status. The `finally` below restores the button.
      referencesStatus.textContent = data.error || "Could not save manual references.";
      return;
    }
    referencesStatus.textContent = data.message || "Manual references saved.";
  } catch (error) {
    referencesStatus.textContent = "Could not save manual references.";
  } finally {
    await refreshReferenceSyncStatus(false);
  }
});
378
+
379
// Start a cleaning run and stream its output over Server-Sent Events.
// While a run is active the same button acts as the stop control.
runButton.addEventListener("click", () => {
  if (activeRunStream) {
    stopActiveRun();
    return;
  }
  if (!cleanPath || !sheetSelect.value) return;
  clearCleanOutput();
  // The job id lets a later stop request name this run; fall back to a
  // timestamp where crypto.randomUUID is not available.
  activeRunJobId = window.crypto && window.crypto.randomUUID ? window.crypto.randomUUID() : String(Date.now());
  stopRequested = false;
  runButton.disabled = false;
  runButton.textContent = "Stop Cleaning";
  runButton.classList.add("danger");
  runStatus.textContent = "Running...";
  const params = new URLSearchParams({
    job_id: activeRunJobId,
    input: cleanPath,
    sheet: sheetSelect.value,
    output_sheet: outputSheet.value || defaultOutputSheet,
    models: models.value.trim()
  });
  const stream = new EventSource(`/run?${params.toString()}`);
  activeRunStream = stream;
  stream.onmessage = event => appendCleanLogChunk(JSON.parse(event.data));
  stream.addEventListener("done", async () => {
    stream.close();
    if (!stopRequested) {
      applyBlueprintPath = "data/Blueprint.xlsx";
      const targetSheet = outputSheet.value || defaultOutputSheet;
      try {
        await refreshApplySheets(targetSheet);
      } catch (error) {
        // A failed sheet lookup must not strand the button in its
        // "Stop Cleaning" state; select the target sheet directly instead.
        selectApplySheet(targetSheet);
      }
      applyButton.disabled = !(applyWorkbookPath && applyBlueprintPath && applySheetSelect.value);
      applyStatus.textContent = "Generated blueprint is ready for Apply Blueprint.";
      cleanResult.classList.add("active");
      cleanResult.innerHTML = `
        <strong>Blueprint generated</strong>
        <div class="status">Blueprint saved at data/Blueprint.xlsx</div>
        <a class="download-link" href="/download-blueprint">Download Blueprint</a>
        <a class="download-link" href="/download-cleaned-workbook">Download Cleaned Workbook</a>
      `;
      runStatus.textContent = "Finished.";
    } else {
      runStatus.textContent = "Stopped.";
    }
    setRunButtonIdle();
  });
  stream.addEventListener("failed", () => {
    stream.close();
    setRunButtonIdle();
    runStatus.textContent = "Run failed. Check logs.";
  });
  stream.addEventListener("error", () => {
    // Close explicitly so the EventSource does not try to reconnect.
    stream.close();
    setRunButtonIdle();
    runStatus.textContent = "Run stopped. Check logs.";
  });
});
434
+
435
// Apply the selected blueprint to the selected workbook sheet, streaming the
// process output into the apply log via Server-Sent Events.
applyButton.addEventListener("click", () => {
  const ready = applyWorkbookPath && applyBlueprintPath && applySheetSelect.value;
  if (!ready) {
    return;
  }
  clearApplyOutput();
  applyButton.disabled = true;
  applyStatus.textContent = "Applying blueprint...";

  const query = new URLSearchParams({
    input: applyWorkbookPath,
    blueprint: applyBlueprintPath,
    sheet: applySheetSelect.value
  }).toString();
  const stream = new EventSource(`/apply?${query}`);

  // Shared teardown for every terminal event: close the stream, re-enable
  // the button, then show the outcome text.
  const finish = statusText => {
    stream.close();
    applyButton.disabled = false;
    applyStatus.textContent = statusText;
  };

  stream.onmessage = event => appendApplyLogChunk(JSON.parse(event.data));
  stream.addEventListener("done", () => {
    finish("Finished.");
    renderApplySummary();
  });
  stream.addEventListener("failed", () => {
    finish("Apply failed. Check logs.");
  });
  stream.addEventListener("error", () => {
    finish("Apply stopped. Check logs.");
  });
});
ui/static/styles.css ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --bg: #f6f7f9;
3
+ --panel: #ffffff;
4
+ --text: #17202a;
5
+ --muted: #627386;
6
+ --line: #d9e0e7;
7
+ --accent: #2457c5;
8
+ --accent-dark: #1d459c;
9
+ --danger: #b42318;
10
+ }
11
+ * { box-sizing: border-box; }
12
+ body {
13
+ margin: 0;
14
+ font-family: Arial, sans-serif;
15
+ color: var(--text);
16
+ background: var(--bg);
17
+ }
18
+ header {
19
+ padding: 20px 28px;
20
+ background: var(--panel);
21
+ border-bottom: 1px solid var(--line);
22
+ }
23
+ h1 { margin: 0; font-size: 22px; }
24
+ h2 { font-size: 16px; margin: 0 0 12px; }
25
+ main {
26
+ display: grid;
27
+ grid-template-columns: 380px 1fr;
28
+ gap: 18px;
29
+ padding: 18px 28px 28px;
30
+ }
31
+ section {
32
+ background: var(--panel);
33
+ border: 1px solid var(--line);
34
+ border-radius: 8px;
35
+ padding: 16px;
36
+ }
37
+ label {
38
+ display: block;
39
+ margin: 14px 0 6px;
40
+ font-weight: 700;
41
+ font-size: 13px;
42
+ }
43
+ input, select, textarea, button {
44
+ width: 100%;
45
+ font: inherit;
46
+ border: 1px solid var(--line);
47
+ border-radius: 6px;
48
+ padding: 10px 11px;
49
+ background: #fff;
50
+ }
51
+ textarea {
52
+ min-height: 86px;
53
+ resize: vertical;
54
+ font-family: Consolas, monospace;
55
+ font-size: 13px;
56
+ }
57
+ button {
58
+ cursor: pointer;
59
+ background: var(--accent);
60
+ color: white;
61
+ border-color: var(--accent);
62
+ font-weight: 700;
63
+ margin-top: 10px;
64
+ }
65
+ button:hover { background: var(--accent-dark); }
66
+ button.secondary {
67
+ background: white;
68
+ color: var(--accent);
69
+ }
70
+ button.danger {
71
+ background: white;
72
+ color: var(--danger);
73
+ border-color: var(--danger);
74
+ }
75
+ button.danger:hover { background: #fff1f0; }
76
+ button:disabled {
77
+ opacity: 0.55;
78
+ cursor: not-allowed;
79
+ }
80
+ .dropbox {
81
+ border: 2px dashed #aeb9c7;
82
+ border-radius: 8px;
83
+ background: #fbfcfe;
84
+ padding: 14px;
85
+ color: var(--muted);
86
+ }
87
+ .dropbox strong {
88
+ display: block;
89
+ color: var(--text);
90
+ margin-bottom: 8px;
91
+ }
92
+ .dropbox input {
93
+ background: white;
94
+ cursor: pointer;
95
+ }
96
+ .file-pill {
97
+ border: 1px solid var(--line);
98
+ border-radius: 6px;
99
+ padding: 10px 11px;
100
+ background: #f8fafc;
101
+ color: var(--text);
102
+ word-break: break-word;
103
+ margin-bottom: 8px;
104
+ }
105
+ .status {
106
+ margin-top: 10px;
107
+ color: var(--muted);
108
+ font-size: 13px;
109
+ min-height: 18px;
110
+ }
111
+ .status.error { color: var(--danger); }
112
+ .progress-panel {
113
+ display: none;
114
+ margin-bottom: 12px;
115
+ background: #ffffff;
116
+ border: 1px solid var(--line);
117
+ border-radius: 8px;
118
+ padding: 12px;
119
+ }
120
+ .progress-panel.active {
121
+ display: block;
122
+ }
123
+ .progress-heading {
124
+ display: flex;
125
+ justify-content: space-between;
126
+ gap: 12px;
127
+ align-items: center;
128
+ font-size: 13px;
129
+ font-weight: 700;
130
+ margin-bottom: 10px;
131
+ }
132
+ .progress-list {
133
+ display: grid;
134
+ gap: 10px;
135
+ }
136
+ .progress-row {
137
+ border: 1px solid var(--line);
138
+ border-radius: 6px;
139
+ padding: 10px;
140
+ background: #f8fafc;
141
+ }
142
+ .progress-top {
143
+ display: flex;
144
+ justify-content: space-between;
145
+ gap: 12px;
146
+ font-size: 13px;
147
+ margin-bottom: 8px;
148
+ }
149
+ .progress-track {
150
+ height: 10px;
151
+ background: #e8edf3;
152
+ border-radius: 999px;
153
+ overflow: hidden;
154
+ }
155
+ .progress-fill {
156
+ height: 100%;
157
+ width: 0%;
158
+ background: var(--accent);
159
+ }
160
+ .progress-meta {
161
+ margin-top: 8px;
162
+ color: var(--muted);
163
+ font-size: 12px;
164
+ line-height: 1.4;
165
+ }
166
+ .blueprint-result {
167
+ display: none;
168
+ margin-bottom: 12px;
169
+ border: 1px solid var(--line);
170
+ border-radius: 8px;
171
+ background: #f8fafc;
172
+ padding: 12px;
173
+ font-size: 13px;
174
+ }
175
+ .blueprint-result.active {
176
+ display: block;
177
+ }
178
+ .download-link {
179
+ display: inline-block;
180
+ color: white;
181
+ background: var(--accent);
182
+ border-radius: 6px;
183
+ padding: 9px 11px;
184
+ text-decoration: none;
185
+ font-weight: 700;
186
+ margin-top: 8px;
187
+ }
188
+ .download-link:hover {
189
+ background: var(--accent-dark);
190
+ }
191
+ .raw-logs {
192
+ border: 1px solid var(--line);
193
+ border-radius: 8px;
194
+ background: #ffffff;
195
+ padding: 10px;
196
+ }
197
+ .raw-logs summary {
198
+ cursor: pointer;
199
+ font-size: 13px;
200
+ font-weight: 700;
201
+ }
202
+ .output-block {
203
+ margin-bottom: 14px;
204
+ }
205
+ .output-block h2 {
206
+ margin-bottom: 10px;
207
+ }
208
+ .log-screen {
209
+ height: 420px;
210
+ margin-top: 10px;
211
+ overflow: auto;
212
+ background: #0f172a;
213
+ color: #dbeafe;
214
+ border-radius: 8px;
215
+ padding: 14px;
216
+ white-space: pre-wrap;
217
+ font-family: Consolas, monospace;
218
+ font-size: 13px;
219
+ line-height: 1.45;
220
+ }
221
+ hr {
222
+ border: 0;
223
+ border-top: 1px solid var(--line);
224
+ margin: 18px 0;
225
+ }
ui/templates/index.html ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1">
6
+ <title>MasterMap Cleaner</title>
7
+ <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
8
+
9
+ </head>
10
+ <body>
11
+ <header>
12
+ <h1>MasterMap Cleaner</h1>
13
+ </header>
14
+ <main>
15
+ <section>
16
+ <h2>Dataset to Clean</h2>
17
+ {% if clean_filename %}
18
+ <div class="file-pill">{{ clean_filename }}</div>
19
+ <form action="/remove-clean" method="post">
20
+ <button class="danger" type="submit">Remove File</button>
21
+ </form>
22
+ {% else %}
23
+ <form id="cleanUploadForm" action="/prepare-clean" method="post" enctype="multipart/form-data">
24
+ <div class="dropbox">
25
+ <strong>Drop or select an Excel file</strong>
26
+ <input name="file" type="file" accept=".xlsx,.xlsm" required onchange="submitUploadForm('cleanUploadForm', 'cleanUploadStatus')">
27
+ </div>
28
+ <div id="cleanUploadStatus" class="status"></div>
29
+ </form>
30
+ {% endif %}
31
+
32
+ <label for="sheetSelect">Source Sheet</label>
33
+ <select id="sheetSelect" {% if not clean_sheets %}disabled{% endif %}>
34
+ {% for sheet in clean_sheets %}
35
+ <option value="{{ sheet }}" {% if sheet == clean_selected_sheet %}selected{% endif %}>{{ sheet }}</option>
36
+ {% endfor %}
37
+ </select>
38
+
39
+ <label for="outputSheet">Output Sheet</label>
40
+ <input id="outputSheet" value="{{ output_sheet }}">
41
+
42
+ <label for="models">Groq Models</label>
43
+ <textarea id="models">{{ models or default_models }}</textarea>
44
+ <button id="fetchModels" class="secondary" type="button">Fetch Available Models</button>
45
+
46
+ <button id="runButton" type="button" {% if not clean_path %}disabled{% endif %}>Run Cleaning</button>
47
+ <div id="runStatus" class="status">{{ message }}</div>
48
+
49
+ <hr>
50
+
51
+ <h2>Apply Blueprint</h2>
52
+ <label>Workbook</label>
53
+ <div id="applyWorkbookFile">
54
+ {% if apply_workbook_filename %}
55
+ <div class="file-pill">{{ apply_workbook_filename }}</div>
56
+ {% endif %}
57
+ </div>
58
+ <form id="applyWorkbookForm" action="/prepare-apply-workbook" method="post" enctype="multipart/form-data" onsubmit="return false;">
59
+ <div class="dropbox">
60
+ <strong>Drop or select workbook</strong>
61
+ <input id="applyWorkbookInput" name="file" type="file" accept=".xlsx,.xlsm" required>
62
+ </div>
63
+ <div id="applyWorkbookUploadStatus" class="status"></div>
64
+ </form>
65
+
66
+ <label for="applySheetSelect">Sheet To Update</label>
67
+ <select id="applySheetSelect" {% if not apply_sheets %}disabled{% endif %}>
68
+ {% for sheet in apply_sheets %}
69
+ <option value="{{ sheet }}" {% if sheet == apply_selected_sheet %}selected{% endif %}>{{ sheet }}</option>
70
+ {% endfor %}
71
+ </select>
72
+
73
+ <label>Blueprint</label>
74
+ <div id="applyBlueprintFile">
75
+ {% if apply_blueprint_filename %}
76
+ <div class="file-pill">{{ apply_blueprint_filename }}</div>
77
+ {% endif %}
78
+ </div>
79
+ <form id="applyBlueprintForm" action="/prepare-apply-blueprint" method="post" enctype="multipart/form-data" onsubmit="return false;">
80
+ <div class="dropbox">
81
+ <strong>Drop or select blueprint</strong>
82
+ <input id="applyBlueprintInput" name="file" type="file" accept=".xlsx,.xlsm" required>
83
+ </div>
84
+ <div id="applyBlueprintUploadStatus" class="status"></div>
85
+ </form>
86
+
87
+ <button id="applyButton" type="button" {% if not can_apply %}disabled{% endif %}>Apply Blueprint</button>
88
+ <div id="applyStatus" class="status error">{{ error }}</div>
89
+
90
+ <hr>
91
+
92
+ <h2>References</h2>
93
+ <button id="saveReferencesButton" class="secondary" type="button" disabled>Save Manual References</button>
94
+ <div id="referencesStatus" class="status"></div>
95
+ </section>
96
+
97
+ <section>
98
+ <div class="output-block">
99
+ <h2>Cleaning Output</h2>
100
+ <div id="cleanProgressPanel" class="progress-panel">
101
+ <div class="progress-heading">
102
+ <span>Cleaning Progress</span>
103
+ <span id="cleanProgressSummary">Waiting...</span>
104
+ </div>
105
+ <div id="cleanProgressList" class="progress-list"></div>
106
+ </div>
107
+ <div id="cleanResult" class="blueprint-result"></div>
108
+ <details id="cleanRawLogDetails" class="raw-logs">
109
+ <summary>Raw cleaning logs</summary>
110
+ <div id="cleanLogs" class="log-screen"></div>
111
+ </details>
112
+ </div>
113
+
114
+ <div class="output-block">
115
+ <h2>Apply Output</h2>
116
+ <div id="applyResult" class="blueprint-result"></div>
117
+ <details id="applyRawLogDetails" class="raw-logs">
118
+ <summary>Raw apply logs</summary>
119
+ <div id="applyLogs" class="log-screen"></div>
120
+ </details>
121
+ </div>
122
+ </section>
123
+ </main>
124
+
125
+ <script>
126
+ window.MASTERMAP_CONFIG = {
127
+ cleanPath: {{ clean_path|tojson }},
128
+ applyWorkbookPath: {{ apply_workbook_path|tojson }},
129
+ applyBlueprintPath: {{ apply_blueprint_path|tojson }},
130
+ defaultOutputSheet: {{ default_output_sheet|tojson }}
131
+ };
132
+ </script>
133
+ <script src="{{ url_for('static', filename='app.js') }}"></script>
134
+
135
+ </body>
136
+ </html>
ui_app.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import uuid
3
+ from pathlib import Path
4
+
5
+ from flask import Flask, Response, jsonify, render_template, request, send_file
6
+
7
+ from newest_model import PREFERRED_PRODUCTION_CHAT_MODELS, select_groq_chat_models
8
+ from src.config import AVAILABLE_MODELS, DATA_DIR, DEFAULT_OUTPUT_SHEET_NAME
9
+ from src.process_runner import stop_process, stream_process
10
+ from src.utils import reference_sync_status, save_manual_references_to_hub
11
+ from src.workbook_io import read_workbook_sheets, resolve_allowed_path, save_uploaded_excel
12
+
13
+
14
# Filesystem layout. Uploads are stored under the data directory, which is
# also the only root handed to resolve_allowed_path by the endpoints.
APP_ROOT = Path(__file__).resolve().parent
ALLOWED_FILE_ROOTS = [APP_ROOT / DATA_DIR]
UPLOAD_DIR = APP_ROOT / DATA_DIR / "uploads"
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

app = Flask(
    __name__,
    template_folder=str(APP_ROOT / "ui" / "templates"),
    static_folder=str(APP_ROOT / "ui" / "static"),
)
# Cap request bodies (and therefore uploads) at 100 MiB.
app.config.update(MAX_CONTENT_LENGTH=100 * 1024 * 1024)

# Module-level UI state that the route handlers read and mutate directly.
STATE = dict(
    # "Dataset to Clean" panel.
    clean_path="",
    clean_filename="",
    clean_sheets=[],
    clean_selected_sheet="",
    output_sheet=DEFAULT_OUTPUT_SHEET_NAME,
    models="",
    # "Apply Blueprint" panel.
    apply_workbook_path="",
    apply_workbook_filename="",
    apply_sheets=[],
    apply_selected_sheet="",
    apply_blueprint_path="",
    apply_blueprint_filename="",
)
40
@app.after_request
def prevent_browser_cache(response):
    """Add headers to every response telling the browser not to cache it."""
    no_cache_headers = (
        ("Cache-Control", "no-store, no-cache, must-revalidate, max-age=0"),
        ("Pragma", "no-cache"),
        ("Expires", "0"),
    )
    for header_name, header_value in no_cache_headers:
        response.headers[header_name] = header_value
    return response
46
+
47
+
48
+
49
def default_models() -> str:
    """Return the default model list as a comma-separated string.

    The entries of AVAILABLE_MODELS that also appear (case-insensitively) in
    PREFERRED_PRODUCTION_CHAT_MODELS are used; when none match, the full
    preferred list is returned instead.
    """
    preferred_lookup = {name.lower() for name in PREFERRED_PRODUCTION_CHAT_MODELS}
    matching = [name for name in AVAILABLE_MODELS if name.lower() in preferred_lookup]
    if not matching:
        matching = PREFERRED_PRODUCTION_CHAT_MODELS
    return ",".join(matching)
57
+
58
+
59
def render_page(message: str = "", error: str = ""):
    """Render index.html from STATE plus the given status texts.

    Before rendering, the selected sheet of each panel is re-validated
    against that panel's sheet list (only when the list is non-empty).
    """
    for sheets_key, selected_key in (
        ("clean_sheets", "clean_selected_sheet"),
        ("apply_sheets", "apply_selected_sheet"),
    ):
        available = STATE[sheets_key]
        if available:
            STATE[selected_key] = pick_sheet(available, STATE[selected_key])

    context = {
        **STATE,
        "default_output_sheet": DEFAULT_OUTPUT_SHEET_NAME,
        "default_models": default_models(),
        "can_apply": can_apply_blueprint(),
        "message": message,
        "error": error,
    }
    return render_template("index.html", **context)
74
+
75
+
76
def can_apply_blueprint() -> bool:
    """Tell whether STATE holds everything the apply step needs.

    That is a workbook path, a blueprint path, a non-empty sheet list and a
    selected sheet.
    """
    required_keys = (
        "apply_workbook_path",
        "apply_blueprint_path",
        "apply_sheets",
        "apply_selected_sheet",
    )
    return all(STATE[key] for key in required_keys)
83
+
84
+
85
def wants_json_response() -> bool:
    """Tell whether the request's Accept header mentions application/json."""
    accept_header = request.headers.get("Accept", "")
    return "application/json" in accept_header
87
+
88
+
89
def ui_state_payload(message: str = "", error: str = ""):
    """Build the JSON-serialisable snapshot of the apply-panel state.

    The dict carries the status texts, the apply_* entries of STATE and a
    ``can_apply`` flag.
    """
    payload = {"message": message, "error": error}
    for key in (
        "apply_workbook_path",
        "apply_workbook_filename",
        "apply_sheets",
        "apply_selected_sheet",
        "apply_blueprint_path",
        "apply_blueprint_filename",
    ):
        payload[key] = STATE[key]
    payload["can_apply"] = can_apply_blueprint()
    return payload
101
+
102
+
103
def pick_sheet(sheets, preferred_sheet=None):
    """Choose which sheet of ``sheets`` should be selected.

    Preference order: ``preferred_sheet`` when given and present, then the
    output sheet remembered in STATE when present, then the first sheet.
    Returns "" when ``sheets`` is empty.
    """
    if preferred_sheet and preferred_sheet in sheets:
        return preferred_sheet
    remembered_output = STATE["output_sheet"]
    if remembered_output in sheets:
        return remembered_output
    if sheets:
        return sheets[0]
    return ""
109
+
110
+
111
def update_ui_state_from_form(form):
    """Copy the UI fields present in ``form`` into STATE.

    A missing or empty form value keeps the current STATE entry. The output
    sheet additionally falls back to DEFAULT_OUTPUT_SHEET_NAME.
    """

    def submitted_or_current(field):
        return form.get(field) or STATE[field]

    STATE["clean_selected_sheet"] = submitted_or_current("clean_selected_sheet")
    STATE["output_sheet"] = submitted_or_current("output_sheet") or DEFAULT_OUTPUT_SHEET_NAME
    STATE["models"] = submitted_or_current("models")
    STATE["apply_selected_sheet"] = submitted_or_current("apply_selected_sheet")
116
+
117
+
118
@app.route("/")
def index():
    """Serve the main page rendered from the current STATE."""
    page = render_page()
    return page
121
+
122
+
123
@app.route("/prepare-clean", methods=["POST"])
def prepare_clean():
    """Store an uploaded workbook as the dataset to clean.

    The same workbook also becomes the apply workbook. Any failure while
    reading the form, saving the upload or listing its sheets re-renders the
    page with the error text.
    """
    try:
        form = request.form
        update_ui_state_from_form(form)
        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
        sheets = read_workbook_sheets(path)
    except Exception as exc:
        return render_page(error=str(exc))

    stored_path = str(path)
    clean_choice = pick_sheet(sheets, form.get("clean_selected_sheet"))
    apply_choice = pick_sheet(sheets, form.get("apply_selected_sheet") or STATE["output_sheet"])
    STATE.update(
        clean_path=stored_path,
        clean_filename=filename,
        clean_sheets=sheets,
        clean_selected_sheet=clean_choice,
        apply_workbook_path=stored_path,
        apply_workbook_filename=filename,
        apply_sheets=sheets,
        apply_selected_sheet=apply_choice,
    )
    return render_page(message=f"Loaded {filename}.")
141
+
142
+
143
@app.route("/remove-clean", methods=["POST"])
def remove_clean():
    """Forget the dataset to clean.

    The apply workbook is forgotten too when it is the same path. Only STATE
    is reset here.
    """
    update_ui_state_from_form(request.form)
    removed_path = STATE["clean_path"]
    STATE.update(
        clean_path="",
        clean_filename="",
        clean_sheets=[],
        clean_selected_sheet="",
    )
    if STATE["apply_workbook_path"] == removed_path:
        STATE.update(
            apply_workbook_path="",
            apply_workbook_filename="",
            apply_sheets=[],
            apply_selected_sheet="",
        )
    return render_page(message="File removed.")
157
+
158
+
159
@app.route("/prepare-apply-workbook", methods=["POST"])
def prepare_apply_workbook():
    """Store an uploaded workbook as the target of the apply step.

    Clients whose Accept header mentions JSON get the state payload (HTTP 400
    on failure); everyone else gets the re-rendered page.
    """
    try:
        form = request.form
        update_ui_state_from_form(form)
        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
        sheets = read_workbook_sheets(path)
    except Exception as exc:
        failure = str(exc)
        if not wants_json_response():
            return render_page(error=failure)
        return jsonify(ui_state_payload(error=failure)), 400

    STATE.update(
        apply_workbook_path=str(path),
        apply_workbook_filename=filename,
        apply_sheets=sheets,
        apply_selected_sheet=pick_sheet(sheets, form.get("apply_selected_sheet")),
    )
    notice = f"Loaded apply workbook {filename}."
    if not wants_json_response():
        return render_page(message=notice)
    return jsonify(ui_state_payload(message=notice))
177
+
178
+
179
@app.route("/prepare-apply-blueprint", methods=["POST"])
def prepare_apply_blueprint():
    """Store an uploaded blueprint for the apply step.

    When an apply workbook exists on disk, its sheet list and selection are
    refreshed before the upload is saved. Clients whose Accept header
    mentions JSON get the state payload (HTTP 400 on failure); everyone else
    gets the re-rendered page.
    """
    try:
        form = request.form
        update_ui_state_from_form(form)
        current_workbook = STATE["apply_workbook_path"]
        if current_workbook and Path(current_workbook).is_file():
            STATE["apply_sheets"] = read_workbook_sheets(Path(current_workbook))
            STATE["apply_selected_sheet"] = pick_sheet(
                STATE["apply_sheets"], form.get("apply_selected_sheet")
            )
        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
    except Exception as exc:
        failure = str(exc)
        if not wants_json_response():
            return render_page(error=failure)
        return jsonify(ui_state_payload(error=failure)), 400

    STATE.update(apply_blueprint_path=str(path), apply_blueprint_filename=filename)
    notice = f"Loaded blueprint {filename}."
    if not wants_json_response():
        return render_page(message=notice)
    return jsonify(ui_state_payload(message=notice))
197
+
198
+
199
@app.route("/models")
def models_endpoint():
    """Return the selected Groq chat models as ``{"models": [...]}``.

    At most as many models as there are preferred production models are
    requested. Any failure is reported as ``{"error": ...}`` with HTTP 500.
    """
    wanted_count = len(PREFERRED_PRODUCTION_CHAT_MODELS)
    try:
        selected = select_groq_chat_models(limit=wanted_count)
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
    return jsonify({"models": selected})
206
+
207
+
208
@app.route("/references/status")
def references_status():
    """Return the result of reference_sync_status() as JSON."""
    status = reference_sync_status()
    return jsonify(status)
211
+
212
+
213
@app.route("/references/save", methods=["POST"])
def save_references():
    """Save the manual references via save_manual_references_to_hub.

    On success the helper's result dict is merged into the JSON response
    after the default message. Any failure is reported as ``{"error": ...}``
    with HTTP 400.
    """
    try:
        result = save_manual_references_to_hub(APP_ROOT)
    except Exception as exc:
        return jsonify({"error": str(exc)}), 400

    body = {"message": "Manual references saved to Hugging Face."}
    body.update(result)
    return jsonify(body)
220
+
221
+
222
@app.route("/sheets")
def sheets_endpoint():
    """Return the sheet names of the workbook named by the ``path`` query arg.

    Responses:
        200 ``{"sheets": [...]}`` on success.
        400 ``{"error": ...}`` when the path is rejected by
            resolve_allowed_path (a client error, reported the same way as
            in the /run and /apply endpoints).
        404 ``{"error": ...}`` when the resolved path is not a file.
        500 ``{"error": ...}`` for any other failure.
    """
    try:
        try:
            workbook_path = resolve_allowed_path(
                request.args.get("path", ""), APP_ROOT, ALLOWED_FILE_ROOTS
            )
        except ValueError as exc:
            # Only path validation maps to 400; a ValueError raised later,
            # while reading the workbook, stays a server-side failure.
            return jsonify({"error": str(exc)}), 400
        if not workbook_path.is_file():
            return jsonify({"error": "Workbook is not available."}), 404
        return jsonify({"sheets": read_workbook_sheets(workbook_path)})
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
231
+
232
+
233
@app.route("/download-blueprint")
def download_blueprint():
    """Send data/Blueprint.xlsx as an attachment, or a JSON 404 if absent."""
    blueprint_file = APP_ROOT / DATA_DIR / "Blueprint.xlsx"
    if blueprint_file.exists():
        return send_file(blueprint_file, as_attachment=True, download_name="Blueprint.xlsx")
    return jsonify({"error": "Blueprint has not been generated yet."}), 404
239
+
240
+
241
@app.route("/download-cleaned-workbook")
def download_cleaned_workbook():
    """Send the workbook recorded in STATE["clean_path"] as an attachment.

    The download name is the original upload name (or the file's own name)
    prefixed with ``cleaned_``. A JSON 404 is returned when no workbook is
    recorded or the file is missing.
    """
    unavailable = {"error": "Cleaned workbook is not available."}
    recorded_path = STATE["clean_path"]
    if not recorded_path:
        return jsonify(unavailable), 404

    workbook_path = resolve_allowed_path(recorded_path, APP_ROOT, ALLOWED_FILE_ROOTS)
    if not workbook_path.is_file():
        return jsonify(unavailable), 404

    shown_name = STATE["clean_filename"] or workbook_path.name
    return send_file(workbook_path, as_attachment=True, download_name=f"cleaned_{shown_name}")
253
+
254
+
255
@app.route("/download-applied-workbook")
def download_applied_workbook():
    """Send the workbook recorded in STATE["apply_workbook_path"] as an attachment.

    The download name is the original upload name (or the file's own name)
    prefixed with ``cleaned_``. A JSON 404 is returned when no workbook is
    recorded or the file is missing.
    """
    unavailable = {"error": "Applied workbook is not available."}
    recorded_path = STATE["apply_workbook_path"]
    if not recorded_path:
        return jsonify(unavailable), 404

    workbook_path = resolve_allowed_path(recorded_path, APP_ROOT, ALLOWED_FILE_ROOTS)
    if not workbook_path.is_file():
        return jsonify(unavailable), 404

    shown_name = STATE["apply_workbook_filename"] or workbook_path.name
    return send_file(workbook_path, as_attachment=True, download_name=f"cleaned_{shown_name}")
267
+
268
+
269
@app.route("/run")
def run():
    """Launch main.py on the requested workbook and stream its output.

    Query args: ``job_id`` (a random hex id is generated when absent),
    ``input``, ``sheet``, ``output_sheet`` and an optional ``models`` list.
    Returns a text/event-stream response, or a JSON 400 when a required
    argument is missing or the input path is rejected.
    """
    args = request.args
    job_id = args.get("job_id", uuid.uuid4().hex)
    input_path = args.get("input", "")
    sheet = args.get("sheet", "")
    output_sheet = args.get("output_sheet", DEFAULT_OUTPUT_SHEET_NAME)
    model_list = args.get("models", "")

    if not (input_path and sheet):
        return jsonify({"error": "Input file and source sheet are required."}), 400

    try:
        workbook_path = resolve_allowed_path(input_path, APP_ROOT, ALLOWED_FILE_ROOTS)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # "-u" keeps the child's output unbuffered.
    command = [sys.executable, "-u", str(APP_ROOT / "main.py")]
    command += ["--input", str(workbook_path)]
    command += ["--sheet", sheet]
    command += ["--output_sheet", output_sheet]
    if model_list:
        command += ["--models", model_list]

    event_stream = stream_process(command, cwd=APP_ROOT, job_id=job_id)
    return Response(event_stream, mimetype="text/event-stream")
300
+
301
+
302
@app.route("/stop", methods=["POST"])
def stop():
    """Ask stop_process to stop the run named by the ``job_id`` query arg.

    Returns ``{"stopped": true}`` when stop_process reports success and a
    JSON 404 otherwise.
    """
    requested_job = request.args.get("job_id", "")
    if stop_process(requested_job):
        return jsonify({"stopped": True})
    return jsonify({"stopped": False, "message": "No active run found."}), 404
309
+
310
+
311
@app.route("/apply")
def apply_blueprint():
    """Launch apply_blueprint.py on the requested workbook and stream its output.

    Query args: ``input``, ``blueprint`` and ``sheet`` (defaulting to
    DEFAULT_OUTPUT_SHEET_NAME). Returns a text/event-stream response, or a
    JSON 400 when an argument is missing or a path is rejected.
    """
    args = request.args
    input_path = args.get("input", "")
    blueprint_path = args.get("blueprint", "")
    sheet = args.get("sheet", DEFAULT_OUTPUT_SHEET_NAME)

    if not (input_path and blueprint_path and sheet):
        return jsonify({"error": "Workbook, blueprint, and target sheet are required."}), 400

    try:
        workbook_path = resolve_allowed_path(input_path, APP_ROOT, ALLOWED_FILE_ROOTS)
        resolved_blueprint_path = resolve_allowed_path(blueprint_path, APP_ROOT, ALLOWED_FILE_ROOTS)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # "-u" keeps the child's output unbuffered.
    command = [sys.executable, "-u", str(APP_ROOT / "apply_blueprint.py")]
    command += ["--input", str(workbook_path)]
    command += ["--blueprint", str(resolved_blueprint_path)]
    command += ["--sheet", sheet]

    event_stream = stream_process(command, cwd=APP_ROOT)
    return Response(event_stream, mimetype="text/event-stream")
339
+
340
+
341
if __name__ == "__main__":
    # threaded=True lets the development server handle other requests while
    # a streaming response is still open.
    app.run(threaded=True, debug=False)