Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -73,11 +73,9 @@ if not check_password():
|
|
| 73 |
st.stop()
|
| 74 |
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
# Read Excel with HTS columns as string
|
| 80 |
-
df = pd.read_excel(file_content, dtype={
|
| 81 |
"Tariff": str,
|
| 82 |
"Primary 1": str,
|
| 83 |
"Primary 2": str,
|
|
@@ -98,6 +96,35 @@ def load_and_validate_excel(file_content, file_name, keywords_hash):
|
|
| 98 |
return df
|
| 99 |
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
@st.cache_data
|
| 102 |
def run_validation(df_hash, _df, _validator):
|
| 103 |
"""Run validation - cached based on dataframe hash"""
|
|
@@ -238,22 +265,38 @@ tab1, tab2, tab2b, tab3, tab4, tab5 = st.tabs([
|
|
| 238 |
|
| 239 |
# Tab 1: Upload & Filter
|
| 240 |
with tab1:
|
| 241 |
-
st.header("Upload Excel
|
| 242 |
|
| 243 |
-
|
| 244 |
-
"Upload
|
| 245 |
type=["xlsx", "xls"],
|
| 246 |
-
|
|
|
|
| 247 |
)
|
| 248 |
|
| 249 |
-
if
|
| 250 |
try:
|
| 251 |
-
# Use cached loading function
|
| 252 |
keywords_hash = get_keywords_hash(st.session_state.keywords)
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
st.session_state.original_df = df
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
# Display column mapping info
|
| 259 |
with st.expander("Column Mapping"):
|
|
@@ -321,13 +364,14 @@ with tab1:
|
|
| 321 |
st.info("Showing first 100 rows.")
|
| 322 |
|
| 323 |
# Run validation ONCE on full dataset (cached), then filter results
|
| 324 |
-
|
|
|
|
| 325 |
with st.spinner("Validating all entries (one-time)..."):
|
| 326 |
validator = get_validator()
|
| 327 |
full_results = validate_dataframe(df, validator)
|
| 328 |
full_results_df = results_to_dataframe(full_results)
|
| 329 |
st.session_state.cached_full_results = full_results_df
|
| 330 |
-
st.session_state.
|
| 331 |
|
| 332 |
# Filter cached results based on current filters
|
| 333 |
full_results_df = st.session_state.cached_full_results
|
|
@@ -770,8 +814,8 @@ with tab3:
|
|
| 770 |
# Clear cached results to force re-validation
|
| 771 |
if "cached_full_results" in st.session_state:
|
| 772 |
del st.session_state.cached_full_results
|
| 773 |
-
if "
|
| 774 |
-
del st.session_state.
|
| 775 |
st.success("Keywords saved! Re-upload file or refresh to apply changes.")
|
| 776 |
|
| 777 |
with col2:
|
|
@@ -786,8 +830,8 @@ with tab3:
|
|
| 786 |
# Clear cached results
|
| 787 |
if "cached_full_results" in st.session_state:
|
| 788 |
del st.session_state.cached_full_results
|
| 789 |
-
if "
|
| 790 |
-
del st.session_state.
|
| 791 |
st.success("Keywords reset to defaults!")
|
| 792 |
st.rerun()
|
| 793 |
|
|
|
|
| 73 |
st.stop()
|
| 74 |
|
| 75 |
|
| 76 |
+
def load_single_excel(file_content):
|
| 77 |
+
"""Load a single Excel file with proper HTS column types"""
|
| 78 |
+
df = pd.read_excel(BytesIO(file_content), dtype={
|
|
|
|
|
|
|
| 79 |
"Tariff": str,
|
| 80 |
"Primary 1": str,
|
| 81 |
"Primary 2": str,
|
|
|
|
| 96 |
return df
|
| 97 |
|
| 98 |
|
| 99 |
+
@st.cache_data
def load_and_validate_excel(file_contents_list, file_names_list, keywords_hash):
    """Load multiple Excel files and combine them - cached to avoid re-running on filter changes.

    Args:
        file_contents_list: raw bytes of each uploaded Excel file, in upload order.
        file_names_list: names of the uploaded files. Not used in the body; it is
            part of the signature so the st.cache_data key changes when a
            different set of files is uploaded.
        keywords_hash: hash of the current keyword configuration. Also only a
            cache-key input, so edited keywords invalidate the cached result.

    Returns:
        tuple: (combined_df, original_count, dedup_count) where original_count
        is the row count before de-duplication and dedup_count is the number of
        duplicate rows removed.

    Raises:
        ValueError: if file_contents_list is empty (pd.concat has no objects to
            concatenate). Callers guard with `if uploaded_files:` before calling.
    """
    all_dfs = [load_single_excel(file_content) for file_content in file_contents_list]

    # Concatenate all dataframes into one frame with a fresh 0..n-1 index.
    combined_df = pd.concat(all_dfs, ignore_index=True)
    original_count = len(combined_df)

    # Remove duplicates based on Tariff + Description (after combining all
    # files), keeping the first occurrence. Only use the dedup columns that
    # actually exist so a file missing one of them doesn't raise a KeyError.
    dedup_cols = [c for c in ("Tariff", "Description") if c in combined_df.columns]
    if dedup_cols:
        # reset_index(drop=True): drop_duplicates alone leaves gaps in the
        # index, which silently breaks positional row access downstream.
        combined_df = (
            combined_df
            .drop_duplicates(subset=dedup_cols, keep='first')
            .reset_index(drop=True)
        )

    dedup_count = original_count - len(combined_df)
    return combined_df, original_count, dedup_count
|
| 126 |
+
|
| 127 |
+
|
| 128 |
@st.cache_data
|
| 129 |
def run_validation(df_hash, _df, _validator):
|
| 130 |
"""Run validation - cached based on dataframe hash"""
|
|
|
|
| 265 |
|
| 266 |
# Tab 1: Upload & Filter
|
| 267 |
with tab1:
|
| 268 |
+
st.header("Upload Excel Files")
|
| 269 |
|
| 270 |
+
uploaded_files = st.file_uploader(
|
| 271 |
+
"Upload entry report Excel files (multiple allowed)",
|
| 272 |
type=["xlsx", "xls"],
|
| 273 |
+
accept_multiple_files=True,
|
| 274 |
+
help="Upload one or more customizable entry reports from NetCHB. Duplicates across files will be removed."
|
| 275 |
)
|
| 276 |
|
| 277 |
+
if uploaded_files:
|
| 278 |
try:
|
| 279 |
+
# Use cached loading function with multiple files
|
| 280 |
keywords_hash = get_keywords_hash(st.session_state.keywords)
|
| 281 |
+
file_contents = [f.read() for f in uploaded_files]
|
| 282 |
+
file_names = [f.name for f in uploaded_files]
|
| 283 |
+
|
| 284 |
+
# Reset file positions for potential re-read
|
| 285 |
+
for f in uploaded_files:
|
| 286 |
+
f.seek(0)
|
| 287 |
+
|
| 288 |
+
df, original_count, dedup_count = load_and_validate_excel(file_contents, file_names, keywords_hash)
|
| 289 |
|
| 290 |
st.session_state.original_df = df
|
| 291 |
+
|
| 292 |
+
# Show load summary
|
| 293 |
+
if len(uploaded_files) > 1:
|
| 294 |
+
msg = f"Loaded {len(df)} unique rows from {len(uploaded_files)} files"
|
| 295 |
+
if dedup_count > 0:
|
| 296 |
+
msg += f" ({dedup_count} duplicates removed)"
|
| 297 |
+
st.success(msg)
|
| 298 |
+
else:
|
| 299 |
+
st.success(f"Loaded {len(df)} rows")
|
| 300 |
|
| 301 |
# Display column mapping info
|
| 302 |
with st.expander("Column Mapping"):
|
|
|
|
| 364 |
st.info("Showing first 100 rows.")
|
| 365 |
|
| 366 |
# Run validation ONCE on full dataset (cached), then filter results
|
| 367 |
+
# Cache key for the validation results: re-validate only when the set of
# uploaded files changes. Sorted so upload order doesn't bust the cache.
file_names_key = ",".join(sorted(file_names))
# BUG FIX: this previously read/wrote st.session_state.cached_file_namess
# (double "s"), while the keyword-save/reset handlers delete
# "cached_file_names" — so their invalidation never touched the key that
# was actually stored. Use the same key the deleters use.
if "cached_full_results" not in st.session_state or st.session_state.get("cached_file_names") != file_names_key:
    with st.spinner("Validating all entries (one-time)..."):
        validator = get_validator()
        full_results = validate_dataframe(df, validator)
        full_results_df = results_to_dataframe(full_results)
        st.session_state.cached_full_results = full_results_df
        st.session_state.cached_file_names = file_names_key
|
| 375 |
|
| 376 |
# Filter cached results based on current filters
|
| 377 |
full_results_df = st.session_state.cached_full_results
|
|
|
|
| 814 |
# Clear cached results to force re-validation
|
| 815 |
if "cached_full_results" in st.session_state:
|
| 816 |
del st.session_state.cached_full_results
|
| 817 |
+
if "cached_file_names" in st.session_state:
|
| 818 |
+
del st.session_state.cached_file_names
|
| 819 |
st.success("Keywords saved! Re-upload file or refresh to apply changes.")
|
| 820 |
|
| 821 |
with col2:
|
|
|
|
| 830 |
# Clear cached results
|
| 831 |
if "cached_full_results" in st.session_state:
|
| 832 |
del st.session_state.cached_full_results
|
| 833 |
+
if "cached_file_names" in st.session_state:
|
| 834 |
+
del st.session_state.cached_file_names
|
| 835 |
st.success("Keywords reset to defaults!")
|
| 836 |
st.rerun()
|
| 837 |
|