joycecast committed on
Commit
8ad3217
·
verified ·
1 Parent(s): e934b8c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -19
app.py CHANGED
@@ -73,11 +73,9 @@ if not check_password():
73
  st.stop()
74
 
75
 
76
- @st.cache_data
77
- def load_and_validate_excel(file_content, file_name, keywords_hash):
78
- """Load Excel and run validation - cached to avoid re-running on filter changes"""
79
- # Read Excel with HTS columns as string
80
- df = pd.read_excel(file_content, dtype={
81
  "Tariff": str,
82
  "Primary 1": str,
83
  "Primary 2": str,
@@ -98,6 +96,35 @@ def load_and_validate_excel(file_content, file_name, keywords_hash):
98
  return df
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  @st.cache_data
102
  def run_validation(df_hash, _df, _validator):
103
  """Run validation - cached based on dataframe hash"""
@@ -238,22 +265,38 @@ tab1, tab2, tab2b, tab3, tab4, tab5 = st.tabs([
238
 
239
  # Tab 1: Upload & Filter
240
  with tab1:
241
- st.header("Upload Excel File")
242
 
243
- uploaded_file = st.file_uploader(
244
- "Upload your entry report Excel file",
245
  type=["xlsx", "xls"],
246
- help="Upload the customizable entry report from NetCHB"
 
247
  )
248
 
249
- if uploaded_file is not None:
250
  try:
251
- # Use cached loading function
252
  keywords_hash = get_keywords_hash(st.session_state.keywords)
253
- df = load_and_validate_excel(uploaded_file, uploaded_file.name, keywords_hash)
 
 
 
 
 
 
 
254
 
255
  st.session_state.original_df = df
256
- st.success(f"Loaded {len(df)} rows")
 
 
 
 
 
 
 
 
257
 
258
  # Display column mapping info
259
  with st.expander("Column Mapping"):
@@ -321,13 +364,14 @@ with tab1:
321
  st.info("Showing first 100 rows.")
322
 
323
  # Run validation ONCE on full dataset (cached), then filter results
324
- if "cached_full_results" not in st.session_state or st.session_state.get("cached_file_name") != uploaded_file.name:
 
325
  with st.spinner("Validating all entries (one-time)..."):
326
  validator = get_validator()
327
  full_results = validate_dataframe(df, validator)
328
  full_results_df = results_to_dataframe(full_results)
329
  st.session_state.cached_full_results = full_results_df
330
- st.session_state.cached_file_name = uploaded_file.name
331
 
332
  # Filter cached results based on current filters
333
  full_results_df = st.session_state.cached_full_results
@@ -770,8 +814,8 @@ with tab3:
770
  # Clear cached results to force re-validation
771
  if "cached_full_results" in st.session_state:
772
  del st.session_state.cached_full_results
773
- if "cached_file_name" in st.session_state:
774
- del st.session_state.cached_file_name
775
  st.success("Keywords saved! Re-upload file or refresh to apply changes.")
776
 
777
  with col2:
@@ -786,8 +830,8 @@ with tab3:
786
  # Clear cached results
787
  if "cached_full_results" in st.session_state:
788
  del st.session_state.cached_full_results
789
- if "cached_file_name" in st.session_state:
790
- del st.session_state.cached_file_name
791
  st.success("Keywords reset to defaults!")
792
  st.rerun()
793
 
 
73
  st.stop()
74
 
75
 
76
+ def load_single_excel(file_content):
77
+ """Load a single Excel file with proper HTS column types"""
78
+ df = pd.read_excel(BytesIO(file_content), dtype={
 
 
79
  "Tariff": str,
80
  "Primary 1": str,
81
  "Primary 2": str,
 
96
  return df
97
 
98
 
99
+ @st.cache_data
100
+ def load_and_validate_excel(file_contents_list, file_names_list, keywords_hash):
101
+ """Load multiple Excel files and combine - cached to avoid re-running on filter changes
102
+
103
+ Returns:
104
+ tuple: (combined_df, original_count, dedup_count)
105
+ """
106
+ all_dfs = []
107
+
108
+ for file_content in file_contents_list:
109
+ df = load_single_excel(file_content)
110
+ all_dfs.append(df)
111
+
112
+ # Concatenate all dataframes
113
+ combined_df = pd.concat(all_dfs, ignore_index=True)
114
+ original_count = len(combined_df)
115
+
116
+ # Remove duplicates based on Tariff + Description (after combining all files)
117
+ # Keep first occurrence
118
+ dedup_cols = ["Tariff", "Description"]
119
+ dedup_cols = [c for c in dedup_cols if c in combined_df.columns]
120
+
121
+ if dedup_cols:
122
+ combined_df = combined_df.drop_duplicates(subset=dedup_cols, keep='first')
123
+
124
+ dedup_count = original_count - len(combined_df)
125
+ return combined_df, original_count, dedup_count
126
+
127
+
128
  @st.cache_data
129
  def run_validation(df_hash, _df, _validator):
130
  """Run validation - cached based on dataframe hash"""
 
265
 
266
  # Tab 1: Upload & Filter
267
  with tab1:
268
+ st.header("Upload Excel Files")
269
 
270
+ uploaded_files = st.file_uploader(
271
+ "Upload entry report Excel files (multiple allowed)",
272
  type=["xlsx", "xls"],
273
+ accept_multiple_files=True,
274
+ help="Upload one or more customizable entry reports from NetCHB. Duplicates across files will be removed."
275
  )
276
 
277
+ if uploaded_files:
278
  try:
279
+ # Use cached loading function with multiple files
280
  keywords_hash = get_keywords_hash(st.session_state.keywords)
281
+ file_contents = [f.read() for f in uploaded_files]
282
+ file_names = [f.name for f in uploaded_files]
283
+
284
+ # Reset file positions for potential re-read
285
+ for f in uploaded_files:
286
+ f.seek(0)
287
+
288
+ df, original_count, dedup_count = load_and_validate_excel(file_contents, file_names, keywords_hash)
289
 
290
  st.session_state.original_df = df
291
+
292
+ # Show load summary
293
+ if len(uploaded_files) > 1:
294
+ msg = f"Loaded {len(df)} unique rows from {len(uploaded_files)} files"
295
+ if dedup_count > 0:
296
+ msg += f" ({dedup_count} duplicates removed)"
297
+ st.success(msg)
298
+ else:
299
+ st.success(f"Loaded {len(df)} rows")
300
 
301
  # Display column mapping info
302
  with st.expander("Column Mapping"):
 
364
  st.info("Showing first 100 rows.")
365
 
366
  # Run validation ONCE on full dataset (cached), then filter results
367
+ file_names_key = ",".join(sorted(file_names))
368
+ if "cached_full_results" not in st.session_state or st.session_state.get("cached_file_namess") != file_names_key:
369
  with st.spinner("Validating all entries (one-time)..."):
370
  validator = get_validator()
371
  full_results = validate_dataframe(df, validator)
372
  full_results_df = results_to_dataframe(full_results)
373
  st.session_state.cached_full_results = full_results_df
374
+ st.session_state.cached_file_namess = file_names_key
375
 
376
  # Filter cached results based on current filters
377
  full_results_df = st.session_state.cached_full_results
 
814
  # Clear cached results to force re-validation
815
  if "cached_full_results" in st.session_state:
816
  del st.session_state.cached_full_results
817
+ if "cached_file_names" in st.session_state:
818
+ del st.session_state.cached_file_names
819
  st.success("Keywords saved! Re-upload file or refresh to apply changes.")
820
 
821
  with col2:
 
830
  # Clear cached results
831
  if "cached_full_results" in st.session_state:
832
  del st.session_state.cached_full_results
833
+ if "cached_file_names" in st.session_state:
834
+ del st.session_state.cached_file_names
835
  st.success("Keywords reset to defaults!")
836
  st.rerun()
837