Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -73,11 +73,9 @@ if not check_password():
|
|
| 73 |
st.stop()
|
| 74 |
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
# Read Excel with HTS columns as string
|
| 80 |
-
df = pd.read_excel(file_content, dtype={
|
| 81 |
"Tariff": str,
|
| 82 |
"Primary 1": str,
|
| 83 |
"Primary 2": str,
|
|
@@ -98,6 +96,35 @@ def load_and_validate_excel(file_content, file_name, keywords_hash):
|
|
| 98 |
return df
|
| 99 |
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
@st.cache_data
|
| 102 |
def run_validation(df_hash, _df, _validator):
|
| 103 |
"""Run validation - cached based on dataframe hash"""
|
|
@@ -238,22 +265,38 @@ tab1, tab2, tab2b, tab3, tab4, tab5 = st.tabs([
|
|
| 238 |
|
| 239 |
# Tab 1: Upload & Filter
|
| 240 |
with tab1:
|
| 241 |
-
st.header("Upload Excel
|
| 242 |
|
| 243 |
-
|
| 244 |
-
"Upload
|
| 245 |
type=["xlsx", "xls"],
|
| 246 |
-
|
|
|
|
| 247 |
)
|
| 248 |
|
| 249 |
-
if
|
| 250 |
try:
|
| 251 |
-
# Use cached loading function
|
| 252 |
keywords_hash = get_keywords_hash(st.session_state.keywords)
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
st.session_state.original_df = df
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
# Display column mapping info
|
| 259 |
with st.expander("Column Mapping"):
|
|
@@ -321,13 +364,14 @@ with tab1:
|
|
| 321 |
st.info("Showing first 100 rows.")
|
| 322 |
|
| 323 |
# Run validation ONCE on full dataset (cached), then filter results
|
| 324 |
-
|
|
|
|
| 325 |
with st.spinner("Validating all entries (one-time)..."):
|
| 326 |
validator = get_validator()
|
| 327 |
full_results = validate_dataframe(df, validator)
|
| 328 |
full_results_df = results_to_dataframe(full_results)
|
| 329 |
st.session_state.cached_full_results = full_results_df
|
| 330 |
-
st.session_state.
|
| 331 |
|
| 332 |
# Filter cached results based on current filters
|
| 333 |
full_results_df = st.session_state.cached_full_results
|
|
@@ -770,8 +814,8 @@ with tab3:
|
|
| 770 |
# Clear cached results to force re-validation
|
| 771 |
if "cached_full_results" in st.session_state:
|
| 772 |
del st.session_state.cached_full_results
|
| 773 |
-
if "
|
| 774 |
-
del st.session_state.
|
| 775 |
st.success("Keywords saved! Re-upload file or refresh to apply changes.")
|
| 776 |
|
| 777 |
with col2:
|
|
@@ -786,8 +830,8 @@ with tab3:
|
|
| 786 |
# Clear cached results
|
| 787 |
if "cached_full_results" in st.session_state:
|
| 788 |
del st.session_state.cached_full_results
|
| 789 |
-
if "
|
| 790 |
-
del st.session_state.
|
| 791 |
st.success("Keywords reset to defaults!")
|
| 792 |
st.rerun()
|
| 793 |
|
|
|
|
| 73 |
st.stop()
|
| 74 |
|
| 75 |
|
| 76 |
+
def load_single_excel(file_content):
|
| 77 |
+
"""Load a single Excel file with proper HTS column types"""
|
| 78 |
+
df = pd.read_excel(BytesIO(file_content), dtype={
|
|
|
|
|
|
|
| 79 |
"Tariff": str,
|
| 80 |
"Primary 1": str,
|
| 81 |
"Primary 2": str,
|
|
|
|
| 96 |
return df
|
| 97 |
|
| 98 |
|
| 99 |
+
@st.cache_data
def load_and_validate_excel(file_contents_list, file_names_list, keywords_hash):
    """Load multiple Excel files and combine them - cached to avoid re-running on filter changes.

    Args:
        file_contents_list: raw bytes of each uploaded Excel file, in upload order.
        file_names_list: names of the uploaded files. Not used in the body; it is
            part of the signature so the st.cache_data key changes when a
            different set of files is uploaded.
        keywords_hash: hash of the current keyword configuration. Also only a
            cache-key input, so edited keywords invalidate the cached result.

    Returns:
        tuple: (combined_df, original_count, dedup_count) where original_count
        is the row count before de-duplication and dedup_count is the number of
        duplicate rows removed.

    Raises:
        ValueError: if file_contents_list is empty (pd.concat has no objects to
            concatenate). Callers guard with `if uploaded_files:` before calling.
    """
    all_dfs = [load_single_excel(file_content) for file_content in file_contents_list]

    # Concatenate all dataframes into one frame with a fresh 0..n-1 index.
    combined_df = pd.concat(all_dfs, ignore_index=True)
    original_count = len(combined_df)

    # Remove duplicates based on Tariff + Description (after combining all
    # files), keeping the first occurrence. Only use the dedup columns that
    # actually exist so a file missing one of them doesn't raise a KeyError.
    dedup_cols = [c for c in ("Tariff", "Description") if c in combined_df.columns]
    if dedup_cols:
        # reset_index(drop=True): drop_duplicates alone leaves gaps in the
        # index, which silently breaks positional row access downstream.
        combined_df = (
            combined_df
            .drop_duplicates(subset=dedup_cols, keep='first')
            .reset_index(drop=True)
        )

    dedup_count = original_count - len(combined_df)
    return combined_df, original_count, dedup_count
|
| 126 |
+
|
| 127 |
+
|
| 128 |
@st.cache_data
|
| 129 |
def run_validation(df_hash, _df, _validator):
|
| 130 |
"""Run validation - cached based on dataframe hash"""
|
|
|
|
| 265 |
|
| 266 |
# Tab 1: Upload & Filter
|
| 267 |
with tab1:
|
| 268 |
+
st.header("Upload Excel Files")
|
| 269 |
|
| 270 |
+
uploaded_files = st.file_uploader(
|
| 271 |
+
"Upload entry report Excel files (multiple allowed)",
|
| 272 |
type=["xlsx", "xls"],
|
| 273 |
+
accept_multiple_files=True,
|
| 274 |
+
help="Upload one or more customizable entry reports from NetCHB. Duplicates across files will be removed."
|
| 275 |
)
|
| 276 |
|
| 277 |
+
if uploaded_files:
|
| 278 |
try:
|
| 279 |
+
# Use cached loading function with multiple files
|
| 280 |
keywords_hash = get_keywords_hash(st.session_state.keywords)
|
| 281 |
+
file_contents = [f.read() for f in uploaded_files]
|
| 282 |
+
file_names = [f.name for f in uploaded_files]
|
| 283 |
+
|
| 284 |
+
# Reset file positions for potential re-read
|
| 285 |
+
for f in uploaded_files:
|
| 286 |
+
f.seek(0)
|
| 287 |
+
|
| 288 |
+
df, original_count, dedup_count = load_and_validate_excel(file_contents, file_names, keywords_hash)
|
| 289 |
|
| 290 |
st.session_state.original_df = df
|
| 291 |
+
|
| 292 |
+
# Show load summary
|
| 293 |
+
if len(uploaded_files) > 1:
|
| 294 |
+
msg = f"Loaded {len(df)} unique rows from {len(uploaded_files)} files"
|
| 295 |
+
if dedup_count > 0:
|
| 296 |
+
msg += f" ({dedup_count} duplicates removed)"
|
| 297 |
+
st.success(msg)
|
| 298 |
+
else:
|
| 299 |
+
st.success(f"Loaded {len(df)} rows")
|
| 300 |
|
| 301 |
# Display column mapping info
|
| 302 |
with st.expander("Column Mapping"):
|
|
|
|
| 364 |
st.info("Showing first 100 rows.")
|
| 365 |
|
| 366 |
# Run validation ONCE on full dataset (cached), then filter results
|
| 367 |
+
# Cache key for the validation results: re-validate only when the set of
# uploaded files changes. Sorted so upload order doesn't bust the cache.
file_names_key = ",".join(sorted(file_names))
# BUG FIX: this previously read/wrote st.session_state.cached_file_namess
# (double "s"), while the keyword-save/reset handlers delete
# "cached_file_names" — so their invalidation never touched the key that
# was actually stored. Use the same key the deleters use.
if "cached_full_results" not in st.session_state or st.session_state.get("cached_file_names") != file_names_key:
    with st.spinner("Validating all entries (one-time)..."):
        validator = get_validator()
        full_results = validate_dataframe(df, validator)
        full_results_df = results_to_dataframe(full_results)
        st.session_state.cached_full_results = full_results_df
        st.session_state.cached_file_names = file_names_key
|
| 375 |
|
| 376 |
# Filter cached results based on current filters
|
| 377 |
full_results_df = st.session_state.cached_full_results
|
|
|
|
| 814 |
# Clear cached results to force re-validation
|
| 815 |
if "cached_full_results" in st.session_state:
|
| 816 |
del st.session_state.cached_full_results
|
| 817 |
+
if "cached_file_names" in st.session_state:
|
| 818 |
+
del st.session_state.cached_file_names
|
| 819 |
st.success("Keywords saved! Re-upload file or refresh to apply changes.")
|
| 820 |
|
| 821 |
with col2:
|
|
|
|
| 830 |
# Clear cached results
|
| 831 |
if "cached_full_results" in st.session_state:
|
| 832 |
del st.session_state.cached_full_results
|
| 833 |
+
if "cached_file_names" in st.session_state:
|
| 834 |
+
del st.session_state.cached_file_names
|
| 835 |
st.success("Keywords reset to defaults!")
|
| 836 |
st.rerun()
|
| 837 |
|