joycecast committed on
Commit
56c462f
·
verified ·
1 Parent(s): d98c784

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -24
app.py CHANGED
@@ -98,11 +98,7 @@ def load_single_excel(file_content):
98
 
99
  @st.cache_data
100
  def load_and_validate_excel(file_contents_list, file_names_list, keywords_hash):
101
- """Load multiple Excel files and combine - cached to avoid re-running on filter changes
102
-
103
- Returns:
104
- tuple: (combined_df, original_count, dedup_count)
105
- """
106
  all_dfs = []
107
 
108
  for file_content in file_contents_list:
@@ -111,21 +107,8 @@ def load_and_validate_excel(file_contents_list, file_names_list, keywords_hash):
111
 
112
  # Concatenate all dataframes
113
  combined_df = pd.concat(all_dfs, ignore_index=True)
114
- original_count = len(combined_df)
115
-
116
- # Remove duplicates based on Tariff + Description (after combining all files)
117
- # Keep first occurrence
118
- dedup_cols = ["Tariff", "Description"]
119
- dedup_cols = [c for c in dedup_cols if c in combined_df.columns]
120
-
121
- if dedup_cols:
122
- combined_df = combined_df.drop_duplicates(subset=dedup_cols, keep='first')
123
-
124
- # Reset index to ensure sequential indices after deduplication
125
- combined_df = combined_df.reset_index(drop=True)
126
 
127
- dedup_count = original_count - len(combined_df)
128
- return combined_df, original_count, dedup_count
129
 
130
 
131
  @st.cache_data
@@ -288,16 +271,13 @@ with tab1:
288
  for f in uploaded_files:
289
  f.seek(0)
290
 
291
- df, original_count, dedup_count = load_and_validate_excel(file_contents, file_names, keywords_hash)
292
 
293
  st.session_state.original_df = df
294
 
295
  # Show load summary
296
  if len(uploaded_files) > 1:
297
- msg = f"Loaded {len(df)} unique rows from {len(uploaded_files)} files"
298
- if dedup_count > 0:
299
- msg += f" ({dedup_count} duplicates removed)"
300
- st.success(msg)
301
  else:
302
  st.success(f"Loaded {len(df)} rows")
303
 
 
98
 
99
  @st.cache_data
100
  def load_and_validate_excel(file_contents_list, file_names_list, keywords_hash):
101
+ """Load multiple Excel files and combine - cached to avoid re-running on filter changes"""
 
 
 
 
102
  all_dfs = []
103
 
104
  for file_content in file_contents_list:
 
107
 
108
  # Concatenate all dataframes
109
  combined_df = pd.concat(all_dfs, ignore_index=True)
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
+ return combined_df
 
112
 
113
 
114
  @st.cache_data
 
271
  for f in uploaded_files:
272
  f.seek(0)
273
 
274
+ df = load_and_validate_excel(file_contents, file_names, keywords_hash)
275
 
276
  st.session_state.original_df = df
277
 
278
  # Show load summary
279
  if len(uploaded_files) > 1:
280
+ st.success(f"Loaded {len(df)} rows from {len(uploaded_files)} files")
 
 
 
281
  else:
282
  st.success(f"Loaded {len(df)} rows")
283