Flopot2 committed on
Commit
35eb3e9
·
verified ·
1 Parent(s): 065b9c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -20
app.py CHANGED
@@ -37,32 +37,43 @@ if file_404 and file_200:
37
 
38
  st.success("✅ Files loaded!")
39
 
40
- # Column selection
41
- st.markdown("### 🧠 Step 2: Select Columns to Compare")
42
- common_cols = list(set(df_404.columns) & set(df_200.columns))
43
- selected_cols = st.multiselect("Select common columns (e.g. URL, title, H1, keywords):", common_cols)
44
-
45
- if selected_cols:
 
 
 
 
 
 
 
 
 
 
 
 
46
  if st.button("🚀 Run Matching"):
47
- st.info("Generating embeddings...")
48
 
49
  # Clean + combine text for each row
50
- def prepare_text(df):
51
  df = df.copy()
52
- if 'url' in [col.lower() for col in selected_cols]:
53
- for col in df.columns:
54
- if col.lower() == 'url':
55
- df[col] = df[col].astype(str).apply(clean_url)
56
- df_text = df[selected_cols].astype(str).fillna("")
57
- return df_text.apply(lambda row: " ".join(row), axis=1).tolist()
58
 
59
- text_404 = prepare_text(df_404)
60
- text_200 = prepare_text(df_200)
61
 
62
- # Embedding
63
  model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
64
  emb_404 = model.encode(text_404, show_progress_bar=True)
65
  emb_200 = model.encode(text_200, show_progress_bar=True)
 
66
 
67
  # Similarity matrix
68
  sim_matrix = cosine_similarity(emb_404, emb_200)
@@ -74,8 +85,8 @@ if file_404 and file_200:
74
  top_idx = np.argsort(row)[-top_k:][::-1]
75
  for rank, j in enumerate(top_idx, start=1):
76
  matches.append({
77
- "404 URL": df_404.iloc[i]['url'] if 'url' in df_404.columns else f"Row {i+1}",
78
- "Matched 200 URL": df_200.iloc[j]['url'] if 'url' in df_200.columns else f"Row {j+1}",
79
  "Similarity Score": round(row[j], 4),
80
  "Rank": rank
81
  })
@@ -89,4 +100,4 @@ if file_404 and file_200:
89
  st.download_button("📥 Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
90
 
91
  else:
92
- st.info("👉 Please select at least one column to compare.")
 
37
 
38
  st.success("✅ Files loaded!")
39
 
40
+ # === Column selection (URLs + text fields)
41
+ st.markdown("### 🧠 Step 2: Select Columns")
42
+
43
+ st.markdown("#### 🔴 For 404 URLs")
44
+ col1, col2 = st.columns(2)
45
+ with col1:
46
+ url_col_404 = st.selectbox("Select URL column (404):", df_404.columns.tolist())
47
+ with col2:
48
+ text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
49
+
50
+ st.markdown("#### 🟢 For 200 URLs")
51
+ col3, col4 = st.columns(2)
52
+ with col3:
53
+ url_col_200 = st.selectbox("Select URL column (200):", df_200.columns.tolist())
54
+ with col4:
55
+ text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
56
+
57
+ if url_col_404 and url_col_200 and text_cols_404 and text_cols_200:
58
  if st.button("🚀 Run Matching"):
59
+ status_msg = st.empty()
60
 
61
  # Clean + combine text for each row
62
+ def prepare_text(df, url_col, text_cols):
63
  df = df.copy()
64
+ df[url_col] = df[url_col].astype(str).apply(clean_url)
65
+ df_text = df[text_cols].astype(str).fillna("")
66
+ combined = df_text.apply(lambda row: " ".join(row), axis=1).tolist()
67
+ return df[url_col].tolist(), combined
 
 
68
 
69
+ urls_404, text_404 = prepare_text(df_404, url_col_404, text_cols_404)
70
+ urls_200, text_200 = prepare_text(df_200, url_col_200, text_cols_200)
71
 
72
+ status_msg.info("🔄 Generating embeddings...")
73
  model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
74
  emb_404 = model.encode(text_404, show_progress_bar=True)
75
  emb_200 = model.encode(text_200, show_progress_bar=True)
76
+ status_msg.empty() # ✅ Clear status message
77
 
78
  # Similarity matrix
79
  sim_matrix = cosine_similarity(emb_404, emb_200)
 
85
  top_idx = np.argsort(row)[-top_k:][::-1]
86
  for rank, j in enumerate(top_idx, start=1):
87
  matches.append({
88
+ "404 URL": urls_404[i],
89
+ "Matched 200 URL": urls_200[j],
90
  "Similarity Score": round(row[j], 4),
91
  "Rank": rank
92
  })
 
100
  st.download_button("📥 Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
101
 
102
  else:
103
+ st.info("👉 Please select 1 URL column + at least 1 text field **for each file**.")