Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -37,32 +37,43 @@ if file_404 and file_200:
|
|
| 37 |
|
| 38 |
st.success("β
Files loaded!")
|
| 39 |
|
| 40 |
-
# Column selection
|
| 41 |
-
st.markdown("### π§ Step 2: Select Columns
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
if st.button("π Run Matching"):
|
| 47 |
-
st.
|
| 48 |
|
| 49 |
# Clean + combine text for each row
|
| 50 |
-
def prepare_text(df):
|
| 51 |
df = df.copy()
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
df_text = df[selected_cols].astype(str).fillna("")
|
| 57 |
-
return df_text.apply(lambda row: " ".join(row), axis=1).tolist()
|
| 58 |
|
| 59 |
-
text_404 = prepare_text(df_404)
|
| 60 |
-
text_200 = prepare_text(df_200)
|
| 61 |
|
| 62 |
-
|
| 63 |
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
| 64 |
emb_404 = model.encode(text_404, show_progress_bar=True)
|
| 65 |
emb_200 = model.encode(text_200, show_progress_bar=True)
|
|
|
|
| 66 |
|
| 67 |
# Similarity matrix
|
| 68 |
sim_matrix = cosine_similarity(emb_404, emb_200)
|
|
@@ -74,8 +85,8 @@ if file_404 and file_200:
|
|
| 74 |
top_idx = np.argsort(row)[-top_k:][::-1]
|
| 75 |
for rank, j in enumerate(top_idx, start=1):
|
| 76 |
matches.append({
|
| 77 |
-
"404 URL":
|
| 78 |
-
"Matched 200 URL":
|
| 79 |
"Similarity Score": round(row[j], 4),
|
| 80 |
"Rank": rank
|
| 81 |
})
|
|
@@ -89,4 +100,4 @@ if file_404 and file_200:
|
|
| 89 |
st.download_button("π₯ Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
|
| 90 |
|
| 91 |
else:
|
| 92 |
-
st.info("π Please select at least
|
|
|
|
| 37 |
|
| 38 |
st.success("β
Files loaded!")
|
| 39 |
|
| 40 |
+
# === Column selection (URLs + text fields)
|
| 41 |
+
st.markdown("### π§ Step 2: Select Columns")
|
| 42 |
+
|
| 43 |
+
st.markdown("#### π΄ For 404 URLs")
|
| 44 |
+
col1, col2 = st.columns(2)
|
| 45 |
+
with col1:
|
| 46 |
+
url_col_404 = st.selectbox("Select URL column (404):", df_404.columns.tolist())
|
| 47 |
+
with col2:
|
| 48 |
+
text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
|
| 49 |
+
|
| 50 |
+
st.markdown("#### π’ For 200 URLs")
|
| 51 |
+
col3, col4 = st.columns(2)
|
| 52 |
+
with col3:
|
| 53 |
+
url_col_200 = st.selectbox("Select URL column (200):", df_200.columns.tolist())
|
| 54 |
+
with col4:
|
| 55 |
+
text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
|
| 56 |
+
|
| 57 |
+
if url_col_404 and url_col_200 and text_cols_404 and text_cols_200:
|
| 58 |
if st.button("π Run Matching"):
|
| 59 |
+
status_msg = st.empty()
|
| 60 |
|
| 61 |
# Clean + combine text for each row
def prepare_text(df, url_col, text_cols):
    """Prepare one DataFrame for embedding-based URL matching.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; a copy is taken, the caller's frame is not mutated.
    url_col : str
        Name of the column holding URLs; each value is normalised via the
        module-level ``clean_url`` helper.
    text_cols : list[str]
        Columns whose per-row string values are space-joined into one text.

    Returns
    -------
    tuple[list, list]
        ``(cleaned URLs, combined per-row texts)``, index-aligned with ``df``.
    """
    df = df.copy()
    df[url_col] = df[url_col].astype(str).apply(clean_url)
    # BUG FIX: fillna("") must run BEFORE astype(str) — astype(str) turns
    # NaN into the literal string "nan", which fillna then cannot catch,
    # polluting the embedding text with spurious "nan" tokens.
    df_text = df[text_cols].fillna("").astype(str)
    combined = df_text.apply(lambda row: " ".join(row), axis=1).tolist()
    return df[url_col].tolist(), combined
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
urls_404, text_404 = prepare_text(df_404, url_col_404, text_cols_404)
|
| 70 |
+
urls_200, text_200 = prepare_text(df_200, url_col_200, text_cols_200)
|
| 71 |
|
| 72 |
+
status_msg.info("π Generating embeddings...")
|
| 73 |
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
| 74 |
emb_404 = model.encode(text_404, show_progress_bar=True)
|
| 75 |
emb_200 = model.encode(text_200, show_progress_bar=True)
|
| 76 |
+
status_msg.empty() # β
Clear status message
|
| 77 |
|
| 78 |
# Similarity matrix
|
| 79 |
sim_matrix = cosine_similarity(emb_404, emb_200)
|
|
|
|
| 85 |
top_idx = np.argsort(row)[-top_k:][::-1]
|
| 86 |
for rank, j in enumerate(top_idx, start=1):
|
| 87 |
matches.append({
|
| 88 |
+
"404 URL": urls_404[i],
|
| 89 |
+
"Matched 200 URL": urls_200[j],
|
| 90 |
"Similarity Score": round(row[j], 4),
|
| 91 |
"Rank": rank
|
| 92 |
})
|
|
|
|
| 100 |
st.download_button("π₯ Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
|
| 101 |
|
| 102 |
else:
|
| 103 |
+
st.info("π Please select 1 URL column + at least 1 text field **for each file**.")
|