Spaces:

Flopot2
/

404-redirect-mapper

Sleeping

App Files Community

Flopot2 committed on Apr 25, 2025

Commit

b7d724f

verified ·

1 Parent(s): 9f2cfe8

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -16

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -20,8 +21,8 @@ with col1:
 with col2:
     file_200 = st.file_uploader("Upload 200 URLs CSV", type="csv")
-# === Utility Function: Strip URL to path
-def clean_url(url):
     try:
         parsed = urlparse(url)
         path = parsed.path or ""
@@ -29,7 +30,7 @@ def clean_url(url):
         path = path.strip("/")
         return path.replace("-", " ").replace("_", " ").lower()
     except:
-        return url
 # === When both files are uploaded
 if file_404 and file_200:
@@ -44,14 +45,14 @@ if file_404 and file_200:
     st.markdown("#### 🔴 For 404 URLs")
     col1, col2 = st.columns(2)
     with col1:
-        url_col_404 = st.selectbox("Select URL column (404):", df_404.columns.tolist())
     with col2:
         text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
     st.markdown("#### 🟢 For 200 URLs")
     col3, col4 = st.columns(2)
     with col3:
-        url_col_200 = st.selectbox("Select URL column (200):", df_200.columns.tolist())
     with col4:
         text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
@@ -59,27 +60,25 @@ if file_404 and file_200:
         if st.button("🚀 Run Matching"):
             status_msg = st.empty()
-            # Clean + combine text for each row
-            def prepare_text(df, url_col, text_cols):
                 df = df.copy()
-                df[url_col] = df[url_col].astype(str).apply(clean_url)
                 df_text = df[text_cols].astype(str).fillna("")
-                combined = df_text.apply(lambda row: " ".join(row), axis=1).tolist()
-                return df[url_col].tolist(), combined
-            urls_404, text_404 = prepare_text(df_404, url_col_404, text_cols_404)
-            urls_200, text_200 = prepare_text(df_200, url_col_200, text_cols_200)
             status_msg.info("🔄 Generating embeddings...")
             model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
             emb_404 = model.encode(text_404, show_progress_bar=True)
             emb_200 = model.encode(text_200, show_progress_bar=True)
-            status_msg.empty()  # ✅ Clear status message
-            # Similarity matrix
             sim_matrix = cosine_similarity(emb_404, emb_200)
-            # Get top 3 matches per 404
             top_k = 3
             matches = []
             for i, row in enumerate(sim_matrix):
@@ -96,7 +95,6 @@ if file_404 and file_200:
             st.markdown("### 🔍 Top Matches")
             st.dataframe(match_df)
-            # Download button
             csv = match_df.to_csv(index=False).encode('utf-8')
             st.download_button("📥 Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")

 import streamlit as st
 import pandas as pd
 import numpy as np
 with col2:
     file_200 = st.file_uploader("Upload 200 URLs CSV", type="csv")
+# === Utility: Clean URL path for embedding, not for output
+def clean_url_path(url):
     try:
         parsed = urlparse(url)
         path = parsed.path or ""
         path = path.strip("/")
         return path.replace("-", " ").replace("_", " ").lower()
     except:
+        return ""
 # === When both files are uploaded
 if file_404 and file_200:
     st.markdown("#### 🔴 For 404 URLs")
     col1, col2 = st.columns(2)
     with col1:
+        url_col_404 = st.selectbox("Select full URL column (404):", df_404.columns.tolist())
     with col2:
         text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
     st.markdown("#### 🟢 For 200 URLs")
     col3, col4 = st.columns(2)
     with col3:
+        url_col_200 = st.selectbox("Select full URL column (200):", df_200.columns.tolist())
     with col4:
         text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
         if st.button("🚀 Run Matching"):
             status_msg = st.empty()
+            def prepare(df, url_col, text_cols):
                 df = df.copy()
+                urls = df[url_col].astype(str).fillna("MISSING")
+                cleaned_paths = urls.apply(clean_url_path)
                 df_text = df[text_cols].astype(str).fillna("")
+                combined = cleaned_paths + " " + df_text.apply(lambda row: " ".join(row), axis=1)
+                return urls.tolist(), combined.tolist()
+            urls_404, text_404 = prepare(df_404, url_col_404, text_cols_404)
+            urls_200, text_200 = prepare(df_200, url_col_200, text_cols_200)
             status_msg.info("🔄 Generating embeddings...")
             model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
             emb_404 = model.encode(text_404, show_progress_bar=True)
             emb_200 = model.encode(text_200, show_progress_bar=True)
+            status_msg.empty()
             sim_matrix = cosine_similarity(emb_404, emb_200)
             top_k = 3
             matches = []
             for i, row in enumerate(sim_matrix):
             st.markdown("### 🔍 Top Matches")
             st.dataframe(match_df)
             csv = match_df.to_csv(index=False).encode('utf-8')
             st.download_button("📥 Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")