import os
import re
import shutil
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Prevent the Hugging Face cache from exceeding the Space's 50 GB limit
os.environ["TRANSFORMERS_CACHE"] = "/tmp"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp"

# Optional: clean old cache if the Space restarts. The session_state guard keeps
# Streamlit's top-to-bottom reruns from wiping the cache on every interaction.
if "cache_cleaned" not in st.session_state:
    for folder in ["/root/.cache", "/tmp"]:
        shutil.rmtree(folder, ignore_errors=True)
    os.makedirs("/tmp", exist_ok=True)  # recreate /tmp so downloads have a target
    st.session_state["cache_cleaned"] = True

st.set_page_config(page_title="404 Redirect Mapper", layout="wide")
st.title("🔁 404 Redirect Mapper")
st.markdown(
    """by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)""",
    unsafe_allow_html=True,
)

# === Upload Section
st.markdown("### 📁 Step 1: Upload CSV Files")
col1, col2 = st.columns(2)
with col1:
    file_404 = st.file_uploader("Upload 404 URLs CSV", type="csv")
with col2:
    file_200 = st.file_uploader("Upload 200 URLs CSV", type="csv")


# === Utility: clean URL path for embedding, not for output
def clean_url_path(url):
    """Reduce a URL to the words in its path: drop scheme/host, file extension, and separators."""
    try:
        parsed = urlparse(url)
        path = parsed.path or ""
        path = re.sub(r"\.(html|htm|php|aspx|jsp)$", "", path)
        path = path.strip("/")
        return path.replace("-", " ").replace("_", " ").lower()
    except Exception:
        return ""


# Cache the embedding model across reruns so it is only downloaded/loaded once
@st.cache_resource
def load_model():
    return SentenceTransformer("all-MiniLM-L6-v2")


# === When both files are uploaded
if file_404 and file_200:
    df_404 = pd.read_csv(file_404)
    df_200 = pd.read_csv(file_200)
    st.success("✅ Files loaded!")

    # === Column selection (URLs + text fields)
    st.markdown("### 🧠 Step 2: Select Columns")

    st.markdown("#### 🔴 For 404 URLs")
    col1, col2 = st.columns(2)
    with col1:
        url_col_404 = st.selectbox("Select full URL column (404):", df_404.columns.tolist())
    with col2:
        text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())

    st.markdown("#### 🟢 For 200 URLs")
    col3, col4 = st.columns(2)
    with col3:
        url_col_200 = st.selectbox("Select full URL column (200):", df_200.columns.tolist())
    with col4:
        text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())

    if url_col_404 and url_col_200 and text_cols_404 and text_cols_200:
        if st.button("🚀 Run Matching"):
            status_msg = st.empty()

            def prepare(df, url_col, text_cols):
                """Return the original URLs and, per row, the cleaned path joined with the selected text fields."""
                df = df.copy()
                # fillna before astype(str), otherwise NaN becomes the literal string "nan"
                urls = df[url_col].fillna("MISSING").astype(str)
                cleaned_paths = urls.apply(clean_url_path)
                df_text = df[text_cols].fillna("").astype(str)
                combined = cleaned_paths + " " + df_text.apply(lambda row: " ".join(row), axis=1)
                return urls.tolist(), combined.tolist()

            urls_404, text_404 = prepare(df_404, url_col_404, text_cols_404)
            urls_200, text_200 = prepare(df_200, url_col_200, text_cols_200)

            status_msg.info("🔄 Generating embeddings...")
            model = load_model()
            emb_404 = model.encode(text_404, show_progress_bar=True, batch_size=32)
            emb_200 = model.encode(text_200, show_progress_bar=True, batch_size=32)
            status_msg.empty()

            # Pairwise cosine similarity: rows = 404 pages, columns = live 200 pages
            sim_matrix = cosine_similarity(emb_404, emb_200)

            # Keep the top 3 candidate redirect targets for each 404 URL
            top_k = 3
            matches = []
            for i, row in enumerate(sim_matrix):
                top_idx = np.argsort(row)[-top_k:][::-1]
                for rank, j in enumerate(top_idx, start=1):
                    matches.append({
                        "404 URL": urls_404[i],
                        "Matched 200 URL": urls_200[j],
                        "Similarity Score": round(float(row[j]), 4),
                        "Rank": rank,
                    })

            match_df = pd.DataFrame(matches)
            st.markdown("### 🔍 Top Matches")
            st.dataframe(match_df)

            csv = match_df.to_csv(index=False).encode("utf-8")
            st.download_button("📥 Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
    else:
        st.info("👉 Please select 1 URL column + at least 1 text field **for each file**.")
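# --- Usage (a sketch, not part of the original script: the dependency list is
# --- inferred from the imports above, and the filename app.py is an assumption):
#
#   pip install streamlit pandas numpy scikit-learn sentence-transformers
#   streamlit run app.py
#
# Each uploaded CSV needs one column of full URLs; any extra text columns
# (for example a Title or H1 column from a crawl export) can be selected in
# Step 2 and are embedded together with the cleaned URL path, so richer text
# fields generally yield better redirect candidates.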