# Hugging Face Space: 404 Redirect Mapper
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from urllib.parse import urlparse
import re
import os
import shutil

# --- Hugging Face cache management -------------------------------------------
# Point the model caches at /tmp so downloaded weights land on ephemeral
# storage and the Space never exceeds its 50 GB disk quota.
os.environ["TRANSFORMERS_CACHE"] = "/tmp"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp"

# Best-effort cleanup of stale caches when the Space restarts.  Errors are
# ignored because the folders may not exist (or be unwritable) locally.
for folder in ("/root/.cache", "/tmp"):
    shutil.rmtree(folder, ignore_errors=True)
# BUG FIX: rmtree removes the /tmp directory itself, which the two cache
# env vars above point at — recreate it so later cache writes cannot fail.
os.makedirs("/tmp", exist_ok=True)
# --- Page chrome --------------------------------------------------------------
# NOTE: set_page_config must be the first Streamlit call executed.
st.set_page_config(page_title="404 Redirect Mapper", layout="wide")
st.title("π 404 Redirect Mapper")
byline = """by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
"""
st.markdown(byline, unsafe_allow_html=True)

# --- Step 1: collect the two input CSVs --------------------------------------
st.markdown("### π Step 1: Upload CSV Files")
upload_left, upload_right = st.columns(2)
with upload_left:
    file_404 = st.file_uploader("Upload 404 URLs CSV", type="csv")
with upload_right:
    file_200 = st.file_uploader("Upload 200 URLs CSV", type="csv")
| # === Utility: Clean URL path for embedding, not for output | |
def clean_url_path(url):
    """Normalize a URL's path into lowercase space-separated words for embedding.

    Strips common page-file extensions and surrounding slashes, and turns
    hyphen/underscore separators into spaces.  The result feeds the embedding
    model only — the original URL string is preserved for output elsewhere.

    Parameters:
        url: full URL string (anything `urlparse` accepts).

    Returns:
        The cleaned path text, or "" when the URL cannot be parsed.
    """
    try:
        path = urlparse(url).path or ""
        # Drop a trailing page extension so "/about.html" matches "/about".
        path = re.sub(r"\.(html|htm|php|aspx|jsp)$", "", path)
        path = path.strip("/")
        return path.replace("-", " ").replace("_", " ").lower()
    except (AttributeError, TypeError, ValueError):
        # BUG FIX: was a bare `except:`, which also swallowed programming
        # errors and KeyboardInterrupt.  Only parse failures are expected here.
        return ""
| # === When both files are uploaded | |
if file_404 and file_200:
    df_404 = pd.read_csv(file_404)
    df_200 = pd.read_csv(file_200)
    st.success("β Files loaded!")

    # === Step 2: pick the URL column and the free-text columns for each file.
    st.markdown("### π§ Step 2: Select Columns")
    st.markdown("#### π΄ For 404 URLs")
    col1, col2 = st.columns(2)
    with col1:
        url_col_404 = st.selectbox("Select full URL column (404):", df_404.columns.tolist())
    with col2:
        text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
    st.markdown("#### π’ For 200 URLs")
    col3, col4 = st.columns(2)
    with col3:
        url_col_200 = st.selectbox("Select full URL column (200):", df_200.columns.tolist())
    with col4:
        text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())

    if url_col_404 and url_col_200 and text_cols_404 and text_cols_200:
        if st.button("π Run Matching"):
            status_msg = st.empty()

            def prepare(df, url_col, text_cols):
                """Return (raw_urls, combined_text) lists for one dataframe.

                `raw_urls` keeps the original URL strings for the output table;
                `combined_text` is the cleaned URL path joined with the selected
                text fields, used as input to the embedding model.
                """
                df = df.copy()
                # BUG FIX: fillna() must run BEFORE astype(str).  The original
                # order stringified NaN into the literal "nan", so the
                # "MISSING"/"" sentinels never applied and "nan" text leaked
                # into the embeddings.
                urls = df[url_col].fillna("MISSING").astype(str)
                cleaned_paths = urls.apply(clean_url_path)
                df_text = df[text_cols].fillna("").astype(str)
                combined = cleaned_paths + " " + df_text.apply(lambda row: " ".join(row), axis=1)
                return urls.tolist(), combined.tolist()

            urls_404, text_404 = prepare(df_404, url_col_404, text_cols_404)
            urls_200, text_200 = prepare(df_200, url_col_200, text_cols_200)

            status_msg.info("π Generating embeddings...")

            @st.cache_resource(show_spinner=False)
            def load_model():
                """Load the embedding model once per process instead of per click."""
                return SentenceTransformer("all-MiniLM-L6-v2")

            model = load_model()
            emb_404 = model.encode(text_404, show_progress_bar=True, batch_size=32)
            emb_200 = model.encode(text_200, show_progress_bar=True, batch_size=32)
            status_msg.empty()

            # Cosine similarity of every 404 row against every 200 row.
            sim_matrix = cosine_similarity(emb_404, emb_200)

            # Keep the top_k best redirect candidates per 404 URL.
            top_k = 3
            matches = []
            for i, row in enumerate(sim_matrix):
                # argsort is ascending: take the last k indices, reversed so
                # rank 1 is the strongest match.
                top_idx = np.argsort(row)[-top_k:][::-1]
                for rank, j in enumerate(top_idx, start=1):
                    matches.append({
                        "404 URL": urls_404[i],
                        "Matched 200 URL": urls_200[j],
                        # float() unwraps the numpy scalar for clean display/CSV.
                        "Similarity Score": round(float(row[j]), 4),
                        "Rank": rank,
                    })

            match_df = pd.DataFrame(matches)
            st.markdown("### π Top Matches")
            st.dataframe(match_df)

            csv = match_df.to_csv(index=False).encode('utf-8')
            st.download_button("π₯ Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
    else:
        st.info("π Please select 1 URL column + at least 1 text field **for each file**.")