Spaces:
Sleeping
Sleeping
File size: 4,216 Bytes
b7d724f 9053a94 f849f04 9053a94 9c11ee2 ea05ca4 9053a94 b7d724f 9053a94 b7d724f 9053a94 35eb3e9 b7d724f 35eb3e9 b7d724f 35eb3e9 9053a94 35eb3e9 9053a94 b7d724f 9053a94 b7d724f 35eb3e9 b7d724f 9053a94 b7d724f 9053a94 35eb3e9 f849f04 b7d724f 9053a94 35eb3e9 9053a94 35eb3e9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from urllib.parse import urlparse
import re
import os, shutil
# Prevent the Hugging Face cache from exceeding the Space's 50 GB storage:
# point both the transformers and sentence-transformers caches at a dedicated
# throwaway directory under /tmp.
HF_CACHE_DIR = "/tmp/hf-cache"
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR
os.environ["SENTENCE_TRANSFORMERS_HOME"] = HF_CACHE_DIR
# Optional: clean stale caches if the Space restarts. Only the dedicated cache
# directory is removed — the previous version deleted all of /tmp, which wiped
# the cache directory it had just configured and could destroy files the
# running Streamlit process itself depends on (e.g. in-flight uploads).
for folder in ("/root/.cache", HF_CACHE_DIR):
    shutil.rmtree(folder, ignore_errors=True)
# Page chrome: wide layout, app title, and author byline.
st.set_page_config(page_title="404 Redirect Mapper", layout="wide")
st.title("π 404 Redirect Mapper")
st.markdown(
    """by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
""",
    unsafe_allow_html=True,
)

# === Upload Section
# Two side-by-side uploaders: one CSV of 404 (broken) URLs, one CSV of live
# 200 URLs to redirect them to. Widgets are placed via the column objects
# directly rather than `with` blocks — behavior is identical.
st.markdown("### π Step 1: Upload CSV Files")
upload_left, upload_right = st.columns(2)
file_404 = upload_left.file_uploader("Upload 404 URLs CSV", type="csv")
file_200 = upload_right.file_uploader("Upload 200 URLs CSV", type="csv")
# === Utility: Clean URL path for embedding, not for output
def clean_url_path(url):
    """Normalize a URL's path into lowercase words for embedding.

    Drops the scheme/domain/query, strips a trailing file extension
    (.html, .htm, .php, .aspx, .jsp) and surrounding slashes, then turns
    hyphens/underscores into spaces and lowercases the result so it reads
    like natural text for the sentence-embedding model. Used only for
    matching — the original URL string is reported in the output untouched.

    Returns "" for any value that cannot be parsed as a URL.
    """
    try:
        path = urlparse(url).path or ""
        # Drop a common trailing extension so "/about.html" matches "/about".
        path = re.sub(r"\.(html|htm|php|aspx|jsp)$", "", path)
        path = path.strip("/")
        return path.replace("-", " ").replace("_", " ").lower()
    except (ValueError, AttributeError, TypeError):
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit. These are the concrete failures urlparse can raise
        # (e.g. ValueError for a malformed IPv6 literal, TypeError /
        # AttributeError for non-string input).
        return ""
# === When both files are uploaded
if file_404 and file_200:
    # Both CSVs are present — load them and walk the user through column
    # selection, then (on demand) run the embedding-based matching.
    df_404 = pd.read_csv(file_404)
    df_200 = pd.read_csv(file_200)
    st.success("β Files loaded!")  # NOTE(review): emoji garbled in the scrape — confirm original literal
    # === Column selection (URLs + text fields)
    # For each file the user picks the column holding the full URL plus one
    # or more free-text columns (e.g. title, h1) to feed the embedder.
    st.markdown("### π§ Step 2: Select Columns")
    st.markdown("#### π΄ For 404 URLs")
    col1, col2 = st.columns(2)
    with col1:
        url_col_404 = st.selectbox("Select full URL column (404):", df_404.columns.tolist())
    with col2:
        text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
    st.markdown("#### π’ For 200 URLs")
    col3, col4 = st.columns(2)
    with col3:
        url_col_200 = st.selectbox("Select full URL column (200):", df_200.columns.tolist())
    with col4:
        text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
    if url_col_404 and url_col_200 and text_cols_404 and text_cols_200:
        if st.button("π Run Matching"):
            # Placeholder for transient status messages (cleared when done).
            status_msg = st.empty()

            def prepare(df, url_col, text_cols):
                """Build (original URLs, embedding texts) from one DataFrame.

                The embedding text is the cleaned URL path concatenated with
                the selected free-text columns; the raw URL list is kept in
                the same order so index i in both lists refers to one row.
                """
                df = df.copy()
                # NOTE(review): fillna after astype(str) is a no-op — NaN has
                # already become the string "nan" by then; confirm intent.
                urls = df[url_col].astype(str).fillna("MISSING")
                cleaned_paths = urls.apply(clean_url_path)
                df_text = df[text_cols].astype(str).fillna("")
                combined = cleaned_paths + " " + df_text.apply(lambda row: " ".join(row), axis=1)
                return urls.tolist(), combined.tolist()

            urls_404, text_404 = prepare(df_404, url_col_404, text_cols_404)
            urls_200, text_200 = prepare(df_200, url_col_200, text_cols_200)
            status_msg.info("π Generating embeddings...")
            # Small general-purpose sentence encoder; downloads to the cache
            # directory configured at the top of the file on first run.
            model = SentenceTransformer("all-MiniLM-L6-v2")
            emb_404 = model.encode(text_404, show_progress_bar=True, batch_size=32)
            emb_200 = model.encode(text_200, show_progress_bar=True, batch_size=32)
            status_msg.empty()
            # Pairwise cosine similarity: rows = 404 pages, cols = 200 pages.
            sim_matrix = cosine_similarity(emb_404, emb_200)
            # For each 404 URL keep the top_k most similar live URLs,
            # ranked 1 (best) to top_k.
            top_k = 3
            matches = []
            for i, row in enumerate(sim_matrix):
                # argsort is ascending, so take the last k and reverse them.
                top_idx = np.argsort(row)[-top_k:][::-1]
                for rank, j in enumerate(top_idx, start=1):
                    matches.append({
                        "404 URL": urls_404[i],
                        "Matched 200 URL": urls_200[j],
                        "Similarity Score": round(row[j], 4),
                        "Rank": rank
                    })
            match_df = pd.DataFrame(matches)
            st.markdown("### π Top Matches")
            st.dataframe(match_df)
            # Offer the full match table as a downloadable CSV.
            csv = match_df.to_csv(index=False).encode('utf-8')
            st.download_button("π₯ Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
    else:
        # At least one required column selection is still missing.
        st.info("π Please select 1 URL column + at least 1 text field **for each file**.")
|