import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from urllib.parse import urlparse
import re
import os, shutil
# Prevent Hugging Face cache from exceeding 50 GB
os.environ["TRANSFORMERS_CACHE"] = "/tmp"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp"
# Optional: clean old cache if Space restarts
for folder in ["/root/.cache", "/tmp"]:
    shutil.rmtree(folder, ignore_errors=True)
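# Note: Streamlit re-runs this script on every interaction, so the cache wipe above
# repeats and the SentenceTransformer model below may be re-downloaded each time.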
st.set_page_config(page_title="404 Redirect Mapper", layout="wide")
st.title("πŸ” 404 Redirect Mapper")
st.markdown("""by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
""", unsafe_allow_html=True)
# === Upload Section
st.markdown("### πŸ“ Step 1: Upload CSV Files")
col1, col2 = st.columns(2)
with col1:
    file_404 = st.file_uploader("Upload 404 URLs CSV", type="csv")
with col2:
    file_200 = st.file_uploader("Upload 200 URLs CSV", type="csv")
# === Utility: Clean URL path for embedding, not for output
def clean_url_path(url):
    try:
        parsed = urlparse(url)
        path = parsed.path or ""
        path = re.sub(r"\.(html|htm|php|aspx|jsp)$", "", path)
        path = path.strip("/")
        return path.replace("-", " ").replace("_", " ").lower()
    except Exception:
        return ""
# === When both files are uploaded
if file_404 and file_200:
    df_404 = pd.read_csv(file_404)
    df_200 = pd.read_csv(file_200)
    st.success("✅ Files loaded!")

    # === Column selection (URLs + text fields)
    st.markdown("### 🧠 Step 2: Select Columns")

    st.markdown("#### 🔴 For 404 URLs")
    col1, col2 = st.columns(2)
    with col1:
        url_col_404 = st.selectbox("Select full URL column (404):", df_404.columns.tolist())
    with col2:
        text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())

    st.markdown("#### 🟢 For 200 URLs")
    col3, col4 = st.columns(2)
    with col3:
        url_col_200 = st.selectbox("Select full URL column (200):", df_200.columns.tolist())
    with col4:
        text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
    if url_col_404 and url_col_200 and text_cols_404 and text_cols_200:
        if st.button("🚀 Run Matching"):
            status_msg = st.empty()

            def prepare(df, url_col, text_cols):
                df = df.copy()
                # Fill missing values before casting to str, otherwise NaN becomes the literal string "nan"
                urls = df[url_col].fillna("MISSING").astype(str)
                cleaned_paths = urls.apply(clean_url_path)
                df_text = df[text_cols].fillna("").astype(str)
                combined = cleaned_paths + " " + df_text.apply(lambda row: " ".join(row), axis=1)
                return urls.tolist(), combined.tolist()
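            # prepare() returns the original URLs plus one combined string per row
            # (cleaned URL slug + selected text fields); the combined strings are what get embedded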
            urls_404, text_404 = prepare(df_404, url_col_404, text_cols_404)
            urls_200, text_200 = prepare(df_200, url_col_200, text_cols_200)

            status_msg.info("🔄 Generating embeddings...")
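            # all-MiniLM-L6-v2 maps each combined string to a 384-dimensional sentence embedding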
            model = SentenceTransformer("all-MiniLM-L6-v2")
            emb_404 = model.encode(text_404, show_progress_bar=True, batch_size=32)
            emb_200 = model.encode(text_200, show_progress_bar=True, batch_size=32)
            status_msg.empty()
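            # sim_matrix has shape (num_404, num_200); entry [i, j] is the cosine
            # similarity between 404 URL i and 200 URL j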
            sim_matrix = cosine_similarity(emb_404, emb_200)
            top_k = 3
            matches = []
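            # np.argsort sorts ascending, so the last top_k indices (reversed) are the
            # indices of the most similar 200 URLs for each 404 URL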
            for i, row in enumerate(sim_matrix):
                top_idx = np.argsort(row)[-top_k:][::-1]
                for rank, j in enumerate(top_idx, start=1):
                    matches.append({
                        "404 URL": urls_404[i],
                        "Matched 200 URL": urls_200[j],
                        "Similarity Score": round(row[j], 4),
                        "Rank": rank
                    })
            match_df = pd.DataFrame(matches)
            st.markdown("### 🔍 Top Matches")
            st.dataframe(match_df)

            csv = match_df.to_csv(index=False).encode('utf-8')
            st.download_button("📥 Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
    else:
        st.info("👉 Please select a URL column and at least one text field **for each file**.")