File size: 4,216 Bytes
b7d724f
9053a94
 
 
 
 
 
 
f849f04
 
 
 
 
 
 
 
 
9053a94
9c11ee2
 
ea05ca4
 
9053a94
 
 
 
 
 
 
 
 
 
b7d724f
 
9053a94
 
 
 
 
 
 
b7d724f
9053a94
 
 
 
 
 
 
 
35eb3e9
 
 
 
 
 
b7d724f
35eb3e9
 
 
 
 
 
b7d724f
35eb3e9
 
 
 
9053a94
35eb3e9
9053a94
b7d724f
9053a94
b7d724f
 
35eb3e9
b7d724f
 
9053a94
b7d724f
 
9053a94
35eb3e9
f849f04
 
 
 
b7d724f
9053a94
 
 
 
 
 
 
 
 
35eb3e9
 
9053a94
 
 
 
 
 
 
 
 
 
 
 
35eb3e9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from urllib.parse import urlparse
import re
import os, shutil

# Redirect all Hugging Face caches to /tmp so the persistent storage of the
# Space never exceeds its quota (~50 GB).
# HF_HOME is the current umbrella variable; TRANSFORMERS_CACHE and
# SENTENCE_TRANSFORMERS_HOME are kept for older library versions that still
# read them (TRANSFORMERS_CACHE is deprecated in transformers v4.x+).
os.environ["HF_HOME"] = "/tmp"
os.environ["TRANSFORMERS_CACHE"] = "/tmp"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp"

# Best-effort cleanup of stale caches left over from a previous run when the
# Space restarts. ignore_errors: missing dirs / permission failures are fine
# to skip; the model is re-downloaded below on first use anyway.
for folder in ["/root/.cache", "/tmp"]:
    shutil.rmtree(folder, ignore_errors=True)

# --- Page chrome: title, byline, and the two CSV upload widgets -----------
st.set_page_config(page_title="404 Redirect Mapper", layout="wide")
st.title("πŸ” 404 Redirect Mapper")
st.markdown("""by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
""", unsafe_allow_html=True)

# === Upload Section
st.markdown("### πŸ“ Step 1: Upload CSV Files")

# Side-by-side uploaders: broken (404) URLs on the left, live (200) on the right.
upload_left, upload_right = st.columns(2)
with upload_left:
    file_404 = st.file_uploader("Upload 404 URLs CSV", type="csv")
with upload_right:
    file_200 = st.file_uploader("Upload 200 URLs CSV", type="csv")

# === Utility: Clean URL path for embedding, not for output
# === Utility: Clean URL path for embedding, not for output
def clean_url_path(url):
    """Normalize a URL's path into lowercase words for the embedding model.

    Drops scheme/domain/query, strips a trailing page extension
    (.html, .htm, .php, .aspx, .jsp — lowercase only), trims slashes, and
    replaces hyphens/underscores with spaces so the path reads like natural
    language. Only used to build the embedding text; the original URL is
    preserved for output.

    Returns "" if the URL cannot be parsed.
    """
    try:
        parsed = urlparse(url)
        path = parsed.path or ""
        # "/about-us.html" -> "/about-us"
        path = re.sub(r"\.(html|htm|php|aspx|jsp)$", "", path)
        path = path.strip("/")
        return path.replace("-", " ").replace("_", " ").lower()
    except Exception:
        # The original used a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit; Exception is the safe upper bound.
        return ""

# === When both files are uploaded: column selection, embedding, and matching
if file_404 and file_200:
    df_404 = pd.read_csv(file_404)
    df_200 = pd.read_csv(file_200)

    st.success("βœ… Files loaded!")

    # === Column selection (URLs + text fields)
    st.markdown("### 🧠 Step 2: Select Columns")

    st.markdown("#### πŸ”΄ For 404 URLs")
    col1, col2 = st.columns(2)
    with col1:
        url_col_404 = st.selectbox("Select full URL column (404):", df_404.columns.tolist())
    with col2:
        text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())

    st.markdown("#### 🟒 For 200 URLs")
    col3, col4 = st.columns(2)
    with col3:
        url_col_200 = st.selectbox("Select full URL column (200):", df_200.columns.tolist())
    with col4:
        text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())

    if url_col_404 and url_col_200 and text_cols_404 and text_cols_200:
        if st.button("πŸš€ Run Matching"):
            status_msg = st.empty()

            def prepare(df, url_col, text_cols):
                """Return (original URLs, combined embedding text) per row.

                BUG FIX: fillna() must run BEFORE astype(str). The original
                called astype(str) first, which converts NaN to the literal
                string "nan", so the subsequent fillna("MISSING")/fillna("")
                never matched and "nan" leaked into the embedding text.
                """
                df = df.copy()
                urls = df[url_col].fillna("MISSING").astype(str)
                cleaned_paths = urls.apply(clean_url_path)
                df_text = df[text_cols].fillna("").astype(str)
                # Embedding input = cleaned URL path + all selected text fields.
                combined = cleaned_paths + " " + df_text.apply(" ".join, axis=1)
                return urls.tolist(), combined.tolist()

            urls_404, text_404 = prepare(df_404, url_col_404, text_cols_404)
            urls_200, text_200 = prepare(df_200, url_col_200, text_cols_200)

            status_msg.info("πŸ”„ Generating embeddings...")
            model = SentenceTransformer("all-MiniLM-L6-v2")
            emb_404 = model.encode(text_404, show_progress_bar=True, batch_size=32)
            emb_200 = model.encode(text_200, show_progress_bar=True, batch_size=32)

            status_msg.empty()

            # Pairwise cosine similarity: rows = 404 pages, cols = 200 pages.
            sim_matrix = cosine_similarity(emb_404, emb_200)

            # Keep the best top_k candidates per 404 URL (fewer when the
            # 200-URL list has under 3 entries).
            top_k = min(3, len(urls_200))
            matches = []
            for i, row in enumerate(sim_matrix):
                # argsort ascends, so take the last top_k and reverse:
                # rank 1 = highest similarity.
                top_idx = np.argsort(row)[-top_k:][::-1]
                for rank, j in enumerate(top_idx, start=1):
                    matches.append({
                        "404 URL": urls_404[i],
                        "Matched 200 URL": urls_200[j],
                        # float() unwraps the numpy scalar for clean CSV output.
                        "Similarity Score": round(float(row[j]), 4),
                        "Rank": rank
                    })

            match_df = pd.DataFrame(matches)
            st.markdown("### πŸ” Top Matches")
            st.dataframe(match_df)

            csv = match_df.to_csv(index=False).encode('utf-8')
            st.download_button("πŸ“₯ Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")

    else:
        st.info("πŸ‘‰ Please select 1 URL column + at least 1 text field **for each file**.")