Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -37,32 +37,43 @@ if file_404 and file_200:
|
|
| 37 |
|
| 38 |
st.success("β
Files loaded!")
|
| 39 |
|
| 40 |
-
# Column selection
|
| 41 |
-
st.markdown("### π§ Step 2: Select Columns
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
if st.button("π Run Matching"):
|
| 47 |
-
st.
|
| 48 |
|
| 49 |
# Clean + combine text for each row
|
| 50 |
-
def prepare_text(df):
|
| 51 |
df = df.copy()
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
df_text = df[selected_cols].astype(str).fillna("")
|
| 57 |
-
return df_text.apply(lambda row: " ".join(row), axis=1).tolist()
|
| 58 |
|
| 59 |
-
text_404 = prepare_text(df_404)
|
| 60 |
-
text_200 = prepare_text(df_200)
|
| 61 |
|
| 62 |
-
|
| 63 |
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
| 64 |
emb_404 = model.encode(text_404, show_progress_bar=True)
|
| 65 |
emb_200 = model.encode(text_200, show_progress_bar=True)
|
|
|
|
| 66 |
|
| 67 |
# Similarity matrix
|
| 68 |
sim_matrix = cosine_similarity(emb_404, emb_200)
|
|
@@ -74,8 +85,8 @@ if file_404 and file_200:
|
|
| 74 |
top_idx = np.argsort(row)[-top_k:][::-1]
|
| 75 |
for rank, j in enumerate(top_idx, start=1):
|
| 76 |
matches.append({
|
| 77 |
-
"404 URL":
|
| 78 |
-
"Matched 200 URL":
|
| 79 |
"Similarity Score": round(row[j], 4),
|
| 80 |
"Rank": rank
|
| 81 |
})
|
|
@@ -89,4 +100,4 @@ if file_404 and file_200:
|
|
| 89 |
st.download_button("π₯ Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
|
| 90 |
|
| 91 |
else:
|
| 92 |
-
st.info("π Please select at least
|
|
|
|
| 37 |
|
| 38 |
st.success("β
Files loaded!")
|
| 39 |
|
| 40 |
+
# === Column selection (URLs + text fields)
|
| 41 |
+
st.markdown("### π§ Step 2: Select Columns")
|
| 42 |
+
|
| 43 |
+
st.markdown("#### π΄ For 404 URLs")
|
| 44 |
+
col1, col2 = st.columns(2)
|
| 45 |
+
with col1:
|
| 46 |
+
url_col_404 = st.selectbox("Select URL column (404):", df_404.columns.tolist())
|
| 47 |
+
with col2:
|
| 48 |
+
text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
|
| 49 |
+
|
| 50 |
+
st.markdown("#### π’ For 200 URLs")
|
| 51 |
+
col3, col4 = st.columns(2)
|
| 52 |
+
with col3:
|
| 53 |
+
url_col_200 = st.selectbox("Select URL column (200):", df_200.columns.tolist())
|
| 54 |
+
with col4:
|
| 55 |
+
text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
|
| 56 |
+
|
| 57 |
+
if url_col_404 and url_col_200 and text_cols_404 and text_cols_200:
|
| 58 |
if st.button("π Run Matching"):
|
| 59 |
+
status_msg = st.empty()
|
| 60 |
|
| 61 |
# Clean + combine text for each row
def prepare_text(df, url_col, text_cols):
    """Prepare one DataFrame for embedding-based URL matching.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; a copy is taken, the caller's frame is not mutated.
    url_col : str
        Name of the column holding URLs; each value is normalised via the
        module-level ``clean_url`` helper.
    text_cols : list[str]
        Columns whose per-row string values are space-joined into one text.

    Returns
    -------
    tuple[list, list]
        ``(cleaned URLs, combined per-row texts)``, index-aligned with ``df``.
    """
    df = df.copy()
    df[url_col] = df[url_col].astype(str).apply(clean_url)
    # BUG FIX: fillna("") must run BEFORE astype(str) — astype(str) turns
    # NaN into the literal string "nan", which fillna then cannot catch,
    # polluting the embedding text with spurious "nan" tokens.
    df_text = df[text_cols].fillna("").astype(str)
    combined = df_text.apply(lambda row: " ".join(row), axis=1).tolist()
    return df[url_col].tolist(), combined
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
urls_404, text_404 = prepare_text(df_404, url_col_404, text_cols_404)
|
| 70 |
+
urls_200, text_200 = prepare_text(df_200, url_col_200, text_cols_200)
|
| 71 |
|
| 72 |
+
status_msg.info("π Generating embeddings...")
|
| 73 |
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
| 74 |
emb_404 = model.encode(text_404, show_progress_bar=True)
|
| 75 |
emb_200 = model.encode(text_200, show_progress_bar=True)
|
| 76 |
+
status_msg.empty() # β
Clear status message
|
| 77 |
|
| 78 |
# Similarity matrix
|
| 79 |
sim_matrix = cosine_similarity(emb_404, emb_200)
|
|
|
|
| 85 |
top_idx = np.argsort(row)[-top_k:][::-1]
|
| 86 |
for rank, j in enumerate(top_idx, start=1):
|
| 87 |
matches.append({
|
| 88 |
+
"404 URL": urls_404[i],
|
| 89 |
+
"Matched 200 URL": urls_200[j],
|
| 90 |
"Similarity Score": round(row[j], 4),
|
| 91 |
"Rank": rank
|
| 92 |
})
|
|
|
|
| 100 |
st.download_button("π₯ Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
|
| 101 |
|
| 102 |
else:
|
| 103 |
+
st.info("π Please select 1 URL column + at least 1 text field **for each file**.")
|