Flopot2 committed on
Commit
b7d724f
·
verified Β·
1 Parent(s): 9f2cfe8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -16
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
@@ -20,8 +21,8 @@ with col1:
20
  with col2:
21
  file_200 = st.file_uploader("Upload 200 URLs CSV", type="csv")
22
 
23
- # === Utility Function: Strip URL to path
24
- def clean_url(url):
25
  try:
26
  parsed = urlparse(url)
27
  path = parsed.path or ""
@@ -29,7 +30,7 @@ def clean_url(url):
29
  path = path.strip("/")
30
  return path.replace("-", " ").replace("_", " ").lower()
31
  except:
32
- return url
33
 
34
  # === When both files are uploaded
35
  if file_404 and file_200:
@@ -44,14 +45,14 @@ if file_404 and file_200:
44
  st.markdown("#### 🔴 For 404 URLs")
45
  col1, col2 = st.columns(2)
46
  with col1:
47
- url_col_404 = st.selectbox("Select URL column (404):", df_404.columns.tolist())
48
  with col2:
49
  text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
50
 
51
  st.markdown("#### 🟢 For 200 URLs")
52
  col3, col4 = st.columns(2)
53
  with col3:
54
- url_col_200 = st.selectbox("Select URL column (200):", df_200.columns.tolist())
55
  with col4:
56
  text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
57
 
@@ -59,27 +60,25 @@ if file_404 and file_200:
59
  if st.button("🚀 Run Matching"):
60
  status_msg = st.empty()
61
 
62
- # Clean + combine text for each row
63
- def prepare_text(df, url_col, text_cols):
64
  df = df.copy()
65
- df[url_col] = df[url_col].astype(str).apply(clean_url)
 
66
  df_text = df[text_cols].astype(str).fillna("")
67
- combined = df_text.apply(lambda row: " ".join(row), axis=1).tolist()
68
- return df[url_col].tolist(), combined
69
 
70
- urls_404, text_404 = prepare_text(df_404, url_col_404, text_cols_404)
71
- urls_200, text_200 = prepare_text(df_200, url_col_200, text_cols_200)
72
 
73
  status_msg.info("🔄 Generating embeddings...")
74
  model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
75
  emb_404 = model.encode(text_404, show_progress_bar=True)
76
  emb_200 = model.encode(text_200, show_progress_bar=True)
77
- status_msg.empty() # ✅ Clear status message
78
 
79
- # Similarity matrix
80
  sim_matrix = cosine_similarity(emb_404, emb_200)
81
 
82
- # Get top 3 matches per 404
83
  top_k = 3
84
  matches = []
85
  for i, row in enumerate(sim_matrix):
@@ -96,7 +95,6 @@ if file_404 and file_200:
96
  st.markdown("### 🔍 Top Matches")
97
  st.dataframe(match_df)
98
 
99
- # Download button
100
  csv = match_df.to_csv(index=False).encode('utf-8')
101
  st.download_button("📥 Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
102
 
 
1
+
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
 
21
  with col2:
22
  file_200 = st.file_uploader("Upload 200 URLs CSV", type="csv")
23
 
24
+ # === Utility: Clean URL path for embedding, not for output
25
+ def clean_url_path(url):
26
  try:
27
  parsed = urlparse(url)
28
  path = parsed.path or ""
 
30
  path = path.strip("/")
31
  return path.replace("-", " ").replace("_", " ").lower()
32
  except:
33
+ return ""
34
 
35
  # === When both files are uploaded
36
  if file_404 and file_200:
 
45
  st.markdown("#### 🔴 For 404 URLs")
46
  col1, col2 = st.columns(2)
47
  with col1:
48
+ url_col_404 = st.selectbox("Select full URL column (404):", df_404.columns.tolist())
49
  with col2:
50
  text_cols_404 = st.multiselect("Select text fields (404):", df_404.columns.tolist())
51
 
52
  st.markdown("#### 🟢 For 200 URLs")
53
  col3, col4 = st.columns(2)
54
  with col3:
55
+ url_col_200 = st.selectbox("Select full URL column (200):", df_200.columns.tolist())
56
  with col4:
57
  text_cols_200 = st.multiselect("Select text fields (200):", df_200.columns.tolist())
58
 
 
60
  if st.button("🚀 Run Matching"):
61
  status_msg = st.empty()
62
 
63
+ def prepare(df, url_col, text_cols):
 
64
  df = df.copy()
65
+ urls = df[url_col].astype(str).fillna("MISSING")
66
+ cleaned_paths = urls.apply(clean_url_path)
67
  df_text = df[text_cols].astype(str).fillna("")
68
+ combined = cleaned_paths + " " + df_text.apply(lambda row: " ".join(row), axis=1)
69
+ return urls.tolist(), combined.tolist()
70
 
71
+ urls_404, text_404 = prepare(df_404, url_col_404, text_cols_404)
72
+ urls_200, text_200 = prepare(df_200, url_col_200, text_cols_200)
73
 
74
  status_msg.info("🔄 Generating embeddings...")
75
  model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
76
  emb_404 = model.encode(text_404, show_progress_bar=True)
77
  emb_200 = model.encode(text_200, show_progress_bar=True)
78
+ status_msg.empty()
79
 
 
80
  sim_matrix = cosine_similarity(emb_404, emb_200)
81
 
 
82
  top_k = 3
83
  matches = []
84
  for i, row in enumerate(sim_matrix):
 
95
  st.markdown("### 🔍 Top Matches")
96
  st.dataframe(match_df)
97
 
 
98
  csv = match_df.to_csv(index=False).encode('utf-8')
99
  st.download_button("📥 Download Match Results as CSV", csv, "404_redirect_matches.csv", "text/csv")
100