Spaces:

dsleo
/

math-dedup

Build error

App Files Files Community

dsleo committed on Feb 6, 2025

Commit

17186a1

verified ·

1 Parent(s): 7362def

v2

Browse files

Files changed (1) hide show

app.py +185 -116

app.py CHANGED Viewed

@@ -9,98 +9,110 @@ from sentence_transformers import SentenceTransformer, util
 from loguru import logger
 # ================== CONFIGURATION ==================
-st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")
-# Load a pre-trained model for embeddings
-MODEL_NAME = "all-MiniLM-L6-v2"
-model = SentenceTransformer(MODEL_NAME)
 # Load preloaded dataset
 @st.cache_data
 def load_data():
-    file_path = "data/merged_dataset.csv.zip"
-    with zipfile.ZipFile(file_path, 'r') as zip_ref:
-        zip_ref.printdir()
-        zip_ref.extractall("data/extracted")
-        df = pd.read_csv("data/extracted/merged_dataset.csv")
-    return df
-df = load_data()
-display_columns = ["uuid","problem", "source", "question_type", "problem_type"]
-df = df[display_columns]
-# ================== FUNCTION DEFINITIONS ==================
 def compute_embeddings(problems):
-    """Compute sentence embeddings."""
-    return model.encode(problems, normalize_embeddings=True)
-def find_similar_problems(df, similarity_threshold=0.9):
     """Find similar problems using cosine similarity, optimized for speed."""
-    status_msgs = []  # Store status messages to clear later
-    # Step 1: Compute embeddings
-    msg = st.status("🔄 Computing problem embeddings...")
-    status_msgs.append(msg)
-    start_time = time.time()
     embeddings = compute_embeddings(df['problem'].tolist())
-    # Step 2: Compute similarity matrix
-    msg = st.status("🔄 Computing cosine similarity matrix...")
-    status_msgs.append(msg)
     similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
-    # Step 3: Filter top similarities
-    msg = st.status("🔄 Filtering similar problems...")
-    status_msgs.append(msg)
     num_problems = len(df)
     upper_triangle_indices = np.triu_indices(num_problems, k=1)
-    i_indices, j_indices = upper_triangle_indices
-    similarity_scores = similarity_matrix[i_indices, j_indices]
     # Filter based on threshold
     mask = similarity_scores > similarity_threshold
-    filtered_i = i_indices[mask]
-    filtered_j = j_indices[mask]
-    filtered_scores = similarity_scores[mask]
     pairs = [
-        (df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score))
-        for i, j, score in zip(filtered_i, filtered_j, filtered_scores)
     ]
-    sorted_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)
-    # Step 4: Remove intermediate messages
-    for msg in status_msgs:
-        msg.empty()  # Clear only the intermediate messages
-    # Step 5: Display final success message
-    st.success(f"✅ Analysis complete! Found {len(sorted_pairs)} similar problems in {time.time() - start_time:.2f}s", icon="🎉")
-    return sorted_pairs
-def analyze_clusters(df, similarity_threshold=0.9):
-    """Analyze duplicate problem clusters."""
-    pairs = find_similar_problems(df, similarity_threshold)
     detailed_analysis = []
     for base_uuid, comp_uuid, score in pairs:
-        base_row = df[df["uuid"] == base_uuid].iloc[0]
-        comp_row = df[df["uuid"] == comp_uuid].iloc[0]
-        column_differences = {}
-        for col in df.columns:
-            if col != "uuid":
-                base_val = base_row[col]
-                comp_val = comp_row[col]
-                column_differences[col] = {
-                    'base': base_val,
-                    'comparison': comp_val,
-                    'match': bool(base_val == comp_val)
-                }
         detailed_analysis.append({
             'base_uuid': base_uuid,
             'comp_uuid': comp_uuid,
@@ -110,61 +122,118 @@ def analyze_clusters(df, similarity_threshold=0.9):
     return detailed_analysis
 # ================== STREAMLIT UI ==================
-st.title("🔍 Problem Deduplication Explorer")
-st.sidebar.header("Settings")
-similarity_threshold = st.sidebar.slider(
-    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
-)
-# Display first 5 rows of dataset
-st.subheader("📄 Explore the Dataset")
-st.dataframe(df.head(5))
-if st.sidebar.button("Run Deduplication Analysis"):
-    with st.spinner("Analyzing..."):
-        results = analyze_clusters(df, similarity_threshold)
-    st.success("Analysis Complete!")
-    st.subheader("📊 Duplicate Problem Pairs")
-    # Filtering options
-    sources = df["source"].unique().tolist()
-    question_types = df["question_type"].unique().tolist()
-    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
-    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)
-    if selected_source:
-        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
-    if selected_qtype:
-        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
-    # Display top 5 initially
-    num_display = 5
-    shown_results = results[:num_display]
-    for entry in shown_results:
-        base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
-        similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
-        st.markdown(f"### Problem: {base_problem}")
-        st.write(f"**Similar to:** {similar_problem}")
-        st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
-        with st.expander("Show Column Differences"):
-            st.json(entry["column_differences"])
-        st.markdown("---")
-    if len(results) > num_display:
-        if st.button("Show More Results"):
-            extra_results = results[num_display:num_display * 2]
-            for entry in extra_results:
-                base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
-                similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
-                st.markdown(f"### Problem: {base_problem}")
-                st.write(f"**Similar to:** {similar_problem}")
-                st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
-                with st.expander("Show Column Differences"):
                     st.json(entry["column_differences"])
                 st.markdown("---")

 from loguru import logger
 # ================== CONFIGURATION ==================
+st.set_page_config(
+    page_title="Problem Deduplication Explorer",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Load a pre-trained model for embeddings with HF caching
+@st.cache_resource
+def load_model():
+    model_name = "sentence-transformers/all-MiniLM-L6-v2"
+    try:
+        return SentenceTransformer(model_name, cache_folder="/tmp/sentence_transformers")
+    except Exception as e:
+        st.error(f"Error loading model: {e}")
+        return None
+model = load_model()
 # Load preloaded dataset
 @st.cache_data
 def load_data():
+    try:
+        file_path = "data/merged_dataset.csv.zip"
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            with zip_ref.open(zip_ref.namelist()[0]) as file:
+                df = pd.read_csv(file)
+        return df[["uuid", "problem", "source", "question_type", "problem_type"]]
+    except Exception as e:
+        st.error(f"Error loading dataset: {e}")
+        # Return empty DataFrame with correct columns if loading fails
+        return pd.DataFrame(columns=["uuid", "problem", "source", "question_type", "problem_type"])
+# Cache embeddings computation with error handling
+@st.cache_data
 def compute_embeddings(problems):
+    """Compute and cache sentence embeddings."""
+    try:
+        return model.encode(problems, normalize_embeddings=True)
+    except Exception as e:
+        st.error(f"Error computing embeddings: {e}")
+        return np.array([])
+# ================== FUNCTION DEFINITIONS ==================
+def find_similar_problems(df, similarity_threshold=0.9, progress_bar=None):
     """Find similar problems using cosine similarity, optimized for speed."""
+    if df.empty:
+        return []
+    # Compute embeddings with progress tracking
     embeddings = compute_embeddings(df['problem'].tolist())
+    if embeddings.size == 0:
+        return []
+    if progress_bar:
+        progress_bar.progress(0.33, "Computing similarity matrix...")
+    # Compute similarity matrix
     similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
+    if progress_bar:
+        progress_bar.progress(0.66, "Finding similar pairs...")
+    # Use numpy operations for better performance
     num_problems = len(df)
     upper_triangle_indices = np.triu_indices(num_problems, k=1)
+    similarity_scores = similarity_matrix[upper_triangle_indices]
     # Filter based on threshold
     mask = similarity_scores > similarity_threshold
+    filtered_indices = np.where(mask)[0]
     pairs = [
+        (df.iloc[upper_triangle_indices[0][i]]["uuid"],
+         df.iloc[upper_triangle_indices[1][i]]["uuid"],
+         float(similarity_scores[i]))
+        for i in filtered_indices
     ]
+    if progress_bar:
+        progress_bar.progress(1.0, "Analysis complete!")
+        time.sleep(0.5)
+        progress_bar.empty()
+    return sorted(pairs, key=lambda x: x[2], reverse=True)
+@st.cache_data
+def analyze_clusters(_df, pairs):
+    """Analyze duplicate problem clusters with caching."""
+    if not pairs or _df.empty:
+        return []
     detailed_analysis = []
     for base_uuid, comp_uuid, score in pairs:
+        base_row = _df[_df["uuid"] == base_uuid].iloc[0]
+        comp_row = _df[_df["uuid"] == comp_uuid].iloc[0]
+        column_differences = {
+            col: {
+                'base': base_row[col],
+                'comparison': comp_row[col],
+                'match': bool(base_row[col] == comp_row[col])
+            }
+            for col in _df.columns if col != "uuid"
+        }
         detailed_analysis.append({
             'base_uuid': base_uuid,
             'comp_uuid': comp_uuid,
     return detailed_analysis
 # ================== STREAMLIT UI ==================
+def main():
+    st.title("🔍 Problem Deduplication Explorer")
+    # Check if model loaded successfully
+    if model is None:
+        st.error("Failed to load the model. Please try again later.")
+        return
+    # Initialize session state for pagination
+    if 'page_number' not in st.session_state:
+        st.session_state.page_number = 0
+    # Sidebar configuration
+    with st.sidebar:
+        st.header("Settings")
+        similarity_threshold = st.slider(
+            "Similarity Threshold",
+            min_value=0.5,
+            max_value=1.0,
+            value=0.9,
+            step=0.01,
+            help="Higher values mean more similar problems"
+        )
+        items_per_page = st.select_slider(
+            "Items per page",
+            options=[5, 10, 20, 50],
+            value=10,
+            help="Number of results to show per page"
+        )
+    # Load and display dataset
+    df = load_data()
+    if df.empty:
+        st.error("Failed to load the dataset. Please check if the data file exists in the correct location.")
+        return
+    with st.expander("📄 Dataset Preview", expanded=False):
+        st.dataframe(
+            df.head(),
+            use_container_width=True,
+            hide_index=True
+        )
+    # Analysis section
+    if st.sidebar.button("Run Deduplication Analysis", type="primary"):
+        progress_bar = st.progress(0, "Starting analysis...")
+        # Run analysis
+        pairs = find_similar_problems(df, similarity_threshold, progress_bar)
+        results = analyze_clusters(df, pairs)
+        if not results:
+            st.warning("No similar problems found with the current threshold.")
+            return
+        # Filtering options
+        sources = sorted(df["source"].unique().tolist())
+        question_types = sorted(df["question_type"].unique().tolist())
+        col1, col2 = st.columns(2)
+        with col1:
+            selected_source = st.selectbox("Filter by Source", [None] + sources)
+        with col2:
+            selected_qtype = st.selectbox("Filter by Question Type", [None] + question_types)
+        # Apply filters
+        if selected_source:
+            results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
+        if selected_qtype:
+            results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
+        if not results:
+            st.warning("No results found with the current filters.")
+            return
+        # Pagination
+        total_pages = len(results) // items_per_page
+        col1, col2, col3 = st.columns([1, 3, 1])
+        with col1:
+            if st.button("← Previous", disabled=st.session_state.page_number <= 0):
+                st.session_state.page_number -= 1
+        with col2:
+            st.write(f"Page {st.session_state.page_number + 1} of {total_pages + 1}")
+        with col3:
+            if st.button("Next →", disabled=st.session_state.page_number >= total_pages):
+                st.session_state.page_number += 1
+        # Display results
+        start_idx = st.session_state.page_number * items_per_page
+        end_idx = start_idx + items_per_page
+        page_results = results[start_idx:end_idx]
+        for entry in page_results:
+            with st.container():
+                col1, col2 = st.columns([1, 1])
+                with col1:
+                    st.markdown("### Original Problem")
+                    st.info(df[df["uuid"] == entry["base_uuid"]]["problem"].values[0])
+                with col2:
+                    st.markdown("### Similar Problem")
+                    st.info(df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0])
+                st.metric("Similarity Score", f"{entry['similarity_score']:.4f}")
+                with st.expander("Show Details"):
                     st.json(entry["column_differences"])
                 st.markdown("---")
+if __name__ == "__main__":
+    main()