Hugging Face Spaces commit page — app.py (CHANGED): fixes a build error caused by a JSON serialization issue. [Browse files]
|
@@ -3,7 +3,6 @@ import pandas as pd
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
from sentence_transformers import SentenceTransformer, util
|
| 6 |
-
from openai import OpenAI
|
| 7 |
from loguru import logger
|
| 8 |
|
| 9 |
# ================== CONFIGURATION ==================
|
|
def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity.

    Args:
        df: DataFrame with at least 'uuid' and 'problem' columns.
        similarity_threshold: minimum cosine similarity for two rows to be
            treated as duplicates.

    Returns:
        dict mapping a base uuid to a list of ``(uuid, score)`` tuples for
        every later row whose similarity exceeds the threshold.
    """
    embeddings = compute_embeddings(df['problem'].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    clusters = {}
    for i in range(len(df)):
        current_uuid = df["uuid"][i]
        # j starts at i + 1 so each duplicate pair is reported only once.
        similar_items = [
            # float() unwraps numpy.float32, which json/st.json cannot
            # serialize — this was the cause of the build error.
            (df["uuid"][j], float(similarity_matrix[i][j]))
            for j in range(i + 1, len(df))
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[current_uuid] = similar_items
    return clusters
|
| 48 |
|
| 49 |
def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters.

    Args:
        df: DataFrame with a 'uuid' column plus arbitrary data columns.
        similarity_threshold: forwarded to ``find_similar_problems``.

    Returns:
        dict mapping each base uuid to a list of per-duplicate dicts with
        'uuid', 'similarity_score', and 'column_differences' — all values
        native Python types, so the result is JSON-serializable.
    """
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}
    for key, values in clusters.items():
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []
        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]

            column_differences = {}
            for col in df.columns:
                if col != "uuid":
                    base_val = base_row[col]
                    comp_val = comparison_row[col]
                    # .item() converts numpy scalar types (int64, float32,
                    # bool_) to native Python so st.json/export do not fail.
                    if hasattr(base_val, 'item'):
                        base_val = base_val.item()
                    if hasattr(comp_val, 'item'):
                        comp_val = comp_val.item()
                    column_differences[col] = {
                        'base': base_val,
                        'comparison': comp_val,
                        # bool() guards against numpy.bool_ from == on
                        # array-backed values.
                        'match': bool(base_val == comp_val),
                    }
            cluster_details.append({
                'uuid': val,
                # float() converts a numpy.float32 score to a JSON-safe float.
                'similarity_score': float(score),
                'column_differences': column_differences,
            })
        detailed_analysis[key] = cluster_details
    return detailed_analysis
|
| 79 |
|
| 80 |
# ================== STREAMLIT UI ==================
|
|
@@ -88,22 +86,18 @@ similarity_threshold = st.sidebar.slider(
|
|
| 88 |
# Render the deduplication results when the sidebar button is pressed.
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)

    st.success("Analysis Complete!")

    st.subheader("📊 Duplicate Problem Clusters")
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])
            # NOTE(review): indentation was lost in extraction; the separator
            # is assumed to follow each similar entry — confirm against app.py.
            st.markdown("---")

# Export results
|
|
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
from sentence_transformers import SentenceTransformer, util
|
|
|
|
| 6 |
from loguru import logger
|
| 7 |
|
| 8 |
# ================== CONFIGURATION ==================
|
|
|
|
def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity.

    Pairs each row with every later row whose cosine similarity exceeds
    ``similarity_threshold``; scores are cast to plain ``float`` so the
    result stays JSON-serializable.
    """
    embeddings = compute_embeddings(df['problem'].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    clusters = {}
    for i in range(len(df)):
        current_uuid = df["uuid"][i]
        similar_items = [
            (df["uuid"][j], float(similarity_matrix[i][j]))  # Convert float32 to float
            for j in range(i + 1, len(df))
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[current_uuid] = similar_items
    return clusters
|
| 44 |
|
| 45 |
def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters.

    For each cluster found by ``find_similar_problems``, records a
    column-by-column comparison between the base row and every similar
    row. All values are converted to native Python types so the result
    can be passed to ``st.json`` / ``json.dumps`` directly.
    """
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}
    for key, values in clusters.items():
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []
        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]

            column_differences = {}
            for col in df.columns:
                if col != "uuid":
                    base_val = base_row[col]
                    comp_val = comparison_row[col]
                    # Convert numpy types to native Python types
                    if hasattr(base_val, 'item'):
                        base_val = base_val.item()
                    if hasattr(comp_val, 'item'):
                        comp_val = comp_val.item()
                    column_differences[col] = {
                        'base': base_val,
                        'comparison': comp_val,
                        'match': bool(base_val == comp_val)  # Convert numpy bool to Python bool
                    }
            cluster_details.append({
                'uuid': val,
                'similarity_score': float(score),  # Convert float32 to float
                'column_differences': column_differences,
            })
        detailed_analysis[key] = cluster_details
    return detailed_analysis
|
| 77 |
|
| 78 |
# ================== STREAMLIT UI ==================
|
|
|
|
| 86 |
# Streamlit results view: runs the analysis on demand and renders clusters.
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)

    st.success("Analysis Complete!")

    st.subheader("📊 Duplicate Problem Clusters")
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])
            # NOTE(review): original indentation was lost in extraction;
            # the divider is assumed per-entry — confirm against app.py.
            st.markdown("---")

# Export results
|