ohmygaugh committed on
Commit
1e03b0a
·
1 Parent(s): 7634b73

Fix: Copy full application code to app.py for Docker

Browse files
Files changed (1) hide show
  1. app.py +349 -10
app.py CHANGED
@@ -1,10 +1,349 @@
1
- import subprocess
2
- import sys
3
- import os
4
-
5
if __name__ == '__main__':
    # Start the Streamlit app with the same interpreter that runs this
    # wrapper, listening on all interfaces on port 7860.
    launch_cmd = [
        sys.executable, '-m', 'streamlit', 'run', 'visualize_ER_networks_from_csv.py',
        '--server.port=7860',
        '--server.address=0.0.0.0',
    ]
    # Propagate Streamlit's exit status: previously the result of
    # subprocess.run() was discarded, so this wrapper exited 0 even when the
    # app failed to start.
    sys.exit(subprocess.run(launch_cmd).returncode)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # streamlit run app.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import jellyfish # For quick string similarity (Levenshtein, Jaro, etc.)
6
+ import io
7
+ import uuid
8
+
9
+ from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
10
+
11
+ # Try to import networkx, fall back to manual implementation if not available
12
+ try:
13
+ import networkx as nx
14
+ HAS_NETWORKX = True
15
+ except ImportError:
16
+ HAS_NETWORKX = False
17
+
18
# ----------------------
# CONFIG
# ----------------------
DEFAULT_NODE_LABEL = "Record"  # label given to every node in the graph
DEFAULT_REL_TYPE = "SIMILAR"  # label given to every similarity edge
DEFAULT_THRESHOLD = 0.80  # initial position of the similarity slider
MAX_REDLINE_PREVIEW = 10  # number of top pairs shown in the red-line preview

st.set_page_config(
    page_title="CSV ER & Network Graph",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.title("Entity Resolution on CSV (Network Graph)")

# ----------------------
# SIDEBAR: CSV UPLOAD
# ----------------------
st.sidebar.header("Upload CSV for Entity Resolution")
uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type=["csv"])

# Slider arguments, in order: label, min, max, initial value, step.
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", 0.0, 1.0, DEFAULT_THRESHOLD, 0.01
)

# Columns whose values are compared when scoring a pair of rows.
st.sidebar.header("Similarity Columns")
# Pre-filled with the column names produced by 'create_mock_data_csv.py';
# the user can overwrite the list with whatever their CSV contains.
default_cols = "first_name,last_name,email_address,phone_number"
similarity_cols_raw = st.sidebar.text_input(
    "Columns to compare (comma-separated):",
    value=default_cols,
)
# Split on commas, trim whitespace, and drop empty entries.
similarity_cols = list(filter(None, map(str.strip, similarity_cols_raw.split(","))))

# Toggle for the character-level "red-lined" diff of the top pairs.
show_redlining = st.sidebar.checkbox("Show red-lined differences for top pairs", value=True)

# Placeholders, overwritten once a CSV is uploaded and processed.
df = None
elements = {"nodes": [], "edges": []}
64
+
65
+
66
+ # ----------------------
67
+ # UTILITY FUNCTIONS
68
+ # ----------------------
69
def jaro_winkler_score(str1, str2):
    """Return the Jaro-Winkler similarity of two strings.

    Thin wrapper around ``jellyfish.jaro_winkler_similarity``. A falsy
    argument (``None`` or ``""``) is replaced by the empty string before the
    comparison.
    """
    left = str1 if str1 else ""
    right = str2 if str2 else ""
    return jellyfish.jaro_winkler_similarity(left, right)
72
+
73
def _comparable_text(value):
    """Return a cell as lower-cased text, mapping missing values to ''."""
    # pandas represents empty CSV cells as NaN; str(NaN) would be the literal
    # text "nan", which must not take part in the comparison.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value).lower()


def overall_similarity(row1, row2, cols):
    """
    Compute the average Jaro-Winkler similarity across the given columns.

    Args:
        row1, row2: Row-like objects exposing ``.get(column, default)``
            (e.g. a ``dict`` or a pandas ``Series``).
        cols: Iterable of column names to compare.

    Returns:
        The mean of the per-column similarity scores, case-insensitively
        compared. A column is skipped when either side is missing
        (absent, ``None``, NaN) or empty. Returns ``0.0`` when no column
        could be compared.
    """
    scores = []
    for col in cols:
        val1 = _comparable_text(row1.get(col, ""))
        val2 = _comparable_text(row2.get(col, ""))
        if not val1 or not val2:
            # Nothing to compare on at least one side: do not let a missing
            # value count as evidence for (or against) a match.
            continue
        scores.append(jaro_winkler_score(val1, val2))
    if not scores:
        return 0.0
    return sum(scores) / len(scores)
90
+
91
def redline_text(str1, str2):
    """
    Build a simplistic HTML "red-lining" of ``str1`` against ``str2``.

    The strings are compared position by position. Characters of ``str1``
    that match ``str2`` are emitted as-is; mismatching characters are wrapped
    in a red ``<span>``. Where ``str1`` has no character at a position
    (``str2`` is longer) a red ``_`` placeholder is emitted. Characters that
    exist only in ``str2`` are not shown.

    Every emitted character is HTML-escaped: the result is rendered with
    ``unsafe_allow_html=True`` and the inputs come from an uploaded CSV, so
    raw ``<``, ``>`` or ``&`` must not reach the page as markup.

    Returns:
        The HTML fragment as a single string ('' for two empty inputs).
    """
    from html import escape  # local import: only this function needs it

    pieces = []
    for i in range(max(len(str1), len(str2))):
        # Slicing past the end yields "" instead of raising IndexError.
        c1 = str1[i:i + 1]
        c2 = str2[i:i + 1]
        if c1 == c2:
            pieces.append(escape(c1))
        else:
            pieces.append(f"<span style='color:red'>{escape(c1) or '_'}</span>")
    return "".join(pieces)
113
+
114
def find_connected_components_manual(nodes, edges):
    """
    Find the connected components of an undirected graph.

    Fallback used when NetworkX is not available.

    Args:
        nodes: Iterable of hashable node ids.
        edges: Iterable of ``{"data": {"source": id, "target": id, ...}}``
            dicts. Edges are treated as undirected.

    Returns:
        A list of sets of node ids, one set per component, ordered by the
        first appearance of a member in ``nodes``. Isolated nodes form
        single-element components.

    Raises:
        KeyError: If an edge references a node id that is not in ``nodes``.
    """
    # Build adjacency list
    adjacency = {node: set() for node in nodes}
    for edge in edges:
        source = edge["data"]["source"]
        target = edge["data"]["target"]
        adjacency[source].add(target)
        adjacency[target].add(source)

    visited = set()
    components = []

    for start in nodes:
        if start in visited:
            continue
        # Depth-first traversal with an explicit stack: a recursive walk
        # exceeds Python's recursion limit on long chains of linked records.
        component = set()
        stack = [start]
        visited.add(start)
        while stack:
            current = stack.pop()
            component.add(current)
            for neighbor in adjacency[current]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    stack.append(neighbor)
        components.append(component)

    return components
146
+
147
+
148
+ # ----------------------
149
+ # LOAD CSV & PROCESS
150
+ # ----------------------
151
def _cell_text(value):
    """Return a CSV cell as display text, mapping missing values (None/NaN) to ''."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


if uploaded_file is not None:
    st.markdown("### Preview of Uploaded CSV Data")
    df = pd.read_csv(uploaded_file)
    st.dataframe(df.head(10))

    # NOTE(review): st.button is presumably True only on the rerun triggered
    # by the click, so everything below disappears on the next rerun (e.g.
    # when another widget changes) — consider st.session_state if the results
    # should persist. TODO confirm against the deployed Streamlit version.
    if st.button("Run Entity Resolution"):
        # Materialise each row once. Rows are addressed by position from here
        # on, so node ids, edge endpoints and community lookups always agree
        # and the O(n^2) pair loop does not re-slice the dataframe.
        rows = [row for _, row in df.iterrows()]

        # STEP 1: Generate nodes
        # One node per row, carrying all row data as properties.
        nodes = []
        for pos, row in enumerate(rows):
            # Missing cells are passed as None rather than NaN; presumably the
            # graph component receives this data as JSON, which has no NaN.
            node_data = {
                key: (None if pd.api.types.is_scalar(val) and pd.isna(val) else val)
                for key, val in row.to_dict().items()
            }
            node_data["id"] = str(pos)  # row position is the unique ID
            node_data["label"] = DEFAULT_NODE_LABEL
            # "name" is the short caption drawn on the node: the person's
            # name when available, otherwise the row number.
            first_name = _cell_text(row.get("first_name", ""))
            last_name = _cell_text(row.get("last_name", ""))
            short_label = f"{first_name} {last_name}".strip()
            node_data["name"] = short_label or f"Row-{pos}"
            nodes.append({"data": node_data})

        # STEP 2: Pairwise similarity for edges
        # Naive all-pairs comparison. For large data, you'd do blocking.
        edges = []
        scored_pairs = []  # (unrounded similarity, i, j) for the red-line preview
        for i, left_row in enumerate(rows):
            for j in range(i + 1, len(rows)):
                sim = overall_similarity(left_row, rows[j], similarity_cols)
                if sim >= similarity_threshold:
                    edges.append({
                        "data": {
                            "id": f"edge_{i}_{j}",
                            "source": str(i),
                            "target": str(j),
                            "label": DEFAULT_REL_TYPE,
                            "similarity": round(sim, 3),
                        }
                    })
                    scored_pairs.append((sim, i, j))

        elements = {"nodes": nodes, "edges": edges}
        st.success("Entity Resolution complete! Network graph built.")

        # ------------
        # Visualization
        st.markdown("### Network Graph")
        node_labels = {node["data"]["label"] for node in nodes}
        rel_labels = {edge["data"]["label"] for edge in edges}

        # Basic styling: one colour per node label, cycling through the palette.
        default_colors = ["#2A629A", "#FF7F3E", "#C0C0C0", "#008000", "#800080"]
        node_styles = [
            NodeStyle(
                label=label,
                color=default_colors[i % len(default_colors)],
                caption="name",
            )
            for i, label in enumerate(sorted(node_labels))
        ]
        edge_styles = [
            EdgeStyle(rel, caption="similarity", directed=False)
            for rel in sorted(rel_labels)
        ]

        st_link_analysis(
            elements,
            layout="cose",
            node_styles=node_styles,
            edge_styles=edge_styles,
        )

        # ------------
        # Community Detection & CSV Export
        st.markdown("### Community Detection Results")

        # Communities are the connected components of the similarity graph.
        if HAS_NETWORKX:
            G = nx.Graph()
            for node in nodes:
                G.add_node(node["data"]["id"])
            for edge in edges:
                G.add_edge(edge["data"]["source"], edge["data"]["target"])
            communities = list(nx.connected_components(G))
        else:
            # Use manual implementation as fallback
            st.info("NetworkX not found. Using manual connected components algorithm. Install NetworkX for better performance: `pip install networkx`")
            node_ids = [node["data"]["id"] for node in nodes]
            communities = find_connected_components_manual(node_ids, edges)

        # Give every community a fresh UUID and map each node id to it.
        node_to_community = {}
        community_uuids = []
        for community in communities:
            community_uuid = str(uuid.uuid4())
            community_uuids.append(community_uuid)
            for node_id in community:
                node_to_community[node_id] = community_uuid

        # Add community IDs to a copy of the original dataframe. Every row is
        # a node and every node belongs to exactly one component, so the
        # lookup by row position always succeeds.
        df_with_communities = df.copy()
        df_with_communities['community_id'] = [
            node_to_community[str(pos)] for pos in range(len(df_with_communities))
        ]

        st.write(f"**Found {len(communities)} communities:**")
        for number, (community, community_uuid) in enumerate(
            zip(communities, community_uuids), start=1
        ):
            st.write(f"- Community {number}: {len(community)} records (UUID: {community_uuid})")

        # Show the results dataframe
        st.markdown("#### Results with Community IDs")
        st.dataframe(df_with_communities)

        # CSV Export option
        st.markdown("#### Export Results")
        csv_data = df_with_communities.to_csv(index=False)

        st.download_button(
            label="📥 Download Results as CSV",
            data=csv_data,
            file_name="entity_resolution_results.csv",
            mime="text/csv"
        )

        # ------------
        # Red-lining (moved to bottom as lower priority)
        if show_redlining and edges:
            st.markdown("### Top Similar Pairs (Red-Lined Differences)")

            # Only columns that actually exist in the CSV can be diffed; a
            # requested column that is absent would otherwise raise KeyError.
            diff_cols = [col for col in similarity_cols if col in df.columns]

            # Drop exact matches, judged on the unrounded score so that a
            # near-exact pair is not discarded just because it rounds to 1.0.
            inexact_pairs = [pair for pair in scored_pairs if pair[0] < 1.0]

            # Closest matches first.
            top_pairs = sorted(inexact_pairs, key=lambda pair: pair[0], reverse=True)[:MAX_REDLINE_PREVIEW]

            if not top_pairs:
                st.info("No slightly different pairs found; all matches are exact or none meet the threshold.")
            else:
                for sim, s_idx, t_idx in top_pairs:
                    sim_val = round(sim, 3)
                    st.markdown(f"**Pair:** Row {s_idx} ↔ Row {t_idx}, **similarity**={sim_val}")

                    # Collect the compared columns whose values differ
                    # (case-insensitively) between the two rows.
                    mismatch_cols = []
                    for col in diff_cols:
                        val1 = _cell_text(df[col].iloc[s_idx])
                        val2 = _cell_text(df[col].iloc[t_idx])
                        if val1.lower() != val2.lower():
                            mismatch_cols.append((col, val1, val2))

                    if mismatch_cols:
                        st.write("Differences in the following columns:")
                        for col_name, str1, str2 in mismatch_cols:
                            redlined = redline_text(str1, str2)
                            st.markdown(f"&nbsp;&nbsp;**{col_name}:** {redlined}", unsafe_allow_html=True)
                    else:
                        st.write("No differences in the compared columns.")

                    st.markdown("---")

        # ------------
        # Enterprise Scale Note
        st.markdown("---")
        st.markdown("### 📈 Enterprise Scale Solutions")

        if not HAS_NETWORKX:
            st.warning("""
            **Missing NetworkX Dependency**

            For better performance, install NetworkX:
            ```bash
            pip install networkx
            ```
            """)

        st.info("""
        **Need help with larger scale deployments?**

        If you need to persist UUIDs from run to run, handle larger datasets, or require more sophisticated
        entity resolution capabilities, you may need an enterprise-scale solution. Consider:

        - **Database Integration**: Store community IDs in a persistent database
        - **Incremental Processing**: Handle new data without re-processing everything
        - **Advanced Blocking**: Use more sophisticated blocking strategies for large datasets
        - **Distributed Computing**: Scale across multiple machines for very large datasets
        - **Custom ML Models**: Train domain-specific models for better accuracy

        Contact **Eastridge Analytics** for guidance on enterprise implementations.
        """)

else:
    st.info("Please upload a CSV file in the sidebar to begin.")