ohmygaugh committed on
Commit
68356aa
·
1 Parent(s): 95d3f21

Add Entity Resolution Network Analysis app

Browse files

- visualize_ER_networks_from_csv.py: Main Streamlit application
- requirements.txt: Dependencies including st-link-analysis for network viz
- create_mock_CSV_data.py: Demo data generator
- app.py: Entry point for Hugging Face Spaces

app.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import sys
3
+ import os
4
+
5
if __name__ == '__main__':
    # Launch the Streamlit app in a child interpreter, bound to the address and
    # port hard-coded below (app.py is the Hugging Face Spaces entry point).
    completed = subprocess.run([
        sys.executable, '-m', 'streamlit', 'run', 'visualize_ER_networks_from_csv.py',
        '--server.port=7860',
        '--server.address=0.0.0.0'
    ])
    # Propagate the child's exit status so a failed launch is not reported
    # as a successful run of this wrapper.
    sys.exit(completed.returncode)
create_mock_CSV_data.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ create_mock_CSV_data.py
5
+
6
+ Fetches random user data from randomuser.me (or a similar service) and creates
7
+ mock data in CSV format that imitates having multiple 'Profiles' and multiple
8
+ 'Identity' rows. Each row in the CSV represents an Identity and includes:
9
+ - a parent Profile ID and Profile name,
10
+ - first_name, last_name, birth_year, etc.,
11
+ - and possibly random typos in selected fields (based on a user-defined percentage).
12
+
13
+ Usage example:
14
+ python create_mock_CSV_data.py --num_profiles=100 --typo_percentage=10 --output_file="mock_data.csv"
15
+ """
16
+
17
+ import requests
18
+ import random
19
+ import logging
20
+ import argparse
21
+ import csv
22
+ import uuid
23
+ import numpy as np
24
+ # If you use the nicknames library: pip install nicknames
25
+ # from nicknames import NickNamer
26
+ # For demonstration, let's fallback gracefully if not installed.
27
+
28
+ logging.basicConfig(level=logging.INFO)
29
+ logger = logging.getLogger(__name__)
30
+
31
+ try:
32
+ from nicknames import NickNamer
33
+ NICKNAMES_AVAILABLE = True
34
+ except ImportError:
35
+ NICKNAMES_AVAILABLE = False
36
+ logger.warning("nicknames library is not installed. Nickname feature will be limited.")
37
+
38
+
39
def fetch_random_users(num_profiles, timeout=30):
    """
    Fetch random user data from the randomuser.me API.

    Args:
        num_profiles: Number of users to request. A value <= 0 returns an
            empty list without touching the network.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests will wait indefinitely on a stalled connection.

    Returns:
        The list stored under the "results" key of the JSON response
        (an empty list if the key is absent).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    if num_profiles <= 0:
        return []

    # Let requests build and encode the query string instead of formatting it by hand.
    response = requests.get(
        "https://randomuser.me/api/",
        params={"results": num_profiles, "nat": "us"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json().get("results", [])
49
+
50
+
51
class User:
    """
    Simple container for user data fetched from randomuser.me,
    plus logic for generating nicknames, emails, phone numbers,
    and introducing random typos.

    Attributes set by __init__ (all strings): first_name, last_name, nickname,
    birth_year, email_address, phone_number, street_number, street_name,
    city, state, country, zip_code. The raw API dict is kept as `user_data`
    and its "name" sub-dict as `name_data`.
    """

    # Shared NickNamer instance, created on first use so that it is built once
    # rather than once per user (presumably it loads its lookup data on construction).
    _nicknamer = None

    def __init__(self, user_data):
        """Populate the attributes from one randomuser.me result dict."""
        self.user_data = user_data

        # Extract basic info
        self.name_data = user_data.get("name", {})
        self.first_name = self.name_data.get("first", "Unknown")
        self.last_name = self.name_data.get("last", "Unknown")
        self.nickname = self._choose_nickname()

        dob = user_data.get("dob", {})
        self.birth_year = str(dob.get("date", "")[:4])  # 'YYYY-MM-DD...' -> 'YYYY'

        # Email address: random combination of first, last, year, etc.
        self.email_address = self._generate_email()

        # Phone number: just digits from the API phone.
        phone_raw = user_data.get("phone", "")
        self.phone_number = "".join(filter(str.isdigit, phone_raw))

        # Address fields
        location = user_data.get("location", {})
        street = location.get("street", {})
        self.street_number = str(street.get("number", ""))
        self.street_name = street.get("name", "")
        self.city = location.get("city", "")
        self.state = location.get("state", "")
        self.country = location.get("country", "")
        self.zip_code = str(location.get("postcode", ""))

    @property
    def full_name(self):
        """First and last name separated by a single space."""
        return f"{self.first_name} {self.last_name}"

    @property
    def full_address(self):
        """Address formatted as '<number> <street>, <city>, <state> <zip>'."""
        return f"{self.street_number} {self.street_name}, {self.city}, {self.state} {self.zip_code}"

    def _choose_nickname(self):
        """
        Uses the nicknames library if available, otherwise falls back to the first name.
        """
        if NICKNAMES_AVAILABLE:
            if User._nicknamer is None:
                User._nicknamer = NickNamer()
            possible_nicknames = User._nicknamer.nicknames_of(self.first_name)
            if possible_nicknames:
                return random.choice(list(possible_nicknames))
        return self.first_name

    def _generate_email(self):
        """
        Build a lower-case email address from random pieces of the name,
        an optional numeric suffix and a random provider domain.
        """
        domain = random.choice(["gmail", "yahoo", "hotmail", "outlook"])
        first_part = random.choice([self.first_name, self.nickname, self.first_name[:1]])
        last_part = random.choice([self.last_name, self.last_name[:1]])
        optional = random.choice(["", self.birth_year, self.birth_year[-2:], str(random.randint(1, 100))])
        # Multi-word names would otherwise put spaces into the local part.
        local_part = "".join(f"{first_part}{last_part}{optional}".split())
        return f"{local_part}@{domain}.com".lower()

    def add_typo(self, property_name):
        """
        Introduce a random typo into the specified property (e.g. 'first_name').
        If property_name == 'full_address', we randomly pick an address field to modify.
        Empty or non-string properties are left untouched.
        """
        if property_name == "full_address":
            property_name = random.choice(
                ["street_number", "street_name", "city", "state", "zip_code"]
            )

        current_value = getattr(self, property_name, None)
        if not current_value or not isinstance(current_value, str):
            return  # If it's empty or not a string, skip

        new_value = self._apply_random_typo(current_value)
        setattr(self, property_name, new_value)
        logger.debug("Applying typo: %s: '%s' -> '%s'", property_name, current_value, new_value)

    def _apply_random_typo(self, text):
        """
        Introduce a random single-character error (delete, swap, insert, replace)
        or regenerate an email.

        For non-email input the result always differs from `text`: a swap of two
        identical neighbours falls back to a replace, and a replace never picks
        the character that is already there. Digit-only input (phone, zip,
        birth year) receives digits rather than letters so it stays numeric.
        An empty string is returned unchanged.
        """
        if not text:
            return text

        option = random.choice(["delete", "swap", "insert", "replace"])

        # If email, 1 in 3 chance we fully regenerate it instead of editing it.
        if "@" in text and random.random() < 0.33:
            return self._generate_email()

        if len(text) == 1:
            # If we have only one character, we can only do replace or insert.
            option = random.choice(["insert", "replace"])

        index = random.randint(0, len(text) - 1)
        alphabet = "0123456789" if text.isdigit() else "abcdefghijklmnopqrstuvwxyz"

        if option == "delete":
            return text[:index] + text[index + 1:]

        if option == "swap" and index < len(text) - 1 and text[index] != text[index + 1]:
            chars = list(text)
            chars[index], chars[index + 1] = chars[index + 1], chars[index]
            return "".join(chars)

        if option == "insert":
            return text[:index] + random.choice(alphabet) + text[index:]

        # "replace", or a swap that could not change the string: substitute a
        # character that is guaranteed to differ from the current one.
        candidates = [char for char in alphabet if char != text[index]]
        return text[:index] + random.choice(candidates) + text[index + 1:]
179
+
180
+
181
def main(num_profiles, typo_percentage, output_file):
    """
    Generate mock entity-resolution data and write it to a CSV file.

    1) Fetch random user data from randomuser.me
    2) For each user, create 1..3 'Profile' IDs
    3) Spread 1..M 'Identity' rows across those profiles
    4) Introduce random typos in selected fields
    5) Write all Identity rows to CSV, including their associated Profile info

    Args:
        num_profiles: Number of users to request from the API.
        typo_percentage: Chance (0..100) that each Identity row after a user's
            first one mutates one field. Typos accumulate, because they are
            applied to the same User object that later rows read from.
        output_file: Path of the CSV file to (over)write.
    """
    # Attributes copied from the User object into every row, in column order.
    user_fields = [
        "first_name",
        "last_name",
        "nickname",
        "birth_year",
        "email_address",
        "phone_number",
        "street_number",
        "street_name",
        "city",
        "state",
        "country",
        "zip_code",
    ]
    fieldnames = ["profile_id", "profile_name", "identity_id"] + user_fields
    # Fields eligible for a typo ('full_address' is resolved to one address field by User.add_typo).
    typo_fields = ["first_name", "last_name", "email_address",
                   "phone_number", "full_address", "birth_year"]
    typo_probability = typo_percentage / 100.0

    logger.info(f"Generating mock data for {num_profiles} profiles...")
    api_data = fetch_random_users(num_profiles)

    rows_to_write = []

    for data in api_data:
        user = User(data)

        # Number of identities: normal distribution (mean=8, std=5) folded to
        # non-negative. Clamp to at least 1 so every fetched user contributes
        # a row; int() truncation can otherwise yield 0 and drop the user.
        num_ids = max(1, abs(int(np.random.normal(8, 5))))

        # Users with more than 4 identities may be split over 2 or 3 profiles.
        if num_ids > 4:
            num_profiles_for_user = random.choice([1, 1, 1, 2, 2, 3])
        else:
            num_profiles_for_user = 1

        profile_ids = [str(uuid.uuid4()) for _ in range(num_profiles_for_user)]
        # Captured before any typo is applied, so every profile keeps the clean name.
        profile_name = user.full_name

        profile_idx = 0
        for i in range(num_ids):
            # Advance to the next profile once the fraction of identities
            # emitted exceeds this profile's share.
            if (num_profiles_for_user > 1
                    and i / num_ids > (profile_idx + 1) / num_profiles_for_user):
                profile_idx += 1

            # The first identity of each user is always left clean.
            if i > 0 and random.random() < typo_probability:
                user.add_typo(random.choice(typo_fields))

            row = {
                "profile_id": profile_ids[profile_idx],
                "profile_name": profile_name,
                "identity_id": str(uuid.uuid4()),
            }
            row.update((field, getattr(user, field)) for field in user_fields)
            rows_to_write.append(row)

    logger.info(f"Writing {len(rows_to_write)} rows to {output_file}...")

    with open(output_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows_to_write)

    logger.info("Finished writing CSV mock data.")
283
+
284
+
285
def build_arg_parser():
    """
    Build the command-line parser for the mock data generator.

    argparse applies %-formatting to help strings, so a literal percent sign
    must be written as '%%'; an unescaped '%' makes help formatting raise
    ValueError.
    """
    parser = argparse.ArgumentParser(
        description="Generate mock entity-resolution data in CSV format."
    )
    parser.add_argument("--num_profiles", type=int, default=10,
                        help="Number of random 'users' to fetch from randomuser.me (default 10).")
    parser.add_argument("--typo_percentage", type=float, default=10.0,
                        help="Chance (0..100) that each new Identity row (beyond the first) has a random typo (default 10%%).")
    parser.add_argument("--output_file", type=str, default="mock_data.csv",
                        help="Output CSV filename (default 'mock_data.csv').")
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()
    main(args.num_profiles, args.typo_percentage, args.output_file)
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
1
+ streamlit
2
  pandas
3
+ numpy
4
+ jellyfish
5
+ st-link-analysis
6
+ networkx
visualize_ER_networks_from_csv.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # streamlit run visualize_ER_networks_from_csv.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import jellyfish # For quick string similarity (Levenshtein, Jaro, etc.)
6
+ import io
7
+ import uuid
8
+
9
+ from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
10
+
11
+ # Try to import networkx, fall back to manual implementation if not available
12
+ try:
13
+ import networkx as nx
14
+ HAS_NETWORKX = True
15
+ except ImportError:
16
+ HAS_NETWORKX = False
17
+
18
+ # ----------------------
19
+ # CONFIG
20
+ # ----------------------
21
DEFAULT_NODE_LABEL = "Record"   # label given to every node built from a CSV row
DEFAULT_REL_TYPE = "SIMILAR"    # label given to every similarity edge
DEFAULT_THRESHOLD = 0.80        # initial value of the similarity slider
MAX_REDLINE_PREVIEW = 10        # number of top edges shown in the red-lining section

st.set_page_config(
    page_title="CSV ER & Network Graph",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.title("Entity Resolution on CSV (Network Graph)")

# ----------------------
# SIDEBAR: CSV UPLOAD
# ----------------------
sidebar = st.sidebar
sidebar.header("Upload CSV for Entity Resolution")
uploaded_file = sidebar.file_uploader("Choose a CSV file", type=["csv"])

similarity_threshold = sidebar.slider(
    "Similarity Threshold",
    min_value=0.0,
    max_value=1.0,
    value=DEFAULT_THRESHOLD,
    step=0.01,
)

# Columns used for measuring similarity. The default matches columns written
# by the mock data generator (create_mock_CSV_data.py).
sidebar.header("Similarity Columns")
default_cols = "first_name,last_name,email_address,phone_number"
similarity_cols_raw = sidebar.text_input(
    "Columns to compare (comma-separated):",
    value=default_cols,
)
# Split on commas, trim whitespace and drop empty entries.
similarity_cols = [
    name for name in map(str.strip, similarity_cols_raw.split(",")) if name
]

# Whether to render the red-lined differences section for the top pairs.
show_redlining = sidebar.checkbox(
    "Show red-lined differences for top pairs", value=True
)

# Placeholders, overwritten once a CSV is uploaded and processed.
df = None
elements = {"nodes": [], "edges": []}
64
+
65
+
66
+ # ----------------------
67
+ # UTILITY FUNCTIONS
68
+ # ----------------------
69
def jaro_winkler_score(str1, str2):
    """Return jellyfish's Jaro-Winkler similarity of two strings, treating falsy inputs as ""."""
    left = str1 if str1 else ""
    right = str2 if str2 else ""
    return jellyfish.jaro_winkler_similarity(left, right)
72
+
73
def _comparable_text(value):
    """Return a trimmed, lower-cased string for a cell, or "" when the cell is missing (None/NaN)."""
    # NaN is the only float that is not equal to itself; pandas uses it for
    # empty CSV cells, and str(NaN) would otherwise compare as the text "nan".
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip().lower()


def overall_similarity(row1, row2, cols):
    """
    Compute the average Jaro-Winkler similarity across the provided columns.

    Args:
        row1, row2: Mapping-like rows supporting ``.get(col, default)``
            (e.g. a dict or a pandas Series).
        cols: Column names to compare.

    Returns:
        The mean similarity over the columns where both rows have a value,
        or 0.0 when no column could be compared. Columns where either side
        is missing (absent, None, NaN or blank) are skipped rather than
        counted as a match or a mismatch.
    """
    scores = []
    for col in cols:
        val1 = _comparable_text(row1.get(col, ""))
        val2 = _comparable_text(row2.get(col, ""))
        if not val1 or not val2:
            continue
        scores.append(jaro_winkler_score(val1, val2))
    if not scores:
        return 0.0
    return sum(scores) / len(scores)
90
+
91
def redline_text(str1, str2):
    """
    Build a simplistic "red-lined" HTML rendering of how str1 differs from str2.

    The strings are compared position by position. Characters of str1 that
    match str2 are emitted as-is; mismatching characters are wrapped in a red
    <span>. Where str2 is longer than str1, a red '_' marks each missing
    position. Characters of str2 themselves are never shown.

    Every character is HTML-escaped, because the result is meant to be
    rendered with unsafe_allow_html=True and the text comes from an uploaded
    CSV; unescaped cell content could otherwise inject markup.

    Returns:
        An HTML fragment as a string.
    """
    import html
    from itertools import zip_longest

    out = []
    for c1, c2 in zip_longest(str1, str2, fillvalue=""):
        safe_char = html.escape(c1)
        if c1 == c2:
            out.append(safe_char)
        else:
            # '_' stands in for a position that exists only in str2.
            out.append(f"<span style='color:red'>{safe_char or '_'}</span>")
    return "".join(out)
113
+
114
def find_connected_components_manual(nodes, edges):
    """
    Manual implementation of connected components finding.
    Fallback when NetworkX is not available.

    Args:
        nodes: Iterable of hashable node IDs.
        edges: Iterable of dicts shaped like {"data": {"source": id, "target": id}};
            edges are treated as undirected.

    Returns:
        A list of sets, one per connected component, in order of each
        component's first node. An edge endpoint that is missing from `nodes`
        is added to the graph instead of raising KeyError.
    """
    # Build adjacency list
    adjacency = {node: set() for node in nodes}
    for edge in edges:
        source = edge["data"]["source"]
        target = edge["data"]["target"]
        adjacency.setdefault(source, set()).add(target)
        adjacency.setdefault(target, set()).add(source)

    visited = set()
    components = []

    for start in adjacency:
        if start in visited:
            continue
        # Iterative depth-first search with an explicit stack: a recursive
        # version exceeds Python's recursion limit on large components.
        component = set()
        visited.add(start)
        stack = [start]
        while stack:
            node = stack.pop()
            component.add(node)
            for neighbor in adjacency[node]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    stack.append(neighbor)
        components.append(component)

    return components
146
+
147
+
148
+ # ----------------------
149
+ # LOAD CSV & PROCESS
150
+ # ----------------------
151
if uploaded_file is not None:
    st.markdown("### Preview of Uploaded CSV Data")
    df = pd.read_csv(uploaded_file)
    st.dataframe(df.head(10))

    # Provide a "Run Entity Resolution" button
    if st.button("Run Entity Resolution"):
        # Only compare columns that actually exist in the CSV. A requested
        # column that is absent would otherwise raise KeyError when the
        # red-lining section looks it up.
        missing_cols = [col for col in similarity_cols if col not in df.columns]
        compare_cols = [col for col in similarity_cols if col in df.columns]
        if missing_cols:
            st.warning(
                "Ignoring similarity columns not found in the CSV: "
                + ", ".join(missing_cols)
            )

        # Materialize the rows once as plain dicts; the all-pairs loop below
        # would otherwise build a new Series for every row lookup.
        records = df.to_dict("records")
        row_count = len(records)

        # STEP 1: Generate nodes
        # One node per row, storing all row data as properties. Node IDs are
        # the row positions (as strings).
        nodes = []
        for idx, record in enumerate(records):
            # Replace missing cells (NaN) with None so they neither show up
            # as "nan" in labels nor end up as NaN in the element payload.
            node_data = {
                key: (None if pd.isna(value) else value)
                for key, value in record.items()
            }
            node_data["id"] = str(idx)
            node_data["label"] = DEFAULT_NODE_LABEL

            # "name" is the short caption shown on the node: first and last
            # name when present, otherwise the row number.
            name_parts = [
                str(node_data[key]).strip()
                for key in ("first_name", "last_name")
                if node_data.get(key) is not None
            ]
            short_label = " ".join(part for part in name_parts if part)
            node_data["name"] = short_label or f"Row-{idx}"
            nodes.append({"data": node_data})

        # STEP 2: Pairwise similarity for edges
        # Naive all-pairs approach. For large data, you'd do blocking.
        edges = []
        for i in range(row_count):
            for j in range(i + 1, row_count):
                sim = overall_similarity(records[i], records[j], compare_cols)
                if sim >= similarity_threshold:
                    edge_data = {
                        "id": f"edge_{i}_{j}",
                        "source": str(i),
                        "target": str(j),
                        "label": DEFAULT_REL_TYPE,
                        "similarity": round(sim, 3)
                    }
                    edges.append({"data": edge_data})

        elements = {"nodes": nodes, "edges": edges}
        st.success("Entity Resolution complete! Network graph built.")

        # ------------
        # Visualization
        st.markdown("### Network Graph")
        node_labels = {node["data"]["label"] for node in elements["nodes"]}
        rel_labels = {edge["data"]["label"] for edge in elements["edges"]}

        # Basic styling: one color per node label, cycling through the palette.
        default_colors = ["#2A629A", "#FF7F3E", "#C0C0C0", "#008000", "#800080"]
        node_styles = [
            NodeStyle(
                label=label,
                color=default_colors[i % len(default_colors)],
                caption="name",
            )
            for i, label in enumerate(sorted(node_labels))
        ]
        edge_styles = [
            EdgeStyle(rel, caption="similarity", directed=False)
            for rel in sorted(rel_labels)
        ]

        st_link_analysis(
            elements,
            layout="cose",
            node_styles=node_styles,
            edge_styles=edge_styles
        )

        # ------------
        # Community Detection & CSV Export
        st.markdown("### Community Detection Results")

        # Find connected components (communities)
        if HAS_NETWORKX:
            G = nx.Graph()
            for node in elements["nodes"]:
                G.add_node(node["data"]["id"])
            for edge in elements["edges"]:
                G.add_edge(edge["data"]["source"], edge["data"]["target"])
            communities = list(nx.connected_components(G))
        else:
            # Use manual implementation as fallback
            st.info("NetworkX not found. Using manual connected components algorithm. Install NetworkX for better performance: `pip install networkx`")
            node_ids = [node["data"]["id"] for node in elements["nodes"]]
            communities = find_connected_components_manual(node_ids, elements["edges"])

        # Give each community a fresh UUID and map every node ID to it.
        # The UUIDs are regenerated on every run; they are not stable across runs.
        community_uuids = [str(uuid.uuid4()) for _ in communities]
        node_to_community = {
            node_id: community_uuid
            for community, community_uuid in zip(communities, community_uuids)
            for node_id in community
        }

        # Add community IDs to the original dataframe. Every row is a node and
        # therefore belongs to exactly one community.
        df_with_communities = df.copy()
        df_with_communities['community_id'] = [
            node_to_community[str(pos)] for pos in range(row_count)
        ]

        st.write(f"**Found {len(communities)} communities:**")
        for i, community in enumerate(communities):
            st.write(f"- Community {i+1}: {len(community)} records (UUID: {community_uuids[i]})")

        # Show the results dataframe
        st.markdown("#### Results with Community IDs")
        st.dataframe(df_with_communities)

        # CSV Export option
        st.markdown("#### Export Results")
        csv_buffer = io.StringIO()
        df_with_communities.to_csv(csv_buffer, index=False)
        csv_data = csv_buffer.getvalue()

        st.download_button(
            label="📥 Download Results as CSV",
            data=csv_data,
            file_name="entity_resolution_results.csv",
            mime="text/csv"
        )

        # ------------
        # Red-lining (moved to bottom as lower priority)
        if show_redlining and edges:
            st.markdown("### Top Similar Pairs (Red-Lined Differences)")

            # Filter out exact matches (rounded similarity == 1.0)
            filtered_edges = [
                edge for edge in edges if edge["data"]["similarity"] < 1.0
            ]

            # Sort by highest similarity (closest matches first)
            sorted_edges = sorted(filtered_edges, key=lambda e: e["data"]["similarity"], reverse=True)
            top_edges = sorted_edges[:MAX_REDLINE_PREVIEW]

            if not top_edges:
                st.info("No slightly different pairs found; all matches are exact or none meet the threshold.")
            else:
                for edge_item in top_edges:
                    s_idx = int(edge_item["data"]["source"])
                    t_idx = int(edge_item["data"]["target"])
                    sim_val = edge_item["data"]["similarity"]
                    st.markdown(f"**Pair:** Row {s_idx} ↔ Row {t_idx}, **similarity**={sim_val}")

                    # Collect the compared columns whose values differ (case-insensitively).
                    mismatch_cols = []
                    for col in compare_cols:
                        val1 = str(records[s_idx][col])
                        val2 = str(records[t_idx][col])
                        if val1.lower() != val2.lower():
                            mismatch_cols.append((col, val1, val2))

                    if mismatch_cols:
                        st.write("Differences in the following columns:")
                        for col_name, str1, str2 in mismatch_cols:
                            redlined = redline_text(str1, str2)
                            st.markdown(f"&nbsp;&nbsp;**{col_name}:** {redlined}", unsafe_allow_html=True)
                    else:
                        st.write("No differences in the compared columns.")

                    st.markdown("---")

        # ------------
        # Enterprise Scale Note
        st.markdown("---")
        st.markdown("### 📈 Enterprise Scale Solutions")

        if not HAS_NETWORKX:
            st.warning("""
            **Missing NetworkX Dependency**

            For better performance, install NetworkX:
            ```bash
            pip install networkx
            ```
            """)

        st.info("""
        **Need help with larger scale deployments?**

        If you need to persist UUIDs from run to run, handle larger datasets, or require more sophisticated
        entity resolution capabilities, you may need an enterprise-scale solution. Consider:

        - **Database Integration**: Store community IDs in a persistent database
        - **Incremental Processing**: Handle new data without re-processing everything
        - **Advanced Blocking**: Use more sophisticated blocking strategies for large datasets
        - **Distributed Computing**: Scale across multiple machines for very large datasets
        - **Custom ML Models**: Train domain-specific models for better accuracy

        Contact **Eastridge Analytics** for guidance on enterprise implementations.
        """)

else:
    st.info("Please upload a CSV file in the sidebar to begin.")