entity-resolution-network-analysis

Runtime error

ohmygaugh committed on Aug 12, 2025

Commit

68356aa

1 Parent(s): 95d3f21

Add Entity Resolution Network Analysis app

- visualize_ER_networks_from_csv.py: Main Streamlit application
- requirements.txt: Dependencies including st-link-analysis for network viz
- create_mock_CSV_data.py: Demo data generator
- app.py: Entry point for Hugging Face Spaces

Files changed (4) hide show

app.py +10 -0
create_mock_CSV_data.py +297 -0
requirements.txt +5 -2
visualize_ER_networks_from_csv.py +349 -0

app.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import subprocess
+import sys
+import os
+if __name__ == '__main__':
+    # Resolve the Streamlit script next to this file so the launcher works
+    # regardless of the current working directory.
+    script_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'visualize_ER_networks_from_csv.py',
+    )
+    # Run Streamlit as a child process, listening on port 7860 on all
+    # interfaces (presumably the port the hosting Space exposes - confirm).
+    completed = subprocess.run([
+        sys.executable, '-m', 'streamlit', 'run', script_path,
+        '--server.port=7860',
+        '--server.address=0.0.0.0'
+    ])
+    # Propagate Streamlit's exit status instead of always exiting with 0, so
+    # a failed start-up is visible to whatever launched this entry point.
+    sys.exit(completed.returncode)

create_mock_CSV_data.py ADDED Viewed

	@@ -0,0 +1,297 @@

+#!/usr/bin/env python3
+"""
+create_mock_CSV_data.py
+Fetches random user data from randomuser.me (or a similar service) and creates
+mock data in CSV format that imitates having multiple 'Profiles' and multiple
+'Identity' rows. Each row in the CSV represents an Identity and includes:
+- a parent Profile ID and Profile name,
+- first_name, last_name, birth_year, etc.,
+- and possibly random typos in selected fields (based on a user-defined percentage).
+Usage example:
+    python create_mock_CSV_data.py --num_profiles=100 --typo_percentage=10 --output_file="mock_data.csv"
+"""
+import requests
+import random
+import logging
+import argparse
+import csv
+import uuid
+import numpy as np
+# Optional dependency: the `nicknames` package (pip install nicknames).
+# When it cannot be imported, the script keeps working and
+# User._choose_nickname falls back to the plain first name.
+# Logging is configured before the import attempt so that the warning emitted
+# in the ImportError branch goes through the configured root handler.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+try:
+    from nicknames import NickNamer
+    # Flag read by User._choose_nickname to decide whether NickNamer is usable.
+    NICKNAMES_AVAILABLE = True
+except ImportError:
+    NICKNAMES_AVAILABLE = False
+    logger.warning("nicknames library is not installed. Nickname feature will be limited.")
+def fetch_random_users(num_profiles, timeout=30):
+    """
+    Fetch random user data from the randomuser.me API.
+    Args:
+        num_profiles: Number of users to request (sent as the `results`
+            query parameter; only US nationals are requested via `nat=us`).
+        timeout: Seconds to wait for the server before giving up. Without a
+            timeout `requests` may block indefinitely on an unresponsive host.
+    Returns:
+        The list stored under the "results" key of the JSON response, or an
+        empty list when that key is absent.
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        requests.Timeout: If no response arrives within `timeout` seconds.
+    """
+    response = requests.get(
+        "https://randomuser.me/api/",
+        # Let requests build and encode the query string.
+        params={"results": num_profiles, "nat": "us"},
+        timeout=timeout,
+    )
+    response.raise_for_status()
+    return response.json().get("results", [])
+class User:
+    """
+    Simple container for user data fetched from randomuser.me,
+    plus logic for generating nicknames, emails, phone numbers,
+    and introducing random typos.
+    Typos stay "in kind": a value consisting only of digits (phone number,
+    birth year, zip code, ...) receives digit typos, any other value receives
+    lowercase letters. This keeps numeric fields numeric after a typo.
+    """
+    # Alphabets from which an inserted or replacing character is drawn.
+    _LETTERS = "abcdefghijklmnopqrstuvwxyz"
+    _DIGITS = "0123456789"
+    def __init__(self, user_data):
+        """Populate the identity fields from one randomuser.me result dict."""
+        self.user_data = user_data
+        # Extract basic info
+        self.name_data = user_data.get("name", {})
+        self.first_name = self.name_data.get("first", "Unknown")
+        self.last_name = self.name_data.get("last", "Unknown")
+        self.nickname = self._choose_nickname()
+        dob = user_data.get("dob", {})
+        self.birth_year = str(dob.get("date", "")[:4])  # 'YYYY-MM-DD...' -> 'YYYY'
+        # Email address: random combination of first, last, year, etc.
+        # Must run after first_name, last_name, nickname and birth_year are set.
+        self.email_address = self._generate_email()
+        # Phone number: just digits from the API phone.
+        phone_raw = user_data.get("phone", "")
+        self.phone_number = "".join(filter(str.isdigit, phone_raw))
+        # Address fields
+        location = user_data.get("location", {})
+        street = location.get("street", {})
+        self.street_number = str(street.get("number", ""))
+        self.street_name = street.get("name", "")
+        self.city = location.get("city", "")
+        self.state = location.get("state", "")
+        self.country = location.get("country", "")
+        self.zip_code = str(location.get("postcode", ""))
+    @property
+    def full_name(self):
+        """First and last name separated by a single space."""
+        return f"{self.first_name} {self.last_name}"
+    @property
+    def full_address(self):
+        """Street, city, state and zip code formatted as one line."""
+        return f"{self.street_number} {self.street_name}, {self.city}, {self.state} {self.zip_code}"
+    def _choose_nickname(self):
+        """
+        Uses the nicknames library if available, otherwise falls back to the first name.
+        """
+        if NICKNAMES_AVAILABLE:
+            nn = NickNamer()
+            possible_nicknames = nn.nicknames_of(self.first_name)
+            if possible_nicknames:
+                return random.choice(list(possible_nicknames))
+        return self.first_name
+    def _generate_email(self):
+        """Build a lowercase email address from random name/year fragments."""
+        domain = random.choice(["gmail", "yahoo", "hotmail", "outlook"])
+        first_part = random.choice([self.first_name, self.nickname, self.first_name[:1]])
+        last_part = random.choice([self.last_name, self.last_name[:1]])
+        optional = random.choice(["", self.birth_year, self.birth_year[-2:], str(random.randint(1, 100))])
+        return f"{first_part}{last_part}{optional}@{domain}.com".lower()
+    def add_typo(self, property_name):
+        """
+        Introduce a random typo into the specified property (e.g. 'first_name').
+        If property_name == 'full_address', we randomly pick an address field to modify.
+        Empty or non-string properties are left untouched.
+        """
+        if property_name == "full_address":
+            property_name = random.choice(
+                ["street_number", "street_name", "city", "state", "zip_code"]
+            )
+        current_value = getattr(self, property_name, None)
+        if not current_value or not isinstance(current_value, str):
+            return  # If it's empty or not a string, skip
+        new_value = self._apply_random_typo(current_value)
+        setattr(self, property_name, new_value)
+        logger.debug(f"Applying typo: {property_name}: '{current_value}' -> '{new_value}'")
+    @staticmethod
+    def _random_char_for(text):
+        """Return a random digit for all-digit text, else a random lowercase letter."""
+        alphabet = User._DIGITS if text.isdigit() else User._LETTERS
+        return random.choice(alphabet)
+    def _apply_random_typo(self, text):
+        """
+        Introduce a random single-character error (delete, swap, insert, replace)
+        or regenerate an email.
+        Returns the text unchanged when it is empty. The result differs in
+        length from the input by at most one character.
+        """
+        if not text:
+            return text
+        # If email, sometimes (about 1 in 3) just regenerate the entire email.
+        if "@" in text and random.random() < 0.33:
+            return self._generate_email()
+        if len(text) == 1:
+            # A single character can only be replaced or extended; deleting it
+            # would produce an empty field and there is nothing to swap with.
+            operations = ["insert", "replace"]
+        else:
+            operations = ["delete", "swap", "insert", "replace"]
+        option = random.choice(operations)
+        index = random.randint(0, len(text) - 1)
+        if option == "delete":
+            return text[:index] + text[index + 1:]
+        if option == "swap" and index < len(text) - 1:
+            chars = list(text)
+            chars[index], chars[index + 1] = chars[index + 1], chars[index]
+            return "".join(chars)
+        new_char = self._random_char_for(text)
+        if option == "insert":
+            return text[:index] + new_char + text[index:]
+        # "replace", or a "swap" drawn for the last position (nothing to swap
+        # with), both substitute the character at `index`.
+        return text[:index] + new_char + text[index + 1:]
+def main(num_profiles, typo_percentage, output_file):
+    """
+    Generate mock entity-resolution data and write it to a CSV file.
+    1) Fetch random user data from randomuser.me
+    2) For each user, create 1..3 'Profile' IDs
+    3) Spread 1..M 'Identity' rows across those profiles
+    4) Introduce random typos in selected fields
+    5) Write all Identity rows to CSV, including their associated Profile info
+    Args:
+        num_profiles: Number of users to request from the API.
+        typo_percentage: Chance (0..100) that an Identity row after a user's
+            first one receives a new typo. Typos accumulate: later rows of the
+            same user keep the typos applied to earlier rows.
+        output_file: Path of the CSV file to (over)write.
+    """
+    # User attributes exported per Identity row, in CSV column order.
+    user_fields = [
+        "first_name",
+        "last_name",
+        "nickname",
+        "birth_year",
+        "email_address",
+        "phone_number",
+        "street_number",
+        "street_name",
+        "city",
+        "state",
+        "country",
+        "zip_code",
+    ]
+    # Single source of truth for the CSV header and the row keys.
+    fieldnames = ["profile_id", "profile_name", "identity_id"] + user_fields
+    # Fields eligible for a typo ('full_address' lets User pick one address part).
+    typo_fields = ["first_name", "last_name", "email_address",
+                   "phone_number", "full_address", "birth_year"]
+    typo_probability = typo_percentage / 100.0
+    logger.info(f"Generating mock data for {num_profiles} profiles...")
+    api_data = fetch_random_users(num_profiles)
+    rows_to_write = []
+    for data in api_data:
+        user = User(data)
+        # Number of Identities: normal distribution (mean=8, std=5), folded to
+        # non-negative. The draw can truncate to 0, which would silently drop
+        # the user from the output, so at least one Identity is always produced.
+        num_ids = max(1, abs(int(np.random.normal(8, 5))))
+        # Users with more than 4 identities may be split over up to 3 profiles.
+        if num_ids > 4:
+            num_profiles_for_user = random.choice([1, 1, 1, 2, 2, 3])
+        else:
+            num_profiles_for_user = 1
+        profile_ids = [str(uuid.uuid4()) for _ in range(num_profiles_for_user)]
+        profile_name = user.full_name  # every Profile of a user shares the same name
+        # Distribute the Identity rows across the profiles in order.
+        profile_idx = 0
+        for i in range(num_ids):
+            # Advance to the next profile once the fraction of rows emitted
+            # passes that profile's share. i / num_ids < 1, so profile_idx
+            # never runs past the last profile.
+            if num_profiles_for_user > 1:
+                if i / num_ids > (profile_idx + 1) / num_profiles_for_user:
+                    profile_idx += 1
+            # The first row of each user is always typo-free.
+            if i > 0 and random.random() < typo_probability:
+                user.add_typo(random.choice(typo_fields))
+            row = {
+                "profile_id": profile_ids[profile_idx],
+                "profile_name": profile_name,
+                "identity_id": str(uuid.uuid4()),
+            }
+            row.update((field, getattr(user, field)) for field in user_fields)
+            rows_to_write.append(row)
+    logger.info(f"Writing {len(rows_to_write)} rows to {output_file}...")
+    with open(output_file, mode="w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows_to_write)
+    logger.info("Finished writing CSV mock data.")
+if __name__ == "__main__":
+    # Command-line entry point: parse the options and run the generator.
+    parser = argparse.ArgumentParser(
+        description="Generate mock entity-resolution data in CSV format."
+    )
+    parser.add_argument("--num_profiles", type=int, default=10,
+                        help="Number of random 'users' to fetch from randomuser.me (default 10).")
+    # argparse applies %-formatting to help strings, so a literal percent sign
+    # must be written as '%%'; a bare '%' makes `--help` raise ValueError.
+    parser.add_argument("--typo_percentage", type=float, default=10.0,
+                        help="Chance (0..100) that each new Identity row (beyond the first) has a random typo (default 10%%).")
+    parser.add_argument("--output_file", type=str, default="mock_data.csv",
+                        help="Output CSV filename (default 'mock_data.csv').")
+    args = parser.parse_args()
+    main(args.num_profiles, args.typo_percentage, args.output_file)

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
-altair
 pandas
-streamlit

+streamlit
 pandas
+numpy
+jellyfish
+st-link-analysis
+networkx

visualize_ER_networks_from_csv.py ADDED Viewed

	@@ -0,0 +1,349 @@

+# streamlit run visualize_ER_networks_from_csv.py
+import streamlit as st
+import pandas as pd
+import numpy as np
+import jellyfish  # For quick string similarity (Levenshtein, Jaro, etc.)
+import io
+import uuid
+from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
+# Try to import networkx, fall back to manual implementation if not available
+try:
+    import networkx as nx
+    HAS_NETWORKX = True
+except ImportError:
+    HAS_NETWORKX = False
+# ----------------------
+# CONFIG
+# ----------------------
+# Label given to every node and every edge of the generated graph.
+DEFAULT_NODE_LABEL = "Record"
+DEFAULT_REL_TYPE = "SIMILAR"
+DEFAULT_THRESHOLD = 0.80  # default similarity threshold
+MAX_REDLINE_PREVIEW = 10  # how many top edges to preview with "red-lining"
+st.set_page_config(
+    page_title="CSV ER & Network Graph",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+st.title("Entity Resolution on CSV (Network Graph)")
+# ----------------------
+# SIDEBAR: CSV UPLOAD
+# ----------------------
+st.sidebar.header("Upload CSV for Entity Resolution")
+uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type=["csv"])
+# Minimum average similarity (0..1) two rows need for an edge to be created.
+similarity_threshold = st.sidebar.slider(
+    "Similarity Threshold",
+    min_value=0.0,
+    max_value=1.0,
+    value=DEFAULT_THRESHOLD,
+    step=0.01
+)
+# Choose which columns to compare
+st.sidebar.header("Similarity Columns")
+# The user can list (or guess) which columns in the CSV are relevant for measuring similarity
+# We'll default to common ones from 'create_mock_CSV_data.py': first_name, last_name, email_address, phone_number
+default_cols = "first_name,last_name,email_address,phone_number"
+similarity_cols_raw = st.sidebar.text_input(
+    "Columns to compare (comma-separated):",
+    value=default_cols
+)
+# Split on commas, trim whitespace and drop empty entries (e.g. trailing comma).
+similarity_cols = [c.strip() for c in similarity_cols_raw.split(",") if c.strip()]
+# If the user wants to see red-lining differences
+show_redlining = st.sidebar.checkbox("Show red-lined differences for top pairs", value=True)
+# Data and Graph placeholders, replaced further down once a CSV is uploaded
+# and entity resolution has been run.
+df = None
+elements = {"nodes": [], "edges": []}
+# ----------------------
+# UTILITY FUNCTIONS
+# ----------------------
+def jaro_winkler_score(str1, str2):
+    """Return the Jaro-Winkler similarity of two strings via jellyfish.
+    Falsy arguments (None, "") are compared as the empty string.
+    """
+    left = str1 if str1 else ""
+    right = str2 if str2 else ""
+    return jellyfish.jaro_winkler_similarity(left, right)
+def _comparable_text(value):
+    """Return `value` as lowercase text, or "" when it is missing (None or NaN)."""
+    if value is None:
+        return ""
+    # NaN is the only float that differs from itself; pandas uses it for empty
+    # CSV cells. Without this check it would be compared as the text "nan".
+    if isinstance(value, float) and value != value:
+        return ""
+    return str(value).lower()
+def overall_similarity(row1, row2, cols):
+    """
+    Compute the average Jaro-Winkler similarity across the provided columns.
+    Args:
+        row1, row2: Mapping-like rows supporting `.get(col, default)`
+            (e.g. a dict or a pandas Series).
+        cols: Names of the columns to compare.
+    Returns:
+        The mean similarity over the columns where both rows have a value.
+        Columns where either side is missing (absent, None, NaN or "") are
+        skipped; 0.0 is returned when no column could be compared.
+    """
+    scores = []
+    for col in cols:
+        val1 = _comparable_text(row1.get(col, ""))
+        val2 = _comparable_text(row2.get(col, ""))
+        if val1 == "" or val2 == "":
+            # A missing value carries no evidence either way; skip the column.
+            continue
+        scores.append(jaro_winkler_score(val1, val2))
+    if not scores:
+        return 0.0
+    return sum(scores) / len(scores)
+def redline_text(str1, str2):
+    """
+    A simplistic "red-lining" of differences between two strings.
+    The strings are compared position by position. Characters of `str1` that
+    match `str2` are emitted as-is; mismatching characters of `str1` are
+    wrapped in a red <span>. Where `str1` has no character at a position
+    (because `str2` is longer) a red "_" placeholder is emitted instead.
+    Only `str1` is shown; the extra characters of `str2` are not displayed.
+    The result is meant to be rendered as HTML, so every character taken from
+    the inputs is HTML-escaped. The inputs come from an uploaded CSV and must
+    not be able to inject markup.
+    Returns:
+        An HTML fragment (str).
+    """
+    # Local import keeps this function self-contained.
+    from html import escape
+    out = []
+    max_len = max(len(str1), len(str2))
+    for i in range(max_len):
+        c1 = str1[i] if i < len(str1) else ""
+        c2 = str2[i] if i < len(str2) else ""
+        if c1 == c2:
+            out.append(escape(c1))  # same char
+        else:
+            # highlight mismatch
+            shown = escape(c1) if c1 else "_"
+            out.append(f"<span style='color:red'>{shown}</span>")
+    return "".join(out)
+def find_connected_components_manual(nodes, edges):
+    """
+    Manual implementation of connected components finding.
+    Fallback when NetworkX is not available.
+    Args:
+        nodes: Iterable of hashable node ids.
+        edges: Iterable of dicts shaped like
+            {"data": {"source": <node id>, "target": <node id>}}.
+    Returns:
+        A list of sets of node ids, one set per connected component, ordered
+        by the first appearance of each component in `nodes`.
+    Raises:
+        KeyError: If an edge references a node id that is not in `nodes`.
+    """
+    # Build adjacency list
+    adj_list = {node: set() for node in nodes}
+    for edge in edges:
+        source = edge["data"]["source"]
+        target = edge["data"]["target"]
+        adj_list[source].add(target)
+        adj_list[target].add(source)
+    visited = set()
+    components = []
+    for start in nodes:
+        if start in visited:
+            continue
+        # Iterative depth-first search with an explicit stack: a recursive
+        # DFS raises RecursionError on components deeper than the
+        # interpreter's recursion limit (e.g. a long chain of records).
+        visited.add(start)
+        component = set()
+        stack = [start]
+        while stack:
+            node = stack.pop()
+            component.add(node)
+            for neighbor in adj_list[node]:
+                if neighbor not in visited:
+                    visited.add(neighbor)
+                    stack.append(neighbor)
+        components.append(component)
+    return components
+# ----------------------
+# LOAD CSV & PROCESS
+# ----------------------
+if uploaded_file is not None:
+    st.markdown("### Preview of Uploaded CSV Data")
+    df = pd.read_csv(uploaded_file)
+    st.dataframe(df.head(10))
+    # Only compare columns that actually exist in the uploaded CSV. A requested
+    # column that is absent contributes nothing to the similarity score and
+    # would raise KeyError in the red-lining section below.
+    compare_cols = [col for col in similarity_cols if col in df.columns]
+    missing_cols = [col for col in similarity_cols if col not in df.columns]
+    if missing_cols:
+        st.warning("Ignoring columns not found in the CSV: " + ", ".join(missing_cols))
+    # Provide a "Run Entity Resolution" button
+    # NOTE(review): st.button presumably returns True only for the rerun
+    # triggered by the click, so interacting with another widget afterwards
+    # (e.g. the download button) would clear the results - confirm, and
+    # consider keeping them in st.session_state.
+    if st.button("Run Entity Resolution"):
+        # Materialise the rows once as plain dicts; the all-pairs loop below
+        # would otherwise pay for a DataFrame row lookup on every comparison.
+        records = df.to_dict("records")
+        # STEP 1: Generate nodes
+        # One node per row, storing all row data as properties. The node id is
+        # the row position, which equals the index label of a freshly read CSV.
+        nodes = []
+        for idx, record in enumerate(records):
+            node_data = dict(record)
+            node_data["id"] = str(idx)  # use row index as unique ID
+            node_data["label"] = DEFAULT_NODE_LABEL
+            # "name" is the short caption shown on the node: first + last name
+            # when available (missing/NaN parts are skipped), else "Row-<idx>".
+            name_parts = [
+                str(record[col])
+                for col in ("first_name", "last_name")
+                if col in record and not pd.isna(record[col])
+            ]
+            short_label = " ".join(name_parts).strip()
+            if not short_label:
+                short_label = f"Row-{idx}"
+            node_data["name"] = short_label
+            nodes.append({"data": node_data})
+        # STEP 2: Pairwise similarity for edges
+        # We'll do a naive all-pairs approach. For large data, you'd do blocking.
+        edges = []
+        for i in range(len(records)):
+            for j in range(i + 1, len(records)):
+                sim = overall_similarity(records[i], records[j], compare_cols)
+                if sim >= similarity_threshold:
+                    edge_data = {
+                        "id": f"edge_{i}_{j}",
+                        "source": str(i),
+                        "target": str(j),
+                        "label": DEFAULT_REL_TYPE,
+                        "similarity": round(sim, 3)
+                    }
+                    edges.append({"data": edge_data})
+        elements = {"nodes": nodes, "edges": edges}
+        st.success("Entity Resolution complete! Network graph built.")
+        # ------------
+        # Visualization
+        st.markdown("### Network Graph")
+        node_labels = set(node["data"]["label"] for node in elements["nodes"])
+        rel_labels = set(edge["data"]["label"] for edge in elements["edges"])
+        # Basic styling: one colour per node label, cycling through the palette.
+        default_colors = ["#2A629A", "#FF7F3E", "#C0C0C0", "#008000", "#800080"]
+        node_styles = []
+        for i, label in enumerate(sorted(node_labels)):
+            color = default_colors[i % len(default_colors)]
+            node_styles.append(NodeStyle(label=label, color=color, caption="name"))
+        edge_styles = []
+        for rel in sorted(rel_labels):
+            edge_styles.append(EdgeStyle(rel, caption="similarity", directed=False))
+        st_link_analysis(
+            elements,
+            layout="cose",
+            node_styles=node_styles,
+            edge_styles=edge_styles
+        )
+        # ------------
+        # Community Detection & CSV Export
+        st.markdown("### Community Detection Results")
+        # Find connected components (communities)
+        if HAS_NETWORKX:
+            # Use NetworkX if available
+            G = nx.Graph()
+            for node in elements["nodes"]:
+                G.add_node(node["data"]["id"])
+            for edge in elements["edges"]:
+                G.add_edge(edge["data"]["source"], edge["data"]["target"])
+            communities = list(nx.connected_components(G))
+        else:
+            # Use manual implementation as fallback
+            st.info("NetworkX not found. Using manual connected components algorithm. Install NetworkX for better performance: `pip install networkx`")
+            node_ids = [node["data"]["id"] for node in elements["nodes"]]
+            communities = find_connected_components_manual(node_ids, elements["edges"])
+        # Create a mapping from node_id to community_id. A fresh UUID is drawn
+        # per community on every run, so the ids are not stable across runs.
+        node_to_community = {}
+        community_uuids = {}
+        for i, community in enumerate(communities):
+            community_uuid = str(uuid.uuid4())
+            community_uuids[i] = community_uuid
+            for node_id in community:
+                node_to_community[node_id] = community_uuid
+        # Add community IDs to the original dataframe
+        df_with_communities = df.copy()
+        df_with_communities['community_id'] = [
+            node_to_community.get(str(idx), str(uuid.uuid4()))
+            for idx in df_with_communities.index
+        ]
+        st.write(f"**Found {len(communities)} communities:**")
+        for i, community in enumerate(communities):
+            st.write(f"- Community {i+1}: {len(community)} records (UUID: {community_uuids[i]})")
+        # Show the results dataframe
+        st.markdown("#### Results with Community IDs")
+        st.dataframe(df_with_communities)
+        # CSV Export option
+        st.markdown("#### Export Results")
+        csv_buffer = io.StringIO()
+        df_with_communities.to_csv(csv_buffer, index=False)
+        csv_data = csv_buffer.getvalue()
+        st.download_button(
+            label="📥 Download Results as CSV",
+            data=csv_data,
+            file_name="entity_resolution_results.csv",
+            mime="text/csv"
+        )
+        # ------------
+        # Red-lining (moved to bottom as lower priority)
+        if show_redlining and len(edges) > 0:
+            st.markdown("### Top Similar Pairs (Red-Lined Differences)")
+            # Filter out exact matches (similarity == 1.0)
+            filtered_edges = [
+                edge for edge in edges if edge["data"]["similarity"] < 1.0
+            ]
+            # Sort by highest similarity (closest matches first)
+            sorted_edges = sorted(filtered_edges, key=lambda e: e["data"]["similarity"], reverse=True)
+            top_edges = sorted_edges[:MAX_REDLINE_PREVIEW]
+            if not top_edges:
+                st.info("No slightly different pairs found; all matches are exact or none meet the threshold.")
+            else:
+                for edge_item in top_edges:
+                    s_idx = int(edge_item["data"]["source"])
+                    t_idx = int(edge_item["data"]["target"])
+                    sim_val = edge_item["data"]["similarity"]
+                    st.markdown(f"**Pair:** Row {s_idx} ↔ Row {t_idx}, **similarity**={sim_val}")
+                    # Highlight differences in the compared columns
+                    mismatch_cols = []
+                    for col in compare_cols:
+                        val1 = str(records[s_idx][col])
+                        val2 = str(records[t_idx][col])
+                        if val1.lower() != val2.lower():
+                            mismatch_cols.append((col, val1, val2))
+                    if mismatch_cols:
+                        st.write("Differences in the following columns:")
+                        for col_name, str1, str2 in mismatch_cols:
+                            redlined = redline_text(str1, str2)
+                            st.markdown(f"&nbsp;&nbsp;**{col_name}:** {redlined}", unsafe_allow_html=True)
+                    else:
+                        st.write("No differences in the compared columns.")
+                    st.markdown("---")
+        # ------------
+        # Enterprise Scale Note
+        st.markdown("---")
+        st.markdown("### 📈 Enterprise Scale Solutions")
+        if not HAS_NETWORKX:
+            st.warning("""
+            **Missing NetworkX Dependency**
+            For better performance, install NetworkX:
+            ```bash
+            pip install networkx
+            ```
+            """)
+        st.info("""
+        **Need help with larger scale deployments?**
+        If you need to persist UUIDs from run to run, handle larger datasets, or require more sophisticated
+        entity resolution capabilities, you may need an enterprise-scale solution. Consider:
+        - **Database Integration**: Store community IDs in a persistent database
+        - **Incremental Processing**: Handle new data without re-processing everything
+        - **Advanced Blocking**: Use more sophisticated blocking strategies for large datasets
+        - **Distributed Computing**: Scale across multiple machines for very large datasets
+        - **Custom ML Models**: Train domain-specific models for better accuracy
+        Contact **Eastridge Analytics** for guidance on enterprise implementations.
+        """)
+else:
+    st.info("Please upload a CSV file in the sidebar to begin.")