Refactor: Align project with standard HF Docker structure
Browse files
- Dockerfile +3 -12
- requirements.txt +1 -1
- src/create_mock_CSV_data.py +0 -297
- src/debug_upload.py +0 -26
- src/requirements.txt +0 -6
- src/streamlit_app.py +0 -63
Dockerfile
CHANGED
|
@@ -2,20 +2,11 @@ FROM python:3.9-slim
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
build-essential \
|
| 7 |
-
curl \
|
| 8 |
-
software-properties-common \
|
| 9 |
-
git \
|
| 10 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
-
|
| 12 |
COPY requirements.txt ./
|
| 13 |
-
|
| 14 |
|
| 15 |
-
|
| 16 |
|
| 17 |
EXPOSE 8501
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
COPY requirements.txt ./
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
|
| 8 |
+
COPY . .
|
| 9 |
|
| 10 |
EXPOSE 8501
|
| 11 |
|
| 12 |
+
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
streamlit
|
| 2 |
pandas
|
| 3 |
numpy
|
| 4 |
jellyfish
|
|
|
|
| 1 |
+
streamlit==1.33.0
|
| 2 |
pandas
|
| 3 |
numpy
|
| 4 |
jellyfish
|
src/create_mock_CSV_data.py
DELETED
|
@@ -1,297 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
|
| 3 |
-
"""
|
| 4 |
-
create_mock_CSV_data.py
|
| 5 |
-
|
| 6 |
-
Fetches random user data from randomuser.me (or a similar service) and creates
|
| 7 |
-
mock data in CSV format that imitates having multiple 'Profiles' and multiple
|
| 8 |
-
'Identity' rows. Each row in the CSV represents an Identity and includes:
|
| 9 |
-
- a parent Profile ID and Profile name,
|
| 10 |
-
- first_name, last_name, birth_year, etc.,
|
| 11 |
-
- and possibly random typos in selected fields (based on a user-defined percentage).
|
| 12 |
-
|
| 13 |
-
Usage example:
|
| 14 |
-
python create_mock_CSV_data.py --num_profiles=100 --typo_percentage=10 --output_file="mock_data.csv"
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import requests
|
| 18 |
-
import random
|
| 19 |
-
import logging
|
| 20 |
-
import argparse
|
| 21 |
-
import csv
|
| 22 |
-
import uuid
|
| 23 |
-
import numpy as np
|
| 24 |
-
# If you use the nicknames library: pip install nicknames
|
| 25 |
-
# from nicknames import NickNamer
|
| 26 |
-
# For demonstration, let's fallback gracefully if not installed.
|
| 27 |
-
|
| 28 |
-
logging.basicConfig(level=logging.INFO)
|
| 29 |
-
logger = logging.getLogger(__name__)
|
| 30 |
-
|
| 31 |
-
try:
|
| 32 |
-
from nicknames import NickNamer
|
| 33 |
-
NICKNAMES_AVAILABLE = True
|
| 34 |
-
except ImportError:
|
| 35 |
-
NICKNAMES_AVAILABLE = False
|
| 36 |
-
logger.warning("nicknames library is not installed. Nickname feature will be limited.")
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def fetch_random_users(num_profiles, timeout=30):
    """
    Fetch random user data from the randomuser.me API.

    Args:
        num_profiles: Number of user records to request (sent as the
            ``results`` query parameter; ``nat=us`` is always requested).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the whole script.

    Returns:
        The list stored under the ``"results"`` key of the JSON response,
        or an empty list when that key is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"https://randomuser.me/api/?results={num_profiles}&nat=us"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json().get("results", [])
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
class User:
    """
    Simple container for user data fetched from randomuser.me,
    plus logic for generating nicknames, emails, phone numbers,
    and introducing random typos.

    All attributes are plain strings so that any of them can be handed to
    ``add_typo``.
    """

    # Character pools used when a typo inserts or replaces a character.
    _LETTERS = "abcdefghijklmnopqrstuvwxyz"
    _DIGITS = "0123456789"

    def __init__(self, user_data):
        """
        Build a user from one record of the randomuser.me ``results`` list.

        Args:
            user_data: Dict for a single user; missing keys fall back to
                "Unknown" (names) or the empty string (everything else).
        """
        self.user_data = user_data

        # Extract basic info
        self.name_data = user_data.get("name", {})
        self.first_name = self.name_data.get("first", "Unknown")
        self.last_name = self.name_data.get("last", "Unknown")
        self.nickname = self._choose_nickname()

        dob = user_data.get("dob", {})
        # 'YYYY-MM-DD...' -> 'YYYY'; a missing or null date yields "".
        self.birth_year = str(dob.get("date") or "")[:4]

        # Email address: random combination of first, last, year, etc.
        # Must come after the name, nickname and birth year are set.
        self.email_address = self._generate_email()

        # Phone number: just digits from the API phone.
        phone_raw = user_data.get("phone", "")
        self.phone_number = "".join(filter(str.isdigit, phone_raw))

        # Address fields
        location = user_data.get("location", {})
        street = location.get("street", {})
        self.street_number = str(street.get("number", ""))
        self.street_name = street.get("name", "")
        self.city = location.get("city", "")
        self.state = location.get("state", "")
        self.country = location.get("country", "")
        self.zip_code = str(location.get("postcode", ""))

    @property
    def full_name(self):
        """First and last name joined by a single space."""
        return f"{self.first_name} {self.last_name}"

    @property
    def full_address(self):
        """Address formatted as 'number street, city, state zip'."""
        return f"{self.street_number} {self.street_name}, {self.city}, {self.state} {self.zip_code}"

    def _choose_nickname(self):
        """
        Uses the nicknames library if available, otherwise falls back to the first name.
        """
        if NICKNAMES_AVAILABLE:
            nn = NickNamer()
            possible_nicknames = nn.nicknames_of(self.first_name)
            if possible_nicknames:
                return random.choice(list(possible_nicknames))
        return self.first_name

    def _generate_email(self):
        """
        Return a lower-cased email built from random name fragments, an
        optional numeric suffix and a randomly chosen provider domain.
        """
        domain = random.choice(["gmail", "yahoo", "hotmail", "outlook"])
        first_part = random.choice([self.first_name, self.nickname, self.first_name[:1]])
        last_part = random.choice([self.last_name, self.last_name[:1]])
        optional = random.choice(["", self.birth_year, self.birth_year[-2:], str(random.randint(1, 100))])
        return f"{first_part}{last_part}{optional}@{domain}.com".lower()

    def add_typo(self, property_name):
        """
        Introduce a random typo into the specified property (e.g. 'first_name').
        If property_name == 'full_address', we randomly pick an address field to modify.

        Properties that are empty, missing or not strings are left untouched.
        """
        if property_name == "full_address":
            property_name = random.choice(
                ["street_number", "street_name", "city", "state", "zip_code"]
            )

        current_value = getattr(self, property_name, None)
        if not current_value or not isinstance(current_value, str):
            return  # If it's empty or not a string, skip

        new_value = self._apply_random_typo(current_value)
        setattr(self, property_name, new_value)
        logger.debug("Applying typo: %s: '%s' -> '%s'", property_name, current_value, new_value)

    def _random_char_like(self, reference, exclude=None):
        """
        Pick a random character of the same kind as ``reference``: a digit
        when ``reference`` is an ASCII digit, otherwise a lowercase letter.
        ``exclude`` (if given) is never returned, which guarantees that a
        replacement differs from the character it replaces.
        """
        pool = self._DIGITS if reference in self._DIGITS else self._LETTERS
        return random.choice([ch for ch in pool if ch != exclude])

    def _apply_random_typo(self, text):
        """
        Introduce a random single-character error (delete, swap, insert, replace)
        or regenerate an email.

        For non-email text the result always differs from ``text``, and
        inserted/replaced characters match the kind (digit vs. letter) of the
        character at the chosen position, so numeric fields stay numeric.
        Empty input is returned unchanged.
        """
        if not text:
            return text

        # If email, 1 in 3 chance we fully regenerate the email.
        if "@" in text and random.random() < 0.33:
            return self._generate_email()

        if len(text) == 1:
            # If we have only one character, we can only do replace or insert.
            option = random.choice(["insert", "replace"])
        else:
            option = random.choice(["delete", "swap", "insert", "replace"])

        index = random.randrange(len(text))
        here = text[index]

        if option == "delete":
            return text[:index] + text[index + 1:]

        if option == "insert":
            return text[:index] + self._random_char_like(here) + text[index:]

        if option == "swap":
            # Swap with the next char when that actually changes the text;
            # otherwise (last position or identical neighbours) fall
            # through to a replacement.
            if index < len(text) - 1 and text[index + 1] != here:
                chars = list(text)
                chars[index], chars[index + 1] = chars[index + 1], chars[index]
                return "".join(chars)

        # "replace", or the fallback for a swap that would be a no-op.
        return text[:index] + self._random_char_like(here, exclude=here) + text[index + 1:]
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
def main(num_profiles, typo_percentage, output_file):
    """
    Generate mock entity-resolution data and write it to a CSV file.

    1) Fetch random user data from randomuser.me
    2) For each user, create 1..3 'Profile' IDs
    3) For each user, create 1..M 'Identities' spread across those profiles
    4) Introduce random typos in selected fields
    5) Write all Identity rows to CSV, including their associated Profile info

    Args:
        num_profiles: Number of users to request from randomuser.me.
        typo_percentage: Chance (0..100) that each Identity row after a
            user's first receives one additional random typo. Typos are
            applied to the shared user object, so they accumulate across
            that user's later rows.
        output_file: Path of the CSV file to (over)write.
    """
    # User attributes copied verbatim into every row, in column order.
    identity_fields = (
        "first_name",
        "last_name",
        "nickname",
        "birth_year",
        "email_address",
        "phone_number",
        "street_number",
        "street_name",
        "city",
        "state",
        "country",
        "zip_code",
    )
    fieldnames = ["profile_id", "profile_name", "identity_id", *identity_fields]
    typo_fields = ["first_name", "last_name", "email_address",
                   "phone_number", "full_address", "birth_year"]
    typo_chance = typo_percentage / 100.0

    logger.info("Generating mock data for %s profiles...", num_profiles)
    api_data = fetch_random_users(num_profiles)

    rows_to_write = []

    for data in api_data:
        user = User(data)

        # Random number of Identities: normal distribution (mean=8, std=5)
        # folded to non-negative. Clamp to at least 1 so that a draw that
        # truncates to 0 does not silently drop the user from the output.
        num_ids = max(1, abs(int(np.random.normal(8, 5))))

        # Users with more identities may be split over up to 3 profiles
        # (weighted towards 1).
        if num_ids > 4:
            num_profiles_for_user = random.choice([1, 1, 1, 2, 2, 3])
        else:
            num_profiles_for_user = 1

        profile_ids = [str(uuid.uuid4()) for _ in range(num_profiles_for_user)]
        # Captured before any typo is applied; shared by all of the user's profiles.
        profile_name = user.full_name

        # Distribute the Identity rows across the profiles in order.
        profile_idx = 0
        for i in range(num_ids):
            # Once the fraction of rows emitted passes this profile's share,
            # move on to the next profile.
            if (num_profiles_for_user > 1
                    and i / num_ids > (profile_idx + 1) / num_profiles_for_user):
                profile_idx += 1

            # Every row beyond the first has a chance to pick up a typo.
            if i > 0 and random.random() < typo_chance:
                user.add_typo(random.choice(typo_fields))

            row = {
                "profile_id": profile_ids[profile_idx],
                "profile_name": profile_name,
                "identity_id": str(uuid.uuid4()),
            }
            row.update((field, getattr(user, field)) for field in identity_fields)
            rows_to_write.append(row)

    logger.info("Writing %s rows to %s...", len(rows_to_write), output_file)

    with open(output_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows_to_write)

    logger.info("Finished writing CSV mock data.")
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
def build_arg_parser():
    """
    Build the command-line parser for the mock-data generator.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``--num_profiles`` (int,
        default 10), ``--typo_percentage`` (float, default 10.0) and
        ``--output_file`` (str, default "mock_data.csv").
    """
    parser = argparse.ArgumentParser(
        description="Generate mock entity-resolution data in CSV format."
    )
    parser.add_argument("--num_profiles", type=int, default=10,
                        help="Number of random 'users' to fetch from randomuser.me (default 10).")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; a bare "%" raises ValueError when help is rendered.
    parser.add_argument("--typo_percentage", type=float, default=10.0,
                        help="Chance (0..100) that each new Identity row (beyond the first) has a random typo (default 10%%).")
    parser.add_argument("--output_file", type=str, default="mock_data.csv",
                        help="Output CSV filename (default 'mock_data.csv').")
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()
    main(args.num_profiles, args.typo_percentage, args.output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/debug_upload.py
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
import pandas as pd
|
| 3 |
-
|
| 4 |
-
# Minimal page for checking that the Streamlit uploader hands over a CSV.
st.title("File Upload Debug Test")

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

# Always echo the raw uploader state, whether or not a file is present.
st.write("Debug Info:")
st.write(f"uploaded_file object: {uploaded_file}")
st.write(f"uploaded_file is None: {uploaded_file is None}")

if uploaded_file is None:
    st.warning("No file uploaded")
else:
    # Report the upload's metadata before attempting to parse it.
    st.success(f"File detected: {uploaded_file.name}")
    st.write(f"File size: {uploaded_file.size}")
    st.write(f"File type: {uploaded_file.type}")

    try:
        df = pd.read_csv(uploaded_file)
        st.success("CSV read successfully!")
        st.write(f"Shape: {df.shape}")
        st.dataframe(df.head())
    except Exception as exc:
        # Debug page: surface any parsing/rendering failure in the UI.
        st.error(f"Error reading CSV: {exc}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/requirements.txt
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
streamlit
|
| 2 |
-
pandas
|
| 3 |
-
numpy
|
| 4 |
-
jellyfish
|
| 5 |
-
st-link-analysis
|
| 6 |
-
networkx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/streamlit_app.py
DELETED
|
@@ -1,63 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import io
|
| 4 |
-
import uuid
|
| 5 |
-
import jellyfish
|
| 6 |
-
from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
|
| 7 |
-
import networkx as nx
|
| 8 |
-
|
| 9 |
-
# --- App Configuration ---
|
| 10 |
-
# --- App Configuration ---
# Page config is issued before any other Streamlit call.
st.set_page_config(
    page_title="Entity Resolution Network Graph",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.title("Entity Resolution on CSV (Network Graph)")

# --- Session State Initialization ---
# The loaded DataFrame lives in session state so it survives reruns.
if 'data_df' not in st.session_state:
    st.session_state['data_df'] = None

# --- Sidebar ---
sidebar = st.sidebar
sidebar.header("1. Load Data")

# File Uploader
uploaded_file = sidebar.file_uploader("Upload a CSV file", type=["csv"], key="file_uploader")

if uploaded_file is not None:
    try:
        st.session_state['data_df'] = pd.read_csv(uploaded_file)
        sidebar.success("File uploaded and processed!")
    except Exception as exc:
        # Show the failure in the sidebar and discard any previous data.
        sidebar.error(f"Error reading file: {exc}")
        st.session_state['data_df'] = None

sidebar.markdown("---")
sidebar.markdown("OR")

# Sample Data Button
if sidebar.button("Use Sample Data"):
    # Three pairs of near-duplicate people, one record per row.
    sample_records = [
        ('John', 'Smith', 'john.smith@email.com', '555-0123'),
        ('Jon', 'Smith', 'j.smith@gmail.com', '555-0123'),
        ('Jane', 'Doe', 'jane.doe@company.com', '555-0456'),
        ('Jain', 'Doe', 'jdoe@company.com', '(555) 456-0000'),
        ('Mike', 'Johnson', 'mike.j@work.com', '555-0789'),
        ('Michael', 'Johnson', 'michael.johnson@work.com', '5550789'),
    ]
    st.session_state['data_df'] = pd.DataFrame(
        sample_records,
        columns=['first_name', 'last_name', 'email_address', 'phone_number'],
    )
    sidebar.success("Sample data loaded!")

# --- Main App Logic ---
if st.session_state['data_df'] is None:
    st.info("Please upload a CSV file or use the sample data to get started.")
else:
    df = st.session_state['data_df']

    st.header("Data Preview")
    st.dataframe(df.head())

    # (Your existing entity resolution and network graph code would go here)
    # For now, let's just confirm data loading works.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|