ohmygaugh committed on
Commit
7634b73
·
1 Parent(s): 22ea7aa

Refactor: Align project with standard HF Docker structure

Browse files
Dockerfile CHANGED
@@ -2,20 +2,11 @@ FROM python:3.9-slim
2
 
3
  WORKDIR /app
4
 
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- software-properties-common \
9
- git \
10
- && rm -rf /var/lib/apt/lists/*
11
-
12
  COPY requirements.txt ./
13
- COPY src/ ./src/
14
 
15
- RUN pip3 install -r requirements.txt
16
 
17
  EXPOSE 8501
18
 
19
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
20
-
21
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
2
 
3
  WORKDIR /app
4
 
 
 
 
 
 
 
 
5
  COPY requirements.txt ./
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
 
8
+ COPY . .
9
 
10
  EXPOSE 8501
11
 
12
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- streamlit
2
  pandas
3
  numpy
4
  jellyfish
 
1
+ streamlit==1.33.0
2
  pandas
3
  numpy
4
  jellyfish
src/create_mock_CSV_data.py DELETED
@@ -1,297 +0,0 @@
1
- #!/usr/bin/env python3
2
-
3
- """
4
- create_mock_data_csv.py
5
-
6
- Fetches random user data from randomuser.me (or a similar service) and creates
7
- mock data in CSV format that imitates having multiple 'Profiles' and multiple
8
- 'Identity' rows. Each row in the CSV represents an Identity and includes:
9
- - a parent Profile ID and Profile name,
10
- - first_name, last_name, birth_year, etc.,
11
- - and possibly random typos in selected fields (based on a user-defined percentage).
12
-
13
- Usage example:
14
- python create_mock_data_csv.py --num_profiles=100 --typo_percentage=10 --output_file="mock_data.csv"
15
- """
16
-
17
- import requests
18
- import random
19
- import logging
20
- import argparse
21
- import csv
22
- import uuid
23
- import numpy as np
24
- # If you use the nicknames library: pip install nicknames
25
- # from nicknames import NickNamer
26
- # For demonstration, let's fallback gracefully if not installed.
27
-
28
- logging.basicConfig(level=logging.INFO)
29
- logger = logging.getLogger(__name__)
30
-
31
- try:
32
- from nicknames import NickNamer
33
- NICKNAMES_AVAILABLE = True
34
- except ImportError:
35
- NICKNAMES_AVAILABLE = False
36
- logger.warning("nicknames library is not installed. Nickname feature will be limited.")
37
-
38
-
39
- def fetch_random_users(num_profiles):
40
- """
41
- Fetch random user data from the randomuser.me API.
42
- Returns a list of user dicts with relevant attributes.
43
- """
44
- url = f"https://randomuser.me/api/?results={num_profiles}&nat=us"
45
- response = requests.get(url)
46
- response.raise_for_status()
47
- data = response.json()
48
- return data.get("results", [])
49
-
50
-
51
- class User:
52
- """
53
- Simple container for user data fetched from randomuser.me,
54
- plus logic for generating nicknames, emails, phone numbers,
55
- and introducing random typos.
56
- """
57
- def __init__(self, user_data):
58
- self.user_data = user_data
59
-
60
- # Extract basic info
61
- self.name_data = user_data.get("name", {})
62
- self.first_name = self.name_data.get("first", "Unknown")
63
- self.last_name = self.name_data.get("last", "Unknown")
64
- self.nickname = self._choose_nickname()
65
-
66
- dob = user_data.get("dob", {})
67
- self.birth_year = str(dob.get("date", "")[:4]) # 'YYYY-MM-DD...' -> 'YYYY'
68
-
69
- # Email address: random combination of first, last, year, etc.
70
- self.email_address = self._generate_email()
71
-
72
- # Phone number: just digits from the API phone.
73
- phone_raw = user_data.get("phone", "")
74
- self.phone_number = "".join(filter(str.isdigit, phone_raw))
75
-
76
- # Address fields
77
- location = user_data.get("location", {})
78
- self.street_number = str(location.get("street", {}).get("number", ""))
79
- self.street_name = location.get("street", {}).get("name", "")
80
- self.city = location.get("city", "")
81
- self.state = location.get("state", "")
82
- self.country = location.get("country", "")
83
- self.zip_code = str(location.get("postcode", ""))
84
-
85
- @property
86
- def full_name(self):
87
- return f"{self.first_name} {self.last_name}"
88
-
89
- @property
90
- def full_address(self):
91
- return f"{self.street_number} {self.street_name}, {self.city}, {self.state} {self.zip_code}"
92
-
93
- def _choose_nickname(self):
94
- """
95
- Uses the nicknames library if available, otherwise falls back to the first name.
96
- """
97
- if NICKNAMES_AVAILABLE:
98
- nn = NickNamer()
99
- possible_nicknames = nn.nicknames_of(self.first_name)
100
- if possible_nicknames:
101
- return random.choice(list(possible_nicknames))
102
- return self.first_name
103
-
104
- def _generate_email(self):
105
- domain = random.choice(["gmail", "yahoo", "hotmail", "outlook"])
106
- first_part = random.choice([self.first_name, self.nickname, self.first_name[:1]])
107
- last_part = random.choice([self.last_name, self.last_name[:1]])
108
- optional = random.choice(["", self.birth_year, self.birth_year[-2:], str(random.randint(1, 100))])
109
- return f"{first_part}{last_part}{optional}@{domain}.com".lower()
110
-
111
- def add_typo(self, property_name):
112
- """
113
- Introduce a random typo into the specified property (e.g. 'first_name').
114
- If property_name == 'full_address', we randomly pick an address field to modify.
115
- """
116
- if property_name == "full_address":
117
- property_name = random.choice(
118
- ["street_number", "street_name", "city", "state", "zip_code"]
119
- )
120
-
121
- current_value = getattr(self, property_name, None)
122
- if not current_value or not isinstance(current_value, str):
123
- return # If it's empty or not a string, skip
124
-
125
- original_value = current_value
126
- new_value = self._apply_random_typo(current_value)
127
- setattr(self, property_name, new_value)
128
- logger.debug(f"Applying typo: {property_name}: '{original_value}' -> '{new_value}'")
129
-
130
- def _apply_random_typo(self, text):
131
- """
132
- Introduce a random single-character error (delete, swap, insert, replace)
133
- or regenerate an email.
134
- """
135
- if not text:
136
- return text
137
-
138
- option = random.choice(["delete", "swap", "insert", "replace"])
139
-
140
- # If email, sometimes just regenerate the entire email.
141
- if "@" in text:
142
- # 1 in 3 chance we fully regenerate the email.
143
- if random.random() < 0.33:
144
- return self._generate_email()
145
-
146
- if len(text) == 1:
147
- # If we have only one character, we can only do replace or insert.
148
- option = random.choice(["insert", "replace"])
149
-
150
- index = random.randint(0, len(text) - 1)
151
-
152
- if option == "delete":
153
- # Remove 1 char
154
- return text[:index] + text[index+1:]
155
-
156
- elif option == "swap":
157
- # Swap with the next char if possible
158
- if index < len(text) - 1:
159
- # swap
160
- lst = list(text)
161
- lst[index], lst[index+1] = lst[index+1], lst[index]
162
- return "".join(lst)
163
- else:
164
- # fallback to replace if we can't swap
165
- letter = random.choice("abcdefghijklmnopqrstuvwxyz")
166
- return text[:index] + letter + text[index+1:]
167
-
168
- elif option == "insert":
169
- # Insert a random letter at index
170
- letter = random.choice("abcdefghijklmnopqrstuvwxyz")
171
- return text[:index] + letter + text[index:]
172
-
173
- elif option == "replace":
174
- letter = random.choice("abcdefghijklmnopqrstuvwxyz")
175
- return text[:index] + letter + text[index+1:]
176
-
177
- # Fallback: no change
178
- return text
179
-
180
-
181
- def main(num_profiles, typo_percentage, output_file):
182
- """
183
- 1) Fetch random user data from randomuser.me
184
- 2) For each user, create 1..N 'Profile' nodes
185
- 3) For each 'Profile', create 1..M 'Identities'
186
- 4) Introduce random typos in selected fields
187
- 5) Write all Identity rows to CSV, including their associated Profile info
188
- """
189
- logger.info(f"Generating mock data for {num_profiles} profiles...")
190
- api_data = fetch_random_users(num_profiles)
191
-
192
- rows_to_write = []
193
-
194
- # The number of identity nodes depends on random gaussian logic or your own preference
195
- # e.g. a normal distribution around 8 with std=5, clipped to positives
196
- # We'll keep the same approach from the original script.
197
- for data in api_data:
198
- user = User(data)
199
-
200
- # random number of Identities
201
- num_ids = abs(int(np.random.normal(8, 5))) # e.g. mean=8, std=5
202
-
203
- # pick how many distinct "Profile" nodes each user might produce
204
- # (in the original code, we used some logic to decide 1 or 2 or 3 profiles)
205
- if num_ids > 4:
206
- num_profiles_for_user = random.choice([1, 1, 1, 2, 2, 3])
207
- else:
208
- num_profiles_for_user = 1
209
-
210
- # Create the Profile IDs and store them
211
- profile_ids = [str(uuid.uuid4()) for _ in range(num_profiles_for_user)]
212
- profile_name = user.full_name # in the original script, we used the same name for each 'Profile'
213
-
214
- # We'll distribute the Identity rows across these profiles
215
- profile_idx = 0
216
-
217
- for i in range(num_ids):
218
- # If the fraction i/num_ids > fraction dividing the profiles,
219
- # move to next profile. (just a simple distribution approach)
220
- if num_profiles_for_user > 1:
221
- if i / num_ids > (profile_idx + 1) / num_profiles_for_user:
222
- profile_idx += 1
223
-
224
- current_profile_id = profile_ids[profile_idx]
225
-
226
- # Possibly apply a typo
227
- # For each new identity row (beyond the first?), there's a chance to add a typo
228
- if i > 0 and random.random() < (typo_percentage / 100.0):
229
- # choose a random field
230
- possible_fields = ["first_name", "last_name", "email_address",
231
- "phone_number", "full_address", "birth_year"]
232
- chosen_field = random.choice(possible_fields)
233
- user.add_typo(chosen_field)
234
-
235
- # Create a row for the Identity
236
- identity_id = str(uuid.uuid4())
237
- row = {
238
- "profile_id": current_profile_id,
239
- "profile_name": profile_name,
240
- "identity_id": identity_id,
241
- "first_name": user.first_name,
242
- "last_name": user.last_name,
243
- "nickname": user.nickname,
244
- "birth_year": user.birth_year,
245
- "email_address": user.email_address,
246
- "phone_number": user.phone_number,
247
- "street_number": user.street_number,
248
- "street_name": user.street_name,
249
- "city": user.city,
250
- "state": user.state,
251
- "country": user.country,
252
- "zip_code": user.zip_code
253
- }
254
- rows_to_write.append(row)
255
-
256
- # Now write the CSV
257
- fieldnames = [
258
- "profile_id",
259
- "profile_name",
260
- "identity_id",
261
- "first_name",
262
- "last_name",
263
- "nickname",
264
- "birth_year",
265
- "email_address",
266
- "phone_number",
267
- "street_number",
268
- "street_name",
269
- "city",
270
- "state",
271
- "country",
272
- "zip_code"
273
- ]
274
-
275
- logger.info(f"Writing {len(rows_to_write)} rows to {output_file}...")
276
-
277
- with open(output_file, mode="w", newline="", encoding="utf-8") as f:
278
- writer = csv.DictWriter(f, fieldnames=fieldnames)
279
- writer.writeheader()
280
- writer.writerows(rows_to_write)
281
-
282
- logger.info("Finished writing CSV mock data.")
283
-
284
-
285
- if __name__ == "__main__":
286
- parser = argparse.ArgumentParser(
287
- description="Generate mock entity-resolution data in CSV format."
288
- )
289
- parser.add_argument("--num_profiles", type=int, default=10,
290
- help="Number of random 'users' to fetch from randomuser.me (default 10).")
291
- parser.add_argument("--typo_percentage", type=float, default=10.0,
292
- help="Chance (0..100) that each new Identity row (beyond the first) has a random typo (default 10%).")
293
- parser.add_argument("--output_file", type=str, default="mock_data.csv",
294
- help="Output CSV filename (default 'mock_data.csv').")
295
-
296
- args = parser.parse_args()
297
- main(args.num_profiles, args.typo_percentage, args.output_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/debug_upload.py DELETED
@@ -1,26 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
-
4
- st.title("File Upload Debug Test")
5
-
6
- # Simple file uploader
7
- uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
8
-
9
- st.write("Debug Info:")
10
- st.write(f"uploaded_file object: {uploaded_file}")
11
- st.write(f"uploaded_file is None: {uploaded_file is None}")
12
-
13
- if uploaded_file is not None:
14
- st.success(f"File detected: {uploaded_file.name}")
15
- st.write(f"File size: {uploaded_file.size}")
16
- st.write(f"File type: {uploaded_file.type}")
17
-
18
- try:
19
- df = pd.read_csv(uploaded_file)
20
- st.success("CSV read successfully!")
21
- st.write(f"Shape: {df.shape}")
22
- st.dataframe(df.head())
23
- except Exception as e:
24
- st.error(f"Error reading CSV: {e}")
25
- else:
26
- st.warning("No file uploaded")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/requirements.txt DELETED
@@ -1,6 +0,0 @@
1
- streamlit
2
- pandas
3
- numpy
4
- jellyfish
5
- st-link-analysis
6
- networkx
 
 
 
 
 
 
 
src/streamlit_app.py DELETED
@@ -1,63 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import io
4
- import uuid
5
- import jellyfish
6
- from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
7
- import networkx as nx
8
-
9
- # --- App Configuration ---
10
- st.set_page_config(
11
- page_title="Entity Resolution Network Graph",
12
- layout="wide",
13
- initial_sidebar_state="expanded"
14
- )
15
- st.title("Entity Resolution on CSV (Network Graph)")
16
-
17
- # --- Session State Initialization ---
18
- if 'data_df' not in st.session_state:
19
- st.session_state.data_df = None
20
-
21
- # --- Sidebar ---
22
- st.sidebar.header("1. Load Data")
23
-
24
- # File Uploader
25
- uploaded_file = st.sidebar.file_uploader(
26
- "Upload a CSV file",
27
- type=["csv"],
28
- key="file_uploader"
29
- )
30
-
31
- if uploaded_file is not None:
32
- try:
33
- st.session_state.data_df = pd.read_csv(uploaded_file)
34
- st.sidebar.success("File uploaded and processed!")
35
- except Exception as e:
36
- st.sidebar.error(f"Error reading file: {e}")
37
- st.session_state.data_df = None
38
-
39
- st.sidebar.markdown("---")
40
- st.sidebar.markdown("OR")
41
-
42
- # Sample Data Button
43
- if st.sidebar.button("Use Sample Data"):
44
- st.session_state.data_df = pd.DataFrame({
45
- 'first_name': ['John', 'Jon', 'Jane', 'Jain', 'Mike', 'Michael'],
46
- 'last_name': ['Smith', 'Smith', 'Doe', 'Doe', 'Johnson', 'Johnson'],
47
- 'email_address': ['john.smith@email.com', 'j.smith@gmail.com', 'jane.doe@company.com', 'jdoe@company.com', 'mike.j@work.com', 'michael.johnson@work.com'],
48
- 'phone_number': ['555-0123', '555-0123', '555-0456', '(555) 456-0000', '555-0789', '5550789']
49
- })
50
- st.sidebar.success("Sample data loaded!")
51
-
52
- # --- Main App Logic ---
53
- if st.session_state.data_df is not None:
54
- df = st.session_state.data_df
55
-
56
- st.header("Data Preview")
57
- st.dataframe(df.head())
58
-
59
- # (Your existing entity resolution and network graph code would go here)
60
- # For now, let's just confirm data loading works.
61
-
62
- else:
63
- st.info("Please upload a CSV file or use the sample data to get started.")