Upload 3 files
Browse files- anonymize_mlflow.py +53 -0
- mlruns.zip +3 -0
- restore_mlflow_paths.py +58 -0
anonymize_mlflow.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import glob
|
| 5 |
+
|
| 6 |
+
def anonymize_file(file_path):
    """Anonymize personal data in a single YAML-style text file, in place.

    The file is scanned line by line and at most one rule is applied per
    line (the first one whose key is found in the line):

    * ``artifact_uri:`` / ``artifact_location:`` -- everything between
      ``file://`` and the first ``/mlruns/`` is replaced by ``<placeholder>``.
    * ``user_id:`` -- the value is replaced by ``<user_placeholder>``.
    * ``data_dir:`` / ``csv_path:`` -- everything between the colon and the
      first ``/pdmx/`` is dropped, leaving a path that starts with ``pdmx/``.
      An opening quote directly before the path is kept, so a quoted value
      stays a balanced quoted value.

    The file is rewritten only if at least one line changed, and a message
    is printed in that case.

    Args:
        file_path: Path of the file to anonymize. Read and written as UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    modified = False
    for i, line in enumerate(lines):
        if 'artifact_uri:' in line or 'artifact_location:' in line:
            # Non-greedy match: stop at the first "/mlruns/" in the URI.
            new_line = re.sub(r'file://(.+?)(/mlruns/)', r'file://<placeholder>\2', line)
        elif 'user_id:' in line:
            # "." does not match "\n", so the line terminator is preserved.
            new_line = re.sub(r'user_id:\s*.+', 'user_id: <user_placeholder>', line)
        elif 'data_dir:' in line or 'csv_path:' in line:
            # Capture an optional opening quote and emit it again; otherwise
            # it would be removed together with the directory prefix and
            # leave a dangling closing quote at the end of the value.
            new_line = re.sub(r':\s*(["\']?).*?/pdmx/', r': \1pdmx/', line)
        else:
            continue

        if new_line != line:
            lines[i] = new_line
            modified = True

    if modified:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(lines)
        print(f"Anonymized: {file_path}")
|
| 36 |
+
|
| 37 |
+
def main():
    """Anonymize every matching file below the current working directory.

    Two passes are made, both using recursive globs relative to the
    current directory:

    1. every ``*.yaml`` file is passed to ``anonymize_file``;
    2. every ``tags/mlflow.user`` file is overwritten with the literal
       placeholder ``<user_placeholder>``.
    """
    # Find all relevant files recursively
    for file_path in glob.glob('**/*.yaml', recursive=True):
        anonymize_file(file_path)

    # Also anonymize mlflow.user tag files
    for file_path in glob.glob('**/tags/mlflow.user', recursive=True):
        with open(file_path, 'w', encoding='utf-8') as f:
            # No trailing newline: the whole file content is the tag value.
            # NOTE(review): MLflow's file store appears to read tag files
            # verbatim, so a "\n" would become part of the value -- confirm.
            f.write('<user_placeholder>')
        print(f"Anonymized: {file_path}")

    print("Anonymization complete.")


if __name__ == "__main__":
    main()
|
mlruns.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b7e943d91f708ea6291d208a3f97705c7851961fa3efecb456d95637f07e511
|
| 3 |
+
size 1558271080
|
restore_mlflow_paths.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import glob
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
def restore_file(file_path, absolute_path, user_id):
    """Replace anonymization placeholders in one file with concrete values.

    The file is scanned line by line and at most one rule is applied per
    line:

    * ``artifact_uri:`` / ``artifact_location:`` -- ``file://<placeholder>``
      directly followed by ``/mlruns/`` becomes ``file://<absolute_path>``
      followed by ``/mlruns/``.
    * ``user_id:`` lines containing ``<user_placeholder>`` -- the
      placeholder is replaced by ``user_id``.

    The file is rewritten only if at least one line changed, and a message
    is printed in that case.

    Args:
        file_path: Path of the file to restore. Read and written as UTF-8.
        absolute_path: Text inserted between ``file://`` and ``/mlruns/``,
            i.e. the directory that contains the ``mlruns`` folder.
        user_id: Value written in place of ``<user_placeholder>``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Callable replacements insert their return value literally. A string
    # replacement would be parsed as a template, so a backslash in the path
    # or user name (e.g. "DOMAIN\\user") would raise re.error or be read as
    # a group reference.
    def _restore_uri(match):
        return f'file://{absolute_path}{match.group(1)}'

    def _restore_user(_match):
        return f'user_id: {user_id}'

    modified = False
    for i, line in enumerate(lines):
        if 'artifact_uri:' in line or 'artifact_location:' in line:
            new_line = re.sub(r'file://<placeholder>(/mlruns/)', _restore_uri, line)
        elif 'user_id:' in line and '<user_placeholder>' in line:
            new_line = re.sub(r'user_id:\s*<user_placeholder>', _restore_user, line)
        else:
            continue

        if new_line != line:
            lines[i] = new_line
            modified = True

    if modified:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(lines)
        print(f"Restored: {file_path}")
|
| 29 |
+
|
| 30 |
+
def main():
    """Restore paths and user names below the current working directory.

    Expects exactly two command-line arguments, ``<absolute_path>`` and
    ``<user_id>``; otherwise a usage message is printed and the process
    exits with status 1.

    ``absolute_path`` is normalized to start with ``/`` and to have no
    trailing ``/``. It is inserted directly before ``/mlruns/`` in artifact
    URIs, so it should name the directory that contains the ``mlruns``
    folder.

    Two passes are then made, both using recursive globs relative to the
    current directory:

    1. every ``*.yaml`` file is passed to ``restore_file``;
    2. every ``tags/mlflow.user`` file is overwritten with ``user_id``.
    """
    if len(sys.argv) != 3:
        print("Usage: python restore_mlflow_paths.py <absolute_path> <user_id>")
        print("Example: python restore_mlflow_paths.py /home/user/path/to/mlruns hendrik-roth")
        sys.exit(1)

    absolute_path = sys.argv[1]
    user_id = sys.argv[2]
    # Ensure the path starts with / and does not end with /
    if not absolute_path.startswith('/'):
        absolute_path = '/' + absolute_path
    absolute_path = absolute_path.rstrip('/')

    # Find all yaml files recursively
    for file_path in glob.glob('**/*.yaml', recursive=True):
        restore_file(file_path, absolute_path, user_id)

    # Also restore mlflow.user tag files
    for file_path in glob.glob('**/tags/mlflow.user', recursive=True):
        with open(file_path, 'w', encoding='utf-8') as f:
            # No trailing newline: the whole file content is the tag value.
            # NOTE(review): MLflow's file store appears to read tag files
            # verbatim, so a "\n" would become part of the value -- confirm.
            f.write(user_id)
        print(f"Restored: {file_path}")

    print("Path restoration complete.")


if __name__ == "__main__":
    main()
|