Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,7 @@ HF_TOKEN = os.getenv("HF_TOKEN")
|
|
| 12 |
DATASET_ID = "Threatthriver/sys_log_dump_v4_shards"
|
| 13 |
BASE_PATH = "data/blobs"
|
| 14 |
MAX_FILES_PER_FOLDER = 100
|
| 15 |
-
START_HEX = "0x44" # The starting folder name
|
| 16 |
|
| 17 |
if not HF_TOKEN:
|
| 18 |
raise RuntimeError("CORE_FAILURE: HF_TOKEN is missing. Please add it to Space Secrets.")
|
|
@@ -25,8 +25,8 @@ api = HfApi(token=HF_TOKEN)
|
|
| 25 |
|
| 26 |
def get_target_path(repo_id, base_path):
|
| 27 |
"""
|
| 28 |
-
Scans the dataset to find the correct folder
|
| 29 |
-
|
| 30 |
"""
|
| 31 |
print("π [DAEMON] Scanning dataset state...")
|
| 32 |
|
|
@@ -34,51 +34,53 @@ def get_target_path(repo_id, base_path):
|
|
| 34 |
# 1. List all files in the dataset
|
| 35 |
all_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
|
| 36 |
|
| 37 |
-
# 2. Filter for files in our base_path
|
| 38 |
-
# We expect paths like: data/blobs/0x44/file.bin
|
| 39 |
blob_files = [f for f in all_files if f.startswith(base_path)]
|
| 40 |
|
| 41 |
# 3. Group counts by folder
|
| 42 |
folder_counts = {}
|
| 43 |
for f in blob_files:
|
| 44 |
parts = f.split('/')
|
| 45 |
-
# Expected
|
| 46 |
if len(parts) >= 4:
|
| 47 |
folder_name = parts[2]
|
| 48 |
folder_counts[folder_name] = folder_counts.get(folder_name, 0) + 1
|
| 49 |
|
| 50 |
# 4. Determine Active Folder
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
active_folder = START_HEX
|
| 53 |
current_count = 0
|
|
|
|
| 54 |
else:
|
| 55 |
-
#
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
else:
|
| 63 |
-
# Sort hex strings by converting to int
|
| 64 |
-
valid_folders.sort(key=lambda x: int(x, 16))
|
| 65 |
-
active_folder = valid_folders[-1] # Get the highest one (e.g., 0x45)
|
| 66 |
-
current_count = folder_counts[active_folder]
|
| 67 |
-
|
| 68 |
-
# 5. Apply "100 Limit" Logic
|
| 69 |
target_folder = active_folder
|
| 70 |
|
| 71 |
if current_count >= MAX_FILES_PER_FOLDER:
|
| 72 |
-
# Folder is full,
|
| 73 |
current_int = int(active_folder, 16)
|
| 74 |
new_int = current_int + 1
|
| 75 |
target_folder = hex(new_int)
|
| 76 |
-
print(f"β οΈ [DAEMON] Folder {active_folder} is full ({current_count}).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
else:
|
| 78 |
-
print(f"βΉοΈ [DAEMON] Using existing folder {
|
| 79 |
|
| 80 |
-
#
|
| 81 |
-
# Since we are putting multiple files in one folder, the filename MUST be unique.
|
| 82 |
unique_id = str(uuid.uuid4())[:8]
|
| 83 |
filename = f"sys_core_{unique_id}.bin"
|
| 84 |
|
|
@@ -86,9 +88,10 @@ def get_target_path(repo_id, base_path):
|
|
| 86 |
|
| 87 |
except Exception as e:
|
| 88 |
print(f"β [DAEMON] State check failed: {e}")
|
| 89 |
-
# Fallback
|
| 90 |
-
|
| 91 |
-
|
|
|
|
| 92 |
|
| 93 |
# ==========================================
|
| 94 |
# 3. CORE BUSINESS LOGIC (The Engine)
|
|
@@ -98,18 +101,18 @@ def upload_shard(file_obj):
|
|
| 98 |
"""
|
| 99 |
Allocates a binary file to the determined folder.
|
| 100 |
"""
|
| 101 |
-
# 1. Input Validation
|
| 102 |
if file_obj is None:
|
| 103 |
return "ERROR: No packet received."
|
| 104 |
|
| 105 |
-
#
|
| 106 |
target_path = get_target_path(DATASET_ID, BASE_PATH)
|
| 107 |
|
| 108 |
print(f"βοΈ [DAEMON] Processing Shard...")
|
| 109 |
print(f"π [DAEMON] Target Location: {target_path}")
|
| 110 |
|
| 111 |
try:
|
| 112 |
-
#
|
|
|
|
| 113 |
api.upload_file(
|
| 114 |
path_or_fileobj=file_obj,
|
| 115 |
path_in_repo=target_path,
|
|
@@ -118,7 +121,7 @@ def upload_shard(file_obj):
|
|
| 118 |
commit_message=f"daemon_sync: new shard entry"
|
| 119 |
)
|
| 120 |
|
| 121 |
-
#
|
| 122 |
data_stream_url = f"https://huggingface.co/datasets/{DATASET_ID}/resolve/main/{target_path}"
|
| 123 |
print(f"β
[DAEMON] Success: {data_stream_url}")
|
| 124 |
return data_stream_url
|
|
@@ -128,7 +131,6 @@ def upload_shard(file_obj):
|
|
| 128 |
print(f"β [DAEMON] {error_msg}")
|
| 129 |
return error_msg
|
| 130 |
|
| 131 |
-
|
| 132 |
# ==========================================
|
| 133 |
# 4. INTERFACE LAYER (The Bridge)
|
| 134 |
# ==========================================
|
|
@@ -137,7 +139,7 @@ def build_interface():
|
|
| 137 |
with gr.Blocks(title="System Sync Daemon") as demo:
|
| 138 |
|
| 139 |
gr.Markdown("## π SYSTEM SYNC DAEMON [SEQUENTIAL STORAGE]")
|
| 140 |
-
gr.Markdown(f"Bridges raw binary packets. Logic:
|
| 141 |
|
| 142 |
with gr.Row():
|
| 143 |
with gr.Column():
|
|
|
|
| 12 |
DATASET_ID = "Threatthriver/sys_log_dump_v4_shards"
|
| 13 |
BASE_PATH = "data/blobs"
|
| 14 |
MAX_FILES_PER_FOLDER = 100
|
| 15 |
+
START_HEX = "0x44" # The starting folder name
|
| 16 |
|
| 17 |
if not HF_TOKEN:
|
| 18 |
raise RuntimeError("CORE_FAILURE: HF_TOKEN is missing. Please add it to Space Secrets.")
|
|
|
|
| 25 |
|
| 26 |
def get_target_path(repo_id, base_path):
|
| 27 |
"""
|
| 28 |
+
Scans the dataset to find the correct folder.
|
| 29 |
+
SYSTEM UPGRADE: Automatically detects if a folder is missing and initializes it.
|
| 30 |
"""
|
| 31 |
print("π [DAEMON] Scanning dataset state...")
|
| 32 |
|
|
|
|
| 34 |
# 1. List all files in the dataset
|
| 35 |
all_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
|
| 36 |
|
| 37 |
+
# 2. Filter for files in our base_path
|
|
|
|
| 38 |
blob_files = [f for f in all_files if f.startswith(base_path)]
|
| 39 |
|
| 40 |
# 3. Group counts by folder
|
| 41 |
folder_counts = {}
|
| 42 |
for f in blob_files:
|
| 43 |
parts = f.split('/')
|
| 44 |
+
# Expected: ['data', 'blobs', '0x44', 'filename.bin']
|
| 45 |
if len(parts) >= 4:
|
| 46 |
folder_name = parts[2]
|
| 47 |
folder_counts[folder_name] = folder_counts.get(folder_name, 0) + 1
|
| 48 |
|
| 49 |
# 4. Determine Active Folder
|
| 50 |
+
valid_folders = [k for k in folder_counts.keys() if k.startswith("0x")]
|
| 51 |
+
|
| 52 |
+
if not valid_folders:
|
| 53 |
+
# Case A: Dataset is empty or no hex folders exist yet
|
| 54 |
active_folder = START_HEX
|
| 55 |
current_count = 0
|
| 56 |
+
is_existing_folder = False
|
| 57 |
else:
|
| 58 |
+
# Case B: Found existing folders, sort to find the latest
|
| 59 |
+
valid_folders.sort(key=lambda x: int(x, 16))
|
| 60 |
+
active_folder = valid_folders[-1]
|
| 61 |
+
current_count = folder_counts[active_folder]
|
| 62 |
+
is_existing_folder = True
|
| 63 |
+
|
| 64 |
+
# 5. Apply "Limit & Create" Logic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
target_folder = active_folder
|
| 66 |
|
| 67 |
if current_count >= MAX_FILES_PER_FOLDER:
|
| 68 |
+
# Folder is full, calculate next hex
|
| 69 |
current_int = int(active_folder, 16)
|
| 70 |
new_int = current_int + 1
|
| 71 |
target_folder = hex(new_int)
|
| 72 |
+
print(f"β οΈ [DAEMON] Folder {active_folder} is full ({current_count}). switching to {target_folder}")
|
| 73 |
+
|
| 74 |
+
# Since we switched, this is technically a 'new' folder until uploaded
|
| 75 |
+
is_existing_folder = False
|
| 76 |
+
|
| 77 |
+
# 6. [SYSTEM] Auto-Creation Check
|
| 78 |
+
if not is_existing_folder:
|
| 79 |
+
print(f"π [SYSTEM] Target folder '{target_folder}' not found. Auto-creating directory structure...")
|
| 80 |
else:
|
| 81 |
+
print(f"βΉοΈ [DAEMON] Using existing folder {target_folder} (Count: {current_count}/{MAX_FILES_PER_FOLDER})")
|
| 82 |
|
| 83 |
+
# 7. Generate Unique Filename
|
|
|
|
| 84 |
unique_id = str(uuid.uuid4())[:8]
|
| 85 |
filename = f"sys_core_{unique_id}.bin"
|
| 86 |
|
|
|
|
| 88 |
|
| 89 |
except Exception as e:
|
| 90 |
print(f"β [DAEMON] State check failed: {e}")
|
| 91 |
+
# Fallback ensures we don't crash, but still allows upload (which creates folders)
|
| 92 |
+
fallback_folder = f"fallback_{int(time.time())}"
|
| 93 |
+
print(f"β οΈ [SYSTEM] Initiating emergency folder creation: {fallback_folder}")
|
| 94 |
+
return f"{base_path}/{fallback_folder}/dump.bin"
|
| 95 |
|
| 96 |
# ==========================================
|
| 97 |
# 3. CORE BUSINESS LOGIC (The Engine)
|
|
|
|
| 101 |
"""
|
| 102 |
Allocates a binary file to the determined folder.
|
| 103 |
"""
|
|
|
|
| 104 |
if file_obj is None:
|
| 105 |
return "ERROR: No packet received."
|
| 106 |
|
| 107 |
+
# 1. Determine Path (This triggers the folder creation logic check)
|
| 108 |
target_path = get_target_path(DATASET_ID, BASE_PATH)
|
| 109 |
|
| 110 |
print(f"βοΈ [DAEMON] Processing Shard...")
|
| 111 |
print(f"π [DAEMON] Target Location: {target_path}")
|
| 112 |
|
| 113 |
try:
|
| 114 |
+
# 2. Execution (Upload)
|
| 115 |
+
# NOTE: upload_file AUTOMATICALLY creates the directory tree if it doesn't exist.
|
| 116 |
api.upload_file(
|
| 117 |
path_or_fileobj=file_obj,
|
| 118 |
path_in_repo=target_path,
|
|
|
|
| 121 |
commit_message=f"daemon_sync: new shard entry"
|
| 122 |
)
|
| 123 |
|
| 124 |
+
# 3. Result Construction
|
| 125 |
data_stream_url = f"https://huggingface.co/datasets/{DATASET_ID}/resolve/main/{target_path}"
|
| 126 |
print(f"β
[DAEMON] Success: {data_stream_url}")
|
| 127 |
return data_stream_url
|
|
|
|
| 131 |
print(f"β [DAEMON] {error_msg}")
|
| 132 |
return error_msg
|
| 133 |
|
|
|
|
| 134 |
# ==========================================
|
| 135 |
# 4. INTERFACE LAYER (The Bridge)
|
| 136 |
# ==========================================
|
|
|
|
| 139 |
with gr.Blocks(title="System Sync Daemon") as demo:
|
| 140 |
|
| 141 |
gr.Markdown("## π SYSTEM SYNC DAEMON [SEQUENTIAL STORAGE]")
|
| 142 |
+
gr.Markdown(f"Bridges raw binary packets. Logic: Auto-creates folders (e.g. {START_HEX}) when needed.")
|
| 143 |
|
| 144 |
with gr.Row():
|
| 145 |
with gr.Column():
|