Imaginethat committed on
Commit
90447dd
·
verified ·
1 Parent(s): 8f6b051

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sys7_phrase_lexicons_desc_only.json filter=lfs diff=lfs merge=lfs -text
Dockerfile.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the same PyTorch base image that worked for your clustering job
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime

# Install system basics
RUN apt-get update && \
    apt-get install -y wget ca-certificates git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
# NOTE(review): if requirements.txt pins a different torch build (e.g. cu124),
# this step replaces the torch that ships with this cu121 base image — confirm
# that is intended.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Create non-root user and setup the Persistent Storage paths
# This ensures /data is writable, which is where your Parquet inputs/outputs will live
RUN useradd -m -u 1000 appuser && \
    mkdir -p /data/.cache && \
    mkdir -p /data/out && \
    mkdir -p /data/input && \
    chown -R appuser:appuser /data

# Set environment variables to force all Hugging Face caches to the persistent volume
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers releases
# (HF_HOME already covers it) — harmless to keep, but may print a deprecation warning.
ENV HF_HOME=/data/.cache \
    HF_HUB_CACHE=/data/.cache/hub \
    TRANSFORMERS_CACHE=/data/.cache/transformers \
    HF_DATASETS_CACHE=/data/.cache/datasets \
    SENTENCE_TRANSFORMERS_HOME=/data/.cache/sentence_transformers \
    TOKENIZERS_PARALLELISM=false \
    OMP_NUM_THREADS=1

# Copy your Miner script and the required Config JSONs
# Make sure you upload these JSONs to the Space Files along with this Dockerfile!
COPY sys7_miner.py .
COPY system7_lexicons.json .
COPY label_orders.json .
COPY slang_lexicon.json .
# COPY sys7_phrase_lexicons_desc_only.json . <-- Uncomment if you use this

# Copy the runner script
COPY start.sh .
RUN chmod +x start.sh && chown -R appuser:appuser /app

USER appuser

CMD ["./start.sh"]
Requirements.txt.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Runtime dependencies for sys7_miner.py / run_job.py.
pandas
pyarrow
ftfy
langdetect
sentence-transformers
huggingface_hub
datasets
# NOTE(review): the Dockerfile base image already ships torch 2.3.0+cu121;
# the cu124 pins below replace it at build time — confirm the cu124 wheels
# are intended for the cudnn8/cu121 base image.
--extra-index-url https://download.pytorch.org/whl/cu124
torch==2.5.1
torchvision==0.20.1
torchaudio==2.5.1
label_orders.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tribe": [
3
+ "BlueCollar",
4
+ "Faith",
5
+ "Gaming",
6
+ "MomTok",
7
+ "Pets",
8
+ "Storytime",
9
+ "SportsCombat",
10
+ "MusicPopFandom",
11
+ "TVStreaming",
12
+ "Roleplay"
13
+ ],
14
+ "commercial": [
15
+ "Product",
16
+ "MediaShow",
17
+ "EventService",
18
+ "InfluencerBrand"
19
+ ],
20
+ "vibe": [
21
+ "chaotic_funny",
22
+ "confessional_personal",
23
+ "educational_calm",
24
+ "motivational_inspirational",
25
+ "dramatic_romantic"
26
+ ],
27
+ "time": [
28
+ "seasonal",
29
+ "viral"
30
+ ],
31
+ "role": [
32
+ "parent",
33
+ "professional",
34
+ "single",
35
+ "student",
36
+ "athlete",
37
+ "creator"
38
+ ],
39
+ "format": [
40
+ "storytime",
41
+ "template_edit",
42
+ "vlog"
43
+ ]
44
+ }
run_job.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import threading
4
+ import subprocess
5
+ import time
6
+ import shutil
7
+ from huggingface_hub import HfApi, hf_hub_download, create_repo
8
+
9
# --- CONFIG ---
INPUT_DATASET = "The-data-company/TikTok-10M"  # source dataset of parquet shards
# CHANGE THIS TO YOUR USERNAME!
OUTPUT_REPO = "Imaginethat/tiktok-sys7-mined-results"  # destination dataset repo
HF_TOKEN = os.environ.get("HF_TOKEN")  # Space secret; may be None if not configured

# Global variables for the UI
log_buffer = "Initializing Environment...\n"  # appended to by log(); polled by the UI timer
dataset_link_md = ""                          # set to a markdown link when all shards finish
mining_active = False                         # guards against launching a second mining thread
19
+
20
def log(msg):
    """Append a timestamped message to the shared UI log buffer."""
    global log_buffer
    stamped = f"[{time.strftime('%H:%M:%S')}] {msg}"
    # Echo to the container console too, as a backup of the UI logs.
    print(stamped)
    log_buffer = log_buffer + stamped + "\n"
27
+
28
def run_mining_logic():
    """Mine every parquet shard of INPUT_DATASET and upload results to OUTPUT_REPO.

    Intended to run in a background thread (see start_thread). Progress is
    reported only through module-level UI state: log() for text output,
    dataset_link_md for the final link, and mining_active as the run flag.
    Per-shard failures (download, miner, upload) are logged and skipped so one
    bad shard does not abort the whole run.
    """
    global dataset_link_md, mining_active

    try:
        if not HF_TOKEN:
            log("ERROR: HF_TOKEN secret is missing in Settings!")
            return

        api = HfApi(token=HF_TOKEN)

        # 1. Setup Repo
        log(f"Setting up Output Repo: {OUTPUT_REPO}")
        try:
            # Token passed explicitly for consistency with api / hf_hub_download.
            create_repo(repo_id=OUTPUT_REPO, repo_type="dataset", exist_ok=True, token=HF_TOKEN)
        except Exception as e:
            log(f"Repo setup note: {e}")

        # 2. List Files
        log(f"Scanning {INPUT_DATASET} for files...")
        try:
            all_files = api.list_repo_files(repo_id=INPUT_DATASET, repo_type="dataset")
            parquet_files = [f for f in all_files if f.endswith(".parquet") and "train" in f]
            log(f"Found {len(parquet_files)} shards to process.")
        except Exception as e:
            log(f"Failed to list files: {e}")
            return

        # 3. Process Shards one at a time so local disk usage stays bounded.
        input_dir = "./data_input"
        output_dir = "./data_output"

        for i, file_path in enumerate(parquet_files):
            log(f"--- Processing Shard {i+1}/{len(parquet_files)}: {file_path} ---")

            # Cleanup: start each shard from empty directories.
            if os.path.exists(input_dir): shutil.rmtree(input_dir)
            if os.path.exists(output_dir): shutil.rmtree(output_dir)
            os.makedirs(input_dir, exist_ok=True)
            os.makedirs(output_dir, exist_ok=True)

            # Download this shard; skip it (not abort the run) on failure.
            try:
                hf_hub_download(
                    repo_id=INPUT_DATASET,
                    filename=file_path,
                    repo_type="dataset",
                    local_dir=input_dir,
                    token=HF_TOKEN
                )
            except Exception as e:
                log(f"Download failed: {e}")
                continue

            # Run Miner (streaming its output to the UI log)
            shard_output_name = f"sys7_features_part_{i:04d}.parquet"
            shard_output_path = os.path.join(output_dir, shard_output_name)

            cmd = [
                "python", "-u", "sys7_miner.py",  # -u forces unbuffered output
                "--input-path", input_dir,
                "--output-parquet", shard_output_path,
                "--lexicons", "system7_lexicons.json",
                "--label-orders", "label_orders.json",
                "--slang-lexicon", "slang_lexicon.json",
                "--device", "cuda",
                "--batch-size", "50000",
                "--embedding-batch-size", "512",
                "--workers", "4"
            ]

            # Use Popen to capture logs in real-time
            try:
                process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,  # merge stderr so errors reach the UI
                    text=True,
                    bufsize=1  # line-buffered
                )

                # Forward only the interesting miner lines to keep the UI clean.
                for line in process.stdout:
                    if any(key in line for key in ("Processed", "Error", "Loading")):
                        log("MINER: " + line.strip())

                process.wait()

                if process.returncode != 0:
                    log(f"Miner failed with code {process.returncode}")
                    continue

            except Exception as e:
                log(f"Subprocess error: {e}")
                continue

            # Upload the shard's results; a failed upload does not stop the run.
            try:
                log(f"Uploading {shard_output_name}...")
                api.upload_file(
                    path_or_fileobj=shard_output_path,
                    path_in_repo=f"data/{shard_output_name}",
                    repo_id=OUTPUT_REPO,
                    repo_type="dataset"
                )
                log("Upload Successful!")
            except Exception as e:
                log(f"Upload failed: {e}")

        # 4. Finish
        final_url = f"https://huggingface.co/datasets/{OUTPUT_REPO}"
        log("ALL JOBS COMPLETE.")
        dataset_link_md = f"## 🎉 DONE! Data is ready: [Click Here to View Dataset]({final_url})"
    finally:
        # BUG FIX: the early error returns above previously left mining_active
        # stuck at True, permanently disabling the Start button in the UI.
        mining_active = False
142
+
143
def start_thread():
    """Launch the mining job in a background thread (at most one at a time).

    Returns:
        A short status string shown in the UI "Status" textbox.
    """
    global mining_active
    if mining_active:
        return "Already Running..."
    mining_active = True
    # daemon=True so a hung or long-running worker cannot block interpreter
    # shutdown when the Space is stopped or restarted.
    t = threading.Thread(target=run_mining_logic, daemon=True)
    t.start()
    return "Mining Started! Watch logs below."
151
+
152
# --- GRADIO UI ---
# Single-page control panel: a start button, a status box, a polled log window,
# and a markdown slot that receives the dataset link when mining finishes.
with gr.Blocks(title="TikTok Miner") as demo:
    gr.Markdown("# ⛏️ System 7 Miner Control")

    with gr.Row():
        start_btn = gr.Button("🚀 Start Mining Job", variant="primary")
        status_txt = gr.Textbox(label="Status", value="Ready to start.", interactive=False)

    # The Log Window (read-only; refreshed by the timer below)
    logs_out = gr.TextArea(
        label="Live Process Logs",
        placeholder="Logs will stream here...",
        lines=20,
        max_lines=25,
        autoscroll=True
    )

    # The Success Link (empty markdown until the job completes)
    link_display = gr.Markdown("")

    # Button Action: start_thread returns the status string for status_txt.
    start_btn.click(fn=start_thread, inputs=None, outputs=status_txt)

    # Auto-refresh the logs every 1 second
    def update_ui():
        # Re-read the module-level state mutated by the worker thread.
        return log_buffer, dataset_link_md

    timer = gr.Timer(1)
    timer.tick(fn=update_ui, outputs=[logs_out, link_display])

# Launch on the port HF expects
demo.launch(server_name="0.0.0.0", server_port=7860)
slang_lexicon.json ADDED
The diff for this file is too large to render. See raw diff
 
start.sh.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Container entrypoint: prepare caches, start a file server for /data/out,
# run the System 7 miner, then keep the container alive for downloads.
set -euo pipefail

# 1. Ensure Persistent Cache is Writable
export HF_HOME="${HF_HOME:-/data/.cache}"
# Create subdirectories to be safe
for dir in "$HF_HOME/hub" "$HF_HOME/transformers" "$HF_HOME/datasets" "$HF_HOME/sentence_transformers" "/data/out"; do
  mkdir -p "$dir"
done

# 2. Cleanup Threading Issues
unset OMP_NUM_THREADS
export TOKENIZERS_PARALLELISM=false

# 3. Quick GPU Check
echo '=== Torch / CUDA check ==='
python - <<'PY'
import torch
print(f'torch version: {torch.__version__}')
print(f'torch.cuda.is_available(): {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'Device: {torch.cuda.get_device_name(0)}')
PY

# 4. Start Lightweight File Server (Port 7860)
# This keeps the Space "Healthy" and lets you browse /data/out to see if the parquet file exists.
PORT="${PORT:-7860}"
echo "Starting file server on port $PORT serving /data/out..."
# The `cd` runs inside the backgrounded subshell, so the main shell stays in /app.
cd /data/out && python3 -m http.server "$PORT" &
SERVER_PID=$!
trap 'kill $SERVER_PID' EXIT

# Return to app dir to run the miner
cd /app

# 5. Run System 7 Miner
# We assume input data is in /data/input. If not, you might need to download it here.
# BUG FIX: with `set -e`, a non-zero miner exit previously killed the script
# before RESULT=$? could run, so the failure branch (and the keep-alive `wait`)
# was unreachable. An if/else keeps `set -e` from aborting on miner failure.
echo '=== Running System 7 Miner ==='
if python sys7_miner.py \
  --input-path "${INPUT_PATH:-/data/input}" \
  --output-parquet "${OUTPUT_PATH:-/data/out/tiktok10m_sys7_features.parquet}" \
  --lexicons system7_lexicons.json \
  --label-orders label_orders.json \
  --slang-lexicon slang_lexicon.json \
  --device cuda \
  --batch-size "${BATCH_SIZE:-50000}" \
  --embedding-batch-size "${EMBED_BATCH:-512}" \
  --metadata-path "/data/out/sys7_miner_metadata.json" \
  --diagnostics-path "/data/out/sys7_miner_diagnostics.json" \
  --workers 4; then
  echo "Miner completed successfully."
else
  echo "Miner failed. Check logs."
fi

# Keep container alive so you can download files via the HTTP server
wait $SERVER_PID
sys7_phrase_lexicons_desc_only.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a4384c606dfab526df5b30198fd14453022114852a022ea60027bfc6c5f9327
3
+ size 78998019
system7_lexicons.json ADDED
The diff for this file is too large to render. See raw diff