Upload 9 files

- .gitattributes +1 -0
- Dockerfile.txt +47 -0
- Requirements.txt.txt +11 -0
- label_orders.json +44 -0
- run_job.py +188 -0
- slang_lexicon.json +0 -0
- start.sh.txt +66 -0
- sys7_miner_2.py +802 -0
- sys7_phrase_lexicons_desc_only.json +3 -0
- system7_lexicons.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sys7_phrase_lexicons_desc_only.json filter=lfs diff=lfs merge=lfs -text

Dockerfile.txt
ADDED
@@ -0,0 +1,47 @@
# Use the same PyTorch base image that worked for your clustering job
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime

# Install system basics
RUN apt-get update && \
    apt-get install -y wget ca-certificates git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Create non-root user and set up the persistent storage paths
# This ensures /data is writable, which is where your Parquet inputs/outputs will live
RUN useradd -m -u 1000 appuser && \
    mkdir -p /data/.cache && \
    mkdir -p /data/out && \
    mkdir -p /data/input && \
    chown -R appuser:appuser /data

# Set environment variables to force all Hugging Face caches to the persistent volume
ENV HF_HOME=/data/.cache \
    HF_HUB_CACHE=/data/.cache/hub \
    TRANSFORMERS_CACHE=/data/.cache/transformers \
    HF_DATASETS_CACHE=/data/.cache/datasets \
    SENTENCE_TRANSFORMERS_HOME=/data/.cache/sentence_transformers \
    TOKENIZERS_PARALLELISM=false \
    OMP_NUM_THREADS=1

# Copy your miner script and the required config JSONs
# Make sure you upload these JSONs to the Space files along with this Dockerfile!
COPY sys7_miner_2.py .
COPY sys7_miner.py .
COPY system7_lexicons.json .
COPY label_orders.json .
COPY slang_lexicon.json .
COPY sys7_phrase_lexicons_desc_only.json .

# Copy the runner script
COPY start.sh .
RUN chmod +x start.sh && chown -R appuser:appuser /app

USER appuser

CMD ["./start.sh"]

Requirements.txt.txt
ADDED
@@ -0,0 +1,11 @@
pandas
pyarrow
ftfy
langdetect
sentence-transformers
huggingface_hub
datasets
--extra-index-url https://download.pytorch.org/whl/cu124
torch==2.5.1
torchvision==0.20.1
torchaudio==2.5.1

label_orders.json
ADDED
@@ -0,0 +1,44 @@
{
  "tribe": [
    "BlueCollar",
    "Faith",
    "Gaming",
    "MomTok",
    "Pets",
    "Storytime",
    "SportsCombat",
    "MusicPopFandom",
    "TVStreaming",
    "Roleplay"
  ],
  "commercial": [
    "Product",
    "MediaShow",
    "EventService",
    "InfluencerBrand"
  ],
  "vibe": [
    "chaotic_funny",
    "confessional_personal",
    "educational_calm",
    "motivational_inspirational",
    "dramatic_romantic"
  ],
  "time": [
    "seasonal",
    "viral"
  ],
  "role": [
    "parent",
    "professional",
    "single",
    "student",
    "athlete",
    "creator"
  ],
  "format": [
    "storytime",
    "template_edit",
    "vlog"
  ]
}

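For reference, these label orders fix the position of every value in the *_scores_raw list columns that sys7_miner_2.py (further down in this upload) emits: lexicon weights are summed per label, normalized by log1p of the token count, and written out in exactly this order. A minimal self-contained sketch of that lookup, using a hypothetical toy lexicon rather than the real system7_lexicons.json:

import math
from collections import defaultdict

# Toy stand-ins for label_orders.json and system7_lexicons.json (values are hypothetical).
label_orders = {"tribe": ["Gaming", "MomTok", "Pets"]}
lexicons = {"tribe": {"controller": {"Gaming": 1.0}, "toddler": {"MomTok": 0.8}}}

def compute_raw_scores(tokens, lexicons, label_orders):
    # Same normalization as the miner: sum weights per label, divide by log1p(token count).
    scores = {dim: defaultdict(float) for dim in label_orders}
    for token in tokens:
        for dim, token_map in lexicons.items():
            for label, weight in token_map.get(token, {}).items():
                scores[dim][label] += weight
    norm = max(1.0, math.log1p(len(tokens)))
    return {dim: [scores[dim].get(label, 0.0) / norm for label in order]
            for dim, order in label_orders.items()}

print(compute_raw_scores(["my", "toddler", "loves", "this", "controller"], lexicons, label_orders))
# {'tribe': [0.558..., 0.446..., 0.0]} -- positions follow label_orders["tribe"]

Keeping the order in a standalone JSON lets downstream consumers map vector positions back to label names without parsing the full lexicon bundle.
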
run_job.py
ADDED
@@ -0,0 +1,188 @@
import gradio as gr
import os
import threading
import subprocess
import time
import shutil
from huggingface_hub import HfApi, hf_hub_download, create_repo

# --- CONFIG ---
INPUT_DATASET = "The-data-company/TikTok-10M"
# CHANGE THIS TO YOUR USERNAME!
OUTPUT_REPO = "Imaginethat/tiktok-sys7-mined-results"
HF_TOKEN = os.environ.get("HF_TOKEN")

# Global variables for the UI
log_buffer = "Initializing Environment...\n"
dataset_link_md = ""
mining_active = False

def log(msg):
    """Adds a message to the UI log window."""
    global log_buffer
    timestamp = time.strftime("%H:%M:%S")
    entry = f"[{timestamp}] {msg}"
    print(entry)  # Keep console logs as backup
    log_buffer += entry + "\n"

def run_mining_logic():
    global dataset_link_md, mining_active

    if not HF_TOKEN:
        log("ERROR: HF_TOKEN secret is missing in Settings!")
        return

    api = HfApi(token=HF_TOKEN)

    # 1. Setup Repo
    log(f"Setting up Output Repo: {OUTPUT_REPO}")
    try:
        create_repo(repo_id=OUTPUT_REPO, repo_type="dataset", exist_ok=True)
    except Exception as e:
        log(f"Repo setup note: {e}")

    # 2. List Files
    log(f"Scanning {INPUT_DATASET} for files...")
    try:
        all_files = api.list_repo_files(repo_id=INPUT_DATASET, repo_type="dataset")
        parquet_files = [f for f in all_files if f.endswith(".parquet") and "train" in f]
        log(f"Found {len(parquet_files)} shards to process.")
    except Exception as e:
        log(f"Failed to list files: {e}")
        return

    # 3. Process Shards
    input_dir = "./data_input"
    output_dir = "./data_output"

    for i, file_path in enumerate(parquet_files):
        log(f"--- Processing Shard {i+1}/{len(parquet_files)}: {file_path} ---")

        # Cleanup
        if os.path.exists(input_dir): shutil.rmtree(input_dir)
        if os.path.exists(output_dir): shutil.rmtree(output_dir)
        os.makedirs(input_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        # Download
        try:
            local_path = hf_hub_download(
                repo_id=INPUT_DATASET,
                filename=file_path,
                repo_type="dataset",
                local_dir=input_dir,
                token=HF_TOKEN
            )
        except Exception as e:
            log(f"Download failed: {e}")
            continue

        # Run Miner (streaming output to the UI)
        shard_output_name = f"sys7_features_part_{i:04d}.parquet"
        shard_output_path = os.path.join(output_dir, shard_output_name)

        cmd = [
            "python", "-u", "sys7_miner_2.py",  # -u forces unbuffered output
            "--input-path", input_dir,
            "--output-parquet", shard_output_path,
            "--lexicons", "system7_lexicons.json",
            "--label-orders", "label_orders.json",
            "--slang-lexicon", "slang_lexicon.json",
            "--phrase-lexicon", "sys7_phrase_lexicons_desc_only.json",
            "--auto-language-detect",
            "--language-allowlist", "en",
            "--min-chars", "10",
            "--min-tokens", "2",
            "--device", "cuda",
            "--batch-size", "50000",
            "--embedding-batch-size", "512",
            "--workers", "4"
        ]

        # Use Popen to capture logs in real time
        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1
            )

            # Read stdout line by line
            for line in process.stdout:
                if "Processed" in line or "Error" in line:  # Filter logs to keep the UI clean
                    log("MINER: " + line.strip())
                elif "Loading" in line:
                    log("MINER: " + line.strip())

            process.wait()

            if process.returncode != 0:
                log(f"Miner failed with code {process.returncode}")
                continue

        except Exception as e:
            log(f"Subprocess error: {e}")
            continue

        # Upload
        try:
            log(f"Uploading {shard_output_name}...")
            api.upload_file(
                path_or_fileobj=shard_output_path,
                path_in_repo=f"data/{shard_output_name}",
                repo_id=OUTPUT_REPO,
                repo_type="dataset"
            )
            log("Upload Successful!")
        except Exception as e:
            log(f"Upload failed: {e}")

    # 4. Finish
    final_url = f"https://huggingface.co/datasets/{OUTPUT_REPO}"
    log("ALL JOBS COMPLETE.")
    dataset_link_md = f"## 🎉 DONE! Data is ready: [Click Here to View Dataset]({final_url})"
    mining_active = False

def start_thread():
    global mining_active
    if mining_active:
        return "Already Running..."
    mining_active = True
    t = threading.Thread(target=run_mining_logic)
    t.start()
    return "Mining Started! Watch logs below."

# --- GRADIO UI ---
with gr.Blocks(title="TikTok Miner") as demo:
    gr.Markdown("# ⛏️ System 7 Miner Control")

    with gr.Row():
        start_btn = gr.Button("🚀 Start Mining Job", variant="primary")
        status_txt = gr.Textbox(label="Status", value="Ready to start.", interactive=False)

    # The Log Window
    logs_out = gr.TextArea(
        label="Live Process Logs",
        placeholder="Logs will stream here...",
        lines=20,
        max_lines=25,
        autoscroll=True
    )

    # The Success Link (hidden until done)
    link_display = gr.Markdown("")

    # Button Action
    start_btn.click(fn=start_thread, inputs=None, outputs=status_txt)

    # Auto-refresh the logs every second
    def update_ui():
        return log_buffer, dataset_link_md

    timer = gr.Timer(1)
    timer.tick(fn=update_ui, outputs=[logs_out, link_display])

# Launch on the port HF expects
demo.launch(server_name="0.0.0.0", server_port=7860)

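After a run, the uploaded shards can be spot-checked locally. A minimal sketch, assuming the OUTPUT_REPO value hard-coded above and that the first shard (data/sys7_features_part_0000.parquet) was uploaded:

import pandas as pd
from huggingface_hub import hf_hub_download

# Repo and shard name are assumptions taken from the script above.
path = hf_hub_download(
    repo_id="Imaginethat/tiktok-sys7-mined-results",
    filename="data/sys7_features_part_0000.parquet",
    repo_type="dataset",
)
df = pd.read_parquet(path)
print(df.shape)
print(df[["video_id", "language", "clean_text"]].head())
print(len(df["embed_vec"].iloc[0]))  # 384-dim vectors from all-MiniLM-L6-v2
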
slang_lexicon.json
ADDED
The diff for this file is too large to render. See raw diff.

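Although the slang lexicon diff is not rendered, prepare_slang_map in sys7_miner_2.py (below) accepts either a flat token-to-canonical dict or a list of {token, canonical} objects, so the file can take either shape. A hypothetical fragment purely to illustrate the format (these entries are not taken from the actual file):

# Both forms normalize to the same token -> canonical mapping in prepare_slang_map.
slang_as_dict = {"fyp": "for you page", "gamertok": "gaming", "pov": "point of view"}
slang_as_list = [
    {"token": "fyp", "canonical": "for you page"},
    {"token": "gamertok", "canonical": "gaming"},
    {"token": "pov", "canonical": "point of view"},
]
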
start.sh.txt
ADDED
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
set -euo pipefail

# 1. Ensure Persistent Cache is Writable
export HF_HOME="${HF_HOME:-/data/.cache}"
# Create subdirectories to be safe
for dir in "$HF_HOME/hub" "$HF_HOME/transformers" "$HF_HOME/datasets" "$HF_HOME/sentence_transformers" "/data/out"; do
    mkdir -p "$dir"
done

# 2. Cleanup Threading Issues
unset OMP_NUM_THREADS
export TOKENIZERS_PARALLELISM=false

# 3. Quick GPU Check
echo '=== Torch / CUDA check ==='
python - <<'PY'
import torch
print(f'torch version: {torch.__version__}')
print(f'torch.cuda.is_available(): {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'Device: {torch.cuda.get_device_name(0)}')
PY

# 4. Start Lightweight File Server (Port 7860)
# This keeps the Space "Healthy" and lets you browse /data/out to see if the parquet file exists.
PORT="${PORT:-7860}"
echo "Starting file server on port $PORT serving /data/out..."
cd /data/out && python3 -m http.server "$PORT" &
SERVER_PID=$!
trap 'kill $SERVER_PID' EXIT

# Return to app dir to run the miner
cd /app

# 5. Run System 7 Miner
# We assume input data is in /data/input. If not, you might need to download it here.
echo '=== Running System 7 Miner ==='
python sys7_miner_2.py \
    --input-path "${INPUT_PATH:-/data/input}" \
    --output-parquet "${OUTPUT_PATH:-/data/out/tiktok10m_sys7_features.parquet}" \
    --lexicons system7_lexicons.json \
    --label-orders label_orders.json \
    --slang-lexicon slang_lexicon.json \
    --phrase-lexicon sys7_phrase_lexicons_desc_only.json \
    --auto-language-detect \
    --language-allowlist en \
    --min-chars "${MIN_CHARS:-10}" \
    --min-tokens "${MIN_TOKENS:-2}" \
    --device cuda \
    --batch-size "${BATCH_SIZE:-50000}" \
    --embedding-batch-size "${EMBED_BATCH:-512}" \
    --metadata-path "/data/out/sys7_miner_metadata.json" \
    --diagnostics-path "/data/out/sys7_miner_diagnostics.json" \
    --workers 4

RESULT=$?

if [ $RESULT -eq 0 ]; then
    echo "Miner completed successfully."
else
    echo "Miner failed. Check logs."
fi

# Keep container alive so you can download files via the HTTP server
wait $SERVER_PID

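Because start.sh leaves the HTTP file server running after the miner finishes, the outputs under /data/out can also be inspected in place. A minimal sketch, assuming the default OUTPUT_PATH and diagnostics path used above:

import json
import pyarrow.parquet as pq

# Default paths from start.sh above.
pf = pq.ParquetFile("/data/out/tiktok10m_sys7_features.parquet")
print(pf.metadata.num_rows, "rows kept")
print(pf.schema_arrow.names)

with open("/data/out/sys7_miner_diagnostics.json", encoding="utf-8") as f:
    print(json.load(f))  # input_rows, kept_rows, filtered_language, filtered_short
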
sys7_miner_2.py
ADDED
@@ -0,0 +1,802 @@
"""
System 7 Miner
---------------
Module 1: Clean TikTok rows, apply lexicon-derived raw signals, and emit feature vectors.

Core outputs:
- tiktok10m_sys7_features.parquet
- optional sys7_miner_metadata.json

This script is designed to stream large CSVs in chunks (no 10M-row load),
normalize text with ftfy, compute raw lexical scores from System 6 lexicons,
and generate contextual embeddings via sentence-transformers.
"""

from __future__ import annotations

import argparse
import json
import logging
import math
import os
import re
import time
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds

try:
    from sentence_transformers import SentenceTransformer
except ImportError:  # pragma: no cover - handled at runtime
    SentenceTransformer = None  # type: ignore

try:
    import ftfy
except ImportError:  # pragma: no cover - handled at runtime
    ftfy = None  # type: ignore

try:
    from langdetect import detect as langdetect_detect
    from langdetect.detector_factory import DetectorFactory
except ImportError:  # pragma: no cover - optional dependency
    langdetect_detect = None  # type: ignore
else:  # pragma: no cover - runtime behavior
    # Make langdetect deterministic across runs/processes.
    DetectorFactory.seed = 0

logger = logging.getLogger("sys7_miner")


CONTROL_CHARS_RE = re.compile(r"[\u0000-\u0008\u000b\u000c\u000e-\u001f]")
REPLACEMENT_RUN_RE = re.compile("�{2,}")
WHITESPACE_RE = re.compile(r"\s+")
HASHTAG_SPLIT_RE = re.compile(r"[A-Z]?[a-z]+|[0-9]+")
TOKENIZER_RE = re.compile(r"[a-z0-9']+")


@dataclass
class MinerConfig:
    input_path: str
    output_parquet: str = "tiktok10m_sys7_features.parquet"
    lexicons: str = "system7_lexicons.json"
    phrase_lexicon: Optional[str] = None
    phrase_weight_scale: float = 1.0
    slang_lexicon: str = "slang_lexicon.json"
    label_orders: str = "label_orders.json"
    batch_size: int = 50_000
    text_cap: int = 1024
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_batch_size: int = 128
    device: str = "cpu"
    workers: int = 2
    metadata_path: Optional[str] = "sys7_miner_metadata.json"
    diagnostics_path: Optional[str] = "sys7_miner_diagnostics.json"
    min_chars: int = 10
    min_tokens: int = 2
    language_allowlist: Optional[List[str]] = None  # e.g., ["en"]
    auto_language_detect: bool = False
    emit_flat_scores: bool = False


def load_json(path: str) -> Dict:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def auto_detect_language(text: str) -> Optional[str]:
    """Best-effort language detection using langdetect, if available."""
    if not langdetect_detect:
        return None
    if not text:
        return None
    # langdetect cost scales with text length; captions + hashtags can be long/noisy.
    # Truncate to keep detection cheap and stable.
    if len(text) > 400:
        text = text[:400]
    try:
        return str(langdetect_detect(text))
    except Exception:
        return None


def detect_languages_parallel(texts: List[str], workers: int) -> List[Optional[str]]:
    """Detect languages for many texts, optionally using multiprocessing.

    langdetect is CPU-bound; using a ProcessPool sidesteps the GIL.
    """
    if not texts:
        return []

    workers = int(workers or 0)
    if workers <= 1 or not langdetect_detect:
        return [auto_detect_language(t) for t in texts]

    # Clamp to avoid spawning an excessive number of processes.
    max_workers = min(workers, os.cpu_count() or workers)
    chunksize = 256  # critical for many short strings
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        return list(ex.map(auto_detect_language, texts, chunksize=chunksize))


def is_nullish(val) -> bool:
    if val is None:
        return True
    if isinstance(val, float) and math.isnan(val):
        return True
    if isinstance(val, str) and val.strip() == "":
        return True
    return False


def coerce_id(val) -> Optional[str]:
    """Coerce IDs to strings (safe for very large integers)."""
    if is_nullish(val):
        return None
    if isinstance(val, (int, np.integer)):
        return str(int(val))
    if isinstance(val, (float, np.floating)):
        if math.isnan(float(val)):
            return None
        if float(val).is_integer():
            return str(int(val))
        return str(val)
    s = str(val).strip()
    if s.endswith(".0"):
        try:
            as_int = int(float(s))
            return str(as_int)
        except Exception:
            pass
    return s


def first_present(row, *names: str):
    for name in names:
        if hasattr(row, name):
            val = getattr(row, name, None)
            if not is_nullish(val):
                return val
    return None


def extract_hashtag_tags(raw) -> List[str]:
    """
    Extract a list of hashtag/challenge strings from a variety of formats:
    - JSON string list: '["tag1","tag2"]'
    - JSON string list of objects: '[{"title":"tag"}]'
    - whitespace/comma separated string
    - list/tuple of strings (or dicts)
    """
    if is_nullish(raw):
        return []
    if isinstance(raw, (list, tuple)):
        out: List[str] = []
        for item in raw:
            if is_nullish(item):
                continue
            if isinstance(item, str):
                out.append(item.strip())
                continue
            if isinstance(item, dict):
                for k in ("title", "name", "tag", "challenge", "challenge_name", "text"):
                    v = item.get(k)
                    if isinstance(v, str) and v.strip():
                        out.append(v.strip())
                        break
        return [t for t in out if t]
    if isinstance(raw, str):
        s = raw.strip()
        if not s:
            return []
        if s[:1] in ("[", "{"):
            try:
                obj = json.loads(s)
                if isinstance(obj, list):
                    return extract_hashtag_tags(obj)
                if isinstance(obj, dict):
                    for k in ("hashtags", "challenges", "tags"):
                        v = obj.get(k)
                        if v is not None:
                            return extract_hashtag_tags(v)
            except Exception:
                pass
        # Fallback: find #tags, else split on commas/whitespace
        tags = re.findall(r"#([A-Za-z0-9_]+)", s)
        if tags:
            return tags
        parts = [p.strip().strip('"').strip("'").strip("[](){}") for p in re.split(r"[,\s]+", s) if p.strip()]
        return [p.lstrip("#") for p in parts if p and p != "#"]
    return []


def prepare_slang_map(raw) -> Dict[str, str]:
    """
    Normalize slang lexicon to token -> canonical mapping.
    Accepts dict (already mapping) or list of objects with 'token' and optional 'canonical' keys.
    """
    mapping: Dict[str, str] = {}
    if isinstance(raw, dict):
        for k, v in raw.items():
            mapping[str(k).lower()] = str(v) if v is not None else str(k).lower()
        return mapping
    if isinstance(raw, list):
        for item in raw:
            if not isinstance(item, dict):
                continue
            token = item.get("token") or item.get("slang") or item.get("term")
            canonical = item.get("canonical") or item.get("value") or token
            if token:
                mapping[str(token).lower()] = str(canonical).lower()
    return mapping


def orient_lexicons(raw_lexicons: Dict[str, Dict]) -> Dict[str, Dict[str, Dict[str, float]]]:
    """
    Normalize lexicons into dim -> token -> {label: weight}.
    Accepts lexicons shaped as label -> token -> weight or weight objects.
    Ignores non-dict top-level entries (e.g., config/sources).
    """

    def to_weight(val):
        if isinstance(val, dict) and "weight" in val:
            return val["weight"]
        return val

    oriented: Dict[str, Dict[str, Dict[str, float]]] = {}
    for dim, labels in raw_lexicons.items():
        if not isinstance(labels, dict):
            continue  # skip config/metadata blocks
        token_map: Dict[str, Dict[str, float]] = defaultdict(dict)
        for label, token_weights in labels.items():
            if not isinstance(token_weights, dict):
                continue
            for token, weight in token_weights.items():
                w = to_weight(weight)
                try:
                    w_float = float(w)
                except (TypeError, ValueError):
                    continue
                token_map[token.lower()][label] = w_float
        oriented[dim] = token_map
    return oriented


def standardize_quotes(text: str) -> str:
    return (
        text.replace("“", '"')
        .replace("”", '"')
        .replace("‘", "'")
        .replace("’", "'")
        .replace("´", "'")
    )


def split_hashtag(tag: str) -> List[str]:
    parts = HASHTAG_SPLIT_RE.findall(tag)
    if parts:
        return [p.lower() for p in parts]
    return [tag.lower()]


def normalize_hashtags(raw: Optional[str], slang_map: Dict[str, str]) -> List[str]:
    if raw is None or (isinstance(raw, float) and math.isnan(raw)):
        return []
    tokens: List[str] = []
    if isinstance(raw, str):
        raw_parts = re.split(r"[,\s]+", raw)
    elif isinstance(raw, (list, tuple)):
        raw_parts = raw
    else:
        return tokens
    for part in raw_parts:
        if not part:
            continue
        cleaned = part.lstrip("#").strip()
        if not cleaned:
            continue
        for token in split_hashtag(cleaned):
            mapped = slang_map.get(token, token)
            if mapped:
                tokens.append(mapped)
    return tokens


def fuse_text(description: Optional[str], hashtags: Optional[str | Sequence[str]], slang_map: Dict[str, str], cap: int) -> Tuple[str, List[str]]:
    desc = "" if description is None or (isinstance(description, float) and math.isnan(description)) else str(description)
    ht_tokens = normalize_hashtags(hashtags, slang_map)
    hashtag_text = " ".join(ht_tokens)
    fused = " ".join(filter(None, [desc, hashtag_text])).lower()
    # if ftfy:
    #     fused = ftfy.fix_text(fused, normalization="NFC")
    fused = CONTROL_CHARS_RE.sub(" ", fused)
    fused = REPLACEMENT_RUN_RE.sub(" ", fused)
    fused = standardize_quotes(fused)
    fused = WHITESPACE_RE.sub(" ", fused).strip()
    if cap and len(fused) > cap:
        fused = fused[:cap]
    return fused, ht_tokens


def tokenize(text: str, hashtag_tokens: Sequence[str]) -> List[str]:
    base_tokens = TOKENIZER_RE.findall(text)
    tokens = [t for t in base_tokens if t]
    tokens.extend([t.lower() for t in hashtag_tokens if t])
    return tokens


def compute_raw_scores(tokens: Sequence[str], lexicons: Dict[str, Dict[str, Dict[str, float]]], label_orders: Dict[str, List[str]]) -> Dict[str, List[float]]:
    scores: Dict[str, Dict[str, float]] = {dim: defaultdict(float) for dim in label_orders}
    for token in tokens:
        for dim, token_map in lexicons.items():
            if token not in token_map:
                continue
            for label, weight in token_map[token].items():
                scores[dim][label] += weight
    norm = max(1.0, math.log1p(len(tokens)))
    vectorized: Dict[str, List[float]] = {}
    for dim, order in label_orders.items():
        vectorized[dim] = [scores[dim].get(label, 0.0) / norm for label in order]
    return vectorized


def load_embedder(model_name: str, device: str) -> SentenceTransformer:
    if SentenceTransformer is None:
        raise ImportError("sentence-transformers is required. Install via `pip install sentence-transformers`.")
    logger.info("Loading embedding model %s on device=%s", model_name, device)
    return SentenceTransformer(model_name, device=device)


def embed_texts(model: SentenceTransformer, texts: List[str], batch_size: int) -> List[List[float]]:
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    return embeddings.tolist()


def process_chunk(
    df: pd.DataFrame,
    lexicons: Dict[str, Dict[str, Dict[str, float]]],
    phrase_vocab: Optional[set[str]],
    phrase_ngram_sizes: Sequence[int],
    slang_map: Dict[str, str],
    label_orders: Dict[str, List[str]],
    embedder: SentenceTransformer,
    cfg: MinerConfig,
) -> Tuple[pa.Table, int, int]:
    # First pass: cheap per-row work + min-length filter.
    rows: List[object] = []
    descriptions: List[Optional[str]] = []
    hashtag_tags_list: List[List[str]] = []
    fused_texts: List[str] = []
    base_tokens_list: List[List[str]] = []
    langs: List[Optional[str]] = []
    needs_detect: List[bool] = []

    filtered_lang = 0
    filtered_short = 0

    for row in df.itertuples(index=False):
        description_val = first_present(row, "description", "desc")
        hashtags_raw = first_present(row, "hashtags", "challenges")
        hashtag_tags = extract_hashtag_tags(hashtags_raw)

        description = str(description_val) if description_val is not None else None
        fused_text, ht_tokens = fuse_text(description, hashtag_tags, slang_map, cfg.text_cap)
        base_tokens = tokenize(fused_text, ht_tokens)

        if len(fused_text) < cfg.min_chars or len(base_tokens) < cfg.min_tokens:
            filtered_short += 1
            continue

        lang_val = getattr(row, "language", None) or getattr(row, "lang", None)
        lang = None if is_nullish(lang_val) else str(lang_val).strip()
        lang_norm = lang.lower() if lang else ""
        need = bool(cfg.auto_language_detect and langdetect_detect and (not lang_norm or lang_norm in ("und", "unknown")))

        rows.append(row)
        descriptions.append(description)
        hashtag_tags_list.append(hashtag_tags)
        fused_texts.append(fused_text)
        base_tokens_list.append(base_tokens)
        langs.append(lang)
        needs_detect.append(need)

    # Multi-process language detection (CPU-bound).
    if any(needs_detect):
        detect_indices = [i for i, need in enumerate(needs_detect) if need]
        detect_texts = [(descriptions[i] or "") for i in detect_indices]
        t0 = time.perf_counter()
        detected_langs = detect_languages_parallel(detect_texts, cfg.workers)
        dt = time.perf_counter() - t0
        used_workers = min(max(int(cfg.workers or 0), 1), os.cpu_count() or max(int(cfg.workers or 0), 1))
        logger.info(
            "Langdetect: detected %s rows in %.2fs using workers=%s",
            len(detect_texts),
            dt,
            used_workers,
        )
        for idx, detected in zip(detect_indices, detected_langs):
            if detected:
                langs[idx] = detected

    # Second pass: apply allowlist + phrase matching + scoring + record building.
    records: List[Dict[str, object]] = []
    texts: List[str] = []

    for i, row in enumerate(rows):
        lang = langs[i]
        if cfg.language_allowlist:
            if not lang:
                filtered_lang += 1
                continue
            if str(lang).lower() not in cfg.language_allowlist:
                filtered_lang += 1
                continue

        description = descriptions[i]
        hashtag_tags = hashtag_tags_list[i]
        fused_text = fused_texts[i]

        tokens = base_tokens_list[i]
        # Add matched phrases as tokens (if provided). Phrase lexicons are mined from descriptions only,
        # so we match on cleaned description text (no hashtags). This runs only for rows that survive
        # length + language filters (for speed).
        if phrase_vocab and phrase_ngram_sizes and description:
            desc_clean, _ = fuse_text(description, None, slang_map, cfg.text_cap)
            desc_tokens = TOKENIZER_RE.findall(desc_clean)
            matched: set[str] = set()
            for n in phrase_ngram_sizes:
                if n <= 1:
                    continue
                for j in range(0, len(desc_tokens) - n + 1):
                    ph = " ".join(desc_tokens[j : j + n])
                    if ph in phrase_vocab:
                        matched.add(ph)
            if matched:
                tokens = tokens + list(matched)

        texts.append(fused_text)
        raw_scores = compute_raw_scores(tokens, lexicons, label_orders)
        created_raw = first_present(row, "created_at", "create_time", "createTime", "collected_time", "stats_time")
        created_date = None
        if created_raw:
            try:
                created_int = int(created_raw)
                created_date = pd.to_datetime(created_int, unit="s", utc=True).date().isoformat()
            except Exception:
                try:
                    created_date = str(pd.to_datetime(created_raw)).split(" ")[0]
                except Exception:
                    created_date = None
        video_id = coerce_id(first_present(row, "video_id", "aweme_id", "id"))
        author_id = coerce_id(first_present(row, "author_id", "user_id"))
        record: Dict[str, object] = {
            "video_id": video_id,
            "author_id": author_id,
            "created_at": str(created_raw) if created_raw is not None else None,
            "created_date": created_date,
            "language": lang,
            "description": description,
            "hashtags": hashtag_tags,
            "clean_text": fused_text,
            "short_text": len(fused_text) < max(cfg.min_chars, 20),
            "tribe_scores_raw": raw_scores.get("tribe", []),
            "vibe_scores_raw": raw_scores.get("vibe", []),
            "commercial_scores_raw": raw_scores.get("commercial", []),
            "role_scores_raw": raw_scores.get("role", []),
            "format_scores_raw": raw_scores.get("format", []),
            "time_scores_raw": raw_scores.get("time", []),
        }
        if cfg.emit_flat_scores:
            for dim, labels in label_orders.items():
                scores = raw_scores.get(dim, []) or []
                dim_max = float(max(scores)) if scores else 0.0
                if dim == "time":
                    record["time_overall"] = dim_max
                else:
                    record[f"{dim}_max"] = dim_max
                for label, val in zip(labels, scores):
                    record[f"{dim}_{label}"] = float(val)
        records.append(record)

    if not records:
        return pa.Table.from_pylist([]), filtered_lang, filtered_short

    embeddings = embed_texts(embedder, texts, cfg.embedding_batch_size)
    for rec, emb in zip(records, embeddings):
        rec["embed_vec"] = emb

    table = pa.Table.from_pylist(records)
    return table, filtered_lang, filtered_short


def write_metadata(cfg: MinerConfig, label_orders: Dict[str, List[str]], diagnostics: Optional[Dict[str, int]] = None) -> None:
    if not cfg.metadata_path:
        return
    meta = {
        "config": asdict(cfg),
        "label_orders": label_orders,
        "diagnostics": diagnostics or {},
    }
    with open(cfg.metadata_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    logger.info("Wrote metadata to %s", cfg.metadata_path)


def run(cfg: MinerConfig) -> None:
    logger.info("Starting sys7_miner with batch_size=%s", cfg.batch_size)
    # Prefer System 7 lexicons if configured; fall back to System 6 bundle if needed.
    lexicon_path = cfg.lexicons
    if not Path(lexicon_path).exists() and Path("system6_lexicons.json").exists():
        lexicon_path = "system6_lexicons.json"
        logger.info("Lexicon bundle %s not found; falling back to %s", cfg.lexicons, lexicon_path)
    else:
        logger.info("Using lexicon bundle %s", lexicon_path)
    raw_lexicons = load_json(str(lexicon_path))
    slang_map = prepare_slang_map(load_json(cfg.slang_lexicon))
    label_orders = load_json(cfg.label_orders)
    lexicons = orient_lexicons(raw_lexicons)

    phrase_vocab: Optional[set[str]] = None
    phrase_ngram_sizes: List[int] = []
    # Merge mined phrase lexicons into scoring (multi-word tokens).
    if cfg.phrase_lexicon and Path(cfg.phrase_lexicon).exists():
        try:
            phrase_bundle = load_json(cfg.phrase_lexicon)
            by_label = phrase_bundle.get("phrases_by_label")
            if not isinstance(by_label, dict):
                logger.warning(
                    "Phrase lexicon %s has no 'phrases_by_label'; it will not affect scoring. Re-run phraseminer.py to generate phrases_by_label.",
                    cfg.phrase_lexicon,
                )
            else:
                phrase_vocab = set()
                sizes = set()
                merged = 0
                for dim, label_map in by_label.items():
                    if dim not in label_orders:
                        continue
                    if not isinstance(label_map, dict):
                        continue
                    for label, phrases in label_map.items():
                        if label not in label_orders.get(dim, []):
                            continue
                        if not isinstance(phrases, dict):
                            continue
                        for phrase, w in phrases.items():
                            p = str(phrase).lower().strip()
                            if not p:
                                continue
                            try:
                                w_float = float(w) * float(cfg.phrase_weight_scale)
                            except Exception:
                                continue
                            lexicons.setdefault(dim, {}).setdefault(p, {})
                            lexicons[dim][p][label] = lexicons[dim][p].get(label, 0.0) + w_float
                            phrase_vocab.add(p)
                            sizes.add(len(p.split()))
                            merged += 1
                phrase_ngram_sizes = sorted(n for n in sizes if n >= 2)
                logger.info(
                    "Merged phrase lexicon into scoring: %s phrases (%s label assignments), ngram_sizes=%s",
                    len(phrase_vocab),
                    merged,
                    phrase_ngram_sizes,
                )
        except Exception as exc:  # pragma: no cover - defensive
            logger.warning("Failed to merge phrase lexicon into scoring: %s", exc)
    embedder = load_embedder(cfg.embedding_model, cfg.device)

    writer: Optional[pq.ParquetWriter] = None
    total_rows = 0
    kept_rows = 0
    filtered_lang = 0
    filtered_short = 0

    path_obj = Path(cfg.input_path)
    is_parquet = path_obj.is_dir() or path_obj.suffix.lower() == ".parquet"

    def write_table(table: pa.Table):
        nonlocal writer, kept_rows
        if table.num_rows == 0:
            return
        if writer is None:
            writer = pq.ParquetWriter(cfg.output_parquet, table.schema, compression="snappy")
        writer.write_table(table)
        kept_rows += table.num_rows

    if is_parquet:
        dataset = ds.dataset(cfg.input_path, format="parquet")
        available = list(dataset.schema.names)
        wanted = []
        for c in (
            "video_id",
            "aweme_id",
            "id",
            "author_id",
            "user_id",
            "created_at",
            "create_time",
            "createTime",
            "collected_time",
            "stats_time",
            "description",
            "desc",
            "hashtags",
            "challenges",
            "language",
            "lang",
        ):
            if c in available and c not in wanted:
                wanted.append(c)
        scanner = dataset.scanner(batch_size=cfg.batch_size, columns=wanted or None)
        for idx, batch in enumerate(scanner.to_batches()):
            df = batch.to_pandas()
            total_rows += len(df)
            table, fl, fs = process_chunk(df, lexicons, phrase_vocab, phrase_ngram_sizes, slang_map, label_orders, embedder, cfg)
            filtered_lang += fl
            filtered_short += fs
            write_table(table)
            logger.info("Processed parquet batch %s (input rows %s, kept total %s)", idx, len(df), kept_rows)
    else:
        header = pd.read_csv(cfg.input_path, nrows=0).columns.tolist()
        wanted = []
        for c in (
            "video_id",
            "aweme_id",
            "id",
            "author_id",
            "user_id",
            "created_at",
            "create_time",
            "createTime",
            "collected_time",
            "stats_time",
            "description",
            "desc",
            "hashtags",
            "challenges",
            "language",
            "lang",
        ):
            if c in header and c not in wanted:
                wanted.append(c)
        with pd.read_csv(cfg.input_path, chunksize=cfg.batch_size, usecols=wanted or None, dtype=str, keep_default_na=False) as reader:
            for idx, chunk in enumerate(reader):
                total_rows += len(chunk)
                table, fl, fs = process_chunk(chunk, lexicons, phrase_vocab, phrase_ngram_sizes, slang_map, label_orders, embedder, cfg)
                filtered_lang += fl
                filtered_short += fs
                write_table(table)
                logger.info("Processed CSV chunk %s (input rows %s, kept total %s)", idx, len(chunk), kept_rows)

    if writer:
        writer.close()
    diagnostics = {
        "input_rows": total_rows,
        "kept_rows": kept_rows,
        "filtered_language": filtered_lang,
        "filtered_short": filtered_short,
    }
    write_metadata(cfg, label_orders, diagnostics)
    if cfg.diagnostics_path:
        Path(cfg.diagnostics_path).write_text(json.dumps(diagnostics, indent=2), encoding="utf-8")
    logger.info(
        "Completed. Input rows: %s, kept: %s, filtered_language: %s, filtered_short: %s",
        total_rows,
        kept_rows,
        filtered_lang,
        filtered_short,
    )


def parse_args() -> MinerConfig:
    parser = argparse.ArgumentParser(description="System 7 Miner: clean + embed + score raw TikTok text.")
    parser.add_argument("--input-path", "--input", dest="input_path", required=True, help="Path to CSV or Parquet directory (e.g., Data/hf10m/Small)")
    parser.add_argument("--output-parquet", default="tiktok10m_sys7_features.parquet", help="Output parquet path")
    parser.add_argument(
        "--lexicons",
        default=None,
        help="Preferred lexicon bundle (default: system7_lexicons.json if present, else system6_lexicons.json).",
    )
    parser.add_argument(
        "--phrase-lexicon",
        default=None,
        help="Optional phrase lexicon JSON from phraseminer.py (uses phrases_by_label to score multi-word phrases).",
    )
    parser.add_argument(
        "--phrase-weight-scale",
        type=float,
        default=1.0,
        help="Scale factor applied to phrase lexicon weights when merged into scoring.",
    )
    parser.add_argument(
        "--system6-lexicons",
        default="system6_lexicons.json",
        help="Fallback System 6 lexicon bundle (used if system7_lexicons.json is missing).",
    )
    parser.add_argument("--slang-lexicon", default="slang_lexicon.json", help="Path to slang_lexicon.json")
    parser.add_argument("--label-orders", default="label_orders.json", help="Path to label_orders.json")
    parser.add_argument("--batch-size", type=int, default=50_000)
    parser.add_argument("--text-cap", type=int, default=1024)
    parser.add_argument("--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2")
    parser.add_argument("--embedding-batch-size", type=int, default=128)
    parser.add_argument("--device", default="cpu", help="Embedding device: cpu or cuda")
    parser.add_argument(
        "--workers",
        type=int,
        default=2,
        help="Number of worker processes for CPU-bound language detection when --auto-language-detect is set (default: 2).",
    )
    parser.add_argument("--metadata-path", default="sys7_miner_metadata.json")
    parser.add_argument("--diagnostics-path", default="sys7_miner_diagnostics.json")
    parser.add_argument("--min-chars", type=int, default=10, help="Minimum clean_text length to keep")
    parser.add_argument("--min-tokens", type=int, default=2, help="Minimum token count to keep")
    parser.add_argument(
        "--emit-flat-scores",
        action="store_true",
        help="If set, emit *_max and per-label scalar columns (e.g., tribe_MomTok) in addition to list score vectors.",
    )
    parser.add_argument(
        "--language-allowlist",
        default=None,
        help="Comma-separated list of language codes to keep (e.g., en,fr). "
        "Combined with optional --auto-language-detect to filter down to core languages.",
    )
    parser.add_argument(
        "--auto-language-detect",
        action="store_true",
        help="If set, auto-detect language for rows with missing/unknown language using langdetect, when available.",
    )
    args = parser.parse_args()
    lexicon_path = args.lexicons
    if not lexicon_path:
        # Default preference: system7_lexicons.json if present, else System 6 bundle.
        default_sys7 = Path("system7_lexicons.json")
        lexicon_path = str(default_sys7) if default_sys7.exists() else args.system6_lexicons
    return MinerConfig(
        input_path=args.input_path,
        output_parquet=args.output_parquet,
        lexicons=lexicon_path,
        phrase_lexicon=args.phrase_lexicon,
        phrase_weight_scale=float(args.phrase_weight_scale),
        slang_lexicon=args.slang_lexicon,
        label_orders=args.label_orders,
        batch_size=args.batch_size,
        text_cap=args.text_cap,
        embedding_model=args.embedding_model,
        embedding_batch_size=args.embedding_batch_size,
        device=args.device,
        workers=args.workers,
        metadata_path=args.metadata_path,
        diagnostics_path=args.diagnostics_path,
        min_chars=args.min_chars,
        min_tokens=args.min_tokens,
        language_allowlist=[s.strip().lower() for s in args.language_allowlist.split(",")] if args.language_allowlist else None,
        auto_language_detect=bool(args.auto_language_detect),
        emit_flat_scores=bool(args.emit_flat_scores),
    )


def main() -> None:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s - %(message)s",
    )
    cfg = parse_args()
    run(cfg)


if __name__ == "__main__":
    main()

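The cleaning and scoring helpers above can be exercised on their own: the heavy optional imports (sentence-transformers, ftfy, langdetect) are wrapped in try/except, so importing the module only requires pandas, numpy, and pyarrow. A small sketch with a hypothetical slang entry and toy lexicon weights, shown only to illustrate the text-to-score data flow:

from sys7_miner_2 import compute_raw_scores, fuse_text, tokenize

slang_map = {"fyp": "for you page"}  # hypothetical slang entry
lexicons = {"vibe": {"storytime": {"confessional_personal": 1.0}}}  # toy weights, not the real bundle
label_orders = {"vibe": ["chaotic_funny", "confessional_personal", "educational_calm",
                         "motivational_inspirational", "dramatic_romantic"]}

clean, ht_tokens = fuse_text("Storytime about my first day at work", ["#storytime", "#fyp"], slang_map, 1024)
tokens = tokenize(clean, ht_tokens)
print(clean)   # lowercased description fused with normalized hashtag tokens
print(compute_raw_scores(tokens, lexicons, label_orders)["vibe"])  # vector in label_orders["vibe"] order
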
sys7_phrase_lexicons_desc_only.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8a4384c606dfab526df5b30198fd14453022114852a022ea60027bfc6c5f9327
size 78998019

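Only the Git LFS pointer is shown above; the roughly 79 MB JSON it points at is what --phrase-lexicon consumes. Per the merge logic in run() above, only its phrases_by_label block matters, shaped dimension -> label -> phrase -> weight. A hypothetical fragment of that structure (phrases and weights invented for illustration):

# Hypothetical fragment showing the structure sys7_miner_2.py reads;
# real phrases and weights come from the mined file.
phrase_bundle = {
    "phrases_by_label": {
        "tribe": {
            "MomTok": {"toddler tantrum": 2.1, "mom life": 1.7},
            "Gaming": {"ranked match": 1.9},
        },
        "format": {
            "storytime": {"story time": 2.4},
        },
    }
}
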
system7_lexicons.json
ADDED
The diff for this file is too large to render. See raw diff.

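This bundle is likewise too large to render, but orient_lexicons in sys7_miner_2.py documents the shape it expects: dimension -> label -> token -> weight, where a weight may be a bare number or a {"weight": ...} object, and non-dict top-level entries are ignored. A hypothetical fragment (tokens and weights invented for illustration):

# Hypothetical fragment showing the layout orient_lexicons accepts;
# real tokens and weights live in system7_lexicons.json.
system7_lexicons = {
    "sources": "mined from earlier system runs",  # non-dict top-level entries are ignored
    "tribe": {
        "Gaming": {"controller": 1.0, "speedrun": {"weight": 1.5}},
        "Pets": {"rescue": 0.9},
    },
    "vibe": {
        "chaotic_funny": {"unhinged": 1.2},
    },
}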