Imaginethat committed on
Commit 96cfc9b · verified · 1 Parent(s): fc84163

Upload 9 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sys7_phrase_lexicons_desc_only.json filter=lfs diff=lfs merge=lfs -text
Dockerfile.txt ADDED
@@ -0,0 +1,47 @@
+ # Use the same PyTorch base image that worked for the clustering job
+ FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+
+ # Install system basics
+ RUN apt-get update && \
+     apt-get install -y wget ca-certificates git && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Install Python dependencies
+ COPY requirements.txt ./
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Create a non-root user and set up the persistent storage paths.
+ # This ensures /data is writable, which is where the Parquet inputs/outputs will live.
+ RUN useradd -m -u 1000 appuser && \
+     mkdir -p /data/.cache && \
+     mkdir -p /data/out && \
+     mkdir -p /data/input && \
+     chown -R appuser:appuser /data
+
+ # Set environment variables to force all Hugging Face caches onto the persistent volume
+ ENV HF_HOME=/data/.cache \
+     HF_HUB_CACHE=/data/.cache/hub \
+     TRANSFORMERS_CACHE=/data/.cache/transformers \
+     HF_DATASETS_CACHE=/data/.cache/datasets \
+     SENTENCE_TRANSFORMERS_HOME=/data/.cache/sentence_transformers \
+     TOKENIZERS_PARALLELISM=false \
+     OMP_NUM_THREADS=1
+
+ # Copy the miner scripts and the required config JSONs.
+ # Make sure these JSONs are uploaded to the Space files along with this Dockerfile!
+ COPY sys7_miner_2.py .
+ COPY sys7_miner.py .
+ COPY system7_lexicons.json .
+ COPY label_orders.json .
+ COPY slang_lexicon.json .
+ COPY sys7_phrase_lexicons_desc_only.json .
+
+ # Copy the runner script
+ COPY start.sh .
+ RUN chmod +x start.sh && chown -R appuser:appuser /app
+
+ USER appuser
+
+ CMD ["./start.sh"]
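
A quick way to confirm the cache redirection above actually works is a short Python check inside the container. This is a minimal sketch, assuming the /data volume is mounted; the model name matches the miner's default:

import os
from sentence_transformers import SentenceTransformer

# The ENV block points every Hugging Face cache at the persistent volume.
print(os.environ.get("HF_HOME"))                     # expected: /data/.cache
print(os.environ.get("SENTENCE_TRANSFORMERS_HOME"))  # expected: /data/.cache/sentence_transformers

# First load downloads into /data/.cache/...; later restarts reuse it.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print(model.get_sentence_embedding_dimension())      # 384 for all-MiniLM-L6-v2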
Requirements.txt.txt ADDED
@@ -0,0 +1,11 @@
+ pandas
+ pyarrow
+ ftfy
+ langdetect
+ sentence-transformers
+ huggingface_hub
+ datasets
+ --extra-index-url https://download.pytorch.org/whl/cu124
+ torch==2.5.1
+ torchvision==0.20.1
+ torchaudio==2.5.1
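
The --extra-index-url line tells pip to also consult the CUDA 12.4 wheel index when resolving the pinned torch packages. A minimal post-install check, assuming a CUDA-capable host:

import torch
print(torch.__version__)         # expected: 2.5.1+cu124
print(torch.version.cuda)        # expected: 12.4
print(torch.cuda.is_available())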
label_orders.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "tribe": [
+     "BlueCollar",
+     "Faith",
+     "Gaming",
+     "MomTok",
+     "Pets",
+     "Storytime",
+     "SportsCombat",
+     "MusicPopFandom",
+     "TVStreaming",
+     "Roleplay"
+   ],
+   "commercial": [
+     "Product",
+     "MediaShow",
+     "EventService",
+     "InfluencerBrand"
+   ],
+   "vibe": [
+     "chaotic_funny",
+     "confessional_personal",
+     "educational_calm",
+     "motivational_inspirational",
+     "dramatic_romantic"
+   ],
+   "time": [
+     "seasonal",
+     "viral"
+   ],
+   "role": [
+     "parent",
+     "professional",
+     "single",
+     "student",
+     "athlete",
+     "creator"
+   ],
+   "format": [
+     "storytime",
+     "template_edit",
+     "vlog"
+   ]
+ }
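
These orders fix the position of every label inside the *_scores_raw vectors that sys7_miner_2.py emits, so a score vector can be mapped back to labels by index. A minimal sketch (the example score vector is made up):

import json

with open("label_orders.json", encoding="utf-8") as f:
    label_orders = json.load(f)

# Hypothetical raw tribe vector for one row, in label_orders["tribe"] order.
tribe_scores = [0.0, 0.4, 1.2, 0.0, 0.0, 0.1, 0.0, 0.0, 0.3, 0.0]
by_label = dict(zip(label_orders["tribe"], tribe_scores))
print(max(by_label, key=by_label.get))  # "Gaming" for this made-up vector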
run_job.py ADDED
@@ -0,0 +1,188 @@
+ import gradio as gr
+ import os
+ import threading
+ import subprocess
+ import time
+ import shutil
+ from huggingface_hub import HfApi, hf_hub_download, create_repo
+
+ # --- CONFIG ---
+ INPUT_DATASET = "The-data-company/TikTok-10M"
+ # CHANGE THIS TO YOUR USERNAME!
+ OUTPUT_REPO = "Imaginethat/tiktok-sys7-mined-results"
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ # Global variables for the UI
+ log_buffer = "Initializing Environment...\n"
+ dataset_link_md = ""
+ mining_active = False
+
+ def log(msg):
+     """Append a timestamped message to the UI log window."""
+     global log_buffer
+     timestamp = time.strftime("%H:%M:%S")
+     entry = f"[{timestamp}] {msg}"
+     print(entry)  # Keep console logs as a backup
+     log_buffer += entry + "\n"
+
+ def run_mining_logic():
+     global dataset_link_md, mining_active
+
+     if not HF_TOKEN:
+         log("ERROR: HF_TOKEN secret is missing in Settings!")
+         return
+
+     api = HfApi(token=HF_TOKEN)
+
+     # 1. Set up the output repo
+     log(f"Setting up Output Repo: {OUTPUT_REPO}")
+     try:
+         create_repo(repo_id=OUTPUT_REPO, repo_type="dataset", exist_ok=True, token=HF_TOKEN)
+     except Exception as e:
+         log(f"Repo setup note: {e}")
+
+     # 2. List files
+     log(f"Scanning {INPUT_DATASET} for files...")
+     try:
+         all_files = api.list_repo_files(repo_id=INPUT_DATASET, repo_type="dataset")
+         parquet_files = [f for f in all_files if f.endswith(".parquet") and "train" in f]
+         log(f"Found {len(parquet_files)} shards to process.")
+     except Exception as e:
+         log(f"Failed to list files: {e}")
+         return
+
+     # 3. Process shards
+     input_dir = "./data_input"
+     output_dir = "./data_output"
+
+     for i, file_path in enumerate(parquet_files):
+         log(f"--- Processing Shard {i+1}/{len(parquet_files)}: {file_path} ---")
+
+         # Cleanup
+         if os.path.exists(input_dir): shutil.rmtree(input_dir)
+         if os.path.exists(output_dir): shutil.rmtree(output_dir)
+         os.makedirs(input_dir, exist_ok=True)
+         os.makedirs(output_dir, exist_ok=True)
+
+         # Download
+         try:
+             local_path = hf_hub_download(
+                 repo_id=INPUT_DATASET,
+                 filename=file_path,
+                 repo_type="dataset",
+                 local_dir=input_dir,
+                 token=HF_TOKEN
+             )
+         except Exception as e:
+             log(f"Download failed: {e}")
+             continue
+
+         # Run the miner, streaming output to the UI
+         shard_output_name = f"sys7_features_part_{i:04d}.parquet"
+         shard_output_path = os.path.join(output_dir, shard_output_name)
+
+         cmd = [
+             "python", "-u", "sys7_miner_2.py",  # -u forces unbuffered output
+             "--input-path", input_dir,
+             "--output-parquet", shard_output_path,
+             "--lexicons", "system7_lexicons.json",
+             "--label-orders", "label_orders.json",
+             "--slang-lexicon", "slang_lexicon.json",
+             "--phrase-lexicon", "sys7_phrase_lexicons_desc_only.json",
+             "--auto-language-detect",
+             "--language-allowlist", "en",
+             "--min-chars", "10",
+             "--min-tokens", "2",
+             "--device", "cuda",
+             "--batch-size", "50000",
+             "--embedding-batch-size", "512",
+             "--workers", "4"
+         ]
+
+         # Use Popen to capture logs in real time
+         try:
+             process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.STDOUT,
+                 text=True,
+                 bufsize=1
+             )
+
+             # Read stdout line by line, forwarding only the interesting
+             # lines so the UI log stays readable.
+             for line in process.stdout:
+                 if any(key in line for key in ("Processed", "Error", "Loading")):
+                     log("MINER: " + line.strip())
+
+             process.wait()
+
+             if process.returncode != 0:
+                 log(f"Miner failed with code {process.returncode}")
+                 continue
+
+         except Exception as e:
+             log(f"Subprocess error: {e}")
+             continue
+
+         # Upload
+         try:
+             log(f"Uploading {shard_output_name}...")
+             api.upload_file(
+                 path_or_fileobj=shard_output_path,
+                 path_in_repo=f"data/{shard_output_name}",
+                 repo_id=OUTPUT_REPO,
+                 repo_type="dataset"
+             )
+             log("Upload successful!")
+         except Exception as e:
+             log(f"Upload failed: {e}")
+
+     # 4. Finish
+     final_url = f"https://huggingface.co/datasets/{OUTPUT_REPO}"
+     log("ALL JOBS COMPLETE.")
+     dataset_link_md = f"## 🎉 DONE! Data is ready: [Click Here to View Dataset]({final_url})"
+     mining_active = False
+
+ def start_thread():
+     global mining_active
+     if mining_active:
+         return "Already Running..."
+     mining_active = True
+     t = threading.Thread(target=run_mining_logic)
+     t.start()
+     return "Mining Started! Watch logs below."
+
+ # --- GRADIO UI ---
+ with gr.Blocks(title="TikTok Miner") as demo:
+     gr.Markdown("# ⛏️ System 7 Miner Control")
+
+     with gr.Row():
+         start_btn = gr.Button("🚀 Start Mining Job", variant="primary")
+         status_txt = gr.Textbox(label="Status", value="Ready to start.", interactive=False)
+
+     # The log window
+     logs_out = gr.TextArea(
+         label="Live Process Logs",
+         placeholder="Logs will stream here...",
+         lines=20,
+         max_lines=25,
+         autoscroll=True
+     )
+
+     # The success link (hidden until done)
+     link_display = gr.Markdown("")
+
+     # Button action
+     start_btn.click(fn=start_thread, inputs=None, outputs=status_txt)
+
+     # Auto-refresh the logs every second
+     def update_ui():
+         return log_buffer, dataset_link_md
+
+     timer = gr.Timer(1)
+     timer.tick(fn=update_ui, outputs=[logs_out, link_display])
+
+ # Launch on the port HF expects
+ demo.launch(server_name="0.0.0.0", server_port=7860)
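
Once shards land under data/ in the output repo, they can be pulled back down and spot-checked. A minimal sketch using huggingface_hub and pandas, with the repo name as configured above (authentication follows your local token setup):

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

api = HfApi()
files = api.list_repo_files(repo_id="Imaginethat/tiktok-sys7-mined-results", repo_type="dataset")
shards = sorted(f for f in files if f.startswith("data/") and f.endswith(".parquet"))
print(f"{len(shards)} shards uploaded")

# Spot-check the first shard.
path = hf_hub_download(repo_id="Imaginethat/tiktok-sys7-mined-results", filename=shards[0], repo_type="dataset")
df = pd.read_parquet(path)
print(df[["video_id", "language", "clean_text"]].head())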
slang_lexicon.json ADDED
The diff for this file is too large to render. See raw diff
 
start.sh.txt ADDED
@@ -0,0 +1,66 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ # 1. Ensure the persistent cache is writable
+ export HF_HOME="${HF_HOME:-/data/.cache}"
+ # Create subdirectories to be safe
+ for dir in "$HF_HOME/hub" "$HF_HOME/transformers" "$HF_HOME/datasets" "$HF_HOME/sentence_transformers" "/data/out"; do
+     mkdir -p "$dir"
+ done
+
+ # 2. Clean up threading issues
+ unset OMP_NUM_THREADS
+ export TOKENIZERS_PARALLELISM=false
+
+ # 3. Quick GPU check
+ echo '=== Torch / CUDA check ==='
+ python - <<'PY'
+ import torch
+ print(f'torch version: {torch.__version__}')
+ print(f'torch.cuda.is_available(): {torch.cuda.is_available()}')
+ if torch.cuda.is_available():
+     print(f'Device: {torch.cuda.get_device_name(0)}')
+ PY
+
+ # 4. Start a lightweight file server (port 7860).
+ # This keeps the Space "Healthy" and lets you browse /data/out to see if the parquet file exists.
+ PORT="${PORT:-7860}"
+ echo "Starting file server on port $PORT serving /data/out..."
+ cd /data/out && python3 -m http.server "$PORT" &
+ SERVER_PID=$!
+ trap 'kill $SERVER_PID' EXIT
+
+ # Return to the app dir to run the miner
+ cd /app
+
+ # 5. Run the System 7 miner.
+ # Input data is assumed to be in /data/input; if it is not there, download it first.
+ # Guard the call with `|| RESULT=$?` so `set -e` does not abort before we can report the exit code.
+ echo '=== Running System 7 Miner ==='
+ RESULT=0
+ python sys7_miner_2.py \
+     --input-path "${INPUT_PATH:-/data/input}" \
+     --output-parquet "${OUTPUT_PATH:-/data/out/tiktok10m_sys7_features.parquet}" \
+     --lexicons system7_lexicons.json \
+     --label-orders label_orders.json \
+     --slang-lexicon slang_lexicon.json \
+     --phrase-lexicon sys7_phrase_lexicons_desc_only.json \
+     --auto-language-detect \
+     --language-allowlist en \
+     --min-chars "${MIN_CHARS:-10}" \
+     --min-tokens "${MIN_TOKENS:-2}" \
+     --device cuda \
+     --batch-size "${BATCH_SIZE:-50000}" \
+     --embedding-batch-size "${EMBED_BATCH:-512}" \
+     --metadata-path "/data/out/sys7_miner_metadata.json" \
+     --diagnostics-path "/data/out/sys7_miner_diagnostics.json" \
+     --workers 4 || RESULT=$?
+
+ if [ $RESULT -eq 0 ]; then
+     echo "Miner completed successfully."
+ else
+     echo "Miner failed. Check logs."
+ fi
+
+ # Keep the container alive so files can be downloaded via the HTTP server
+ wait $SERVER_PID
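
After a run, /data/out holds the feature parquet plus the metadata and diagnostics JSONs passed above; the diagnostics make it easy to sanity-check the keep rate. A minimal sketch, with paths as configured in this script:

import json

with open("/data/out/sys7_miner_diagnostics.json", encoding="utf-8") as f:
    diag = json.load(f)

kept, total = diag["kept_rows"], diag["input_rows"]
print(f"kept {kept}/{total} rows ({kept / max(total, 1):.1%})")
print(f"filtered_language={diag['filtered_language']} filtered_short={diag['filtered_short']}")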
sys7_miner_2.py ADDED
@@ -0,0 +1,802 @@
+ """
+ System 7 Miner
+ ---------------
+ Module 1: Clean TikTok rows, apply lexicon-derived raw signals, and emit feature vectors.
+
+ Core outputs:
+ - tiktok10m_sys7_features.parquet
+ - optional sys7_miner_metadata.json
+
+ This script is designed to stream large CSVs in chunks (no 10M-row load),
+ normalize text with ftfy, compute raw lexical scores from System 6 lexicons,
+ and generate contextual embeddings via sentence-transformers.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import logging
+ import math
+ import os
+ import re
+ import time
+ from collections import defaultdict
+ from concurrent.futures import ProcessPoolExecutor
+ from dataclasses import asdict, dataclass
+ from pathlib import Path
+ from typing import Dict, List, Optional, Sequence, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ import pyarrow.dataset as ds
+
+ try:
+     from sentence_transformers import SentenceTransformer
+ except ImportError:  # pragma: no cover - handled at runtime
+     SentenceTransformer = None  # type: ignore
+
+ try:
+     import ftfy
+ except ImportError:  # pragma: no cover - handled at runtime
+     ftfy = None  # type: ignore
+
+ try:
+     from langdetect import detect as langdetect_detect
+     from langdetect.detector_factory import DetectorFactory
+ except ImportError:  # pragma: no cover - optional dependency
+     langdetect_detect = None  # type: ignore
+ else:  # pragma: no cover - runtime behavior
+     # Make langdetect deterministic across runs/processes.
+     DetectorFactory.seed = 0
+
+ logger = logging.getLogger("sys7_miner")
+
+
+ CONTROL_CHARS_RE = re.compile(r"[\u0000-\u0008\u000b\u000c\u000e-\u001f]")
+ REPLACEMENT_RUN_RE = re.compile("�{2,}")
+ WHITESPACE_RE = re.compile(r"\s+")
+ HASHTAG_SPLIT_RE = re.compile(r"[A-Z]?[a-z]+|[0-9]+")
+ TOKENIZER_RE = re.compile(r"[a-z0-9']+")
+
+
+ @dataclass
+ class MinerConfig:
+     input_path: str
+     output_parquet: str = "tiktok10m_sys7_features.parquet"
+     lexicons: str = "system7_lexicons.json"
+     phrase_lexicon: Optional[str] = None
+     phrase_weight_scale: float = 1.0
+     slang_lexicon: str = "slang_lexicon.json"
+     label_orders: str = "label_orders.json"
+     batch_size: int = 50_000
+     text_cap: int = 1024
+     embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
+     embedding_batch_size: int = 128
+     device: str = "cpu"
+     workers: int = 2
+     metadata_path: Optional[str] = "sys7_miner_metadata.json"
+     diagnostics_path: Optional[str] = "sys7_miner_diagnostics.json"
+     min_chars: int = 10
+     min_tokens: int = 2
+     language_allowlist: Optional[List[str]] = None  # e.g., ["en"]
+     auto_language_detect: bool = False
+     emit_flat_scores: bool = False
+
+
+ def load_json(path: str) -> Dict:
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def auto_detect_language(text: str) -> Optional[str]:
+     """Best-effort language detection using langdetect, if available."""
+     if not langdetect_detect:
+         return None
+     if not text:
+         return None
+     # langdetect cost scales with text length; captions + hashtags can be long/noisy.
+     # Truncate to keep detection cheap and stable.
+     if len(text) > 400:
+         text = text[:400]
+     try:
+         return str(langdetect_detect(text))
+     except Exception:
+         return None
+
+
+ def detect_languages_parallel(texts: List[str], workers: int) -> List[Optional[str]]:
+     """Detect languages for many texts, optionally using multiprocessing.
+
+     langdetect is CPU-bound; using a ProcessPool sidesteps the GIL.
+     """
+     if not texts:
+         return []
+
+     workers = int(workers or 0)
+     if workers <= 1 or not langdetect_detect:
+         return [auto_detect_language(t) for t in texts]
+
+     # Clamp to avoid spawning an excessive number of processes.
+     max_workers = min(workers, os.cpu_count() or workers)
+     chunksize = 256  # critical for many short strings
+     with ProcessPoolExecutor(max_workers=max_workers) as ex:
+         return list(ex.map(auto_detect_language, texts, chunksize=chunksize))
+
+
+ def is_nullish(val) -> bool:
+     if val is None:
+         return True
+     if isinstance(val, float) and math.isnan(val):
+         return True
+     if isinstance(val, str) and val.strip() == "":
+         return True
+     return False
+
+
+ def coerce_id(val) -> Optional[str]:
+     """Coerce IDs to strings (safe for very large integers)."""
+     if is_nullish(val):
+         return None
+     if isinstance(val, (int, np.integer)):
+         return str(int(val))
+     if isinstance(val, (float, np.floating)):
+         if math.isnan(float(val)):
+             return None
+         if float(val).is_integer():
+             return str(int(val))
+         return str(val)
+     s = str(val).strip()
+     if s.endswith(".0"):
+         try:
+             as_int = int(float(s))
+             return str(as_int)
+         except Exception:
+             pass
+     return s
+
+
+ def first_present(row, *names: str):
+     for name in names:
+         if hasattr(row, name):
+             val = getattr(row, name, None)
+             if not is_nullish(val):
+                 return val
+     return None
+
+
+ def extract_hashtag_tags(raw) -> List[str]:
+     """
+     Extract a list of hashtag/challenge strings from a variety of formats:
+     - JSON string list: '["tag1","tag2"]'
+     - JSON string list of objects: '[{"title":"tag"}]'
+     - whitespace/comma separated string
+     - list/tuple of strings (or dicts)
+     """
+     if is_nullish(raw):
+         return []
+     if isinstance(raw, (list, tuple)):
+         out: List[str] = []
+         for item in raw:
+             if is_nullish(item):
+                 continue
+             if isinstance(item, str):
+                 out.append(item.strip())
+                 continue
+             if isinstance(item, dict):
+                 for k in ("title", "name", "tag", "challenge", "challenge_name", "text"):
+                     v = item.get(k)
+                     if isinstance(v, str) and v.strip():
+                         out.append(v.strip())
+                         break
+         return [t for t in out if t]
+     if isinstance(raw, str):
+         s = raw.strip()
+         if not s:
+             return []
+         if s[:1] in ("[", "{"):
+             try:
+                 obj = json.loads(s)
+                 if isinstance(obj, list):
+                     return extract_hashtag_tags(obj)
+                 if isinstance(obj, dict):
+                     for k in ("hashtags", "challenges", "tags"):
+                         v = obj.get(k)
+                         if v is not None:
+                             return extract_hashtag_tags(v)
+             except Exception:
+                 pass
+         # Fallback: find #tags, else split on commas/whitespace
+         tags = re.findall(r"#([A-Za-z0-9_]+)", s)
+         if tags:
+             return tags
+         parts = [p.strip().strip('"').strip("'").strip("[](){}") for p in re.split(r"[,\s]+", s) if p.strip()]
+         return [p.lstrip("#") for p in parts if p and p != "#"]
+     return []
+
+
+ def prepare_slang_map(raw) -> Dict[str, str]:
+     """
+     Normalize slang lexicon to token -> canonical mapping.
+     Accepts dict (already mapping) or list of objects with 'token' and optional 'canonical' keys.
+     """
+     mapping: Dict[str, str] = {}
+     if isinstance(raw, dict):
+         for k, v in raw.items():
+             mapping[str(k).lower()] = str(v) if v is not None else str(k).lower()
+         return mapping
+     if isinstance(raw, list):
+         for item in raw:
+             if not isinstance(item, dict):
+                 continue
+             token = item.get("token") or item.get("slang") or item.get("term")
+             canonical = item.get("canonical") or item.get("value") or token
+             if token:
+                 mapping[str(token).lower()] = str(canonical).lower()
+     # Lists and unrecognized shapes fall through to whatever was collected (possibly empty).
+     return mapping
+
+
+ def orient_lexicons(raw_lexicons: Dict[str, Dict]) -> Dict[str, Dict[str, Dict[str, float]]]:
+     """
+     Normalize lexicons into dim -> token -> {label: weight}.
+     Accepts lexicons shaped as label -> token -> weight or weight objects.
+     Ignores non-dict top-level entries (e.g., config/sources).
+     """
+
+     def to_weight(val):
+         if isinstance(val, dict) and "weight" in val:
+             return val["weight"]
+         return val
+
+     oriented: Dict[str, Dict[str, Dict[str, float]]] = {}
+     for dim, labels in raw_lexicons.items():
+         if not isinstance(labels, dict):
+             continue  # skip config/metadata blocks
+         token_map: Dict[str, Dict[str, float]] = defaultdict(dict)
+         for label, token_weights in labels.items():
+             if not isinstance(token_weights, dict):
+                 continue
+             for token, weight in token_weights.items():
+                 w = to_weight(weight)
+                 try:
+                     w_float = float(w)
+                 except (TypeError, ValueError):
+                     continue
+                 token_map[token.lower()][label] = w_float
+         oriented[dim] = token_map
+     return oriented
+
+
+ def standardize_quotes(text: str) -> str:
+     return (
+         text.replace("“", '"')
+         .replace("”", '"')
+         .replace("‘", "'")
+         .replace("’", "'")
+         .replace("´", "'")
+     )
+
+
+ def split_hashtag(tag: str) -> List[str]:
+     parts = HASHTAG_SPLIT_RE.findall(tag)
+     if parts:
+         return [p.lower() for p in parts]
+     return [tag.lower()]
+
+
+ def normalize_hashtags(raw: Optional[str], slang_map: Dict[str, str]) -> List[str]:
+     if raw is None or (isinstance(raw, float) and math.isnan(raw)):
+         return []
+     tokens: List[str] = []
+     if isinstance(raw, str):
+         raw_parts = re.split(r"[,\s]+", raw)
+     elif isinstance(raw, (list, tuple)):
+         raw_parts = raw
+     else:
+         return tokens
+     for part in raw_parts:
+         if not part:
+             continue
+         cleaned = part.lstrip("#").strip()
+         if not cleaned:
+             continue
+         for token in split_hashtag(cleaned):
+             mapped = slang_map.get(token, token)
+             if mapped:
+                 tokens.append(mapped)
+     return tokens
+
+
+ def fuse_text(description: Optional[str], hashtags: Optional[str | Sequence[str]], slang_map: Dict[str, str], cap: int) -> Tuple[str, List[str]]:
+     desc = "" if description is None or (isinstance(description, float) and math.isnan(description)) else str(description)
+     ht_tokens = normalize_hashtags(hashtags, slang_map)
+     hashtag_text = " ".join(ht_tokens)
+     fused = " ".join(filter(None, [desc, hashtag_text])).lower()
+     # ftfy normalization is disabled here for throughput; re-enable if inputs contain mojibake:
+     # if ftfy:
+     #     fused = ftfy.fix_text(fused, normalization="NFC")
+     fused = CONTROL_CHARS_RE.sub(" ", fused)
+     fused = REPLACEMENT_RUN_RE.sub(" ", fused)
+     fused = standardize_quotes(fused)
+     fused = WHITESPACE_RE.sub(" ", fused).strip()
+     if cap and len(fused) > cap:
+         fused = fused[:cap]
+     return fused, ht_tokens
+
+
+ def tokenize(text: str, hashtag_tokens: Sequence[str]) -> List[str]:
+     base_tokens = TOKENIZER_RE.findall(text)
+     tokens = [t for t in base_tokens if t]
+     tokens.extend([t.lower() for t in hashtag_tokens if t])
+     return tokens
+
+
+ def compute_raw_scores(tokens: Sequence[str], lexicons: Dict[str, Dict[str, Dict[str, float]]], label_orders: Dict[str, List[str]]) -> Dict[str, List[float]]:
+     scores: Dict[str, Dict[str, float]] = {dim: defaultdict(float) for dim in label_orders}
+     for token in tokens:
+         for dim, token_map in lexicons.items():
+             if dim not in scores or token not in token_map:
+                 continue  # ignore lexicon dims without a configured label order
+             for label, weight in token_map[token].items():
+                 scores[dim][label] += weight
+     norm = max(1.0, math.log1p(len(tokens)))
+     vectorized: Dict[str, List[float]] = {}
+     for dim, order in label_orders.items():
+         vectorized[dim] = [scores[dim].get(label, 0.0) / norm for label in order]
+     return vectorized
+
+
+ def load_embedder(model_name: str, device: str) -> SentenceTransformer:
+     if SentenceTransformer is None:
+         raise ImportError("sentence-transformers is required. Install via `pip install sentence-transformers`.")
+     logger.info("Loading embedding model %s on device=%s", model_name, device)
+     return SentenceTransformer(model_name, device=device)
+
+
+ def embed_texts(model: SentenceTransformer, texts: List[str], batch_size: int) -> List[List[float]]:
+     embeddings = model.encode(
+         texts,
+         batch_size=batch_size,
+         show_progress_bar=False,
+         convert_to_numpy=True,
+     )
+     return embeddings.tolist()
+
+
+ def process_chunk(
+     df: pd.DataFrame,
+     lexicons: Dict[str, Dict[str, Dict[str, float]]],
+     phrase_vocab: Optional[set[str]],
+     phrase_ngram_sizes: Sequence[int],
+     slang_map: Dict[str, str],
+     label_orders: Dict[str, List[str]],
+     embedder: SentenceTransformer,
+     cfg: MinerConfig,
+ ) -> Tuple[pa.Table, int, int]:
+     # First pass: cheap per-row work + min-length filter.
+     rows: List[object] = []
+     descriptions: List[Optional[str]] = []
+     hashtag_tags_list: List[List[str]] = []
+     fused_texts: List[str] = []
+     base_tokens_list: List[List[str]] = []
+     langs: List[Optional[str]] = []
+     needs_detect: List[bool] = []
+
+     filtered_lang = 0
+     filtered_short = 0
+
+     for row in df.itertuples(index=False):
+         description_val = first_present(row, "description", "desc")
+         hashtags_raw = first_present(row, "hashtags", "challenges")
+         hashtag_tags = extract_hashtag_tags(hashtags_raw)
+
+         description = str(description_val) if description_val is not None else None
+         fused_text, ht_tokens = fuse_text(description, hashtag_tags, slang_map, cfg.text_cap)
+         base_tokens = tokenize(fused_text, ht_tokens)
+
+         if len(fused_text) < cfg.min_chars or len(base_tokens) < cfg.min_tokens:
+             filtered_short += 1
+             continue
+
+         lang_val = getattr(row, "language", None) or getattr(row, "lang", None)
+         lang = None if is_nullish(lang_val) else str(lang_val).strip()
+         lang_norm = lang.lower() if lang else ""
+         need = bool(cfg.auto_language_detect and langdetect_detect and (not lang_norm or lang_norm in ("und", "unknown")))
+
+         rows.append(row)
+         descriptions.append(description)
+         hashtag_tags_list.append(hashtag_tags)
+         fused_texts.append(fused_text)
+         base_tokens_list.append(base_tokens)
+         langs.append(lang)
+         needs_detect.append(need)
+
+     # Multi-process language detection (CPU-bound).
+     if any(needs_detect):
+         detect_indices = [i for i, need in enumerate(needs_detect) if need]
+         detect_texts = [(descriptions[i] or "") for i in detect_indices]
+         t0 = time.perf_counter()
+         detected_langs = detect_languages_parallel(detect_texts, cfg.workers)
+         dt = time.perf_counter() - t0
+         used_workers = min(max(int(cfg.workers or 0), 1), os.cpu_count() or max(int(cfg.workers or 0), 1))
+         logger.info(
+             "Langdetect: detected %s rows in %.2fs using workers=%s",
+             len(detect_texts),
+             dt,
+             used_workers,
+         )
+         for idx, detected in zip(detect_indices, detected_langs):
+             if detected:
+                 langs[idx] = detected
+
+     # Second pass: apply allowlist + phrase matching + scoring + record building.
+     records: List[Dict[str, object]] = []
+     texts: List[str] = []
+
+     for i, row in enumerate(rows):
+         lang = langs[i]
+         if cfg.language_allowlist:
+             if not lang:
+                 filtered_lang += 1
+                 continue
+             if str(lang).lower() not in cfg.language_allowlist:
+                 filtered_lang += 1
+                 continue
+
+         description = descriptions[i]
+         hashtag_tags = hashtag_tags_list[i]
+         fused_text = fused_texts[i]
+
+         tokens = base_tokens_list[i]
+         # Add matched phrases as tokens (if provided). Phrase lexicons are mined from descriptions only,
+         # so we match on cleaned description text (no hashtags). This runs only for rows that survive
+         # length + language filters (for speed).
+         if phrase_vocab and phrase_ngram_sizes and description:
+             desc_clean, _ = fuse_text(description, None, slang_map, cfg.text_cap)
+             desc_tokens = TOKENIZER_RE.findall(desc_clean)
+             matched: set[str] = set()
+             for n in phrase_ngram_sizes:
+                 if n <= 1:
+                     continue
+                 for j in range(0, len(desc_tokens) - n + 1):
+                     ph = " ".join(desc_tokens[j : j + n])
+                     if ph in phrase_vocab:
+                         matched.add(ph)
+             if matched:
+                 tokens = tokens + list(matched)
+
+         texts.append(fused_text)
+         raw_scores = compute_raw_scores(tokens, lexicons, label_orders)
+         created_raw = first_present(row, "created_at", "create_time", "createTime", "collected_time", "stats_time")
+         created_date = None
+         if created_raw:
+             try:
+                 created_int = int(created_raw)
+                 created_date = pd.to_datetime(created_int, unit="s", utc=True).date().isoformat()
+             except Exception:
+                 try:
+                     created_date = str(pd.to_datetime(created_raw)).split(" ")[0]
+                 except Exception:
+                     created_date = None
+         video_id = coerce_id(first_present(row, "video_id", "aweme_id", "id"))
+         author_id = coerce_id(first_present(row, "author_id", "user_id"))
+         record: Dict[str, object] = {
+             "video_id": video_id,
+             "author_id": author_id,
+             "created_at": str(created_raw) if created_raw is not None else None,
+             "created_date": created_date,
+             "language": lang,
+             "description": description,
+             "hashtags": hashtag_tags,
+             "clean_text": fused_text,
+             "short_text": len(fused_text) < max(cfg.min_chars, 20),
+             "tribe_scores_raw": raw_scores.get("tribe", []),
+             "vibe_scores_raw": raw_scores.get("vibe", []),
+             "commercial_scores_raw": raw_scores.get("commercial", []),
+             "role_scores_raw": raw_scores.get("role", []),
+             "format_scores_raw": raw_scores.get("format", []),
+             "time_scores_raw": raw_scores.get("time", []),
+         }
+         if cfg.emit_flat_scores:
+             for dim, labels in label_orders.items():
+                 scores = raw_scores.get(dim, []) or []
+                 dim_max = float(max(scores)) if scores else 0.0
+                 if dim == "time":
+                     record["time_overall"] = dim_max
+                 else:
+                     record[f"{dim}_max"] = dim_max
+                 for label, val in zip(labels, scores):
+                     record[f"{dim}_{label}"] = float(val)
+         records.append(record)
+
+     if not records:
+         return pa.Table.from_pylist([]), filtered_lang, filtered_short
+
+     embeddings = embed_texts(embedder, texts, cfg.embedding_batch_size)
+     for rec, emb in zip(records, embeddings):
+         rec["embed_vec"] = emb
+
+     table = pa.Table.from_pylist(records)
+     return table, filtered_lang, filtered_short
+
+
+ def write_metadata(cfg: MinerConfig, label_orders: Dict[str, List[str]], diagnostics: Optional[Dict[str, int]] = None) -> None:
+     if not cfg.metadata_path:
+         return
+     meta = {
+         "config": asdict(cfg),
+         "label_orders": label_orders,
+         "diagnostics": diagnostics or {},
+     }
+     with open(cfg.metadata_path, "w", encoding="utf-8") as f:
+         json.dump(meta, f, ensure_ascii=False, indent=2)
+     logger.info("Wrote metadata to %s", cfg.metadata_path)
+
+
+ def run(cfg: MinerConfig) -> None:
+     logger.info("Starting sys7_miner with batch_size=%s", cfg.batch_size)
+     # Prefer System 7 lexicons if configured; fall back to the System 6 bundle if needed.
+     lexicon_path = cfg.lexicons
+     if not Path(lexicon_path).exists() and Path("system6_lexicons.json").exists():
+         lexicon_path = "system6_lexicons.json"
+         logger.info("Lexicon bundle %s not found; falling back to %s", cfg.lexicons, lexicon_path)
+     else:
+         logger.info("Using lexicon bundle %s", lexicon_path)
+     raw_lexicons = load_json(str(lexicon_path))
+     slang_map = prepare_slang_map(load_json(cfg.slang_lexicon))
+     label_orders = load_json(cfg.label_orders)
+     lexicons = orient_lexicons(raw_lexicons)
+
+     phrase_vocab: Optional[set[str]] = None
+     phrase_ngram_sizes: List[int] = []
+     # Merge mined phrase lexicons into scoring (multi-word tokens).
+     if cfg.phrase_lexicon and Path(cfg.phrase_lexicon).exists():
+         try:
+             phrase_bundle = load_json(cfg.phrase_lexicon)
+             by_label = phrase_bundle.get("phrases_by_label")
+             if not isinstance(by_label, dict):
+                 logger.warning(
+                     "Phrase lexicon %s has no 'phrases_by_label'; it will not affect scoring. Re-run phraseminer.py to generate phrases_by_label.",
+                     cfg.phrase_lexicon,
+                 )
+             else:
+                 phrase_vocab = set()
+                 sizes = set()
+                 merged = 0
+                 for dim, label_map in by_label.items():
+                     if dim not in label_orders:
+                         continue
+                     if not isinstance(label_map, dict):
+                         continue
+                     for label, phrases in label_map.items():
+                         if label not in label_orders.get(dim, []):
+                             continue
+                         if not isinstance(phrases, dict):
+                             continue
+                         for phrase, w in phrases.items():
+                             p = str(phrase).lower().strip()
+                             if not p:
+                                 continue
+                             try:
+                                 w_float = float(w) * float(cfg.phrase_weight_scale)
+                             except Exception:
+                                 continue
+                             lexicons.setdefault(dim, {}).setdefault(p, {})
+                             lexicons[dim][p][label] = lexicons[dim][p].get(label, 0.0) + w_float
+                             phrase_vocab.add(p)
+                             sizes.add(len(p.split()))
+                             merged += 1
+                 phrase_ngram_sizes = sorted(n for n in sizes if n >= 2)
+                 logger.info(
+                     "Merged phrase lexicon into scoring: %s phrases (%s label assignments), ngram_sizes=%s",
+                     len(phrase_vocab),
+                     merged,
+                     phrase_ngram_sizes,
+                 )
+         except Exception as exc:  # pragma: no cover - defensive
+             logger.warning("Failed to merge phrase lexicon into scoring: %s", exc)
+     embedder = load_embedder(cfg.embedding_model, cfg.device)
+
+     writer: Optional[pq.ParquetWriter] = None
+     total_rows = 0
+     kept_rows = 0
+     filtered_lang = 0
+     filtered_short = 0
+
+     path_obj = Path(cfg.input_path)
+     is_parquet = path_obj.is_dir() or path_obj.suffix.lower() == ".parquet"
+
+     def write_table(table: pa.Table):
+         nonlocal writer, kept_rows
+         if table.num_rows == 0:
+             return
+         if writer is None:
+             writer = pq.ParquetWriter(cfg.output_parquet, table.schema, compression="snappy")
+         writer.write_table(table)
+         kept_rows += table.num_rows
+
+     if is_parquet:
+         dataset = ds.dataset(cfg.input_path, format="parquet")
+         available = list(dataset.schema.names)
+         wanted = []
+         for c in (
+             "video_id",
+             "aweme_id",
+             "id",
+             "author_id",
+             "user_id",
+             "created_at",
+             "create_time",
+             "createTime",
+             "collected_time",
+             "stats_time",
+             "description",
+             "desc",
+             "hashtags",
+             "challenges",
+             "language",
+             "lang",
+         ):
+             if c in available and c not in wanted:
+                 wanted.append(c)
+         scanner = dataset.scanner(batch_size=cfg.batch_size, columns=wanted or None)
+         for idx, batch in enumerate(scanner.to_batches()):
+             df = batch.to_pandas()
+             total_rows += len(df)
+             table, fl, fs = process_chunk(df, lexicons, phrase_vocab, phrase_ngram_sizes, slang_map, label_orders, embedder, cfg)
+             filtered_lang += fl
+             filtered_short += fs
+             write_table(table)
+             logger.info("Processed parquet batch %s (input rows %s, kept total %s)", idx, len(df), kept_rows)
+     else:
+         header = pd.read_csv(cfg.input_path, nrows=0).columns.tolist()
+         wanted = []
+         for c in (
+             "video_id",
+             "aweme_id",
+             "id",
+             "author_id",
+             "user_id",
+             "created_at",
+             "create_time",
+             "createTime",
+             "collected_time",
+             "stats_time",
+             "description",
+             "desc",
+             "hashtags",
+             "challenges",
+             "language",
+             "lang",
+         ):
+             if c in header and c not in wanted:
+                 wanted.append(c)
+         with pd.read_csv(cfg.input_path, chunksize=cfg.batch_size, usecols=wanted or None, dtype=str, keep_default_na=False) as reader:
+             for idx, chunk in enumerate(reader):
+                 total_rows += len(chunk)
+                 table, fl, fs = process_chunk(chunk, lexicons, phrase_vocab, phrase_ngram_sizes, slang_map, label_orders, embedder, cfg)
+                 filtered_lang += fl
+                 filtered_short += fs
+                 write_table(table)
+                 logger.info("Processed CSV chunk %s (input rows %s, kept total %s)", idx, len(chunk), kept_rows)
+
+     if writer:
+         writer.close()
+     diagnostics = {
+         "input_rows": total_rows,
+         "kept_rows": kept_rows,
+         "filtered_language": filtered_lang,
+         "filtered_short": filtered_short,
+     }
+     write_metadata(cfg, label_orders, diagnostics)
+     if cfg.diagnostics_path:
+         Path(cfg.diagnostics_path).write_text(json.dumps(diagnostics, indent=2), encoding="utf-8")
+     logger.info(
+         "Completed. Input rows: %s, kept: %s, filtered_language: %s, filtered_short: %s",
+         total_rows,
+         kept_rows,
+         filtered_lang,
+         filtered_short,
+     )
+
+
+ def parse_args() -> MinerConfig:
+     parser = argparse.ArgumentParser(description="System 7 Miner: clean + embed + score raw TikTok text.")
+     parser.add_argument("--input-path", "--input", dest="input_path", required=True, help="Path to CSV or Parquet directory (e.g., Data/hf10m/Small)")
+     parser.add_argument("--output-parquet", default="tiktok10m_sys7_features.parquet", help="Output parquet path")
+     parser.add_argument(
+         "--lexicons",
+         default=None,
+         help="Preferred lexicon bundle (default: system7_lexicons.json if present, else system6_lexicons.json).",
+     )
+     parser.add_argument(
+         "--phrase-lexicon",
+         default=None,
+         help="Optional phrase lexicon JSON from phraseminer.py (uses phrases_by_label to score multi-word phrases).",
+     )
+     parser.add_argument(
+         "--phrase-weight-scale",
+         type=float,
+         default=1.0,
+         help="Scale factor applied to phrase lexicon weights when merged into scoring.",
+     )
+     parser.add_argument(
+         "--system6-lexicons",
+         default="system6_lexicons.json",
+         help="Fallback System 6 lexicon bundle (used if system7_lexicons.json is missing).",
+     )
+     parser.add_argument("--slang-lexicon", default="slang_lexicon.json", help="Path to slang_lexicon.json")
+     parser.add_argument("--label-orders", default="label_orders.json", help="Path to label_orders.json")
+     parser.add_argument("--batch-size", type=int, default=50_000)
+     parser.add_argument("--text-cap", type=int, default=1024)
+     parser.add_argument("--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2")
+     parser.add_argument("--embedding-batch-size", type=int, default=128)
+     parser.add_argument("--device", default="cpu", help="Embedding device: cpu or cuda")
+     parser.add_argument(
+         "--workers",
+         type=int,
+         default=2,
+         help="Number of worker processes for CPU-bound language detection when --auto-language-detect is set (default: 2).",
+     )
+     parser.add_argument("--metadata-path", default="sys7_miner_metadata.json")
+     parser.add_argument("--diagnostics-path", default="sys7_miner_diagnostics.json")
+     parser.add_argument("--min-chars", type=int, default=10, help="Minimum clean_text length to keep")
+     parser.add_argument("--min-tokens", type=int, default=2, help="Minimum token count to keep")
+     parser.add_argument(
+         "--emit-flat-scores",
+         action="store_true",
+         help="If set, emit *_max and per-label scalar columns (e.g., tribe_MomTok) in addition to list score vectors.",
+     )
+     parser.add_argument(
+         "--language-allowlist",
+         default=None,
+         help="Comma-separated list of language codes to keep (e.g., en,fr). "
+         "Combined with optional --auto-language-detect to filter down to core languages.",
+     )
+     parser.add_argument(
+         "--auto-language-detect",
+         action="store_true",
+         help="If set, auto-detect language for rows with missing/unknown language using langdetect, when available.",
+     )
+     args = parser.parse_args()
+     lexicon_path = args.lexicons
+     if not lexicon_path:
+         # Default preference: system7_lexicons.json if present, else the System 6 bundle.
+         default_sys7 = Path("system7_lexicons.json")
+         lexicon_path = str(default_sys7) if default_sys7.exists() else args.system6_lexicons
+     return MinerConfig(
+         input_path=args.input_path,
+         output_parquet=args.output_parquet,
+         lexicons=lexicon_path,
+         phrase_lexicon=args.phrase_lexicon,
+         phrase_weight_scale=float(args.phrase_weight_scale),
+         slang_lexicon=args.slang_lexicon,
+         label_orders=args.label_orders,
+         batch_size=args.batch_size,
+         text_cap=args.text_cap,
+         embedding_model=args.embedding_model,
+         embedding_batch_size=args.embedding_batch_size,
+         device=args.device,
+         workers=args.workers,
+         metadata_path=args.metadata_path,
+         diagnostics_path=args.diagnostics_path,
+         min_chars=args.min_chars,
+         min_tokens=args.min_tokens,
+         language_allowlist=[s.strip().lower() for s in args.language_allowlist.split(",")] if args.language_allowlist else None,
+         auto_language_detect=bool(args.auto_language_detect),
+         emit_flat_scores=bool(args.emit_flat_scores),
+     )
+
+
+ def main() -> None:
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s %(levelname)s %(name)s - %(message)s",
+     )
+     cfg = parse_args()
+     run(cfg)
+
+
+ if __name__ == "__main__":
+     main()
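
The output parquet carries one row per kept video, including the raw score vectors and a 384-dim embed_vec (all-MiniLM-L6-v2). A minimal downstream sketch for turning a raw vector column into per-row argmax labels, with column and file names as emitted above:

import json
import numpy as np
import pandas as pd

df = pd.read_parquet("tiktok10m_sys7_features.parquet")
with open("label_orders.json", encoding="utf-8") as f:
    label_orders = json.load(f)

tribe_labels = np.array(label_orders["tribe"])
scores = np.vstack(df["tribe_scores_raw"].to_numpy())  # shape: (rows, len(tribe_labels))

# Only assign a label where at least one lexicon token actually fired.
has_signal = scores.max(axis=1) > 0
df["tribe_argmax"] = np.where(has_signal, tribe_labels[scores.argmax(axis=1)], None)
print(df["tribe_argmax"].value_counts(dropna=False).head())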
sys7_phrase_lexicons_desc_only.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a4384c606dfab526df5b30198fd14453022114852a022ea60027bfc6c5f9327
+ size 78998019
system7_lexicons.json ADDED
The diff for this file is too large to render. See raw diff