Imaginethat committed on
Commit
90447dd
·
verified ·
1 Parent(s): 8f6b051

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sys7_phrase_lexicons_desc_only.json filter=lfs diff=lfs merge=lfs -text
Dockerfile.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the same PyTorch base image that worked for your clustering job
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime

# Install system basics
RUN apt-get update && \
    apt-get install -y wget ca-certificates git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
# NOTE(review): if requirements.txt pins a different torch build (e.g. cu124),
# this step replaces the torch that ships with this cu121 base image — confirm
# that is intended.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Create non-root user and setup the Persistent Storage paths
# This ensures /data is writable, which is where your Parquet inputs/outputs will live
RUN useradd -m -u 1000 appuser && \
    mkdir -p /data/.cache && \
    mkdir -p /data/out && \
    mkdir -p /data/input && \
    chown -R appuser:appuser /data

# Set environment variables to force all Hugging Face caches to the persistent volume
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers releases
# (HF_HOME already covers it) — harmless to keep, but may print a deprecation warning.
ENV HF_HOME=/data/.cache \
    HF_HUB_CACHE=/data/.cache/hub \
    TRANSFORMERS_CACHE=/data/.cache/transformers \
    HF_DATASETS_CACHE=/data/.cache/datasets \
    SENTENCE_TRANSFORMERS_HOME=/data/.cache/sentence_transformers \
    TOKENIZERS_PARALLELISM=false \
    OMP_NUM_THREADS=1

# Copy your Miner script and the required Config JSONs
# Make sure you upload these JSONs to the Space Files along with this Dockerfile!
COPY sys7_miner.py .
COPY system7_lexicons.json .
COPY label_orders.json .
COPY slang_lexicon.json .
# COPY sys7_phrase_lexicons_desc_only.json . <-- Uncomment if you use this

# Copy the runner script
COPY start.sh .
RUN chmod +x start.sh && chown -R appuser:appuser /app

USER appuser

CMD ["./start.sh"]
Requirements.txt.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Runtime dependencies for sys7_miner.py / run_job.py.
pandas
pyarrow
ftfy
langdetect
sentence-transformers
huggingface_hub
datasets
# NOTE(review): the Dockerfile base image already ships torch 2.3.0+cu121;
# the cu124 pins below replace it at build time — confirm the cu124 wheels
# are intended for the cudnn8/cu121 base image.
--extra-index-url https://download.pytorch.org/whl/cu124
torch==2.5.1
torchvision==0.20.1
torchaudio==2.5.1
label_orders.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tribe": [
3
+ "BlueCollar",
4
+ "Faith",
5
+ "Gaming",
6
+ "MomTok",
7
+ "Pets",
8
+ "Storytime",
9
+ "SportsCombat",
10
+ "MusicPopFandom",
11
+ "TVStreaming",
12
+ "Roleplay"
13
+ ],
14
+ "commercial": [
15
+ "Product",
16
+ "MediaShow",
17
+ "EventService",
18
+ "InfluencerBrand"
19
+ ],
20
+ "vibe": [
21
+ "chaotic_funny",
22
+ "confessional_personal",
23
+ "educational_calm",
24
+ "motivational_inspirational",
25
+ "dramatic_romantic"
26
+ ],
27
+ "time": [
28
+ "seasonal",
29
+ "viral"
30
+ ],
31
+ "role": [
32
+ "parent",
33
+ "professional",
34
+ "single",
35
+ "student",
36
+ "athlete",
37
+ "creator"
38
+ ],
39
+ "format": [
40
+ "storytime",
41
+ "template_edit",
42
+ "vlog"
43
+ ]
44
+ }
run_job.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import threading
4
+ import subprocess
5
+ import time
6
+ import shutil
7
+ from huggingface_hub import HfApi, hf_hub_download, create_repo
8
+
9
# --- CONFIG ---
INPUT_DATASET = "The-data-company/TikTok-10M"  # source dataset of parquet shards
# CHANGE THIS TO YOUR USERNAME!
OUTPUT_REPO = "Imaginethat/tiktok-sys7-mined-results"  # destination dataset repo
HF_TOKEN = os.environ.get("HF_TOKEN")  # Space secret; may be None if not configured

# Global variables for the UI
log_buffer = "Initializing Environment...\n"  # appended to by log(); polled by the UI timer
dataset_link_md = ""                          # set to a markdown link when all shards finish
mining_active = False                         # guards against launching a second mining thread
19
+
20
def log(msg):
    """Append a timestamped message to the shared UI log buffer."""
    global log_buffer
    stamped = f"[{time.strftime('%H:%M:%S')}] {msg}"
    # Echo to the container console too, as a backup of the UI logs.
    print(stamped)
    log_buffer = log_buffer + stamped + "\n"
27
+
28
def run_mining_logic():
    """Mine every parquet shard of INPUT_DATASET and upload results to OUTPUT_REPO.

    Intended to run in a background thread (see start_thread). Progress is
    reported only through module-level UI state: log() for text output,
    dataset_link_md for the final link, and mining_active as the run flag.
    Per-shard failures (download, miner, upload) are logged and skipped so one
    bad shard does not abort the whole run.
    """
    global dataset_link_md, mining_active

    try:
        if not HF_TOKEN:
            log("ERROR: HF_TOKEN secret is missing in Settings!")
            return

        api = HfApi(token=HF_TOKEN)

        # 1. Setup Repo
        log(f"Setting up Output Repo: {OUTPUT_REPO}")
        try:
            # Token passed explicitly for consistency with api / hf_hub_download.
            create_repo(repo_id=OUTPUT_REPO, repo_type="dataset", exist_ok=True, token=HF_TOKEN)
        except Exception as e:
            log(f"Repo setup note: {e}")

        # 2. List Files
        log(f"Scanning {INPUT_DATASET} for files...")
        try:
            all_files = api.list_repo_files(repo_id=INPUT_DATASET, repo_type="dataset")
            parquet_files = [f for f in all_files if f.endswith(".parquet") and "train" in f]
            log(f"Found {len(parquet_files)} shards to process.")
        except Exception as e:
            log(f"Failed to list files: {e}")
            return

        # 3. Process Shards one at a time so local disk usage stays bounded.
        input_dir = "./data_input"
        output_dir = "./data_output"

        for i, file_path in enumerate(parquet_files):
            log(f"--- Processing Shard {i+1}/{len(parquet_files)}: {file_path} ---")

            # Cleanup: start each shard from empty directories.
            if os.path.exists(input_dir): shutil.rmtree(input_dir)
            if os.path.exists(output_dir): shutil.rmtree(output_dir)
            os.makedirs(input_dir, exist_ok=True)
            os.makedirs(output_dir, exist_ok=True)

            # Download this shard; skip it (not abort the run) on failure.
            try:
                hf_hub_download(
                    repo_id=INPUT_DATASET,
                    filename=file_path,
                    repo_type="dataset",
                    local_dir=input_dir,
                    token=HF_TOKEN
                )
            except Exception as e:
                log(f"Download failed: {e}")
                continue

            # Run Miner (streaming its output to the UI log)
            shard_output_name = f"sys7_features_part_{i:04d}.parquet"
            shard_output_path = os.path.join(output_dir, shard_output_name)

            cmd = [
                "python", "-u", "sys7_miner.py",  # -u forces unbuffered output
                "--input-path", input_dir,
                "--output-parquet", shard_output_path,
                "--lexicons", "system7_lexicons.json",
                "--label-orders", "label_orders.json",
                "--slang-lexicon", "slang_lexicon.json",
                "--device", "cuda",
                "--batch-size", "50000",
                "--embedding-batch-size", "512",
                "--workers", "4"
            ]

            # Use Popen to capture logs in real-time
            try:
                process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,  # merge stderr so errors reach the UI
                    text=True,
                    bufsize=1  # line-buffered
                )

                # Forward only the interesting miner lines to keep the UI clean.
                for line in process.stdout:
                    if any(key in line for key in ("Processed", "Error", "Loading")):
                        log("MINER: " + line.strip())

                process.wait()

                if process.returncode != 0:
                    log(f"Miner failed with code {process.returncode}")
                    continue

            except Exception as e:
                log(f"Subprocess error: {e}")
                continue

            # Upload the shard's results; a failed upload does not stop the run.
            try:
                log(f"Uploading {shard_output_name}...")
                api.upload_file(
                    path_or_fileobj=shard_output_path,
                    path_in_repo=f"data/{shard_output_name}",
                    repo_id=OUTPUT_REPO,
                    repo_type="dataset"
                )
                log("Upload Successful!")
            except Exception as e:
                log(f"Upload failed: {e}")

        # 4. Finish
        final_url = f"https://huggingface.co/datasets/{OUTPUT_REPO}"
        log("ALL JOBS COMPLETE.")
        dataset_link_md = f"## 🎉 DONE! Data is ready: [Click Here to View Dataset]({final_url})"
    finally:
        # BUG FIX: the early error returns above previously left mining_active
        # stuck at True, permanently disabling the Start button in the UI.
        mining_active = False
142
+
143
def start_thread():
    """Launch the mining job in a background thread (at most one at a time).

    Returns:
        A short status string shown in the UI "Status" textbox.
    """
    global mining_active
    if mining_active:
        return "Already Running..."
    mining_active = True
    # daemon=True so a hung or long-running worker cannot block interpreter
    # shutdown when the Space is stopped or restarted.
    t = threading.Thread(target=run_mining_logic, daemon=True)
    t.start()
    return "Mining Started! Watch logs below."
151
+
152
# --- GRADIO UI ---
# Single-page control panel: a start button, a status box, a polled log window,
# and a markdown slot that receives the dataset link when mining finishes.
with gr.Blocks(title="TikTok Miner") as demo:
    gr.Markdown("# ⛏️ System 7 Miner Control")

    with gr.Row():
        start_btn = gr.Button("🚀 Start Mining Job", variant="primary")
        status_txt = gr.Textbox(label="Status", value="Ready to start.", interactive=False)

    # The Log Window (read-only; refreshed by the timer below)
    logs_out = gr.TextArea(
        label="Live Process Logs",
        placeholder="Logs will stream here...",
        lines=20,
        max_lines=25,
        autoscroll=True
    )

    # The Success Link (empty markdown until the job completes)
    link_display = gr.Markdown("")

    # Button Action: start_thread returns the status string for status_txt.
    start_btn.click(fn=start_thread, inputs=None, outputs=status_txt)

    # Auto-refresh the logs every 1 second
    def update_ui():
        # Re-read the module-level state mutated by the worker thread.
        return log_buffer, dataset_link_md

    timer = gr.Timer(1)
    timer.tick(fn=update_ui, outputs=[logs_out, link_display])

# Launch on the port HF expects
demo.launch(server_name="0.0.0.0", server_port=7860)
slang_lexicon.json ADDED
The diff for this file is too large to render. See raw diff
 
start.sh.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Container entrypoint: prepare caches, start a file server for /data/out,
# run the System 7 miner, then keep the container alive for downloads.
set -euo pipefail

# 1. Ensure Persistent Cache is Writable
export HF_HOME="${HF_HOME:-/data/.cache}"
# Create subdirectories to be safe
for dir in "$HF_HOME/hub" "$HF_HOME/transformers" "$HF_HOME/datasets" "$HF_HOME/sentence_transformers" "/data/out"; do
  mkdir -p "$dir"
done

# 2. Cleanup Threading Issues
unset OMP_NUM_THREADS
export TOKENIZERS_PARALLELISM=false

# 3. Quick GPU Check
echo '=== Torch / CUDA check ==='
python - <<'PY'
import torch
print(f'torch version: {torch.__version__}')
print(f'torch.cuda.is_available(): {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'Device: {torch.cuda.get_device_name(0)}')
PY

# 4. Start Lightweight File Server (Port 7860)
# This keeps the Space "Healthy" and lets you browse /data/out to see if the parquet file exists.
PORT="${PORT:-7860}"
echo "Starting file server on port $PORT serving /data/out..."
# The `cd` runs inside the backgrounded subshell, so the main shell stays in /app.
cd /data/out && python3 -m http.server "$PORT" &
SERVER_PID=$!
trap 'kill $SERVER_PID' EXIT

# Return to app dir to run the miner
cd /app

# 5. Run System 7 Miner
# We assume input data is in /data/input. If not, you might need to download it here.
# BUG FIX: with `set -e`, a non-zero miner exit previously killed the script
# before RESULT=$? could run, so the failure branch (and the keep-alive `wait`)
# was unreachable. An if/else keeps `set -e` from aborting on miner failure.
echo '=== Running System 7 Miner ==='
if python sys7_miner.py \
  --input-path "${INPUT_PATH:-/data/input}" \
  --output-parquet "${OUTPUT_PATH:-/data/out/tiktok10m_sys7_features.parquet}" \
  --lexicons system7_lexicons.json \
  --label-orders label_orders.json \
  --slang-lexicon slang_lexicon.json \
  --device cuda \
  --batch-size "${BATCH_SIZE:-50000}" \
  --embedding-batch-size "${EMBED_BATCH:-512}" \
  --metadata-path "/data/out/sys7_miner_metadata.json" \
  --diagnostics-path "/data/out/sys7_miner_diagnostics.json" \
  --workers 4; then
  echo "Miner completed successfully."
else
  echo "Miner failed. Check logs."
fi

# Keep container alive so you can download files via the HTTP server
wait $SERVER_PID
sys7_phrase_lexicons_desc_only.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a4384c606dfab526df5b30198fd14453022114852a022ea60027bfc6c5f9327
3
+ size 78998019
system7_lexicons.json ADDED
The diff for this file is too large to render. See raw diff