import os
import time

from huggingface_hub import HfApi, snapshot_download
from datasets import load_dataset

# CONFIGURATION
DATASET_ID = "toecm/IEDID"  # Where the app saves data
MODEL_ID = "toecm/PureVersation_Model"  # The model to train
THRESHOLD = 1000  # How many verified clips before training?
HF_TOKEN = os.environ.get("HF_TOKEN")  # Your Write Access Token

api = HfApi(token=HF_TOKEN)


def _count_verified_clips(rows) -> int:
    """Return how many rows in *rows* have ``status == 'verified'``."""
    return sum(1 for row in rows if row.get("status") == "verified")


def check_and_trigger_training() -> None:
    """Count verified clips in ``DATASET_ID`` and trigger training at ``THRESHOLD``.

    The "train" split is streamed and every row whose ``status`` field equals
    ``'verified'`` is counted. When the count reaches ``THRESHOLD``,
    ``trigger_autotrain`` is called with that count.

    Errors are reported with ``print`` and never propagated, so a caller that
    polls this function in a loop keeps running after a failed check.

    Note: the count covers every verified row in the split, not only rows
    added since the previous training run (see the TODO in
    ``trigger_autotrain``).
    """
    print("🔍 Checking Pure Chain status...")

    # 1. Load the dataset (streaming mode is faster for just counting).
    #    Only the dataset access lives in this ``try`` so that the error
    #    message below is accurate.
    try:
        ds = load_dataset(DATASET_ID, split="train", streaming=True, token=HF_TOKEN)
        count = _count_verified_clips(ds)
    except Exception as e:
        print(f"❌ Error checking dataset: {e}")
        return

    print(f"✅ Found {count} verified clips.")

    if count < THRESHOLD:
        print(f"💤 Not enough data yet ({count}/{THRESHOLD}). Sleeping.")
        return

    print("🚀 Threshold reached! Triggering AutoTrain...")
    # 2. Launch training. Failures here are reported separately from dataset
    #    errors but are still swallowed so the polling loop survives.
    try:
        trigger_autotrain(count)
    except Exception as e:
        print(f"❌ Error triggering training: {e}")


def trigger_autotrain(data_count: int) -> None:
    """Create a new AutoTrain project for speech recognition.

    Args:
        data_count: Number of verified clips found; used only in the status
            message printed after the project is created.

    Raises:
        Whatever the ``autotrain`` import or client calls raise; nothing is
        caught here.
    """
    # This uses the AutoTrain Advanced API (or simply launches a Space).
    # Imported lazily so the module can be loaded without ``autotrain``.
    # NOTE(review): assumes ``autotrain.api.AutoTrainClient`` exists with this
    # constructor and ``create_project`` signature — confirm against the
    # installed autotrain version.
    from autotrain.api import AutoTrainClient

    client = AutoTrainClient(hf_token=HF_TOKEN)

    # Create project; the Unix timestamp keeps each project name unique.
    project_name = f"pure-versation-finetune-{int(time.time())}"
    client.create_project(project_name, task="speech-recognition")

    # Add data & Start
    # NOTE(review): only project creation is performed here; no data upload or
    # explicit start call is made before the message below is printed.
    print(f"🔥 Training job '{project_name}' started with {data_count} clips.")

    # TODO: Reset the 'verified' counter or tag the dataset rows as 'trained'
    # so we don't train on them again next time. Until then, every check made
    # after the threshold is first reached will trigger another project.
if __name__ == "__main__":
    # Run loop: check the dataset, then wait 24 hours before checking again.
    poll_interval_seconds = 24 * 60 * 60  # 86400 s = 24 hours

    while True:
        check_and_trigger_training()
        time.sleep(poll_interval_seconds)