Spaces:
Running
Running
| import os | |
| import time | |
| from huggingface_hub import HfApi, snapshot_download | |
| from datasets import load_dataset | |
# --- Configuration ---------------------------------------------------------
# Dataset repository the app saves its data to.
DATASET_ID = "toecm/IEDID"

# Model repository that is meant to be trained.
MODEL_ID = "toecm/PureVersation_Model"

# How many verified clips must be present before training is triggered.
THRESHOLD = 1000

# Write-access token, read from the environment (None when the variable is unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Hub client shared by the functions below, authenticated with HF_TOKEN.
api = HfApi(token=HF_TOKEN)
def check_and_trigger_training():
    """Count verified clips in the dataset and start training at the threshold.

    Streams the ``train`` split of ``DATASET_ID``, counts the rows whose
    ``status`` field equals ``"verified"`` and, when that count is at least
    ``THRESHOLD``, calls ``trigger_autotrain`` with the count.

    Failures are printed instead of raised, so a caller that polls this
    function repeatedly keeps running after a bad check.

    NOTE: every verified row in the split is counted, not only the rows added
    since the previous training run. Once the threshold has been crossed,
    each later call therefore triggers another training job.
    """
    print("🔍 Checking Pure Chain status...")

    # Phase 1: dataset access only. Keeping this ``try`` narrow means a failure
    # while launching training is not misreported as a dataset problem.
    try:
        # Streaming mode iterates the rows without loading the full split first.
        ds = load_dataset(DATASET_ID, split="train", streaming=True, token=HF_TOKEN)
        # ``row.get`` tolerates rows that have no 'status' field at all.
        count = sum(1 for row in ds if row.get("status") == "verified")
    except Exception as e:
        print(f"❌ Error checking dataset: {e}")
        return

    print(f"✅ Found {count} verified clips.")

    if count < THRESHOLD:
        print(f"💤 Not enough data yet ({count}/{THRESHOLD}). Sleeping.")
        return

    # Phase 2: launch training. Errors (including a missing AutoTrain
    # dependency) are still swallowed, but reported for what they are.
    print("🚀 Threshold reached! Triggering AutoTrain...")
    try:
        trigger_autotrain(count)
    except Exception as e:
        print(f"❌ Error triggering training: {e}")
def trigger_autotrain(data_count):
    """Create a new, uniquely named AutoTrain project for a fine-tuning run.

    Args:
        data_count: Number of verified clips; only used in the log message.

    NOTE(review): this function only creates the project. No data is attached
    and no job is explicitly started here, even though the log message says
    the training job "started".
    NOTE(review): ``autotrain.api.AutoTrainClient`` and ``create_project`` are
    used exactly as in the original; confirm they exist in the installed
    AutoTrain version.
    """
    # Imported lazily, so the dependency is only required once training fires.
    from autotrain.api import AutoTrainClient

    autotrain_client = AutoTrainClient(hf_token=HF_TOKEN)

    # A Unix-timestamp suffix gives every run its own project name.
    stamp = int(time.time())
    job_name = f"pure-versation-finetune-{stamp}"
    autotrain_client.create_project(job_name, task="speech-recognition")

    # Adding data and starting the job is not implemented at this point.
    print(f"🔥 Training job '{job_name}' started with {data_count} clips.")

    # TODO: Reset the 'verified' counter or tag the dataset rows as 'trained'
    # so we don't train on them again next time.
if __name__ == "__main__":
    # Poll forever: run one check, then wait a full day before the next one.
    seconds_per_day = 24 * 60 * 60
    while True:
        check_and_trigger_training()
        time.sleep(seconds_per_day)