File size: 2,412 Bytes
a6cbd2a
 
 
 
 
 
ef938eb
76c71e9
a6cbd2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76c71e9
a6cbd2a
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import time
from huggingface_hub import HfApi, snapshot_download
from datasets import load_dataset

# CONFIGURATION
# Dataset repo the app saves its data to.
DATASET_ID = "toecm/IEDID"
# Model that is to be trained.
MODEL_ID = "toecm/PureVersation_Model"
# Number of verified clips required before a training run is triggered.
THRESHOLD = 1000
# Write-access token, read from the environment (None when HF_TOKEN is unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Shared Hub client, authenticated with the token above.
api = HfApi(token=HF_TOKEN)

def check_and_trigger_training():
    """Count verified clips in the dataset and trigger training at THRESHOLD.

    Streams the "train" split of DATASET_ID, counts the rows whose
    ``status`` field equals ``"verified"``, and calls ``trigger_autotrain``
    with that count once it is at least THRESHOLD.

    Failures are printed instead of raised, so one failed check does not
    stop the caller. Dataset errors and training-launch errors are reported
    with separate messages.

    NOTE: the count covers every verified row in the split; nothing here
    excludes clips that were already used in an earlier training run.
    """
    print("🔍 Checking Pure Chain status...")

    try:
        # streaming=True iterates the rows without first materializing the
        # whole split locally, which is all that counting needs.
        ds = load_dataset(DATASET_ID, split="train", streaming=True, token=HF_TOKEN)
        count = sum(1 for row in ds if row.get('status') == 'verified')
    except Exception as e:
        # Best-effort boundary: report and give up on this check only.
        print(f"❌ Error checking dataset: {e}")
        return

    print(f"✅ Found {count} verified clips.")

    if count < THRESHOLD:
        print(f"💤 Not enough data yet ({count}/{THRESHOLD}). Sleeping.")
        return

    print("🚀 Threshold reached! Triggering AutoTrain...")
    try:
        trigger_autotrain(count)
    except Exception as e:
        # Kept separate from the dataset handler so the message names the
        # step that actually failed.
        print(f"❌ Error triggering training: {e}")

def trigger_autotrain(data_count):
    """Create a fresh AutoTrain project and announce the training job.

    Args:
        data_count: Number of clips; only used in the printed message.
    """
    # Imported locally, so autotrain is needed only when this function runs.
    from autotrain.api import AutoTrainClient

    autotrain = AutoTrainClient(hf_token=HF_TOKEN)

    # The current Unix time (whole seconds) is appended to the project name.
    started_at = int(time.time())
    job_name = f"pure-versation-finetune-{started_at}"
    autotrain.create_project(job_name, task="speech-recognition")

    # NOTE(review): only the project is created above; no data upload or
    # explicit start call is visible here — confirm the job really starts.
    print(f"🔥 Training job '{job_name}' started with {data_count} clips.")

    # TODO: Reset the 'verified' counter or tag the dataset rows as 'trained'
    # so we don't train on them again next time.

if __name__ == "__main__":
    # Poll forever: run one check, then wait 24 hours before the next one.
    while True:
        check_and_trigger_training()
        time.sleep(24 * 60 * 60)