Spaces:

toecm
/

PureVersation

Running

App Files Files Community

PureVersation / auto_train_watcher.py

toecm

Update auto_train_watcher.py

76c71e9 verified 3 months ago

raw

history blame contribute delete

2.41 kB

	import os
	import time
	from huggingface_hub import HfApi, snapshot_download
	from datasets import load_dataset

	# --- Settings ---------------------------------------------------------
	# Dataset repository the app saves its data to.
	DATASET_ID = "toecm/IEDID"
	# Model repository that is to be trained.
	MODEL_ID = "toecm/PureVersation_Model"
	# Clip count at which a training run is triggered.
	THRESHOLD = 1000
	# Write-access token taken from the environment (None when unset).
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Hub client shared by the rest of the module, created with that token.
	api = HfApi(token=HF_TOKEN)

	def check_and_trigger_training():
	    """Count verified clips in the dataset and trigger training at the threshold.

	    Streams the ``train`` split of ``DATASET_ID``, counts the rows whose
	    ``status`` field equals ``'verified'`` and, when that count is at least
	    ``THRESHOLD``, calls ``trigger_autotrain`` with it.

	    Returns:
	        None. Progress and errors are reported on stdout only.

	    Any exception raised while loading, counting or triggering is printed
	    and swallowed, so a periodic caller is never interrupted by one failed
	    check.
	    """
	    print("🔍 Checking Pure Chain status...")

	    try:
	        # Streaming mode: the rows are only iterated once to be counted.
	        ds = load_dataset(DATASET_ID, split="train", streaming=True, token=HF_TOKEN)

	        # Every row is visited, so this is the exact number of rows currently
	        # marked as verified -- it is NOT limited to rows added since the last
	        # training run (nothing here records when that was).
	        count = sum(1 for row in ds if row.get('status') == 'verified')

	        print(f"✅ Found {count} verified clips.")

	        if count >= THRESHOLD:
	            print("🚀 Threshold reached! Triggering AutoTrain...")
	            trigger_autotrain(count)
	        else:
	            print(f"💤 Not enough data yet ({count}/{THRESHOLD}). Sleeping.")

	    except Exception as e:
	        # Deliberate best-effort boundary: report the failure and return.
	        print(f"❌ Error checking dataset: {e}")

	def trigger_autotrain(data_count):
	    """Create a fresh AutoTrain project for a fine-tuning run.

	    Builds a project name from the current Unix time, asks an
	    ``AutoTrainClient`` to create a ``speech-recognition`` project under that
	    name, and prints a confirmation line.

	    Args:
	        data_count: Number of clips; used only in the printed message.

	    NOTE(review): only ``create_project`` is called here -- no data is
	    attached and no job is explicitly started by this function. The
	    ``autotrain.api.AutoTrainClient`` interface should be confirmed against
	    the installed autotrain package.
	    """
	    # Function-scope import: autotrain is only loaded when this function runs.
	    from autotrain.api import AutoTrainClient

	    autotrain_client = AutoTrainClient(hf_token=HF_TOKEN)

	    # The whole-second timestamp suffix distinguishes one project from the next.
	    project_name = f"pure-versation-finetune-{int(time.time())}"
	    autotrain_client.create_project(project_name, task="speech-recognition")

	    print(f"🔥 Training job '{project_name}' started with {data_count} clips.")

	    # TODO: Reset the 'verified' counter or tag the dataset rows as 'trained'
	    # so that the same rows are not used for training again next time.

	if __name__ == "__main__":
	    # Script entry point: poll forever, running one check per day.
	    while True:
	        check_and_trigger_training()
	        # 24 h * 60 min * 60 s between consecutive checks.
	        time.sleep(24 * 60 * 60)