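"""Concurrent synthetic-data generation driver.

Samples target categories from a weighted distribution, generates
conversations in parallel batches via SyntheticDataPipeline, cleans each
item, validates every batch, and writes the results as JSONL files.
"""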
import json
import random
import sys
import asyncio
import os
from typing import List, Dict, Any, Optional
from synthetic_data.pipeline import SyntheticDataPipeline
from synthetic_data.validate import validate_synthetic_data
from synthetic_data.clean_data import clean_datum
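# Relative sampling weights per target category (values sum to 1.0).
# "none" denotes conversations with no target category.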
CATEGORY_DISTRIBUTION = {
"none": 0.15,
"user.interaction_preferences": 0.12,
"user.session_history": 0.10,
"company.brand_core": 0.10,
"company.strategic_signatures": 0.07,
"company.knowledge_artifacts": 0.07,
"user.communication_style": 0.07,
"user.strategic_approach": 0.07,
"user.workflow_patterns": 0.07,
"company.tools_config": 0.05,
"company.performance_context": 0.05,
"company.business_priorities": 0.04,
"user.role_context": 0.04
}
async def generate_single_item(pipeline: SyntheticDataPipeline, category: str, item_num: int) -> Optional[Dict[str, Any]]:
    """Generate a single conversation item asynchronously; returns None on failure."""
print(f" Starting item {item_num} (Target: {category})...")
    # With 30% probability, pick a distractor category distinct from the
    # target (skipped when the target is "none")
categories = list(CATEGORY_DISTRIBUTION.keys())
distractor = None
if random.random() < 0.30 and category != "none":
possible_distractors = [c for c in categories if c != category and c != "none"]
if possible_distractors:
distractor = random.choice(possible_distractors)
persistence = _get_persistence_for_category(category)
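    # Conversation length is sampled uniformly between 4 and 10 turns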
turns = random.randint(4, 10)
    # Generate scenario (the pipeline call is synchronous, so run it in a thread executor)
    loop = asyncio.get_running_loop()
scenario = await loop.run_in_executor(
None,
pipeline.generate_scenario_spec,
category,
distractor,
persistence,
"neutral",
turns,
""
)
if not scenario:
print(f" Failed item {item_num}: scenario generation failed")
return None
# Generate conversation
conversation = await loop.run_in_executor(
None,
pipeline.generate_conversation,
scenario,
turns,
category
)
if conversation:
# Clean the item immediately
cleaned_conversation = clean_datum(conversation)
print(f" Completed item {item_num}: {cleaned_conversation.get('scenario_id', 'Unknown')}")
return cleaned_conversation
else:
print(f" Failed item {item_num}: conversation generation failed")
return None
async def generate_batch_concurrent(pipeline: SyntheticDataPipeline, batch_size: int, batch_num: int) -> List[Dict[str, Any]]:
"""Generate a full batch of items concurrently, retrying until batch is full."""
print(f"\n=== Processing Batch {batch_num} (Concurrent) ===")
categories = list(CATEGORY_DISTRIBUTION.keys())
weights = list(CATEGORY_DISTRIBUTION.values())
batch_data = []
items_needed = batch_size
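    # Retry failed items until the batch is full. Note: there is no retry
    # cap, so persistent upstream failures will keep this loop running.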
while items_needed > 0:
# Select categories for this chunk of work
batch_categories = random.choices(categories, weights=weights, k=items_needed)
# Create tasks for needed items
tasks = [
generate_single_item(pipeline, category, len(batch_data) + i + 1)
for i, category in enumerate(batch_categories)
]
print(f" Launch {items_needed} concurrent tasks...")
results = await asyncio.gather(*tasks, return_exceptions=True)
# Collect successes
success_count = 0
for result in results:
if isinstance(result, Exception):
print(f" Task exception: {result}")
elif result is not None:
batch_data.append(result)
success_count += 1
items_needed = batch_size - len(batch_data)
if items_needed > 0:
print(f" Batch incomplete ({len(batch_data)}/{batch_size}). Retrying {items_needed} items in 5s...")
await asyncio.sleep(5)
print(f"Batch {batch_num} complete: {len(batch_data)}/{batch_size} items generated")
return batch_data
async def run_pipeline_batches_async(total_items: int = 100, batch_size: int = 10):
"""Run the full pipeline with concurrent batch processing."""
pipeline = SyntheticDataPipeline(max_retries=5)
all_data = []
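    # Note: total_items is rounded down to a whole number of batches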
num_batches = max(1, total_items // batch_size)
print(f"Starting CONCURRENT generation of {total_items} items in {num_batches} batches...")
print(f"Batch size: {batch_size} items (generated in parallel)")
for batch_num in range(1, num_batches + 1):
# Check if batch already exists
batch_filename = f"synthetic_data/batch_{batch_num:02d}.jsonl"
if os.path.exists(batch_filename):
print(f"Batch {batch_num} already exists ({batch_filename}). Skipping generation...")
            # Load existing data to include in the final output
            # (parse the whole file before extending, so a corrupt file does
            # not leave partial items in all_data before regeneration)
            try:
                with open(batch_filename, 'r') as f:
                    loaded = [json.loads(line) for line in f if line.strip()]
                all_data.extend(loaded)
                print(f"Loaded {len(all_data)} items so far.")
                continue
            except Exception as e:
                print(f"Error reading existing batch {batch_num}: {e}. Regenerating...")
# Generate entire batch concurrently
batch_data = await generate_batch_concurrent(pipeline, batch_size, batch_num)
# Save batch as JSONL
with open(batch_filename, "w") as f:
for item in batch_data:
f.write(json.dumps(item) + "\n")
print(f"Saved batch to {batch_filename}")
# Validate batch
print("Validating batch...")
metrics = validate_synthetic_data(batch_filename)
print(json.dumps(metrics, indent=2))
all_data.extend(batch_data)
# Wait 5 seconds before next batch
if batch_num < num_batches:
print("Waiting 5 seconds before next batch...")
await asyncio.sleep(5)
# Save all data
output_file = f"synthetic_data/all_generated_data_{total_items}.jsonl"
with open(output_file, "w") as f:
for item in all_data:
f.write(json.dumps(item) + "\n")
print(f"\n{'='*60}")
print(f"COMPLETED: {len(all_data)} items generated")
print(f"Full dataset saved to {output_file}")
print(f"{'='*60}")
def _get_persistence_for_category(category: str) -> str:
    """Map a category to its expected persistence level."""
    long_lived = ("brand_core", "strategic_signatures", "knowledge_artifacts",
                  "communication_style", "strategic_approach")
    medium_lived = ("tools_config", "role_context", "workflow_patterns")
    short_lived = ("business_priorities", "session_history", "none")
    if any(key in category for key in long_lived):
        return "long"
    if any(key in category for key in medium_lived):
        return "medium"
    if any(key in category for key in short_lived):
        return "short"
    if "performance_context" in category:
        return "rolling"
    if "interaction_preferences" in category:
        return "evolving"
    return "medium"
if __name__ == "__main__":
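    # Optional positional CLI args: total item count (default 100) and
    # batch size (default 10)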
total = int(sys.argv[1]) if len(sys.argv) > 1 else 100
batch = int(sys.argv[2]) if len(sys.argv) > 2 else 10
asyncio.run(run_pipeline_batches_async(total, batch))