Spaces:

rdisipio
/

coachable-course-agent

Runtime error

Massive catalog expansion: 3.6x growth to 2,023 courses

a9c286b 9 months ago

1.84 kB

	#!/usr/bin/env python3
	"""
	Monitor bulk scraping progress by counting new course files.
	"""

	import json
	import subprocess
	import time
	from pathlib import Path

	def count_new_courses(data_dir="data", pattern="coursesity_*_20250830_*"):
	    """Count the courses stored in scrape output files under *data_dir*.

	    Every regular file below ``data_dir`` (searched recursively) whose name
	    matches ``pattern`` is parsed as JSON, and the length of its top-level
	    ``"courses"`` list is added to the total.

	    Args:
	        data_dir: Directory to search. Defaults to ``"data"`` relative to
	            the current working directory.
	        pattern: Glob pattern applied to file names. The default carries the
	            hard-coded ``20250830`` date stamp; the wildcards accept whatever
	            the scraper puts around it.
	            NOTE(review): the exact file-name layout is not visible in this
	            file -- confirm the pattern against the scraper's output.

	    Returns:
	        A ``(total_courses, file_count)`` tuple: the summed course count and
	        the number of files that contributed to it. ``(0, 0)`` when nothing
	        matches or ``data_dir`` does not exist.
	    """
	    total_courses = 0
	    file_count = 0

	    for path in Path(data_dir).rglob(pattern):
	        if not path.is_file():
	            continue

	        try:
	            with path.open(encoding="utf-8") as handle:
	                payload = json.load(handle)
	        except (OSError, ValueError):
	            # Unreadable or not (yet) valid JSON -- possibly a file that is
	            # still being written. Skip it; a later poll will retry.
	            continue

	        if not isinstance(payload, dict):
	            continue

	        courses = payload.get("courses")
	        if courses is None:
	            # A file without a "courses" key still counts as a scraped
	            # topic, contributing zero courses.
	            courses = []
	        elif not isinstance(courses, list):
	            continue

	        total_courses += len(courses)
	        file_count += 1

	    return total_courses, file_count

	def main(total_topics=233, poll_interval=30):
	    """Print scraping progress repeatedly until interrupted with Ctrl+C.

	    Each cycle calls ``count_new_courses()`` and prints the number of topic
	    files and courses found so far. Once at least one file exists it also
	    prints the average courses per topic and, while fewer than
	    ``total_topics`` files exist, a linear extrapolation of the final total.
	    On ``KeyboardInterrupt`` one last count is taken and printed.

	    Args:
	        total_topics: Number of topics the bulk scrape is targeting; used
	            only for the final-total estimate.
	        poll_interval: Seconds to sleep between progress checks.
	    """
	    print("📊 Monitoring bulk Coursesity scraping progress...")
	    print("Press Ctrl+C to stop monitoring\n")

	    try:
	        while True:
	            total_courses, file_count = count_new_courses()
	            print(f"📈 Progress: {file_count} topics scraped, {total_courses:,} new courses")

	            # The average (and the estimate built on it) is undefined until
	            # at least one topic file exists.
	            if file_count > 0:
	                avg_per_file = total_courses / file_count
	                print(f"📊 Average: {avg_per_file:.1f} courses per topic")

	                if file_count < total_topics:
	                    # Assume the remaining topics yield the current average.
	                    remaining = total_topics - file_count
	                    estimated_total = total_courses + (remaining * avg_per_file)
	                    print(f"🎯 Estimated final total: {estimated_total:,.0f} new courses")

	            print("-" * 50)
	            time.sleep(poll_interval)

	    except KeyboardInterrupt:
	        print("\n👋 Monitoring stopped")
	        total_courses, file_count = count_new_courses()
	        print(f"Final count: {file_count} topics, {total_courses:,} courses")

	# Start the monitor only when this file is executed as a script, so that
	# importing the module has no side effects.
	if __name__ == "__main__":
	    main()