#!/usr/bin/env python3
"""Monitor bulk scraping progress by counting new course files.

The scraper is expected to write JSON files named
``coursesity_<topic>_<YYYYMMDD>_<suffix>`` somewhere below the data directory,
each holding a top-level ``"courses"`` collection.  This script polls those
files and prints a running tally until interrupted with Ctrl+C.
"""

import json
import subprocess  # noqa: F401 -- no longer used here; kept so the module's imports are unchanged
import sys
import time
from pathlib import Path

# Date stamp (YYYYMMDD) embedded in the names of the scrape files to count.
DEFAULT_DATE_STAMP = "20250830"
# Directory that is searched recursively for scrape output files.
DEFAULT_DATA_DIR = "data"
# Total number of topics the bulk scrape is targeting.
TOTAL_TOPICS = 233
# Seconds to wait between two progress checks.
POLL_INTERVAL_SECONDS = 30


def _count_courses_in_file(path):
    """Return the number of courses stored in *path*, or None if unreadable.

    A file is skipped (None) when it cannot be read, is not valid JSON (for
    example because the scraper is still writing it), is not a JSON object,
    or holds a ``courses`` value that has no length.  A JSON object without a
    ``courses`` key counts as zero courses.
    """
    try:
        with path.open(encoding="utf-8") as handle:
            payload = json.load(handle)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None

    if not isinstance(payload, dict):
        return None

    courses = payload.get("courses")
    if courses is None:
        return 0
    try:
        return len(courses)
    except TypeError:
        return None


def count_new_courses(date_stamp=DEFAULT_DATE_STAMP, data_dir=DEFAULT_DATA_DIR):
    """Count the courses in the scrape files written for *date_stamp*.

    Args:
        date_stamp: ``YYYYMMDD`` string that must appear in the file name
            (``coursesity_*_<date_stamp>_*``).
        data_dir: Directory searched recursively for matching files.

    Returns:
        A ``(total_courses, file_count)`` tuple.  Only files that could be
        parsed contribute to either number, so one broken or half-written
        file no longer resets the whole tally.  ``(0, 0)`` is returned when
        *data_dir* does not exist or nothing matches.
    """
    root = Path(data_dir)
    if not root.is_dir():
        return 0, 0

    total_courses = 0
    file_count = 0
    for path in root.rglob(f"coursesity_*_{date_stamp}_*"):
        # The name pattern can also match directories; only files hold data.
        if not path.is_file():
            continue
        course_count = _count_courses_in_file(path)
        if course_count is None:
            continue
        total_courses += course_count
        file_count += 1
    return total_courses, file_count


def _progress_lines(total_courses, file_count, total_topics):
    """Build the report lines printed for one progress check."""
    lines = [
        f"📈 Progress: {file_count} topics scraped, {total_courses:,} new courses"
    ]
    if file_count > 0:
        avg_per_file = total_courses / file_count
        lines.append(f"📊 Average: {avg_per_file:.1f} courses per topic")
        if file_count < total_topics:
            # Extrapolate: assume the remaining topics match the average so far.
            remaining = total_topics - file_count
            estimated_total = total_courses + remaining * avg_per_file
            lines.append(
                f"🎯 Estimated final total: {estimated_total:,.0f} new courses"
            )
    return lines


def main(
    date_stamp=DEFAULT_DATE_STAMP,
    total_topics=TOTAL_TOPICS,
    poll_seconds=POLL_INTERVAL_SECONDS,
):
    """Print scraping progress every *poll_seconds* until Ctrl+C is pressed.

    Args:
        date_stamp: ``YYYYMMDD`` stamp of the scrape files to monitor.
        total_topics: Number of topics the scrape targets; used to
            extrapolate the final course total.
        poll_seconds: Delay between two progress checks.
    """
    print("📊 Monitoring bulk Coursesity scraping progress...")
    print("Press Ctrl+C to stop monitoring\n")

    try:
        while True:
            total_courses, file_count = count_new_courses(date_stamp)
            for line in _progress_lines(total_courses, file_count, total_topics):
                print(line)
            print("-" * 50)
            time.sleep(poll_seconds)
    except KeyboardInterrupt:
        print("\n👋 Monitoring stopped")
        total_courses, file_count = count_new_courses(date_stamp)
        print(f"Final count: {file_count} topics, {total_courses:,} courses")


if __name__ == "__main__":
    # An optional first command-line argument overrides the date stamp.
    main(*sys.argv[1:2])