coachable-course-agent / scripts /monitor_scraping.py
rdisipio's picture
Massive catalog expansion: 3.6x growth to 2,023 courses
a9c286b
#!/usr/bin/env python3
"""
Monitor bulk scraping progress by counting new course files.
"""
import json
import subprocess
import time
from datetime import date
from pathlib import Path
def count_new_courses(data_dir="data", date_stamp=None):
    """Count the courses stored in Coursesity scrape files for one day.

    Searches ``data_dir`` recursively for files named
    ``coursesity_*_<date_stamp>_*``. Every file that parses as a JSON object
    is counted, and the length of its ``"courses"`` entry is added to the
    running total (a missing ``"courses"`` key counts as zero courses).

    Args:
        data_dir: Directory to search. Defaults to ``"data"``, resolved
            against the current working directory.
        date_stamp: ``YYYYMMDD`` stamp to look for in the file name.
            Defaults to today's local date.

    Returns:
        A ``(total_courses, file_count)`` tuple; ``(0, 0)`` when the
        directory does not exist or no usable file matches.
    """
    root = Path(data_dir)
    if not root.is_dir():
        return 0, 0

    if date_stamp is None:
        date_stamp = date.today().strftime("%Y%m%d")

    total_courses = 0
    file_count = 0
    for path in root.rglob(f"coursesity_*_{date_stamp}_*"):
        if not path.is_file():
            continue
        try:
            with path.open(encoding="utf-8") as handle:
                payload = json.load(handle)
        except (OSError, ValueError):
            # Unreadable or invalid JSON -- e.g. a file that is still being
            # written while we poll. Skip it; a later poll can pick it up.
            continue
        if not isinstance(payload, dict):
            continue

        courses = payload.get("courses")
        try:
            course_count = 0 if courses is None else len(courses)
        except TypeError:
            # "courses" holds something without a length; not a usable file.
            continue

        total_courses += course_count
        file_count += 1

    return total_courses, file_count
def main(total_topics=233, poll_interval=30):
    """Print scraping progress at a fixed interval until interrupted.

    Each cycle calls ``count_new_courses()`` and prints the number of topic
    files found and the courses they contain. Once at least one file exists
    it also prints the average courses per topic and, while fewer than
    ``total_topics`` files are present, a projected final total based on
    that average. Ctrl+C stops the loop and prints one last count.

    Args:
        total_topics: Number of topics the bulk scrape is targeting; used
            only for the projected final total.
        poll_interval: Seconds to sleep between checks.
    """
    print("📊 Monitoring bulk Coursesity scraping progress...")
    print("Press Ctrl+C to stop monitoring\n")

    try:
        while True:
            total_courses, file_count = count_new_courses()
            print(f"📈 Progress: {file_count} topics scraped, {total_courses:,} new courses")

            if file_count > 0:
                avg_per_file = total_courses / file_count
                print(f"📊 Average: {avg_per_file:.1f} courses per topic")

                remaining = total_topics - file_count
                if remaining > 0:
                    # Assume the topics still to come match the average so far.
                    estimated_total = total_courses + remaining * avg_per_file
                    print(f"🎯 Estimated final total: {estimated_total:,.0f} new courses")

            print("-" * 50)
            time.sleep(poll_interval)
    except KeyboardInterrupt:
        print("\n👋 Monitoring stopped")
        # Take one more reading so the summary reflects the latest files.
        total_courses, file_count = count_new_courses()
        print(f"Final count: {file_count} topics, {total_courses:,} courses")
# Start the monitor only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()