# Rasmus Aagaard
# Deploy MLOps Mentor Leaderboard to Hugging Face Space
# 567b305
import datetime
import os
import subprocess
import numpy as np
import requests
from loguru import logger
from typer import Typer
from mlops_mentor.common import github_headers as headers
from mlops_mentor.common.models import RepoInfo
from mlops_mentor.scraper.models import RepoContent, Report, RepoStats
def create_activity_matrix(
    commits: list,
    max_delta: int = 5,
    min_delta: int = 1,
) -> list[list[int]]:
    """Create a day-by-hour commit activity matrix.

    The observation window starts at the earliest commit and spans at least
    ``min_delta`` weeks and at most ``max_delta`` weeks (capped at the time
    of the latest commit).

    Args:
        commits: GitHub API commit objects; each must expose
            ``["commit"]["committer"]["date"]`` as an ISO timestamp whose
            trailing "Z" is stripped before parsing.
        max_delta: Maximum window length in weeks.
        min_delta: Minimum window length in weeks.

    Returns:
        A ``num_days x 24`` nested list where entry ``[d][h]`` counts commits
        made ``d`` whole days after the first commit at hour-of-day ``h``.
        An empty ``commits`` list yields an empty list.
    """
    if not commits:
        # Guard: the original indexed commit_times[0] and raised IndexError here.
        return []
    commit_times = sorted(
        datetime.datetime.fromisoformat(commit["commit"]["committer"]["date"][:-1])
        for commit in commits
    )
    start_time = commit_times[0]
    end_time = max(
        start_time + datetime.timedelta(weeks=min_delta),
        min(start_time + datetime.timedelta(weeks=max_delta), commit_times[-1]),
    )
    num_days = (end_time - start_time).days + 1  # include last day
    commit_matrix = np.zeros((num_days, 24), dtype=int)
    for commit_time in commit_times:
        if start_time <= commit_time <= end_time:
            # Bucket by whole days elapsed since the first commit, and by hour of day.
            day_index = (commit_time - start_time).days
            hour_index = commit_time.hour
            commit_matrix[day_index, hour_index] += 1
    return commit_matrix.tolist()
def scrape(repo_link: str) -> RepoStats:
    """Scrape a GitHub repository and collect activity/content statistics.

    Args:
        repo_link: URL of the repository to scrape.

    Returns:
        A RepoStats populated from the GitHub API and repository contents,
        or one with every field set to None when the repo is not accessible.
    """
    repo = RepoInfo(repo_url=repo_link)
    if not repo.is_accessible:
        logger.error(f"Repository {repo.repo_url} is not accessible.")
        # Inaccessible repository: return a sentinel RepoStats with all
        # fields unset so callers can distinguish "no data" from zeros.
        repo_stats = RepoStats(
            num_contributors=None,
            num_prs=None,
            num_commits_to_main=None,
            average_commit_length_to_main=None,
            latest_commit=None,
            average_commit_length=None,
            contributions_per_contributor=None,
            total_commits=None,
            activity_matrix=None,
            num_docker_files=None,
            num_python_files=None,
            num_workflow_files=None,
            has_requirements_file=None,
            has_cloudbuild=None,
            using_dvc=None,
            repo_size=None,
            readme_length=None,
            actions_passing=None,
            num_warnings=None,
        )
    else:
        logger.info(f"Scraping repository {repo.repo_url}")
        contributors = repo.contributors
        num_contributors = len(contributors)
        prs = repo.prs
        num_prs = len(prs)
        commits = repo.commits
        num_commits_to_main = len(commits)
        commit_messages = [c["commit"]["message"] for c in commits]
        # NOTE(review): assumes at least one commit on the default branch —
        # an empty repo would raise ZeroDivisionError / IndexError below.
        average_commit_length_to_main = sum([len(c) for c in commit_messages]) / len(
            commit_messages
        )
        latest_commit = commits[0]["commit"]["author"]["date"]
        # Also collect commits from merged PRs so that PR work (e.g. squash
        # merges) is counted per contributor via commits_pr.
        merged_prs = [p["number"] for p in prs if p["merged_at"] is not None]
        for pr_num in merged_prs:
            pr_commits: list[dict] = requests.get(
                f"{repo.repo_api}/pulls/{pr_num}/commits",
                headers=headers,  # presumably GitHub auth headers dict — verify in mlops_mentor.common
                timeout=100,
            ).json()
            commit_messages += [c["commit"]["message"] for c in pr_commits]
            for commit in pr_commits:
                for contributor in contributors:
                    commit_author = commit.get("author")  # GitHub account info
                    commit_committer = commit.get("committer")  # GitHub account info
                    commit_author_name = commit["commit"]["author"]["name"]
                    commit_committer_name = commit["commit"]["committer"]["name"]
                    # Attribute a PR commit to a contributor by account login,
                    # falling back to a case-insensitive git author/committer
                    # name match against the login.
                    matches = (
                        (
                            commit_author
                            and commit_author.get("login") == contributor.login
                        )
                        or (
                            commit_author_name
                            and commit_author_name.lower() == contributor.login.lower()
                        )
                        or (
                            commit_committer
                            and commit_committer.get("login") == contributor.login
                        )
                        or (
                            commit_committer_name
                            and commit_committer_name.lower()
                            == contributor.login.lower()
                        )
                    )
                    if matches:
                        contributor.commits_pr += 1
                        break  # credit each commit to at most one contributor
            commits += pr_commits
        activity_matrix = create_activity_matrix(commits, max_delta=3, min_delta=1)
        # Average over main-branch + PR commit messages combined.
        average_commit_length = sum([len(c) for c in commit_messages]) / len(
            commit_messages
        )
        contributions_per_contributor = [c.total_commits for c in contributors]
        total_commits = sum(contributions_per_contributor)
        repo_content = RepoContent(
            repo_api=repo.repo_api,
            default_branch=repo.default_branch,
        )
        num_docker_files = repo_content.num_docker_files
        num_python_files = repo_content.num_python_files
        num_workflow_files = repo_content.num_workflow_files
        has_requirements_file = repo_content.has_requirements_file
        has_cloudbuild = repo_content.has_cloudbuild
        using_dvc = repo_content.using_dvc
        repo_size = repo_content.repo_size
        readme_length = repo_content.readme_length
        actions_passing = repo_content.actions_passing
        report = Report(
            repo_api=repo.repo_api,
            default_branch=repo.default_branch,
        )
        # NOTE(review): check_answers is read as an attribute/property —
        # confirm it is not a method that should be called.
        num_warnings = report.check_answers
        repo_stats = RepoStats(
            num_contributors=num_contributors,
            num_prs=num_prs,
            num_commits_to_main=num_commits_to_main,
            average_commit_length_to_main=average_commit_length_to_main,
            latest_commit=latest_commit,
            average_commit_length=average_commit_length,
            contributions_per_contributor=contributions_per_contributor,
            total_commits=total_commits,
            activity_matrix=activity_matrix,
            num_docker_files=num_docker_files,
            num_python_files=num_python_files,
            num_workflow_files=num_workflow_files,
            has_requirements_file=has_requirements_file,
            has_cloudbuild=has_cloudbuild,
            using_dvc=using_dvc,
            repo_size=repo_size,
            readme_length=readme_length,
            actions_passing=actions_passing,
            num_warnings=num_warnings,
        )
        logger.info(f"Scraping of repository {repo.repo_url} completed successfully")
    logger.info(f"Repo stats: {repo_stats}")
    return repo_stats
def clone(repo_link: str, base_dir: str = "cloned_repos"):
    """Clone a repository into a per-group directory under ``base_dir``.

    Args:
        repo_link: URL of the git repository to clone.
        base_dir: Root directory under which all group repos are stored.
    """
    # Group directory name is derived from the last two URL path segments,
    # e.g. https://github.com/org/repo -> group_org.repo
    group_dir = os.path.join(base_dir, f"group_{'.'.join(repo_link.split('/')[-2:])}")
    # Extract the repository name from the URL
    repo_name = repo_link.split("/")[-1].replace(".git", "")
    repo_dir = os.path.join(group_dir, repo_name)
    # makedirs with exist_ok creates base_dir/group_dir/repo_dir in one call
    # and avoids the race-prone exists()-then-makedirs() pattern.
    os.makedirs(repo_dir, exist_ok=True)
    # Clone the repository (git accepts an existing empty target directory).
    try:
        subprocess.run(["git", "clone", repo_link, repo_dir], check=True)
        logger.info(f"Successfully cloned {repo_link} into {repo_dir}")
    except subprocess.CalledProcessError as e:
        # A failed clone is an error condition, not routine info.
        logger.error(f"Failed to clone {repo_link}: {e}")
if __name__ == "__main__":
    # Expose scrape and clone as subcommands of a single Typer CLI.
    cli = Typer()
    for command in (scrape, clone):
        cli.command()(command)
    cli()