# Rasmus Aagaard
# Deploy MLOps Mentor Leaderboard to Hugging Face Space
# 567b305
import datetime
import os
import subprocess
import numpy as np
import requests
from loguru import logger
from typer import Typer
from mlops_mentor.common import github_headers as headers
from mlops_mentor.common.models import RepoInfo
from mlops_mentor.scraper.models import RepoContent, Report, RepoStats
def create_activity_matrix(
    commits: list,
    max_delta: int = 5,
    min_delta: int = 1,
) -> list[list[int]]:
    """Create a day-by-hour commit activity matrix.

    The observation window starts at the earliest commit and spans at least
    ``min_delta`` weeks and at most ``max_delta`` weeks (capped at the time
    of the latest commit).

    Args:
        commits: GitHub API commit objects; each must expose
            ``["commit"]["committer"]["date"]`` as an ISO timestamp whose
            trailing "Z" is stripped before parsing.
        max_delta: Maximum window length in weeks.
        min_delta: Minimum window length in weeks.

    Returns:
        A ``num_days x 24`` nested list where entry ``[d][h]`` counts commits
        made ``d`` whole days after the first commit at hour-of-day ``h``.
        An empty ``commits`` list yields an empty list.
    """
    if not commits:
        # Guard: the original indexed commit_times[0] and raised IndexError here.
        return []
    commit_times = sorted(
        datetime.datetime.fromisoformat(commit["commit"]["committer"]["date"][:-1])
        for commit in commits
    )
    start_time = commit_times[0]
    end_time = max(
        start_time + datetime.timedelta(weeks=min_delta),
        min(start_time + datetime.timedelta(weeks=max_delta), commit_times[-1]),
    )
    num_days = (end_time - start_time).days + 1  # include last day
    commit_matrix = np.zeros((num_days, 24), dtype=int)
    for commit_time in commit_times:
        if start_time <= commit_time <= end_time:
            # Bucket by whole days elapsed since the first commit, and by hour of day.
            day_index = (commit_time - start_time).days
            hour_index = commit_time.hour
            commit_matrix[day_index, hour_index] += 1
    return commit_matrix.tolist()
def scrape(repo_link: str) -> RepoStats:
    """Scrape a GitHub repository and collect activity/content statistics.

    Args:
        repo_link: URL of the repository to scrape.

    Returns:
        A RepoStats populated from the GitHub API and repository contents,
        or one with every field set to None when the repo is not accessible.
    """
    repo = RepoInfo(repo_url=repo_link)
    if not repo.is_accessible:
        logger.error(f"Repository {repo.repo_url} is not accessible.")
        # Inaccessible repository: return a sentinel RepoStats with all
        # fields unset so callers can distinguish "no data" from zeros.
        repo_stats = RepoStats(
            num_contributors=None,
            num_prs=None,
            num_commits_to_main=None,
            average_commit_length_to_main=None,
            latest_commit=None,
            average_commit_length=None,
            contributions_per_contributor=None,
            total_commits=None,
            activity_matrix=None,
            num_docker_files=None,
            num_python_files=None,
            num_workflow_files=None,
            has_requirements_file=None,
            has_cloudbuild=None,
            using_dvc=None,
            repo_size=None,
            readme_length=None,
            actions_passing=None,
            num_warnings=None,
        )
    else:
        logger.info(f"Scraping repository {repo.repo_url}")
        contributors = repo.contributors
        num_contributors = len(contributors)
        prs = repo.prs
        num_prs = len(prs)
        commits = repo.commits
        num_commits_to_main = len(commits)
        commit_messages = [c["commit"]["message"] for c in commits]
        # NOTE(review): assumes at least one commit on the default branch —
        # an empty repo would raise ZeroDivisionError / IndexError below.
        average_commit_length_to_main = sum([len(c) for c in commit_messages]) / len(
            commit_messages
        )
        latest_commit = commits[0]["commit"]["author"]["date"]
        # Also collect commits from merged PRs so that PR work (e.g. squash
        # merges) is counted per contributor via commits_pr.
        merged_prs = [p["number"] for p in prs if p["merged_at"] is not None]
        for pr_num in merged_prs:
            pr_commits: list[dict] = requests.get(
                f"{repo.repo_api}/pulls/{pr_num}/commits",
                headers=headers,  # presumably GitHub auth headers dict — verify in mlops_mentor.common
                timeout=100,
            ).json()
            commit_messages += [c["commit"]["message"] for c in pr_commits]
            for commit in pr_commits:
                for contributor in contributors:
                    commit_author = commit.get("author")  # GitHub account info
                    commit_committer = commit.get("committer")  # GitHub account info
                    commit_author_name = commit["commit"]["author"]["name"]
                    commit_committer_name = commit["commit"]["committer"]["name"]
                    # Attribute a PR commit to a contributor by account login,
                    # falling back to a case-insensitive git author/committer
                    # name match against the login.
                    matches = (
                        (
                            commit_author
                            and commit_author.get("login") == contributor.login
                        )
                        or (
                            commit_author_name
                            and commit_author_name.lower() == contributor.login.lower()
                        )
                        or (
                            commit_committer
                            and commit_committer.get("login") == contributor.login
                        )
                        or (
                            commit_committer_name
                            and commit_committer_name.lower()
                            == contributor.login.lower()
                        )
                    )
                    if matches:
                        contributor.commits_pr += 1
                        break  # credit each commit to at most one contributor
            commits += pr_commits
        activity_matrix = create_activity_matrix(commits, max_delta=3, min_delta=1)
        # Average over main-branch + PR commit messages combined.
        average_commit_length = sum([len(c) for c in commit_messages]) / len(
            commit_messages
        )
        contributions_per_contributor = [c.total_commits for c in contributors]
        total_commits = sum(contributions_per_contributor)
        repo_content = RepoContent(
            repo_api=repo.repo_api,
            default_branch=repo.default_branch,
        )
        num_docker_files = repo_content.num_docker_files
        num_python_files = repo_content.num_python_files
        num_workflow_files = repo_content.num_workflow_files
        has_requirements_file = repo_content.has_requirements_file
        has_cloudbuild = repo_content.has_cloudbuild
        using_dvc = repo_content.using_dvc
        repo_size = repo_content.repo_size
        readme_length = repo_content.readme_length
        actions_passing = repo_content.actions_passing
        report = Report(
            repo_api=repo.repo_api,
            default_branch=repo.default_branch,
        )
        # NOTE(review): check_answers is read as an attribute/property —
        # confirm it is not a method that should be called.
        num_warnings = report.check_answers
        repo_stats = RepoStats(
            num_contributors=num_contributors,
            num_prs=num_prs,
            num_commits_to_main=num_commits_to_main,
            average_commit_length_to_main=average_commit_length_to_main,
            latest_commit=latest_commit,
            average_commit_length=average_commit_length,
            contributions_per_contributor=contributions_per_contributor,
            total_commits=total_commits,
            activity_matrix=activity_matrix,
            num_docker_files=num_docker_files,
            num_python_files=num_python_files,
            num_workflow_files=num_workflow_files,
            has_requirements_file=has_requirements_file,
            has_cloudbuild=has_cloudbuild,
            using_dvc=using_dvc,
            repo_size=repo_size,
            readme_length=readme_length,
            actions_passing=actions_passing,
            num_warnings=num_warnings,
        )
        logger.info(f"Scraping of repository {repo.repo_url} completed successfully")
    logger.info(f"Repo stats: {repo_stats}")
    return repo_stats
def clone(repo_link: str, base_dir: str = "cloned_repos"):
    """Clone a repository into a per-group directory under ``base_dir``.

    Args:
        repo_link: URL of the git repository to clone.
        base_dir: Root directory under which all group repos are stored.
    """
    # Group directory name is derived from the last two URL path segments,
    # e.g. https://github.com/org/repo -> group_org.repo
    group_dir = os.path.join(base_dir, f"group_{'.'.join(repo_link.split('/')[-2:])}")
    # Extract the repository name from the URL
    repo_name = repo_link.split("/")[-1].replace(".git", "")
    repo_dir = os.path.join(group_dir, repo_name)
    # makedirs with exist_ok creates base_dir/group_dir/repo_dir in one call
    # and avoids the race-prone exists()-then-makedirs() pattern.
    os.makedirs(repo_dir, exist_ok=True)
    # Clone the repository (git accepts an existing empty target directory).
    try:
        subprocess.run(["git", "clone", repo_link, repo_dir], check=True)
        logger.info(f"Successfully cloned {repo_link} into {repo_dir}")
    except subprocess.CalledProcessError as e:
        # A failed clone is an error condition, not routine info.
        logger.error(f"Failed to clone {repo_link}: {e}")
if __name__ == "__main__":
    # Expose scrape and clone as subcommands of a single Typer CLI.
    cli = Typer()
    for command in (scrape, clone):
        cli.command()(command)
    cli()