nodeaudit-openenv / scripts /clone_training_repos.sh
shreyas-joshi's picture
Add training scripts and utilities for NodeAudit and GraphReview
902cd29
#!/bin/sh
# Clone open-source Python corpora for NodeAudit / GraphReview training.
# Licensed MIT/Apache per upstream projects. Uses shallow clones to save disk.
# POSIX sh — safe to run as: sh scripts/clone_training_repos.sh
set -eu
ROOT="$(CDPATH='' cd "$(dirname "$0")/.." && pwd)"
CORPUS_DIR="${CORPUS_DIR:-$ROOT/training_corpus}"
mkdir -p "$CORPUS_DIR"
clone_if_missing() {
url="$1"
dest="$2"
if [ -d "$dest" ] && [ -e "$dest/.git" ]; then
echo "skip (exists): $dest"
return
fi
git clone --depth=1 "$url" "$dest"
}
echo "Corpus directory: $CORPUS_DIR"
# Tier 1 — core training
clone_if_missing https://github.com/pallets/flask "$CORPUS_DIR/flask"
clone_if_missing https://github.com/celery/celery "$CORPUS_DIR/celery"
clone_if_missing https://github.com/psf/requests "$CORPUS_DIR/requests"
clone_if_missing https://github.com/encode/httpx "$CORPUS_DIR/httpx"
clone_if_missing https://github.com/fastapi/fastapi "$CORPUS_DIR/fastapi"
clone_if_missing https://github.com/sqlalchemy/sqlalchemy "$CORPUS_DIR/sqlalchemy"
clone_if_missing https://github.com/pydantic/pydantic "$CORPUS_DIR/pydantic"
# Tier 2 — topology diversity (large clones)
clone_if_missing https://github.com/spotify/luigi "$CORPUS_DIR/luigi"
clone_if_missing https://github.com/scrapy/scrapy "$CORPUS_DIR/scrapy"
clone_if_missing https://github.com/paramiko/paramiko "$CORPUS_DIR/paramiko"
clone_if_missing https://github.com/django/django "$CORPUS_DIR/django"
clone_if_missing https://github.com/apache/airflow "$CORPUS_DIR/airflow"
# Tier 3 — synthetic bug injection / smaller templates
# Small Flask-Smorest API (Real Python course flavor; original realpython/flask-smorest-api URL is not public)
clone_if_missing https://github.com/tecladocode/rest-api-smorest-docker "$CORPUS_DIR/rest-api-smorest-docker"
clone_if_missing https://github.com/tiangolo/full-stack-fastapi-template "$CORPUS_DIR/full-stack-fastapi-template"
clone_if_missing https://github.com/miguelgrinberg/flasky "$CORPUS_DIR/flasky"
clone_if_missing https://github.com/testdrivenio/fastapi-tdd-docker "$CORPUS_DIR/fastapi-tdd-docker"
count="$(find "$CORPUS_DIR" -mindepth 1 -maxdepth 1 -type d | wc -l)"
echo "Top-level corpus entries: $count"