# nodeaudit-openenv/scripts/seed_training_corpus.sh
# Commit 902cd29 (shreyas-joshi): Add training scripts and utilities for NodeAudit and GraphReview
#!/bin/sh
# Seed GraphReview SQLite DBs from training_corpus subpaths (application core only).
# Run after scripts/clone_training_repos.sh. Executes from code-review-env so `python -m db.seed` resolves.
# POSIX sh — safe to run as: sh scripts/seed_training_corpus.sh
set -eu

# Repository root = parent of the directory containing this script.
# CDPATH is emptied for the subshell so `cd` neither searches it nor prints
# the resolved directory into the command substitution.
ROOT="$(CDPATH='' cd "$(dirname "$0")/.." && pwd)"
ENV_DIR="$ROOT/code-review-env"

# Environment overrides:
#   CORPUS_DIR     - directory holding the cloned training repositories
#   CORPUS_DB_DIR  - directory that receives the generated .db files
CORPUS_DIR="${CORPUS_DIR:-$ROOT/training_corpus}"
OUT_DIR="${CORPUS_DB_DIR:-$ROOT/outputs/corpus_dbs}"

if [ ! -d "$ENV_DIR" ]; then
  echo "error: expected code-review-env at $ENV_DIR" >&2
  exit 1
fi

# The working directory changes to $ENV_DIR below. Anchor relative overrides
# to the caller's directory first; otherwise the output directory would be
# created relative to one directory while the seeder reads and writes relative
# to another.
case "$CORPUS_DIR" in
  /*) ;;
  *) CORPUS_DIR="$PWD/$CORPUS_DIR" ;;
esac
case "$OUT_DIR" in
  /*) ;;
  *) OUT_DIR="$PWD/$OUT_DIR" ;;
esac

mkdir -p "$OUT_DIR"

# `python -m db.seed` is resolved from the current directory, so run from
# code-review-env.
cd "$ENV_DIR"
# Seed one SQLite database from a single corpus subtree.
# Arguments:
#   $1 - database file name without the .db suffix (placed under $OUT_DIR)
#   $2 - source directory, relative to $CORPUS_DIR
# Globals:
#   CORPUS_DIR, OUT_DIR (read)
# Returns:
#   0 when the source directory is absent (reported as [skip]); otherwise the
#   exit status of the db.seed invocation.
seed_one() {
  db_file="$OUT_DIR/$1.db"
  src_dir="$CORPUS_DIR/$2"
  if [ -d "$src_dir" ]; then
    echo "[seed] $src_dir -> $db_file"
    # NOTE(review): --force presumably replaces an existing database — confirm in db.seed.
    python -m db.seed "$src_dir" --db-path "$db_file" --force
  else
    # A missing checkout is not an error: the remaining corpora still get seeded.
    echo "[skip] missing directory: $src_dir"
  fi
}
# Run seed_one for each consecutive (db_basename, relative_path) argument pair,
# in the order given.
seed_pairs() {
  while [ "$#" -ge 2 ]; do
    seed_one "$1" "$2"
    shift 2
  done
}

# Tier 1 — single package roots matching the training corpus seed table.
# corpus_celery takes the full celery package (app/, worker/, backends/ live
# under that tree).
seed_pairs \
  corpus_flask "flask/src/flask" \
  corpus_celery "celery/celery" \
  corpus_requests "requests/src/requests" \
  corpus_httpx "httpx/httpx" \
  corpus_fastapi "fastapi/fastapi" \
  corpus_sqlalchemy "sqlalchemy/lib/sqlalchemy" \
  corpus_pydantic "pydantic/pydantic"

# Tier 2.
# Scrapy is narrowed to the core/ and pipelines/ subpackages (tests/spiders
# noise is left out).
# Django is seeded as focused subtrees; each lands in its own DB, so there are
# no cross-edges between those DBs.
seed_pairs \
  corpus_luigi "luigi/luigi" \
  corpus_scrapy_core "scrapy/scrapy/core" \
  corpus_scrapy_pipelines "scrapy/scrapy/pipelines" \
  corpus_paramiko "paramiko/paramiko" \
  corpus_airflow "airflow/airflow" \
  corpus_django_db "django/django/db" \
  corpus_django_http "django/django/http" \
  corpus_django_auth "django/django/contrib/auth"

# Tier 3 — small templates (paths vary; adjust if the upstream layout changes).
# rest-api-smorest-docker is seeded from its app root: models/, resources/,
# app.py (Flask-Smorest sample).
seed_pairs \
  corpus_rest_api_smorest_docker "rest-api-smorest-docker" \
  corpus_fullstack_fastapi_template "full-stack-fastapi-template/backend/app" \
  corpus_flasky "flasky/app"

# fastapi-tdd-docker layout: project/{app,db,migrations,tests}.
# Checked here so the skip message can name the repository to clone.
if [ ! -d "$CORPUS_DIR/fastapi-tdd-docker/project" ]; then
  echo "[skip] fastapi-tdd-docker/project — clone testdrivenio/fastapi-tdd-docker first"
else
  seed_one corpus_fastapi_tdd "fastapi-tdd-docker/project"
fi

echo "Done. Databases under: $OUT_DIR"