migratron / code_migration /migration_docs.py
amrithanandini's picture
integrated backend and frontend
1b35d41
"""
Migration Docs Fetcher
======================
Provides pre-collected breaking-change documentation for Python version
migrations and package updates. The notes are hardcoded summaries taken from
the official docs; this module does not currently fetch or cache anything at
runtime.
Usage:
# As a module
from code_migration.migration_docs import get_migration_context
context = get_migration_context(
old_python="3.6",
new_python="3.12",
related_modules="numpy",
dependency_versions="numpy==2.3.2\npandas==2.3.1\n",
)
# As a script — pre-fetch all docs for the dataset
python code_migration/migration_docs.py
"""
from __future__ import annotations
import json
import os
import re
import time
from pathlib import Path
from typing import Dict, List, Optional
from urllib.request import urlopen, Request
from urllib.error import URLError
# Cache directory for migration docs: <this module's directory>/data/migration_docs_cache.
# NOTE(review): nothing else in this file reads or writes this location.
CACHE_DIR = Path(__file__).parent.joinpath("data", "migration_docs_cache")
# ---------------------------------------------------------------------------
# Python "What's New" — breaking changes per version
# ---------------------------------------------------------------------------
# Hardcoded summaries of the most impactful breaking changes per Python version.
# These are extracted from docs.python.org/3.X/whatsnew/ pages.
# Much faster and more reliable than fetching at runtime.
# Hand-curated breaking-change notes keyed by the "3.<minor>" release in which
# the change took effect. An empty string marks a release with nothing recorded.
# get_python_breaking_changes() concatenates the entries for every release in
# the half-open range (old, new], so each note must be filed under the release
# where the change actually landed (a removal filed too early makes the prompt
# claim an API is gone while it still works).
PYTHON_BREAKING_CHANGES: Dict[str, str] = {
    "3.7": "",  # baseline — no breaks from 3.6
    "3.8": (
        "- `collections.abc` aliases in `collections` deprecated (Callable, Mapping, etc.)\n"
        "- `platform.popen()` removed\n"
        "- `time.clock()` removed → use `time.perf_counter()`\n"
    ),
    "3.9": (
        "- `collections.abc` aliases in `collections` still deprecated\n"
        "- `typing.List`, `typing.Dict` etc. can be replaced with `list`, `dict`\n"
        "- `math.gcd()` now accepts multiple arguments\n"
    ),
    "3.10": (
        "- `collections.Callable` REMOVED → use `collections.abc.Callable`\n"
        "- `collections.Mapping` REMOVED → use `collections.abc.Mapping`\n"
        "- `collections.MutableMapping` REMOVED → use `collections.abc.MutableMapping`\n"
        "- `collections.Iterable` REMOVED → use `collections.abc.Iterable`\n"
        "- `collections.Iterator` REMOVED → use `collections.abc.Iterator`\n"
        "- `collections.Sequence` REMOVED → use `collections.abc.Sequence`\n"
        "- `collections.MutableSequence` REMOVED → use `collections.abc.MutableSequence`\n"
        "- `collections.Set` REMOVED → use `collections.abc.Set`\n"
        "- `collections.MutableSet` REMOVED → use `collections.abc.MutableSet`\n"
        "- `collections.ByteString` REMOVED → use `collections.abc.ByteString`\n"
        "- `loop` parameter removed from most `asyncio` functions\n"
    ),
    "3.11": (
        "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` deprecated\n"
        "- `locale.resetlocale()` deprecated\n"
        "- `configparser.SafeConfigParser` → `configparser.ConfigParser`\n"
    ),
    "3.12": (
        # The unittest helpers and locale.resetlocale() still exist in 3.12;
        # their removal landed in 3.13 (see the "3.13" entry).
        "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` still deprecated (removed in 3.13)\n"
        "- `configparser.SafeConfigParser` REMOVED → use `ConfigParser`\n"
        "- `distutils` REMOVED entirely → use `setuptools`\n"
        "- `imp` module REMOVED → use `importlib`\n"
        "- `pkgutil.ImpImporter` and `pkgutil.ImpLoader` REMOVED\n"
        "- `locale.resetlocale()` still deprecated (removed in 3.13)\n"
        "- `asynchat`, `asyncore`, `smtpd` REMOVED\n"
        "- `xml.etree.ElementTree.Element.copy()` → use `copy.copy()`\n"
    ),
    "3.13": (
        "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` REMOVED → use `unittest.TestLoader` methods\n"
        "- `locale.resetlocale()` REMOVED → use `locale.setlocale(locale.LC_ALL, '')`\n"
        "- `typing.io` and `typing.re` REMOVED → import the names from `typing` directly\n"
    ),
}
# ---------------------------------------------------------------------------
# Package-specific breaking changes
# ---------------------------------------------------------------------------
# Hand-curated breaking-change notes keyed by package name exactly as it is
# expected to appear in a task's `related_modules` field. Each value is a
# newline-terminated bullet list that is pasted verbatim into the prompt, so
# every entry must be self-contained (no "see above" cross-references).
PACKAGE_BREAKING_CHANGES: Dict[str, str] = {
    "numpy": (
        "NumPy 2.0 breaking changes:\n"
        "- `np.math` REMOVED → use `math` (stdlib) directly. `import math; math.factorial(x)` etc.\n"
        "- `np.product` REMOVED → use `np.prod`\n"
        "- `np.cumproduct` REMOVED → use `np.cumprod`\n"
        "- `np.sometrue` REMOVED → use `np.any`\n"
        "- `np.alltrue` REMOVED → use `np.all`\n"
        "- `np.in1d` REMOVED → use `np.isin`\n"
        "- `np.row_stack` REMOVED → use `np.vstack`\n"
        "- `np.bool` REMOVED → use `bool`\n"
        "- `np.int` REMOVED → use `int`\n"
        "- `np.float` REMOVED → use `float`\n"
        "- `np.complex` REMOVED → use `complex`\n"
        "- `np.object` REMOVED → use `object`\n"
        "- `np.str` REMOVED → use `str`\n"
        "- `np.long` REMOVED → use `int`\n"
        "- `np.unicode` REMOVED → use `str`\n"
        "- `numpy.NaN` REMOVED → use `numpy.nan`\n"
        "- `numpy.Inf` REMOVED → use `numpy.inf`\n"
        "- `numpy.string_` → `numpy.bytes_`\n"
        "- `numpy.unicode_` → `numpy.str_`\n"
        "- `numpy.AxisError` moved to `numpy.exceptions.AxisError`\n"
        "- `np.typeDict` REMOVED → use `np.sctypeDict`\n"
        "- `np.matrix.itemset()` REMOVED → use direct indexing `M[i,j] = val`\n"
        "- `np.ndarray.itemset()` REMOVED → use direct indexing `arr[i] = val`\n"
    ),
    "pandas": (
        "Pandas 2.0+ breaking changes:\n"
        "- `DataFrame.append()` REMOVED → use `pd.concat([df, new_row])`\n"
        "- `Series.append()` REMOVED → use `pd.concat([s1, s2])`\n"
        "- `pd.read_csv(error_bad_lines=)` REMOVED → use `on_bad_lines='skip'`\n"
        "- `pd.read_csv(warn_bad_lines=)` REMOVED → use `on_bad_lines='warn'`\n"
        "- `DataFrame.swaplevel()` → axis parameter deprecated\n"
        "- `Index.is_monotonic` → use `Index.is_monotonic_increasing`\n"
        "- Default dtype changed from object to nullable types\n"
    ),
    "flask": (
        "Flask 2.0+ / 3.0+ breaking changes:\n"
        "- `flask.helpers.safe_join` REMOVED → use `werkzeug.utils.safe_join`\n"
        "- `flask.json.JSONEncoder` REMOVED → use standard json or custom\n"
        "- `@app.before_first_request` REMOVED\n"
        "- `flask.escape` REMOVED → use `markupsafe.escape`\n"
        "- `flask.Markup` REMOVED → use `markupsafe.Markup`\n"
    ),
    "Django": (
        "Django 4.0+ / 5.0+ breaking changes:\n"
        "- `django.conf.urls.url()` REMOVED → use `django.urls.re_path()`\n"
        "- `django.utils.translation.ugettext()` REMOVED → use `gettext()`\n"
        "- `django.utils.translation.ugettext_lazy()` REMOVED → use `gettext_lazy()`\n"
        "- `django.utils.translation.ungettext()` REMOVED → use `ngettext()`\n"
        "- `django.utils.translation.ungettext_lazy()` REMOVED → use `ngettext_lazy()`\n"
        "- `django.utils.encoding.force_text()` REMOVED → use `force_str()`\n"
        "- `django.utils.encoding.smart_text()` REMOVED → use `smart_str()`\n"
        "- `django.utils.http.is_safe_url()` REMOVED → use `url_has_allowed_host_and_scheme()`\n"
        "- `django.conf.urls.include()` no longer accepts `app_name` as string\n"
    ),
    "gensim": (
        "Gensim 4.0+ breaking changes:\n"
        "- `gensim.models.Word2Vec.most_similar()` → use `model.wv.most_similar()`\n"
        "- `gensim.models.KeyedVectors.load_word2vec_format()` still works\n"
        "- `gensim.corpora.Dictionary.doc2bow()` unchanged\n"
        "- `gensim.similarities.MatrixSimilarity` unchanged\n"
        "- `smart_open` dependency updated\n"
    ),
    "pydantic": (
        "Pydantic v2 breaking changes:\n"
        "- `BaseModel.dict()` → use `BaseModel.model_dump()`\n"
        "- `BaseModel.json()` → use `BaseModel.model_dump_json()`\n"
        "- `BaseModel.parse_obj()` → use `BaseModel.model_validate()`\n"
        "- `BaseModel.parse_raw()` → use `BaseModel.model_validate_json()`\n"
        "- `@validator` → use `@field_validator`\n"
        "- `@root_validator` → use `@model_validator`\n"
        "- `Field(regex=)` → use `Field(pattern=)`\n"
        "- `Config` class → use `model_config = ConfigDict(...)`\n"
    ),
    "PyYAML": (
        "PyYAML breaking changes:\n"
        "- `yaml.load(f)` without Loader is REMOVED → use `yaml.safe_load(f)`\n"
        "- `yaml.load(f, Loader=yaml.FullLoader)` is the explicit alternative\n"
    ),
    "pillow": (
        "Pillow (PIL) breaking changes:\n"
        "- `Image.ANTIALIAS` REMOVED → use `Image.LANCZOS`\n"
        "- `ImageDraw.textsize()` REMOVED → use `ImageDraw.textbbox()` or `textlength()`\n"
        "- `FreeTypeFont.getsize()` REMOVED → use `FreeTypeFont.getbbox()`\n"
    ),
    "PyJWT": (
        "PyJWT 2.0+ breaking changes:\n"
        "- `jwt.decode()` now returns dict directly (was bytes in 1.x)\n"
        "- `algorithms` parameter is now required in `jwt.decode()`\n"
        "- `jwt.decode(verify=False)` → use `options={'verify_signature': False}`\n"
    ),
    "marshmallow": (
        "Marshmallow 3.0+ breaking changes:\n"
        "- `Schema.dump()` returns data directly (not tuple)\n"
        "- `Schema.load()` returns data directly (not tuple)\n"
        "- `fields.Nested(many=True)` → use `fields.List(fields.Nested(...))`\n"
        "- `@post_load` decorated methods receive `**kwargs` differently\n"
    ),
    "Flask-SQLAlchemy": (
        "Flask-SQLAlchemy 3.0+ breaking changes:\n"
        "- `db.Model.query` still works but `db.session.execute(select(...))` preferred\n"
        "- `SQLALCHEMY_TRACK_MODIFICATIONS` default changed\n"
    ),
    "pyasn1": (
        "pyasn1 0.5+ / 0.6+ breaking changes:\n"
        "- `pyasn1.compat.octets` module REMOVED\n"
        "- `pyasn1.compat.octets.null` REMOVED → use `b''`\n"
        "- `pyasn1.compat.octets.str2octs(s)` REMOVED → use `s.encode()` or `b'...'`\n"
        "- `pyasn1.compat.octets.octs2str(b)` REMOVED → use `b.decode()`\n"
        "- `pyasn1.compat.octets.isOctetsType(x)` REMOVED → use `isinstance(x, bytes)`\n"
        "- `pyasn1.compat.integer` module REMOVED\n"
    ),
    "pyee": (
        "pyee 9.0+ / 12.0+ breaking changes:\n"
        "- `from pyee import ExecutorEventEmitter` REMOVED → use `from pyee.executor import ExecutorEventEmitter`\n"
        "- `from pyee import BaseEventEmitter` REMOVED → use `from pyee.base import EventEmitter`\n"
        "- `BaseEventEmitter` renamed to `EventEmitter`\n"
    ),
    "async-timeout": (
        "async-timeout 4.0+ breaking changes:\n"
        "- `async_timeout.timeout()` now returns async context manager\n"
        "- `with async_timeout.timeout(n):` → `async with asyncio.timeout(n):`\n"
        "- In Python 3.11+, use `asyncio.timeout()` from stdlib instead\n"
    ),
    "discord": (
        "discord.py 2.0 breaking changes:\n"
        "- Many methods now require `Intents`\n"
        "- `Client.logout()` REMOVED → use `Client.close()`\n"
        "- `on_ready` behavior changed\n"
        "- `commands.Bot` requires `intents` parameter\n"
    ),
    "tweepy": (
        "Tweepy 4.0+ breaking changes:\n"
        "- `API` class methods renamed\n"
        "- `StreamListener` REMOVED → subclass `Stream` directly\n"
        "- `Cursor` API changed\n"
    ),
    "scikit-learn": (
        "scikit-learn 1.0+ breaking changes:\n"
        "- `sklearn.utils._get_column_indices` REMOVED\n"
        "- `sklearn.utils._safe_indexing` moved to `sklearn.utils`\n"
        "- Many private APIs reorganized\n"
        "- `sklearn.metrics.plot_*` functions deprecated → use display objects\n"
    ),
    "ansible": (
        "Ansible breaking changes:\n"
        "- Module paths changed in ansible-core 2.10+\n"
        "- `AnsibleModule` import paths may differ\n"
    ),
}

# Aliases
PACKAGE_BREAKING_CHANGES["builtin"] = ""  # covered by Python version changes
# Lowercase spelling shares the real Django notes: the text is pasted into a
# prompt on its own, so a "see the Django entry" pointer would be useless there.
PACKAGE_BREAKING_CHANGES["django"] = PACKAGE_BREAKING_CHANGES["Django"]
# Pre-combined keys for `related_modules` values that list two packages.
PACKAGE_BREAKING_CHANGES["numpy, pandas"] = (
    PACKAGE_BREAKING_CHANGES["numpy"] + "\n" + PACKAGE_BREAKING_CHANGES["pandas"]
)
PACKAGE_BREAKING_CHANGES["Flask-SQLAlchemy, marshmallow"] = (
    PACKAGE_BREAKING_CHANGES["Flask-SQLAlchemy"] + "\n" + PACKAGE_BREAKING_CHANGES["marshmallow"]
)
# ---------------------------------------------------------------------------
# Main API
# ---------------------------------------------------------------------------
def _parse_minor_version(version: str, default: int) -> int:
    """Return the minor component of a ``major.minor[...]`` version string.

    Only the digits directly after the first dot are read, so suffixed
    versions such as "3.12rc1" or "3.13-dev" parse instead of raising.
    *default* is returned when no such digits exist (e.g. "3", "3.", "").
    """
    match = re.match(r"[^.]*\.\s*(\d+)", str(version))
    return int(match.group(1)) if match else default


def get_python_breaking_changes(old_version: str, new_version: str) -> str:
    """Get Python breaking changes between two versions.

    Collects the PYTHON_BREAKING_CHANGES entries for every 3.x release after
    ``old_version`` up to and including ``new_version``. Only the minor
    component is compared; the major version is assumed to be 3.

    Args:
        old_version: e.g., "3.6" or "3.6.4". Falls back to minor 6 when no
            minor component can be parsed.
        new_version: e.g., "3.12" or "3.12.11". Falls back to minor 12 when no
            minor component can be parsed.

    Returns:
        String with all breaking changes between the versions, one
        "Python 3.x:" section per release, or a fixed "No known ..." sentence
        when nothing is recorded for the range.
    """
    old_minor = _parse_minor_version(old_version, default=6)
    new_minor = _parse_minor_version(new_version, default=12)

    sections = []
    # Half-open on the left: changes introduced *by* the old release are
    # already in effect for the code being migrated.
    for minor in range(old_minor + 1, new_minor + 1):
        key = f"3.{minor}"
        notes = PYTHON_BREAKING_CHANGES.get(key)
        if notes:  # skip releases that are absent or recorded as empty
            sections.append(f"Python {key}:\n{notes}")

    if not sections:
        return "No known Python breaking changes for this version range."
    return "\n".join(sections)
def _normalize_package_name(name: str) -> str:
    """Normalize a package name for lookup: lowercase, with runs of ``-``, ``_``
    and ``.`` collapsed to a single ``-`` (the PEP 503 name normalization)."""
    return re.sub(r"[-_.]+", "-", name.strip()).lower()


def get_package_breaking_changes(related_modules: str) -> str:
    """Get package-specific breaking changes.

    The whole string is first looked up verbatim in PACKAGE_BREAKING_CHANGES
    (this covers single names and the pre-combined keys). Otherwise it is split
    on commas and each name is looked up on its own, ignoring case and
    ``-``/``_``/``.`` differences; repeated names contribute their notes once.

    Args:
        related_modules: comma-separated module names, e.g., "numpy" or "numpy, pandas"

    Returns:
        String with breaking changes for the specified packages, or a
        "No pre-built migration guide ..." sentence when none of the names has
        a non-empty entry.
    """
    no_guide = f"No pre-built migration guide for '{related_modules}'. Check the package changelog."
    if not related_modules or not related_modules.strip():
        return no_guide

    # Try exact match first
    if related_modules in PACKAGE_BREAKING_CHANGES:
        return PACKAGE_BREAKING_CHANGES[related_modules]

    # Normalized index over the non-empty entries. On a collision the entry
    # defined first wins, so the result does not depend on later aliases.
    by_normalized = {}
    for name, notes in PACKAGE_BREAKING_CHANGES.items():
        if notes:
            by_normalized.setdefault(_normalize_package_name(name), notes)

    # Try individual modules
    changes = []
    seen = set()
    for raw_name in related_modules.split(","):
        key = _normalize_package_name(raw_name)
        if not key or key in seen:
            continue  # blank segment ("numpy,,pandas") or a repeated package
        seen.add(key)
        notes = by_normalized.get(key)
        if notes:
            changes.append(notes)

    if not changes:
        return no_guide
    return "\n".join(changes)
def get_migration_context(
    old_python: str,
    new_python: str,
    related_modules: str,
    dependency_versions: str = "",
) -> str:
    """Build a complete migration context string for a task.

    This is injected into the system prompt to give the model
    specific knowledge about what changed.

    Args:
        old_python: Python version the code currently runs on, e.g. "3.6".
        new_python: Python version being migrated to, e.g. "3.12".
        related_modules: Comma-separated package names, e.g. "numpy, pandas".
        dependency_versions: Pinned requirements text. Accepted so callers can
            pass it, but not consulted when building the context.

    Returns:
        Formatted string with all relevant breaking changes: a Python section
        and/or a package section separated by a blank line, or a fixed
        fallback sentence when neither lookup produced any notes.
    """
    parts = []

    # Python version changes. The helper answers with a fixed sentence when it
    # has nothing; match that sentence by prefix so that real notes which merely
    # contain similar words are never discarded.
    py_changes = get_python_breaking_changes(old_python, new_python)
    if py_changes and not py_changes.startswith("No known Python breaking changes"):
        parts.append(f"=== PYTHON {old_python} → {new_python} BREAKING CHANGES ===\n{py_changes}")

    # Package changes (same placeholder convention as above).
    pkg_changes = get_package_breaking_changes(related_modules)
    if pkg_changes and not pkg_changes.startswith("No pre-built migration guide"):
        parts.append(f"=== PACKAGE BREAKING CHANGES ({related_modules}) ===\n{pkg_changes}")

    if not parts:
        return "No specific migration docs available. Use general debugging."
    return "\n\n".join(parts)
# ---------------------------------------------------------------------------
# CLI — pre-fetch and show context for all tasks in the dataset
# ---------------------------------------------------------------------------
def main():
    """Show migration context for all tasks in train + eval.

    For each of ``data/train.jsonl`` and ``data/eval.jsonl`` (resolved next to
    this module) that exists, prints a banner followed by one line per task:
    its 1-based index, repo name, whether the context contains a Python and/or
    a package section, and the context length in characters.
    """
    # Imported lazily so the lookup functions above stay usable without the
    # dataset loader.
    # NOTE(review): resolving `code_migration` requires the repo root on
    # sys.path — confirm how this script is meant to be launched.
    from code_migration.dataset_loader import DatasetLoader

    data_dir = Path(__file__).parent / "data"
    banner = "=" * 60
    for split in ("train.jsonl", "eval.jsonl"):
        path = data_dir / split
        if not path.exists():
            continue  # a missing split is skipped silently

        print(f"\n{banner}")
        print(f" {split}")
        print(banner)

        loader = DatasetLoader(str(path))
        # NOTE(review): reads the loader's private `_tasks` attribute; switch
        # to a public accessor if DatasetLoader offers one.
        for index, task in enumerate(loader._tasks, start=1):
            ctx = get_migration_context(
                old_python=task.reproduction_target_version,
                new_python=task.migration_target_version,
                related_modules=task.related_modules,
                dependency_versions=task.dependency_versions,
            )
            # The section headers emitted by get_migration_context() are the
            # only upper-case "PYTHON"/"PACKAGE" markers it adds itself.
            py_mark = "✓" if "PYTHON" in ctx else "✗"
            pkg_mark = "✓" if "PACKAGE" in ctx else "✗"
            print(
                f" {index:2d}. {task.repo_name:40s} "
                f"py={py_mark} pkg={pkg_mark} "
                f"({len(ctx)} chars)"
            )


if __name__ == "__main__":
    main()