""" Migration Docs Fetcher ====================== Pre-collects breaking changes documentation for Python version migrations and package updates. Fetches from official docs and caches locally. Usage: # As a module from code_migration.migration_docs import get_migration_context context = get_migration_context( old_python="3.6", new_python="3.12", related_modules="numpy", dependency_versions="numpy==2.3.2\npandas==2.3.1\n", ) # As a script — pre-fetch all docs for the dataset python code_migration/migration_docs.py """ from __future__ import annotations import json import os import re import time from pathlib import Path from typing import Dict, List, Optional from urllib.request import urlopen, Request from urllib.error import URLError CACHE_DIR = Path(__file__).parent / "data" / "migration_docs_cache" # --------------------------------------------------------------------------- # Python "What's New" — breaking changes per version # --------------------------------------------------------------------------- # Hardcoded summaries of the most impactful breaking changes per Python version. # These are extracted from docs.python.org/3.X/whatsnew/ pages. # Much faster and more reliable than fetching at runtime. PYTHON_BREAKING_CHANGES: Dict[str, str] = { "3.7": "", # baseline — no breaks from 3.6 "3.8": ( "- `collections.abc` aliases in `collections` deprecated (Callable, Mapping, etc.)\n" "- `platform.popen()` removed\n" "- `time.clock()` removed → use `time.perf_counter()`\n" ), "3.9": ( "- `collections.abc` aliases in `collections` still deprecated\n" "- `typing.List`, `typing.Dict` etc. can be replaced with `list`, `dict`\n" "- `math.gcd()` now accepts multiple arguments\n" ), "3.10": ( "- `collections.Callable` REMOVED → use `collections.abc.Callable`\n" "- `collections.Mapping` REMOVED → use `collections.abc.Mapping`\n" "- `collections.MutableMapping` REMOVED → use `collections.abc.MutableMapping`\n" "- `collections.Iterable` REMOVED → use `collections.abc.Iterable`\n" "- `collections.Iterator` REMOVED → use `collections.abc.Iterator`\n" "- `collections.Sequence` REMOVED → use `collections.abc.Sequence`\n" "- `collections.MutableSequence` REMOVED → use `collections.abc.MutableSequence`\n" "- `collections.Set` REMOVED → use `collections.abc.Set`\n" "- `collections.MutableSet` REMOVED → use `collections.abc.MutableSet`\n" "- `collections.ByteString` REMOVED → use `collections.abc.ByteString`\n" "- `typing.io` and `typing.re` removed\n" "- `loop` parameter removed from most `asyncio` functions\n" ), "3.11": ( "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` deprecated\n" "- `locale.resetlocale()` deprecated\n" "- `configparser.SafeConfigParser` → `configparser.ConfigParser`\n" ), "3.12": ( "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` REMOVED\n" "- `configparser.SafeConfigParser` REMOVED → use `ConfigParser`\n" "- `distutils` REMOVED entirely → use `setuptools`\n" "- `imp` module REMOVED → use `importlib`\n" "- `pkgutil.ImpImporter` and `pkgutil.ImpLoader` REMOVED\n" "- `locale.resetlocale()` REMOVED\n" "- `asynchat`, `asyncore`, `smtpd` REMOVED\n" "- `xml.etree.ElementTree.Element.copy()` → use `copy.copy()`\n" ), } # --------------------------------------------------------------------------- # Package-specific breaking changes # --------------------------------------------------------------------------- PACKAGE_BREAKING_CHANGES: Dict[str, str] = { "numpy": ( "NumPy 2.0 breaking changes:\n" "- `np.math` REMOVED → use `math` (stdlib) directly. `import math; math.factorial(x)` etc.\n" "- `np.product` REMOVED → use `np.prod`\n" "- `np.cumproduct` REMOVED → use `np.cumprod`\n" "- `np.sometrue` REMOVED → use `np.any`\n" "- `np.alltrue` REMOVED → use `np.all`\n" "- `np.in1d` REMOVED → use `np.isin`\n" "- `np.row_stack` REMOVED → use `np.vstack`\n" "- `np.bool` REMOVED → use `bool`\n" "- `np.int` REMOVED → use `int`\n" "- `np.float` REMOVED → use `float`\n" "- `np.complex` REMOVED → use `complex`\n" "- `np.object` REMOVED → use `object`\n" "- `np.str` REMOVED → use `str`\n" "- `np.long` REMOVED → use `int`\n" "- `np.unicode` REMOVED → use `str`\n" "- `numpy.NaN` REMOVED → use `numpy.nan`\n" "- `numpy.Inf` REMOVED → use `numpy.inf`\n" "- `numpy.string_` → `numpy.bytes_`\n" "- `numpy.unicode_` → `numpy.str_`\n" "- `numpy.AxisError` moved to `numpy.exceptions.AxisError`\n" "- `np.typeDict` REMOVED → use `np.sctypeDict`\n" "- `np.matrix.itemset()` REMOVED → use direct indexing `M[i,j] = val`\n" "- `np.ndarray.itemset()` REMOVED → use direct indexing `arr[i] = val`\n" ), "pandas": ( "Pandas 2.0+ breaking changes:\n" "- `DataFrame.append()` REMOVED → use `pd.concat([df, new_row])`\n" "- `Series.append()` REMOVED → use `pd.concat([s1, s2])`\n" "- `pd.read_csv(error_bad_lines=)` REMOVED → use `on_bad_lines='skip'`\n" "- `pd.read_csv(warn_bad_lines=)` REMOVED → use `on_bad_lines='warn'`\n" "- `DataFrame.swaplevel()` → axis parameter deprecated\n" "- `Index.is_monotonic` → use `Index.is_monotonic_increasing`\n" "- Default dtype changed from object to nullable types\n" ), "flask": ( "Flask 2.0+ / 3.0+ breaking changes:\n" "- `flask.helpers.safe_join` REMOVED → use `werkzeug.utils.safe_join`\n" "- `flask.json.JSONEncoder` REMOVED → use standard json or custom\n" "- `@app.before_first_request` REMOVED\n" "- `flask.escape` REMOVED → use `markupsafe.escape`\n" "- `flask.Markup` REMOVED → use `markupsafe.Markup`\n" ), "Django": ( "Django 4.0+ / 5.0+ breaking changes:\n" "- `django.conf.urls.url()` REMOVED → use `django.urls.re_path()`\n" "- `django.utils.translation.ugettext()` REMOVED → use `gettext()`\n" "- `django.utils.translation.ugettext_lazy()` REMOVED → use `gettext_lazy()`\n" "- `django.utils.translation.ungettext()` REMOVED → use `ngettext()`\n" "- `django.utils.translation.ungettext_lazy()` REMOVED → use `ngettext_lazy()`\n" "- `django.utils.encoding.force_text()` REMOVED → use `force_str()`\n" "- `django.utils.encoding.smart_text()` REMOVED → use `smart_str()`\n" "- `django.utils.http.is_safe_url()` REMOVED → use `url_has_allowed_host_and_scheme()`\n" "- `django.conf.urls.include()` no longer accepts `app_name` as string\n" ), "django": ( # lowercase alias "Same as Django — see Django entry above.\n" ), "gensim": ( "Gensim 4.0+ breaking changes:\n" "- `gensim.models.Word2Vec.most_similar()` → use `model.wv.most_similar()`\n" "- `gensim.models.KeyedVectors.load_word2vec_format()` still works\n" "- `gensim.corpora.Dictionary.doc2bow()` unchanged\n" "- `gensim.similarities.MatrixSimilarity` unchanged\n" "- `smart_open` dependency updated\n" ), "pydantic": ( "Pydantic v2 breaking changes:\n" "- `BaseModel.dict()` → use `BaseModel.model_dump()`\n" "- `BaseModel.json()` → use `BaseModel.model_dump_json()`\n" "- `BaseModel.parse_obj()` → use `BaseModel.model_validate()`\n" "- `BaseModel.parse_raw()` → use `BaseModel.model_validate_json()`\n" "- `@validator` → use `@field_validator`\n" "- `@root_validator` → use `@model_validator`\n" "- `Field(regex=)` → use `Field(pattern=)`\n" "- `Config` class → use `model_config = ConfigDict(...)`\n" ), "PyYAML": ( "PyYAML breaking changes:\n" "- `yaml.load(f)` without Loader is REMOVED → use `yaml.safe_load(f)`\n" "- `yaml.load(f, Loader=yaml.FullLoader)` is the explicit alternative\n" ), "pillow": ( "Pillow (PIL) breaking changes:\n" "- `Image.ANTIALIAS` REMOVED → use `Image.LANCZOS`\n" "- `ImageDraw.textsize()` REMOVED → use `ImageDraw.textbbox()` or `textlength()`\n" "- `FreeTypeFont.getsize()` REMOVED → use `FreeTypeFont.getbbox()`\n" ), "PyJWT": ( "PyJWT 2.0+ breaking changes:\n" "- `jwt.decode()` now returns dict directly (was bytes in 1.x)\n" "- `algorithms` parameter is now required in `jwt.decode()`\n" "- `jwt.decode(verify=False)` → use `options={'verify_signature': False}`\n" ), "marshmallow": ( "Marshmallow 3.0+ breaking changes:\n" "- `Schema.dump()` returns data directly (not tuple)\n" "- `Schema.load()` returns data directly (not tuple)\n" "- `fields.Nested(many=True)` → use `fields.List(fields.Nested(...))`\n" "- `@post_load` decorated methods receive `**kwargs` differently\n" ), "Flask-SQLAlchemy": ( "Flask-SQLAlchemy 3.0+ breaking changes:\n" "- `db.Model.query` still works but `db.session.execute(select(...))` preferred\n" "- `SQLALCHEMY_TRACK_MODIFICATIONS` default changed\n" ), "pyasn1": ( "pyasn1 0.5+ / 0.6+ breaking changes:\n" "- `pyasn1.compat.octets` module REMOVED\n" "- `pyasn1.compat.octets.null` REMOVED → use `b''`\n" "- `pyasn1.compat.octets.str2octs(s)` REMOVED → use `s.encode()` or `b'...'`\n" "- `pyasn1.compat.octets.octs2str(b)` REMOVED → use `b.decode()`\n" "- `pyasn1.compat.octets.isOctetsType(x)` REMOVED → use `isinstance(x, bytes)`\n" "- `pyasn1.compat.integer` module REMOVED\n" ), "pyee": ( "pyee 9.0+ / 12.0+ breaking changes:\n" "- `from pyee import ExecutorEventEmitter` REMOVED → use `from pyee.executor import ExecutorEventEmitter`\n" "- `from pyee import BaseEventEmitter` REMOVED → use `from pyee.base import EventEmitter`\n" "- `BaseEventEmitter` renamed to `EventEmitter`\n" ), "async-timeout": ( "async-timeout 4.0+ breaking changes:\n" "- `async_timeout.timeout()` now returns async context manager\n" "- `with async_timeout.timeout(n):` → `async with asyncio.timeout(n):`\n" "- In Python 3.11+, use `asyncio.timeout()` from stdlib instead\n" ), "discord": ( "discord.py 2.0 breaking changes:\n" "- Many methods now require `Intents`\n" "- `Client.logout()` REMOVED → use `Client.close()`\n" "- `on_ready` behavior changed\n" "- `commands.Bot` requires `intents` parameter\n" ), "tweepy": ( "Tweepy 4.0+ breaking changes:\n" "- `API` class methods renamed\n" "- `StreamListener` REMOVED → subclass `Stream` directly\n" "- `Cursor` API changed\n" ), "scikit-learn": ( "scikit-learn 1.0+ breaking changes:\n" "- `sklearn.utils._get_column_indices` REMOVED\n" "- `sklearn.utils._safe_indexing` moved to `sklearn.utils`\n" "- Many private APIs reorganized\n" "- `sklearn.metrics.plot_*` functions deprecated → use display objects\n" ), "ansible": ( "Ansible breaking changes:\n" "- Module paths changed in ansible-core 2.10+\n" "- `AnsibleModule` import paths may differ\n" ), } # Aliases PACKAGE_BREAKING_CHANGES["builtin"] = "" # covered by Python version changes PACKAGE_BREAKING_CHANGES["numpy, pandas"] = ( PACKAGE_BREAKING_CHANGES["numpy"] + "\n" + PACKAGE_BREAKING_CHANGES["pandas"] ) PACKAGE_BREAKING_CHANGES["Flask-SQLAlchemy, marshmallow"] = ( PACKAGE_BREAKING_CHANGES["Flask-SQLAlchemy"] + "\n" + PACKAGE_BREAKING_CHANGES["marshmallow"] ) # --------------------------------------------------------------------------- # Main API # --------------------------------------------------------------------------- def get_python_breaking_changes(old_version: str, new_version: str) -> str: """Get Python breaking changes between two versions. Args: old_version: e.g., "3.6" or "3.6.4" new_version: e.g., "3.12" or "3.12.11" Returns: String with all breaking changes between the versions. """ old_minor = int(old_version.split(".")[1]) if "." in old_version else 6 new_minor = int(new_version.split(".")[1]) if "." in new_version else 12 changes = [] for minor in range(old_minor + 1, new_minor + 1): key = f"3.{minor}" if key in PYTHON_BREAKING_CHANGES and PYTHON_BREAKING_CHANGES[key]: changes.append(f"Python {key}:\n{PYTHON_BREAKING_CHANGES[key]}") if not changes: return "No known Python breaking changes for this version range." return "\n".join(changes) def get_package_breaking_changes(related_modules: str) -> str: """Get package-specific breaking changes. Args: related_modules: comma-separated module names, e.g., "numpy" or "numpy, pandas" Returns: String with breaking changes for the specified packages. """ # Try exact match first if related_modules in PACKAGE_BREAKING_CHANGES: return PACKAGE_BREAKING_CHANGES[related_modules] # Try individual modules modules = [m.strip() for m in related_modules.split(",")] changes = [] for mod in modules: if mod in PACKAGE_BREAKING_CHANGES and PACKAGE_BREAKING_CHANGES[mod]: changes.append(PACKAGE_BREAKING_CHANGES[mod]) if not changes: return f"No pre-built migration guide for '{related_modules}'. Check the package changelog." return "\n".join(changes) def get_migration_context( old_python: str, new_python: str, related_modules: str, dependency_versions: str = "", ) -> str: """Build a complete migration context string for a task. This is injected into the system prompt to give the model specific knowledge about what changed. Returns: Formatted string with all relevant breaking changes. """ parts = [] # Python version changes py_changes = get_python_breaking_changes(old_python, new_python) if py_changes and "No known" not in py_changes: parts.append(f"=== PYTHON {old_python} → {new_python} BREAKING CHANGES ===\n{py_changes}") # Package changes pkg_changes = get_package_breaking_changes(related_modules) if pkg_changes and "No pre-built" not in pkg_changes: parts.append(f"=== PACKAGE BREAKING CHANGES ({related_modules}) ===\n{pkg_changes}") if not parts: return "No specific migration docs available. Use general debugging." return "\n\n".join(parts) # --------------------------------------------------------------------------- # CLI — pre-fetch and show context for all tasks in the dataset # --------------------------------------------------------------------------- def main(): """Show migration context for all tasks in train + eval.""" from code_migration.dataset_loader import DatasetLoader data_dir = Path(__file__).parent / "data" for split in ["train.jsonl", "eval.jsonl"]: path = data_dir / split if not path.exists(): continue print(f"\n{'='*60}") print(f" {split}") print(f"{'='*60}") loader = DatasetLoader(str(path)) for i, task in enumerate(loader._tasks): ctx = get_migration_context( old_python=task.reproduction_target_version, new_python=task.migration_target_version, related_modules=task.related_modules, dependency_versions=task.dependency_versions, ) has_py = "PYTHON" in ctx has_pkg = "PACKAGE" in ctx ctx_len = len(ctx) print(f" {i+1:2d}. {task.repo_name:40s} " f"py={'✓' if has_py else '✗'} pkg={'✓' if has_pkg else '✗'} " f"({ctx_len} chars)") if __name__ == "__main__": main()