Spaces:
Sleeping
Sleeping
| """ | |
| Migration Docs Fetcher | |
| ====================== | |
Provides pre-collected breaking-change documentation for Python version
migrations and package updates. The summaries are hardcoded in this module;
nothing is fetched from the network at call time.
| Usage: | |
| # As a module | |
| from code_migration.migration_docs import get_migration_context | |
| context = get_migration_context( | |
| old_python="3.6", | |
| new_python="3.12", | |
| related_modules="numpy", | |
| dependency_versions="numpy==2.3.2\npandas==2.3.1\n", | |
| ) | |
# As a script — show the migration context for every task in the dataset
| python code_migration/migration_docs.py | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import time | |
| from pathlib import Path | |
| from typing import Dict, List, Optional | |
| from urllib.request import urlopen, Request | |
| from urllib.error import URLError | |
# Path to the `data/migration_docs_cache` directory that sits next to this module.
# NOTE(review): nothing in the visible part of this file reads or writes it — confirm it is still needed.
CACHE_DIR = Path(__file__).parent.joinpath("data", "migration_docs_cache")
| # --------------------------------------------------------------------------- | |
# Python "What's New" — breaking changes per version
| # --------------------------------------------------------------------------- | |
| # Hardcoded summaries of the most impactful breaking changes per Python version. | |
| # These are extracted from docs.python.org/3.X/whatsnew/ pages. | |
| # Much faster and more reliable than fetching at runtime. | |
# Maps a Python "3.X" version to a bullet list (one "- ..." line per item,
# newline-terminated) of the changes introduced by that version. An empty
# string means "nothing to report" and is skipped by
# get_python_breaking_changes().
PYTHON_BREAKING_CHANGES: Dict[str, str] = {
    "3.7": "",  # baseline — no breaks from 3.6
    "3.8": (
        "- `collections.abc` aliases in `collections` deprecated (Callable, Mapping, etc.)\n"
        "- `platform.popen()` removed\n"
        "- `time.clock()` removed — use `time.perf_counter()`\n"
    ),
    "3.9": (
        "- `collections.abc` aliases in `collections` still deprecated\n"
        "- `typing.List`, `typing.Dict` etc. can be replaced with `list`, `dict`\n"
        "- `math.gcd()` now accepts multiple arguments\n"
    ),
    "3.10": (
        "- `collections.Callable` REMOVED — use `collections.abc.Callable`\n"
        "- `collections.Mapping` REMOVED — use `collections.abc.Mapping`\n"
        "- `collections.MutableMapping` REMOVED — use `collections.abc.MutableMapping`\n"
        "- `collections.Iterable` REMOVED — use `collections.abc.Iterable`\n"
        "- `collections.Iterator` REMOVED — use `collections.abc.Iterator`\n"
        "- `collections.Sequence` REMOVED — use `collections.abc.Sequence`\n"
        "- `collections.MutableSequence` REMOVED — use `collections.abc.MutableSequence`\n"
        "- `collections.Set` REMOVED — use `collections.abc.Set`\n"
        "- `collections.MutableSet` REMOVED — use `collections.abc.MutableSet`\n"
        "- `collections.ByteString` REMOVED — use `collections.abc.ByteString`\n"
        # NOTE(review): CPython's changelog appears to list the removal of
        # `typing.io` / `typing.re` under 3.13, not 3.10 — confirm.
        "- `typing.io` and `typing.re` removed\n"
        "- `loop` parameter removed from most `asyncio` functions\n"
    ),
    "3.11": (
        "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` deprecated\n"
        "- `locale.resetlocale()` deprecated\n"
        "- `configparser.SafeConfigParser` → `configparser.ConfigParser`\n"
    ),
    "3.12": (
        # NOTE(review): "What's New in Python 3.13" appears to list the removal
        # of these unittest helpers and of `locale.resetlocale()` (both
        # deprecated in 3.11) — confirm they belong under 3.12.
        "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` REMOVED\n"
        "- `configparser.SafeConfigParser` REMOVED — use `ConfigParser`\n"
        "- `distutils` REMOVED entirely — use `setuptools`\n"
        "- `imp` module REMOVED — use `importlib`\n"
        "- `pkgutil.ImpImporter` and `pkgutil.ImpLoader` REMOVED\n"
        "- `locale.resetlocale()` REMOVED\n"
        "- `asynchat`, `asyncore`, `smtpd` REMOVED\n"
        "- `xml.etree.ElementTree.Element.copy()` → use `copy.copy()`\n"
    ),
}
| # --------------------------------------------------------------------------- | |
| # Package-specific breaking changes | |
| # --------------------------------------------------------------------------- | |
# Maps a package name (as it appears in a task's `related_modules` field) to a
# short, newline-terminated guide of its breaking changes. Keys are
# case-sensitive here; combined keys and aliases are registered below the
# literal.
PACKAGE_BREAKING_CHANGES: Dict[str, str] = {
    "numpy": (
        "NumPy 2.0 breaking changes:\n"
        "- `np.math` REMOVED — use `math` (stdlib) directly. `import math; math.factorial(x)` etc.\n"
        "- `np.product` REMOVED — use `np.prod`\n"
        "- `np.cumproduct` REMOVED — use `np.cumprod`\n"
        "- `np.sometrue` REMOVED — use `np.any`\n"
        "- `np.alltrue` REMOVED — use `np.all`\n"
        "- `np.in1d` REMOVED — use `np.isin`\n"
        "- `np.row_stack` REMOVED — use `np.vstack`\n"
        "- `np.bool` REMOVED — use `bool`\n"
        "- `np.int` REMOVED — use `int`\n"
        "- `np.float` REMOVED — use `float`\n"
        "- `np.complex` REMOVED — use `complex`\n"
        "- `np.object` REMOVED — use `object`\n"
        "- `np.str` REMOVED — use `str`\n"
        "- `np.long` REMOVED — use `int`\n"
        "- `np.unicode` REMOVED — use `str`\n"
        "- `numpy.NaN` REMOVED — use `numpy.nan`\n"
        "- `numpy.Inf` REMOVED — use `numpy.inf`\n"
        "- `numpy.string_` → `numpy.bytes_`\n"
        "- `numpy.unicode_` → `numpy.str_`\n"
        "- `numpy.AxisError` moved to `numpy.exceptions.AxisError`\n"
        "- `np.typeDict` REMOVED — use `np.sctypeDict`\n"
        "- `np.matrix.itemset()` REMOVED — use direct indexing `M[i,j] = val`\n"
        "- `np.ndarray.itemset()` REMOVED — use direct indexing `arr[i] = val`\n"
    ),
    "pandas": (
        "Pandas 2.0+ breaking changes:\n"
        "- `DataFrame.append()` REMOVED — use `pd.concat([df, new_row])`\n"
        "- `Series.append()` REMOVED — use `pd.concat([s1, s2])`\n"
        "- `pd.read_csv(error_bad_lines=)` REMOVED — use `on_bad_lines='skip'`\n"
        "- `pd.read_csv(warn_bad_lines=)` REMOVED — use `on_bad_lines='warn'`\n"
        "- `DataFrame.swaplevel()` — axis parameter deprecated\n"
        "- `Index.is_monotonic` → use `Index.is_monotonic_increasing`\n"
        "- Default dtype changed from object to nullable types\n"
    ),
    "flask": (
        "Flask 2.0+ / 3.0+ breaking changes:\n"
        "- `flask.helpers.safe_join` REMOVED — use `werkzeug.utils.safe_join`\n"
        "- `flask.json.JSONEncoder` REMOVED — use standard json or custom\n"
        "- `@app.before_first_request` REMOVED\n"
        "- `flask.escape` REMOVED — use `markupsafe.escape`\n"
        "- `flask.Markup` REMOVED — use `markupsafe.Markup`\n"
    ),
    "Django": (
        "Django 4.0+ / 5.0+ breaking changes:\n"
        "- `django.conf.urls.url()` REMOVED — use `django.urls.re_path()`\n"
        "- `django.utils.translation.ugettext()` REMOVED — use `gettext()`\n"
        "- `django.utils.translation.ugettext_lazy()` REMOVED — use `gettext_lazy()`\n"
        "- `django.utils.translation.ungettext()` REMOVED — use `ngettext()`\n"
        "- `django.utils.translation.ungettext_lazy()` REMOVED — use `ngettext_lazy()`\n"
        "- `django.utils.encoding.force_text()` REMOVED — use `force_str()`\n"
        "- `django.utils.encoding.smart_text()` REMOVED — use `smart_str()`\n"
        "- `django.utils.http.is_safe_url()` REMOVED — use `url_has_allowed_host_and_scheme()`\n"
        "- `django.conf.urls.include()` no longer accepts `app_name` as string\n"
    ),
    "gensim": (
        "Gensim 4.0+ breaking changes:\n"
        "- `gensim.models.Word2Vec.most_similar()` → use `model.wv.most_similar()`\n"
        "- `gensim.models.KeyedVectors.load_word2vec_format()` still works\n"
        "- `gensim.corpora.Dictionary.doc2bow()` unchanged\n"
        "- `gensim.similarities.MatrixSimilarity` unchanged\n"
        "- `smart_open` dependency updated\n"
    ),
    "pydantic": (
        "Pydantic v2 breaking changes:\n"
        "- `BaseModel.dict()` → use `BaseModel.model_dump()`\n"
        "- `BaseModel.json()` → use `BaseModel.model_dump_json()`\n"
        "- `BaseModel.parse_obj()` → use `BaseModel.model_validate()`\n"
        "- `BaseModel.parse_raw()` → use `BaseModel.model_validate_json()`\n"
        "- `@validator` → use `@field_validator`\n"
        "- `@root_validator` → use `@model_validator`\n"
        "- `Field(regex=)` → use `Field(pattern=)`\n"
        "- `Config` class → use `model_config = ConfigDict(...)`\n"
    ),
    "PyYAML": (
        "PyYAML breaking changes:\n"
        "- `yaml.load(f)` without Loader is REMOVED — use `yaml.safe_load(f)`\n"
        "- `yaml.load(f, Loader=yaml.FullLoader)` is the explicit alternative\n"
    ),
    "pillow": (
        "Pillow (PIL) breaking changes:\n"
        "- `Image.ANTIALIAS` REMOVED — use `Image.LANCZOS`\n"
        "- `ImageDraw.textsize()` REMOVED — use `ImageDraw.textbbox()` or `textlength()`\n"
        "- `FreeTypeFont.getsize()` REMOVED — use `FreeTypeFont.getbbox()`\n"
    ),
    "PyJWT": (
        "PyJWT 2.0+ breaking changes:\n"
        "- `jwt.decode()` now returns dict directly (was bytes in 1.x)\n"
        "- `algorithms` parameter is now required in `jwt.decode()`\n"
        "- `jwt.decode(verify=False)` → use `options={'verify_signature': False}`\n"
    ),
    "marshmallow": (
        "Marshmallow 3.0+ breaking changes:\n"
        "- `Schema.dump()` returns data directly (not tuple)\n"
        "- `Schema.load()` returns data directly (not tuple)\n"
        "- `fields.Nested(many=True)` → use `fields.List(fields.Nested(...))`\n"
        "- `@post_load` decorated methods receive `**kwargs` differently\n"
    ),
    "Flask-SQLAlchemy": (
        "Flask-SQLAlchemy 3.0+ breaking changes:\n"
        "- `db.Model.query` still works but `db.session.execute(select(...))` preferred\n"
        "- `SQLALCHEMY_TRACK_MODIFICATIONS` default changed\n"
    ),
    "pyasn1": (
        "pyasn1 0.5+ / 0.6+ breaking changes:\n"
        "- `pyasn1.compat.octets` module REMOVED\n"
        "- `pyasn1.compat.octets.null` REMOVED — use `b''`\n"
        "- `pyasn1.compat.octets.str2octs(s)` REMOVED — use `s.encode()` or `b'...'`\n"
        "- `pyasn1.compat.octets.octs2str(b)` REMOVED — use `b.decode()`\n"
        "- `pyasn1.compat.octets.isOctetsType(x)` REMOVED — use `isinstance(x, bytes)`\n"
        "- `pyasn1.compat.integer` module REMOVED\n"
    ),
    "pyee": (
        "pyee 9.0+ / 12.0+ breaking changes:\n"
        "- `from pyee import ExecutorEventEmitter` REMOVED — use `from pyee.executor import ExecutorEventEmitter`\n"
        "- `from pyee import BaseEventEmitter` REMOVED — use `from pyee.base import EventEmitter`\n"
        "- `BaseEventEmitter` renamed to `EventEmitter`\n"
    ),
    "async-timeout": (
        "async-timeout 4.0+ breaking changes:\n"
        "- `async_timeout.timeout()` now returns async context manager\n"
        "- `with async_timeout.timeout(n):` → `async with asyncio.timeout(n):`\n"
        "- In Python 3.11+, use `asyncio.timeout()` from stdlib instead\n"
    ),
    "discord": (
        "discord.py 2.0 breaking changes:\n"
        "- Many methods now require `Intents`\n"
        "- `Client.logout()` REMOVED — use `Client.close()`\n"
        "- `on_ready` behavior changed\n"
        "- `commands.Bot` requires `intents` parameter\n"
    ),
    "tweepy": (
        "Tweepy 4.0+ breaking changes:\n"
        "- `API` class methods renamed\n"
        "- `StreamListener` REMOVED — subclass `Stream` directly\n"
        "- `Cursor` API changed\n"
    ),
    "scikit-learn": (
        "scikit-learn 1.0+ breaking changes:\n"
        "- `sklearn.utils._get_column_indices` REMOVED\n"
        "- `sklearn.utils._safe_indexing` moved to `sklearn.utils`\n"
        "- Many private APIs reorganized\n"
        "- `sklearn.metrics.plot_*` functions deprecated — use display objects\n"
    ),
    "ansible": (
        "Ansible breaking changes:\n"
        "- Module paths changed in ansible-core 2.10+\n"
        "- `AnsibleModule` import paths may differ\n"
    ),
}

# Aliases
PACKAGE_BREAKING_CHANGES["builtin"] = ""  # covered by Python version changes
# The lowercase spelling must carry the real guide: this text is returned to
# callers verbatim, so a "see Django entry above" pointer would be useless there.
PACKAGE_BREAKING_CHANGES["django"] = PACKAGE_BREAKING_CHANGES["Django"]
# Pre-joined guides for the multi-package `related_modules` values, so that
# the exact-match lookup in get_package_breaking_changes() hits directly.
PACKAGE_BREAKING_CHANGES["numpy, pandas"] = (
    PACKAGE_BREAKING_CHANGES["numpy"] + "\n" + PACKAGE_BREAKING_CHANGES["pandas"]
)
PACKAGE_BREAKING_CHANGES["Flask-SQLAlchemy, marshmallow"] = (
    PACKAGE_BREAKING_CHANGES["Flask-SQLAlchemy"] + "\n" + PACKAGE_BREAKING_CHANGES["marshmallow"]
)
| # --------------------------------------------------------------------------- | |
| # Main API | |
| # --------------------------------------------------------------------------- | |
def _minor_version(version: str, default: int) -> int:
    """Return the minor component of a version string, or *default* if none is found.

    Only the digits after the first "<digits>." pair are used, so "3.12.11"
    and pre-release strings such as "3.12rc1" both yield 12.
    """
    found = re.search(r"\d+\.\s*(\d+)", version)
    return int(found.group(1)) if found else default


def get_python_breaking_changes(old_version: str, new_version: str) -> str:
    """Get Python breaking changes between two versions.

    Collects the entries of PYTHON_BREAKING_CHANGES for every 3.X release
    after ``old_version`` up to and including ``new_version``. The major
    version is not inspected; every version is treated as 3.X.

    Args:
        old_version: e.g., "3.6" or "3.6.4". If no minor version can be
            parsed, 6 is assumed.
        new_version: e.g., "3.12" or "3.12.11". If no minor version can be
            parsed, 12 is assumed.

    Returns:
        One "Python 3.X:" section per release that has recorded changes,
        joined by newlines, or a fixed "No known ..." message when the range
        is empty or no release in it has recorded changes.
    """
    old_minor = _minor_version(old_version, default=6)
    new_minor = _minor_version(new_version, default=12)

    changes = []
    for minor in range(old_minor + 1, new_minor + 1):
        key = f"3.{minor}"
        notes = PYTHON_BREAKING_CHANGES.get(key)
        # Versions without an entry, or with an empty one, contribute nothing.
        if notes:
            changes.append(f"Python {key}:\n{notes}")

    if not changes:
        return "No known Python breaking changes for this version range."
    return "\n".join(changes)
def get_package_breaking_changes(related_modules: str) -> str:
    """Get package-specific breaking changes.

    The whole string is first looked up as-is in PACKAGE_BREAKING_CHANGES
    (this is how the pre-joined keys such as "numpy, pandas" are served).
    Otherwise it is split on commas and each name is looked up individually,
    first exactly and then ignoring case, so "Pillow" finds the "pillow"
    entry and "pyyaml" finds "PyYAML". A name that occurs more than once
    contributes its guide only once.

    Args:
        related_modules: comma-separated module names, e.g., "numpy" or "numpy, pandas"

    Returns:
        The matching guides joined by newlines, an empty string when the
        exact key maps to an empty guide (e.g. "builtin"), or a fixed
        "No pre-built migration guide ..." message when nothing matches.
    """
    # Try exact match first
    if related_modules in PACKAGE_BREAKING_CHANGES:
        return PACKAGE_BREAKING_CHANGES[related_modules]

    # Case-insensitive index. setdefault keeps the first registered key when
    # two keys differ only by case.
    folded_index = {}
    for key, text in PACKAGE_BREAKING_CHANGES.items():
        folded_index.setdefault(key.casefold(), text)

    # Try individual modules
    changes = []
    seen = set()
    for raw_name in related_modules.split(","):
        name = raw_name.strip()
        folded = name.casefold()
        if not name or folded in seen:
            continue
        seen.add(folded)
        # An exact hit wins; the folded index is only a fallback.
        text = PACKAGE_BREAKING_CHANGES.get(name) or folded_index.get(folded)
        if text:
            changes.append(text)

    if not changes:
        return f"No pre-built migration guide for '{related_modules}'. Check the package changelog."
    return "\n".join(changes)
def get_migration_context(
    old_python: str,
    new_python: str,
    related_modules: str,
    dependency_versions: str = "",
) -> str:
    """Build a complete migration context string for a task.

    This is injected into the system prompt to give the model
    specific knowledge about what changed.

    Args:
        old_python: Python version the code currently targets, e.g. "3.6".
        new_python: Python version the code is being migrated to, e.g. "3.12".
        related_modules: comma-separated package names, e.g. "numpy, pandas".
        dependency_versions: accepted for interface compatibility; it is not
            used when building the context.

    Returns:
        Formatted string with all relevant breaking changes: up to two
        "=== ... ===" sections (Python, then packages) separated by a blank
        line, or a fixed fallback message when neither section applies.
    """
    sections = []

    # Python version changes. The helper signals "nothing found" with a
    # message starting with "No known"; matching on the prefix (rather than
    # anywhere in the text) keeps a real change list from being dropped.
    py_changes = get_python_breaking_changes(old_python, new_python)
    if py_changes and not py_changes.startswith("No known"):
        sections.append(
            f"=== PYTHON {old_python} → {new_python} BREAKING CHANGES ===\n{py_changes}"
        )

    # Package changes. Same convention: the "not found" message starts with
    # "No pre-built"; an empty string (e.g. for "builtin") is skipped as well.
    pkg_changes = get_package_breaking_changes(related_modules)
    if pkg_changes and not pkg_changes.startswith("No pre-built"):
        sections.append(
            f"=== PACKAGE BREAKING CHANGES ({related_modules}) ===\n{pkg_changes}"
        )

    if not sections:
        return "No specific migration docs available. Use general debugging."
    return "\n\n".join(sections)
| # --------------------------------------------------------------------------- | |
# CLI — show the pre-built migration context for all tasks in the dataset
| # --------------------------------------------------------------------------- | |
def main():
    """Show migration context for all tasks in train + eval.

    For each of ``data/train.jsonl`` and ``data/eval.jsonl`` that exists next
    to this module, prints one line per task: its index, repository name,
    whether a Python section and a package section were produced, and the
    context length in characters.

    NOTE(review): the import below needs the ``code_migration`` package to be
    importable, which presumably means running this as
    ``python -m code_migration.migration_docs`` from the project root — confirm.
    """
    # Imported lazily so that importing this module does not require the loader.
    from code_migration.dataset_loader import DatasetLoader

    data_dir = Path(__file__).parent / "data"
    for split in ("train.jsonl", "eval.jsonl"):
        path = data_dir / split
        if not path.exists():
            continue

        print(f"\n{'='*60}")
        print(f" {split}")
        print(f"{'='*60}")

        loader = DatasetLoader(str(path))
        # NOTE(review): `_tasks` is a private attribute of DatasetLoader;
        # switch to a public accessor if the loader offers one.
        for index, task in enumerate(loader._tasks, start=1):
            ctx = get_migration_context(
                old_python=task.reproduction_target_version,
                new_python=task.migration_target_version,
                related_modules=task.related_modules,
                dependency_versions=task.dependency_versions,
            )
            # Look for the section headers emitted by get_migration_context
            # rather than a bare word that could appear inside the notes.
            has_py = "=== PYTHON " in ctx
            has_pkg = "=== PACKAGE BREAKING CHANGES" in ctx
            print(
                f" {index:2d}. {task.repo_name:40s} "
                f"py={'✓' if has_py else '✗'} pkg={'✓' if has_pkg else '✗'} "
                f"({len(ctx)} chars)"
            )


if __name__ == "__main__":
    main()