File size: 16,349 Bytes
1b35d41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
"""
Migration Docs Fetcher
======================

Pre-collects breaking changes documentation for Python version migrations
and package updates. The summaries are hardcoded in this module rather than
fetched at runtime.

Usage:
    # As a module
    from code_migration.migration_docs import get_migration_context
    context = get_migration_context(
        old_python="3.6",
        new_python="3.12",
        related_modules="numpy",
        dependency_versions="numpy==2.3.2\npandas==2.3.1\n",
    )

    # As a script — pre-fetch all docs for the dataset
    python code_migration/migration_docs.py
"""

from __future__ import annotations

import json
import os
import re
import time
from pathlib import Path
from typing import Dict, List, Optional
from urllib.request import urlopen, Request
from urllib.error import URLError

# Cache directory located under this module's "data" folder.
# NOTE(review): not referenced by any function visible in this module — confirm
# whether it is still needed.
CACHE_DIR = Path(__file__).parent / "data" / "migration_docs_cache"

# ---------------------------------------------------------------------------
# Python "What's New" — breaking changes per version
# ---------------------------------------------------------------------------
# Hardcoded summaries of the most impactful breaking changes per Python version.
# These are extracted from docs.python.org/3.X/whatsnew/ pages.
# Much faster and more reliable than fetching at runtime.

# Maps a "3.X" version string to a bullet list (one "- ..." line per change,
# each newline-terminated) of changes introduced by that version. An empty
# string means nothing is recorded for that version. The text is emitted
# verbatim into the migration context, so it must stay valid UTF-8 ("→").
PYTHON_BREAKING_CHANGES: Dict[str, str] = {
    "3.7": "",  # baseline — nothing recorded for 3.6 → 3.7
    "3.8": (
        "- `collections.abc` aliases in `collections` deprecated (Callable, Mapping, etc.)\n"
        "- `platform.popen()` removed\n"
        "- `time.clock()` removed → use `time.perf_counter()`\n"
    ),
    "3.9": (
        "- `collections.abc` aliases in `collections` still deprecated\n"
        "- `typing.List`, `typing.Dict` etc. can be replaced with `list`, `dict`\n"
        "- `math.gcd()` now accepts multiple arguments\n"
    ),
    "3.10": (
        "- `collections.Callable` REMOVED → use `collections.abc.Callable`\n"
        "- `collections.Mapping` REMOVED → use `collections.abc.Mapping`\n"
        "- `collections.MutableMapping` REMOVED → use `collections.abc.MutableMapping`\n"
        "- `collections.Iterable` REMOVED → use `collections.abc.Iterable`\n"
        "- `collections.Iterator` REMOVED → use `collections.abc.Iterator`\n"
        "- `collections.Sequence` REMOVED → use `collections.abc.Sequence`\n"
        "- `collections.MutableSequence` REMOVED → use `collections.abc.MutableSequence`\n"
        "- `collections.Set` REMOVED → use `collections.abc.Set`\n"
        "- `collections.MutableSet` REMOVED → use `collections.abc.MutableSet`\n"
        "- `collections.ByteString` REMOVED → use `collections.abc.ByteString`\n"
        "- `typing.io` and `typing.re` removed\n"
        "- `loop` parameter removed from most `asyncio` functions\n"
    ),
    "3.11": (
        "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` deprecated\n"
        "- `locale.resetlocale()` deprecated\n"
        "- `configparser.SafeConfigParser` → `configparser.ConfigParser`\n"
    ),
    "3.12": (
        "- `unittest.findTestCases()`, `makeSuite()`, `getTestCaseNames()` REMOVED\n"
        "- `configparser.SafeConfigParser` REMOVED → use `ConfigParser`\n"
        "- `distutils` REMOVED entirely → use `setuptools`\n"
        "- `imp` module REMOVED → use `importlib`\n"
        "- `pkgutil.ImpImporter` and `pkgutil.ImpLoader` REMOVED\n"
        "- `locale.resetlocale()` REMOVED\n"
        "- `asynchat`, `asyncore`, `smtpd` REMOVED\n"
        "- `xml.etree.ElementTree.Element.copy()` → use `copy.copy()`\n"
    ),
}

# ---------------------------------------------------------------------------
# Package-specific breaking changes
# ---------------------------------------------------------------------------
PACKAGE_BREAKING_CHANGES: Dict[str, str] = {
    "numpy": (
        "NumPy 2.0 breaking changes:\n"
        "- `np.math` REMOVED β†’ use `math` (stdlib) directly. `import math; math.factorial(x)` etc.\n"
        "- `np.product` REMOVED β†’ use `np.prod`\n"
        "- `np.cumproduct` REMOVED β†’ use `np.cumprod`\n"
        "- `np.sometrue` REMOVED β†’ use `np.any`\n"
        "- `np.alltrue` REMOVED β†’ use `np.all`\n"
        "- `np.in1d` REMOVED β†’ use `np.isin`\n"
        "- `np.row_stack` REMOVED β†’ use `np.vstack`\n"
        "- `np.bool` REMOVED β†’ use `bool`\n"
        "- `np.int` REMOVED β†’ use `int`\n"
        "- `np.float` REMOVED β†’ use `float`\n"
        "- `np.complex` REMOVED β†’ use `complex`\n"
        "- `np.object` REMOVED β†’ use `object`\n"
        "- `np.str` REMOVED β†’ use `str`\n"
        "- `np.long` REMOVED β†’ use `int`\n"
        "- `np.unicode` REMOVED β†’ use `str`\n"
        "- `numpy.NaN` REMOVED β†’ use `numpy.nan`\n"
        "- `numpy.Inf` REMOVED β†’ use `numpy.inf`\n"
        "- `numpy.string_` β†’ `numpy.bytes_`\n"
        "- `numpy.unicode_` β†’ `numpy.str_`\n"
        "- `numpy.AxisError` moved to `numpy.exceptions.AxisError`\n"
        "- `np.typeDict` REMOVED β†’ use `np.sctypeDict`\n"
        "- `np.matrix.itemset()` REMOVED β†’ use direct indexing `M[i,j] = val`\n"
        "- `np.ndarray.itemset()` REMOVED β†’ use direct indexing `arr[i] = val`\n"
    ),
    "pandas": (
        "Pandas 2.0+ breaking changes:\n"
        "- `DataFrame.append()` REMOVED β†’ use `pd.concat([df, new_row])`\n"
        "- `Series.append()` REMOVED β†’ use `pd.concat([s1, s2])`\n"
        "- `pd.read_csv(error_bad_lines=)` REMOVED β†’ use `on_bad_lines='skip'`\n"
        "- `pd.read_csv(warn_bad_lines=)` REMOVED β†’ use `on_bad_lines='warn'`\n"
        "- `DataFrame.swaplevel()` β†’ axis parameter deprecated\n"
        "- `Index.is_monotonic` β†’ use `Index.is_monotonic_increasing`\n"
        "- Default dtype changed from object to nullable types\n"
    ),
    "flask": (
        "Flask 2.0+ / 3.0+ breaking changes:\n"
        "- `flask.helpers.safe_join` REMOVED β†’ use `werkzeug.utils.safe_join`\n"
        "- `flask.json.JSONEncoder` REMOVED β†’ use standard json or custom\n"
        "- `@app.before_first_request` REMOVED\n"
        "- `flask.escape` REMOVED β†’ use `markupsafe.escape`\n"
        "- `flask.Markup` REMOVED β†’ use `markupsafe.Markup`\n"
    ),
    "Django": (
        "Django 4.0+ / 5.0+ breaking changes:\n"
        "- `django.conf.urls.url()` REMOVED β†’ use `django.urls.re_path()`\n"
        "- `django.utils.translation.ugettext()` REMOVED β†’ use `gettext()`\n"
        "- `django.utils.translation.ugettext_lazy()` REMOVED β†’ use `gettext_lazy()`\n"
        "- `django.utils.translation.ungettext()` REMOVED β†’ use `ngettext()`\n"
        "- `django.utils.translation.ungettext_lazy()` REMOVED β†’ use `ngettext_lazy()`\n"
        "- `django.utils.encoding.force_text()` REMOVED β†’ use `force_str()`\n"
        "- `django.utils.encoding.smart_text()` REMOVED β†’ use `smart_str()`\n"
        "- `django.utils.http.is_safe_url()` REMOVED β†’ use `url_has_allowed_host_and_scheme()`\n"
        "- `django.conf.urls.include()` no longer accepts `app_name` as string\n"
    ),
    "django": (  # lowercase alias
        "Same as Django β€” see Django entry above.\n"
    ),
    "gensim": (
        "Gensim 4.0+ breaking changes:\n"
        "- `gensim.models.Word2Vec.most_similar()` β†’ use `model.wv.most_similar()`\n"
        "- `gensim.models.KeyedVectors.load_word2vec_format()` still works\n"
        "- `gensim.corpora.Dictionary.doc2bow()` unchanged\n"
        "- `gensim.similarities.MatrixSimilarity` unchanged\n"
        "- `smart_open` dependency updated\n"
    ),
    "pydantic": (
        "Pydantic v2 breaking changes:\n"
        "- `BaseModel.dict()` β†’ use `BaseModel.model_dump()`\n"
        "- `BaseModel.json()` β†’ use `BaseModel.model_dump_json()`\n"
        "- `BaseModel.parse_obj()` β†’ use `BaseModel.model_validate()`\n"
        "- `BaseModel.parse_raw()` β†’ use `BaseModel.model_validate_json()`\n"
        "- `@validator` β†’ use `@field_validator`\n"
        "- `@root_validator` β†’ use `@model_validator`\n"
        "- `Field(regex=)` β†’ use `Field(pattern=)`\n"
        "- `Config` class β†’ use `model_config = ConfigDict(...)`\n"
    ),
    "PyYAML": (
        "PyYAML breaking changes:\n"
        "- `yaml.load(f)` without Loader is REMOVED β†’ use `yaml.safe_load(f)`\n"
        "- `yaml.load(f, Loader=yaml.FullLoader)` is the explicit alternative\n"
    ),
    "pillow": (
        "Pillow (PIL) breaking changes:\n"
        "- `Image.ANTIALIAS` REMOVED β†’ use `Image.LANCZOS`\n"
        "- `ImageDraw.textsize()` REMOVED β†’ use `ImageDraw.textbbox()` or `textlength()`\n"
        "- `FreeTypeFont.getsize()` REMOVED β†’ use `FreeTypeFont.getbbox()`\n"
    ),
    "PyJWT": (
        "PyJWT 2.0+ breaking changes:\n"
        "- `jwt.decode()` now returns dict directly (was bytes in 1.x)\n"
        "- `algorithms` parameter is now required in `jwt.decode()`\n"
        "- `jwt.decode(verify=False)` β†’ use `options={'verify_signature': False}`\n"
    ),
    "marshmallow": (
        "Marshmallow 3.0+ breaking changes:\n"
        "- `Schema.dump()` returns data directly (not tuple)\n"
        "- `Schema.load()` returns data directly (not tuple)\n"
        "- `fields.Nested(many=True)` β†’ use `fields.List(fields.Nested(...))`\n"
        "- `@post_load` decorated methods receive `**kwargs` differently\n"
    ),
    "Flask-SQLAlchemy": (
        "Flask-SQLAlchemy 3.0+ breaking changes:\n"
        "- `db.Model.query` still works but `db.session.execute(select(...))` preferred\n"
        "- `SQLALCHEMY_TRACK_MODIFICATIONS` default changed\n"
    ),
    "pyasn1": (
        "pyasn1 0.5+ / 0.6+ breaking changes:\n"
        "- `pyasn1.compat.octets` module REMOVED\n"
        "- `pyasn1.compat.octets.null` REMOVED β†’ use `b''`\n"
        "- `pyasn1.compat.octets.str2octs(s)` REMOVED β†’ use `s.encode()` or `b'...'`\n"
        "- `pyasn1.compat.octets.octs2str(b)` REMOVED β†’ use `b.decode()`\n"
        "- `pyasn1.compat.octets.isOctetsType(x)` REMOVED β†’ use `isinstance(x, bytes)`\n"
        "- `pyasn1.compat.integer` module REMOVED\n"
    ),
    "pyee": (
        "pyee 9.0+ / 12.0+ breaking changes:\n"
        "- `from pyee import ExecutorEventEmitter` REMOVED β†’ use `from pyee.executor import ExecutorEventEmitter`\n"
        "- `from pyee import BaseEventEmitter` REMOVED β†’ use `from pyee.base import EventEmitter`\n"
        "- `BaseEventEmitter` renamed to `EventEmitter`\n"
    ),
    "async-timeout": (
        "async-timeout 4.0+ breaking changes:\n"
        "- `async_timeout.timeout()` now returns async context manager\n"
        "- `with async_timeout.timeout(n):` β†’ `async with asyncio.timeout(n):`\n"
        "- In Python 3.11+, use `asyncio.timeout()` from stdlib instead\n"
    ),
    "discord": (
        "discord.py 2.0 breaking changes:\n"
        "- Many methods now require `Intents`\n"
        "- `Client.logout()` REMOVED β†’ use `Client.close()`\n"
        "- `on_ready` behavior changed\n"
        "- `commands.Bot` requires `intents` parameter\n"
    ),
    "tweepy": (
        "Tweepy 4.0+ breaking changes:\n"
        "- `API` class methods renamed\n"
        "- `StreamListener` REMOVED β†’ subclass `Stream` directly\n"
        "- `Cursor` API changed\n"
    ),
    "scikit-learn": (
        "scikit-learn 1.0+ breaking changes:\n"
        "- `sklearn.utils._get_column_indices` REMOVED\n"
        "- `sklearn.utils._safe_indexing` moved to `sklearn.utils`\n"
        "- Many private APIs reorganized\n"
        "- `sklearn.metrics.plot_*` functions deprecated β†’ use display objects\n"
    ),
    "ansible": (
        "Ansible breaking changes:\n"
        "- Module paths changed in ansible-core 2.10+\n"
        "- `AnsibleModule` import paths may differ\n"
    ),
}

# Aliases
PACKAGE_BREAKING_CHANGES["builtin"] = ""  # covered by Python version changes
PACKAGE_BREAKING_CHANGES["numpy, pandas"] = (
    PACKAGE_BREAKING_CHANGES["numpy"] + "\n" + PACKAGE_BREAKING_CHANGES["pandas"]
)
PACKAGE_BREAKING_CHANGES["Flask-SQLAlchemy, marshmallow"] = (
    PACKAGE_BREAKING_CHANGES["Flask-SQLAlchemy"] + "\n" + PACKAGE_BREAKING_CHANGES["marshmallow"]
)


# ---------------------------------------------------------------------------
# Main API
# ---------------------------------------------------------------------------
def _python_minor_version(version: str, default: int) -> int:
    """Return the minor number of a Python version string such as "3.12.11".

    The first ``major.minor`` pair found in *version* is used, so suffixes and
    prefixes ("3.12rc1", "3.8+", "python3.6") are tolerated. When no such pair
    exists (e.g. "3" or ""), *default* is returned. A major version below 3
    yields -1 so that it sorts before every 3.x release.
    """
    match = re.search(r"(\d+)\s*\.\s*(\d+)", version)
    if match is None:
        return default
    major, minor = int(match.group(1)), int(match.group(2))
    return minor if major >= 3 else -1


def get_python_breaking_changes(old_version: str, new_version: str) -> str:
    """Get Python breaking changes between two versions.

    Collects the entries of ``PYTHON_BREAKING_CHANGES`` for every 3.x minor
    release after *old_version* up to and including *new_version*. Versions
    with no entry, or with an empty entry, are skipped.

    Args:
        old_version: e.g., "3.6" or "3.6.4". Falls back to minor 6 when no
            ``major.minor`` pair can be parsed.
        new_version: e.g., "3.12" or "3.12.11". Falls back to minor 12 when
            no ``major.minor`` pair can be parsed.

    Returns:
        One "Python 3.X:" section per version, joined by newlines, or the
        sentence "No known Python breaking changes for this version range."
        when nothing applies.
    """
    old_minor = _python_minor_version(old_version, default=6)
    new_minor = _python_minor_version(new_version, default=12)

    changes = []
    for minor in range(old_minor + 1, new_minor + 1):
        key = f"3.{minor}"
        text = PYTHON_BREAKING_CHANGES.get(key)
        if text:  # skips both unknown versions and empty entries
            changes.append(f"Python {key}:\n{text}")

    if not changes:
        return "No known Python breaking changes for this version range."

    return "\n".join(changes)


def _normalize_package_name(name: str) -> str:
    """Return *name* lowercased with runs of ``-``, ``_`` and ``.`` collapsed to ``-``."""
    return re.sub(r"[-_.]+", "-", name.strip()).lower()


def get_package_breaking_changes(related_modules: str) -> str:
    """Get package-specific breaking changes.

    Lookup order for each name: the exact key in ``PACKAGE_BREAKING_CHANGES``,
    then a normalized match (case-insensitive, ``-``/``_``/``.`` treated
    alike) so that e.g. "Pillow", "pyyaml" or "flask_sqlalchemy" resolve to
    the "pillow", "PyYAML" and "Flask-SQLAlchemy" entries.

    Args:
        related_modules: comma-separated module names, e.g., "numpy" or "numpy, pandas"

    Returns:
        The matching guides joined by newlines (each guide at most once), or a
        "No pre-built migration guide for ..." sentence when nothing matches.
        An exact match on the whole argument is returned as stored, which may
        be an empty string (e.g. for "builtin").
    """
    # Try exact match first; this also serves the pre-joined multi-package keys.
    if related_modules in PACKAGE_BREAKING_CHANGES:
        return PACKAGE_BREAKING_CHANGES[related_modules]

    # Fallback index keyed by normalized name. setdefault keeps the entry that
    # was defined first when two keys normalize to the same name.
    by_normalized_name: Dict[str, str] = {}
    for name, text in PACKAGE_BREAKING_CHANGES.items():
        by_normalized_name.setdefault(_normalize_package_name(name), text)

    # Try individual modules
    changes: List[str] = []
    for raw_name in related_modules.split(","):
        mod = raw_name.strip()
        text = PACKAGE_BREAKING_CHANGES.get(mod) or by_normalized_name.get(
            _normalize_package_name(mod), ""
        )
        # Skip empty guides and avoid repeating a guide named twice.
        if text and text not in changes:
            changes.append(text)

    if not changes:
        return f"No pre-built migration guide for '{related_modules}'. Check the package changelog."

    return "\n".join(changes)


def get_migration_context(
    old_python: str,
    new_python: str,
    related_modules: str,
    dependency_versions: str = "",
) -> str:
    """Build a complete migration context string for a task.

    Combines the Python-version section from ``get_python_breaking_changes``
    with the package section from ``get_package_breaking_changes``. Intended
    to be injected into a model's system prompt to give it specific knowledge
    about what changed.

    Args:
        old_python: Python version the code currently targets, e.g. "3.6".
        new_python: Python version being migrated to, e.g. "3.12".
        related_modules: comma-separated package names, e.g. "numpy, pandas".
        dependency_versions: accepted for interface compatibility; it is not
            used when building the context.

    Returns:
        Formatted string with all relevant breaking changes, sections
        separated by a blank line, or a fallback sentence when neither
        helper has anything to report.
    """
    parts = []

    # Python version changes. The helper signals "nothing found" with a
    # sentence containing "No known", which must not be emitted as a section.
    py_changes = get_python_breaking_changes(old_python, new_python)
    if py_changes and "No known" not in py_changes:
        parts.append(f"=== PYTHON {old_python} → {new_python} BREAKING CHANGES ===\n{py_changes}")

    # Package changes. "No pre-built" marks the helper's not-found sentence;
    # an empty string (e.g. for "builtin") is skipped as well.
    pkg_changes = get_package_breaking_changes(related_modules)
    if pkg_changes and "No pre-built" not in pkg_changes:
        parts.append(f"=== PACKAGE BREAKING CHANGES ({related_modules}) ===\n{pkg_changes}")

    if not parts:
        return "No specific migration docs available. Use general debugging."

    return "\n\n".join(parts)


# ---------------------------------------------------------------------------
# CLI — pre-fetch and show context for all tasks in the dataset
# ---------------------------------------------------------------------------
def main() -> None:
    """Show migration context for all tasks in train + eval.

    For each of ``data/train.jsonl`` and ``data/eval.jsonl`` next to this
    module (missing files are skipped), prints a banner followed by one line
    per task: its index, repository name, whether the context contains a
    Python section and a package section, and the context length in
    characters.
    """
    # Imported inside the function, so importing this module does not
    # require code_migration.dataset_loader.
    from code_migration.dataset_loader import DatasetLoader

    data_dir = Path(__file__).parent / "data"

    for split in ["train.jsonl", "eval.jsonl"]:
        path = data_dir / split
        if not path.exists():
            continue

        print(f"\n{'='*60}")
        print(f"  {split}")
        print(f"{'='*60}")

        loader = DatasetLoader(str(path))
        # NOTE(review): reads the loader's private `_tasks` attribute — confirm
        # whether DatasetLoader exposes a public way to iterate tasks.
        for i, task in enumerate(loader._tasks):
            ctx = get_migration_context(
                old_python=task.reproduction_target_version,
                new_python=task.migration_target_version,
                related_modules=task.related_modules,
                dependency_versions=task.dependency_versions,
            )
            # The section headers built by get_migration_context contain
            # these upper-case words, so they double as presence markers.
            has_py = "PYTHON" in ctx
            has_pkg = "PACKAGE" in ctx
            ctx_len = len(ctx)
            print(f"  {i+1:2d}. {task.repo_name:40s} "
                  f"py={'✓' if has_py else '✗'} pkg={'✓' if has_pkg else '✗'} "
                  f"({ctx_len} chars)")


# Run the CLI report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()