| | from __future__ import division, print_function, unicode_literals |
| |
|
| | import json |
| | import os |
| | import pprint |
| | from subprocess import CalledProcessError, check_output, PIPE, Popen, STDOUT |
| | try: |
| | from urllib.parse import urlsplit, urlunsplit |
| | from urllib.request import Request, urlopen |
| | from urllib.error import HTTPError |
| | except ImportError: |
| | from urllib2 import Request, urlopen |
| | from urllib2 import HTTPError |
| | from urlparse import urlsplit, urlunsplit |
| |
|
| | from .utils import force_link, ignore_missing_file, in_dir, TempDir, TempFile |
| |
|
| |
|
| | MEDIA_TYPE = 'application/vnd.git-lfs+json' |
| | POST_HEADERS = {'Accept': MEDIA_TYPE, 'Content-Type': MEDIA_TYPE} |
| |
|
| |
|
def git_show(git_repo, p):
    """Return the raw bytes of file *p* as committed at HEAD in *git_repo*."""
    cmd = ['git', 'show', 'HEAD:{}'.format(p)]
    with in_dir(git_repo):
        return check_output(cmd)
| |
|
| |
|
def get_cache_dir(git_dir, oid):
    """Return the local LFS object-cache directory for *oid* under *git_dir*.

    Objects are sharded by the first two byte-pairs of the oid, mirroring
    git-lfs's own ``lfs/objects/<aa>/<bb>`` layout.
    """
    return '/'.join([git_dir, 'lfs', 'objects', oid[:2], oid[2:4]])
| |
|
| |
|
def get_lfs_endpoint_url(git_repo, checkout_dir):
    """Discover the LFS endpoint URL and an authorization header for it.

    Prefers an explicit ``lfs.url`` from the checkout's .lfsconfig; falls
    back to deriving the endpoint from ``remote.origin.url``.  Returns a
    ``(url, auth_header)`` tuple where *auth_header* is a dict suitable for
    merging into HTTP request headers.
    """
    try:
        # .lfsconfig lives in the worktree, so read it from checkout_dir.
        with in_dir(checkout_dir):
            url = check_output(
                'git config -f .lfsconfig --get lfs.url'.split()
            ).strip().decode('utf8')
    except CalledProcessError:
        # No .lfsconfig (or no lfs.url in it): derive from the origin remote.
        with in_dir(git_repo):
            url = check_output(
                'git config --get remote.origin.url'.split()
            ).strip().decode('utf8')
    if url.endswith('/'):
        url = url[:-1]
    # Normalize to the conventional "<repo>.git/info/lfs" endpoint form.
    if not url.endswith('/info/lfs'):
        url += '/info/lfs' if url.endswith('.git') else '.git/info/lfs'
    url_split = urlsplit(url)
    host, path = url_split.hostname, url_split.path
    if url_split.scheme != 'https':
        if not url_split.scheme:
            # Scheme-less means scp-like ssh syntax ("git@host:path"); the
            # whole thing parses as a path, so split host and path manually.
            host, path = url_split.path.split('@', 1)[1].split(':', 1)
        # The LFS batch API is spoken over https regardless of the remote's
        # original scheme.
        url = urlunsplit(('https', host, path, '', ''))
    del url_split

    # git-lfs-authenticate wants the bare repo path, not the LFS endpoint.
    if path.endswith('/info/lfs'):
        path = path[:-len('/info/lfs')]
    auth_header = get_lfs_api_token(host, path)
    return url, auth_header
| |
|
| |
|
def get_lfs_api_token(host, path):
    """Return an authorization header dict for the LFS API on *host*.

    Runs ``git-lfs-authenticate <path> download`` over ssh and extracts the
    ``header`` member of its JSON response; returns an empty dict when the
    command produces no output.  See the server-discovery description at:
    https://github.com/git-lfs/git-lfs/blob/master/docs/api/server-discovery.md
    """
    cmd = 'ssh git@{} git-lfs-authenticate {} download'.format(host, path)
    raw = check_output(cmd.split()).strip().decode('utf8')
    if not raw:
        return {}
    return json.loads(raw)['header']
| |
|
| |
|
def find_lfs_files(checkout_dir):
    """Yield the repo-relative paths (as str) of the files managed by Git LFS.

    Pipes ``git ls-files -z`` into ``git check-attr`` and yields every path
    whose ``diff`` or ``filter`` attribute is ``lfs``, deduplicated.
    """
    with in_dir(checkout_dir):
        repo_files = Popen('git ls-files -z'.split(), stdout=PIPE)
        repo_files_attrs = check_output(
            'git check-attr --cached --stdin -z diff filter'.split(),
            stdin=repo_files.stdout
        )
        # Close our copy of the pipe and reap the child so it doesn't linger
        # as a zombie (the original leaked both).
        repo_files.stdout.close()
        repo_files.wait()
    # Records come back NUL-separated when git honors -z on output; older
    # git versions fall back to newline-separated "<path>: <attr>: <value>".
    sep = b'\0' if b'\0' in repo_files_attrs else b'\n'
    fields = iter(repo_files_attrs.strip(sep).split(sep))
    seen = set()
    while True:
        try:
            if sep == b'\0':
                path, _attr, value = next(fields), next(fields), next(fields)
            else:
                # Bug fix: the separator must be bytes — rsplit(': ', 2) on a
                # bytes record raises TypeError on Python 3.
                path, _attr, value = next(fields).rsplit(b': ', 2)
        except StopIteration:
            break
        if value != b'lfs':
            continue
        if path in seen:
            continue
        seen.add(path)
        yield path.decode('ascii')
| |
|
| |
|
def read_lfs_metadata(checkout_dir):
    """Yield ``(path, oid, size)`` tuples for all files managed by Git LFS.

    Reads each LFS pointer file from HEAD and parses its "key value" lines;
    the first line must be the LFS pointer-spec version marker.
    """
    for path in find_lfs_files(checkout_dir):
        pointer = git_show(checkout_dir, path).decode('utf8').strip()
        lines = pointer.split('\n')
        assert lines[0] == 'version https://git-lfs.github.com/spec/v1', lines
        fields = dict(line.split(' ', 1) for line in lines[1:])
        oid = fields['oid']
        if oid.startswith('sha256:'):
            oid = oid[len('sha256:'):]
        yield (path, oid, int(fields['size']))
| |
|
| |
|
def fetch_urls(lfs_url, lfs_auth_info, oid_list):
    """Fetch download descriptors for *oid_list* from the LFS batch endpoint.

    POSTs a ``download`` batch request and returns the server's ``objects``
    list.  If the server answers 413 (payload too large), the request is
    split in half and retried recursively.

    Raises HTTPError for any non-413 failure, or for a 413 on a single
    object (which cannot be split further).
    """
    objects = []
    data = json.dumps({'operation': 'download', 'objects': oid_list})
    headers = dict(POST_HEADERS)
    headers.update(lfs_auth_info)
    req = Request(lfs_url + '/objects/batch', data.encode('ascii'), headers)

    try:
        resp = json.loads(urlopen(req).read().decode('ascii'))
        assert 'objects' in resp, resp
        objects.extend(resp['objects'])
    except HTTPError as err:
        if err.code != 413:
            raise
        # Bug fix: a one-element list would split into [] and itself and
        # recurse forever; a single oid that is still "too large" is a hard
        # failure, so re-raise.
        if len(oid_list) < 2:
            raise
        half = len(oid_list) // 2
        objects.extend(fetch_urls(lfs_url, lfs_auth_info, oid_list[:half]))
        objects.extend(fetch_urls(lfs_url, lfs_auth_info, oid_list[half:]))

    return objects
| |
|
| |
|
def fetch(git_repo, checkout_dir=None, verbose=0):
    """Download all the files managed by Git LFS into *checkout_dir*.

    *git_repo* may be a working copy or a bare repo; *checkout_dir* defaults
    to *git_repo* itself.  Files already present (matching size) or available
    in the local LFS cache are skipped/linked instead of re-downloaded.
    Exits with SystemExit(1) when asked to check out into a bare repo.
    """
    git_dir = git_repo+'/.git' if os.path.isdir(git_repo+'/.git') else git_repo
    checkout_dir = checkout_dir or git_repo
    if checkout_dir == git_dir:
        print('Can\'t checkout into a bare repo, please provide a valid '
              'checkout_dir')
        raise SystemExit(1)
    checkout_git_dir = checkout_dir+'/.git'
    if not os.path.isdir(checkout_git_dir):
        # No worktree yet: make a no-checkout shared clone alongside, adopt
        # its .git, then reset the index so file statuses are correct.
        with TempDir(dir=checkout_dir) as d:
            check_output(['git', 'clone', '-ns', git_repo, d], stderr=STDOUT)
            os.rename(d+'/.git', checkout_git_dir)
        with in_dir(checkout_dir):
            check_output(['git', 'reset', 'HEAD'])

    # Decide which objects actually need downloading.
    found = False
    oid_list, lfs_files = [], {}
    for path, oid, size in read_lfs_metadata(checkout_dir):
        found = True
        dst = checkout_dir+'/'+path

        # Skip files that are already fully present (size match only —
        # content is not re-hashed here).
        with ignore_missing_file():
            if os.stat(dst).st_size == size:
                if verbose > 1:
                    print('Skipping', path, '(already present)')
                continue

        # Reuse a cached copy from .git/lfs/objects if one exists.
        with ignore_missing_file():
            cached = get_cache_dir(git_dir, oid)+'/'+oid
            if os.stat(cached).st_size == size:
                force_link(cached, dst)
                if verbose > 0:
                    print('Linked', path, 'from the cache')
                continue

        oid_list.append(dict(oid=oid, size=size))
        lfs_files[(oid, size)] = path

    if not found:
        print('This repository does not seem to use LFS.')
        return

    if not oid_list:
        if verbose > 0:
            print('Nothing to fetch.')
        return

    # Only contact the server (including the ssh auth round-trip) when there
    # is something left to download.
    lfs_url, lfs_auth_info = get_lfs_endpoint_url(git_repo, checkout_dir)

    if verbose > 0:
        print('Fetching URLs from %s ...' % lfs_url)
    if verbose > 1:
        print('Authorization info for URL: %s' % lfs_auth_info)
        print('oid_list: %s' % pprint.pformat(oid_list))
    objects = fetch_urls(lfs_url, lfs_auth_info, oid_list)

    # Download each object to a temp file, move it into the cache, then link
    # it into the worktree — so a partial download never pollutes the cache.
    tmp_dir = git_dir+'/lfs/tmp'
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    for obj in objects:
        oid, size = (obj['oid'], obj['size'])
        path = lfs_files[(oid, size)]
        cache_dir = get_cache_dir(git_dir, oid)

        # Stream the object body in 10 KiB chunks into a temp file.
        with TempFile(dir=tmp_dir) as f:
            url = obj['actions']['download']['href']
            head = obj['actions']['download']['header']
            print('Downloading %s (%s bytes) from %s...' %
                  (path, size, url if verbose > 0 else url[:40]))
            h = urlopen(Request(url, headers=head))
            while True:
                buf = h.read(10240)
                if not buf:
                    break
                f.write(buf)

        # Move the completed download into the cache (rename is atomic on
        # the same filesystem).
        dst1 = cache_dir+'/'+oid
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        if verbose > 1:
            print('temp download file: ' + f.name)
            print('cache file name: ' + dst1)
        os.rename(f.name, dst1)

        # Finally link the cached object into the worktree.
        dst2 = checkout_dir+'/'+path
        force_link(dst1, dst2)
| |
|