| |
| |
|
|
| import argparse |
| import logging |
| import os |
| import pathlib |
| import re |
| import sys |
| from multiprocessing.dummy import Pool |
| from typing import NamedTuple, Optional, Tuple |
|
|
| import requests |
| from mmengine.logging import MMLogger |
|
|
|
|
| def parse_args(): |
| parser = argparse.ArgumentParser( |
| description='Goes through all the inline-links ' |
| 'in markdown files and reports the breakages') |
| parser.add_argument( |
| '--num-threads', |
| type=int, |
| default=100, |
| help='Number of processes to confirm the link') |
| parser.add_argument('--https-proxy', type=str, help='https proxy') |
| parser.add_argument( |
| '--out', |
| type=str, |
| default='link_reports.txt', |
| help='output path of reports') |
| args = parser.parse_args() |
| return args |
|
|
|
|
| OK_STATUS_CODES = ( |
| 200, |
| 401, |
| 403, |
| 405, |
| |
| |
| 406, |
| ) |
|
|
|
|
| class MatchTuple(NamedTuple): |
| source: str |
| name: str |
| link: str |
|
|
|
|
| def check_link( |
| match_tuple: MatchTuple, |
| http_session: requests.Session, |
| logger: logging = None) -> Tuple[MatchTuple, bool, Optional[str]]: |
| reason: Optional[str] = None |
| if match_tuple.link.startswith('http'): |
| result_ok, reason = check_url(match_tuple, http_session) |
| else: |
| result_ok = check_path(match_tuple) |
| if logger is None: |
| print(f" {'✓' if result_ok else '✗'} {match_tuple.link}") |
| else: |
| logger.info(f" {'✓' if result_ok else '✗'} {match_tuple.link}") |
| return match_tuple, result_ok, reason |
|
|
|
|
| def check_url(match_tuple: MatchTuple, |
| http_session: requests.Session) -> Tuple[bool, str]: |
| """Check if a URL is reachable.""" |
| try: |
| result = http_session.head( |
| match_tuple.link, timeout=5, allow_redirects=True) |
| return ( |
| result.ok or result.status_code in OK_STATUS_CODES, |
| f'status code = {result.status_code}', |
| ) |
| except (requests.ConnectionError, requests.Timeout): |
| return False, 'connection error' |
|
|
|
|
| def check_path(match_tuple: MatchTuple) -> bool: |
| """Check if a file in this repository exists.""" |
| relative_path = match_tuple.link.split('#')[0] |
| full_path = os.path.join( |
| os.path.dirname(str(match_tuple.source)), relative_path) |
| return os.path.exists(full_path) |
|
|
|
|
| def main(): |
| args = parse_args() |
|
|
| |
| logger = MMLogger.get_instance(name='mmdet', log_file=args.out) |
|
|
| |
| if args.https_proxy: |
| os.environ['https_proxy'] = args.https_proxy |
|
|
| |
| http_session = requests.Session() |
| for resource_prefix in ('http://', 'https://'): |
| http_session.mount( |
| resource_prefix, |
| requests.adapters.HTTPAdapter( |
| max_retries=5, |
| pool_connections=20, |
| pool_maxsize=args.num_threads), |
| ) |
|
|
| logger.info('Finding all markdown files in the current directory...') |
|
|
| project_root = (pathlib.Path(__file__).parent / '..').resolve() |
| markdown_files = project_root.glob('**/*.md') |
|
|
| all_matches = set() |
| url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)') |
| for markdown_file in markdown_files: |
| with open(markdown_file) as handle: |
| for line in handle.readlines(): |
| matches = url_regex.findall(line) |
| for name, link in matches: |
| if 'localhost' not in link: |
| all_matches.add( |
| MatchTuple( |
| source=str(markdown_file), |
| name=name, |
| link=link)) |
|
|
| logger.info(f' {len(all_matches)} markdown files found') |
| logger.info('Checking to make sure we can retrieve each link...') |
|
|
| with Pool(processes=args.num_threads) as pool: |
| results = pool.starmap(check_link, [(match, http_session, logger) |
| for match in list(all_matches)]) |
|
|
| |
| unreachable_results = [(match_tuple, reason) |
| for match_tuple, success, reason in results |
| if not success] |
|
|
| if unreachable_results: |
| logger.info('================================================') |
| logger.info(f'Unreachable links ({len(unreachable_results)}):') |
| for match_tuple, reason in unreachable_results: |
| logger.info(' > Source: ' + match_tuple.source) |
| logger.info(' Name: ' + match_tuple.name) |
| logger.info(' Link: ' + match_tuple.link) |
| if reason is not None: |
| logger.info(' Reason: ' + reason) |
| sys.exit(1) |
| logger.info('No Unreachable link found.') |
|
|
|
|
| if __name__ == '__main__': |
| main() |
|
|