import json
import os
import re

import pandas as pd
import requests

# Language table loaded from disk. scanJson looks entries up by language name
# and reads each entry's "extensions" list.
with open("/content/languages.json", encoding="utf-8") as f:
    languages = json.load(f)

# SECURITY: a GitHub personal access token used to be hard-coded on this line.
# It has been removed from the source; revoke that token and supply a fresh one
# through the GITHUB_TOKEN environment variable instead.
token = os.environ.get("GITHUB_TOKEN", "")

# The header previously read f"token ghp_{token}", which prepended a second
# "ghp_" prefix to the token and therefore sent an invalid credential.
# With no token configured, requests are sent unauthenticated.
headers = {"Authorization": f"token {token}"} if token else {}

# Seconds to wait on any single HTTP request before giving up.
_REQUEST_TIMEOUT = 30


def _get_json(target_url):
    """Fetch *target_url* with the module's auth headers and return its JSON.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error payload is never mistaken for a listing.
    """
    response = requests.get(target_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


# Repository listing for the user; buildLis() reads it through ``res``.
# NOTE: only the first page of results is requested here.
res = requests.get(
    "https://api.github.com/users/Eben113/repos",
    headers=headers,
    timeout=_REQUEST_TIMEOUT,
)
res.raise_for_status()

# Exploratory values kept for compatibility: the first repository and its
# top-level contents response. They are None when the account has no repos.
_first_page = res.json()
js1 = _first_page[0] if _first_page else None
url = js1["url"] + "/contents" if js1 else None
res1 = (
    requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT) if url else None
)


def buildLis():
    """Return one summary dict per repository in the module-level ``res``.

    Returns:
        list of dicts with keys ``"name"``, ``"url"`` and ``"langURL"``, taken
        from each repository's ``name``, ``url`` and ``languages_url`` fields.
    """
    return [
        {
            "name": repo["name"],
            "url": repo["url"],
            "langURL": repo["languages_url"],
        }
        for repo in res.json()
    ]


def _walk(entries, prefix, extensions):
    """Recursively collect matching files from a contents listing.

    Args:
        entries: list of content entries (dicts with "name", "type", "url").
        prefix: path prefix prepended to every key in the result.
        extensions: set of accepted file suffixes, each including the dot.

    Returns:
        dict mapping ``prefix/<name>`` to the entry's API URL. An entry named
        README.md is stored under ``prefix/readme``.
    """
    found = {}
    for entry in entries:
        entry_name = entry["name"]
        entry_type = entry["type"]
        if entry_name == "README.md":
            found[prefix + "/" + "readme"] = entry["url"]
        elif entry_type == "file":
            # Suffix is everything after the last dot, with the dot restored.
            suffix = "." + entry_name.split(".")[-1]
            if suffix in extensions:
                found[prefix + "/" + entry_name] = entry["url"]
        elif entry_type == "dir":
            children = _get_json(entry["url"])
            found.update(_walk(children, prefix + "/" + entry_name, extensions))
    return found


def scanJson(name, url, langURL):
    """Map source-file paths of one repository to their contents-API URLs.

    Args:
        name: repository name, used as the root of every returned path.
        url: repository API URL; ``/contents`` is appended to list its root.
        langURL: API URL returning the repository's languages.

    Returns:
        dict of ``"<name>/<path>"`` -> API URL for README.md entries and for
        files whose suffix belongs to one of the repository's languages.
        Empty when the contents listing answers 404.

    Raises:
        requests.HTTPError: for any other failed request.
    """
    # A set gives O(1) suffix checks during the recursive walk.
    extensions = set()
    for language in _get_json(langURL):
        # Title-cased lookup first (the original behaviour), then the exact
        # reported name, because title() alters names with inner capitals.
        entry = languages.get(language.title()) or languages.get(language)
        known = entry.get("extensions") if entry else None
        if known:
            extensions.update(known)
        else:
            # Unknown language: fall back to the language name as the suffix.
            extensions.add("." + language)

    try:
        files = _get_json(url + "/contents")
    except requests.HTTPError as err:
        # A repository without a listable root (presumably an empty one;
        # confirm against the API docs) has no files to report.
        if err.response is not None and err.response.status_code == 404:
            return {}
        raise

    return _walk(files, name, extensions)


def buildDataset(repo_list):
    """Build a DataFrame of every matching file across *repo_list*.

    Args:
        repo_list: iterable of dicts shaped like the items from buildLis().

    Returns:
        pandas.DataFrame with columns ``repo``, ``directory`` and ``url``.
        The columns are present even when no file was found.
    """
    rows = []
    for repo in repo_list:
        records = scanJson(repo["name"], repo["url"], repo["langURL"])
        for path, file_url in records.items():
            rows.append({"repo": repo["name"], "directory": path, "url": file_url})
    return pd.DataFrame(rows, columns=["repo", "directory", "url"])