|
|
""" |
|
|
GitHub repository analysis utilities |
|
|
""" |
|
|
|
|
|
import os |
|
|
import re |
|
|
import httpx |
|
|
from typing import Optional |
|
|
from datetime import datetime, timedelta |
|
|
|
|
|
|
|
|
def get_since_iso(time_range: str) -> str:
    """Return the ISO 8601 UTC timestamp marking the start of a look-back window.

    Args:
        time_range: One of ``"1week"``, ``"1month"``, ``"3months"``,
            ``"6months"`` or ``"1year"``. Any other value falls back to
            90 days.

    Returns:
        A timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``, i.e. "now minus
        the window" expressed in UTC.
    """
    # Local import keeps this block self-contained; the module-level import
    # only brings in ``datetime`` and ``timedelta``.
    from datetime import timezone

    delta_map = {
        "1week": timedelta(days=7),
        "1month": timedelta(days=30),
        "3months": timedelta(days=90),
        "6months": timedelta(days=180),
        "1year": timedelta(days=365),
    }
    delta = delta_map.get(time_range, timedelta(days=90))

    # Use an aware UTC clock: a naive local ``datetime.now().isoformat()``
    # carries no offset, so the window would be shifted by the host's UTC
    # offset when a consumer interprets the value as UTC.
    since = datetime.now(timezone.utc) - delta
    return since.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
|
|
async def fetch_github_repo(repo_url: str, github_token: Optional[str] = None) -> dict:
    """Fetch repository metadata, languages, README and file structure from GitHub.

    Token resolution: the ``GITHUB_TOKEN`` environment variable (e.g. a
    Hugging Face Spaces secret) is used when set; otherwise the
    ``github_token`` argument is used. Requests are unauthenticated when
    neither is available.

    Args:
        repo_url: ``http(s)://github.com/<owner>/<repo>`` URL; a trailing
            ``.git`` suffix, extra path segments, query string or fragment
            are ignored.
        github_token: Fallback token used when ``GITHUB_TOKEN`` is not set.

    Returns:
        A dict with keys ``full_name``, ``owner``, ``name``, ``description``,
        ``languages``, ``primaryLanguage``, ``stars``, ``forks``, ``readme``,
        ``structure`` and ``url``.

    Raises:
        ValueError: If ``repo_url`` is not a GitHub repository URL, or the
            repository endpoint does not answer with HTTP 200.
    """
    # Stop the repo segment at "/", "?" or "#" so URLs copied from the
    # browser (e.g. ".../repo?tab=readme") still resolve to "repo".
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")

    owner, repo = match.groups()
    # removesuffix, not rstrip: rstrip(".git") strips any trailing run of the
    # characters ".", "g", "i", "t" and would turn "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    async with httpx.AsyncClient() as client:
        # Core repository metadata; everything else depends on it.
        repo_response = await client.get(
            f"https://api.github.com/repos/{owner}/{repo_name}",
            headers=headers,
            timeout=30,
        )
        if repo_response.status_code != 200:
            raise ValueError(f"GitHub API error: {repo_response.text}")

        repo_data = repo_response.json()

        # Language breakdown; a failed request degrades to an empty mapping.
        languages_response = await client.get(
            repo_data["languages_url"], headers=headers, timeout=30
        )
        languages = (
            languages_response.json() if languages_response.status_code == 200 else {}
        )

        # README is best-effort: any failure leaves it as an empty string.
        readme = ""
        try:
            readme_response = await client.get(
                f"https://api.github.com/repos/{owner}/{repo_name}/readme",
                headers={**headers, "Accept": "application/vnd.github.raw"},
                timeout=30,
            )
            if readme_response.status_code == 200:
                readme = readme_response.text
        except Exception as e:
            print(f"README fetch failed: {e}")

        # Recursive tree of the default branch, used for structure analysis.
        tree_response = await client.get(
            f"https://api.github.com/repos/{owner}/{repo_name}/git/trees/{repo_data['default_branch']}?recursive=1",
            headers=headers,
            timeout=30,
        )
        tree_data = (
            tree_response.json() if tree_response.status_code == 200 else {"tree": []}
        )
        files = tree_data.get("tree", [])

        structure = analyze_repo_structure(files)

        return {
            "full_name": repo_data["full_name"],
            "owner": owner,
            "name": repo_name,
            # ``or`` guards against explicit nulls in the API payload.
            "description": repo_data.get("description", "") or "",
            "languages": languages,
            "primaryLanguage": repo_data.get("language", "Unknown") or "Unknown",
            "stars": repo_data.get("stargazers_count", 0),
            "forks": repo_data.get("forks_count", 0),
            "readme": readme,
            "structure": structure,
            "url": repo_url,
        }
|
|
|
|
|
|
|
|
async def fetch_recent_commits(
    repo_url: str,
    limit: int = 10,
    since_iso: Optional[str] = None,
    until_iso: Optional[str] = None,
    github_token: Optional[str] = None,
) -> list[dict]:
    """Fetch recent commits from a GitHub repository.

    Token resolution: the ``GITHUB_TOKEN`` environment variable (e.g. a
    Hugging Face Spaces secret) is used when set; otherwise the
    ``github_token`` argument is used.

    Args:
        repo_url: ``http(s)://github.com/<owner>/<repo>`` URL.
        limit: Sent as the ``per_page`` query parameter.
        since_iso: Optional lower bound, sent as ``since`` when truthy.
        until_iso: Optional upper bound, sent as ``until`` when truthy.
        github_token: Fallback token used when ``GITHUB_TOKEN`` is not set.

    Returns:
        A list of dicts with keys ``sha``, ``message`` (first line, at most
        100 characters), ``author``, ``date`` and ``url``. An empty list is
        returned on a non-200 response or on any exception during the fetch.

    Raises:
        ValueError: If ``repo_url`` is not a GitHub repository URL.
    """
    # Stop the repo segment at "/", "?" or "#" so query strings and
    # fragments never leak into the repository name.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")

    owner, repo = match.groups()
    # removesuffix, not rstrip: rstrip(".git") strips any trailing run of the
    # characters ".", "g", "i", "t" and would turn "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        params = {"per_page": str(limit)}
        if since_iso:
            params["since"] = since_iso
        if until_iso:
            params["until"] = until_iso

        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"https://api.github.com/repos/{owner}/{repo_name}/commits",
                headers=headers,
                params=params,
                timeout=30,
            )
            if response.status_code != 200:
                return []

            commits_data = response.json()
            return [
                {
                    "sha": c["sha"],
                    # Subject line only, capped at 100 characters.
                    "message": c.get("commit", {})
                    .get("message", "")
                    .split("\n")[0][:100],
                    "author": c.get("commit", {})
                    .get("author", {})
                    .get("name", "unknown"),
                    "date": c.get("commit", {}).get("author", {}).get("date", ""),
                    "url": c.get("html_url", ""),
                }
                for c in commits_data
            ]
    except Exception as e:
        # Deliberate best-effort: callers get an empty list instead of a crash.
        print(f"Recent commits unavailable: {e}")
        return []
|
|
|
|
|
|
|
|
def analyze_repo_structure(files: list) -> dict:
    """Summarise a repository file listing into a few structural flags.

    Args:
        files: Entries shaped like GitHub git-tree items, i.e. dicts that may
            carry a ``path`` and a ``type`` (``"tree"`` for directories).

    Returns:
        A dict with ``hasReadme``, ``hasDocs``, ``hasExamples`` (booleans,
        matched case-insensitively) and ``keyDirectories`` (sorted names of
        the top-level directories, original casing preserved).
    """
    readme_names = ("readme.md", "readme.txt")
    doc_prefixes = ("docs/", "documentation/")
    example_prefixes = ("examples/", "example/")

    lowered = [entry.get("path", "").lower() for entry in files]

    # A top-level directory is a "tree" entry whose path has no separator.
    top_level_dirs = {
        entry.get("path", "")
        for entry in files
        if entry.get("type") == "tree" and "/" not in entry.get("path", "")
    }

    return {
        "hasReadme": any(name in readme_names for name in lowered),
        "hasDocs": any(name.startswith(doc_prefixes) for name in lowered),
        "hasExamples": any(name.startswith(example_prefixes) for name in lowered),
        "keyDirectories": sorted(top_level_dirs),
    }
|
|
|
|
|
|
|
|
async def fetch_commit_diff(
    repo_url: str,
    commit_sha: str,
    github_token: Optional[str] = None,
) -> dict:
    """Fetch the diff for a specific commit.

    Token resolution: the ``GITHUB_TOKEN`` environment variable (e.g. a
    Hugging Face Spaces secret) is used when set; otherwise the
    ``github_token`` argument is used.

    Args:
        repo_url: ``http(s)://github.com/<owner>/<repo>`` URL.
        commit_sha: Commit identifier placed in the request path.
        github_token: Fallback token used when ``GITHUB_TOKEN`` is not set.

    Returns:
        ``{"sha": ..., "diff": ...}`` with the diff truncated to 10000
        characters. On a non-200 response or an exception the diff is empty
        and an ``error`` key describes the failure.

    Raises:
        ValueError: If ``repo_url`` is not a GitHub repository URL.
    """
    # Stop the repo segment at "/", "?" or "#" so query strings and
    # fragments never leak into the repository name.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")

    owner, repo = match.groups()
    # removesuffix, not rstrip: rstrip(".git") strips any trailing run of the
    # characters ".", "g", "i", "t" and would turn "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    token = os.environ.get("GITHUB_TOKEN") or github_token
    # The ".diff" media type asks for the commit as diff text instead of JSON.
    headers = {"Accept": "application/vnd.github.v3.diff"}
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"https://api.github.com/repos/{owner}/{repo_name}/commits/{commit_sha}",
                headers=headers,
                timeout=30,
            )
            if response.status_code != 200:
                return {"sha": commit_sha, "diff": "", "error": "Failed to fetch diff"}

            return {
                "sha": commit_sha,
                # Cap the payload so very large commits stay manageable.
                "diff": response.text[:10000],
            }
    except Exception as e:
        # Deliberate best-effort: report the failure in-band.
        print(f"Commit diff fetch failed: {e}")
        return {"sha": commit_sha, "diff": "", "error": str(e)}
|
|
|
|
|
|
|
|
async def fetch_commits_with_diffs(
    repo_url: str,
    commits: list[dict],
    max_commits: int = 5,
    github_token: Optional[str] = None,
) -> list[dict]:
    """Attach a ``diff`` entry to each of the first ``max_commits`` commits.

    Diffs are fetched one after another via ``fetch_commit_diff``; each
    returned dict is a copy of the input commit with its ``diff`` key set to
    the fetched text (empty string when the fetch produced none).
    """
    enriched: list[dict] = []
    for entry in commits[:max_commits]:
        fetched = await fetch_commit_diff(repo_url, entry["sha"], github_token)
        # Copy so the caller's commit dicts are left untouched.
        merged = dict(entry)
        merged["diff"] = fetched.get("diff", "")
        enriched.append(merged)
    return enriched
|
|
|
|
|
|
|
|
async def fetch_readme_at_commit(
    repo_url: str,
    commit_sha: str,
    github_token: Optional[str] = None,
) -> str:
    """Fetch README content as it existed at a specific commit.

    Token resolution: the ``GITHUB_TOKEN`` environment variable (e.g. a
    Hugging Face Spaces secret) is used when set; otherwise the
    ``github_token`` argument is used.

    Args:
        repo_url: ``http(s)://github.com/<owner>/<repo>`` URL.
        commit_sha: Ref passed as the ``ref`` query parameter.
        github_token: Fallback token used when ``GITHUB_TOKEN`` is not set.

    Returns:
        The body of the first candidate file (``README.md``, ``readme.md``,
        ``README.txt``, ``README``) that answers with HTTP 200, or an empty
        string when none does.

    Raises:
        ValueError: If ``repo_url`` is not a GitHub repository URL.
    """
    # Stop the repo segment at "/", "?" or "#" so query strings and
    # fragments never leak into the repository name.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")

    owner, repo = match.groups()
    # removesuffix, not rstrip: rstrip(".git") strips any trailing run of the
    # characters ".", "g", "i", "t" and would turn "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.raw"}
    if token:
        headers["Authorization"] = f"token {token}"

    # Candidate file names, tried in order.
    readme_files = ["README.md", "readme.md", "README.txt", "README"]

    async with httpx.AsyncClient() as client:
        for readme_file in readme_files:
            try:
                response = await client.get(
                    f"https://api.github.com/repos/{owner}/{repo_name}/contents/{readme_file}?ref={commit_sha}",
                    headers=headers,
                    timeout=30,
                )
                if response.status_code == 200:
                    return response.text
            except Exception:
                # Best-effort lookup: a failure on one candidate should not
                # prevent trying the remaining names.
                continue

    return ""
|
|
|
|
|
|
|
|
async def fetch_first_commit_sha(
    repo_url: str,
    github_token: Optional[str] = None,
) -> Optional[str]:
    """Fetch the SHA of the first (oldest) commit on the default branch.

    Token resolution: the ``GITHUB_TOKEN`` environment variable (e.g. a
    Hugging Face Spaces secret) is used when set; otherwise the
    ``github_token`` argument is used.

    Strategy: request the commit list with one commit per page and follow the
    ``rel="last"`` entry of the ``Link`` response header; if no such link is
    available (or that page yields nothing), fall back to a single page of up
    to 100 commits and take its last element.

    Args:
        repo_url: ``http(s)://github.com/<owner>/<repo>`` URL.
        github_token: Fallback token used when ``GITHUB_TOKEN`` is not set.

    Returns:
        The commit SHA, or ``None`` on a non-200 response, an empty commit
        list, or any exception during the fetch.

    Raises:
        ValueError: If ``repo_url`` is not a GitHub repository URL.
    """
    # Stop the repo segment at "/", "?" or "#" so query strings and
    # fragments never leak into the repository name.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")

    owner, repo = match.groups()
    # removesuffix, not rstrip: rstrip(".git") strips any trailing run of the
    # characters ".", "g", "i", "t" and would turn "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        async with httpx.AsyncClient() as client:
            # Resolve the default branch first.
            repo_response = await client.get(
                f"https://api.github.com/repos/{owner}/{repo_name}",
                headers=headers,
                timeout=30,
            )
            if repo_response.status_code != 200:
                return None

            default_branch = repo_response.json().get("default_branch", "main")

            # One commit per page, so the "last" page link points at the
            # final entry of the listing.
            response = await client.get(
                f"https://api.github.com/repos/{owner}/{repo_name}/commits",
                headers=headers,
                params={"sha": default_branch, "per_page": "1"},
                timeout=30,
            )
            if response.status_code != 200:
                return None

            # Link entries look like: <url>; rel="last"
            link_header = response.headers.get("Link", "")
            last_page_url = None
            for link in link_header.split(", "):
                if 'rel="last"' in link:
                    last_page_url = link.split(";")[0].strip().strip("<>")
                    break

            if last_page_url:
                last_response = await client.get(
                    last_page_url,
                    headers=headers,
                    timeout=30,
                )
                if last_response.status_code == 200:
                    commits = last_response.json()
                    if commits:
                        return commits[-1]["sha"]

            # Fallback: no pagination link (or the last page was unusable).
            # NOTE(review): this takes the last item of one 100-commit page,
            # which presumably is the oldest commit only for small
            # repositories - confirm against the API's ordering.
            response = await client.get(
                f"https://api.github.com/repos/{owner}/{repo_name}/commits",
                headers=headers,
                params={"sha": default_branch, "per_page": "100"},
                timeout=30,
            )
            if response.status_code == 200:
                commits = response.json()
                if commits:
                    return commits[-1]["sha"]

            return None
    except Exception as e:
        # Deliberate best-effort: callers treat None as "unknown".
        print(f"First commit fetch failed: {e}")
        return None
|
|
|
|
|
|
|
|
def _extract_readme_sections(diff_text: str) -> list[str]:
    """Return the per-file sections of a diff whose header line mentions a README."""
    sections: list[str] = []
    current: list[str] = []
    in_readme = False

    for line in diff_text.split("\n"):
        if line.startswith("diff --git"):
            # A new file header always closes the section in progress, even
            # when the next file is another README; otherwise back-to-back
            # README sections would overwrite each other.
            if in_readme and current:
                sections.append("\n".join(current))
            in_readme = "readme" in line.lower()
            current = [line] if in_readme else []
        elif in_readme:
            current.append(line)

    if in_readme and current:
        sections.append("\n".join(current))

    return sections


async def fetch_readme_changes(
    repo_url: str,
    since_iso: str,
    github_token: Optional[str] = None,
) -> dict:
    """Fetch README changes: the initial README and README diffs in a period.

    Returns both the original README (for project understanding) and any
    README changes during the analysis period (highest priority for DevRel).

    Token resolution: the ``GITHUB_TOKEN`` environment variable (e.g. a
    Hugging Face Spaces secret) is used when set; otherwise the
    ``github_token`` argument is used.

    Args:
        repo_url: ``http(s)://github.com/<owner>/<repo>`` URL.
        since_iso: Lower time bound sent as the ``since`` query parameter.
        github_token: Fallback token used when ``GITHUB_TOKEN`` is not set.

    Returns:
        A dict with keys ``initial_readme`` (README at the first commit, or
        ``""``), ``readme_diff`` (README-only diff sections of up to three
        commits, truncated to 8000 characters), ``has_readme_changes`` and
        ``readme_commits`` (dicts with ``sha``, ``message``, ``date``,
        ``file``). On an exception, whatever was collected so far is returned.

    Raises:
        ValueError: If ``repo_url`` is not a GitHub repository URL.
    """
    # Stop the repo segment at "/", "?" or "#" so query strings and
    # fragments never leak into the repository name.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")

    owner, repo = match.groups()
    # removesuffix, not rstrip: rstrip(".git") strips any trailing run of the
    # characters ".", "g", "i", "t" and would turn "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    result = {
        "initial_readme": "",
        "readme_diff": "",
        "has_readme_changes": False,
        "readme_commits": [],
    }

    try:
        async with httpx.AsyncClient() as client:
            # Commits touching any of the known README file names.
            readme_commits = []
            for readme_file in ["README.md", "readme.md", "README.txt"]:
                response = await client.get(
                    f"https://api.github.com/repos/{owner}/{repo_name}/commits",
                    headers=headers,
                    params={
                        "path": readme_file,
                        "since": since_iso,
                        "per_page": "10",
                    },
                    timeout=30,
                )
                if response.status_code == 200:
                    commits = response.json()
                    for c in commits:
                        readme_commits.append({
                            "sha": c["sha"],
                            "message": c.get("commit", {}).get("message", "").split("\n")[0][:100],
                            "date": c.get("commit", {}).get("author", {}).get("date", ""),
                            "file": readme_file,
                        })

            result["readme_commits"] = readme_commits
            result["has_readme_changes"] = len(readme_commits) > 0

            # README as of the repository's first commit.
            first_sha = await fetch_first_commit_sha(repo_url, github_token)
            if first_sha:
                result["initial_readme"] = await fetch_readme_at_commit(
                    repo_url, first_sha, github_token
                )

            if readme_commits:
                # Same endpoint as a commit lookup, but requesting diff text.
                diff_headers = {"Accept": "application/vnd.github.v3.diff"}
                if token:
                    diff_headers["Authorization"] = f"token {token}"

                # Only the first three collected commits are diffed.
                diffs = []
                for commit in readme_commits[:3]:
                    diff_response = await client.get(
                        f"https://api.github.com/repos/{owner}/{repo_name}/commits/{commit['sha']}",
                        headers=diff_headers,
                        timeout=30,
                    )
                    if diff_response.status_code == 200:
                        # Keep only the README portions of the commit diff.
                        readme_sections = _extract_readme_sections(diff_response.text)
                        if readme_sections:
                            diffs.append(f"# Commit: {commit['message']}\n" + "\n".join(readme_sections))

                result["readme_diff"] = "\n\n---\n\n".join(diffs)[:8000]

    except Exception as e:
        # Deliberate best-effort: return the partially filled result.
        print(f"README changes fetch failed: {e}")

    return result
|
|
|
|
|
|
|
|
# (compiled regex, label) pairs applied to every commit diff. Patterns that
# start with "^-" look at removed lines; all are matched per line
# (MULTILINE) and case-insensitively. Compiled once at import time.
_BREAKING_PATTERNS = tuple(
    (re.compile(pattern, re.MULTILINE | re.IGNORECASE), description)
    for pattern, description in (
        (r"^-\s*(def|function|func|public|private|protected)\s+\w+\s*\([^)]*\)", "Function signature removed/changed"),
        (r"^-\s*(class|interface|struct|type)\s+\w+", "Class/interface removed/changed"),
        (r"^-\s*(import|export|from|require)\s+", "Import/export removed/changed"),
        (r"^-\s*['\"]?[a-zA-Z_]+['\"]?\s*[:=]", "Configuration option removed/changed"),
        (r"^-\s*@?(get|post|put|delete|patch|route|api)\s*\(", "API endpoint removed/changed"),
        (r"deprecated|breaking|removed|obsolete", "Deprecation or breaking change mentioned"),
        (r"^-\s*['\"]?version['\"]?\s*[:=]\s*['\"]?\d+\.\d+", "Version number changed"),
    )
)


async def analyze_breaking_changes(
    commits_with_diffs: list[dict],
) -> list[dict]:
    """Scan commit diffs for textual hints of potentially breaking changes.

    This is a regex heuristic over the diff text, not a semantic analysis.

    Args:
        commits_with_diffs: Commit dicts that may carry ``diff``, ``sha``,
            ``message`` and ``date`` keys. Commits without a diff are skipped.

    Returns:
        One dict per commit with at least one pattern hit, holding ``sha``
        (first seven characters), ``message``, ``date`` and ``changes`` - a
        list of ``{"type": <label>, "count": <number of matches>}`` in
        pattern order.
    """
    breaking_changes = []

    for commit in commits_with_diffs:
        diff = commit.get("diff", "")
        if not diff:
            continue

        commit_breaking_changes = []
        for compiled, description in _BREAKING_PATTERNS:
            hit_count = len(compiled.findall(diff))
            if hit_count:
                commit_breaking_changes.append({
                    "type": description,
                    "count": hit_count,
                })

        if commit_breaking_changes:
            breaking_changes.append({
                # ``or ""`` tolerates an explicit None sha instead of raising.
                "sha": str(commit.get("sha") or "")[:7],
                "message": commit.get("message", ""),
                "date": commit.get("date", ""),
                "changes": commit_breaking_changes,
            })

    return breaking_changes
|
|
|