devrel-agent-gradio / github_utils.py
chenglu's picture
Upload 4 files
7c4bfcc verified
"""
GitHub repository analysis utilities
"""
import os
import re
from datetime import datetime, timedelta, timezone
from typing import Optional

import httpx
def get_since_iso(time_range: str) -> str:
    """Return the ISO 8601 start timestamp (UTC) for a look-back window.

    Args:
        time_range: One of "1week", "1month", "3months", "6months" or
            "1year". Any other value falls back to a 90-day window.

    Returns:
        A timezone-aware timestamp with second precision, for example
        "2024-01-31T12:00:00+00:00".
    """
    windows = {
        "1week": timedelta(days=7),
        "1month": timedelta(days=30),
        "3months": timedelta(days=90),
        "6months": timedelta(days=180),
        "1year": timedelta(days=365),
    }
    window = windows.get(time_range, timedelta(days=90))
    # Carry an explicit UTC offset: a naive local timestamp has no zone
    # information, so a consumer reading it as UTC would shift the window by
    # the host's UTC offset.
    start = datetime.now(timezone.utc) - window
    return start.isoformat(timespec="seconds")
async def fetch_github_repo(repo_url: str, github_token: Optional[str] = None) -> dict:
    """Fetch repository metadata, languages, README and file layout from GitHub.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Args:
        repo_url: URL of the form ``https://github.com/<owner>/<repo>``. A
            trailing ``.git``, extra path segments, a query string or a
            fragment are ignored.
        github_token: Token used only when GITHUB_TOKEN is not set.

    Returns:
        Dict with the keys ``full_name``, ``owner``, ``name``,
        ``description``, ``languages``, ``primaryLanguage``, ``stars``,
        ``forks``, ``readme``, ``structure`` and ``url``.

    Raises:
        ValueError: If ``repo_url`` does not look like a GitHub repository
            URL, or the repository metadata request is not answered with 200.
    """
    # Stop the repo segment at '?' / '#' so URLs copied from a browser resolve.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")
    owner, repo = match.groups()
    # removesuffix drops only a literal ".git". rstrip(".git") strips any
    # trailing '.', 'g', 'i', 't' characters and turned "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    # Environment variable takes priority (for HF Spaces secrets)
    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    api_base = f"https://api.github.com/repos/{owner}/{repo_name}"

    async with httpx.AsyncClient() as client:
        # Repository metadata is mandatory; everything else degrades gracefully.
        repo_response = await client.get(api_base, headers=headers, timeout=30)
        if repo_response.status_code != 200:
            raise ValueError(f"GitHub API error: {repo_response.text}")
        repo_data = repo_response.json()

        # Language byte counts
        languages_response = await client.get(
            repo_data["languages_url"], headers=headers, timeout=30
        )
        languages = (
            languages_response.json() if languages_response.status_code == 200 else {}
        )

        # README as raw text; a failure here must not abort the whole fetch.
        readme = ""
        try:
            readme_response = await client.get(
                f"{api_base}/readme",
                headers={**headers, "Accept": "application/vnd.github.raw"},
                timeout=30,
            )
            if readme_response.status_code == 200:
                readme = readme_response.text
        except Exception as e:
            print(f"README fetch failed: {e}")

        # Recursive tree of the default branch, used for structure analysis.
        tree_response = await client.get(
            f"{api_base}/git/trees/{repo_data['default_branch']}?recursive=1",
            headers=headers,
            timeout=30,
        )
        tree_data = (
            tree_response.json() if tree_response.status_code == 200 else {"tree": []}
        )

    files = tree_data.get("tree", [])
    structure = analyze_repo_structure(files)

    return {
        "full_name": repo_data["full_name"],
        "owner": owner,
        "name": repo_name,
        "description": repo_data.get("description", "") or "",
        "languages": languages,
        "primaryLanguage": repo_data.get("language", "Unknown") or "Unknown",
        "stars": repo_data.get("stargazers_count", 0),
        "forks": repo_data.get("forks_count", 0),
        "readme": readme,
        "structure": structure,
        "url": repo_url,
    }
async def fetch_recent_commits(
    repo_url: str,
    limit: int = 10,
    since_iso: Optional[str] = None,
    until_iso: Optional[str] = None,
    github_token: Optional[str] = None,
) -> list[dict]:
    """Fetch recent commits from a GitHub repository.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Args:
        repo_url: URL of the form ``https://github.com/<owner>/<repo>``.
        limit: Value sent as the ``per_page`` query parameter.
        since_iso: Optional lower bound, sent as ``since`` when given.
        until_iso: Optional upper bound, sent as ``until`` when given.
        github_token: Token used only when GITHUB_TOKEN is not set.

    Returns:
        One dict per commit with ``sha``, ``message`` (first line, at most
        100 characters), ``author``, ``date`` and ``url``. An empty list is
        returned on a non-200 response or on any request/parsing error.

    Raises:
        ValueError: If ``repo_url`` does not look like a GitHub repository URL.
    """
    # Stop the repo segment at '?' / '#' so URLs copied from a browser resolve.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")
    owner, repo = match.groups()
    # removesuffix drops only a literal ".git". rstrip(".git") strips any
    # trailing '.', 'g', 'i', 't' characters and turned "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    # Environment variable takes priority (for HF Spaces secrets)
    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        params = {"per_page": str(limit)}
        if since_iso:
            params["since"] = since_iso
        if until_iso:
            params["until"] = until_iso

        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"https://api.github.com/repos/{owner}/{repo_name}/commits",
                headers=headers,
                params=params,
                timeout=30,
            )
        if response.status_code != 200:
            return []

        commits = []
        for item in response.json():
            details = item.get("commit", {})
            author = details.get("author", {})
            commits.append(
                {
                    "sha": item["sha"],
                    # Keep only the subject line, capped for display.
                    "message": details.get("message", "").split("\n")[0][:100],
                    "author": author.get("name", "unknown"),
                    "date": author.get("date", ""),
                    "url": item.get("html_url", ""),
                }
            )
        return commits
    except Exception as e:
        # Best effort: commit history is optional for the callers' analysis.
        print(f"Recent commits unavailable: {e}")
        return []
def analyze_repo_structure(files: list) -> dict:
    """Summarise a repository layout from a list of tree entries.

    Args:
        files: Entries shaped like ``{"path": ..., "type": ...}``; entries
            whose ``type`` is ``"tree"`` are treated as directories.

    Returns:
        Dict with ``hasReadme``, ``hasDocs``, ``hasExamples`` (booleans,
        matched case-insensitively) and ``keyDirectories`` (sorted names of
        the top-level directories, original casing).
    """
    lowered_paths = [entry.get("path", "").lower() for entry in files]

    # Any top-level README counts, whatever its extension (README,
    # README.md, README.rst, ...), not only the .md / .txt variants.
    has_readme = any(
        path == "readme" or path.startswith("readme.") for path in lowered_paths
    )
    has_docs = any(path.startswith(("docs/", "documentation/")) for path in lowered_paths)
    has_examples = any(path.startswith(("examples/", "example/")) for path in lowered_paths)

    # A directory is top-level when its path has no separator.
    top_level_dirs = {
        entry.get("path", "")
        for entry in files
        if entry.get("type") == "tree" and "/" not in entry.get("path", "")
    }

    return {
        "hasReadme": has_readme,
        "hasDocs": has_docs,
        "hasExamples": has_examples,
        "keyDirectories": sorted(top_level_dirs),
    }
async def fetch_commit_diff(
    repo_url: str,
    commit_sha: str,
    github_token: Optional[str] = None,
) -> dict:
    """Fetch the diff for a specific commit.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Args:
        repo_url: URL of the form ``https://github.com/<owner>/<repo>``.
        commit_sha: Commit to fetch.
        github_token: Token used only when GITHUB_TOKEN is not set.

    Returns:
        ``{"sha": ..., "diff": ...}`` with the diff truncated to 10000
        characters. On a non-200 response or a request error the diff is
        empty and an ``error`` key describes the failure.

    Raises:
        ValueError: If ``repo_url`` does not look like a GitHub repository URL.
    """
    # Stop the repo segment at '?' / '#' so URLs copied from a browser resolve.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")
    owner, repo = match.groups()
    # removesuffix drops only a literal ".git". rstrip(".git") strips any
    # trailing '.', 'g', 'i', 't' characters and turned "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    # Environment variable takes priority (for HF Spaces secrets)
    token = os.environ.get("GITHUB_TOKEN") or github_token
    # The ".diff" media type makes the commit endpoint answer with patch text.
    headers = {"Accept": "application/vnd.github.v3.diff"}
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"https://api.github.com/repos/{owner}/{repo_name}/commits/{commit_sha}",
                headers=headers,
                timeout=30,
            )
        if response.status_code != 200:
            return {"sha": commit_sha, "diff": "", "error": "Failed to fetch diff"}
        return {
            "sha": commit_sha,
            "diff": response.text[:10000],  # Limit diff size
        }
    except Exception as e:
        # Best effort: report the failure in-band instead of raising.
        print(f"Commit diff fetch failed: {e}")
        return {"sha": commit_sha, "diff": "", "error": str(e)}
async def fetch_commits_with_diffs(
    repo_url: str,
    commits: list[dict],
    max_commits: int = 5,
    github_token: Optional[str] = None,
) -> list[dict]:
    """Attach a diff to each of the first ``max_commits`` commits.

    Commits are processed one after another, in the order given. Each result
    is a copy of the input commit dict with a ``diff`` key added (an empty
    string when the fetched data carries no diff).
    """
    enriched = []
    selected = commits[:max_commits]
    for entry in selected:
        fetched = await fetch_commit_diff(repo_url, entry["sha"], github_token)
        patch_text = fetched.get("diff", "")
        enriched.append({**entry, "diff": patch_text})
    return enriched
async def fetch_readme_at_commit(
    repo_url: str,
    commit_sha: str,
    github_token: Optional[str] = None,
) -> str:
    """Fetch README content at a specific commit.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Args:
        repo_url: URL of the form ``https://github.com/<owner>/<repo>``.
        commit_sha: Commit (sent as ``ref``) at which to read the README.
        github_token: Token used only when GITHUB_TOKEN is not set.

    Returns:
        Raw text of the first README candidate answered with 200, or an
        empty string when none of the candidate filenames is found.

    Raises:
        ValueError: If ``repo_url`` does not look like a GitHub repository URL.
    """
    # Stop the repo segment at '?' / '#' so URLs copied from a browser resolve.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")
    owner, repo = match.groups()
    # removesuffix drops only a literal ".git". rstrip(".git") strips any
    # trailing '.', 'g', 'i', 't' characters and turned "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    # Environment variable takes priority (for HF Spaces secrets)
    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.raw"}
    if token:
        headers["Authorization"] = f"token {token}"

    # Try common README filenames
    readme_files = ["README.md", "readme.md", "README.txt", "README"]

    async with httpx.AsyncClient() as client:
        for readme_file in readme_files:
            try:
                response = await client.get(
                    f"https://api.github.com/repos/{owner}/{repo_name}/contents/{readme_file}?ref={commit_sha}",
                    headers=headers,
                    timeout=30,
                )
            except Exception:
                # A failed request for one candidate should not stop the
                # search; move on to the next filename.
                continue
            if response.status_code == 200:
                return response.text
    return ""
async def fetch_first_commit_sha(
    repo_url: str,
    github_token: Optional[str] = None,
) -> Optional[str]:
    """Fetch the SHA of the first (oldest) commit in the repository.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Args:
        repo_url: URL of the form ``https://github.com/<owner>/<repo>``.
        github_token: Token used only when GITHUB_TOKEN is not set.

    Returns:
        The SHA of the last commit on the last page of the default branch's
        history. If no last page is advertised or it cannot be read, the last
        commit of a single 100-commit page is returned instead. ``None`` when
        a required request fails or no commits are listed.

    Raises:
        ValueError: If ``repo_url`` does not look like a GitHub repository URL.
    """
    # Stop the repo segment at '?' / '#' so URLs copied from a browser resolve.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")
    owner, repo = match.groups()
    # removesuffix drops only a literal ".git". rstrip(".git") strips any
    # trailing '.', 'g', 'i', 't' characters and turned "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    # Environment variable takes priority (for HF Spaces secrets)
    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    api_base = f"https://api.github.com/repos/{owner}/{repo_name}"

    try:
        async with httpx.AsyncClient() as client:
            # Get the default branch
            repo_response = await client.get(api_base, headers=headers, timeout=30)
            if repo_response.status_code != 200:
                return None
            default_branch = repo_response.json().get("default_branch", "main")

            # With one commit per page, the "last" page link points at the
            # page that holds the oldest commit.
            response = await client.get(
                f"{api_base}/commits",
                headers=headers,
                params={"sha": default_branch, "per_page": "1"},
                timeout=30,
            )
            if response.status_code != 200:
                return None

            # Check for Link header to get last page
            last_page_url = None
            for link in response.headers.get("Link", "").split(", "):
                if 'rel="last"' in link:
                    # Trim whitespace before the angle brackets so a stray
                    # space cannot leave "<" or ">" attached to the URL.
                    last_page_url = link.split(";")[0].strip().strip("<>")
                    break

            if last_page_url:
                # Fetch the last page
                last_response = await client.get(
                    last_page_url,
                    headers=headers,
                    timeout=30,
                )
                if last_response.status_code == 200:
                    commits = last_response.json()
                    if commits:
                        return commits[-1]["sha"]

            # If no pagination, get commits and return the oldest
            response = await client.get(
                f"{api_base}/commits",
                headers=headers,
                params={"sha": default_branch, "per_page": "100"},
                timeout=30,
            )
            if response.status_code == 200:
                commits = response.json()
                if commits:
                    return commits[-1]["sha"]
            return None
    except Exception as e:
        # Best effort: callers treat None as "first commit unknown".
        print(f"First commit fetch failed: {e}")
        return None
def _extract_readme_sections(diff_text: str) -> list[str]:
    """Return the per-file sections of a diff whose header mentions "readme"."""
    sections = []
    current = []  # lines of the README section being collected, if any
    for line in diff_text.split("\n"):
        if line.startswith("diff --git"):
            # A new file header always closes the section in progress, even
            # when the next file is a README too.
            if current:
                sections.append("\n".join(current))
            current = [line] if "readme" in line.lower() else []
        elif current:
            current.append(line)
    if current:
        sections.append("\n".join(current))
    return sections


async def fetch_readme_changes(
    repo_url: str,
    since_iso: str,
    github_token: Optional[str] = None,
) -> dict:
    """
    Fetch README changes: initial README and diff within the time period.

    Returns both the original README (for project understanding) and
    any README changes during the analysis period (highest priority for DevRel).

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Args:
        repo_url: URL of the form ``https://github.com/<owner>/<repo>``.
        since_iso: Lower bound sent as ``since`` when listing README commits.
        github_token: Token used only when GITHUB_TOKEN is not set.

    Returns:
        Dict with ``initial_readme`` (str), ``readme_diff`` (str, at most
        8000 characters), ``has_readme_changes`` (bool) and
        ``readme_commits`` (list of dicts with ``sha``, ``message``,
        ``date`` and ``file``). If a request fails part-way, the fields
        gathered so far are returned.

    Raises:
        ValueError: If ``repo_url`` does not look like a GitHub repository URL.
    """
    # Stop the repo segment at '?' / '#' so URLs copied from a browser resolve.
    match = re.match(r"https?://github\.com/([^/]+)/([^/?#]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")
    owner, repo = match.groups()
    # removesuffix drops only a literal ".git". rstrip(".git") strips any
    # trailing '.', 'g', 'i', 't' characters and turned "widget" into "widge".
    repo_name = repo.removesuffix(".git")

    # Environment variable takes priority (for HF Spaces secrets)
    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    result = {
        "initial_readme": "",
        "readme_diff": "",
        "has_readme_changes": False,
        "readme_commits": [],
    }

    commits_url = f"https://api.github.com/repos/{owner}/{repo_name}/commits"

    try:
        async with httpx.AsyncClient() as client:
            # Get commits that touched README files in the time period
            readme_commits = []
            for readme_file in ["README.md", "readme.md", "README.txt"]:
                response = await client.get(
                    commits_url,
                    headers=headers,
                    params={
                        "path": readme_file,
                        "since": since_iso,
                        "per_page": "10",
                    },
                    timeout=30,
                )
                if response.status_code != 200:
                    continue
                for c in response.json():
                    details = c.get("commit", {})
                    readme_commits.append({
                        "sha": c["sha"],
                        "message": details.get("message", "").split("\n")[0][:100],
                        "date": details.get("author", {}).get("date", ""),
                        "file": readme_file,
                    })

            result["readme_commits"] = readme_commits
            result["has_readme_changes"] = len(readme_commits) > 0

            # Get initial README (from first commit or oldest available)
            first_sha = await fetch_first_commit_sha(repo_url, github_token)
            if first_sha:
                result["initial_readme"] = await fetch_readme_at_commit(
                    repo_url, first_sha, github_token
                )

            # If there are README changes, get the diff
            if readme_commits:
                # Same credentials, but ask for patch text instead of JSON.
                diff_headers = {**headers, "Accept": "application/vnd.github.v3.diff"}

                diffs = []
                for commit in readme_commits[:3]:  # Limit to the first 3 collected
                    diff_response = await client.get(
                        f"{commits_url}/{commit['sha']}",
                        headers=diff_headers,
                        timeout=30,
                    )
                    if diff_response.status_code != 200:
                        continue
                    # Filter to only include README changes
                    readme_sections = _extract_readme_sections(diff_response.text)
                    if readme_sections:
                        diffs.append(
                            f"# Commit: {commit['message']}\n" + "\n".join(readme_sections)
                        )

                result["readme_diff"] = "\n\n---\n\n".join(diffs)[:8000]  # Limit size
    except Exception as e:
        # Best effort: return whatever was collected before the failure.
        print(f"README changes fetch failed: {e}")

    return result
# Heuristics for removed ("-"-prefixed) diff lines that may signal a breaking
# change. Compiled once at import time instead of being re-resolved for every
# commit and pattern.
_BREAKING_CHANGE_PATTERNS = tuple(
    (re.compile(pattern, re.MULTILINE | re.IGNORECASE), description)
    for pattern, description in (
        # Function/method signature changes
        (r"^-\s*(def|function|func|public|private|protected)\s+\w+\s*\([^)]*\)", "Function signature removed/changed"),
        # Class/interface changes
        (r"^-\s*(class|interface|struct|type)\s+\w+", "Class/interface removed/changed"),
        # Import/export changes
        (r"^-\s*(import|export|from|require)\s+", "Import/export removed/changed"),
        # Configuration changes
        (r"^-\s*['\"]?[a-zA-Z_]+['\"]?\s*[:=]", "Configuration option removed/changed"),
        # API endpoint changes
        (r"^-\s*@?(get|post|put|delete|patch|route|api)\s*\(", "API endpoint removed/changed"),
        # Deprecation notices (matched anywhere in the diff, not only removals)
        (r"deprecated|breaking|removed|obsolete", "Deprecation or breaking change mentioned"),
        # Version bumps in configs
        (r"^-\s*['\"]?version['\"]?\s*[:=]\s*['\"]?\d+\.\d+", "Version number changed"),
    )
)


async def analyze_breaking_changes(
    commits_with_diffs: list[dict],
) -> list[dict]:
    """
    Analyze commit diffs to identify potential breaking changes.

    Args:
        commits_with_diffs: Commit dicts carrying a ``diff`` string and
            optionally ``sha``, ``message`` and ``date``.

    Returns:
        One entry per commit with at least one pattern hit, holding the
        short ``sha`` (first 7 characters), ``message``, ``date`` and
        ``changes``: a list of ``{"type": description, "count": hits}``.
        Commits with an empty or missing diff are skipped.
    """
    breaking_changes = []

    for commit in commits_with_diffs:
        diff = commit.get("diff", "")
        if not diff:
            continue

        commit_breaking_changes = []
        for compiled, description in _BREAKING_CHANGE_PATTERNS:
            hits = compiled.findall(diff)
            if hits:
                commit_breaking_changes.append({
                    "type": description,
                    "count": len(hits),
                })

        if commit_breaking_changes:
            breaking_changes.append({
                "sha": commit.get("sha", "")[:7],
                "message": commit.get("message", ""),
                "date": commit.get("date", ""),
                "changes": commit_breaking_changes,
            })

    return breaking_changes