# Spaces: Running
# File size: 6,433 Bytes
# fef9b59
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
# Load variables from .env file.
# This runs before the os.getenv("GITHUB_TOKEN") lookup in the configuration
# section, so a token kept in .env is presumably visible to that lookup
# (assumes python-dotenv's default behaviour — confirm if a custom path is needed).
load_dotenv()
# Configuration
# GITHUB_TOKEN is read from the environment; REPO is the "owner/name" slug that
# is interpolated into the API URLs; HEADERS is sent with every request.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
REPO = "PostHog/posthog"
HEADERS = {
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28",
}

# Clean up any accidental leading/trailing quotes or whitespace from terminal exports.
# Cleaning happens *before* the truthiness check: a value made only of quotes or
# whitespace must be treated as "no token", otherwise an empty
# "Authorization: Bearer " header would be sent with every request.
token_clean = (GITHUB_TOKEN or "").strip().strip('"').strip("'")
if token_clean:
    HEADERS["Authorization"] = f"Bearer {token_clean}"
else:
    print("β οΈ WARNING: GITHUB_TOKEN environment variable not found.")
    print("Using unauthenticated requests. GitHub will rate-limit this instantly.")
# Per-engineer metric records, keyed by GitHub login.
engineers = {}


def get_or_init(user):
    """Return the mutable metrics record for ``user``, creating it if needed.

    A falsy login, or a login ending in ``[bot]``, yields ``None`` and leaves
    ``engineers`` untouched. Otherwise the record stored in ``engineers`` is
    returned; a new record starts with every counter at zero.
    """
    if not user:
        return None
    if user.endswith("[bot]"):
        return None

    # Field order is kept stable because it becomes the record's key order.
    counter_names = (
        "prs_merged",
        "bug_fixes",
        "reverts_triggered",
        "review_actions",
        "review_words_written",
        "multiplier_impact",
    )
    return engineers.setdefault(user, dict.fromkeys(counter_names, 0))
print("π Extracting Advanced Impact Metrics matched to PostHog Topology...")

# Start of the 90-day reporting window.
# The timestamps this value is compared against are parsed from "...Z" strings
# with strptime, i.e. they are naive datetimes expressed in UTC. The cutoff is
# therefore computed from the current UTC time and then made naive, so the
# comparison is not skewed by the local machine's UTC offset.
from datetime import timezone  # needed only for the UTC "now" below

cutoff_date = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=90)
# -------------------------------------------------------------
# Phase 1: Scan PR Stream (Execution, Complexity, Reverts)
# -------------------------------------------------------------
# Requests up to 10 pages of 100 closed pull requests, most recently updated
# first, and updates the per-author record (via get_or_init) for every PR
# merged on or after cutoff_date. phase_1_success becomes True as soon as one
# non-empty page has been received.
print("\nπ¦ Phase 1: Fetching recent Pull Requests...")
pr_url = f"https://api.github.com/repos/{REPO}/pulls"
phase_1_success = False
for page in range(1, 11):
    params = {
        "state": "closed",
        "sort": "updated",
        "direction": "desc",
        "per_page": 100,
        "page": page,
    }
    error_prefix = f"β Phase 1 Error on page {page}: "
    try:
        # The timeout keeps a stalled connection from hanging the whole run.
        res = requests.get(pr_url, headers=HEADERS, params=params, timeout=30)
    except requests.RequestException as exc:
        # Transport failures end the phase like an API error does, so that
        # whatever was already collected can still be exported.
        print(f"{error_prefix}request failed - {exc}")
        break
    if res.status_code != 200:
        # An error body is not guaranteed to be a JSON object.
        try:
            detail = res.json().get("message")
        except (ValueError, AttributeError):
            detail = res.text[:200]
        print(f"{error_prefix}API returned {res.status_code} - {detail}")
        break
    prs = res.json()
    if not prs:
        break
    phase_1_success = True
    for pr in prs:
        # Closed-but-unmerged PRs carry no merge timestamp.
        if not pr.get("merged_at"):
            continue
        merged_at = datetime.strptime(pr["merged_at"], "%Y-%m-%dT%H:%M:%SZ")
        if merged_at < cutoff_date:
            # The listing is ordered by update time, not merge time, so an old
            # merge here says nothing about the entries that follow.
            continue
        # Guard against a missing/null "user" object; get_or_init rejects None.
        author = (pr.get("user") or {}).get("login")
        eng = get_or_init(author)
        if eng is None:
            continue
        # Track raw baseline engineering velocity
        eng["prs_merged"] += 1
        # Extract textual fields for heuristics matching
        title = (pr.get("title") or "").lower()
        # Metric: System Quality (Avoidable Revert Tracking)
        # NOTE(review): this credits the author of the PR whose title mentions
        # "revert", which may be the reverting PR rather than the reverted one — confirm intent.
        if "revert" in title:
            eng["reverts_triggered"] += 1
        # Extract native labels payload once for all downstream metric evaluations
        labels = [label["name"].lower() for label in pr.get("labels") or []]
        # Condition A: Structural Complexity Multiplier (Title Analysis)
        # Plain substring match on the lower-cased title.
        if any(x in title for x in ["lib", "core", "infra", "architecture", "critical"]):
            eng["multiplier_impact"] += 1
        # Condition B: High Severity Multiplier (Native Priority Label Analysis)
        # Adds an extra point if the PR is explicitly flagged as a P0 or P1 incident/initiative
        if any(p in labels for p in ["p0", "p1"]):
            eng["multiplier_impact"] += 1
        # Metric: Native Bug Tracking
        # The substring test already covers a label that is exactly "bug".
        if any("bug" in label_name for label_name in labels):
            eng["bug_fixes"] += 1
# -------------------------------------------------------------
# Phase 2: Scan Review Comments Stream (Citizenship & Depth)
# -------------------------------------------------------------
# Requests up to 10 pages of 100 review comments, newest first by creation
# time, and updates the per-reviewer record (via get_or_init) for every comment
# created on or after cutoff_date. phase_2_success becomes True as soon as one
# non-empty page has been received.
print("\n㪠Phase 2: Fetching repository-wide review comments...")
comments_url = f"https://api.github.com/repos/{REPO}/pulls/comments"
phase_2_success = False
reached_cutoff = False
for page in range(1, 11):
    params = {
        "sort": "created",
        "direction": "desc",
        "per_page": 100,
        "page": page,
    }
    error_prefix = f"β Phase 2 Error on page {page}: "
    try:
        # The timeout keeps a stalled connection from hanging the whole run.
        res = requests.get(comments_url, headers=HEADERS, params=params, timeout=30)
    except requests.RequestException as exc:
        # Transport failures end the phase like an API error does, so that
        # whatever was already collected can still be exported.
        print(f"{error_prefix}request failed - {exc}")
        break
    if res.status_code != 200:
        # An error body is not guaranteed to be a JSON object.
        try:
            detail = res.json().get("message")
        except (ValueError, AttributeError):
            detail = res.text[:200]
        print(f"{error_prefix}API returned {res.status_code} - {detail}")
        break
    comments = res.json()
    if not comments:
        break
    phase_2_success = True
    for comment in comments:
        created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ")
        if created_at < cutoff_date:
            # Results are requested newest-first by creation time, so every
            # remaining comment (on this page and later ones) is older still.
            reached_cutoff = True
            break
        # Guard against a missing/null "user" object; get_or_init rejects None.
        reviewer = (comment.get("user") or {}).get("login")
        eng = get_or_init(reviewer)
        if eng is None:
            continue
        # Track raw volume of code review interaction
        eng["review_actions"] += 1
        # Metric: Meaningful Review Depth (Filters out superficial "LGTM" comments)
        # Only comments longer than 15 whitespace-separated words add to the total.
        body = comment.get("body") or ""
        word_count = len(body.split())
        if word_count > 15:
            eng["review_words_written"] += word_count
    if reached_cutoff:
        # No need to spend further API calls on pages outside the window.
        break
# -------------------------------------------------------------
# Phase 3: Defensive Data Processing and Export
# -------------------------------------------------------------
# Turns the collected per-engineer records into a table and writes it to
# posthog_impact_data.csv, provided at least one phase received data.
print("\nπ Phase 3: Processing and Exporting Data...")
if not engineers or not (phase_1_success or phase_2_success):
    print("β Critical Error: No data payload compiled. Please check the API error codes printed above.")
else:
    # One row per engineer; the login moves from the index into an "engineer" column.
    df = pd.DataFrame.from_dict(engineers, orient="index")
    df = df.reset_index()
    df = df.rename(columns={"index": "engineer"})

    # Schema guard: every expected metric column exists and holds no missing values.
    expected_cols = [
        "prs_merged",
        "review_actions",
        "bug_fixes",
        "reverts_triggered",
        "multiplier_impact",
        "review_words_written",
    ]
    for expected_col in expected_cols:
        if expected_col in df.columns:
            df[expected_col] = df[expected_col].fillna(0)
        else:
            df[expected_col] = 0

    # Keep only engineers with at least one merged PR or one review action.
    has_merged_pr = df["prs_merged"] > 0
    has_review = df["review_actions"] > 0
    df = df[has_merged_pr | has_review]

    if df.empty:
        print("β οΈ DataFrame filtered down to 0 rows. No matching active engineers found in this window.")
    else:
        df.to_csv("posthog_impact_data.csv", index=False)
        print("π Advanced metrics pipeline successfully saved to posthog_impact_data.csv")
# End of script.