# GovTech GitHub Explorer — Streamlit dashboard
# Deployed from GitHub Actions (commit 2db0de9, AndreasThinks).
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import sqlite3
import os
import logging
from datetime import datetime, timedelta, timezone
# Tags that describe the dataset itself rather than individual repos — excluded from all charts/filters
BLOCKLIST_TAGS = frozenset([
    "government", "open-source", "public-sector", "open-government",
    "government-software", "government-tool", "government-project",
    "government-repository", "government-platform", "government-code",
])
# Tags that duplicate the language field already in the schema
# (repos carry a dedicated `language` column, so these tags add no signal).
LANGUAGE_TAGS = frozenset([
    "javascript", "python", "java", "typescript", "html", "css", "php",
    "ruby", "shell", "r", "scala", "c#", "kotlin", "go", "rust", "c",
    "c++", "perl", "swift", "matlab", "bash", "json", "xml", "yaml",
    "sql", "makefile",
])
# Combined filter — tags to hide from dashboard display
EXCLUDED_TAGS = BLOCKLIST_TAGS | LANGUAGE_TAGS
# Minimum repos a tag must appear in to show in charts/filters
MIN_TAG_REPOS = 2
def _tag_filter_sql(tag_col: str = "tag") -> str:
    """SQL predicate that screens out noise tags; append to a query with AND."""
    placeholders = ",".join("?" for _ in EXCLUDED_TAGS)
    return f"{tag_col} NOT IN ({placeholders}) AND {tag_col} IS NOT NULL"
def _tag_filter_params() -> list:
    """Bind values matching the placeholders produced by _tag_filter_sql()."""
    return [*EXCLUDED_TAGS]
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S")
logger = logging.getLogger("govtech-dashboard")
logger.info("Starting GovTech Dashboard...")
st.set_page_config(
page_title="GovTech GitHub Explorer",
page_icon="πŸ›οΈ",
layout="wide",
)
def get_db_path():
    """Locate the SQLite database.

    Prefers a local copy next to (or above) this file; otherwise downloads it
    from the HuggingFace Hub dataset. On total failure, surfaces the error in
    the UI and halts the script via st.stop().
    """
    base_dir = os.path.dirname(__file__)
    search_paths = [
        os.path.join(base_dir, "..", "govtech.db"),
        os.path.join(base_dir, "govtech.db"),
        "govtech.db",
        "../govtech.db",
    ]
    for candidate in search_paths:
        if os.path.exists(candidate):
            resolved = os.path.abspath(candidate)
            logger.info(f"Found local DB: {resolved}")
            return resolved
    logger.info("No local DB found, downloading from HuggingFace Hub...")
    try:
        # Imported lazily so the dependency is only needed when no local DB exists.
        from huggingface_hub import hf_hub_download
        path = hf_hub_download(
            repo_id="AndreasThinks/government-github-repos",
            filename="data/govtech.db",
            repo_type="dataset",
        )
        logger.info(f"Downloaded DB to: {path}")
        return path
    except Exception as e:
        logger.error(f"Failed to download DB: {e}")
        st.error(f"Could not find or download govtech.db: {e}")
        st.stop()
# Resolved once at import time; every connection in this module uses this path.
DB_PATH = get_db_path()
def get_conn():
    """Open a fresh SQLite connection to the dashboard database."""
    # check_same_thread=False: Streamlit may use the connection off the main thread.
    connection = sqlite3.connect(DB_PATH, check_same_thread=False)
    return connection
@st.cache_data(ttl=300)
def query_df(sql, params=None):
    """Run *sql* with optional bind *params* and return a pandas DataFrame.

    Results are cached by Streamlit for 5 minutes. The connection is closed in
    a finally block so it is not leaked when the query raises.
    """
    conn = get_conn()
    try:
        return pd.read_sql_query(sql, conn, params=params or [])
    finally:
        conn.close()
@st.cache_data(ttl=300)
def query_one(sql, params=None):
    """Run *sql* and return the first column of the first row (scalar queries).

    Cached for 5 minutes. The connection is closed in a finally block so it is
    not leaked when the query raises or returns no rows.
    """
    conn = get_conn()
    try:
        # Connection.execute is a shortcut that creates the cursor implicitly.
        row = conn.execute(sql, params or []).fetchone()
        return row[0]
    finally:
        conn.close()
@st.cache_data(ttl=600)
def load_filter_options():
    """Return (countries, languages, tags, orgs) lists for the sidebar filters.

    Tags exclude the noise set and must appear in at least MIN_TAG_REPOS repos;
    orgs are the 300 largest owners by repo count. Cached for 10 minutes.
    The shared connection is closed in a finally block (previously leaked if
    any of the four queries raised).
    """
    conn = get_conn()
    try:
        countries = pd.read_sql_query(
            "SELECT DISTINCT country FROM repositories WHERE country IS NOT NULL AND country != '' ORDER BY country", conn
        )["country"].tolist()
        languages = pd.read_sql_query(
            "SELECT DISTINCT language FROM repositories WHERE language IS NOT NULL AND language != '' ORDER BY language", conn
        )["language"].tolist()
        tf_sql = _tag_filter_sql()
        tf_params = _tag_filter_params()
        tags = pd.read_sql_query(
            f"SELECT tag, COUNT(DISTINCT html_url) as c FROM repository_tags WHERE {tf_sql} GROUP BY tag HAVING c >= {MIN_TAG_REPOS} ORDER BY c DESC",
            conn, params=tf_params
        )["tag"].tolist()
        orgs = pd.read_sql_query(
            "SELECT owner, COUNT(*) as c FROM repositories GROUP BY owner ORDER BY c DESC LIMIT 300", conn
        )["owner"].tolist()
        return countries, languages, tags, orgs
    finally:
        conn.close()
# ==================== SIDEBAR FILTERS ====================
st.sidebar.title("πŸ”­ Filters")
st.sidebar.caption("Applied across all tabs")
countries, languages, tags, orgs = load_filter_options()
sel_countries = st.sidebar.multiselect("🌍 Country", countries, key="g_country")
sel_orgs = st.sidebar.multiselect("🏒 Organisation", orgs, key="g_org")
sel_languages = st.sidebar.multiselect("πŸ’» Language", languages, key="g_lang")
sel_tags = st.sidebar.multiselect("🏷️ Tag", tags, key="g_tag")
st.sidebar.divider()
st.sidebar.subheader("πŸ“… Activity")
activity_options = {
"All time": None,
"Active last 3 months": 90,
"Active last 6 months": 180,
"Active last 12 months": 365,
"Active last 2 years": 730,
}
activity_label = st.sidebar.selectbox("Last pushed", list(activity_options.keys()), index=0, key="g_activity")
activity_days = activity_options[activity_label]
st.sidebar.divider()
show_archived = st.sidebar.checkbox("Include archived", value=False, key="g_arch")
show_forks = st.sidebar.checkbox("Include forks", value=True, key="g_forks")
min_stars = st.sidebar.slider("Min stars", 0, 500, 0, key="g_stars")
st.sidebar.divider()
st.sidebar.markdown(
"πŸ”— [GitHub](https://github.com/AndreasThinks/open-govtech-report)  |  "
"[Dataset](https://huggingface.co/datasets/AndreasThinks/government-github-repos)  |  "
"[βž• Submit a missing org](https://github.com/AndreasThinks/open-govtech-report/blob/main/CONTRIBUTING.md)",
unsafe_allow_html=True,
)
def build_where(extra_conditions=None, base_table="r", tag_table="rt"):
    """Build a WHERE clause and params list from global sidebar filters.

    Returns (conditions, params) where `conditions` is a list of SQL fragments
    and `params` the matching bind values, in placeholder order.
    """
    conditions = list(extra_conditions or [])
    params = []
    # The three multi-select filters all take the same "col IN (?, ...)" shape.
    for column, selected in (
        ("country", sel_countries),
        ("owner", sel_orgs),
        ("language", sel_languages),
    ):
        if selected:
            placeholders = ",".join("?" for _ in selected)
            conditions.append(f"{base_table}.{column} IN ({placeholders})")
            params.extend(selected)
    if activity_days:
        cutoff = (datetime.now(timezone.utc) - timedelta(days=activity_days)).strftime("%Y-%m-%dT%H:%M:%SZ")
        conditions.append(f"{base_table}.pushed_at >= ?")
        params.append(cutoff)
    if not show_archived:
        conditions.append(f"({base_table}.archived = 0 OR {base_table}.archived IS NULL)")
    if not show_forks:
        conditions.append(f"({base_table}.fork = 0 OR {base_table}.fork IS NULL)")
    if min_stars > 0:
        conditions.append(f"{base_table}.stars >= ?")
        params.append(min_stars)
    return conditions, params
def build_tag_join_where(extra_conditions=None):
    """Like build_where, but adds the repository_tags join when tags are selected.

    Returns (where_clause, params, join_sql); join_sql is "" without a tag filter.
    """
    conditions, params = build_where(extra_conditions)
    if sel_tags:
        tag_join = "JOIN repository_tags rt ON r.html_url = rt.html_url"
        placeholders = ",".join("?" for _ in sel_tags)
        conditions.append(f"rt.tag IN ({placeholders})")
        params.extend(sel_tags)
    else:
        tag_join = ""
    where = "WHERE " + " AND ".join(conditions) if conditions else ""
    return where, params, tag_join
# ==================== HEADER ====================
st.title("πŸ›οΈ GovTech GitHub Explorer")
st.caption("Exploring 70k+ government GitHub repositories worldwide")
# ==================== TABS ====================
tab_overview, tab_explorer, tab_tags, tab_insights, tab_trends, tab_about = st.tabs(
["πŸ“Š Overview", "πŸ” Explorer", "🏷️ Tags", "πŸ’‘ Insights", "πŸ“ˆ Trends", "ℹ️ About"]
)
# ==================== OVERVIEW ====================
with tab_overview:
    # Headline metrics — all but the raw account count respect the sidebar filters.
    where, params, tag_join = build_tag_join_where()
    total_filtered = query_one(f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tag_join} {where}", params)
    account_count = query_one("SELECT COUNT(*) FROM accounts")
    country_count_val = query_one(
        f"SELECT COUNT(DISTINCT r.country) FROM repositories r {tag_join} {where}", params
    )
    # Active in last 12m within the filtered set. The recency condition and the
    # optional tag filter are appended AFTER the base filters so the parameter
    # list stays aligned with the placeholders. (A previous, unused attempt to
    # insert the cutoff mid-list has been removed.)
    active_cutoff = (datetime.now(timezone.utc) - timedelta(days=365)).strftime("%Y-%m-%dT%H:%M:%SZ")
    conds_12m, p_12m = build_where()
    conds_12m.append("r.pushed_at >= ?")
    p_12m.append(active_cutoff)
    tj2 = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
    if sel_tags:
        ph = ",".join(["?"] * len(sel_tags))
        conds_12m.append(f"rt.tag IN ({ph})")
        p_12m.extend(sel_tags)
    w12 = ("WHERE " + " AND ".join(conds_12m)) if conds_12m else ""
    # Use the cached helper rather than a raw connection (no leak, 5-min cache).
    active_12m = query_one(
        f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tj2} {w12}", p_12m
    )
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Repositories", f"{total_filtered:,}")
    c2.metric("Accounts", f"{account_count:,}")
    c3.metric("Countries", country_count_val)
    c4.metric("Active last 12m", f"{active_12m:,}", help="Repos with a push in the last 12 months")
    st.divider()
    col_left, col_right = st.columns(2)
    with col_left:
        st.subheader("Top Countries by Repositories")
        df_countries = query_df(
            f"SELECT r.country, COUNT(DISTINCT r.html_url) as count FROM repositories r {tag_join} {where} GROUP BY r.country ORDER BY count DESC LIMIT 20",
            params,
        )
        if not df_countries.empty:
            fig = px.bar(df_countries, x="country", y="count", color="count", color_continuous_scale="Blues")
            fig.update_layout(showlegend=False, xaxis_title="Country", yaxis_title="Repositories", coloraxis_showscale=False)
            st.plotly_chart(fig, use_container_width=True)
    with col_right:
        st.subheader("Top Languages")
        df_langs = query_df(
            f"""SELECT r.language, COUNT(DISTINCT r.html_url) as count
            FROM repositories r {tag_join} {where}
            {"AND" if where else "WHERE"} r.language IS NOT NULL AND r.language != ''
            GROUP BY r.language ORDER BY count DESC LIMIT 15""",
            params,
        )
        if not df_langs.empty:
            fig = px.bar(df_langs, x="count", y="language", orientation="h", color="count", color_continuous_scale="Greens")
            fig.update_layout(showlegend=False, yaxis=dict(autorange="reversed"), xaxis_title="Repositories", yaxis_title="", coloraxis_showscale=False)
            st.plotly_chart(fig, use_container_width=True)
    st.subheader("Repository Creation Timeline")
    df_timeline = query_df(
        f"""SELECT SUBSTR(r.created_at, 1, 4) as year, COUNT(DISTINCT r.html_url) as count
        FROM repositories r {tag_join} {where}
        {"AND" if where else "WHERE"} r.created_at IS NOT NULL
        GROUP BY year ORDER BY year""",
        params,
    )
    # Drop malformed timestamps — keep only 4-digit years.
    df_timeline = df_timeline[df_timeline["year"].str.match(r"^\d{4}$", na=False)]
    # Also pull active repos per year (pushed_at within 12m of each year-end — proxy: pushed in that year or later)
    df_pushed = query_df(
        f"""SELECT SUBSTR(r.pushed_at, 1, 4) as year, COUNT(DISTINCT r.html_url) as active
        FROM repositories r {tag_join} {where}
        {"AND" if where else "WHERE"} r.pushed_at IS NOT NULL
        GROUP BY year ORDER BY year""",
        params,
    )
    df_pushed = df_pushed[df_pushed["year"].str.match(r"^\d{4}$", na=False)]
    if not df_timeline.empty:
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=df_timeline["year"], y=df_timeline["count"],
            name="Created", fill="tozeroy", mode="lines",
            line=dict(color="#3b82f6"), fillcolor="rgba(59,130,246,0.2)"
        ))
        if not df_pushed.empty:
            fig.add_trace(go.Scatter(
                x=df_pushed["year"], y=df_pushed["active"],
                name="Last pushed", fill="tozeroy", mode="lines",
                line=dict(color="#10b981"), fillcolor="rgba(16,185,129,0.15)"
            ))
        fig.update_layout(xaxis_title="Year", yaxis_title="Repositories", legend=dict(orientation="h"))
        st.plotly_chart(fig, use_container_width=True)
        st.caption("'Last pushed' shows when repositories last received a commit — a proxy for active maintenance.")
# ==================== EXPLORER ====================
with tab_explorer:
    where_e, params_e, tag_join_e = build_tag_join_where()
    # Free-text search over name/description, applied on top of the global filters.
    search_text = st.text_input("Search name / description", key="exp_search")
    # BUGFIX: when no sidebar filter is active, where_e is empty, so the search
    # predicate must open its own WHERE clause instead of appending with AND
    # (previously this produced invalid SQL). Dead local clause-building code
    # that was never used has been removed.
    if search_text:
        search_clause = (" AND " if where_e else " WHERE ") + "(r.name LIKE ? OR r.description LIKE ?)"
        search_params = [f"%{search_text}%", f"%{search_text}%"]
    else:
        search_clause = ""
        search_params = []
    # sort_col comes from a fixed selectbox list, so f-string interpolation is safe.
    sort_col = st.selectbox("Sort by", ["stars", "forks", "pushed_at", "created_at"], key="exp_sort")
    # Cached helper instead of a raw connection (no leak, 5-min cache).
    total_results = query_one(
        f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tag_join_e} {where_e}{search_clause}",
        params_e + search_params,
    )
    st.write(f"**{total_results:,}** repositories match current filters")
    page_size = 50
    total_pages = max(1, (total_results + page_size - 1) // page_size)
    page = st.number_input("Page", min_value=1, max_value=total_pages, value=1, key="exp_page")
    offset = (page - 1) * page_size
    data_sql = f"""
        SELECT r.html_url, r.name, r.owner, r.country, r.language, r.stars, r.forks,
               r.license, r.created_at, r.pushed_at, r.archived, r.fork
        FROM repositories r {tag_join_e} {where_e}{search_clause}
        GROUP BY r.html_url
        ORDER BY r.{sort_col} DESC
        LIMIT ? OFFSET ?
    """
    df_results = query_df(data_sql, params_e + search_params + [page_size, offset])
    if not df_results.empty:
        st.dataframe(
            df_results,
            column_config={
                "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                "name": st.column_config.TextColumn("Name"),
                "owner": st.column_config.TextColumn("Owner"),
                "country": st.column_config.TextColumn("Country"),
                "language": st.column_config.TextColumn("Language"),
                "stars": st.column_config.NumberColumn("⭐ Stars"),
                "forks": st.column_config.NumberColumn("🍴 Forks"),
                "license": st.column_config.TextColumn("License"),
                "created_at": st.column_config.TextColumn("Created"),
                "pushed_at": st.column_config.TextColumn("Last pushed"),
                "archived": st.column_config.CheckboxColumn("Archived"),
                "fork": st.column_config.CheckboxColumn("Fork"),
            },
            use_container_width=True,
            hide_index=True,
        )
        st.caption(f"Page {page} of {total_pages}")
    else:
        st.info("No repositories match the current filters.")
# ==================== TAGS ====================
with tab_tags:
    # Tag analytics: top tags, adoption over time, per-tag browsing, tag groups.
    where_t, params_t, tag_join_t = build_tag_join_where()
    # Coverage banner — warn while the LLM tagging pipeline is still catching up.
    tagged_count_t = query_one("SELECT COUNT(DISTINCT html_url) FROM repository_tags")
    total_repos_t = query_one("SELECT COUNT(*) FROM repositories")
    if tagged_count_t < total_repos_t * 0.99:
        pct = tagged_count_t / total_repos_t * 100 if total_repos_t > 0 else 0
        st.info(
            f"🏗️ **Tagging in progress** — {tagged_count_t:,} of {total_repos_t:,} repositories tagged ({pct:.1f}%). "
            "Results below reflect partially tagged data."
        )
    col_tl, col_tr = st.columns(2)
    with col_tl:
        st.subheader("Top Tags")
        tf_sql_t = _tag_filter_sql("rt2.tag")
        tf_params_t = _tag_filter_params()
        # NOTE(review): when tags are selected, the join alias from
        # build_tag_join_where is renamed rt -> rt_f, while the WHERE
        # conditions are rewritten rt. -> rt2. — so the selected-tag filter
        # ends up applying to rt2 (the displayed tag), restricting the chart
        # to the selected tags themselves. Confirm this is intended rather
        # than "show all tags of repos having the selected tags".
        df_top_tags = query_df(
            f"""SELECT rt2.tag, COUNT(DISTINCT r.html_url) as count
            FROM repositories r
            JOIN repository_tags rt2 ON r.html_url = rt2.html_url
            {tag_join_t.replace("rt", "rt_f") if sel_tags else ""}
            {where_t.replace("rt.", "rt2.") if where_t else ""}
            {"AND" if where_t else "WHERE"} {tf_sql_t}
            GROUP BY rt2.tag HAVING count >= {MIN_TAG_REPOS} ORDER BY count DESC LIMIT 30""",
            params_t + tf_params_t,
        )
        if not df_top_tags.empty:
            fig = px.bar(
                df_top_tags, x="count", y="tag", orientation="h",
                color="count", color_continuous_scale="Purples",
            )
            fig.update_layout(yaxis=dict(autorange="reversed"), showlegend=False, height=600,
                              xaxis_title="Repositories", yaxis_title="", coloraxis_showscale=False)
            st.plotly_chart(fig, use_container_width=True)
    with col_tr:
        st.subheader("Tags by Year Created")
        st.caption("Repos tagged with each technology, by creation year — shows technology adoption over time.")
        # Pick top 10 tags for the chart
        if not df_top_tags.empty:
            top10_tags = df_top_tags.head(10)["tag"].tolist()
            ph = ",".join(["?"] * len(top10_tags))
            conds_ty, params_ty = build_where(base_table="r")
            conds_ty.append(f"rt3.tag IN ({ph})")
            params_ty.extend(top10_tags)
            conds_ty.append("r.created_at IS NOT NULL")
            w_ty = ("WHERE " + " AND ".join(conds_ty)) if conds_ty else ""
            df_tag_time = query_df(
                f"""SELECT SUBSTR(r.created_at,1,4) as year, rt3.tag, COUNT(DISTINCT r.html_url) as count
                FROM repositories r JOIN repository_tags rt3 ON r.html_url = rt3.html_url
                {w_ty}
                GROUP BY year, rt3.tag ORDER BY year""",
                params_ty,
            )
            # Keep only well-formed 4-digit years.
            df_tag_time = df_tag_time[df_tag_time["year"].str.match(r"^\d{4}$", na=False)]
            if not df_tag_time.empty:
                fig = px.line(df_tag_time, x="year", y="count", color="tag",
                              labels={"year": "Year", "count": "Repos created", "tag": "Tag"})
                fig.update_layout(legend=dict(orientation="h", y=-0.3))
                st.plotly_chart(fig, use_container_width=True)
    st.divider()
    st.subheader("Browse Repos by Tag")
    # Fall back to the sidebar tag list when the chart query returned nothing.
    browse_tags = df_top_tags["tag"].tolist() if not df_top_tags.empty else tags
    if browse_tags:
        sel_tag = st.selectbox("Select a tag", browse_tags, key="tag_browse")
        sort_tag = st.selectbox("Sort by", ["stars", "pushed_at", "created_at"], key="tag_sort")
        conds_br, params_br = build_where(base_table="r")
        conds_br.append("rt_b.tag = ?")
        params_br.append(sel_tag)
        w_br = ("WHERE " + " AND ".join(conds_br)) if conds_br else ""
        # sort_tag comes from a fixed selectbox list, so interpolation is safe.
        df_tag_repos = query_df(
            f"""SELECT r.name, r.owner, r.country, r.language, r.stars, r.pushed_at, rt_b.confidence, r.html_url
            FROM repository_tags rt_b JOIN repositories r ON rt_b.html_url = r.html_url
            {w_br} ORDER BY r.{sort_tag} DESC LIMIT 200""",
            params_br,
        )
        st.write(f"**{len(df_tag_repos)}** repos tagged with **{sel_tag}**")
        if not df_tag_repos.empty:
            st.dataframe(
                df_tag_repos,
                column_config={
                    "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                    "confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1),
                    "stars": st.column_config.NumberColumn("⭐ Stars"),
                    "pushed_at": st.column_config.TextColumn("Last pushed"),
                },
                use_container_width=True,
                hide_index=True,
            )
    st.divider()
    st.subheader("Tag Groups")
    # Curated groupings of tags, maintained in tag_groups / tag_group_members.
    df_groups = query_df("SELECT id, name, description FROM tag_groups ORDER BY name")
    if not df_groups.empty:
        for _, grp in df_groups.iterrows():
            with st.expander(f"📁 {grp['name']}" + (f" — {grp['description']}" if grp["description"] else "")):
                df_members = query_df(
                    "SELECT tag FROM tag_group_members WHERE group_id = ? ORDER BY tag",
                    [int(grp["id"])]
                )
                if not df_members.empty:
                    st.write(", ".join(df_members["tag"].tolist()))
                else:
                    st.write("No tags in this group yet.")
    else:
        st.info("No tag groups defined yet.")
# ==================== INSIGHTS ====================
with tab_insights:
    # Leaderboards and breakdowns over the filtered repo set.
    where_i, params_i, tag_join_i = build_tag_join_where()
    col_ia, col_ib = st.columns(2)
    with col_ia:
        st.subheader("⭐ Most Starred")
        df_top = query_df(
            f"""SELECT r.name, r.owner, r.country, r.stars, r.language, r.pushed_at, r.html_url
            FROM repositories r {tag_join_i} {where_i}
            GROUP BY r.html_url ORDER BY r.stars DESC LIMIT 25""",
            params_i,
        )
        st.dataframe(
            df_top,
            column_config={
                "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                "stars": st.column_config.NumberColumn("⭐ Stars"),
                "pushed_at": st.column_config.TextColumn("Last pushed"),
            },
            use_container_width=True,
            hide_index=True,
        )
    with col_ib:
        st.subheader("🚀 Rising Stars (active last 12m, sorted by stars)")
        rising_cutoff = (datetime.now(timezone.utc) - timedelta(days=365)).strftime("%Y-%m-%dT%H:%M:%SZ")
        # Conditions are appended in the same order as their bind params so the
        # placeholder/value alignment is preserved.
        conds_r, params_r = build_where(base_table="r")
        conds_r.append("r.pushed_at >= ?")
        params_r.append(rising_cutoff)
        tj_r = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
        if sel_tags:
            ph = ",".join(["?"] * len(sel_tags))
            conds_r.append(f"rt.tag IN ({ph})")
            params_r.extend(sel_tags)
        w_r = ("WHERE " + " AND ".join(conds_r)) if conds_r else ""
        df_rising = query_df(
            f"""SELECT r.name, r.owner, r.country, r.stars, r.language, r.pushed_at, r.html_url
            FROM repositories r {tj_r} {w_r}
            GROUP BY r.html_url ORDER BY r.stars DESC LIMIT 25""",
            params_r,
        )
        st.dataframe(
            df_rising,
            column_config={
                "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                "stars": st.column_config.NumberColumn("⭐ Stars"),
                "pushed_at": st.column_config.TextColumn("Last pushed"),
            },
            use_container_width=True,
            hide_index=True,
        )
    st.divider()
    st.subheader("📈 Most Active Organisations")
    st.caption("Organisations ranked by number of repos with a push in the last 12 months.")
    # Same append-in-order pattern as Rising Stars above.
    conds_ao, params_ao = build_where(base_table="r")
    conds_ao.append("r.pushed_at >= ?")
    params_ao.append((datetime.now(timezone.utc) - timedelta(days=365)).strftime("%Y-%m-%dT%H:%M:%SZ"))
    tj_ao = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
    if sel_tags:
        ph = ",".join(["?"] * len(sel_tags))
        conds_ao.append(f"rt.tag IN ({ph})")
        params_ao.extend(sel_tags)
    w_ao = ("WHERE " + " AND ".join(conds_ao)) if conds_ao else ""
    df_active_orgs = query_df(
        f"""SELECT r.owner, r.country, COUNT(DISTINCT r.html_url) as active_repos,
               SUM(r.stars) as total_stars
        FROM repositories r {tj_ao} {w_ao}
        GROUP BY r.owner ORDER BY active_repos DESC LIMIT 20""",
        params_ao,
    )
    col_org1, col_org2 = st.columns(2)
    with col_org1:
        if not df_active_orgs.empty:
            fig = px.bar(df_active_orgs, x="active_repos", y="owner", orientation="h",
                         color="active_repos", color_continuous_scale="Oranges",
                         labels={"active_repos": "Active repos (12m)", "owner": ""})
            fig.update_layout(yaxis=dict(autorange="reversed"), showlegend=False,
                              height=500, coloraxis_showscale=False)
            st.plotly_chart(fig, use_container_width=True)
    with col_org2:
        if not df_active_orgs.empty:
            st.dataframe(df_active_orgs, use_container_width=True, hide_index=True,
                         column_config={"total_stars": st.column_config.NumberColumn("⭐ Total stars"),
                                        "active_repos": st.column_config.NumberColumn("Active repos (12m)")})
    st.divider()
    col_ic, col_id = st.columns(2)
    with col_ic:
        st.subheader("📜 License Breakdown")
        df_lic = query_df(
            f"""SELECT r.license, COUNT(DISTINCT r.html_url) as count
            FROM repositories r {tag_join_i} {where_i}
            {"AND" if where_i else "WHERE"} r.license IS NOT NULL AND r.license != ''
            GROUP BY r.license ORDER BY count DESC""",
            params_i,
        )
        if not df_lic.empty:
            # Collapse the long tail into a single "Other" slice for readability.
            top_n = 10
            if len(df_lic) > top_n:
                top = df_lic.head(top_n)
                other = pd.DataFrame([{"license": "Other", "count": df_lic.iloc[top_n:]["count"].sum()}])
                df_lic_plot = pd.concat([top, other], ignore_index=True)
            else:
                df_lic_plot = df_lic
            fig = px.pie(df_lic_plot, names="license", values="count", hole=0.3)
            fig.update_traces(textposition="inside", textinfo="percent+label")
            st.plotly_chart(fig, use_container_width=True)
    with col_id:
        st.subheader("🍴 Fork vs Original")
        fork_count = query_one(
            f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tag_join_i} {where_i} {'AND' if where_i else 'WHERE'} r.fork = 1",
            params_i,
        )
        # NULL fork flags are counted as originals, matching build_where's treatment.
        original_count = query_one(
            f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tag_join_i} {where_i} {'AND' if where_i else 'WHERE'} (r.fork = 0 OR r.fork IS NULL)",
            params_i,
        )
        m1, m2 = st.columns(2)
        m1.metric("Original", f"{original_count:,}")
        m2.metric("Forked", f"{fork_count:,}")
        fig = px.pie(
            pd.DataFrame({"type": ["Original", "Fork"], "count": [original_count, fork_count]}),
            names="type", values="count", hole=0.4,
            color_discrete_sequence=["#2ecc71", "#e74c3c"],
        )
        st.plotly_chart(fig, use_container_width=True)
    st.divider()
    st.subheader("🌍 Language × Country Heatmap")
    # Limit to the globally most common 15 countries × 12 languages (subqueries
    # are unfiltered on purpose, to keep the axes stable across filter changes).
    df_heat = query_df(
        f"""SELECT r.country, r.language, COUNT(DISTINCT r.html_url) as count
        FROM repositories r {tag_join_i} {where_i}
        {"AND" if where_i else "WHERE"} r.language IS NOT NULL AND r.language != ''
        AND r.country IN (
            SELECT country FROM repositories GROUP BY country ORDER BY COUNT(*) DESC LIMIT 15
        )
        AND r.language IN (
            SELECT language FROM repositories WHERE language IS NOT NULL AND language != ''
            GROUP BY language ORDER BY COUNT(*) DESC LIMIT 12
        )
        GROUP BY r.country, r.language""",
        params_i,
    )
    if not df_heat.empty:
        pivot = df_heat.pivot_table(index="country", columns="language", values="count", fill_value=0)
        fig = px.imshow(pivot, text_auto=True, color_continuous_scale="YlOrRd",
                        labels=dict(x="Language", y="Country", color="Repos"), aspect="auto")
        fig.update_layout(height=500)
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("Not enough data for heatmap.")
# ==================== TRENDS ====================
with tab_trends:
    # Time-series views: new repos per bucket, tag momentum, language trends.
    st.subheader("📈 Trends Over Time")
    st.caption("How government open source has evolved — new activity, rising tags, and shifting languages.")
    # ---- Period selector ----
    PERIOD_OPTIONS = {
        "This week": 7,
        "This month": 30,
        "Last 3 months": 90,
        "Last 6 months": 180,
        "Last 12 months": 365,
        "Last 2 years": 730,
    }
    period_label = st.radio(
        "Period", list(PERIOD_OPTIONS.keys()), index=4,
        horizontal=True, key="trends_period",
    )
    period_days = PERIOD_OPTIONS[period_label]
    period_prior_days = period_days * 2  # prior window = same length, shifted back
    now_utc = datetime.now(timezone.utc)
    now_str = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
    cutoff_recent = (now_utc - timedelta(days=period_days)).strftime("%Y-%m-%dT%H:%M:%SZ")
    cutoff_prior = (now_utc - timedelta(days=period_prior_days)).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Granularity: week/month buckets depending on period
    # NOTE(review): bucket_fmt is assigned but never read — the SQL bucketing
    # below uses sql_bucket instead; only bucket_label is used (chart axes).
    if period_days <= 30:
        bucket_fmt = "%Y-%W"  # ISO week
        bucket_label = "Week"
    elif period_days <= 365:
        bucket_fmt = "%Y-%m"  # Month
        bucket_label = "Month"
    else:
        bucket_fmt = "%Y-%m"
        bucket_label = "Month"
    # For the repos chart, show the bucket pattern for the selected period
    # SQLite STRFTIME format
    if period_days <= 30:
        sql_bucket = "STRFTIME('%Y-%W', r.created_at)"
        bucket_re = r"^\d{4}-\d{2}$"
    else:
        sql_bucket = "SUBSTR(r.created_at, 1, 7)"
        bucket_re = r"^\d{4}-\d{2}$"
    st.divider()
    # Base filter fragments reused by the momentum queries below; and_base is
    # prefixed with AND so it can follow an existing WHERE clause.
    conds_base, params_base = build_where(base_table="r")
    w_base = (" AND ".join(conds_base)) if conds_base else ""
    and_base = ("AND " + w_base) if w_base else ""
    tf_sql_mom = _tag_filter_sql("rt.tag")
    tf_params_mom = _tag_filter_params()
    # ---- 1. New repos per period ----
    st.markdown(f"### 🗓️ New Repositories — {period_label}")
    conds_m, params_m = build_where(base_table="r")
    conds_m.append("r.created_at >= ?")
    params_m.append(cutoff_recent)
    # NOTE(review): this first w_m assignment is redundant — it is recomputed
    # below after the optional tag condition is appended.
    w_m = ("WHERE " + " AND ".join(conds_m)) if conds_m else ""
    tj_m = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
    if sel_tags:
        ph = ",".join(["?"] * len(sel_tags))
        conds_m.append(f"rt.tag IN ({ph})")
        params_m.extend(sel_tags)
    w_m = ("WHERE " + " AND ".join(conds_m)) if conds_m else ""
    df_activity = query_df(
        f"""SELECT {sql_bucket} as bucket, COUNT(DISTINCT r.html_url) as new_repos
        FROM repositories r {tj_m} {w_m}
        GROUP BY bucket ORDER BY bucket""",
        params_m,
    )
    # Keep only well-formed bucket labels.
    df_activity = df_activity[df_activity["bucket"].str.match(bucket_re, na=False)]
    if not df_activity.empty:
        fig = px.bar(
            df_activity, x="bucket", y="new_repos",
            labels={"bucket": bucket_label, "new_repos": "New repositories"},
            color="new_repos", color_continuous_scale="Blues",
        )
        fig.update_layout(coloraxis_showscale=False, xaxis_title=bucket_label, yaxis_title="New repos")
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("Not enough data for this period.")
    # Fastest growing repos this period (by stars delta proxy: recently created + high stars)
    st.markdown(f"#### 🌟 Top New Repos — {period_label}")
    st.caption("Highest-starred repositories created in the selected period.")
    conds_nr, params_nr = build_where(base_table="r")
    conds_nr.append("r.created_at >= ?")
    params_nr.append(cutoff_recent)
    # NOTE(review): same redundant first assignment as w_m above.
    w_nr = ("WHERE " + " AND ".join(conds_nr)) if conds_nr else ""
    tj_nr = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
    if sel_tags:
        ph = ",".join(["?"] * len(sel_tags))
        conds_nr.append(f"rt.tag IN ({ph})")
        params_nr.extend(sel_tags)
    w_nr = ("WHERE " + " AND ".join(conds_nr)) if conds_nr else ""
    df_new_repos = query_df(
        f"""SELECT r.name, r.owner, r.country, r.language, r.stars, r.created_at, r.html_url
        FROM repositories r {tj_nr} {w_nr}
        GROUP BY r.html_url ORDER BY r.stars DESC LIMIT 20""",
        params_nr,
    )
    if not df_new_repos.empty:
        st.dataframe(
            df_new_repos,
            column_config={
                "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                "stars": st.column_config.NumberColumn("⭐ Stars"),
                "created_at": st.column_config.TextColumn("Created"),
            },
            use_container_width=True, hide_index=True,
        )
    else:
        st.info("No new repos in this period.")
    st.divider()
    # ---- 2. Tag momentum ----
    st.markdown(f"### 🚀 Tag Momentum — {period_label} vs prior {period_label.lower()}")
    st.caption(
        f"Tags ranked by growth — repos created in the selected period vs the equivalent period before it. "
        "Higher ratio = faster-growing category."
    )
    # Minimum repo count scales with period to avoid noise on short windows
    min_repos = max(2, period_days // 60)
    # Placeholder order: the three CASE cutoffs (SELECT list) bind first, then
    # the tag-filter params (WHERE), then the base-filter params (and_base).
    df_momentum = query_df(
        f"""
        SELECT
            rt.tag,
            COUNT(DISTINCT CASE WHEN r.created_at >= ? THEN r.html_url END) as recent,
            COUNT(DISTINCT CASE WHEN r.created_at >= ? AND r.created_at < ? THEN r.html_url END) as prior
        FROM repository_tags rt
        JOIN repositories r ON rt.html_url = r.html_url
        WHERE r.created_at IS NOT NULL AND {tf_sql_mom} {and_base}
        GROUP BY rt.tag
        HAVING recent >= {min_repos} AND prior >= {min_repos}
        ORDER BY (CAST(recent AS FLOAT) / prior) DESC
        LIMIT 30
        """,
        [cutoff_recent, cutoff_prior, cutoff_recent] + tf_params_mom + params_base,
    )
    if not df_momentum.empty:
        # HAVING prior >= min_repos guarantees prior > 0, so the ratio is safe.
        df_momentum["growth_ratio"] = (df_momentum["recent"] / df_momentum["prior"]).round(2)
        df_momentum["growth_pct"] = ((df_momentum["growth_ratio"] - 1) * 100).round(1)
        col_m1, col_m2 = st.columns(2)
        with col_m1:
            st.markdown(f"**Fastest growing tags**")
            fig = px.bar(
                df_momentum.head(20), x="growth_ratio", y="tag", orientation="h",
                color="growth_ratio", color_continuous_scale="Greens",
                labels={"growth_ratio": "Growth ratio (recent / prior)", "tag": "Tag"},
                hover_data={"recent": True, "prior": True, "growth_pct": True},
            )
            fig.update_layout(
                yaxis=dict(autorange="reversed"), height=550,
                coloraxis_showscale=False, xaxis_title="Growth ratio", yaxis_title="",
            )
            fig.add_vline(x=1.0, line_dash="dash", line_color="grey", annotation_text="no change")
            st.plotly_chart(fig, use_container_width=True)
        with col_m2:
            st.markdown(f"**Top tags by volume**")
            df_recent_top = query_df(
                f"""
                SELECT rt.tag, COUNT(DISTINCT r.html_url) as recent_count
                FROM repository_tags rt JOIN repositories r ON rt.html_url = r.html_url
                WHERE r.created_at >= ? AND {tf_sql_mom} {and_base}
                GROUP BY rt.tag ORDER BY recent_count DESC LIMIT 20
                """,
                [cutoff_recent] + tf_params_mom + params_base,
            )
            if not df_recent_top.empty:
                fig2 = px.bar(
                    df_recent_top, x="recent_count", y="tag", orientation="h",
                    color="recent_count", color_continuous_scale="Purples",
                    labels={"recent_count": f"Repos ({period_label.lower()})", "tag": "Tag"},
                )
                fig2.update_layout(
                    yaxis=dict(autorange="reversed"), height=550,
                    coloraxis_showscale=False,
                    xaxis_title=f"Repos ({period_label.lower()})", yaxis_title="",
                )
                st.plotly_chart(fig2, use_container_width=True)
        # Emerging tags table
        st.markdown("**Emerging tags** — ratio > 1.5")
        df_emerging = df_momentum[df_momentum["growth_ratio"] >= 1.5][
            ["tag", "recent", "prior", "growth_ratio", "growth_pct"]
        ].rename(columns={
            "tag": "Tag", "recent": period_label, "prior": f"Prior {period_label.lower()}",
            "growth_ratio": "Ratio", "growth_pct": "Growth %"
        })
        if not df_emerging.empty:
            st.dataframe(df_emerging, use_container_width=True, hide_index=True)
        else:
            st.info("No tags with >50% growth in this period.")
    else:
        st.info("Not enough tagged data for this period — try a longer window.")
    st.divider()
    # ---- 3. Language trends ----
    # NOTE(review): this section ignores the sidebar filters and the selected
    # period (always 2015..now) — confirm that is intentional.
    st.markdown("### 💻 Language Trends")
    st.caption("Year-over-year share of new repositories by primary language — top 10 languages.")
    df_lang_year = query_df(
        f"""
        SELECT SUBSTR(r.created_at, 1, 4) as year, r.language,
               COUNT(DISTINCT r.html_url) as count
        FROM repositories r
        WHERE r.language IS NOT NULL AND r.language != ''
          AND r.created_at IS NOT NULL
          AND r.language IN (
              SELECT language FROM repositories
              WHERE language IS NOT NULL AND language != ''
              GROUP BY language ORDER BY COUNT(*) DESC LIMIT 10
          )
          AND SUBSTR(r.created_at, 1, 4) BETWEEN '2015' AND SUBSTR(?, 1, 4)
        GROUP BY year, r.language ORDER BY year
        """,
        [now_str],
    )
    df_lang_year = df_lang_year[df_lang_year["year"].str.match(r"^\d{4}$", na=False)]
    if not df_lang_year.empty:
        fig_lang = px.line(
            df_lang_year, x="year", y="count", color="language",
            labels={"year": "Year", "count": "New repositories", "language": "Language"},
            markers=True,
        )
        fig_lang.update_layout(legend=dict(orientation="h", y=-0.25))
        st.plotly_chart(fig_lang, use_container_width=True)
        st.caption("As a share of all new repos that year (top 10 languages).")
        # Normalise per-year counts to percentage share of that year's total.
        df_totals = df_lang_year.groupby("year")["count"].sum().reset_index().rename(columns={"count": "total"})
        df_share = df_lang_year.merge(df_totals, on="year")
        df_share["share"] = (df_share["count"] / df_share["total"] * 100).round(1)
        fig_share = px.area(
            df_share, x="year", y="share", color="language",
            labels={"year": "Year", "share": "Share of new repos (%)", "language": "Language"},
            groupnorm="",
        )
        fig_share.update_layout(legend=dict(orientation="h", y=-0.25), yaxis_title="Share (%)")
        st.plotly_chart(fig_share, use_container_width=True)
    else:
        st.info("Not enough data for language trends.")
# ==================== ABOUT ====================
with tab_about:
    st.subheader("About GovTech GitHub Explorer")
    st.write(
        """
        **GovTech GitHub Explorer** maps the global landscape of government open source software.
        It discovers, scrapes, and automatically categorises every public GitHub repository
        belonging to government organisations worldwide β€” updated weekly.
        """
    )
    # Four-step pipeline overview, one column per stage.
    st.subheader("How it works")
    col_a1, col_a2, col_a3, col_a4 = st.columns(4)
    with col_a1:
        st.markdown("### πŸ” Discover")
        st.write("Government GitHub accounts are sourced from the [government.github.com](https://github.com/github/government.github.com) registry β€” ~2,000 organisations across 100+ countries.")
    with col_a2:
        st.markdown("### πŸ•·οΈ Scrape")
        st.write("Repository metadata is collected via the GitHub API using a GitHub App installation, giving high-throughput authenticated access.")
    with col_a3:
        st.markdown("### 🏷️ Tag")
        st.write("An LLM pipeline (Qwen3-32B via OpenRouter) reads each repository's metadata and README, then assigns structured tags and categories.")
    with col_a4:
        st.markdown("### πŸ“Š Explore")
        st.write("Tags are clustered into groups using embedding similarity, and the full dataset is published to HuggingFace for anyone to use.")
    st.divider()
    # Headline dataset metrics, straight from the DB.
    st.subheader("Data")
    col_d1, col_d2, col_d3 = st.columns(3)
    total_a = query_one("SELECT COUNT(*) FROM repositories")
    tagged_a = query_one("SELECT COUNT(DISTINCT html_url) FROM repository_tags")
    tag_count_a = query_one("SELECT COUNT(*) FROM tags")
    col_d1.metric("Repositories", f"{total_a:,}")
    col_d2.metric("Tagged", f"{tagged_a:,}")
    col_d3.metric("Unique tags", f"{tag_count_a:,}")
    st.write(
        "The full dataset β€” including repo metadata, tags, and tag groups β€” is available on "
        "[HuggingFace](https://huggingface.co/datasets/AndreasThinks/government-github-repos) "
        "in CSV, Parquet, and SQLite formats. Updated every Sunday."
    )
    st.divider()
    st.subheader("Contribute")
    st.write(
        "Know a government GitHub organisation that's missing from the dataset? "
        "Submit it via a pull request β€” it'll be included in the next weekly scrape."
    )
    st.markdown(
        "πŸ“‹ **[How to submit a missing organisation](https://github.com/AndreasThinks/open-govtech-report/blob/main/CONTRIBUTING.md)**"
    )
    st.write(
        "The scraper, tagger, and dashboard are all open source. "
        "Issues and pull requests welcome."
    )
    st.markdown("[github.com/AndreasThinks/open-govtech-report](https://github.com/AndreasThinks/open-govtech-report)")
    st.divider()
    # No raw HTML in this markdown string, so unsafe_allow_html is not needed
    # (keeping the flag off avoids an unnecessary HTML-injection surface).
    st.markdown(
        "✨ A project by [AndreasThinks](https://andreasthinks.me), built with ❀️ using Streamlit, "
        "and some ✨vibes✨"
    )
st.divider()
# Global footer: data provenance plus quick links (repo, dataset, contribution guide).
_footer_text = (
    "Data sourced from government GitHub accounts worldwide. Updated weekly. "
    "| [GitHub](https://github.com/AndreasThinks/open-govtech-report) "
    "| [Dataset](https://huggingface.co/datasets/AndreasThinks/government-github-repos) "
    "| [βž• Submit a missing org](https://github.com/AndreasThinks/open-govtech-report/blob/main/CONTRIBUTING.md) "
    "| ✨ A project by [AndreasThinks](https://andreasthinks.me)"
)
st.caption(_footer_text)