File size: 43,632 Bytes
63542f9
 
 
 
 
 
 
f2cc925
63542f9
2d97c94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2cc925
63542f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2cc925
63542f9
 
f2cc925
63542f9
 
 
 
ce8c066
 
 
 
 
 
 
 
 
 
 
 
f2cc925
 
 
 
 
 
 
 
 
2d97c94
 
f2cc925
2d97c94
 
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2db0de9
 
 
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b69f783
 
63542f9
 
f2cc925
63542f9
 
f2cc925
 
 
63542f9
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63542f9
 
f2cc925
63542f9
f2cc925
 
63542f9
 
 
 
 
 
f2cc925
63542f9
f2cc925
 
63542f9
f2cc925
 
 
 
63542f9
 
f2cc925
63542f9
f2cc925
 
 
 
 
63542f9
f2cc925
 
 
 
63542f9
 
 
f2cc925
 
 
 
 
63542f9
 
f2cc925
 
 
 
 
 
 
 
63542f9
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63542f9
 
 
 
f2cc925
63542f9
f2cc925
 
63542f9
f2cc925
 
 
 
 
 
63542f9
f2cc925
63542f9
 
f2cc925
 
 
 
 
 
63542f9
 
f2cc925
63542f9
 
 
 
 
 
f2cc925
 
 
63542f9
f2cc925
 
 
 
 
63542f9
 
f2cc925
63542f9
 
 
 
 
 
 
 
 
 
 
 
 
 
f2cc925
 
 
63542f9
 
 
 
 
 
 
 
 
 
 
f2cc925
 
63542f9
 
 
f2cc925
63542f9
 
 
f2cc925
63542f9
 
f2cc925
 
 
 
2d97c94
 
f2cc925
 
 
 
 
 
2d97c94
 
 
63542f9
f2cc925
 
 
 
 
 
 
 
63542f9
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63542f9
f2cc925
63542f9
f2cc925
 
 
 
 
 
 
 
63542f9
f2cc925
 
 
 
63542f9
 
 
 
 
 
 
 
f2cc925
 
63542f9
 
 
 
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63542f9
f2cc925
63542f9
 
 
 
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63542f9
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
63542f9
 
 
f2cc925
 
 
63542f9
 
f2cc925
 
 
 
 
63542f9
 
 
 
 
 
 
 
 
 
 
 
 
f2cc925
63542f9
f2cc925
 
 
 
 
 
 
 
63542f9
f2cc925
 
63542f9
 
 
f2cc925
63542f9
 
 
 
 
f2cc925
 
 
 
 
 
 
 
 
 
 
 
 
 
63542f9
 
f2cc925
 
63542f9
 
 
 
 
f2cc925
b69f783
 
 
 
 
b2f76a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b69f783
 
 
b2f76a4
b69f783
 
 
 
 
 
 
 
b2f76a4
 
b69f783
b2f76a4
b69f783
 
b2f76a4
 
b69f783
b2f76a4
 
b69f783
 
b2f76a4
b69f783
 
b2f76a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b69f783
 
 
 
b2f76a4
b69f783
b2f76a4
b69f783
 
 
b2f76a4
 
b69f783
 
 
 
 
 
 
 
 
2d97c94
b69f783
b2f76a4
b69f783
 
 
b2f76a4
b69f783
 
 
 
 
 
 
 
 
b2f76a4
b69f783
 
 
 
 
 
 
 
 
 
 
 
 
 
b2f76a4
b69f783
 
 
 
2d97c94
b69f783
 
b2f76a4
b69f783
 
 
 
 
b2f76a4
b69f783
 
 
b2f76a4
 
b69f783
 
 
 
b2f76a4
b69f783
 
 
b2f76a4
b69f783
 
 
 
 
 
 
b2f76a4
b69f783
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2db0de9
 
 
 
 
 
 
 
b69f783
 
2db0de9
b69f783
 
 
 
 
 
 
 
 
 
 
63542f9
ce8c066
 
f2cc925
ce8c066
f2cc925
b69f783
2db0de9
b69f783
f2cc925
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import sqlite3
import os
import logging
from datetime import datetime, timedelta, timezone

# Tags that describe the dataset itself rather than individual repos β€” excluded from all charts/filters
BLOCKLIST_TAGS = frozenset([
    "government", "open-source", "public-sector", "open-government",
    "government-software", "government-tool", "government-project",
    "government-repository", "government-platform", "government-code",
])

# Tags that duplicate the language field already in the schema
LANGUAGE_TAGS = frozenset([
    "javascript", "python", "java", "typescript", "html", "css", "php",
    "ruby", "shell", "r", "scala", "c#", "kotlin", "go", "rust", "c",
    "c++", "perl", "swift", "matlab", "bash", "json", "xml", "yaml",
    "sql", "makefile",
])

# Combined filter β€” tags to hide from dashboard display
EXCLUDED_TAGS = BLOCKLIST_TAGS | LANGUAGE_TAGS

# Minimum repos a tag must appear in to show in charts/filters
MIN_TAG_REPOS = 2

def _tag_filter_sql(tag_col: str = "tag") -> str:
    """Return a SQL fragment excluding noise tags. Use with AND."""
    excluded = EXCLUDED_TAGS
    ph = ",".join(["?"] * len(excluded))
    return f"{tag_col} NOT IN ({ph}) AND {tag_col} IS NOT NULL"

def _tag_filter_params() -> list:
    return list(EXCLUDED_TAGS)

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S")
logger = logging.getLogger("govtech-dashboard")
logger.info("Starting GovTech Dashboard...")

st.set_page_config(
    page_title="GovTech GitHub Explorer",
    page_icon="πŸ›οΈ",
    layout="wide",
)


def get_db_path():
    candidates = [
        os.path.join(os.path.dirname(__file__), "..", "govtech.db"),
        os.path.join(os.path.dirname(__file__), "govtech.db"),
        "govtech.db",
        "../govtech.db",
    ]
    for p in candidates:
        if os.path.exists(p):
            logger.info(f"Found local DB: {os.path.abspath(p)}")
            return os.path.abspath(p)
    logger.info("No local DB found, downloading from HuggingFace Hub...")
    try:
        from huggingface_hub import hf_hub_download
        path = hf_hub_download(
            repo_id="AndreasThinks/government-github-repos",
            filename="data/govtech.db",
            repo_type="dataset",
        )
        logger.info(f"Downloaded DB to: {path}")
        return path
    except Exception as e:
        logger.error(f"Failed to download DB: {e}")
        st.error(f"Could not find or download govtech.db: {e}")
        st.stop()


DB_PATH = get_db_path()


def get_conn():
    return sqlite3.connect(DB_PATH, check_same_thread=False)


@st.cache_data(ttl=300)
def query_df(sql, params=None):
    conn = get_conn()
    df = pd.read_sql_query(sql, conn, params=params or [])
    conn.close()
    return df


@st.cache_data(ttl=300)
def query_one(sql, params=None):
    conn = get_conn()
    cur = conn.cursor()
    result = cur.execute(sql, params or []).fetchone()[0]
    conn.close()
    return result


@st.cache_data(ttl=300)
def get_last_updated_display():
    iso = query_one("SELECT MAX(last_scraped) FROM repositories")
    if not iso:
        return None
    try:
        dt = datetime.fromisoformat(iso).astimezone(timezone.utc)
        return dt.strftime("%Y-%m-%d %H:%M UTC")
    except (ValueError, TypeError):
        return None


@st.cache_data(ttl=600)
def load_filter_options():
    conn = get_conn()
    countries = pd.read_sql_query(
        "SELECT DISTINCT country FROM repositories WHERE country IS NOT NULL AND country != '' ORDER BY country", conn
    )["country"].tolist()
    languages = pd.read_sql_query(
        "SELECT DISTINCT language FROM repositories WHERE language IS NOT NULL AND language != '' ORDER BY language", conn
    )["language"].tolist()
    tf_sql = _tag_filter_sql()
    tf_params = _tag_filter_params()
    tags = pd.read_sql_query(
        f"SELECT tag, COUNT(DISTINCT html_url) as c FROM repository_tags WHERE {tf_sql} GROUP BY tag HAVING c >= {MIN_TAG_REPOS} ORDER BY c DESC",
        conn, params=tf_params
    )["tag"].tolist()
    orgs = pd.read_sql_query(
        "SELECT owner, COUNT(*) as c FROM repositories GROUP BY owner ORDER BY c DESC LIMIT 300", conn
    )["owner"].tolist()
    conn.close()
    return countries, languages, tags, orgs


# ==================== SIDEBAR FILTERS ====================
st.sidebar.title("πŸ”­ Filters")
st.sidebar.caption("Applied across all tabs")

countries, languages, tags, orgs = load_filter_options()

sel_countries = st.sidebar.multiselect("🌍 Country", countries, key="g_country")
sel_orgs = st.sidebar.multiselect("🏒 Organisation", orgs, key="g_org")
sel_languages = st.sidebar.multiselect("πŸ’» Language", languages, key="g_lang")
sel_tags = st.sidebar.multiselect("🏷️ Tag", tags, key="g_tag")

st.sidebar.divider()
st.sidebar.subheader("πŸ“… Activity")
activity_options = {
    "All time": None,
    "Active last 3 months": 90,
    "Active last 6 months": 180,
    "Active last 12 months": 365,
    "Active last 2 years": 730,
}
activity_label = st.sidebar.selectbox("Last pushed", list(activity_options.keys()), index=0, key="g_activity")
activity_days = activity_options[activity_label]

st.sidebar.divider()
show_archived = st.sidebar.checkbox("Include archived", value=False, key="g_arch")
show_forks = st.sidebar.checkbox("Include forks", value=True, key="g_forks")
min_stars = st.sidebar.slider("Min stars", 0, 500, 0, key="g_stars")

st.sidebar.divider()
st.sidebar.markdown(
    "πŸ”— [GitHub](https://github.com/AndreasThinks/open-govtech-report)  |  "
    "[Dataset](https://huggingface.co/datasets/AndreasThinks/government-github-repos)  |  "
    "[βž• Submit a missing org](https://github.com/AndreasThinks/open-govtech-report/blob/main/CONTRIBUTING.md)",
    unsafe_allow_html=True,
)


def build_where(extra_conditions=None, base_table="r", tag_table="rt"):
    """Build a WHERE clause and params list from global sidebar filters."""
    conditions = list(extra_conditions or [])
    params = []

    if sel_countries:
        ph = ",".join(["?"] * len(sel_countries))
        conditions.append(f"{base_table}.country IN ({ph})")
        params.extend(sel_countries)

    if sel_orgs:
        ph = ",".join(["?"] * len(sel_orgs))
        conditions.append(f"{base_table}.owner IN ({ph})")
        params.extend(sel_orgs)

    if sel_languages:
        ph = ",".join(["?"] * len(sel_languages))
        conditions.append(f"{base_table}.language IN ({ph})")
        params.extend(sel_languages)

    if activity_days:
        cutoff = (datetime.now(timezone.utc) - timedelta(days=activity_days)).strftime("%Y-%m-%dT%H:%M:%SZ")
        conditions.append(f"{base_table}.pushed_at >= ?")
        params.append(cutoff)

    if not show_archived:
        conditions.append(f"({base_table}.archived = 0 OR {base_table}.archived IS NULL)")

    if not show_forks:
        conditions.append(f"({base_table}.fork = 0 OR {base_table}.fork IS NULL)")

    if min_stars > 0:
        conditions.append(f"{base_table}.stars >= ?")
        params.append(min_stars)

    return conditions, params


def build_tag_join_where(extra_conditions=None):
    """Build WHERE for queries that need to join repository_tags for tag filter."""
    conditions, params = build_where(extra_conditions)
    tag_join = ""
    if sel_tags:
        tag_join = "JOIN repository_tags rt ON r.html_url = rt.html_url"
        ph = ",".join(["?"] * len(sel_tags))
        conditions.append(f"rt.tag IN ({ph})")
        params.extend(sel_tags)
    where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
    return where, params, tag_join


# ==================== HEADER ====================
st.title("πŸ›οΈ GovTech GitHub Explorer")
st.caption("Exploring 70k+ government GitHub repositories worldwide")

# ==================== TABS ====================
tab_overview, tab_explorer, tab_tags, tab_insights, tab_trends, tab_about = st.tabs(
    ["πŸ“Š Overview", "πŸ” Explorer", "🏷️ Tags", "πŸ’‘ Insights", "πŸ“ˆ Trends", "ℹ️ About"]
)


# ==================== OVERVIEW ====================
with tab_overview:
    where, params, tag_join = build_tag_join_where()

    total_filtered = query_one(f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tag_join} {where}", params)
    account_count = query_one("SELECT COUNT(*) FROM accounts")
    country_count_val = query_one(
        f"SELECT COUNT(DISTINCT r.country) FROM repositories r {tag_join} {where}", params
    )

    # Active in last 12m within filtered set
    active_cutoff = (datetime.now(timezone.utc) - timedelta(days=365)).strftime("%Y-%m-%dT%H:%M:%SZ")
    active_conditions, active_params = build_where([f"r.pushed_at >= ?"])
    active_params_full = active_params.copy()
    active_params_full.insert(
        len(active_params) - 1 if active_params else 0, active_cutoff
    )
    # Simpler: just count directly
    conn = get_conn()
    conds_12m, p_12m = build_where()
    conds_12m.append("r.pushed_at >= ?")
    p_12m.append(active_cutoff)
    tj2 = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
    if sel_tags:
        ph = ",".join(["?"] * len(sel_tags))
        conds_12m.append(f"rt.tag IN ({ph})")
        p_12m.extend(sel_tags)
    w12 = ("WHERE " + " AND ".join(conds_12m)) if conds_12m else ""
    active_12m = conn.execute(
        f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tj2} {w12}", p_12m
    ).fetchone()[0]
    conn.close()

    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Repositories", f"{total_filtered:,}")
    c2.metric("Accounts", f"{account_count:,}")
    c3.metric("Countries", country_count_val)
    c4.metric("Active last 12m", f"{active_12m:,}", help="Repos with a push in the last 12 months")

    st.divider()

    col_left, col_right = st.columns(2)

    with col_left:
        st.subheader("Top Countries by Repositories")
        df_countries = query_df(
            f"SELECT r.country, COUNT(DISTINCT r.html_url) as count FROM repositories r {tag_join} {where} GROUP BY r.country ORDER BY count DESC LIMIT 20",
            params,
        )
        if not df_countries.empty:
            fig = px.bar(df_countries, x="country", y="count", color="count", color_continuous_scale="Blues")
            fig.update_layout(showlegend=False, xaxis_title="Country", yaxis_title="Repositories", coloraxis_showscale=False)
            st.plotly_chart(fig, use_container_width=True)

    with col_right:
        st.subheader("Top Languages")
        df_langs = query_df(
            f"""SELECT r.language, COUNT(DISTINCT r.html_url) as count
                FROM repositories r {tag_join} {where}
                {"AND" if where else "WHERE"} r.language IS NOT NULL AND r.language != ''
                GROUP BY r.language ORDER BY count DESC LIMIT 15""",
            params,
        )
        if not df_langs.empty:
            fig = px.bar(df_langs, x="count", y="language", orientation="h", color="count", color_continuous_scale="Greens")
            fig.update_layout(showlegend=False, yaxis=dict(autorange="reversed"), xaxis_title="Repositories", yaxis_title="", coloraxis_showscale=False)
            st.plotly_chart(fig, use_container_width=True)

    st.subheader("Repository Creation Timeline")
    df_timeline = query_df(
        f"""SELECT SUBSTR(r.created_at, 1, 4) as year, COUNT(DISTINCT r.html_url) as count
            FROM repositories r {tag_join} {where}
            {"AND" if where else "WHERE"} r.created_at IS NOT NULL
            GROUP BY year ORDER BY year""",
        params,
    )
    df_timeline = df_timeline[df_timeline["year"].str.match(r"^\d{4}$", na=False)]

    # Also pull active repos per year (pushed_at within 12m of each year-end β€” proxy: pushed in that year or later)
    df_pushed = query_df(
        f"""SELECT SUBSTR(r.pushed_at, 1, 4) as year, COUNT(DISTINCT r.html_url) as active
            FROM repositories r {tag_join} {where}
            {"AND" if where else "WHERE"} r.pushed_at IS NOT NULL
            GROUP BY year ORDER BY year""",
        params,
    )
    df_pushed = df_pushed[df_pushed["year"].str.match(r"^\d{4}$", na=False)]

    if not df_timeline.empty:
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=df_timeline["year"], y=df_timeline["count"],
            name="Created", fill="tozeroy", mode="lines",
            line=dict(color="#3b82f6"), fillcolor="rgba(59,130,246,0.2)"
        ))
        if not df_pushed.empty:
            fig.add_trace(go.Scatter(
                x=df_pushed["year"], y=df_pushed["active"],
                name="Last pushed", fill="tozeroy", mode="lines",
                line=dict(color="#10b981"), fillcolor="rgba(16,185,129,0.15)"
            ))
        fig.update_layout(xaxis_title="Year", yaxis_title="Repositories", legend=dict(orientation="h"))
        st.plotly_chart(fig, use_container_width=True)
        st.caption("'Last pushed' shows when repositories last received a commit β€” a proxy for active maintenance.")


# ==================== EXPLORER ====================
with tab_explorer:
    where_e, params_e, tag_join_e = build_tag_join_where()

    # Extra local search
    search_text = st.text_input("Search name / description", key="exp_search")
    if search_text:
        where_e_conds, _ = build_where()
        where_e_conds.append("(r.name LIKE ? OR r.description LIKE ?)")
        params_e_local = params_e + [f"%{search_text}%", f"%{search_text}%"]
        where_e_local = ("WHERE " + " AND ".join(where_e_conds + (["(r.name LIKE ? OR r.description LIKE ?)"] if search_text else []))) if where_e_conds else ""
    else:
        params_e_local = params_e

    sort_col = st.selectbox("Sort by", ["stars", "forks", "pushed_at", "created_at"], key="exp_sort")

    conn = get_conn()
    count_sql = f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tag_join_e} {where_e}"
    if search_text:
        extra = " AND (r.name LIKE ? OR r.description LIKE ?)"
        total_results = conn.execute(count_sql + extra, params_e + [f"%{search_text}%", f"%{search_text}%"]).fetchone()[0]
    else:
        total_results = conn.execute(count_sql, params_e).fetchone()[0]
    conn.close()

    st.write(f"**{total_results:,}** repositories match current filters")

    page_size = 50
    total_pages = max(1, (total_results + page_size - 1) // page_size)
    page = st.number_input("Page", min_value=1, max_value=total_pages, value=1, key="exp_page")
    offset = (page - 1) * page_size

    search_clause = " AND (r.name LIKE ? OR r.description LIKE ?)" if search_text else ""
    search_params = [f"%{search_text}%", f"%{search_text}%"] if search_text else []

    data_sql = f"""
        SELECT r.html_url, r.name, r.owner, r.country, r.language, r.stars, r.forks,
               r.license, r.created_at, r.pushed_at, r.archived, r.fork
        FROM repositories r {tag_join_e} {where_e} {search_clause}
        GROUP BY r.html_url
        ORDER BY r.{sort_col} DESC
        LIMIT ? OFFSET ?
    """
    df_results = query_df(data_sql, params_e + search_params + [page_size, offset])

    if not df_results.empty:
        st.dataframe(
            df_results,
            column_config={
                "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                "name": st.column_config.TextColumn("Name"),
                "owner": st.column_config.TextColumn("Owner"),
                "country": st.column_config.TextColumn("Country"),
                "language": st.column_config.TextColumn("Language"),
                "stars": st.column_config.NumberColumn("⭐ Stars"),
                "forks": st.column_config.NumberColumn("🍴 Forks"),
                "license": st.column_config.TextColumn("License"),
                "created_at": st.column_config.TextColumn("Created"),
                "pushed_at": st.column_config.TextColumn("Last pushed"),
                "archived": st.column_config.CheckboxColumn("Archived"),
                "fork": st.column_config.CheckboxColumn("Fork"),
            },
            use_container_width=True,
            hide_index=True,
        )
        st.caption(f"Page {page} of {total_pages}")
    else:
        st.info("No repositories match the current filters.")


# ==================== TAGS ====================
with tab_tags:
    where_t, params_t, tag_join_t = build_tag_join_where()

    tagged_count_t = query_one("SELECT COUNT(DISTINCT html_url) FROM repository_tags")
    total_repos_t = query_one("SELECT COUNT(*) FROM repositories")

    if tagged_count_t < total_repos_t * 0.99:
        pct = tagged_count_t / total_repos_t * 100 if total_repos_t > 0 else 0
        st.info(
            f"πŸ—οΈ **Tagging in progress** β€” {tagged_count_t:,} of {total_repos_t:,} repositories tagged ({pct:.1f}%). "
            "Results below reflect partially tagged data."
        )

    col_tl, col_tr = st.columns(2)

    with col_tl:
        st.subheader("Top Tags")
        tf_sql_t = _tag_filter_sql("rt2.tag")
        tf_params_t = _tag_filter_params()
        df_top_tags = query_df(
            f"""SELECT rt2.tag, COUNT(DISTINCT r.html_url) as count
                FROM repositories r
                JOIN repository_tags rt2 ON r.html_url = rt2.html_url
                {tag_join_t.replace("rt", "rt_f") if sel_tags else ""}
                {where_t.replace("rt.", "rt2.") if where_t else ""}
                {"AND" if where_t else "WHERE"} {tf_sql_t}
                GROUP BY rt2.tag HAVING count >= {MIN_TAG_REPOS} ORDER BY count DESC LIMIT 30""",
            params_t + tf_params_t,
        )
        if not df_top_tags.empty:
            fig = px.bar(
                df_top_tags, x="count", y="tag", orientation="h",
                color="count", color_continuous_scale="Purples",
            )
            fig.update_layout(yaxis=dict(autorange="reversed"), showlegend=False, height=600,
                              xaxis_title="Repositories", yaxis_title="", coloraxis_showscale=False)
            st.plotly_chart(fig, use_container_width=True)

    with col_tr:
        st.subheader("Tags by Year Created")
        st.caption("Repos tagged with each technology, by creation year β€” shows technology adoption over time.")

        # Pick top 10 tags for the chart
        if not df_top_tags.empty:
            top10_tags = df_top_tags.head(10)["tag"].tolist()
            ph = ",".join(["?"] * len(top10_tags))
            conds_ty, params_ty = build_where(base_table="r")
            conds_ty.append(f"rt3.tag IN ({ph})")
            params_ty.extend(top10_tags)
            conds_ty.append("r.created_at IS NOT NULL")
            w_ty = ("WHERE " + " AND ".join(conds_ty)) if conds_ty else ""
            df_tag_time = query_df(
                f"""SELECT SUBSTR(r.created_at,1,4) as year, rt3.tag, COUNT(DISTINCT r.html_url) as count
                    FROM repositories r JOIN repository_tags rt3 ON r.html_url = rt3.html_url
                    {w_ty}
                    GROUP BY year, rt3.tag ORDER BY year""",
                params_ty,
            )
            df_tag_time = df_tag_time[df_tag_time["year"].str.match(r"^\d{4}$", na=False)]
            if not df_tag_time.empty:
                fig = px.line(df_tag_time, x="year", y="count", color="tag",
                              labels={"year": "Year", "count": "Repos created", "tag": "Tag"})
                fig.update_layout(legend=dict(orientation="h", y=-0.3))
                st.plotly_chart(fig, use_container_width=True)

    st.divider()
    st.subheader("Browse Repos by Tag")
    browse_tags = df_top_tags["tag"].tolist() if not df_top_tags.empty else tags
    if browse_tags:
        sel_tag = st.selectbox("Select a tag", browse_tags, key="tag_browse")
        sort_tag = st.selectbox("Sort by", ["stars", "pushed_at", "created_at"], key="tag_sort")
        conds_br, params_br = build_where(base_table="r")
        conds_br.append("rt_b.tag = ?")
        params_br.append(sel_tag)
        w_br = ("WHERE " + " AND ".join(conds_br)) if conds_br else ""
        df_tag_repos = query_df(
            f"""SELECT r.name, r.owner, r.country, r.language, r.stars, r.pushed_at, rt_b.confidence, r.html_url
               FROM repository_tags rt_b JOIN repositories r ON rt_b.html_url = r.html_url
               {w_br} ORDER BY r.{sort_tag} DESC LIMIT 200""",
            params_br,
        )
        st.write(f"**{len(df_tag_repos)}** repos tagged with **{sel_tag}**")
        if not df_tag_repos.empty:
            st.dataframe(
                df_tag_repos,
                column_config={
                    "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                    "confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1),
                    "stars": st.column_config.NumberColumn("⭐ Stars"),
                    "pushed_at": st.column_config.TextColumn("Last pushed"),
                },
                use_container_width=True,
                hide_index=True,
            )

    st.divider()
    st.subheader("Tag Groups")
    df_groups = query_df("SELECT id, name, description FROM tag_groups ORDER BY name")
    if not df_groups.empty:
        for _, grp in df_groups.iterrows():
            with st.expander(f"πŸ“ {grp['name']}" + (f" β€” {grp['description']}" if grp["description"] else "")):
                df_members = query_df(
                    "SELECT tag FROM tag_group_members WHERE group_id = ? ORDER BY tag",
                    [int(grp["id"])]
                )
                if not df_members.empty:
                    st.write(", ".join(df_members["tag"].tolist()))
                else:
                    st.write("No tags in this group yet.")
    else:
        st.info("No tag groups defined yet.")


# ==================== INSIGHTS ====================
with tab_insights:
    where_i, params_i, tag_join_i = build_tag_join_where()

    col_ia, col_ib = st.columns(2)

    with col_ia:
        st.subheader("⭐ Most Starred")
        df_top = query_df(
            f"""SELECT r.name, r.owner, r.country, r.stars, r.language, r.pushed_at, r.html_url
                FROM repositories r {tag_join_i} {where_i}
                GROUP BY r.html_url ORDER BY r.stars DESC LIMIT 25""",
            params_i,
        )
        st.dataframe(
            df_top,
            column_config={
                "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                "stars": st.column_config.NumberColumn("⭐ Stars"),
                "pushed_at": st.column_config.TextColumn("Last pushed"),
            },
            use_container_width=True,
            hide_index=True,
        )

    with col_ib:
        st.subheader("πŸš€ Rising Stars (active last 12m, sorted by stars)")
        rising_cutoff = (datetime.now(timezone.utc) - timedelta(days=365)).strftime("%Y-%m-%dT%H:%M:%SZ")
        conds_r, params_r = build_where(base_table="r")
        conds_r.append("r.pushed_at >= ?")
        params_r.append(rising_cutoff)
        tj_r = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
        if sel_tags:
            ph = ",".join(["?"] * len(sel_tags))
            conds_r.append(f"rt.tag IN ({ph})")
            params_r.extend(sel_tags)
        w_r = ("WHERE " + " AND ".join(conds_r)) if conds_r else ""
        df_rising = query_df(
            f"""SELECT r.name, r.owner, r.country, r.stars, r.language, r.pushed_at, r.html_url
                FROM repositories r {tj_r} {w_r}
                GROUP BY r.html_url ORDER BY r.stars DESC LIMIT 25""",
            params_r,
        )
        st.dataframe(
            df_rising,
            column_config={
                "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                "stars": st.column_config.NumberColumn("⭐ Stars"),
                "pushed_at": st.column_config.TextColumn("Last pushed"),
            },
            use_container_width=True,
            hide_index=True,
        )

    st.divider()

    st.subheader("πŸ“ˆ Most Active Organisations")
    st.caption("Organisations ranked by number of repos with a push in the last 12 months.")
    conds_ao, params_ao = build_where(base_table="r")
    conds_ao.append("r.pushed_at >= ?")
    params_ao.append((datetime.now(timezone.utc) - timedelta(days=365)).strftime("%Y-%m-%dT%H:%M:%SZ"))
    tj_ao = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
    if sel_tags:
        ph = ",".join(["?"] * len(sel_tags))
        conds_ao.append(f"rt.tag IN ({ph})")
        params_ao.extend(sel_tags)
    w_ao = ("WHERE " + " AND ".join(conds_ao)) if conds_ao else ""
    df_active_orgs = query_df(
        f"""SELECT r.owner, r.country, COUNT(DISTINCT r.html_url) as active_repos,
                   SUM(r.stars) as total_stars
            FROM repositories r {tj_ao} {w_ao}
            GROUP BY r.owner ORDER BY active_repos DESC LIMIT 20""",
        params_ao,
    )
    col_org1, col_org2 = st.columns(2)
    with col_org1:
        if not df_active_orgs.empty:
            fig = px.bar(df_active_orgs, x="active_repos", y="owner", orientation="h",
                         color="active_repos", color_continuous_scale="Oranges",
                         labels={"active_repos": "Active repos (12m)", "owner": ""})
            fig.update_layout(yaxis=dict(autorange="reversed"), showlegend=False,
                              height=500, coloraxis_showscale=False)
            st.plotly_chart(fig, use_container_width=True)
    with col_org2:
        if not df_active_orgs.empty:
            st.dataframe(df_active_orgs, use_container_width=True, hide_index=True,
                         column_config={"total_stars": st.column_config.NumberColumn("⭐ Total stars"),
                                        "active_repos": st.column_config.NumberColumn("Active repos (12m)")})

    st.divider()

    col_ic, col_id = st.columns(2)

    with col_ic:
        st.subheader("πŸ“œ License Breakdown")
        df_lic = query_df(
            f"""SELECT r.license, COUNT(DISTINCT r.html_url) as count
                FROM repositories r {tag_join_i} {where_i}
                {"AND" if where_i else "WHERE"} r.license IS NOT NULL AND r.license != ''
                GROUP BY r.license ORDER BY count DESC""",
            params_i,
        )
        if not df_lic.empty:
            top_n = 10
            if len(df_lic) > top_n:
                top = df_lic.head(top_n)
                other = pd.DataFrame([{"license": "Other", "count": df_lic.iloc[top_n:]["count"].sum()}])
                df_lic_plot = pd.concat([top, other], ignore_index=True)
            else:
                df_lic_plot = df_lic
            fig = px.pie(df_lic_plot, names="license", values="count", hole=0.3)
            fig.update_traces(textposition="inside", textinfo="percent+label")
            st.plotly_chart(fig, use_container_width=True)

    with col_id:
        st.subheader("🍴 Fork vs Original")
        fork_count = query_one(
            f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tag_join_i} {where_i} {'AND' if where_i else 'WHERE'} r.fork = 1",
            params_i,
        )
        original_count = query_one(
            f"SELECT COUNT(DISTINCT r.html_url) FROM repositories r {tag_join_i} {where_i} {'AND' if where_i else 'WHERE'} (r.fork = 0 OR r.fork IS NULL)",
            params_i,
        )
        m1, m2 = st.columns(2)
        m1.metric("Original", f"{original_count:,}")
        m2.metric("Forked", f"{fork_count:,}")
        fig = px.pie(
            pd.DataFrame({"type": ["Original", "Fork"], "count": [original_count, fork_count]}),
            names="type", values="count", hole=0.4,
            color_discrete_sequence=["#2ecc71", "#e74c3c"],
        )
        st.plotly_chart(fig, use_container_width=True)

    st.divider()
    st.subheader("🌍 Language Γ— Country Heatmap")
    df_heat = query_df(
        f"""SELECT r.country, r.language, COUNT(DISTINCT r.html_url) as count
            FROM repositories r {tag_join_i} {where_i}
            {"AND" if where_i else "WHERE"} r.language IS NOT NULL AND r.language != ''
              AND r.country IN (
                SELECT country FROM repositories GROUP BY country ORDER BY COUNT(*) DESC LIMIT 15
              )
              AND r.language IN (
                SELECT language FROM repositories WHERE language IS NOT NULL AND language != ''
                GROUP BY language ORDER BY COUNT(*) DESC LIMIT 12
              )
            GROUP BY r.country, r.language""",
        params_i,
    )
    if not df_heat.empty:
        pivot = df_heat.pivot_table(index="country", columns="language", values="count", fill_value=0)
        fig = px.imshow(pivot, text_auto=True, color_continuous_scale="YlOrRd",
                        labels=dict(x="Language", y="Country", color="Repos"), aspect="auto")
        fig.update_layout(height=500)
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("Not enough data for heatmap.")


# ==================== TRENDS ====================
with tab_trends:
    st.subheader("πŸ“ˆ Trends Over Time")
    st.caption("How government open source has evolved β€” new activity, rising tags, and shifting languages.")

    # ---- Period selector ----
    PERIOD_OPTIONS = {
        "This week":     7,
        "This month":    30,
        "Last 3 months": 90,
        "Last 6 months": 180,
        "Last 12 months": 365,
        "Last 2 years":  730,
    }
    period_label = st.radio(
        "Period", list(PERIOD_OPTIONS.keys()), index=4,
        horizontal=True, key="trends_period",
    )
    period_days = PERIOD_OPTIONS[period_label]
    period_prior_days = period_days * 2  # prior window = same length, shifted back

    now_utc = datetime.now(timezone.utc)
    now_str = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
    cutoff_recent = (now_utc - timedelta(days=period_days)).strftime("%Y-%m-%dT%H:%M:%SZ")
    cutoff_prior  = (now_utc - timedelta(days=period_prior_days)).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Granularity: week/month buckets depending on period
    if period_days <= 30:
        bucket_fmt = "%Y-%W"   # ISO week
        bucket_label = "Week"
    elif period_days <= 365:
        bucket_fmt = "%Y-%m"   # Month
        bucket_label = "Month"
    else:
        bucket_fmt = "%Y-%m"
        bucket_label = "Month"

    # For the repos chart, show the bucket pattern for the selected period
    # SQLite STRFTIME format
    if period_days <= 30:
        sql_bucket = "STRFTIME('%Y-%W', r.created_at)"
        bucket_re = r"^\d{4}-\d{2}$"
    else:
        sql_bucket = "SUBSTR(r.created_at, 1, 7)"
        bucket_re = r"^\d{4}-\d{2}$"

    st.divider()

    conds_base, params_base = build_where(base_table="r")
    w_base = (" AND ".join(conds_base)) if conds_base else ""
    and_base = ("AND " + w_base) if w_base else ""
    tf_sql_mom = _tag_filter_sql("rt.tag")
    tf_params_mom = _tag_filter_params()

    # ---- 1. New repos per period ----
    st.markdown(f"### πŸ—“οΈ New Repositories β€” {period_label}")

    conds_m, params_m = build_where(base_table="r")
    conds_m.append("r.created_at >= ?")
    params_m.append(cutoff_recent)
    w_m = ("WHERE " + " AND ".join(conds_m)) if conds_m else ""
    tj_m = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
    if sel_tags:
        ph = ",".join(["?"] * len(sel_tags))
        conds_m.append(f"rt.tag IN ({ph})")
        params_m.extend(sel_tags)
        w_m = ("WHERE " + " AND ".join(conds_m)) if conds_m else ""

    df_activity = query_df(
        f"""SELECT {sql_bucket} as bucket, COUNT(DISTINCT r.html_url) as new_repos
            FROM repositories r {tj_m} {w_m}
            GROUP BY bucket ORDER BY bucket""",
        params_m,
    )
    df_activity = df_activity[df_activity["bucket"].str.match(bucket_re, na=False)]
    if not df_activity.empty:
        fig = px.bar(
            df_activity, x="bucket", y="new_repos",
            labels={"bucket": bucket_label, "new_repos": "New repositories"},
            color="new_repos", color_continuous_scale="Blues",
        )
        fig.update_layout(coloraxis_showscale=False, xaxis_title=bucket_label, yaxis_title="New repos")
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("Not enough data for this period.")

    # Fastest growing repos this period (by stars delta proxy: recently created + high stars)
    st.markdown(f"#### 🌟 Top New Repos β€” {period_label}")
    st.caption("Highest-starred repositories created in the selected period.")
    conds_nr, params_nr = build_where(base_table="r")
    conds_nr.append("r.created_at >= ?")
    params_nr.append(cutoff_recent)
    w_nr = ("WHERE " + " AND ".join(conds_nr)) if conds_nr else ""
    tj_nr = "JOIN repository_tags rt ON r.html_url = rt.html_url" if sel_tags else ""
    if sel_tags:
        ph = ",".join(["?"] * len(sel_tags))
        conds_nr.append(f"rt.tag IN ({ph})")
        params_nr.extend(sel_tags)
        w_nr = ("WHERE " + " AND ".join(conds_nr)) if conds_nr else ""
    df_new_repos = query_df(
        f"""SELECT r.name, r.owner, r.country, r.language, r.stars, r.created_at, r.html_url
            FROM repositories r {tj_nr} {w_nr}
            GROUP BY r.html_url ORDER BY r.stars DESC LIMIT 20""",
        params_nr,
    )
    if not df_new_repos.empty:
        st.dataframe(
            df_new_repos,
            column_config={
                "html_url": st.column_config.LinkColumn("URL", display_text="Open"),
                "stars": st.column_config.NumberColumn("⭐ Stars"),
                "created_at": st.column_config.TextColumn("Created"),
            },
            use_container_width=True, hide_index=True,
        )
    else:
        st.info("No new repos in this period.")

    st.divider()

    # ---- 2. Tag momentum ----
    st.markdown(f"### πŸš€ Tag Momentum β€” {period_label} vs prior {period_label.lower()}")
    st.caption(
        f"Tags ranked by growth β€” repos created in the selected period vs the equivalent period before it. "
        "Higher ratio = faster-growing category."
    )

    # Minimum repo count scales with period to avoid noise on short windows
    min_repos = max(2, period_days // 60)

    df_momentum = query_df(
        f"""
        SELECT
            rt.tag,
            COUNT(DISTINCT CASE WHEN r.created_at >= ? THEN r.html_url END) as recent,
            COUNT(DISTINCT CASE WHEN r.created_at >= ? AND r.created_at < ? THEN r.html_url END) as prior
        FROM repository_tags rt
        JOIN repositories r ON rt.html_url = r.html_url
        WHERE r.created_at IS NOT NULL AND {tf_sql_mom} {and_base}
        GROUP BY rt.tag
        HAVING recent >= {min_repos} AND prior >= {min_repos}
        ORDER BY (CAST(recent AS FLOAT) / prior) DESC
        LIMIT 30
        """,
        [cutoff_recent, cutoff_prior, cutoff_recent] + tf_params_mom + params_base,
    )

    if not df_momentum.empty:
        df_momentum["growth_ratio"] = (df_momentum["recent"] / df_momentum["prior"]).round(2)
        df_momentum["growth_pct"] = ((df_momentum["growth_ratio"] - 1) * 100).round(1)

        col_m1, col_m2 = st.columns(2)

        with col_m1:
            st.markdown(f"**Fastest growing tags**")
            fig = px.bar(
                df_momentum.head(20), x="growth_ratio", y="tag", orientation="h",
                color="growth_ratio", color_continuous_scale="Greens",
                labels={"growth_ratio": "Growth ratio (recent / prior)", "tag": "Tag"},
                hover_data={"recent": True, "prior": True, "growth_pct": True},
            )
            fig.update_layout(
                yaxis=dict(autorange="reversed"), height=550,
                coloraxis_showscale=False, xaxis_title="Growth ratio", yaxis_title="",
            )
            fig.add_vline(x=1.0, line_dash="dash", line_color="grey", annotation_text="no change")
            st.plotly_chart(fig, use_container_width=True)

        with col_m2:
            st.markdown(f"**Top tags by volume**")
            df_recent_top = query_df(
                f"""
                SELECT rt.tag, COUNT(DISTINCT r.html_url) as recent_count
                FROM repository_tags rt JOIN repositories r ON rt.html_url = r.html_url
                WHERE r.created_at >= ? AND {tf_sql_mom} {and_base}
                GROUP BY rt.tag ORDER BY recent_count DESC LIMIT 20
                """,
                [cutoff_recent] + tf_params_mom + params_base,
            )
            if not df_recent_top.empty:
                fig2 = px.bar(
                    df_recent_top, x="recent_count", y="tag", orientation="h",
                    color="recent_count", color_continuous_scale="Purples",
                    labels={"recent_count": f"Repos ({period_label.lower()})", "tag": "Tag"},
                )
                fig2.update_layout(
                    yaxis=dict(autorange="reversed"), height=550,
                    coloraxis_showscale=False,
                    xaxis_title=f"Repos ({period_label.lower()})", yaxis_title="",
                )
                st.plotly_chart(fig2, use_container_width=True)

        # Emerging tags table
        st.markdown("**Emerging tags** β€” ratio > 1.5")
        df_emerging = df_momentum[df_momentum["growth_ratio"] >= 1.5][
            ["tag", "recent", "prior", "growth_ratio", "growth_pct"]
        ].rename(columns={
            "tag": "Tag", "recent": period_label, "prior": f"Prior {period_label.lower()}",
            "growth_ratio": "Ratio", "growth_pct": "Growth %"
        })
        if not df_emerging.empty:
            st.dataframe(df_emerging, use_container_width=True, hide_index=True)
        else:
            st.info("No tags with >50% growth in this period.")
    else:
        st.info("Not enough tagged data for this period β€” try a longer window.")

    st.divider()

    # ---- 3. Language trends ----
    st.markdown("### πŸ’» Language Trends")
    st.caption("Year-over-year share of new repositories by primary language β€” top 10 languages.")

    df_lang_year = query_df(
        f"""
        SELECT SUBSTR(r.created_at, 1, 4) as year, r.language,
               COUNT(DISTINCT r.html_url) as count
        FROM repositories r
        WHERE r.language IS NOT NULL AND r.language != ''
          AND r.created_at IS NOT NULL
          AND r.language IN (
              SELECT language FROM repositories
              WHERE language IS NOT NULL AND language != ''
              GROUP BY language ORDER BY COUNT(*) DESC LIMIT 10
          )
          AND SUBSTR(r.created_at, 1, 4) BETWEEN '2015' AND SUBSTR(?, 1, 4)
        GROUP BY year, r.language ORDER BY year
        """,
        [now_str],
    )
    df_lang_year = df_lang_year[df_lang_year["year"].str.match(r"^\d{4}$", na=False)]

    if not df_lang_year.empty:
        fig_lang = px.line(
            df_lang_year, x="year", y="count", color="language",
            labels={"year": "Year", "count": "New repositories", "language": "Language"},
            markers=True,
        )
        fig_lang.update_layout(legend=dict(orientation="h", y=-0.25))
        st.plotly_chart(fig_lang, use_container_width=True)

        st.caption("As a share of all new repos that year (top 10 languages).")
        df_totals = df_lang_year.groupby("year")["count"].sum().reset_index().rename(columns={"count": "total"})
        df_share = df_lang_year.merge(df_totals, on="year")
        df_share["share"] = (df_share["count"] / df_share["total"] * 100).round(1)

        fig_share = px.area(
            df_share, x="year", y="share", color="language",
            labels={"year": "Year", "share": "Share of new repos (%)", "language": "Language"},
            groupnorm="",
        )
        fig_share.update_layout(legend=dict(orientation="h", y=-0.25), yaxis_title="Share (%)")
        st.plotly_chart(fig_share, use_container_width=True)
    else:
        st.info("Not enough data for language trends.")


# ==================== ABOUT ====================
with tab_about:
    st.subheader("About GovTech GitHub Explorer")
    st.write(
        """
        **GovTech GitHub Explorer** maps the global landscape of government open source software.
        It discovers, scrapes, and automatically categorises every public GitHub repository
        belonging to government organisations worldwide β€” updated weekly.
        """
    )

    st.subheader("How it works")
    col_a1, col_a2, col_a3, col_a4 = st.columns(4)
    with col_a1:
        st.markdown("### πŸ” Discover")
        st.write("Government GitHub accounts are sourced from the [government.github.com](https://github.com/github/government.github.com) registry β€” ~2,000 organisations across 100+ countries.")
    with col_a2:
        st.markdown("### πŸ•·οΈ Scrape")
        st.write("Repository metadata is collected via the GitHub API using a GitHub App installation, giving high-throughput authenticated access.")
    with col_a3:
        st.markdown("### 🏷️ Tag")
        st.write("An LLM pipeline (Qwen3-32B via OpenRouter) reads each repository's metadata and README, then assigns structured tags and categories.")
    with col_a4:
        st.markdown("### πŸ“Š Explore")
        st.write("Tags are clustered into groups using embedding similarity, and the full dataset is published to HuggingFace for anyone to use.")

    st.divider()

    st.subheader("Data")
    col_d1, col_d2, col_d3 = st.columns(3)
    total_a = query_one("SELECT COUNT(*) FROM repositories")
    tagged_a = query_one("SELECT COUNT(DISTINCT html_url) FROM repository_tags")
    tag_count_a = query_one("SELECT COUNT(*) FROM tags")
    col_d1.metric("Repositories", f"{total_a:,}")
    col_d2.metric("Tagged", f"{tagged_a:,}")
    col_d3.metric("Unique tags", f"{tag_count_a:,}")

    st.write(
        "The full dataset β€” including repo metadata, tags, and tag groups β€” is available on "
        "[HuggingFace](https://huggingface.co/datasets/AndreasThinks/government-github-repos) "
        "in CSV, Parquet, and SQLite formats. Updated every Sunday."
    )

    st.divider()

    st.subheader("Contribute")
    st.write(
        "Know a government GitHub organisation that's missing from the dataset? "
        "Submit it via a pull request β€” it'll be included in the next weekly scrape."
    )
    st.markdown(
        "πŸ“‹ **[How to submit a missing organisation](https://github.com/AndreasThinks/open-govtech-report/blob/main/CONTRIBUTING.md)**"
    )
    st.write(
        "The scraper, tagger, and dashboard are all open source. "
        "Issues and pull requests welcome."
    )
    st.markdown("[github.com/AndreasThinks/open-govtech-report](https://github.com/AndreasThinks/open-govtech-report)")

    st.divider()
    st.markdown(
        "✨ A project by [AndreasThinks](https://andreasthinks.me), built with ❀️ using Streamlit, "
        "and some ✨vibes✨",
        unsafe_allow_html=True,
    )


st.divider()
last_updated = get_last_updated_display()
updated_str = f"Last updated: {last_updated}" if last_updated else "Update time unavailable"
st.caption(
    f"Data sourced from government GitHub accounts worldwide. {updated_str}. "
    "| [GitHub](https://github.com/AndreasThinks/open-govtech-report) "
    "| [Dataset](https://huggingface.co/datasets/AndreasThinks/government-github-repos) "
    "| [βž• Submit a missing org](https://github.com/AndreasThinks/open-govtech-report/blob/main/CONTRIBUTING.md) "
    "| ✨ A project by [AndreasThinks](https://andreasthinks.me)"
)