# Provenance: uploaded by datamatters24 with huggingface_hub
# ("Upload ml/config.py", commit f1e6b91, verified).
"""Configuration for the ML pipeline."""
import os
# Database connection settings. Each field is read from the environment at
# import time; the literals are fallbacks used when the variable is unset.
# NOTE(review): the key names match libpq/psycopg2 connection keywords, so this
# is presumably passed as connect(**DB) -- confirm against callers.
DB = {
    "host": os.getenv("DB_HOST", "127.0.0.1"),
    # Use `or` instead of a getenv default so that DB_PORT="" (set but empty,
    # common with .env/docker templating) falls back to 5432 rather than
    # crashing the import with ValueError from int("").
    "port": int(os.getenv("DB_PORT") or "5432"),
    "dbname": os.getenv("DB_NAME", "epstein_research"),
    "user": os.getenv("DB_USER", "epstein"),
    # SECURITY(review): a real-looking credential is committed here as the
    # fallback. It is kept only so existing deployments keep working; rotate
    # it and supply the password exclusively through DB_PASSWORD.
    "password": os.getenv("DB_PASSWORD", "di9vrLaJGskZrvlDRQJPtDYiFf3UCPl"),
}
# Absolute path of the raw-data directory.
# NOTE(review): presumably where unprocessed source documents live -- confirm.
RAW_DIR: str = "/data/raw"

# Name of the spaCy pipeline to load; relies on the model already being
# installed in the existing virtualenv.
SPACY_MODEL: str = "en_core_web_lg"

# Model identifier used for zero-shot classification.
ZERO_SHOT_MODEL: str = "facebook/bart-large-mnli"
# Candidate topic labels offered to the zero-shot classifier.
# Order is preserved as authored; entries are lowercase phrases.
TOPIC_LABELS = [
    # Security, intelligence and conflict
    "national security",
    "intelligence operations",
    "assassination",
    "military operations",
    # Society and governance
    "civil rights",
    "public health",
    "government oversight",
    "foreign policy",
    "law enforcement",
    "judicial proceedings",
    "congressional legislation",
    # Science, finance and miscellaneous
    "scientific research",
    "financial regulation",
    "nuclear weapons",
    "surveillance",
    "propaganda",
    "human experimentation",
    "space exploration",
    "terrorism",
    "organized crime",
]
# Maps a Congress number to its (start, end) dates as ISO "YYYY-MM-DD" strings;
# used to estimate a document's date from its filename.
# Only the 80th-82nd and 103rd-119th Congresses are listed; any other number
# is absent from the table.
CONGRESS_DATES = {
    # 80th-82nd
    80: ("1947-01-03", "1949-01-03"),
    81: ("1949-01-03", "1951-01-03"),
    82: ("1951-01-03", "1953-01-03"),
    # 103rd-119th
    103: ("1993-01-05", "1995-01-03"),
    104: ("1995-01-04", "1997-01-03"),
    105: ("1997-01-07", "1999-01-03"),
    106: ("1999-01-06", "2001-01-03"),
    107: ("2001-01-03", "2003-01-03"),
    108: ("2003-01-07", "2005-01-03"),
    109: ("2005-01-04", "2007-01-03"),
    110: ("2007-01-04", "2009-01-03"),
    111: ("2009-01-06", "2011-01-03"),
    112: ("2011-01-05", "2013-01-03"),
    113: ("2013-01-03", "2015-01-03"),
    114: ("2015-01-06", "2017-01-03"),
    115: ("2017-01-03", "2019-01-03"),
    116: ("2019-01-03", "2021-01-03"),
    117: ("2021-01-03", "2023-01-03"),
    118: ("2023-01-03", "2025-01-03"),
    119: ("2025-01-03", "2027-01-03"),
}
# Historical events for crisis correlation
HISTORICAL_EVENTS = [
{
"name": "Lincoln Assassination",
"start": "1865-04-14",
"end": "1865-07-07",
"category": "assassination",
"keywords": ["lincoln", "booth", "ford's theatre", "assassination", "conspirator"],
},
{
"name": "Civil War End / Reconstruction",
"start": "1865-04-09",
"end": "1877-03-31",
"category": "war",
"keywords": ["reconstruction", "appomattox", "confederate", "freedmen", "civil war"],
},
{
"name": "Bay of Pigs Invasion",
"start": "1961-04-17",
"end": "1961-04-20",
"category": "military",
"keywords": ["bay of pigs", "cuba", "castro", "brigade 2506"],
},
{
"name": "Cuban Missile Crisis",
"start": "1962-10-16",
"end": "1962-10-28",
"category": "nuclear",
"keywords": ["cuban missile", "nuclear", "blockade", "khrushchev"],
},
{
"name": "JFK Assassination",
"start": "1963-11-22",
"end": "1964-09-24",
"category": "assassination",
"keywords": ["kennedy", "oswald", "dallas", "warren commission", "grassy knoll", "dealey plaza"],
},
{
"name": "Gulf of Tonkin Incident",
"start": "1964-08-02",
"end": "1964-08-07",
"category": "military",
"keywords": ["gulf of tonkin", "vietnam", "tonkin resolution"],
},
{
"name": "MLK Assassination",
"start": "1968-04-04",
"end": "1968-04-04",
"category": "assassination",
"keywords": ["martin luther king", "mlk", "james earl ray", "memphis"],
},
{
"name": "RFK Assassination",
"start": "1968-06-05",
"end": "1968-06-06",
"category": "assassination",
"keywords": ["robert kennedy", "rfk", "sirhan", "ambassador hotel"],
},
{
"name": "Watergate Scandal",
"start": "1972-06-17",
"end": "1974-08-09",
"category": "scandal",
"keywords": ["watergate", "nixon", "impeach", "cover-up", "plumbers"],
},
{
"name": "Church Committee Investigations",
"start": "1975-01-27",
"end": "1976-04-29",
"category": "oversight",
"keywords": ["church committee", "intelligence abuses", "cointelpro", "mkultra", "assassination plots"],
},
{
"name": "MKUltra Program",
"start": "1953-04-13",
"end": "1973-01-01",
"category": "human_experimentation",
"keywords": ["mkultra", "mind control", "lsd", "behavioral", "gottlieb", "subproject"],
},
{
"name": "CIA Stargate / Remote Viewing Program",
"start": "1978-01-01",
"end": "1995-06-30",
"category": "intelligence",
"keywords": ["stargate", "remote viewing", "psychic", "grill flame", "sun streak"],
},
{
"name": "Iran-Contra Affair",
"start": "1985-08-01",
"end": "1987-11-18",
"category": "scandal",
"keywords": ["iran-contra", "contras", "nicaragua", "oliver north", "arms sales"],
},
{
"name": "Area 51 / U-2 Program",
"start": "1955-01-01",
"end": "1998-12-31",
"category": "intelligence",
"keywords": ["area 51", "groom lake", "u-2", "oxcart", "a-12", "classified aircraft"],
},
{
"name": "September 11 Attacks",
"start": "2001-09-11",
"end": "2001-12-31",
"category": "terrorism",
"keywords": ["september 11", "9/11", "world trade center", "pentagon", "al-qaeda", "bin laden"],
},
{
"name": "PATRIOT Act Passage",
"start": "2001-10-26",
"end": "2001-10-26",
"category": "legislation",
"keywords": ["patriot act", "surveillance", "domestic spying", "fisa"],
},
{
"name": "Iraq War Authorization",
"start": "2002-10-10",
"end": "2003-05-01",
"category": "military",
"keywords": ["iraq war", "weapons of mass destruction", "wmd", "saddam", "authorization for use"],
},
{
"name": "Snowden NSA Revelations",
"start": "2013-06-05",
"end": "2013-12-31",
"category": "surveillance",
"keywords": ["snowden", "nsa", "prism", "mass surveillance", "metadata"],
},
{
"name": "COVID-19 Pandemic",
"start": "2020-01-20",
"end": "2023-05-11",
"category": "pandemic",
"keywords": ["covid", "coronavirus", "pandemic", "lockdown", "vaccine"],
},
{
"name": "January 6 Capitol Attack",
"start": "2021-01-06",
"end": "2022-12-22",
"category": "insurrection",
"keywords": ["january 6", "capitol", "insurrection", "electoral college", "certification"],
},
]
# Number of items handled per batch.
# NOTE(review): presumably rows/documents per DB or model batch -- confirm against callers.
BATCH_SIZE: int = 500