# TextPeriod_Summarization / mvp_temporal.py
# DelaliScratchwerk's picture
# Upload 5 files
# 410a5d4 verified
import re
from collections import Counter
# Era labels, listed from oldest to newest. Ranges are written with an
# en dash (–), not an ASCII hyphen; only "pre-1900" uses a hyphen.
BUCKETS = [
    "pre-1900",
    "1900–1945",
    "1946–1990",
    "1991–2008",
    "2009–2015",
    "2016–2018",
    "2019–2022",
    "2023–present",
]

# Starter keyword -> era-label table (meant to grow over time).
# All keys are lowercase. Entry order is kept as-is on purpose: dicts
# iterate in insertion order, so reordering entries could change the
# order in which keyword votes are produced.
LEXICON = {
    "covid": "2019–2022",
    "covid-19": "2019–2022",
    "sars-cov-2": "2019–2022",
    "lockdown": "2019–2022",
    "n95": "2019–2022",
    "zoom": "2019–2022",
    "myspace": "1991–2008",
    "iraq war": "1991–2008",
    "y2k": "1991–2008",
    "tik tok": "2023–present",
    "tiktok": "2023–present",
    "chatgpt": "2023–present",
    "vietnam war": "1946–1990",
    "sputnik": "1946–1990",
    "cold war": "1946–1990",
}
def year_to_bucket(y: int) -> str:
    """Return the era label whose year range contains *y*.

    Anything before 1900 is "pre-1900"; anything after 2022 is
    "2023–present". In between, each label names an inclusive range.
    """
    if y < 1900:
        return "pre-1900"

    # (last year covered, label), in ascending order: the first range whose
    # last year is not below y is the one that contains it.
    ranges_by_last_year = (
        (1945, "1900–1945"),
        (1990, "1946–1990"),
        (2008, "1991–2008"),
        (2015, "2009–2015"),
        (2018, "2016–2018"),
        (2022, "2019–2022"),
    )
    for last_year, label in ranges_by_last_year:
        if y <= last_year:
            return label
    return "2023–present"
def predict_period(text: str):
    """Guess which era bucket a text refers to, by majority vote.

    Two kinds of evidence cast votes:

    * Every standalone four-digit year from 1800 to 2099 votes, once per
      occurrence, for the bucket ``year_to_bucket`` assigns to it.
    * Every ``LEXICON`` keyword found in the lower-cased text votes once for
      its bucket. Keywords are matched as substrings. Where keywords overlap
      in the text the longest one is taken, so a lone "covid-19" is one hit
      and not a hit for both "covid" and "covid-19".

    Args:
        text: The text to classify. ``None`` is treated like an empty string.

    Returns:
        A ``(bucket, explanation)`` tuple. ``explanation`` is a dict with
        ``"reason"`` (``"votes"`` or ``"no explicit clues"``) and ``"votes"``,
        a mapping from bucket label to vote count. When nothing votes, the
        bucket defaults to "2023–present" and ``"votes"`` is empty. Ties go
        to the bucket that received its first vote earliest; year votes are
        cast before keyword votes.
    """
    lowered = (text or "").lower()

    # 1) explicit years
    bucket_votes = [
        year_to_bucket(int(year))
        for year in re.findall(r"\b(1[89]\d{2}|20\d{2})\b", lowered)
    ]

    # 2) keyword hits
    if LEXICON:
        # Longest keyword first: regex alternation takes the first branch that
        # matches at a position, and findall never re-scans consumed text, so
        # a shorter keyword nested inside a longer one is not counted again.
        longest_first = sorted(LEXICON, key=len, reverse=True)
        keyword_pattern = "|".join(re.escape(keyword) for keyword in longest_first)
        found = set(re.findall(keyword_pattern, lowered))
        # Walk LEXICON itself (not `found`) so votes are appended in lexicon
        # order, which is what decides ties between equally-voted buckets.
        bucket_votes.extend(
            bucket for keyword, bucket in LEXICON.items() if keyword in found
        )

    if not bucket_votes:
        # no clues → default to a broad recent bucket
        return "2023–present", {"reason": "no explicit clues", "votes": {}}

    counts = Counter(bucket_votes)
    best = counts.most_common(1)[0][0]
    return best, {"reason": "votes", "votes": dict(counts)}
if __name__ == "__main__":
    # Manual smoke run: classify one sample sentence and print the predicted
    # bucket followed by the explanation dict.
    sample = "Schools went remote during the pandemic and everyone wore N95 masks."
    print(*predict_period(sample))