# Subreddit names, each written with the "r/" prefix.
#
# The list contains repeated entries, both exact (e.g. "r/growthhacking") and
# differing only in letter case (e.g. "r/SaaS" / "r/saas"). They are kept as-is
# so that the list length and every position stay stable.
# NOTE(review): de-duplicate only after confirming no consumer depends on
# positions in this list.
SUBREDDITS = [
    # business
    "r/stocks",
    "r/investing",
    "r/FinancialIndependence",
    "r/business",
    "r/sales",
    "r/BESalary",
    "r/mrr",
    "r/saasacquire",
    "r/ycombinator",
    "r/cofounder",
    "r/cofounderhunt",
    "r/founder",

    # saas/startups
    "r/SaaS",
    "r/SaaSDevelopers",
    "r/microsaas",
    "r/micro_saas",
    "r/SaaSSales",
    "r/startups",
    "r/Entrepreneur",
    "r/EntrepreneurRideAlong",
    "r/smallbusiness",
    "r/SideProject",
    "r/SideHustle",
    "r/indiehackers",
    "r/startup_ideas",

    # product / growth / marketing
    "r/growthhacking",
    "r/marketing",
    "r/digital_marketing",
    "r/content_marketing",
    "r/SEO",
    "r/PPC",

    # dev / building SaaS
    "r/webdev",
    "r/programming",
    "r/coding",
    "r/learnprogramming",
    "r/devops",
    "r/cloudcomputing",

    # niche SaaS + builders
    "r/nocode",
    "r/lowcode",
    "r/ProductManagement",
    "r/UserExperience",

    # academic
    "r/PhD",
    "r/OpenAI",
    "r/ChatGPT",
    "r/Artificial",
    "r/singularity",

    # law
    "r/AskLawyers",
    "r/legaladvice",

    # health
    "r/medicine",
    "r/AskDocs",

    # others
    # NOTE(review): "r/psychology" and "r/AskPsychology" were fused into the
    # "# others" header comment, so they have never been part of the list.
    # They are left out here to keep positions stable; confirm whether they
    # should be restored as real entries.
    "r/neuroscience", "r/biology", "r/chemistry", "r/physics", "r/math",
    "r/learnmath", "r/linguistics", "r/writing", "r/books", "r/literature",
    "r/TrueFilm", "r/Screenwriting", "r/journalism", "r/education", "r/Teachers",
    "r/GradSchool", "r/academia", "r/careerguidance", "r/jobs", "r/resumes",
    "r/Leadership", "r/management", "r/productivity", "r/selfimprovement", "r/DecidingToBeBetter",
    "r/Minimalism",
    "r/climate", "r/environment", "r/urbanplanning", "r/sociology", "r/anthropology",
    "r/ethics", "r/criticaltheory", "r/AskPhilosophy", "r/AskEconomics", "r/Space",
    # Fixed: this row previously held 'vibecodingsaas' without the "r/" prefix
    # that every other entry carries.
    "r/ycombinator", "r/saas", "r/mrr", "r/saasacquire", "r/vibecodingsaas",
    "r/saascofounders", "r/growthhacking", "r/startups", "r/entrepreneur", "r/smallbusiness", "r/business",
    "r/sideproject", "r/indiehackers", "r/webdev", "r/dividends", "r/saas",
    "r/cofounderhunt", "r/aiagents", "r/reinforcementlearning", "r/openclaw", "r/founder",
    "r/sideprojects", "r/growthhacking", "r/productmanagement",
    "r/AskReddit", "r/AskScience", "r/AskHistorians", "r/AskAcademia", "r/NoStupidQuestions",
    "r/ExplainLikeImFive", "r/ChangeMyView", "r/TrueReddit", "r/OutOfTheLoop", "r/TodayILearned",
    "r/Science", "r/Futurology", "r/Philosophy", "r/Economics", "r/PoliticalDiscussion",
    "r/Geopolitics", "r/History", "r/WorldNews", "r/Technology", "r/Programming",
    "r/ComputerScience", "r/MachineLearning", "r/ArtificialIntelligence", "r/DataScience", "r/Statistics",
    "r/learnpython", "r/Python", "r/cpp", "r/java", "r/javascript",
    "r/webdev", "r/devops", "r/cscareerquestions", "r/ITCareerQuestions", "r/startups",
    "r/Entrepreneur", "r/smallbusiness", "r/business", "r/investing",
    "r/dividends", "r/personalfinance", "r/financialindependence", "r/ecommerce", "r/SaaS",
    "r/indiehackers", "r/growthhacking", "r/ProductManagement",
    "r/marketing", "r/copywriting", "r/UXDesign",
]

# Names of the numerical features, in declaration order.
# NOTE(review): presumably these are column names of the post dataset and the
# order matters to consumers — confirm before reordering.
NUMERICAL_FEATURES = [
    "fetched_utc",
    "hours_ago",
    "title_length",
    "text_length",
    "created_day_of_week",
    "created_day_of_month",
    "created_day_of_year",
    "created_hour",
    "created_minute",
    "created_month",
    "is_weekend",
]

# Current setup: titles and body text use separate tokenizer checkpoints.
TOKENIZER_MODEL_NAME_TITLE = "distilroberta-base"
TOKENIZER_MODEL_NAME_TEXT = "google/bigbird-roberta-base"

# Per-field sequence-length caps that go with the tokenizers above.
MAX_TOKENS_PER_SEQ_TITLE = 128
MAX_TOKENS_PER_SEQ_TEXT = 768

# Legacy checkpoint compatibility (model_reddit_final.pth family):
# a single tokenizer name with its own title/text length caps.
LEGACY_TOKENIZER_MODEL_NAME = "bert-base-uncased"
LEGACY_MAX_TOKENS_PER_SEQ_TITLE = 128
LEGACY_MAX_TOKENS_PER_SEQ_TEXT = 512

from transformers import AutoTokenizer


# Module-level caches; None means "not loaded yet".
# _TOKENIZERS is rebound by get_tokenizers() to a (title, text) tokenizer tuple.
_TOKENIZERS = None
# _LEGACY_TOKENIZER is rebound by get_legacy_tokenizer() to a single tokenizer.
_LEGACY_TOKENIZER = None


def get_tokenizers():
    """Return the cached ``(title_tokenizer, text_tokenizer)`` pair.

    On the first call both tokenizers are created with
    ``AutoTokenizer.from_pretrained`` (title first, then text) from
    ``TOKENIZER_MODEL_NAME_TITLE`` and ``TOKENIZER_MODEL_NAME_TEXT`` and the
    pair is stored in the module-level ``_TOKENIZERS``. Later calls return
    that same tuple without loading again.
    """
    global _TOKENIZERS
    if _TOKENIZERS is not None:
        return _TOKENIZERS

    # Tuple elements are evaluated left to right: title, then text.
    _TOKENIZERS = (
        AutoTokenizer.from_pretrained(TOKENIZER_MODEL_NAME_TITLE),
        AutoTokenizer.from_pretrained(TOKENIZER_MODEL_NAME_TEXT),
    )
    return _TOKENIZERS


def get_legacy_tokenizer():
    """Return the cached tokenizer for ``LEGACY_TOKENIZER_MODEL_NAME``.

    The tokenizer is created with ``AutoTokenizer.from_pretrained`` on the
    first call and stored in the module-level ``_LEGACY_TOKENIZER``; later
    calls return the stored object.
    """
    global _LEGACY_TOKENIZER
    if _LEGACY_TOKENIZER is not None:
        return _LEGACY_TOKENIZER

    _LEGACY_TOKENIZER = AutoTokenizer.from_pretrained(LEGACY_TOKENIZER_MODEL_NAME)
    return _LEGACY_TOKENIZER


# get_tokenizers() is called here at module level, so importing this module
# loads both tokenizers as a side effect.
tokenizer_title, tokenizer_text = get_tokenizers()
VOCAB_SIZE_TITLE, VOCAB_SIZE_TEXT = (
    tokenizer_title.vocab_size,
    tokenizer_text.vocab_size,
)

# Unsuffixed names kept for legacy imports; each mirrors its *_TITLE value.
VOCAB_SIZE = VOCAB_SIZE_TITLE
TOKENIZER_MODEL_NAME = TOKENIZER_MODEL_NAME_TITLE
MAX_TOKENS_PER_SEQ = MAX_TOKENS_PER_SEQ_TITLE