# Subreddit names, every one written as "r/<name>".
# NOTE(review): several names occur more than once, sometimes with different
# capitalisation (e.g. "r/SaaS" and "r/saas"). They are kept exactly as they
# were so the list's length and element positions do not change -- confirm
# whether any consumer depends on those before de-duplicating.
SUBREDDITS = [
    # business
    "r/stocks", "r/investing", "r/FinancialIndependence", "r/business",
    "r/sales", "r/BESalary", "r/mrr", "r/saasacquire", "r/ycombinator",
    "r/cofounder", "r/cofounderhunt", "r/founder",
    # saas/startups
    "r/SaaS", "r/SaaSDevelopers", "r/microsaas", "r/micro_saas",
    "r/SaaSSales", "r/startups", "r/Entrepreneur", "r/EntrepreneurRideAlong",
    "r/smallbusiness", "r/SideProject", "r/SideHustle", "r/indiehackers",
    "r/startup_ideas",
    # product / growth / marketing
    "r/growthhacking", "r/marketing", "r/digital_marketing",
    "r/content_marketing", "r/SEO", "r/PPC",
    # dev / building SaaS
    "r/webdev", "r/programming", "r/coding", "r/learnprogramming",
    "r/devops", "r/cloudcomputing",
    # niche SaaS + builders
    "r/nocode", "r/lowcode", "r/ProductManagement", "r/UserExperience",
    # academic
    "r/PhD", "r/OpenAI", "r/ChatGPT", "r/Artificial", "r/singularity",
    # law
    "r/AskLawyers", "r/legaladvice",
    # health
    "r/medicine", "r/AskDocs",
    # others
    "r/psychology", "r/AskPsychology", "r/neuroscience", "r/biology",
    "r/chemistry", "r/physics", "r/math", "r/learnmath", "r/linguistics",
    "r/writing", "r/books", "r/literature", "r/TrueFilm", "r/Screenwriting",
    "r/journalism", "r/education", "r/Teachers", "r/GradSchool",
    "r/academia", "r/careerguidance", "r/jobs", "r/resumes", "r/Leadership",
    "r/management", "r/productivity", "r/selfimprovement",
    "r/DecidingToBeBetter", "r/Minimalism", "r/climate", "r/environment",
    "r/urbanplanning", "r/sociology", "r/anthropology", "r/ethics",
    "r/criticaltheory", "r/AskPhilosophy", "r/AskEconomics", "r/Space",
    # The entries from here on carry no section label in the original list;
    # many of them repeat names that already appear above.
    "r/ycombinator", "r/saas", "r/mrr", "r/saasacquire",
    # "r/" prefix added: this was the only entry in the list without it.
    "r/vibecodingsaas",
    "r/saascofounders", "r/growthhacking", "r/startups", "r/entrepreneur",
    "r/smallbusiness", "r/business", "r/sideproject", "r/indiehackers",
    "r/webdev", "r/dividends", "r/saas", "r/cofounderhunt", "r/aiagents",
    "r/reinforcementlearning", "r/openclaw", "r/founder", "r/sideprojects",
    "r/growthhacking", "r/productmanagement",
    "r/AskReddit", "r/AskScience", "r/AskHistorians",
    "r/AskAcademia", "r/NoStupidQuestions", "r/ExplainLikeImFive",
    "r/ChangeMyView", "r/TrueReddit", "r/OutOfTheLoop", "r/TodayILearned",
    "r/Science", "r/Futurology", "r/Philosophy", "r/Economics",
    "r/PoliticalDiscussion", "r/Geopolitics", "r/History", "r/WorldNews",
    "r/Technology", "r/Programming", "r/ComputerScience",
    "r/MachineLearning", "r/ArtificialIntelligence", "r/DataScience",
    "r/Statistics", "r/learnpython", "r/Python", "r/cpp", "r/java",
    "r/javascript", "r/webdev", "r/devops", "r/cscareerquestions",
    "r/ITCareerQuestions", "r/startups", "r/Entrepreneur", "r/smallbusiness",
    "r/business", "r/investing", "r/dividends", "r/personalfinance",
    "r/financialindependence", "r/ecommerce", "r/SaaS", "r/indiehackers",
    "r/growthhacking", "r/ProductManagement", "r/marketing", "r/copywriting",
    "r/UXDesign",
]

# Names of the numerical features.
# NOTE(review): presumably per-post columns produced by the feature pipeline;
# confirm the expected order against the code that consumes this list.
NUMERICAL_FEATURES = [
    "fetched_utc",
    "hours_ago",
    "title_length",
    "text_length",
    "created_day_of_week",
    "created_day_of_month",
    "created_day_of_year",
    "created_hour",
    "created_minute",
    "created_month",
    "is_weekend",
]

# Per-sequence token limits; titles and body text have separate limits.
MAX_TOKENS_PER_SEQ_TITLE = 128
MAX_TOKENS_PER_SEQ_TEXT = 768

# Pretrained tokenizer identifiers; titles and body text use different ones.
TOKENIZER_MODEL_NAME_TITLE = "distilroberta-base"
TOKENIZER_MODEL_NAME_TEXT = "google/bigbird-roberta-base"

# Legacy checkpoint compatibility (model_reddit_final.pth family).
LEGACY_TOKENIZER_MODEL_NAME = "bert-base-uncased"
LEGACY_MAX_TOKENS_PER_SEQ_TITLE = 128
LEGACY_MAX_TOKENS_PER_SEQ_TEXT = 512

from transformers import AutoTokenizer

# Caches filled on first use by the getter functions below, so each
# tokenizer is constructed at most once per process.
_TOKENIZERS = None
_LEGACY_TOKENIZER = None


def get_tokenizers():
    """Return the ``(title_tokenizer, text_tokenizer)`` pair.

    On the first call both tokenizers are created with
    ``AutoTokenizer.from_pretrained`` from ``TOKENIZER_MODEL_NAME_TITLE`` and
    ``TOKENIZER_MODEL_NAME_TEXT`` and stored in ``_TOKENIZERS``; every later
    call returns that same tuple.
    """
    global _TOKENIZERS
    if _TOKENIZERS is not None:
        return _TOKENIZERS

    # Title tokenizer is loaded first, then the text tokenizer.
    title_tok = AutoTokenizer.from_pretrained(TOKENIZER_MODEL_NAME_TITLE)
    text_tok = AutoTokenizer.from_pretrained(TOKENIZER_MODEL_NAME_TEXT)
    _TOKENIZERS = (title_tok, text_tok)
    return _TOKENIZERS


def get_legacy_tokenizer():
    """Return the tokenizer named by ``LEGACY_TOKENIZER_MODEL_NAME``.

    The tokenizer is created on the first call and stored in
    ``_LEGACY_TOKENIZER``; later calls return the stored instance.
    """
    global _LEGACY_TOKENIZER
    if _LEGACY_TOKENIZER is not None:
        return _LEGACY_TOKENIZER

    _LEGACY_TOKENIZER = AutoTokenizer.from_pretrained(LEGACY_TOKENIZER_MODEL_NAME)
    return _LEGACY_TOKENIZER


# Both current tokenizers are loaded at import time because the vocabulary
# sizes below are read from the tokenizer objects.
tokenizer_title, tokenizer_text = get_tokenizers()
VOCAB_SIZE_TITLE = tokenizer_title.vocab_size
VOCAB_SIZE_TEXT = tokenizer_text.vocab_size

# Un-suffixed names kept for legacy imports; they mirror the title settings.
MAX_TOKENS_PER_SEQ = MAX_TOKENS_PER_SEQ_TITLE
TOKENIZER_MODEL_NAME = TOKENIZER_MODEL_NAME_TITLE
VOCAB_SIZE = VOCAB_SIZE_TITLE