# Reddit / constants.py
# cyrilfrl's picture
# hope it works this time
# 44748ce verified
# Subreddit names, grouped loosely by theme. Every entry uses the "r/<name>"
# form.
#
# NOTE(review): many names repeat further down, sometimes differing only in
# letter case (e.g. "r/SaaS" / "r/saas"). They are kept in their original
# positions because the consumers of this list are not visible from this
# module; de-duplicate at the point of use if repeats are unwanted.
SUBREDDITS = [
    # business
    "r/stocks",
    "r/investing",
    "r/FinancialIndependence",
    "r/business",
    "r/sales",
    "r/BESalary",
    "r/mrr",
    "r/saasacquire",
    "r/ycombinator",
    "r/cofounder",
    "r/cofounderhunt",
    "r/founder",
    # saas/startups
    "r/SaaS",
    "r/SaaSDevelopers",
    "r/microsaas",
    "r/micro_saas",
    "r/SaaSSales",
    "r/startups",
    "r/Entrepreneur",
    "r/EntrepreneurRideAlong",
    "r/smallbusiness",
    "r/SideProject",
    "r/SideHustle",
    "r/indiehackers",
    "r/startup_ideas",
    # product / growth / marketing
    "r/growthhacking",
    "r/marketing",
    "r/digital_marketing",
    "r/content_marketing",
    "r/SEO",
    "r/PPC",
    # dev / building SaaS
    "r/webdev",
    "r/programming",
    "r/coding",
    "r/learnprogramming",
    "r/devops",
    "r/cloudcomputing",
    # niche SaaS + builders
    "r/nocode",
    "r/lowcode",
    "r/ProductManagement",
    "r/UserExperience",
    # academic / AI
    "r/PhD",
    "r/OpenAI",
    "r/ChatGPT",
    "r/Artificial",
    "r/singularity",
    # law
    "r/AskLawyers",
    "r/legaladvice",
    # health
    "r/medicine",
    "r/AskDocs",
    # others
    # The next two entries used to sit on the same line as the "# others"
    # comment, which commented them out; they are restored here.
    "r/psychology",
    "r/AskPsychology",
    "r/neuroscience",
    "r/biology",
    "r/chemistry",
    "r/physics",
    "r/math",
    "r/learnmath",
    "r/linguistics",
    "r/writing",
    "r/books",
    "r/literature",
    "r/TrueFilm",
    "r/Screenwriting",
    "r/journalism",
    "r/education",
    "r/Teachers",
    "r/GradSchool",
    "r/academia",
    "r/careerguidance",
    "r/jobs",
    "r/resumes",
    "r/Leadership",
    "r/management",
    "r/productivity",
    "r/selfimprovement",
    "r/DecidingToBeBetter",
    "r/Minimalism",
    "r/climate",
    "r/environment",
    "r/urbanplanning",
    "r/sociology",
    "r/anthropology",
    "r/ethics",
    "r/criticaltheory",
    "r/AskPhilosophy",
    "r/AskEconomics",
    "r/Space",
    # startup / SaaS batch in lower-case spelling (overlaps with entries above)
    "r/ycombinator",
    "r/saas",
    "r/mrr",
    "r/saasacquire",
    "r/vibecodingsaas",  # was missing the "r/" prefix
    "r/saascofounders",
    "r/growthhacking",
    "r/startups",
    "r/entrepreneur",
    "r/smallbusiness",
    "r/business",
    "r/sideproject",
    "r/indiehackers",
    "r/webdev",
    "r/dividends",
    "r/saas",
    "r/cofounderhunt",
    "r/aiagents",
    "r/reinforcementlearning",
    "r/openclaw",
    "r/founder",
    "r/sideprojects",
    "r/growthhacking",
    "r/productmanagement",
    # general Q&A, science, technology, careers and finance
    "r/AskReddit",
    "r/AskScience",
    "r/AskHistorians",
    "r/AskAcademia",
    "r/NoStupidQuestions",
    "r/ExplainLikeImFive",
    "r/ChangeMyView",
    "r/TrueReddit",
    "r/OutOfTheLoop",
    "r/TodayILearned",
    "r/Science",
    "r/Futurology",
    "r/Philosophy",
    "r/Economics",
    "r/PoliticalDiscussion",
    "r/Geopolitics",
    "r/History",
    "r/WorldNews",
    "r/Technology",
    "r/Programming",
    "r/ComputerScience",
    "r/MachineLearning",
    "r/ArtificialIntelligence",
    "r/DataScience",
    "r/Statistics",
    "r/learnpython",
    "r/Python",
    "r/cpp",
    "r/java",
    "r/javascript",
    "r/webdev",
    "r/devops",
    "r/cscareerquestions",
    "r/ITCareerQuestions",
    "r/startups",
    "r/Entrepreneur",
    "r/smallbusiness",
    "r/business",
    "r/investing",
    "r/dividends",
    "r/personalfinance",
    "r/financialindependence",
    "r/ecommerce",
    "r/SaaS",
    "r/indiehackers",
    "r/growthhacking",
    "r/ProductManagement",
    "r/marketing",
    "r/copywriting",
    "r/UXDesign",
]
# Names of the numerical feature fields, in their declared order.
NUMERICAL_FEATURES = [
    # fetch-relative fields
    "fetched_utc", "hours_ago",
    # length fields
    "title_length", "text_length",
    # "created_*" calendar/clock fields
    "created_day_of_week", "created_day_of_month", "created_day_of_year",
    "created_hour", "created_minute", "created_month",
    # weekend flag
    "is_weekend",
]
# --- Current tokenizer configuration ----------------------------------------
# Title and text each get their own tokenizer model name and their own
# per-sequence token limit.
TOKENIZER_MODEL_NAME_TITLE = "distilroberta-base"
MAX_TOKENS_PER_SEQ_TITLE = 128

TOKENIZER_MODEL_NAME_TEXT = "google/bigbird-roberta-base"
MAX_TOKENS_PER_SEQ_TEXT = 768

# --- Legacy tokenizer configuration -----------------------------------------
# Kept for compatibility with older checkpoints (the model_reddit_final.pth
# family): one shared tokenizer name and separate title/text limits.
LEGACY_TOKENIZER_MODEL_NAME = "bert-base-uncased"
LEGACY_MAX_TOKENS_PER_SEQ_TITLE = 128
LEGACY_MAX_TOKENS_PER_SEQ_TEXT = 512
from transformers import AutoTokenizer
# Module-level caches; each stays None until its accessor function is first
# called and stores the loaded object(s) here.
_TOKENIZERS = None
_LEGACY_TOKENIZER = None


def get_tokenizers():
    """Return the cached ``(title_tokenizer, text_tokenizer)`` tuple.

    On the first call both tokenizers are loaded with
    ``AutoTokenizer.from_pretrained`` (title first, then text) and the pair is
    stored in the module-level ``_TOKENIZERS``. Later calls return that same
    tuple without loading anything again.
    """
    global _TOKENIZERS
    if _TOKENIZERS is not None:
        return _TOKENIZERS

    title_tok = AutoTokenizer.from_pretrained(TOKENIZER_MODEL_NAME_TITLE)
    text_tok = AutoTokenizer.from_pretrained(TOKENIZER_MODEL_NAME_TEXT)
    _TOKENIZERS = (title_tok, text_tok)
    return _TOKENIZERS
def get_legacy_tokenizer():
    """Return the cached legacy tokenizer, loading it on first use.

    The tokenizer named by ``LEGACY_TOKENIZER_MODEL_NAME`` is loaded with
    ``AutoTokenizer.from_pretrained`` the first time this is called and kept
    in the module-level ``_LEGACY_TOKENIZER``; later calls return that object.
    """
    global _LEGACY_TOKENIZER
    if _LEGACY_TOKENIZER is not None:
        return _LEGACY_TOKENIZER

    _LEGACY_TOKENIZER = AutoTokenizer.from_pretrained(LEGACY_TOKENIZER_MODEL_NAME)
    return _LEGACY_TOKENIZER
# Both tokenizers are loaded here, at import time, so the vocabulary sizes
# below are available as plain module attributes.
tokenizer_title, tokenizer_text = get_tokenizers()
# NOTE(review): the values are whatever each tokenizer reports as
# `vocab_size`; whether that counts tokens added after pretraining depends on
# the tokenizer implementation - confirm before using it to size embeddings.
VOCAB_SIZE_TITLE, VOCAB_SIZE_TEXT = (
    tokenizer_title.vocab_size,
    tokenizer_text.vocab_size,
)

# Backward-compatible aliases for legacy imports: the un-suffixed names all
# point at the title-side values.
VOCAB_SIZE = VOCAB_SIZE_TITLE
TOKENIZER_MODEL_NAME = TOKENIZER_MODEL_NAME_TITLE
MAX_TOKENS_PER_SEQ = MAX_TOKENS_PER_SEQ_TITLE