|
|
# Subreddit names, each written with the "r/" prefix.
#
# Invariants of this list (its consumers live outside this file):
#   * every entry starts with "r/";
#   * no subreddit is listed twice, ignoring case (only one of "r/SaaS" /
#     "r/saas" appears), so iterating the list never visits a community twice.
# NOTE(review): Reddit is understood to resolve subreddit names
# case-insensitively, which is why case-only variants count as duplicates —
# confirm if any consumer depends on a specific spelling.
SUBREDDITS = [
    "r/stocks",
    "r/investing",
    "r/FinancialIndependence",
    "r/business",
    "r/sales",
    "r/BESalary",
    "r/mrr",
    "r/saasacquire",
    "r/ycombinator",
    "r/cofounder",
    "r/cofounderhunt",
    "r/founder",

    "r/SaaS",
    "r/SaaSDevelopers",
    "r/microsaas",
    "r/micro_saas",
    "r/SaaSSales",
    "r/startups",
    "r/Entrepreneur",
    "r/EntrepreneurRideAlong",
    "r/smallbusiness",
    "r/SideProject",
    "r/SideHustle",
    "r/indiehackers",
    "r/startup_ideas",

    "r/growthhacking",
    "r/marketing",
    "r/digital_marketing",
    "r/content_marketing",
    "r/SEO",
    "r/PPC",

    "r/webdev",
    "r/programming",
    "r/coding",
    "r/learnprogramming",
    "r/devops",
    "r/cloudcomputing",

    "r/nocode",
    "r/lowcode",
    "r/ProductManagement",
    "r/UserExperience",

    "r/PhD",
    "r/OpenAI",
    "r/ChatGPT",
    "r/Artificial",
    "r/singularity",

    "r/AskLawyers",
    "r/legaladvice",

    "r/medicine",
    "r/AskDocs",

    "r/neuroscience",
    "r/biology",
    "r/chemistry",
    "r/physics",
    "r/math",
    "r/learnmath",
    "r/linguistics",
    "r/writing",
    "r/books",
    "r/literature",
    "r/TrueFilm",
    "r/Screenwriting",
    "r/journalism",
    "r/education",
    "r/Teachers",
    "r/GradSchool",
    "r/academia",
    "r/careerguidance",
    "r/jobs",
    "r/resumes",
    "r/Leadership",
    "r/management",
    "r/productivity",
    "r/selfimprovement",
    "r/DecidingToBeBetter",
    "r/Minimalism",
    "r/climate",
    "r/environment",
    "r/urbanplanning",
    "r/sociology",
    "r/anthropology",
    "r/ethics",
    "r/criticaltheory",
    "r/AskPhilosophy",
    "r/AskEconomics",
    "r/Space",

    "r/vibecodingsaas",
    "r/saascofounders",
    "r/dividends",
    "r/aiagents",
    "r/reinforcementlearning",
    "r/openclaw",
    "r/sideprojects",

    "r/AskReddit",
    "r/AskScience",
    "r/AskHistorians",
    "r/AskAcademia",
    "r/NoStupidQuestions",
    "r/ExplainLikeImFive",
    "r/ChangeMyView",
    "r/TrueReddit",
    "r/OutOfTheLoop",
    "r/TodayILearned",
    "r/Science",
    "r/Futurology",
    "r/Philosophy",
    "r/Economics",
    "r/PoliticalDiscussion",
    "r/Geopolitics",
    "r/History",
    "r/WorldNews",
    "r/Technology",
    "r/ComputerScience",
    "r/MachineLearning",
    "r/ArtificialIntelligence",
    "r/DataScience",
    "r/Statistics",
    "r/learnpython",
    "r/Python",
    "r/cpp",
    "r/java",
    "r/javascript",
    "r/cscareerquestions",
    "r/ITCareerQuestions",
    "r/personalfinance",
    "r/ecommerce",
    "r/copywriting",
    "r/UXDesign",
]
|
|
|
# Feature names, in a fixed order.
# NOTE(review): presumably the numeric columns fed to the model — confirm
# against the feature pipeline, which is not part of this file.
NUMERICAL_FEATURES = [
    # Fetch-time and age values.
    "fetched_utc",
    "hours_ago",
    # Length values.
    "title_length",
    "text_length",
    # Calendar components of the creation time.
    "created_day_of_week",
    "created_day_of_month",
    "created_day_of_year",
    "created_hour",
    "created_minute",
    "created_month",
    "is_weekend",
]
|
|
|
# --- Current configuration: separate settings for titles and body text ------

# Model names for the title and text tokenizers.
TOKENIZER_MODEL_NAME_TITLE: str = "distilroberta-base"
TOKENIZER_MODEL_NAME_TEXT: str = "google/bigbird-roberta-base"

# Per-sequence token limits.
# NOTE(review): presumably truncation/padding lengths applied by callers —
# they are not consumed anywhere in this file.
MAX_TOKENS_PER_SEQ_TITLE: int = 128
MAX_TOKENS_PER_SEQ_TEXT: int = 768

# --- Legacy configuration: one tokenizer model name --------------------------

LEGACY_TOKENIZER_MODEL_NAME: str = "bert-base-uncased"
LEGACY_MAX_TOKENS_PER_SEQ_TITLE: int = 128
LEGACY_MAX_TOKENS_PER_SEQ_TEXT: int = 512
|
|
|
| from transformers import AutoTokenizer
|
|
|
|
|
# Module-level caches, filled in on first use by the loader functions.
_TOKENIZERS = None  # (title_tokenizer, text_tokenizer) once loaded
_LEGACY_TOKENIZER = None  # legacy tokenizer once loaded


def get_tokenizers():
    """Return the ``(title, text)`` tokenizer pair, loading it on first call.

    The pair is created with ``AutoTokenizer.from_pretrained`` from
    ``TOKENIZER_MODEL_NAME_TITLE`` and ``TOKENIZER_MODEL_NAME_TEXT`` (title
    first, then text) and stored in the module-level ``_TOKENIZERS`` cache;
    later calls return that same tuple without reloading.

    Returns:
        A 2-tuple ``(tokenizer_title, tokenizer_text)``.
    """
    global _TOKENIZERS
    if _TOKENIZERS is not None:
        return _TOKENIZERS

    # The cache is only assigned after both loads succeed, so a failure on
    # either model leaves it empty and the next call tries again.
    _TOKENIZERS = tuple(
        AutoTokenizer.from_pretrained(model_name)
        for model_name in (TOKENIZER_MODEL_NAME_TITLE, TOKENIZER_MODEL_NAME_TEXT)
    )
    return _TOKENIZERS
|
|
|
|
|
def get_legacy_tokenizer():
    """Return the legacy tokenizer, loading it on first call.

    The tokenizer is created with ``AutoTokenizer.from_pretrained`` from
    ``LEGACY_TOKENIZER_MODEL_NAME`` and kept in the module-level
    ``_LEGACY_TOKENIZER`` cache; later calls return the same object.
    """
    global _LEGACY_TOKENIZER
    if _LEGACY_TOKENIZER is not None:
        return _LEGACY_TOKENIZER

    _LEGACY_TOKENIZER = AutoTokenizer.from_pretrained(LEGACY_TOKENIZER_MODEL_NAME)
    return _LEGACY_TOKENIZER
|
|
|
|
|
# Both tokenizers are loaded as soon as this module is imported, so the
# vocabulary sizes below are plain module-level values.
tokenizer_title, tokenizer_text = get_tokenizers()
VOCAB_SIZE_TITLE, VOCAB_SIZE_TEXT = (
    tokenizer_title.vocab_size,
    tokenizer_text.vocab_size,
)

# Un-suffixed names mirror the *title* settings.
# NOTE(review): presumably kept for callers written before the title/text
# split — confirm before removing.
TOKENIZER_MODEL_NAME = TOKENIZER_MODEL_NAME_TITLE
MAX_TOKENS_PER_SEQ = MAX_TOKENS_PER_SEQ_TITLE
VOCAB_SIZE = VOCAB_SIZE_TITLE