Spaces:

cyrilfrl
/

Reddit

Sleeping

App Files Files Community

Reddit / constants.py

cyrilfrl

hope it works this time

44748ce verified about 2 months ago

raw

history blame contribute delete

4.87 kB


	# Subreddit names, each written with its "r/" prefix.
	#
	# Several names occur more than once, some differing only by letter case
	# (e.g. "r/SaaS" and "r/saas"). Order and length are left untouched here;
	# consumers that need unique names have to de-duplicate themselves.
	SUBREDDITS = [
	    # business
	    "r/stocks",
	    "r/investing",
	    "r/FinancialIndependence",
	    "r/business",
	    "r/sales",
	    "r/BESalary",
	    "r/mrr",
	    "r/saasacquire",
	    "r/ycombinator",
	    "r/cofounder",
	    "r/cofounderhunt",
	    "r/founder",

	    # saas/startups
	    "r/SaaS",
	    "r/SaaSDevelopers",
	    "r/microsaas",
	    "r/micro_saas",
	    "r/SaaSSales",
	    "r/startups",
	    "r/Entrepreneur",
	    "r/EntrepreneurRideAlong",
	    "r/smallbusiness",
	    "r/SideProject",
	    "r/SideHustle",
	    "r/indiehackers",
	    "r/startup_ideas",

	    # product / growth / marketing
	    "r/growthhacking",
	    "r/marketing",
	    "r/digital_marketing",
	    "r/content_marketing",
	    "r/SEO",
	    "r/PPC",

	    # dev / building SaaS
	    "r/webdev",
	    "r/programming",
	    "r/coding",
	    "r/learnprogramming",
	    "r/devops",
	    "r/cloudcomputing",

	    # niche SaaS + builders
	    "r/nocode",
	    "r/lowcode",
	    "r/ProductManagement",
	    "r/UserExperience",

	    # academic
	    "r/PhD",
	    "r/OpenAI",
	    "r/ChatGPT",
	    "r/Artificial",
	    "r/singularity",

	    # law
	    "r/AskLawyers",
	    "r/legaladvice",

	    # health
	    "r/medicine",
	    "r/AskDocs",

	    # others
	    # NOTE(review): "r/psychology" and "r/AskPsychology" were written on the
	    # same line as the "# others" comment, so they have never been part of
	    # this list. They are deliberately NOT re-enabled here because adding
	    # them would shift the position of every later entry; confirm that no
	    # consumer depends on list positions before restoring them.
	    "r/neuroscience", "r/biology", "r/chemistry", "r/physics", "r/math",
	    "r/learnmath", "r/linguistics", "r/writing", "r/books", "r/literature",
	    "r/TrueFilm", "r/Screenwriting", "r/journalism", "r/education", "r/Teachers",
	    "r/GradSchool", "r/academia", "r/careerguidance", "r/jobs", "r/resumes",
	    "r/Leadership", "r/management", "r/productivity", "r/selfimprovement", "r/DecidingToBeBetter",
	    "r/Minimalism",
	    "r/climate", "r/environment", "r/urbanplanning", "r/sociology", "r/anthropology",
	    "r/ethics", "r/criticaltheory", "r/AskPhilosophy", "r/AskEconomics", "r/Space",
	    # The last entry on the next line was previously "vibecodingsaas" (no
	    # "r/" prefix), the only name in the list that broke the convention.
	    "r/ycombinator", "r/saas", "r/mrr", "r/saasacquire", "r/vibecodingsaas",
	    "r/saascofounders", "r/growthhacking", "r/startups", "r/entrepreneur", "r/smallbusiness", "r/business",
	    "r/sideproject", "r/indiehackers", "r/webdev", "r/dividends", "r/saas",
	    "r/cofounderhunt", "r/aiagents", "r/reinforcementlearning", "r/openclaw", "r/founder",
	    "r/sideprojects", "r/growthhacking", "r/productmanagement",
	    "r/AskReddit", "r/AskScience", "r/AskHistorians", "r/AskAcademia", "r/NoStupidQuestions",
	    "r/ExplainLikeImFive", "r/ChangeMyView", "r/TrueReddit", "r/OutOfTheLoop", "r/TodayILearned",
	    "r/Science", "r/Futurology", "r/Philosophy", "r/Economics", "r/PoliticalDiscussion",
	    "r/Geopolitics", "r/History", "r/WorldNews", "r/Technology", "r/Programming",
	    "r/ComputerScience", "r/MachineLearning", "r/ArtificialIntelligence", "r/DataScience", "r/Statistics",
	    "r/learnpython", "r/Python", "r/cpp", "r/java", "r/javascript",
	    "r/webdev", "r/devops", "r/cscareerquestions", "r/ITCareerQuestions", "r/startups",
	    "r/Entrepreneur", "r/smallbusiness", "r/business", "r/investing",
	    "r/dividends", "r/personalfinance", "r/financialindependence", "r/ecommerce", "r/SaaS",
	    "r/indiehackers", "r/growthhacking", "r/ProductManagement",
	    "r/marketing", "r/copywriting", "r/UXDesign",
	]

	# Names of the numeric features, in a fixed order.
	# (Presumably column names of the prepared dataset; the consumer is not
	# visible in this file.)
	NUMERICAL_FEATURES = [
	    "fetched_utc", "hours_ago",
	    "title_length", "text_length",
	    "created_day_of_week", "created_day_of_month", "created_day_of_year",
	    "created_hour", "created_minute", "created_month",
	    "is_weekend",
	]

	# Title side: tokenizer checkpoint name (handed to
	# AutoTokenizer.from_pretrained) and per-sequence token limit.
	TOKENIZER_MODEL_NAME_TITLE = "distilroberta-base"
	MAX_TOKENS_PER_SEQ_TITLE = 128

	# Text side: tokenizer checkpoint name and per-sequence token limit.
	TOKENIZER_MODEL_NAME_TEXT = "google/bigbird-roberta-base"
	MAX_TOKENS_PER_SEQ_TEXT = 768

	# Settings kept for legacy checkpoint compatibility
	# (model_reddit_final.pth family): one shared tokenizer name plus the
	# title/text token limits.
	LEGACY_TOKENIZER_MODEL_NAME = "bert-base-uncased"
	LEGACY_MAX_TOKENS_PER_SEQ_TITLE = 128
	LEGACY_MAX_TOKENS_PER_SEQ_TEXT = 512

	from transformers import AutoTokenizer


	# Cache slot for the (title, text) tokenizer tuple; stays None until
	# get_tokenizers() runs for the first time.
	_TOKENIZERS = None
	# Cache slot for the legacy tokenizer; stays None until
	# get_legacy_tokenizer() runs for the first time.
	_LEGACY_TOKENIZER = None


	def get_tokenizers():
	    """Return the cached ``(title, text)`` tokenizer pair, loading it on first use.

	    Both tokenizers are created with ``AutoTokenizer.from_pretrained`` from
	    ``TOKENIZER_MODEL_NAME_TITLE`` and ``TOKENIZER_MODEL_NAME_TEXT`` (title
	    first) and stored in the module-level ``_TOKENIZERS`` slot, so later
	    calls return the same tuple without loading again.
	    """
	    global _TOKENIZERS
	    if _TOKENIZERS is not None:
	        return _TOKENIZERS
	    model_names = (TOKENIZER_MODEL_NAME_TITLE, TOKENIZER_MODEL_NAME_TEXT)
	    # The slot is only assigned once both loads have succeeded.
	    _TOKENIZERS = tuple(AutoTokenizer.from_pretrained(name) for name in model_names)
	    return _TOKENIZERS


	def get_legacy_tokenizer():
	    """Return the cached legacy tokenizer, loading it on first use.

	    The tokenizer is created with ``AutoTokenizer.from_pretrained`` from
	    ``LEGACY_TOKENIZER_MODEL_NAME`` and kept in the module-level
	    ``_LEGACY_TOKENIZER`` slot, so later calls return the same object.
	    """
	    global _LEGACY_TOKENIZER
	    if _LEGACY_TOKENIZER is not None:
	        return _LEGACY_TOKENIZER
	    _LEGACY_TOKENIZER = AutoTokenizer.from_pretrained(LEGACY_TOKENIZER_MODEL_NAME)
	    return _LEGACY_TOKENIZER


	# Load (or reuse) both tokenizers while this module is being imported so
	# the vocabulary sizes below exist as plain module-level constants.
	tokenizer_title, tokenizer_text = get_tokenizers()
	VOCAB_SIZE_TITLE, VOCAB_SIZE_TEXT = (
	    tokenizer_title.vocab_size,
	    tokenizer_text.vocab_size,
	)

	# Backward-compatible aliases for any legacy imports.
	# Each unsuffixed name is bound to the corresponding *_TITLE value.
	# NOTE(review): these mirror the current title settings, not the LEGACY_*
	# constants -- confirm that is what older importers expect.
	MAX_TOKENS_PER_SEQ = MAX_TOKENS_PER_SEQ_TITLE
	TOKENIZER_MODEL_NAME = TOKENIZER_MODEL_NAME_TITLE
	VOCAB_SIZE = VOCAB_SIZE_TITLE