Spaces:

WizardCoder2007
/

social_media_analyzer

Running

social_media_analyzer / src /language_detection.py

update

bbd259b 14 days ago

978 Bytes

	from langdetect import detect_langs, DetectorFactory

	# Enforce determinism: pin langdetect's seed once, at import time, so that
	# detect_langs() gives the same answer for the same text on every run.
	# (langdetect's README documents that detection is otherwise
	# non-deterministic for short or ambiguous text.)
	DetectorFactory.seed = 0

	def detect_language(text: str):
	    """
	    Detect the language of a Reddit comment.

	    Detection runs in two stages:

	    1. English stopword heuristic: if any whitespace-separated token
	       (lower-cased, with surrounding punctuation removed) is a common
	       English stopword, the text is reported as English with
	       probability 1.0. This exists to rescue short comments that the
	       statistical detector misclassifies.
	    2. Statistical detection via langdetect's ``detect_langs``; the first
	       candidate it returns is reported.

	    NOTE: the stopword heuristic is applied to text of any length, not only
	    short text, and several of the stopwords ("in", "on", "has", "is", ...)
	    are also words in other languages, so non-English text containing one
	    of them is reported as English.

	    Args:
	        text: The comment text to classify.

	    Returns:
	        A ``(language_code, probability)`` tuple. ``("en", 1.0)`` when the
	        stopword heuristic fires, ``("unknown", 0.0)`` when the input is
	        not a string, is blank, or statistical detection fails.
	    """
	    import string

	    # Missing or non-text values (None, NaN from a dataframe, ...) and blank
	    # strings carry no language signal; report them as unknown instead of
	    # crashing on text.lower() below.
	    if not isinstance(text, str) or not text.strip():
	        return "unknown", 0.0

	    # 1. Heuristic: Common English Stopwords
	    # Solves short-text issues like "India has deep flaws" being detected as Spanish
	    english_stopwords = {"the", "is", "are", "and", "of", "to", "in", "it", "has", "have", "for", "on", "with"}

	    # Strip leading/trailing punctuation from each token so that "it!",
	    # "(the" or "is," still count as stopwords.
	    words = {token.strip(string.punctuation) for token in text.lower().split()}

	    # If intersection is non-empty, high confidence it's English
	    if words & english_stopwords:
	        return "en", 1.0

	    # 2. Statistical Detection (langdetect)
	    try:
	        # returns list of [Language(lang, prob), ...]
	        langs = detect_langs(text)
	        best = langs[0]
	        return best.lang, best.prob
	    except Exception:
	        # Deliberate best-effort fallback: any detector failure (e.g. text
	        # with no usable features, such as purely numeric input) is
	        # reported as unknown rather than propagated to the caller.
	        return "unknown", 0.0