|
|
from langdetect import detect_langs, DetectorFactory |
|
|
|
|
|
|
|
|
DetectorFactory.seed = 0 |
|
|
|
|
|
def detect_language(text: str) -> tuple[str, float]:
    """
    Robust language detection for Reddit comments.

    Prioritizes English for short text if common stopwords are found,
    since langdetect is unreliable on very short inputs.

    Args:
        text: The comment text to classify.

    Returns:
        A ``(language_code, probability)`` tuple. ``("en", 1.0)`` when an
        English stopword is present, ``("unknown", 0.0)`` when the text is
        empty or detection fails.
    """
    english_stopwords = {"the", "is", "are", "and", "of", "to", "in", "it", "has", "have", "for", "on", "with"}

    # Empty or whitespace-only text cannot be classified; bail out early
    # instead of letting detect_langs raise and be swallowed below.
    if not text.strip():
        return "unknown", 0.0

    # Strip surrounding punctuation so tokens like "it!" or "is," still
    # match the stopword list (a bare .split() keeps punctuation attached).
    words = {w.strip(".,!?;:'\"()[]") for w in text.lower().split()}

    if words & english_stopwords:
        return "en", 1.0

    try:
        langs = detect_langs(text)
        best = langs[0]
        return best.lang, best.prob
    except Exception:
        # langdetect raises LangDetectException on inputs it cannot handle
        # (e.g. digits-only or emoji-only text); report those as unknown.
        return "unknown", 0.0
|
|
|
|
|
|