Spaces:

Kalana001
/

SinCode

Running

App Files Files Community

SinCode / core /english.py

Kalana

Switch to fine-tuned model (Kalana001/xlm-roberta-sinhala-sincode)

237f296 2 days ago

raw

history blame contribute delete

3.34 kB

	"""
	English vocabulary loader and cache management for code-switch detection.
	"""

	import os
	import logging
	import requests
	from typing import Set

	from core.constants import ENGLISH_CORPUS_URL, MIN_ENGLISH_LEN

	logger = logging.getLogger(__name__)

	# Hand-maintained English words that are always part of the vocabulary,
	# independent of the downloaded ~20k-word corpus. Entries are listed in
	# sorted order (upper-case first) so additions are easy to spot in diffs.
	# NOTE(review): "DM" and "PR" are stored upper-case, whereas corpus words are
	# lower-cased when loaded -- confirm that lookups account for case.
	CORE_ENGLISH_WORDS: Set[str] = {
	    "DM", "PR",
	    "app", "assignment",
	    "backlog", "branch", "bug", "build",
	    "chat", "close", "code", "comment", "complete",
	    "delete", "demo", "deploy", "deployments", "download",
	    "edit", "exam",
	    "feedback", "file", "fix",
	    "game", "github", "group",
	    "hall",
	    "install", "instagram",
	    "leaderboard", "link", "live", "log",
	    "meeting", "merge", "message", "mistakes", "mixing",
	    "online", "open",
	    "page", "play", "poll", "post", "prepared", "presentation",
	    "private", "project", "prototype", "pull", "push",
	    "redo", "reply", "rest", "results", "run",
	    "save", "selfie", "server", "share", "sincode", "singlish", "site",
	    "slide", "small", "standup", "story", "study", "submit",
	    "test", "today", "tomorrow", "transliteration", "tutorial", "type",
	    "update", "upload",
	    "view", "viva",
	}


	def _resolve_english_cache_path() -> str:
	    """
	    Resolve a writable cache path for the English corpus.

	    Hugging Face Spaces may run with constrained write locations, so the
	    first usable location in this order wins:
	    1) explicit env override (``SINCODE_ENGLISH_CACHE``, returned unchecked),
	    2) ``$HF_HOME/english_20k.txt`` when ``HF_HOME`` is set,
	    3) the current working directory,
	    4) the temp dir named by ``TMPDIR`` / ``TEMP`` (default ``/tmp``).

	    A location is usable when its parent directory can be created and the
	    file can be opened for appending. The probe never leaves a zero-byte
	    file behind: an empty placeholder would pass an ``os.path.exists``
	    check and make the corpus look cached when it was never downloaded.

	    Returns:
	        The chosen path, or the relative name ``"english_20k.txt"`` when
	        no candidate is writable.
	    """
	    override = os.getenv("SINCODE_ENGLISH_CACHE")
	    if override:
	        return override

	    filename = "english_20k.txt"
	    candidates = []
	    hf_home = os.getenv("HF_HOME")
	    if hf_home:
	        candidates.append(os.path.join(hf_home, filename))
	    candidates.append(os.path.join(os.getcwd(), filename))
	    candidates.append(
	        os.path.join(os.getenv("TMPDIR", os.getenv("TEMP", "/tmp")), filename)
	    )

	    for path in candidates:
	        parent = os.path.dirname(path) or "."
	        try:
	            os.makedirs(parent, exist_ok=True)
	            # Append mode checks writability without truncating a real cache.
	            with open(path, "a", encoding="utf-8"):
	                pass
	        except OSError:
	            continue

	        # Drop the empty placeholder the probe may have created (or one left
	        # by an earlier run) so the cache only exists once it has content.
	        try:
	            if os.path.getsize(path) == 0:
	                os.remove(path)
	        except OSError:
	            pass
	        return path

	    return filename


	# Cache location is resolved once, at import time. Unless the env override
	# is set, the resolver probes candidate locations for writability and may
	# create their parent directories as a side effect.
	ENGLISH_CORPUS_CACHE = _resolve_english_cache_path()


	def load_english_vocab() -> Set[str]:
	    """
	    Load and cache a ~20k English word list for code-switch detection.

	    The result always contains ``CORE_ENGLISH_WORDS``. Corpus words are
	    stripped, lower-cased, and kept only when at least ``MIN_ENGLISH_LEN``
	    characters long.

	    The corpus is downloaded from ``ENGLISH_CORPUS_URL`` when the cache at
	    ``ENGLISH_CORPUS_CACHE`` is missing *or empty*. A zero-byte file (for
	    example a placeholder left by a writability probe) is not a valid cache.
	    The download is written to a sibling ``.part`` file and moved into
	    place only once complete, so an interrupted write cannot leave a
	    truncated cache that later runs would trust.

	    Returns:
	        The vocabulary set. If the download fails, or the cache cannot be
	        read or decoded, a warning is logged and the words collected so far
	        are returned; no exception propagates for those failures.
	    """
	    vocab = CORE_ENGLISH_WORDS.copy()

	    try:
	        cache_ready = os.path.getsize(ENGLISH_CORPUS_CACHE) > 0
	    except OSError:
	        # Missing or unreadable cache: fall through to the download.
	        cache_ready = False

	    if not cache_ready:
	        partial_path = ENGLISH_CORPUS_CACHE + ".part"
	        try:
	            logger.info("Downloading English corpus...")
	            response = requests.get(ENGLISH_CORPUS_URL, timeout=10)
	            response.raise_for_status()
	            with open(partial_path, "wb") as f:
	                f.write(response.content)
	            # Publish the finished file in one step.
	            os.replace(partial_path, ENGLISH_CORPUS_CACHE)
	        except (requests.RequestException, OSError) as exc:
	            logger.warning("Could not download English corpus: %s", exc)
	            try:
	                os.remove(partial_path)
	            except OSError:
	                pass
	            return vocab

	    try:
	        with open(ENGLISH_CORPUS_CACHE, "r", encoding="utf-8") as f:
	            vocab.update(
	                w for line in f
	                if (w := line.strip().lower()) and len(w) >= MIN_ENGLISH_LEN
	            )
	    except (OSError, UnicodeDecodeError) as exc:
	        # UnicodeDecodeError is not an OSError; a corrupt cache must not
	        # crash the module import that triggers this load.
	        logger.warning("Could not read English corpus file: %s", exc)

	    logger.info("English vocabulary loaded: %d words", len(vocab))
	    return vocab


	# Vocabulary is built once, at import time. When the cache file is missing
	# this performs a network download (10 s timeout); if that fails, only the
	# core words are available.
	ENGLISH_VOCAB: Set[str] = load_english_vocab()