Spaces:

Zeqhx
/

Automated-CV-Parser

Running

App Files Files Community

Automated-CV-Parser / config.py

Zeqhx

Dropdown: bertv1/robertav1/bertv2/robertav2 from Hub repos

e99bcfe verified 1 day ago

raw

history blame contribute delete

4.38 kB

	"""Central config for the CV Parser dashboard.

	A model "ref" is resolved by lib/model.py as:
	- a local directory (e.g. exported_models/roberta-base-ner) -> load from disk
	- any other string with a "/" -> a Hugging Face Hub repo id
	- None -> FALLBACK_MODEL (roberta-base + random head; flagged as demo in the UI)

	On the deployed HF Space there are no local folders, so the app loads the team's
	"best model" repo (PRIMARY_MODEL_ID) from the Hub. Teammates update that repo via
	the password-gated Manage Model page (or by pushing to it directly), and the app
	picks up the new weights — no redeploy needed.
	"""
	import os

	# ---- Model resolution -------------------------------------------------------
	# Hugging Face account that owns the published model repos. Override it with
	# the HF_OWNER environment variable.
	HF_OWNER = os.getenv("HF_OWNER", "Zeqhx")

	# The four published models that can be picked in the UI. Each repo id has its
	# own DASHBOARD_*_ID environment override and otherwise defaults to a repo
	# under HF_OWNER.
	# v1 = earlier (dataset-2) models, v2 = latest (dataset-4) retrains.
	BERT_V1_ID = os.getenv("DASHBOARD_BERT_V1_ID", HF_OWNER + "/cv-parser-bert-v1")
	ROBERTA_V1_ID = os.getenv("DASHBOARD_ROBERTA_V1_ID", HF_OWNER + "/cv-parser-roberta-v1")
	BERT_V2_ID = os.getenv("DASHBOARD_BERT_V2_ID", HF_OWNER + "/cv-parser-bert-v2")
	ROBERTA_V2_ID = os.getenv("DASHBOARD_ROBERTA_V2_ID", HF_OWNER + "/cv-parser-roberta-v2")

	# The canonical "best model" repo: loaded by default and overwritten by the
	# Manage Model page. Defaults to robertav2 unless DASHBOARD_MODEL_ID is set.
	PRIMARY_MODEL_ID = os.getenv("DASHBOARD_MODEL_ID", ROBERTA_V2_ID)

	# Back-compat single-ref overrides (used by load_model()'s config path).
	# MODEL_PATH is the empty string when DASHBOARD_MODEL_PATH is not set.
	MODEL_PATH = os.getenv("DASHBOARD_MODEL_PATH", "")
	MODEL_ID = PRIMARY_MODEL_ID
	FALLBACK_MODEL = "roberta-base"

	# Label under which the demo/fallback entry is listed.
	DEMO_LABEL = "Demo — untrained roberta-base"

	# Toggle registry: a list of (label, kind, ref) tuples. "local" entries are
	# only offered when the folder exists (dev machines); "hub" entries are always
	# offered. All four current entries are Hub repos.
	MODEL_REGISTRY = [
	    (label, "hub", repo_id)
	    for label, repo_id in (
	        ("bertv1", BERT_V1_ID),
	        ("robertav1", ROBERTA_V1_ID),
	        ("bertv2", BERT_V2_ID),
	        ("robertav2", ROBERTA_V2_ID),
	    )
	]


	def available_models():
	    """Return the selectable models as an ordered ``{label: ref}`` mapping.

	    Registry entries of kind ``"local"`` are kept only when their ref is an
	    existing directory; entries of any other kind are kept unconditionally.
	    Local entries are placed ahead of the others, and the demo/fallback
	    entry (``DEMO_LABEL`` mapped to ``None``) is added at the end.
	    """
	    on_disk = {}
	    remote = {}
	    for name, source, target in MODEL_REGISTRY:
	        if source != "local":
	            remote[name] = target
	        elif os.path.isdir(target):
	            # Local folders are only listed when they actually exist.
	            on_disk[name] = target
	    # Merge order fixes the presentation order: local, then the rest, then demo.
	    return {**on_disk, **remote, DEMO_LABEL: None}

	# ---- Inference --------------------------------------------------------------
	MAX_LENGTH = 512
	# Matches the project's sliding-window preprocessing.
	STRIDE = 128

	# ---- Label scheme (must match training; see project README) -----------------
	# BIO tags; a tag's position in this list is its integer class id.
	LABELS = [
	    "O",
	    "B-JOB_TITLE",
	    "I-JOB_TITLE",
	    "B-SKILL",
	    "I-SKILL",
	    "B-EDUCATION",
	    "I-EDUCATION",
	]
	# Forward (id -> tag) and inverse (tag -> id) lookups derived from LABELS.
	ID2LABEL = dict(enumerate(LABELS))
	LABEL2ID = {tag: idx for idx, tag in ID2LABEL.items()}

	# Entity types (BIO prefix stripped), their display colours and
	# human-readable names.
	ENTITY_TYPES = ["JOB_TITLE", "SKILL", "EDUCATION"]
	ENTITY_COLORS = {
	    "JOB_TITLE": "#ffb703",  # amber
	    "SKILL": "#2a9d8f",  # teal
	    "EDUCATION": "#4361ee",  # blue
	}
	ENTITY_LABELS = {
	    "JOB_TITLE": "Job Title",
	    "SKILL": "Skill",
	    "EDUCATION": "Education",
	}

	# File extensions listed as supported.
	SUPPORTED_EXTS = [".pdf", ".docx", ".txt"]

	# Example CV text bundled with the config. It contains job titles, skills and
	# education entries, i.e. every entity type listed in ENTITY_TYPES.
	# NOTE(review): presumably offered as ready-made demo input in the UI — confirm
	# against the caller.
	SAMPLE_RESUME = """John Carter
	Senior Software Engineer

	Summary
	Experienced Software Engineer and Team Lead with 8 years building scalable
	backend systems. Skilled in Python, Java, Kubernetes, and distributed systems.

	Experience
	Senior Software Engineer, Acme Corp (2019 - present)
	- Designed microservices using Python, FastAPI and PostgreSQL.
	- Led a team of 5 engineers and introduced CI/CD with Docker and Jenkins.

	Data Scientist, Globex (2016 - 2019)
	- Built machine learning models with TensorFlow and scikit-learn.

	Education
	Master of Science in Computer Science, Stanford University (2016)
	Bachelor of Engineering in Software Engineering, MIT (2014)

	Skills
	Python, Java, SQL, Machine Learning, Kubernetes, Docker, AWS, Leadership
	"""