Automated-CV-Parser / config.py
Zeqhx's picture
Dropdown: bertv1/robertav1/bertv2/robertav2 from Hub repos
e99bcfe verified
"""Central config for the CV Parser dashboard.
A model "ref" is resolved by lib/model.py as:
- a local directory (e.g. exported_models/roberta-base-ner) -> load from disk
- any other string with a "/" -> a Hugging Face Hub repo id
- None -> FALLBACK_MODEL (roberta-base + random head; flagged as demo in the UI)
On the deployed HF Space there are no local folders, so the app loads the team's
"best model" repo (PRIMARY_MODEL_ID) from the Hub. Teammates update that repo via
the password-gated Manage Model page (or by pushing to it directly), and the app
picks up the new weights — no redeploy needed.
"""
import os
# ---- Model resolution -------------------------------------------------------
# Every value below can be overridden through the environment variable named
# in its os.getenv() call; the second argument is the built-in default.

# Hugging Face account that owns the published model repos.
HF_OWNER = os.getenv("HF_OWNER", "Zeqhx")

# The four published models selectable in the UI:
#   v1 = earlier (dataset-2) models
#   v2 = latest (dataset-4) retrains
BERT_V1_ID = os.getenv("DASHBOARD_BERT_V1_ID", HF_OWNER + "/cv-parser-bert-v1")
ROBERTA_V1_ID = os.getenv(
    "DASHBOARD_ROBERTA_V1_ID", HF_OWNER + "/cv-parser-roberta-v1"
)
BERT_V2_ID = os.getenv("DASHBOARD_BERT_V2_ID", HF_OWNER + "/cv-parser-bert-v2")
ROBERTA_V2_ID = os.getenv(
    "DASHBOARD_ROBERTA_V2_ID", HF_OWNER + "/cv-parser-roberta-v2"
)

# Canonical "best model" repo: the one the app loads by default and the one
# the Manage Model page overwrites. Defaults to robertav2.
PRIMARY_MODEL_ID = os.getenv("DASHBOARD_MODEL_ID", ROBERTA_V2_ID)

# Back-compat single-ref overrides (used by load_model()'s config path).
MODEL_PATH = os.getenv("DASHBOARD_MODEL_PATH", "")
MODEL_ID = PRIMARY_MODEL_ID

# Model name and UI label used for the demo fallback entry.
FALLBACK_MODEL = "roberta-base"
DEMO_LABEL = "Demo — untrained roberta-base"

# Toggle registry: one (label, kind, ref) tuple per selectable model.
#   kind == "local" -> offered only when the folder `ref` exists (dev machines)
#   kind == "hub"   -> always offered
MODEL_REGISTRY = [
    ("bertv1", "hub", BERT_V1_ID),
    ("robertav1", "hub", ROBERTA_V1_ID),
    ("bertv2", "hub", BERT_V2_ID),
    ("robertav2", "hub", ROBERTA_V2_ID),
]
def available_models():
    """Return the selectable models as an ordered ``{label: ref}`` mapping.

    The mapping is built from ``MODEL_REGISTRY`` in three layers:

    1. ``"local"`` entries whose ``ref`` is an existing directory,
    2. every entry of any other kind (the Hub entries),
    3. the demo fallback, stored under ``DEMO_LABEL`` with ``ref=None``.

    Local entries whose directory is missing are left out. Hub entries are
    always included; a missing/private repo degrades to the demo model, which
    the UI flags.
    """
    on_disk = {
        name: target
        for name, source, target in MODEL_REGISTRY
        if source == "local" and os.path.isdir(target)
    }
    remote = {
        name: target
        for name, source, target in MODEL_REGISTRY
        if source != "local"
    }
    # Later layers win on a duplicate label while keeping its first position,
    # exactly like successive dict.update() calls.
    return {**on_disk, **remote, DEMO_LABEL: None}
# ---- Inference --------------------------------------------------------------
MAX_LENGTH = 512
# Matches the project's sliding-window preprocessing.
STRIDE = 128

# ---- Label scheme (must match training; see project README) -----------------
LABELS = [
    "O",
    "B-JOB_TITLE",
    "I-JOB_TITLE",
    "B-SKILL",
    "I-SKILL",
    "B-EDUCATION",
    "I-EDUCATION",
]
# Index <-> tag lookups; an index is the tag's position in LABELS.
ID2LABEL = dict(enumerate(LABELS))
LABEL2ID = {tag: index for index, tag in enumerate(LABELS)}

# Entity types (BIO prefix stripped), with a display colour and a
# human-readable name for each.
ENTITY_TYPES = [
    "JOB_TITLE",
    "SKILL",
    "EDUCATION",
]
ENTITY_COLORS = {
    "JOB_TITLE": "#ffb703",  # amber
    "SKILL": "#2a9d8f",  # teal
    "EDUCATION": "#4361ee",  # blue
}
ENTITY_LABELS = {
    "JOB_TITLE": "Job Title",
    "SKILL": "Skill",
    "EDUCATION": "Education",
}

# File extensions listed as supported.
SUPPORTED_EXTS = [
    ".pdf",
    ".docx",
    ".txt",
]

# Built-in sample résumé text.
SAMPLE_RESUME = """John Carter
Senior Software Engineer
Summary
Experienced Software Engineer and Team Lead with 8 years building scalable
backend systems. Skilled in Python, Java, Kubernetes, and distributed systems.
Experience
Senior Software Engineer, Acme Corp (2019 - present)
- Designed microservices using Python, FastAPI and PostgreSQL.
- Led a team of 5 engineers and introduced CI/CD with Docker and Jenkins.
Data Scientist, Globex (2016 - 2019)
- Built machine learning models with TensorFlow and scikit-learn.
Education
Master of Science in Computer Science, Stanford University (2016)
Bachelor of Engineering in Software Engineering, MIT (2014)
Skills
Python, Java, SQL, Machine Learning, Kubernetes, Docker, AWS, Leadership
"""