import json
import os
import re
import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import spaces  # preinstalled on Hugging Face Spaces; provides the ZeroGPU decorator used below
# -----------------------------
# 1. Constants
# -----------------------------
PEFT_MODEL_ID = "LlamaFactoryAI/cv-job-description-matching"
BASE_MODEL_NAME = "akjindal53244/Llama-3.1-Storm-8B"
SYSTEM_PROMPT = (
    "You analyze how well a CV matches a job description for No Skill Jobs. "
    "Education is not very relevant unless the job description specifies it. "
    "Your ONLY output must be a single JSON object with EXACTLY these keys: "
    "matching_analysis, description, score, recommendation, name, email_address, phone_number.\n\n"
    "Constraints:\n"
    "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
    "- description: at most 2 sentences, max 35 words total.\n"
    "- score: integer from 0 to 100.\n"
    "- recommendation: at most 2 sentences, max 35 words total.\n"
    "- name, email_address, phone_number: taken from the CV, or null if absent.\n\n"
    "Very important:\n"
    "- Do NOT include the full CV or job description text.\n"
    "- Do NOT wrap the JSON in backticks or any extra text.\n"
    "- Output ONLY raw JSON, nothing before or after."
)
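# For reference, an illustrative (entirely made-up) example of the shape the
# prompt asks for; placeholder values, not real model output:
#
#   {
#     "matching_analysis": ["Has warehouse experience", "Available for night shifts"],
#     "description": "Candidate meets the role's basic requirements.",
#     "score": 78,
#     "recommendation": "Proceed to a phone screen.",
#     "name": "Jane Doe",
#     "email_address": "jane@example.com",
#     "phone_number": "+1-555-0100"
#   }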
# -----------------------------
# 2. Download & patch adapter (CPU only, safe in main process)
# -----------------------------
print("Downloading adapter...")
adapter_path = snapshot_download(PEFT_MODEL_ID)
config_path = os.path.join(adapter_path, "adapter_config.json")
with open(config_path, "r") as f:
    cfg = json.load(f)
# Ensure PEFT loads this adapter as a causal-LM adapter.
cfg["task_type"] = "CAUSAL_LM"
with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)
print("Patched adapter_config.json → task_type = CAUSAL_LM")
print("Adapter path:", adapter_path)
# -----------------------------
# 3. Globals for lazy GPU init
# -----------------------------
tokenizer = None
model = None
def build_messages(cv: str, job_description: str):
    return [
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": f"CV:\n{cv}\n\nJob description:\n{job_description}",
        },
    ]
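# Illustrative result (assuming the two short placeholder strings below):
#
#   build_messages("Jane Doe, cashier...", "Cashier needed...")
#   -> [
#        {"role": "system", "content": SYSTEM_PROMPT},
#        {"role": "user",
#         "content": "CV:\nJane Doe, cashier...\n\nJob description:\nCashier needed..."},
#      ]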
def extract_json_from_text(text: str):
"""
Try to pull a JSON object out of the model's output.
If it fails, wrap the raw text in a fallback JSON structure.
"""
    # Greedy DOTALL match: grabs from the first "{" to the last "}" in the text.
    match = re.search(r"\{.*\}", text, flags=re.DOTALL)
    candidate = match.group(0) if match else text
try:
return json.loads(candidate)
except Exception:
return {
"matching_analysis": [
"Model output could not be parsed as JSON.",
],
"description": text[:200],
"score": 0,
"recommendation": "Please try again; the model returned non-JSON output.",
}
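# Illustrative behavior (made-up inputs):
#
#   extract_json_from_text('Sure! {"score": 90}')  -> {"score": 90}
#   extract_json_from_text("no json here")         -> fallback dict with score 0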
# -----------------------------
# 4. Main inference function (GPU)
# -----------------------------
@spaces.GPU  # required on ZeroGPU ("stateless GPU") Spaces: allocates a GPU for the duration of this call
def match_cv_job(cv: str, job_description: str):
global tokenizer, model
if not cv.strip() or not job_description.strip():
return {
"matching_analysis": ["Please provide both a CV and a job description."],
"description": "",
"score": 0,
"recommendation": "Fill both text boxes and run again.",
}
# Lazy GPU initialization: all CUDA-related stuff happens ONLY here
if tokenizer is None or model is None:
print("Initializing tokenizer + model on GPU...")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",  # NF4 is the usual choice for 4-bit LoRA inference
            bnb_4bit_use_double_quant=True,
        )
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL_NAME,
quantization_config=bnb_config,
device_map="auto",
)
base_model.config.pad_token_id = tokenizer.pad_token_id
        # Build into a local variable first so the global `model` is only
        # assigned once the adapter is fully loaded and in eval mode.
        model_ = PeftModel.from_pretrained(
            base_model,
            adapter_path,
            device_map="auto",
        )
        model_.eval()
        torch.set_grad_enabled(False)
        model = model_
print("Model + LoRA adapter loaded successfully on GPU.")
messages = build_messages(cv, job_description)
prompt = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=False,
)
encoded = tokenizer(prompt, return_tensors="pt")
# Move tensors to the same device as the model
encoded = {k: v.to(model.device) for k, v in encoded.items()}
    with torch.inference_mode():
        outputs = model.generate(
            **encoded,
            max_new_tokens=256,
            do_sample=False,  # greedy decoding keeps the JSON output deterministic
            pad_token_id=tokenizer.pad_token_id,
        )
input_len = encoded["input_ids"].shape[1]
generated_tokens = outputs[0][input_len:]
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
result = extract_json_from_text(generated_text)
return result
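# -----------------------------
# Example client call (sketch)
# -----------------------------
# Once deployed, the Interface below is also callable programmatically via
# gradio_client. The Space id here is a placeholder (an assumption); replace
# it with your actual "username/space-name":
#
#   from gradio_client import Client
#   client = Client("your-username/cv-jd-matching")  # hypothetical Space id
#   result = client.predict("CV text...", "Job description text...", api_name="/predict")
#   print(result)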
# -----------------------------
# 5. Gradio interface
# -----------------------------
cv_input = gr.Textbox(
label="CV",
placeholder="Paste the candidate's CV here...",
lines=18,
)
jd_input = gr.Textbox(
label="Job Description",
placeholder="Paste the job description here...",
lines=8,
)
output_json = gr.JSON(label="Matching result (JSON)")
demo = gr.Interface(
fn=match_cv_job,
inputs=[cv_input, jd_input],
outputs=output_json,
title="CV–Job Description Matching API",
    description=(
        "Paste a CV and a job description. The model returns a JSON object with "
        "`matching_analysis`, `description`, `score`, `recommendation`, and the "
        "candidate's `name`, `email_address`, and `phone_number`."
    ),
)
if __name__ == "__main__":
demo.launch()