import json
import re

import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import spaces  # provided automatically on HF Spaces

# -----------------------------
# 1. Constants
# -----------------------------
PEFT_MODEL_ID = "LlamaFactoryAI/cv-job-description-matching"
BASE_MODEL_NAME = "akjindal53244/Llama-3.1-Storm-8B"

SYSTEM_PROMPT = (
    "You analyze how well a CV matches a job description for No Skill Jobs. "
    "Education is largely irrelevant unless the job description specifies it. "
    "Your ONLY output must be a single JSON object with EXACTLY these keys: "
    "matching_analysis, description, score, recommendation, name, "
    "email_address, phone_number.\n\n"
    "Constraints:\n"
    "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
    "- description: at most 2 sentences, max 35 words total.\n"
    "- score: integer from 0 to 100.\n"
    "- recommendation: at most 2 sentences, max 35 words total.\n"
    "- name, email_address, phone_number: taken from the CV if present, else null.\n\n"
    "Very important:\n"
    "- Do NOT include the full CV or job description text.\n"
    "- Do NOT wrap the JSON in backticks or any extra text.\n"
    "- Output ONLY raw JSON, nothing before or after."
)

# -----------------------------
# 2. Download & patch adapter (CPU only, safe in main process)
# -----------------------------
print("Downloading adapter...")
adapter_path = snapshot_download(PEFT_MODEL_ID)

config_path = adapter_path + "/adapter_config.json"
with open(config_path, "r") as f:
    cfg = json.load(f)

cfg["task_type"] = "CAUSAL_LM"

with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)

print("Patched adapter_config.json → task_type = CAUSAL_LM")
print("Adapter path:", adapter_path)

# -----------------------------
# 3. Globals for lazy GPU init
# -----------------------------
tokenizer = None
model = None


def build_messages(cv: str, job_description: str):
    return [
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": f"CV:\n{cv}\n\nJob description:\n{job_description}",
        },
    ]


def extract_json_from_text(text: str):
    """
    Try to pull a JSON object out of the model's output.
    If parsing fails, wrap the raw text in a fallback JSON structure.
    """
    match = re.search(r"\{.*\}", text, flags=re.DOTALL)
    candidate = match.group(0) if match else text
    try:
        return json.loads(candidate)
    except Exception:
        return {
            "matching_analysis": [
                "Model output could not be parsed as JSON.",
            ],
            "description": text[:200],
            "score": 0,
            "recommendation": "Please try again; the model returned non-JSON output.",
        }
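

# Minimal sanity check for extract_json_from_text (a sketch, runs once at
# import time). The regex tolerates chatty preambles around the JSON object;
# the sample string below is made up, not real model output.
_demo_raw = (
    'Sure! Here is the analysis: {"matching_analysis": [], "description": "ok", '
    '"score": 80, "recommendation": "Interview.", "name": null, '
    '"email_address": null, "phone_number": null}'
)
assert extract_json_from_text(_demo_raw)["score"] == 80
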

# -----------------------------
# 4. Main inference function (GPU)
# -----------------------------
@spaces.GPU  # required for Stateless GPU (ZeroGPU) Spaces
def match_cv_job(cv: str, job_description: str):
    global tokenizer, model

    if not cv.strip() or not job_description.strip():
        return {
            "matching_analysis": ["Please provide both a CV and a job description."],
            "description": "",
            "score": 0,
            "recommendation": "Fill both text boxes and run again.",
        }

    # Lazy GPU initialization: all CUDA-related work happens ONLY here
    if tokenizer is None or model is None:
        print("Initializing tokenizer + model on GPU...")

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )

        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
        )
        base_model.config.pad_token_id = tokenizer.pad_token_id

        model_ = PeftModel.from_pretrained(
            base_model,
            adapter_path,
            device_map="auto",
        )
        model_.eval()
        torch.set_grad_enabled(False)

        model = model_
        print("Model + LoRA adapter loaded successfully on GPU.")

    messages = build_messages(cv, job_description)
    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )

    encoded = tokenizer(prompt, return_tensors="pt")
    # Move tensors to the same device as the model
    encoded = {k: v.to(model.device) for k, v in encoded.items()}

    with torch.inference_mode():
        outputs = model.generate(
            **encoded,
            max_new_tokens=256,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt
    input_len = encoded["input_ids"].shape[1]
    generated_tokens = outputs[0][input_len:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return extract_json_from_text(generated_text)


# -----------------------------
# 5. Gradio interface
# -----------------------------
cv_input = gr.Textbox(
    label="CV",
    placeholder="Paste the candidate's CV here...",
    lines=18,
)
jd_input = gr.Textbox(
    label="Job Description",
    placeholder="Paste the job description here...",
    lines=8,
)
output_json = gr.JSON(label="Matching result (JSON)")

demo = gr.Interface(
    fn=match_cv_job,
    inputs=[cv_input, jd_input],
    outputs=output_json,
    title="CV–Job Description Matching API",
    description=(
        "Paste a CV and a job description. The model returns a JSON object with "
        "`matching_analysis`, `description`, `score`, `recommendation`, and the "
        "candidate's `name`, `email_address`, and `phone_number`."
    ),
)

if __name__ == "__main__":
    demo.launch()
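
# Example client call once the Space is live (a sketch; "your-username/your-space"
# is a placeholder for the real Space ID, and "/predict" is the default endpoint
# name that gr.Interface exposes):
#
#   from gradio_client import Client
#
#   client = Client("your-username/your-space")
#   result = client.predict(
#       "CV text here...",
#       "Job description text here...",
#       api_name="/predict",
#   )
#   print(result)  # the matching result as a JSON-compatible dict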