import json
import re

import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import spaces  # provided automatically on HF Spaces

# -----------------------------
# 1. Constants
# -----------------------------
PEFT_MODEL_ID = "LlamaFactoryAI/cv-job-description-matching"
BASE_MODEL_NAME = "akjindal53244/Llama-3.1-Storm-8B"

SYSTEM_PROMPT = (
    "You analyze how well a CV matches a job description for No Skill Jobs. "
    "Education is largely irrelevant unless the job description specifies it. "
    "Your ONLY output must be a single JSON object with EXACTLY these keys: "
    "matching_analysis, description, score, recommendation, name, "
    "email_address, phone_number.\n\n"
    "Constraints:\n"
    "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
    "- description: at most 2 sentences, max 35 words total.\n"
    "- score: integer from 0 to 100.\n"
    "- recommendation: at most 2 sentences, max 35 words total.\n"
    "- name, email_address, phone_number: taken from the CV if present, else null.\n\n"
    "Very important:\n"
    "- Do NOT include the full CV or job description text.\n"
    "- Do NOT wrap the JSON in backticks or any extra text.\n"
    "- Output ONLY raw JSON, nothing before or after."
)

# -----------------------------
# 2. Download & patch adapter (CPU only, safe in main process)
# -----------------------------
print("Downloading adapter...")
adapter_path = snapshot_download(PEFT_MODEL_ID)

config_path = adapter_path + "/adapter_config.json"
with open(config_path, "r") as f:
    cfg = json.load(f)

cfg["task_type"] = "CAUSAL_LM"

with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)

print("Patched adapter_config.json → task_type = CAUSAL_LM")
print("Adapter path:", adapter_path)

# -----------------------------
# 3. Globals for lazy GPU init
# -----------------------------
tokenizer = None
model = None


def build_messages(cv: str, job_description: str):
    return [
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": f"CV:\n{cv}\n\nJob description:\n{job_description}",
        },
    ]


def extract_json_from_text(text: str):
    """
    Try to pull a JSON object out of the model's output.
    If parsing fails, wrap the raw text in a fallback JSON structure.
    """
    match = re.search(r"\{.*\}", text, flags=re.DOTALL)
    candidate = match.group(0) if match else text
    try:
        return json.loads(candidate)
    except Exception:
        return {
            "matching_analysis": [
                "Model output could not be parsed as JSON.",
            ],
            "description": text[:200],
            "score": 0,
            "recommendation": "Please try again; the model returned non-JSON output.",
        }
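

# Minimal sanity check for extract_json_from_text (a sketch, runs once at
# import time). The regex tolerates chatty preambles around the JSON object;
# the sample string below is made up, not real model output.
_demo_raw = (
    'Sure! Here is the analysis: {"matching_analysis": [], "description": "ok", '
    '"score": 80, "recommendation": "Interview.", "name": null, '
    '"email_address": null, "phone_number": null}'
)
assert extract_json_from_text(_demo_raw)["score"] == 80
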

# -----------------------------
# 4. Main inference function (GPU)
# -----------------------------
@spaces.GPU  # required for Stateless GPU (ZeroGPU) Spaces
def match_cv_job(cv: str, job_description: str):
    global tokenizer, model

    if not cv.strip() or not job_description.strip():
        return {
            "matching_analysis": ["Please provide both a CV and a job description."],
            "description": "",
            "score": 0,
            "recommendation": "Fill both text boxes and run again.",
        }

    # Lazy GPU initialization: all CUDA-related work happens ONLY here
    if tokenizer is None or model is None:
        print("Initializing tokenizer + model on GPU...")

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )

        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
        )
        base_model.config.pad_token_id = tokenizer.pad_token_id

        model_ = PeftModel.from_pretrained(
            base_model,
            adapter_path,
            device_map="auto",
        )
        model_.eval()
        torch.set_grad_enabled(False)

        model = model_
        print("Model + LoRA adapter loaded successfully on GPU.")

    messages = build_messages(cv, job_description)
    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )

    encoded = tokenizer(prompt, return_tensors="pt")
    # Move tensors to the same device as the model
    encoded = {k: v.to(model.device) for k, v in encoded.items()}

    with torch.inference_mode():
        outputs = model.generate(
            **encoded,
            max_new_tokens=256,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt
    input_len = encoded["input_ids"].shape[1]
    generated_tokens = outputs[0][input_len:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return extract_json_from_text(generated_text)


# -----------------------------
# 5. Gradio interface
# -----------------------------
cv_input = gr.Textbox(
    label="CV",
    placeholder="Paste the candidate's CV here...",
    lines=18,
)
jd_input = gr.Textbox(
    label="Job Description",
    placeholder="Paste the job description here...",
    lines=8,
)
output_json = gr.JSON(label="Matching result (JSON)")

demo = gr.Interface(
    fn=match_cv_job,
    inputs=[cv_input, jd_input],
    outputs=output_json,
    title="CV–Job Description Matching API",
    description=(
        "Paste a CV and a job description. The model returns a JSON object with "
        "`matching_analysis`, `description`, `score`, `recommendation`, and the "
        "candidate's `name`, `email_address`, and `phone_number`."
    ),
)

if __name__ == "__main__":
    demo.launch()
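
# Example client call once the Space is live (a sketch; "your-username/your-space"
# is a placeholder for the real Space ID, and "/predict" is the default endpoint
# name that gr.Interface exposes):
#
#   from gradio_client import Client
#
#   client = Client("your-username/your-space")
#   result = client.predict(
#       "CV text here...",
#       "Job description text here...",
#       api_name="/predict",
#   )
#   print(result)  # the matching result as a JSON-compatible dict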