import re

import joblib
import numpy as np
import pandas as pd
import pdfplumber
import fitz  # PyMuPDF
from huggingface_hub import hf_hub_download

# =========================================================
# 1. TEXT CLEANING AND FEATURE-EXTRACTION HELPERS
#    (must stay byte-for-byte consistent with the training pipeline,
#    otherwise inference features will not match the fitted model)
# =========================================================


def clean_text(text):
    """Lowercase, strip HTML tags, drop non-letters, collapse whitespace.

    Args:
        text: Raw text; NaN/None is treated as empty.

    Returns:
        Cleaned string containing only lowercase letters and single spaces.
    """
    if pd.isna(text):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags (non-greedy so adjacent tags are removed separately)
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and digits (keep letters and whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse runs of whitespace into single spaces
    text = ' '.join(text.split())
    return text


def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file, trying several backends for robustness.

    Order of attempts: pdfplumber -> PyPDF2 (if installed) -> PyMuPDF.
    The first backend that yields non-empty text wins; each failure is
    printed and the next backend is tried.

    Args:
        pdf_path: Filesystem path to the PDF.

    Returns:
        Extracted text, or the sentinel string
        "Unable to extract text from PDF" when every backend fails.
    """
    text = ""
    try:
        # Method 1: pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"pdfplumber failed for {pdf_path}: {e}")

    try:
        # Method 2: PyPDF2.
        # BUG FIX: PyPDF2 was referenced but never imported, so this branch
        # always died with NameError. Import it lazily here: if the package
        # is missing, the ImportError is caught by the handler below and we
        # fall through to PyMuPDF, preserving the best-effort behaviour.
        import PyPDF2
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"PyPDF2 failed for {pdf_path}: {e}")

    try:
        # Method 3: PyMuPDF (fitz) - most robust
        doc = fitz.open(pdf_path)
        for page in doc:
            page_text = page.get_text()
            if page_text:
                text += page_text + "\n"
        doc.close()
        if text.strip():
            return text
    except Exception as e:
        print(f"PyMuPDF failed for {pdf_path}: {e}")

    return text if text.strip() else "Unable to extract text from PDF"


def extract_skills(text):
    """Return a comma-separated list of known skill keywords found in text.

    NOTE(review): matching is plain substring, so e.g. 'java' also matches
    inside 'javascript'. Left as-is deliberately — the fitted model was
    trained on features produced by this exact logic, and changing it here
    would cause train/serve skew. Confirm against the training code before
    tightening.
    """
    skill_keywords = [
        'python', 'java', 'javascript', 'sql', 'html', 'css', 'react', 'angular',
        'machine learning', 'data analysis', 'excel', 'powerbi', 'tableau',
        'project management', 'communication', 'leadership', 'teamwork',
        'problem solving', 'analytical', 'creative', 'organizational',
        'aws', 'azure', 'docker', 'kubernetes', 'git', 'linux',
        'tensorflow', 'pytorch', 'pandas', 'numpy', 'sklearn'
    ]
    text_lower = text.lower()
    found_skills = []
    for skill in skill_keywords:
        if skill in text_lower:
            found_skills.append(skill)
    return ', '.join(found_skills)


def extract_experience_years(text):
    """Extract the first stated number of years of experience, or 0.

    Recognizes phrasings like "5 years of experience", "5+ yrs",
    "over 5 years", "more than 5 years" (case-insensitive).
    """
    experience_patterns = [
        r'(\d+)\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)',
        r'(\d+)\+\s*(?:years?|yrs?)',
        r'over\s*(\d+)\s*(?:years?|yrs?)',
        r'more\s*than\s*(\d+)\s*(?:years?|yrs?)'
    ]
    text_lower = text.lower()
    for pattern in experience_patterns:
        match = re.search(pattern, text_lower)
        if match:
            return int(match.group(1))
    return 0


def extract_education_level(text):
    """Return the highest education level mentioned, as an ordinal 0-4.

    4 = PhD, 3 = Master's, 2 = Bachelor's, 1 = diploma/certificate,
    0 = nothing recognized.

    NOTE(review): substring matching means short keys like 'ms', 'ma',
    'ba', 'bs' fire inside ordinary words ('systems', 'teams', ...).
    Left unchanged to stay consistent with the training features —
    confirm against the training code before adding word boundaries.
    """
    education_keywords = {
        'phd': 4, 'doctorate': 4, 'doctoral': 4,
        'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
        'bachelor': 2, 'degree': 2, 'ba': 2, 'bs': 2,
        'diploma': 1, 'certificate': 1
    }
    text_lower = text.lower()
    max_level = 0
    for keyword, level in education_keywords.items():
        if keyword in text_lower:
            max_level = max(max_level, level)
    return max_level


def count_technical_terms(text):
    """Count total occurrences (with repeats) of known technical terms."""
    technical_terms = [
        'algorithm', 'database', 'software', 'development', 'programming',
        'analysis', 'system', 'design', 'implementation', 'optimization',
        'automation', 'testing', 'debugging', 'framework', 'api'
    ]
    text_lower = text.lower()
    count = 0
    for term in technical_terms:
        # str.count tallies overlapping mentions across the whole text
        count += text_lower.count(term)
    return count
# =========================================================
# 2. LOAD MODEL + PREPROCESSING ARTIFACTS
# =========================================================

HF_MODEL_REPO = "vvirothi/resume-parser-ml-model"  # <- change to your real repo
MODEL_FILENAME = "resume_parser_pipeline.joblib"


def load_artifacts():
    """
    Download the latest model pipeline from Hugging Face Hub
    and load it with joblib.

    Returns:
        The unpickled artifacts dict; the rest of this module reads the
        keys "model", "label_encoder", "tfidf", "scaler", "numerical_cols".
    """
    local_path = hf_hub_download(
        repo_id=HF_MODEL_REPO,
        filename=MODEL_FILENAME
    )
    # NOTE(review): joblib.load unpickles arbitrary code — only point
    # HF_MODEL_REPO at a repository you trust.
    artifacts = joblib.load(local_path)
    return artifacts


# =========================================================
# 3. FEATURE PREPARATION
# =========================================================

def prepare_features_from_text(raw_text: str, artifacts: dict):
    """
    Converts raw resume text into the exact same feature vector
    used during training.

    TF-IDF features are computed on the cleaned text; numerical features
    are computed on the raw text. NOTE(review): assumes this mirrors the
    training pipeline — confirm against the training code.

    Args:
        raw_text: Resume text as extracted from the PDF.
        artifacts: Dict holding the fitted "tfidf" vectorizer, "scaler",
            and "numerical_cols" (the feature order used at fit time).

    Returns:
        2-D numpy array of shape (1, n_text_features + n_numeric_features).
    """
    tfidf = artifacts["tfidf"]
    scaler = artifacts["scaler"]
    numerical_cols = artifacts["numerical_cols"]

    # 1. Clean text for the TF-IDF vectorizer
    clean = clean_text(raw_text)

    # 2. Extract numerical features.
    #    (A dead `skills = extract_skills(raw_text)` call was removed here:
    #    its result was never used in the feature vector.)
    num_data = {
        'Experience_Years': extract_experience_years(raw_text),
        'Education_Level': extract_education_level(raw_text),
        'Technical_Terms_Count': count_technical_terms(raw_text),
        'Resume_Length': len(raw_text),
        'Word_Count': len(raw_text.split()),
    }

    # 3. TF-IDF text features (densified so they can be hstacked below)
    text_vec = tfidf.transform([clean]).toarray()

    # 4. Numerical features, scaled in the exact training column order
    num_vec_raw = np.array([[num_data[col] for col in numerical_cols]])
    num_vec_scaled = scaler.transform(num_vec_raw)

    # 5. Combine text and numeric parts into the final input row
    X_final = np.hstack([text_vec, num_vec_scaled])
    return X_final
# =========================================================
# 4. MAIN PREDICTION FUNCTION
# =========================================================

def predict_from_pdf(pdf_path: str):
    """Predict the resume category for a single PDF.

    Args:
        pdf_path: Path to the resume PDF file.

    Returns:
        Tuple of (predicted_label, proba_dict, raw_text):
          - predicted_label: decoded class label for the resume;
          - proba_dict: mapping of class label -> probability, or None
            when the model does not implement predict_proba;
          - raw_text: the text extracted from the PDF.
    """
    # Load artifacts from HuggingFace model repo (re-downloaded each call
    # so the latest published model is always used)
    artifacts = load_artifacts()
    model = artifacts["model"]
    label_encoder = artifacts["label_encoder"]

    # Extract text
    raw_text = extract_text_from_pdf(pdf_path)

    # Prepare features
    X_input = prepare_features_from_text(raw_text, artifacts)

    # Predict
    y_pred_encoded = model.predict(X_input)
    predicted_label = label_encoder.inverse_transform(y_pred_encoded)[0]

    # Predict probabilities (if supported)
    proba_dict = None
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_input)[0]
        # BUG FIX: predict_proba columns are ordered by model.classes_,
        # which equals range(n) only if the model saw every encoded class
        # during fit. Map through model.classes_ so each probability is
        # paired with the correct decoded label.
        classes = label_encoder.inverse_transform(model.classes_)
        proba_dict = {cls: float(prob) for cls, prob in zip(classes, probs)}

    return predicted_label, proba_dict, raw_text