#!/usr/bin/env python3
"""Debug: Find where the 60% comes from - check taxonomy expansion"""
from app import ATSCompatibilityAnalyzer
import re
from collections import Counter
import math

analyzer = ATSCompatibilityAnalyzer()

# Deliberately mismatched fixture: a chef resume scored against an ML JD
# should produce a very low match — this script traces why it doesn't.
chef_resume = """
Chef John Smith
Executive Chef | Le Restaurant | 2015-2023
• Created award-winning French cuisine menus
• Managed kitchen staff of 20
• Sourced local organic ingredients
Skills: French cuisine, pastry, wine pairing
Education: Culinary Institute of America
"""

ml_jd = "Machine Learning Engineer with PhD, PyTorch, TensorFlow experience"

resume_lower = chef_resume.lower()
jd_lower = ml_jd.lower()

# Extract JD keywords: alphabetic tokens (>= 2 chars), stop words removed, then stemmed.
jd_words = re.findall(r'\b[a-zA-Z]{2,}\b', jd_lower)
jd_words = [analyzer._stem_word(w) for w in jd_words if w not in analyzer.stop_words]
print(f"JD Keywords (stemmed): {jd_words}")

# Same pipeline for the resume.
resume_words = re.findall(r'\b[a-zA-Z]{2,}\b', resume_lower)
resume_words_filtered = [analyzer._stem_word(w) for w in resume_words if w not in analyzer.stop_words]

# EXPAND with taxonomy — the suspected source of the inflated score.
resume_expanded = analyzer._expand_with_taxonomy(resume_words_filtered)
resume_stems = {analyzer._stem_word(w) for w in resume_expanded}
resume_raw = set(resume_lower.split())

print(f"\nResume words (before expansion): {len(resume_words_filtered)}")
print(f"Resume words (after expansion): {len(resume_expanded)}")
print(f"\nExpanded stems: {resume_stems}")

# TF weighting mirrors the analyzer: rarer JD terms get a higher weight.
jd_tf = Counter(jd_words)
max_count = max(jd_tf.values()) if jd_tf else 1
jd_weights = {word: 1 + math.log(max_count / count) for word, count in jd_tf.items()}

print(f"\n{'='*60}")
print("CHECKING EACH JD KEYWORD:")
print('='*60)

weighted_matches = 0
total_weight = 0

# Re-implements the analyzer's matching cascade so each JD keyword can be
# traced to the exact rule that matched it. Rules are tried in order of
# decreasing credit: direct stem, raw-text, containment (0.95), fuzzy
# (0.85), 4-char prefix (0.75), 3-char prefix (0.5).
for word, weight in jd_weights.items():
    total_weight += weight
    matched = False
    match_reason = ""
    match_value = 0

    if word in resume_stems:
        # Direct match in taxonomy-expanded stems.
        matched = True
        match_reason = "DIRECT in stems"
        match_value = weight
    elif word in resume_lower:
        # Substring hit anywhere in the raw resume text.
        matched = True
        match_reason = "IN resume text"
        match_value = weight
    elif any(word in rw or rw in word for rw in resume_stems if len(word) > 3 and len(rw) > 3):
        matched = True
        match_reason = "CONTAINMENT"
        match_value = weight * 0.95
        # Second pass only to report WHICH stem triggered the containment.
        for rw in resume_stems:
            if len(word) > 3 and len(rw) > 3 and (word in rw or rw in word):
                match_reason = f"CONTAINMENT: '{word}' in/contains '{rw}'"
                break
    elif any(analyzer._fuzzy_match(word, rw, 0.65) for rw in resume_stems):
        matched = True
        match_reason = "FUZZY"
        match_value = weight * 0.85
        for rw in resume_stems:
            if analyzer._fuzzy_match(word, rw, 0.65):
                match_reason = f"FUZZY: '{word}' ~ '{rw}'"
                break
    elif any(word[:4] in rw for rw in resume_raw if len(word) >= 4 and len(rw) >= 4):
        matched = True
        match_reason = "4-CHAR PREFIX"
        match_value = weight * 0.75
    elif any(word[:3] == rw[:3] for rw in resume_stems if len(word) >= 3 and len(rw) >= 3):
        matched = True
        match_reason = "3-CHAR PREFIX"
        match_value = weight * 0.5
        for rw in resume_stems:
            if len(word) >= 3 and len(rw) >= 3 and word[:3] == rw[:3]:
                match_reason = f"3-CHAR PREFIX: '{word}' -> '{rw}'"
                break

    if matched:
        weighted_matches += match_value
        print(f"✅ '{word}' -> {match_reason} = {match_value:.2f}")
    else:
        print(f"❌ '{word}' -> NO MATCH")

# Guard against a zero denominator (possible only if every JD token was a
# stop word); original code would raise ZeroDivisionError here.
raw_score = (weighted_matches / total_weight) * 100 if total_weight else 0.0

print(f"\n{'='*60}")
print(f"Total weight: {total_weight}")
print(f"Weighted matches: {weighted_matches}")
print(f"Raw score: {raw_score:.1f}%")

# Check the floor logic: low raw scores on long resumes get bumped to a
# 10%+ floor — a candidate source of the unexpectedly high final score.
resume_words_count = len(resume_words_filtered)
if raw_score < 10 and resume_words_count > 20:
    adjusted = 10 + (raw_score * 0.5)
    print(f"\nFloor logic applied (resume has {resume_words_count} words):")
    print(f" raw_score ({raw_score:.1f}) < 10 AND resume has > 20 words")
    print(f" Adjusted: 10 + ({raw_score:.1f} * 0.5) = {adjusted:.1f}%")
    raw_score = adjusted

print(f"\n📊 EXPECTED FINAL SCORE: {raw_score:.1f}%")
print(f"📊 ACTUAL SCORE FROM FUNCTION: {analyzer._calculate_tfidf_score(chef_resume, ml_jd)}%")