import pymupdf as fitz
import re
from difflib import get_close_matches

def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path handed to ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for each page, joined with no
        separator between pages.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the file handle is never leaked on the error path.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

# Canonical skill names; detected skills are always reported with these
# spellings and, for direct mentions, in this order.
_VALID_SKILLS = (
    'FastAPI', 'React', 'Next.js', 'Flask', 'MongoDB', 'Tailwind CSS',
    'Machine Learning', 'Python', 'JavaScript', 'HTML', 'CSS', 'Node.js',
    'Docker', 'Kubernetes', 'AWS', 'Git', 'GitHub', 'TensorFlow', 'PyTorch',
    'Streamlit', 'Qdrant', 'LangChain', 'Gemini API', 'OpenAI', 'Gradio',
    'Pandas', 'NumPy', 'Scikit-learn', 'OpenCV', 'Django', 'Vue.js',
    'Angular', 'TypeScript', 'PostgreSQL', 'MySQL', 'Redis', 'GraphQL',
    'RESTful API', 'Microservices', 'CI/CD', 'Linux', 'Ubuntu', 'Nginx',
    'Apache', 'Jenkins', 'Terraform', 'Ansible', 'Elasticsearch',
)

# Maximum number of skills reported for one resume.
_MAX_SKILLS = 12

# One matcher per skill, applied to lower-cased text. The lookarounds require
# the skill to stand alone as a token (so "digital" is not Git and "laws" is
# not AWS); the optional "s" still accepts simple plurals ("RESTful APIs").
_SKILL_MENTION_PATTERNS = tuple(
    (skill, re.compile(rf'(?<!\w){re.escape(skill.lower())}s?(?!\w)'))
    for skill in _VALID_SKILLS
)

# Phrases that usually introduce a technology; the group captures the text up
# to the first '.', ',', ';' or newline.
_SKILL_CONTEXT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE | re.DOTALL)
    for pattern in (
        r'built with (.*?)(?:\.|,|;|\n)',
        r'using (.*?)(?:\.|,|;|\n)',
        r'technologies?:?\s*(.*?)(?:\.|,|;|\n)',
        r'skills?:?\s*(.*?)(?:\.|,|;|\n)',
        r'stack:?\s*(.*?)(?:\.|,|;|\n)',
    )
)

# Separators between words inside a captured phrase. The connective "and" is
# filtered out as a whole word later; it must not be put inside this character
# class, or the individual letters a/n/d would act as separators.
_TOKEN_SPLIT_RE = re.compile(r'[,.\s&]+')

# The TLD class is letters only (a '|' inside [...] is a literal pipe).
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Ten-digit number starting with 6-9, optionally prefixed by +91 or 91.
# A leading \b cannot match in front of "+" after whitespace, which dropped
# the "+"; so "+91" is accepted anywhere, while a bare number only has to not
# continue a preceding word or digit run.
_PHONE_RE = re.compile(r'(?:\+91|(?<!\w)(?:91)?)[6-9]\d{9}\b')

# A name line is written entirely in capital letters and whitespace.
_NAME_RE = re.compile(r'^[A-Z][A-Z\s]+$')
_NAME_SKIP_KEYWORDS = ('course', 'email', 'mobile', 'cgpa', 'academic', 'details')
_NAME_SCAN_LINES = 10

_EXPERIENCE_PATTERNS = (
    re.compile(r'(\d+)\+?\s*years?\s*(?:of\s*)?experience'),
    re.compile(r'experience\s*:?\s*(\d+)\+?\s*years?'),
)

# Whole-word "intern"/"internship" etc., so "internet" and "internal" are not hits.
_INTERN_RE = re.compile(r'\bintern(?:s|ed|ing|ships?)?\b')


def _extract_name(lines: list) -> str:
    """Return the first all-caps, multi-word line near the top, title-cased."""
    for line in lines[:_NAME_SCAN_LINES]:
        lowered = line.lower()
        # Header lines such as "EMAIL ..." or "ACADEMIC DETAILS" are not names.
        if any(keyword in lowered for keyword in _NAME_SKIP_KEYWORDS):
            continue
        if _NAME_RE.match(line) and len(line.split()) >= 2:
            return line.title()
    return ""


def _extract_skills(text: str, text_lower: str) -> list:
    """Return up to ``_MAX_SKILLS`` canonical skill names found in the text."""
    # Direct mentions, in the order of _VALID_SKILLS.
    found = [
        skill
        for skill, pattern in _SKILL_MENTION_PATTERNS
        if pattern.search(text_lower)
    ]

    # Fuzzy pass: words that follow "skills:", "using", ... are matched against
    # the canonical names so near-miss spellings are still recognised.
    for pattern in _SKILL_CONTEXT_PATTERNS:
        for phrase in pattern.findall(text):
            for token in _TOKEN_SPLIT_RE.split(phrase.strip()):
                if len(token) <= 2 or token.lower() == 'and':
                    continue
                close = get_close_matches(token, _VALID_SKILLS, n=1, cutoff=0.7)
                if close:
                    found.append(close[0])

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result (and which skills survive the cap) is the same on every run;
    # a set would make both depend on string hash randomisation.
    return list(dict.fromkeys(found))[:_MAX_SKILLS]


def _extract_experience(text_lower: str) -> str:
    """Return "<n> years" when stated, otherwise a student/fresher label."""
    for pattern in _EXPERIENCE_PATTERNS:
        match = pattern.search(text_lower)
        if match:
            return f"{match.group(1)} years"

    if _INTERN_RE.search(text_lower) and 'b.tech' in text_lower:
        return "0-1 years (Student/Intern)"
    return "Fresher"


def parse_resume_text(text: str) -> dict:
    """Extract basic candidate fields from the plain text of a resume.

    Args:
        text: Resume text; may be empty.

    Returns:
        A dict with these keys:
            "name": first all-caps line of at least two words among the first
                ten non-empty lines, title-cased; "" when none is found.
            "email": first e-mail address in the text, or "".
            "phone": first ten-digit number starting with 6-9, including an
                optional "+91"/"91" prefix, or "".
            "skills": at most 12 names from the built-in skill list, without
                duplicates, in a deterministic first-seen order.
            "experience": "<n> years" when an "<n> years ... experience"
                phrase is present, else "0-1 years (Student/Intern)" when the
                text mentions both an internship and "b.tech", else "Fresher".
    """
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    text_lower = text.lower()

    email_match = _EMAIL_RE.search(text)
    phone_match = _PHONE_RE.search(text)

    return {
        "name": _extract_name(lines),
        "email": email_match.group() if email_match else "",
        "phone": phone_match.group() if phone_match else "",
        "skills": _extract_skills(text, text_lower),
        "experience": _extract_experience(text_lower),
    }