"""
Resume Parser Module
Handles PDF, DOCX and plain-text (.txt) parsing and text extraction from resumes.
"""

import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional

import pdfplumber
import PyPDF2
from docx import Document

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class ResumeParser:
    """
    Resume parser that extracts text and basic contact metadata from PDF,
    DOCX and plain-text files.

    Typical use is ``ResumeParser().parse_file(path)``. The individual
    ``parse_*``, ``clean_text``, ``extract_metadata`` and ``extract_sections``
    methods can also be called on their own.
    """

    # pdfplumber output with at most this many non-blank characters is treated
    # as a failed extraction and triggers the PyPDF2 fallback.
    MIN_PDF_TEXT_CHARS = 50

    _WHITESPACE_RE = re.compile(r'\s+')
    # Everything except word characters, whitespace and the listed punctuation
    # is dropped. '+', '#', '&' and '%' are kept so that tokens such as
    # "C++", "C#", "R&D" and "100%" survive cleaning.
    _DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.,;:\-()@/+#&%]')

    _EMAIL_RE = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )
    # Only non-capturing groups, so findall() yields the whole number rather
    # than just the optional country-code group. The digit look-arounds stop
    # the pattern from matching a slice of a longer digit run.
    _PHONE_RE = re.compile(
        r'(?<!\d)(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)'
    )
    _LINKEDIN_RE = re.compile(r'linkedin\.com/in/[\w-]+')
    _GITHUB_RE = re.compile(r'github\.com/[\w-]+')

    # Section header keywords. The leading \b keeps a keyword from matching in
    # the middle of a word ("subprojects"); the trailing edge is deliberately
    # left open so variants such as "Educational Background" still match.
    _SECTION_PATTERNS = {
        'summary': re.compile(
            r'\b(?:professional\s+summary|summary|profile|objective)',
            re.IGNORECASE),
        'experience': re.compile(
            r'\b(?:work\s+experience|experience|employment\s+history'
            r'|professional\s+experience)',
            re.IGNORECASE),
        'education': re.compile(
            r'\b(?:education|academic\s+background|qualifications)',
            re.IGNORECASE),
        'skills': re.compile(
            r'\b(?:skills|technical\s+skills|core\s+competencies|expertise)',
            re.IGNORECASE),
        'projects': re.compile(r'\b(?:projects|portfolio)', re.IGNORECASE),
        'certifications': re.compile(
            r'\b(?:certifications|certificates|licenses)', re.IGNORECASE),
    }

    def __init__(self):
        self.supported_formats = ['.pdf', '.docx', '.txt']
        logger.info("ResumeParser initialized")

    def parse_pdf(self, file_path: str) -> str:
        """
        Extract text from a PDF, trying pdfplumber first and PyPDF2 second.

        PyPDF2 is tried when pdfplumber raises or returns no more than
        ``MIN_PDF_TEXT_CHARS`` non-blank characters. When both extractors
        produce text, the one with more non-blank content is returned.

        Args:
            file_path: Path to PDF file

        Returns:
            Extracted text content

        Raises:
            ValueError: If no text could be obtained because extraction failed.
        """
        primary = ""
        try:
            # pdfplumber first (better for complex layouts)
            primary = self._parse_with_pdfplumber(file_path) or ""
        except Exception as e:
            # Broad on purpose: any pdfplumber failure should still give the
            # fallback extractor a chance.
            logger.warning(f"pdfplumber failed, falling back to PyPDF2: {e}")

        if len(primary.strip()) > self.MIN_PDF_TEXT_CHARS:
            return primary

        try:
            fallback = self._parse_with_pypdf2(file_path) or ""
        except Exception as e:
            if primary.strip():
                # A little text from pdfplumber beats failing outright.
                logger.warning(
                    f"PyPDF2 fallback failed, using pdfplumber text: {e}"
                )
                return primary
            logger.error(f"Error parsing PDF: {e}")
            raise ValueError(f"Failed to parse PDF: {str(e)}") from e

        # Never throw away the (short) pdfplumber text for an emptier fallback.
        if len(fallback.strip()) >= len(primary.strip()):
            return fallback
        return primary

    def _parse_with_pdfplumber(self, file_path: str) -> str:
        """Return the text of every page, newline-joined, using pdfplumber."""
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages that yield no text (None or "") are skipped.
        return "\n".join(t for t in page_texts if t)

    def _parse_with_pypdf2(self, file_path: str) -> str:
        """Return the text of every page, newline-joined, using PyPDF2."""
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            page_texts = [page.extract_text() for page in pdf_reader.pages]
        # Pages that yield no text (None or "") are skipped.
        return "\n".join(t for t in page_texts if t)

    def parse_docx(self, file_path: str) -> str:
        """
        Extract text from DOCX file.

        Non-blank body paragraphs come first, followed by the non-blank text
        of every table cell; entries are newline-joined.

        Args:
            file_path: Path to DOCX file

        Returns:
            Extracted text content

        Raises:
            ValueError: If the document cannot be opened or read.
        """
        try:
            doc = Document(file_path)

            text_content = [
                paragraph.text
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            ]

            # Also extract from tables
            text_content.extend(
                cell.text
                for table in doc.tables
                for row in table.rows
                for cell in row.cells
                if cell.text.strip()
            )

            return "\n".join(text_content)

        except Exception as e:
            logger.error(f"Error parsing DOCX: {e}")
            raise ValueError(f"Failed to parse DOCX: {str(e)}") from e

    def parse_file(self, file_path: str) -> Dict[str, Any]:
        """
        Parse resume file and extract text with metadata.

        Args:
            file_path: Path to resume file

        Returns:
            Dictionary with the keys ``raw_text`` (text as extracted),
            ``cleaned_text`` (output of ``clean_text``), ``file_name``,
            ``file_type`` (lower-cased suffix), ``metadata`` (output of
            ``extract_metadata``), and ``word_count`` / ``char_count``
            (both measured on the cleaned text).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the extension is unsupported or parsing fails.
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        extension = path.suffix.lower()

        if extension not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {extension}")

        # Extract text based on file type
        if extension == '.pdf':
            raw_text = self.parse_pdf(str(path))
        elif extension == '.docx':
            raw_text = self.parse_docx(str(path))
        elif extension == '.txt':
            # utf-8-sig drops a leading BOM if present; undecodable bytes are
            # replaced instead of aborting the whole parse.
            raw_text = path.read_text(encoding='utf-8-sig', errors='replace')
        else:
            # Only reachable when supported_formats lists an extension that
            # has no reader here.
            raise ValueError(f"Unsupported format: {extension}")

        cleaned_text = self.clean_text(raw_text)

        # Metadata comes from the raw text: cleaning deletes separator
        # characters without inserting a space, which can glue an email or
        # phone number to its neighbour and stop it from matching.
        metadata = self.extract_metadata(raw_text)

        return {
            "raw_text": raw_text,
            "cleaned_text": cleaned_text,
            "file_name": path.name,
            "file_type": extension,
            "metadata": metadata,
            "word_count": len(cleaned_text.split()),
            "char_count": len(cleaned_text)
        }

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text.

        Characters other than word characters, whitespace and the punctuation
        ``. , ; : - ( ) @ / + # & %`` are removed, then every run of
        whitespace (line breaks included) is collapsed to a single space, so
        the result is a single line.

        Args:
            text: Raw extracted text

        Returns:
            Cleaned text ("" for empty or None input)
        """
        if not text:
            return ""

        # Strip characters first and collapse afterwards; the other order
        # leaves a double space wherever a removed character stood alone.
        text = self._DISALLOWED_CHARS_RE.sub('', text)
        text = self._WHITESPACE_RE.sub(' ', text)

        return text.strip()

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """
        Extract basic metadata from resume text.

        Args:
            text: Resume text

        Returns:
            Dictionary with ``emails`` (list holding at most the first
            address), ``phones`` (list holding at most the first two numbers,
            as written in the text), and ``linkedin`` / ``github`` (first
            matching URL fragment, lower-cased, or None).
        """
        text = text or ""
        lowered = text.lower()

        linkedin = self._LINKEDIN_RE.search(lowered)
        github = self._GITHUB_RE.search(lowered)

        return {
            'emails': self._EMAIL_RE.findall(text)[:1],
            'phones': self._PHONE_RE.findall(text)[:2],
            'linkedin': linkedin.group(0) if linkedin else None,
            'github': github.group(0) if github else None,
        }

    def extract_sections(self, text: str,
                         min_section_chars: int = 50) -> Dict[str, str]:
        """
        Identify and extract common resume sections.

        A section starts at the first case-insensitive occurrence of one of
        its header keywords and ends at the nearest keyword of any other
        section found at least ``min_section_chars`` characters after that
        start (never inside the header itself), or at the end of the text.
        This is a keyword heuristic: sections may overlap, and a keyword used
        in running text can be taken for a header.

        Args:
            text: Resume text
            min_section_chars: Number of characters after a section's start
                in which other headers are ignored. The default of 50 keeps a
                combined header such as "Education and Certifications" from
                ending its own section, but lets a section shorter than that
                run into the next one; pass a smaller value for terse resumes.

        Returns:
            Dictionary mapping section names to content
        """
        sections = {}
        if not text:
            return sections

        skip = max(min_section_chars, 0)

        for section_name, pattern in self._SECTION_PATTERNS.items():
            header = pattern.search(text)
            if header is None:
                continue

            start_idx = header.start()
            search_from = max(header.end(), start_idx + skip)

            # Find next section or end of text. Matching runs on the original
            # string (IGNORECASE) so offsets are always valid for slicing it;
            # offsets taken from text.lower() are not, as lower() can change
            # the string's length.
            end_idx = len(text)
            for other_name, other_pattern in self._SECTION_PATTERNS.items():
                if other_name == section_name:
                    continue
                next_header = other_pattern.search(text, search_from)
                if next_header and next_header.start() < end_idx:
                    end_idx = next_header.start()

            sections[section_name] = text[start_idx:end_idx].strip()

        return sections