"""Robust AI Resume Analyzer (Gradio).

Extracts text from an uploaded resume (PDF / DOCX / image / TXT), detects
skills against a configurable keyword list, and scores TF-IDF cosine
similarity against an optional job description.

Paste this into a Colab cell or run locally.  Requires: gradio, pytesseract,
python-docx, PyPDF2, scikit-learn, Pillow, nltk — and optionally
pdf2image + poppler to enable OCR of scanned PDFs.
"""

import io
import os
import re
import traceback

import docx
import gradio as gr
import nltk
import PyPDF2
import pytesseract
from nltk.corpus import stopwords
from PIL import Image, ImageFilter, ImageOps
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Colab's default tesseract binary location; adjust for other systems.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Optional: pdf2image fallback for scanned PDFs
# (install poppler + pdf2image to enable).
try:
    from pdf2image import convert_from_bytes
    PDF2IMAGE_AVAILABLE = True
except Exception:
    PDF2IMAGE_AVAILABLE = False

# NLTK stopwords — download lazily, only if the corpus is missing.
try:
    STOPWORDS = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    STOPWORDS = set(stopwords.words("english"))

BASE_SKILLS = [
    "python", "machine learning", "data analysis", "pandas", "numpy", "nlp",
    "deep learning", "tensorflow", "pytorch", "scikit-learn", "sql", "aws",
    "docker", "git", "rest api", "computer vision", "opencv", "transformers",
]


# ---------------- Extraction ----------------
def extract_text_from_bytes(file_bytes, filename):
    """Best-effort text extraction from raw upload bytes.

    Dispatches on the filename extension: PDF (PyPDF2, with an OCR fallback
    via pdf2image when available), DOCX (python-docx, falling back to a raw
    UTF-8 decode), common image formats (tesseract OCR after grayscale +
    median-filter preprocessing), and TXT.  Unknown extensions are tried as
    PDF, then image, then plain text.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        filename: Original filename; only its extension is used.

    Returns:
        Extracted text (stripped), or "" when nothing could be extracted.
        Never raises — all failures are swallowed into the empty-string path.
    """
    fname = (filename or "").lower()
    text = ""
    try:
        if fname.endswith(".pdf"):
            # First try direct PDF text extraction.
            try:
                reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + " "
            except Exception:
                text = ""
            # Fallback: scanned PDF — render pages and OCR them.
            if not text.strip() and PDF2IMAGE_AVAILABLE:
                try:
                    pages = convert_from_bytes(file_bytes, dpi=200)
                    for pg in pages:
                        # Grayscale + median filter improves OCR on noisy scans.
                        pg = pg.convert("L").filter(ImageFilter.MedianFilter())
                        text += pytesseract.image_to_string(pg) + " "
                except Exception:
                    pass
        elif fname.endswith(".docx") or fname.endswith(".doc"):
            try:
                # NOTE: python-docx only handles .docx; legacy .doc falls
                # through to the raw-decode fallback below.
                doc = docx.Document(io.BytesIO(file_bytes))
                text = "\n".join([p.text for p in doc.paragraphs])
            except Exception:
                # Fallback: decode bytes directly.
                text = file_bytes.decode("utf-8", errors="ignore")
        elif any(fname.endswith(ext) for ext in
                 [".png", ".jpg", ".jpeg", ".bmp", ".tiff"]):
            img = Image.open(io.BytesIO(file_bytes)).convert("RGB")
            img = ImageOps.grayscale(img)
            img = img.filter(ImageFilter.MedianFilter())
            text = pytesseract.image_to_string(img)
        elif fname.endswith(".txt"):
            text = file_bytes.decode("utf-8", errors="ignore")
        else:
            # Unknown extension: try PDF, then image OCR, then decode.
            try:
                reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + " "
            except Exception:
                pass
            if not text.strip():
                try:
                    img = Image.open(io.BytesIO(file_bytes)).convert("RGB")
                    img = ImageOps.grayscale(img)
                    text = pytesseract.image_to_string(img)
                except Exception:
                    try:
                        text = file_bytes.decode("utf-8", errors="ignore")
                    except Exception:
                        text = ""
    except Exception as e:
        print("extract_text error:", e)
        return ""
    return text.strip()


# ---------------- Clean & Skills ----------------
def clean_text(text):
    """Lowercase, strip punctuation (keeping - . @), and drop stopwords."""
    text = (text or "").lower()
    text = re.sub(r"[^a-z0-9\s\-\.\@]", " ", text)
    tokens = [w for w in text.split() if w not in STOPWORDS]
    return " ".join(tokens)


def find_skills(text, custom_skills=None):
    """Return the sorted, de-duplicated skills found in *text*.

    Matching is plain case-insensitive substring search, so short skills
    (e.g. "sql") may also match inside longer words (e.g. "nosql").

    Args:
        text: Raw resume text to search.
        custom_skills: Optional extra skill strings to check in addition
            to BASE_SKILLS.  (Default is None rather than a mutable [].)
    """
    custom_skills = custom_skills or []
    skills = BASE_SKILLS + [s.strip().lower() for s in custom_skills if s.strip()]
    text_low = (text or "").lower()
    found = [s for s in skills if s in text_low]
    # dict.fromkeys de-duplicates while preserving insertion order.
    return sorted(list(dict.fromkeys(found)))


def compute_similarity(resume_text, job_text):
    """TF-IDF cosine similarity between resume and job text, as 0–100.

    Returns 0.0 when either text is empty or vectorization fails (e.g.
    both documents reduce to an empty vocabulary).
    """
    if not job_text.strip() or not resume_text.strip():
        return 0.0
    corpus = [resume_text, job_text]
    try:
        vec = TfidfVectorizer().fit_transform(corpus)
        sim = cosine_similarity(vec[0:1], vec[1:2])[0][0]
        return float(sim * 100)
    except Exception as e:
        print("compute_similarity error:", e)
        return 0.0


# ---------------- Main function ----------------
def _upload_to_bytes(file):
    """Resolve a Gradio upload object to (filename, bytes).

    Handles the three shapes different Gradio versions/frontends pass:
    a filepath string, a dict with 'data'/'tmp_path', or a file-like
    object with .read().  Returns (None, None) for unsupported objects.
    """
    if isinstance(file, str):
        # Gradio with type="filepath" passes a path string.
        with open(file, "rb") as f:
            return os.path.basename(file), f.read()
    if isinstance(file, dict):
        # Web mode / some frontends return dict-like objects.
        filename = file.get("name") or file.get("filename") or "uploaded_file"
        data = file.get("data") or file.get("tmp_path")
        if isinstance(data, str) and os.path.exists(data):
            with open(data, "rb") as f:
                return filename, f.read()
        if isinstance(data, (bytes, bytearray)):
            return filename, data
        return filename, b""
    if hasattr(file, "read"):
        return getattr(file, "name", "uploaded_file"), file.read()
    return None, None


def analyze(file, job_description, custom_input):
    """Gradio callback: extract, clean, skill-match, and score a resume.

    Args:
        file: Upload from gr.File (path string, dict, or file-like object).
        job_description: Optional job-description text to score against.
        custom_input: Optional comma-separated extra skills.

    Returns:
        Tuple of (text preview, cleaned resume text, comma-joined skills,
        match score %, suggestions/status message).  Errors are reported
        through the same tuple rather than raised, so the UI stays up.
    """
    try:
        if not file:
            return ("No file uploaded", "", "", 0.0,
                    "Upload a file (PNG/JPG/PDF/DOCX/TXT)")

        filename, file_bytes = _upload_to_bytes(file)
        if filename is None:
            return ("Unsupported file object", "", "", 0.0,
                    "Unsupported file object type")

        text = extract_text_from_bytes(file_bytes, filename)
        if not text:
            return ("Could not extract text from file", "", "", 0.0,
                    "Try a clearer image or a different file type")

        cleaned_resume = clean_text(text)
        cleaned_job = clean_text(job_description or "")
        custom_skills = [s.strip() for s in (custom_input or "").split(",")
                         if s.strip()]
        skills_found = find_skills(text, custom_skills)
        score = compute_similarity(cleaned_resume, cleaned_job) if cleaned_job else 0.0

        suggestions = (
            f"Skills found: {', '.join(skills_found) if skills_found else 'None'}\n"
            f"Similarity score: {score:.2f}%"
        )
        short_preview = text[:2000] + ("..." if len(text) > 2000 else "")
        return (short_preview, cleaned_resume, ", ".join(skills_found),
                round(score, 2), suggestions)
    except Exception as e:
        traceback.print_exc()
        return "Error during analysis", "", "", 0.0, str(e)


# ---------------- Gradio UI ----------------
with gr.Blocks() as demo:
    gr.Markdown("# ⚡ AI Resume Analyzer (Robust)")
    with gr.Row():
        with gr.Column(scale=2):
            file_input = gr.File(
                label="Upload Resume (PNG/JPG/PDF/DOCX/TXT)",
                file_count="single",
                type="filepath",
            )
            job_input = gr.Textbox(lines=4, label="Paste Job Description (optional)")
            custom_skills = gr.Textbox(
                lines=2, label="Custom Skills (comma separated, optional)"
            )
            run_btn = gr.Button("Analyze Resume")
        with gr.Column(scale=3):
            output_preview = gr.Textbox(label="Extracted Text Preview")
            output_clean = gr.Textbox(label="Cleaned Text")
            output_skills = gr.Textbox(label="Detected Skills")
            output_score = gr.Number(label="Match Score (%)")
            output_suggest = gr.Textbox(label="Suggestions")

    run_btn.click(
        fn=analyze,
        inputs=[file_input, job_input, custom_skills],
        outputs=[output_preview, output_clean, output_skills,
                 output_score, output_suggest],
    )

if __name__ == "__main__":
    # If running in Colab, demo.launch(share=True) will give a public link.
    demo.launch(share=True)