"""Robust AI Resume Analyzer (Gradio).

Extracts text from an uploaded resume (PDF / DOCX / image / TXT), detects
skills against a configurable keyword list, and scores TF-IDF cosine
similarity against an optional job description.

Paste this into a Colab cell or run locally.  Requires: gradio, pytesseract,
python-docx, PyPDF2, scikit-learn, Pillow, nltk — and optionally
pdf2image + poppler to enable OCR of scanned PDFs.
"""

import io
import os
import re
import traceback

import docx
import gradio as gr
import nltk
import PyPDF2
import pytesseract
from nltk.corpus import stopwords
from PIL import Image, ImageFilter, ImageOps
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Colab's default tesseract binary location; adjust for other systems.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Optional: pdf2image fallback for scanned PDFs
# (install poppler + pdf2image to enable).
try:
    from pdf2image import convert_from_bytes
    PDF2IMAGE_AVAILABLE = True
except Exception:
    PDF2IMAGE_AVAILABLE = False

# NLTK stopwords — download lazily, only if the corpus is missing.
try:
    STOPWORDS = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    STOPWORDS = set(stopwords.words("english"))

BASE_SKILLS = [
    "python", "machine learning", "data analysis", "pandas", "numpy", "nlp",
    "deep learning", "tensorflow", "pytorch", "scikit-learn", "sql", "aws",
    "docker", "git", "rest api", "computer vision", "opencv", "transformers",
]


# ---------------- Extraction ----------------
def extract_text_from_bytes(file_bytes, filename):
    """Best-effort text extraction from raw upload bytes.

    Dispatches on the filename extension: PDF (PyPDF2, with an OCR fallback
    via pdf2image when available), DOCX (python-docx, falling back to a raw
    UTF-8 decode), common image formats (tesseract OCR after grayscale +
    median-filter preprocessing), and TXT.  Unknown extensions are tried as
    PDF, then image, then plain text.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        filename: Original filename; only its extension is used.

    Returns:
        Extracted text (stripped), or "" when nothing could be extracted.
        Never raises — all failures are swallowed into the empty-string path.
    """
    fname = (filename or "").lower()
    text = ""
    try:
        if fname.endswith(".pdf"):
            # First try direct PDF text extraction.
            try:
                reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + " "
            except Exception:
                text = ""
            # Fallback: scanned PDF — render pages and OCR them.
            if not text.strip() and PDF2IMAGE_AVAILABLE:
                try:
                    pages = convert_from_bytes(file_bytes, dpi=200)
                    for pg in pages:
                        # Grayscale + median filter improves OCR on noisy scans.
                        pg = pg.convert("L").filter(ImageFilter.MedianFilter())
                        text += pytesseract.image_to_string(pg) + " "
                except Exception:
                    pass
        elif fname.endswith(".docx") or fname.endswith(".doc"):
            try:
                # NOTE: python-docx only handles .docx; legacy .doc falls
                # through to the raw-decode fallback below.
                doc = docx.Document(io.BytesIO(file_bytes))
                text = "\n".join([p.text for p in doc.paragraphs])
            except Exception:
                # Fallback: decode bytes directly.
                text = file_bytes.decode("utf-8", errors="ignore")
        elif any(fname.endswith(ext) for ext in
                 [".png", ".jpg", ".jpeg", ".bmp", ".tiff"]):
            img = Image.open(io.BytesIO(file_bytes)).convert("RGB")
            img = ImageOps.grayscale(img)
            img = img.filter(ImageFilter.MedianFilter())
            text = pytesseract.image_to_string(img)
        elif fname.endswith(".txt"):
            text = file_bytes.decode("utf-8", errors="ignore")
        else:
            # Unknown extension: try PDF, then image OCR, then decode.
            try:
                reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + " "
            except Exception:
                pass
            if not text.strip():
                try:
                    img = Image.open(io.BytesIO(file_bytes)).convert("RGB")
                    img = ImageOps.grayscale(img)
                    text = pytesseract.image_to_string(img)
                except Exception:
                    try:
                        text = file_bytes.decode("utf-8", errors="ignore")
                    except Exception:
                        text = ""
    except Exception as e:
        print("extract_text error:", e)
        return ""
    return text.strip()


# ---------------- Clean & Skills ----------------
def clean_text(text):
    """Lowercase, strip punctuation (keeping - . @), and drop stopwords."""
    text = (text or "").lower()
    text = re.sub(r"[^a-z0-9\s\-\.\@]", " ", text)
    tokens = [w for w in text.split() if w not in STOPWORDS]
    return " ".join(tokens)


def find_skills(text, custom_skills=None):
    """Return the sorted, de-duplicated skills found in *text*.

    Matching is plain case-insensitive substring search, so short skills
    (e.g. "sql") may also match inside longer words (e.g. "nosql").

    Args:
        text: Raw resume text to search.
        custom_skills: Optional extra skill strings to check in addition
            to BASE_SKILLS.  (Default is None rather than a mutable [].)
    """
    custom_skills = custom_skills or []
    skills = BASE_SKILLS + [s.strip().lower() for s in custom_skills if s.strip()]
    text_low = (text or "").lower()
    found = [s for s in skills if s in text_low]
    # dict.fromkeys de-duplicates while preserving insertion order.
    return sorted(list(dict.fromkeys(found)))


def compute_similarity(resume_text, job_text):
    """TF-IDF cosine similarity between resume and job text, as 0–100.

    Returns 0.0 when either text is empty or vectorization fails (e.g.
    both documents reduce to an empty vocabulary).
    """
    if not job_text.strip() or not resume_text.strip():
        return 0.0
    corpus = [resume_text, job_text]
    try:
        vec = TfidfVectorizer().fit_transform(corpus)
        sim = cosine_similarity(vec[0:1], vec[1:2])[0][0]
        return float(sim * 100)
    except Exception as e:
        print("compute_similarity error:", e)
        return 0.0


# ---------------- Main function ----------------
def _upload_to_bytes(file):
    """Resolve a Gradio upload object to (filename, bytes).

    Handles the three shapes different Gradio versions/frontends pass:
    a filepath string, a dict with 'data'/'tmp_path', or a file-like
    object with .read().  Returns (None, None) for unsupported objects.
    """
    if isinstance(file, str):
        # Gradio with type="filepath" passes a path string.
        with open(file, "rb") as f:
            return os.path.basename(file), f.read()
    if isinstance(file, dict):
        # Web mode / some frontends return dict-like objects.
        filename = file.get("name") or file.get("filename") or "uploaded_file"
        data = file.get("data") or file.get("tmp_path")
        if isinstance(data, str) and os.path.exists(data):
            with open(data, "rb") as f:
                return filename, f.read()
        if isinstance(data, (bytes, bytearray)):
            return filename, data
        return filename, b""
    if hasattr(file, "read"):
        return getattr(file, "name", "uploaded_file"), file.read()
    return None, None


def analyze(file, job_description, custom_input):
    """Gradio callback: extract, clean, skill-match, and score a resume.

    Args:
        file: Upload from gr.File (path string, dict, or file-like object).
        job_description: Optional job-description text to score against.
        custom_input: Optional comma-separated extra skills.

    Returns:
        Tuple of (text preview, cleaned resume text, comma-joined skills,
        match score %, suggestions/status message).  Errors are reported
        through the same tuple rather than raised, so the UI stays up.
    """
    try:
        if not file:
            return ("No file uploaded", "", "", 0.0,
                    "Upload a file (PNG/JPG/PDF/DOCX/TXT)")

        filename, file_bytes = _upload_to_bytes(file)
        if filename is None:
            return ("Unsupported file object", "", "", 0.0,
                    "Unsupported file object type")

        text = extract_text_from_bytes(file_bytes, filename)
        if not text:
            return ("Could not extract text from file", "", "", 0.0,
                    "Try a clearer image or a different file type")

        cleaned_resume = clean_text(text)
        cleaned_job = clean_text(job_description or "")
        custom_skills = [s.strip() for s in (custom_input or "").split(",")
                         if s.strip()]
        skills_found = find_skills(text, custom_skills)
        score = compute_similarity(cleaned_resume, cleaned_job) if cleaned_job else 0.0

        suggestions = (
            f"Skills found: {', '.join(skills_found) if skills_found else 'None'}\n"
            f"Similarity score: {score:.2f}%"
        )
        short_preview = text[:2000] + ("..." if len(text) > 2000 else "")
        return (short_preview, cleaned_resume, ", ".join(skills_found),
                round(score, 2), suggestions)
    except Exception as e:
        traceback.print_exc()
        return "Error during analysis", "", "", 0.0, str(e)


# ---------------- Gradio UI ----------------
with gr.Blocks() as demo:
    gr.Markdown("# ⚡ AI Resume Analyzer (Robust)")
    with gr.Row():
        with gr.Column(scale=2):
            file_input = gr.File(
                label="Upload Resume (PNG/JPG/PDF/DOCX/TXT)",
                file_count="single",
                type="filepath",
            )
            job_input = gr.Textbox(lines=4, label="Paste Job Description (optional)")
            custom_skills = gr.Textbox(
                lines=2, label="Custom Skills (comma separated, optional)"
            )
            run_btn = gr.Button("Analyze Resume")
        with gr.Column(scale=3):
            output_preview = gr.Textbox(label="Extracted Text Preview")
            output_clean = gr.Textbox(label="Cleaned Text")
            output_skills = gr.Textbox(label="Detected Skills")
            output_score = gr.Number(label="Match Score (%)")
            output_suggest = gr.Textbox(label="Suggestions")

    run_btn.click(
        fn=analyze,
        inputs=[file_input, job_input, custom_skills],
        outputs=[output_preview, output_clean, output_skills,
                 output_score, output_suggest],
    )

if __name__ == "__main__":
    # If running in Colab, demo.launch(share=True) will give a public link.
    demo.launch(share=True)