# AI Resume Analyzer — Gradio app (originally hosted as a Hugging Face Space)
# Robust AI Resume Analyzer (Gradio)
# Paste this into a Colab cell or run locally (see install notes below).
import os
import io
import re
import traceback

from PIL import Image, ImageFilter, ImageOps
import pytesseract
import docx
import PyPDF2
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tesseract binary location (default install path on Colab/Ubuntu).
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# optional: pdf2image fallback for scanned PDFs (install poppler + pdf2image to enable)
try:
    from pdf2image import convert_from_bytes
    PDF2IMAGE_AVAILABLE = True
except Exception:
    PDF2IMAGE_AVAILABLE = False

# NLTK stopwords: download only when the corpus is actually missing, instead
# of unconditionally on every startup (the original downloaded twice).
import nltk
from nltk.corpus import stopwords

try:
    STOPWORDS = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    STOPWORDS = set(stopwords.words("english"))

# Baseline skill keywords matched case-insensitively against resume text.
BASE_SKILLS = [
    "python", "machine learning", "data analysis", "pandas", "numpy", "nlp",
    "deep learning", "tensorflow", "pytorch", "scikit-learn", "sql", "aws",
    "docker", "git", "rest api", "computer vision", "opencv", "transformers",
]
# ---------------- Extraction ----------------
def _pdf_text(file_bytes):
    """Best-effort direct text extraction from PDF bytes; returns "" on failure."""
    text = ""
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "
    except Exception:
        text = ""
    return text


def _image_ocr(img):
    """Grayscale + median-denoise a PIL image, then OCR it with tesseract."""
    img = ImageOps.grayscale(img)
    img = img.filter(ImageFilter.MedianFilter())
    return pytesseract.image_to_string(img)


def extract_text_from_bytes(file_bytes, filename):
    """Extract plain text from an uploaded file's raw bytes.

    Dispatches on the filename extension:
      - .pdf: direct text extraction, with a render+OCR fallback for scanned
        PDFs when pdf2image/poppler is available
      - .docx/.doc: python-docx, falling back to a lossy UTF-8 decode
        (python-docx cannot read legacy binary .doc files)
      - images (.png/.jpg/.jpeg/.bmp/.tiff): OCR
      - .txt: UTF-8 decode (errors ignored)
      - anything else: probe as PDF, then image OCR, then raw decode

    Returns the stripped text, or "" when nothing could be extracted.
    """
    fname = (filename or "").lower()
    text = ""
    try:
        if fname.endswith(".pdf"):
            text = _pdf_text(file_bytes)
            # fallback: scanned PDF with no embedded text -> render pages and OCR
            if not text.strip() and PDF2IMAGE_AVAILABLE:
                try:
                    for pg in convert_from_bytes(file_bytes, dpi=200):
                        text += _image_ocr(pg) + " "
                except Exception:
                    pass
        elif fname.endswith((".docx", ".doc")):
            try:
                doc = docx.Document(io.BytesIO(file_bytes))
                text = "\n".join(p.text for p in doc.paragraphs)
            except Exception:
                # fallback to decoding bytes
                text = file_bytes.decode("utf-8", errors="ignore")
        elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
            text = _image_ocr(Image.open(io.BytesIO(file_bytes)).convert("RGB"))
        elif fname.endswith(".txt"):
            text = file_bytes.decode("utf-8", errors="ignore")
        else:
            # unknown extension: try PDF, then image OCR, then decode
            text = _pdf_text(file_bytes)
            if not text.strip():
                try:
                    text = _image_ocr(Image.open(io.BytesIO(file_bytes)).convert("RGB"))
                except Exception:
                    try:
                        text = file_bytes.decode("utf-8", errors="ignore")
                    except Exception:
                        text = ""
    except Exception as e:
        print("extract_text error:", e)
        return ""
    return text.strip()
| # ---------------- Clean & Skills ---------------- | |
def clean_text(text):
    """Lowercase *text*, blank out characters outside a-z, 0-9, whitespace,
    '-', '.', '@', and drop English stopwords. Returns the surviving tokens
    joined by single spaces."""
    lowered = (text or "").lower()
    sanitized = re.sub(r"[^a-z0-9\s\-\.\@]", " ", lowered)
    kept = (token for token in sanitized.split() if token not in STOPWORDS)
    return " ".join(kept)
def find_skills(text, custom_skills=None):
    """Return a sorted, de-duplicated list of skills found in *text*.

    Matches BASE_SKILLS plus any caller-supplied *custom_skills* as
    case-insensitive substrings of the text.

    Fix: the default was a mutable ``[]`` (shared across calls); ``None``
    is used instead, with identical behavior for all existing callers.
    """
    extras = [s.strip().lower() for s in (custom_skills or []) if s.strip()]
    skills = BASE_SKILLS + extras
    text_low = (text or "").lower()
    found = [s for s in skills if s in text_low]
    return sorted(dict.fromkeys(found))
def compute_similarity(resume_text, job_text):
    """Return the TF-IDF cosine similarity of the two texts as a 0-100 score.

    Either text being blank (after stripping) yields 0.0, as does any
    vectorizer failure (e.g. texts with no usable vocabulary).
    """
    if not job_text.strip() or not resume_text.strip():
        return 0.0
    documents = [resume_text, job_text]
    try:
        tfidf = TfidfVectorizer().fit_transform(documents)
        score = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        return float(score * 100)
    except Exception as e:
        print("compute_similarity error:", e)
        return 0.0
| # ---------------- Main function ---------------- | |
def analyze(file, job_description, custom_input):
    """Run the full resume-analysis pipeline for one uploaded file.

    Returns a 5-tuple matching the Gradio outputs:
    (text preview, cleaned text, detected skills, match score, suggestions).
    Any unexpected failure is reported in-band rather than raised.
    """
    def _load(upload):
        """Normalize the shapes Gradio may hand us into (filename, raw bytes),
        or None when the object type is unsupported."""
        # Gradio with type="file" usually passes a filepath string
        if isinstance(upload, str):
            with open(upload, "rb") as fh:
                return os.path.basename(upload), fh.read()
        # web mode / some frontends return dict-like objects
        if isinstance(upload, dict):
            name = upload.get("name") or upload.get("filename") or "uploaded_file"
            payload = upload.get("data") or upload.get("tmp_path")
            if isinstance(payload, str) and os.path.exists(payload):
                with open(payload, "rb") as fh:
                    return name, fh.read()
            if isinstance(payload, (bytes, bytearray)):
                return name, payload
            return name, b""
        # file-like object
        if hasattr(upload, "read"):
            return getattr(upload, "name", "uploaded_file"), upload.read()
        return None

    try:
        if not file:
            return "No file uploaded", "", "", 0.0, "Upload a file (PNG/JPG/PDF/DOCX/TXT)"
        loaded = _load(file)
        if loaded is None:
            return "Unsupported file object", "", "", 0.0, "Unsupported file object type"
        filename, file_bytes = loaded

        text = extract_text_from_bytes(file_bytes, filename)
        if not text:
            return "Could not extract text from file", "", "", 0.0, "Try a clearer image or a different file type"

        cleaned_resume = clean_text(text)
        cleaned_job = clean_text(job_description or "")
        custom_skills = [s.strip() for s in (custom_input or "").split(",") if s.strip()]
        skills_found = find_skills(text, custom_skills)
        # Only score against the job description when one was supplied.
        score = compute_similarity(cleaned_resume, cleaned_job) if cleaned_job else 0.0

        suggestions = f"Skills found: {', '.join(skills_found) if skills_found else 'None'}\nSimilarity score: {score:.2f}%"
        short_preview = text[:2000] + ("..." if len(text) > 2000 else "")
        return short_preview, cleaned_resume, ", ".join(skills_found), round(score, 2), suggestions
    except Exception as e:
        traceback.print_exc()
        return "Error during analysis", "", "", 0.0, str(e)
# ---------------- Gradio UI ----------------
# Two-column Blocks layout: inputs on the left, analysis results on the right.
with gr.Blocks() as demo:
    gr.Markdown("# ⚡ AI Resume Analyzer (Robust)")
    with gr.Row():
        with gr.Column(scale=2):
            # type="filepath" makes Gradio pass a path string to analyze()
            file_input = gr.File(label="Upload Resume (PNG/JPG/PDF/DOCX/TXT)", file_count="single", type="filepath")
            job_input = gr.Textbox(lines=4, label="Paste Job Description (optional)")
            custom_skills = gr.Textbox(lines=2, label="Custom Skills (comma separated, optional)")
            run_btn = gr.Button("Analyze Resume")
        with gr.Column(scale=3):
            output_preview = gr.Textbox(label="Extracted Text Preview")
            output_clean = gr.Textbox(label="Cleaned Text")
            output_skills = gr.Textbox(label="Detected Skills")
            output_score = gr.Number(label="Match Score (%)")
            output_suggest = gr.Textbox(label="Suggestions")
    # Wire the button: analyze(file, job, custom) -> the five outputs, in order.
    run_btn.click(fn=analyze, inputs=[file_input, job_input, custom_skills],
                  outputs=[output_preview, output_clean, output_skills, output_score, output_suggest])
# Script entry point: start the Gradio server.
if __name__ == "__main__":
    # If running in Colab, demo.launch(share=True) will give a public link
    # (share=True opens a temporary public tunnel URL).
    demo.launch(share=True)