# AI Resume Analyzer — Gradio app (originally hosted as a Hugging Face Space)
# Robust AI Resume Analyzer (Gradio)
# Paste this into a Colab cell or run locally (see install notes below).
import os
import io
import re
import traceback

from PIL import Image, ImageFilter, ImageOps
import pytesseract
import docx
import PyPDF2
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tesseract binary location (default install path on Colab/Ubuntu).
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# optional: pdf2image fallback for scanned PDFs (install poppler + pdf2image to enable)
try:
    from pdf2image import convert_from_bytes
    PDF2IMAGE_AVAILABLE = True
except Exception:
    PDF2IMAGE_AVAILABLE = False

# NLTK stopwords: download only when the corpus is actually missing, instead
# of unconditionally on every startup (the original downloaded twice).
import nltk
from nltk.corpus import stopwords

try:
    STOPWORDS = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    STOPWORDS = set(stopwords.words("english"))

# Baseline skill keywords matched case-insensitively against resume text.
BASE_SKILLS = [
    "python", "machine learning", "data analysis", "pandas", "numpy", "nlp",
    "deep learning", "tensorflow", "pytorch", "scikit-learn", "sql", "aws",
    "docker", "git", "rest api", "computer vision", "opencv", "transformers",
]
# ---------------- Extraction ----------------
def _pdf_text(file_bytes):
    """Best-effort direct text extraction from PDF bytes; returns "" on failure."""
    text = ""
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "
    except Exception:
        text = ""
    return text


def _image_ocr(img):
    """Grayscale + median-denoise a PIL image, then OCR it with tesseract."""
    img = ImageOps.grayscale(img)
    img = img.filter(ImageFilter.MedianFilter())
    return pytesseract.image_to_string(img)


def extract_text_from_bytes(file_bytes, filename):
    """Extract plain text from an uploaded file's raw bytes.

    Dispatches on the filename extension:
      - .pdf: direct text extraction, with a render+OCR fallback for scanned
        PDFs when pdf2image/poppler is available
      - .docx/.doc: python-docx, falling back to a lossy UTF-8 decode
        (python-docx cannot read legacy binary .doc files)
      - images (.png/.jpg/.jpeg/.bmp/.tiff): OCR
      - .txt: UTF-8 decode (errors ignored)
      - anything else: probe as PDF, then image OCR, then raw decode

    Returns the stripped text, or "" when nothing could be extracted.
    """
    fname = (filename or "").lower()
    text = ""
    try:
        if fname.endswith(".pdf"):
            text = _pdf_text(file_bytes)
            # fallback: scanned PDF with no embedded text -> render pages and OCR
            if not text.strip() and PDF2IMAGE_AVAILABLE:
                try:
                    for pg in convert_from_bytes(file_bytes, dpi=200):
                        text += _image_ocr(pg) + " "
                except Exception:
                    pass
        elif fname.endswith((".docx", ".doc")):
            try:
                doc = docx.Document(io.BytesIO(file_bytes))
                text = "\n".join(p.text for p in doc.paragraphs)
            except Exception:
                # fallback to decoding bytes
                text = file_bytes.decode("utf-8", errors="ignore")
        elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
            text = _image_ocr(Image.open(io.BytesIO(file_bytes)).convert("RGB"))
        elif fname.endswith(".txt"):
            text = file_bytes.decode("utf-8", errors="ignore")
        else:
            # unknown extension: try PDF, then image OCR, then decode
            text = _pdf_text(file_bytes)
            if not text.strip():
                try:
                    text = _image_ocr(Image.open(io.BytesIO(file_bytes)).convert("RGB"))
                except Exception:
                    try:
                        text = file_bytes.decode("utf-8", errors="ignore")
                    except Exception:
                        text = ""
    except Exception as e:
        print("extract_text error:", e)
        return ""
    return text.strip()
| # ---------------- Clean & Skills ---------------- | |
def clean_text(text):
    """Lowercase *text*, blank out characters outside a-z, 0-9, whitespace,
    '-', '.', '@', and drop English stopwords. Returns the surviving tokens
    joined by single spaces."""
    lowered = (text or "").lower()
    sanitized = re.sub(r"[^a-z0-9\s\-\.\@]", " ", lowered)
    kept = (token for token in sanitized.split() if token not in STOPWORDS)
    return " ".join(kept)
def find_skills(text, custom_skills=None):
    """Return a sorted, de-duplicated list of skills found in *text*.

    Matches BASE_SKILLS plus any caller-supplied *custom_skills* as
    case-insensitive substrings of the text.

    Fix: the default was a mutable ``[]`` (shared across calls); ``None``
    is used instead, with identical behavior for all existing callers.
    """
    extras = [s.strip().lower() for s in (custom_skills or []) if s.strip()]
    skills = BASE_SKILLS + extras
    text_low = (text or "").lower()
    found = [s for s in skills if s in text_low]
    return sorted(dict.fromkeys(found))
def compute_similarity(resume_text, job_text):
    """Return the TF-IDF cosine similarity of the two texts as a 0-100 score.

    Either text being blank (after stripping) yields 0.0, as does any
    vectorizer failure (e.g. texts with no usable vocabulary).
    """
    if not job_text.strip() or not resume_text.strip():
        return 0.0
    documents = [resume_text, job_text]
    try:
        tfidf = TfidfVectorizer().fit_transform(documents)
        score = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        return float(score * 100)
    except Exception as e:
        print("compute_similarity error:", e)
        return 0.0
| # ---------------- Main function ---------------- | |
def analyze(file, job_description, custom_input):
    """Run the full resume-analysis pipeline for one uploaded file.

    Returns a 5-tuple matching the Gradio outputs:
    (text preview, cleaned text, detected skills, match score, suggestions).
    Any unexpected failure is reported in-band rather than raised.
    """
    def _load(upload):
        """Normalize the shapes Gradio may hand us into (filename, raw bytes),
        or None when the object type is unsupported."""
        # Gradio with type="file" usually passes a filepath string
        if isinstance(upload, str):
            with open(upload, "rb") as fh:
                return os.path.basename(upload), fh.read()
        # web mode / some frontends return dict-like objects
        if isinstance(upload, dict):
            name = upload.get("name") or upload.get("filename") or "uploaded_file"
            payload = upload.get("data") or upload.get("tmp_path")
            if isinstance(payload, str) and os.path.exists(payload):
                with open(payload, "rb") as fh:
                    return name, fh.read()
            if isinstance(payload, (bytes, bytearray)):
                return name, payload
            return name, b""
        # file-like object
        if hasattr(upload, "read"):
            return getattr(upload, "name", "uploaded_file"), upload.read()
        return None

    try:
        if not file:
            return "No file uploaded", "", "", 0.0, "Upload a file (PNG/JPG/PDF/DOCX/TXT)"
        loaded = _load(file)
        if loaded is None:
            return "Unsupported file object", "", "", 0.0, "Unsupported file object type"
        filename, file_bytes = loaded

        text = extract_text_from_bytes(file_bytes, filename)
        if not text:
            return "Could not extract text from file", "", "", 0.0, "Try a clearer image or a different file type"

        cleaned_resume = clean_text(text)
        cleaned_job = clean_text(job_description or "")
        custom_skills = [s.strip() for s in (custom_input or "").split(",") if s.strip()]
        skills_found = find_skills(text, custom_skills)
        # Only score against the job description when one was supplied.
        score = compute_similarity(cleaned_resume, cleaned_job) if cleaned_job else 0.0

        suggestions = f"Skills found: {', '.join(skills_found) if skills_found else 'None'}\nSimilarity score: {score:.2f}%"
        short_preview = text[:2000] + ("..." if len(text) > 2000 else "")
        return short_preview, cleaned_resume, ", ".join(skills_found), round(score, 2), suggestions
    except Exception as e:
        traceback.print_exc()
        return "Error during analysis", "", "", 0.0, str(e)
# ---------------- Gradio UI ----------------
# Two-column Blocks layout: inputs on the left, analysis results on the right.
with gr.Blocks() as demo:
    gr.Markdown("# ⚡ AI Resume Analyzer (Robust)")
    with gr.Row():
        with gr.Column(scale=2):
            # type="filepath" makes Gradio pass a path string to analyze()
            file_input = gr.File(label="Upload Resume (PNG/JPG/PDF/DOCX/TXT)", file_count="single", type="filepath")
            job_input = gr.Textbox(lines=4, label="Paste Job Description (optional)")
            custom_skills = gr.Textbox(lines=2, label="Custom Skills (comma separated, optional)")
            run_btn = gr.Button("Analyze Resume")
        with gr.Column(scale=3):
            output_preview = gr.Textbox(label="Extracted Text Preview")
            output_clean = gr.Textbox(label="Cleaned Text")
            output_skills = gr.Textbox(label="Detected Skills")
            output_score = gr.Number(label="Match Score (%)")
            output_suggest = gr.Textbox(label="Suggestions")
    # Wire the button: analyze(file, job, custom) -> the five outputs, in order.
    run_btn.click(fn=analyze, inputs=[file_input, job_input, custom_skills],
                  outputs=[output_preview, output_clean, output_skills, output_score, output_suggest])
# Script entry point: start the Gradio server.
if __name__ == "__main__":
    # If running in Colab, demo.launch(share=True) will give a public link
    # (share=True opens a temporary public tunnel URL).
    demo.launch(share=True)