# DefendModel / app.py — Hugging Face Space by Mangesh223 (commit 84550c0, 4.73 kB)
# Resume analyzer: heuristic scoring plus Mistral-7B-Instruct feedback, served with Gradio.
import gradio as gr
from transformers import pipeline, BitsAndBytesConfig
import PyPDF2
import io
import re
import json
import os
import gc
import torch
from huggingface_hub import login
from dotenv import load_dotenv
# --- Configuration --- #
# Load environment variables from a local .env file (no-op if the file is
# absent), then authenticate with the Hugging Face Hub so the gated Mistral
# model below can be downloaded.
# NOTE(review): assumes HF_TOKEN is set in the environment/.env — if it is
# unset, login(token=None) falls back to cached/interactive credentials;
# confirm the Space has the secret configured.
load_dotenv()
login(token=os.getenv("HF_TOKEN"))
# Quantization config for memory efficiency: NF4 4-bit weights with fp16
# compute keeps the 7B model inside a small GPU's memory budget.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# Initialize the generation pipeline with memory optimization.
# BUGFIX: `quantization_config` is not a `pipeline()` argument — it must be
# forwarded to the underlying model through `model_kwargs`, otherwise the
# 4-bit config is not applied and the model loads at full precision.
analyzer = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    device_map="auto",
    torch_dtype=torch.float16,
    model_kwargs={"quantization_config": quant_config},
)
# Skills kept as a set for O(1) membership tests.
GENERAL_SKILLS = {
    'communication', 'problem solving', 'project management',
    'python', 'sql', 'excel', 'teamwork'
}

# Precompiled regex patterns (compiled once at import time, reused per call).
# Date ranges such as "2015 - 2018" or "2019 – Present" (hyphen or en dash).
YEAR_PATTERN = re.compile(r'\d{4}\s*[-–]\s*(?:Present|\d{4})')
# Quantified achievements, e.g. "increased by 30%" or "saved by $5000".
ACHIEVEMENT_PATTERN = re.compile(r'(increased|reduced|saved|improved)\s+by\s+(\d+%|\$\d+)', re.I)
# Wording issues / misspellings used as a crude clarity penalty.
TYPO_PATTERN = re.compile(r'\b(?:responsibilities|accomplishment|experiance)\b', re.I)

# --- Core Functions --- #
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF supplied as raw bytes.

    BUGFIX: the previous version read only ``pages[0]``, silently dropping
    every page after the first, and crashed when ``extract_text()`` returned
    ``None`` (image-only pages). Now all pages are read until the 10k-char
    cap is reached.

    Args:
        pdf_file: PDF file content as bytes.

    Returns:
        Extracted text, truncated to 10,000 characters to bound memory and
        prompt size.
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
        parts = []
        total = 0
        for page in reader.pages:
            # extract_text() may return None for pages without a text layer.
            page_text = page.extract_text() or ""
            parts.append(page_text)
            total += len(page_text)
            if total >= 10000:  # stop early once the cap is reached
                break
        return "".join(parts)[:10000]
    finally:
        gc.collect()  # release PyPDF2's intermediate objects promptly

def calculate_scores(resume_text, job_desc=None):
    """Heuristically score a resume across several categories.

    Args:
        resume_text: Plain text of the resume.
        job_desc: Optional job description. When provided, relevance is the
            word-overlap ratio between the two texts (max 20); otherwise it
            counts GENERAL_SKILLS mentions (max 10).

    Returns:
        Tuple ``(scores, total)``: the per-category score dict and the sum
        capped at 100. Note: ``skills_match`` and ``customization`` are
        placeholders that are always 0 in this version.
    """
    resume_lower = resume_text.lower()
    scores = {
        "relevance_to_job": 0,
        "experience_quality": 0,
        "skills_match": 0,
        "education": 0,
        "achievements": 0,
        # Start at 10, lose one point per flagged wording issue (at most 8).
        "clarity": 10 - min(8, len(TYPO_PATTERN.findall(resume_text))),
        "customization": 0
    }

    # Relevance: overlap with the job description when one is supplied.
    if job_desc:
        job_words = set(re.findall(r'\w+', job_desc.lower()))
        # BUGFIX: guard against ZeroDivisionError when the job description
        # is truthy but contains no word characters (e.g. "!!!").
        if job_words:
            resume_words = set(re.findall(r'\w+', resume_lower))
            scores["relevance_to_job"] = min(20, int(20 * len(job_words & resume_words) / len(job_words)))
    else:
        scores["relevance_to_job"] = min(10, sum(1 for skill in GENERAL_SKILLS if skill in resume_lower))

    # Experience: up to 10 pts for dated entries plus up to 10 for
    # quantified achievements (2 pts each).
    scores["experience_quality"] = min(10, len(YEAR_PATTERN.findall(resume_text)))
    scores["experience_quality"] += min(10, len(ACHIEVEMENT_PATTERN.findall(resume_text)) * 2)

    # Education: highest credential found wins (checked top-down).
    if 'phd' in resume_lower or 'doctorate' in resume_lower:
        scores["education"] = 8
    elif 'master' in resume_lower or 'msc' in resume_lower or 'mba' in resume_lower:
        scores["education"] = 6
    elif 'bachelor' in resume_lower or ' bs ' in resume_lower or ' ba ' in resume_lower:
        scores["education"] = 4
    elif 'high school' in resume_lower:
        scores["education"] = 2

    return scores, min(100, sum(scores.values()))
def analyze_resume(pdf_file, job_desc=None):
    """Full pipeline: extract PDF text, score heuristically, ask the LLM for feedback.

    Args:
        pdf_file: Raw PDF bytes (Gradio File component with type="binary").
        job_desc: Optional job description string.

    Returns:
        dict with "score" (total + per-category breakdown), "analysis"
        (LLM output parsed as JSON) and a truncated "raw_text" preview;
        or {"error": ...} when extraction/generation/parsing fails.
    """
    resume_text = extract_text_from_pdf(pdf_file)
    scores, total_score = calculate_scores(resume_text, job_desc)

    # BUGFIX: the resume text (and job description) must be part of the
    # prompt — previously the model was asked to analyze a resume it
    # never saw, so its output could only be a hallucination.
    prompt = f"""Analyze this resume and return JSON with:
2 key strengths (reference these scores: {scores}),
3 specific improvements,
2 missing skills (if job description provided).
Return ONLY valid JSON without markdown.

Resume:
{resume_text}

Job description:
{job_desc or 'N/A'}

JSON:"""
    try:
        result = analyzer(
            prompt,
            max_new_tokens=300,
            do_sample=False,  # more deterministic output
            # BUGFIX: text-generation pipelines echo the prompt inside
            # generated_text by default, which made json.loads fail on
            # every call; return only the completion.
            return_full_text=False
        )[0]["generated_text"]
        return {
            "score": {"total": total_score, "breakdown": scores},
            "analysis": json.loads(result),
            "raw_text": resume_text[:500]  # truncated for memory
        }
    except Exception as e:
        # Surface the error in the UI instead of crashing the Gradio worker.
        return {"error": str(e)}
# --- Gradio Interface --- #
# Layout: left column holds the inputs (PDF upload + optional job
# description), right column shows the JSON result. Analysis is triggered
# by the file-upload event rather than a submit button, so the job
# description should be filled in before uploading the PDF.
with gr.Blocks(theme=gr.themes.Soft(title_size="sm")) as demo:
    gr.Markdown("""
    # 🎯 Resume Analyzer
    *Optimized for memory efficiency*
    """)
    with gr.Row():
        with gr.Column(scale=1):
            inputs = [
                # type="binary" delivers raw bytes, matching
                # extract_text_from_pdf's expectation.
                gr.File(label="PDF Resume", type="binary"),
                gr.Textbox(label="Job Description (Optional)", lines=3)
            ]
            # NOTE(review): assumes a sample.pdf exists alongside app.py —
            # the example will 404 otherwise; confirm the file is committed.
            gr.Examples(
                examples=[["sample.pdf", "Data Scientist with Python experience"]],
                inputs=inputs
            )
        with gr.Column(scale=2):
            output = gr.JSON(label="Analysis")
    # Run analyze_resume on upload; queue=True serializes requests so only
    # one inference runs at a time on the shared model.
    inputs[0].upload(
        fn=analyze_resume,
        inputs=inputs,
        outputs=output,
        queue=True # Process one at a time
    )
demo.launch(max_threads=1) # Further reduce memory pressure