# DefendModel / app.py — Hugging Face Space by Mangesh223 (commit 84550c0, 4.73 kB)
# Resume analyzer: heuristic scoring plus Mistral-7B-Instruct feedback, served with Gradio.
import gradio as gr
from transformers import pipeline, BitsAndBytesConfig
import PyPDF2
import io
import re
import json
import os
import gc
import torch
from huggingface_hub import login
from dotenv import load_dotenv
# --- Configuration --- #
# Load environment variables from a local .env file (no-op if the file is
# absent), then authenticate with the Hugging Face Hub so the gated Mistral
# model below can be downloaded.
# NOTE(review): assumes HF_TOKEN is set in the environment/.env — if it is
# unset, login(token=None) falls back to cached/interactive credentials;
# confirm the Space has the secret configured.
load_dotenv()
login(token=os.getenv("HF_TOKEN"))
# Quantization config for memory efficiency: NF4 4-bit weights with fp16
# compute keeps the 7B model inside a small GPU's memory budget.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# Initialize the generation pipeline with memory optimization.
# BUGFIX: `quantization_config` is not a `pipeline()` argument — it must be
# forwarded to the underlying model through `model_kwargs`, otherwise the
# 4-bit config is not applied and the model loads at full precision.
analyzer = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    device_map="auto",
    torch_dtype=torch.float16,
    model_kwargs={"quantization_config": quant_config},
)
# Skills kept as a set for O(1) membership tests.
GENERAL_SKILLS = {
    'communication', 'problem solving', 'project management',
    'python', 'sql', 'excel', 'teamwork'
}

# Precompiled regex patterns (compiled once at import time, reused per call).
# Date ranges such as "2015 - 2018" or "2019 – Present" (hyphen or en dash).
YEAR_PATTERN = re.compile(r'\d{4}\s*[-–]\s*(?:Present|\d{4})')
# Quantified achievements, e.g. "increased by 30%" or "saved by $5000".
ACHIEVEMENT_PATTERN = re.compile(r'(increased|reduced|saved|improved)\s+by\s+(\d+%|\$\d+)', re.I)
# Wording issues / misspellings used as a crude clarity penalty.
TYPO_PATTERN = re.compile(r'\b(?:responsibilities|accomplishment|experiance)\b', re.I)

# --- Core Functions --- #
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF supplied as raw bytes.

    BUGFIX: the previous version read only ``pages[0]``, silently dropping
    every page after the first, and crashed when ``extract_text()`` returned
    ``None`` (image-only pages). Now all pages are read until the 10k-char
    cap is reached.

    Args:
        pdf_file: PDF file content as bytes.

    Returns:
        Extracted text, truncated to 10,000 characters to bound memory and
        prompt size.
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
        parts = []
        total = 0
        for page in reader.pages:
            # extract_text() may return None for pages without a text layer.
            page_text = page.extract_text() or ""
            parts.append(page_text)
            total += len(page_text)
            if total >= 10000:  # stop early once the cap is reached
                break
        return "".join(parts)[:10000]
    finally:
        gc.collect()  # release PyPDF2's intermediate objects promptly

def calculate_scores(resume_text, job_desc=None):
    """Heuristically score a resume across several categories.

    Args:
        resume_text: Plain text of the resume.
        job_desc: Optional job description. When provided, relevance is the
            word-overlap ratio between the two texts (max 20); otherwise it
            counts GENERAL_SKILLS mentions (max 10).

    Returns:
        Tuple ``(scores, total)``: the per-category score dict and the sum
        capped at 100. Note: ``skills_match`` and ``customization`` are
        placeholders that are always 0 in this version.
    """
    resume_lower = resume_text.lower()
    scores = {
        "relevance_to_job": 0,
        "experience_quality": 0,
        "skills_match": 0,
        "education": 0,
        "achievements": 0,
        # Start at 10, lose one point per flagged wording issue (at most 8).
        "clarity": 10 - min(8, len(TYPO_PATTERN.findall(resume_text))),
        "customization": 0
    }

    # Relevance: overlap with the job description when one is supplied.
    if job_desc:
        job_words = set(re.findall(r'\w+', job_desc.lower()))
        # BUGFIX: guard against ZeroDivisionError when the job description
        # is truthy but contains no word characters (e.g. "!!!").
        if job_words:
            resume_words = set(re.findall(r'\w+', resume_lower))
            scores["relevance_to_job"] = min(20, int(20 * len(job_words & resume_words) / len(job_words)))
    else:
        scores["relevance_to_job"] = min(10, sum(1 for skill in GENERAL_SKILLS if skill in resume_lower))

    # Experience: up to 10 pts for dated entries plus up to 10 for
    # quantified achievements (2 pts each).
    scores["experience_quality"] = min(10, len(YEAR_PATTERN.findall(resume_text)))
    scores["experience_quality"] += min(10, len(ACHIEVEMENT_PATTERN.findall(resume_text)) * 2)

    # Education: highest credential found wins (checked top-down).
    if 'phd' in resume_lower or 'doctorate' in resume_lower:
        scores["education"] = 8
    elif 'master' in resume_lower or 'msc' in resume_lower or 'mba' in resume_lower:
        scores["education"] = 6
    elif 'bachelor' in resume_lower or ' bs ' in resume_lower or ' ba ' in resume_lower:
        scores["education"] = 4
    elif 'high school' in resume_lower:
        scores["education"] = 2

    return scores, min(100, sum(scores.values()))
def analyze_resume(pdf_file, job_desc=None):
    """Full pipeline: extract PDF text, score heuristically, ask the LLM for feedback.

    Args:
        pdf_file: Raw PDF bytes (Gradio File component with type="binary").
        job_desc: Optional job description string.

    Returns:
        dict with "score" (total + per-category breakdown), "analysis"
        (LLM output parsed as JSON) and a truncated "raw_text" preview;
        or {"error": ...} when extraction/generation/parsing fails.
    """
    resume_text = extract_text_from_pdf(pdf_file)
    scores, total_score = calculate_scores(resume_text, job_desc)

    # BUGFIX: the resume text (and job description) must be part of the
    # prompt — previously the model was asked to analyze a resume it
    # never saw, so its output could only be a hallucination.
    prompt = f"""Analyze this resume and return JSON with:
2 key strengths (reference these scores: {scores}),
3 specific improvements,
2 missing skills (if job description provided).
Return ONLY valid JSON without markdown.

Resume:
{resume_text}

Job description:
{job_desc or 'N/A'}

JSON:"""
    try:
        result = analyzer(
            prompt,
            max_new_tokens=300,
            do_sample=False,  # more deterministic output
            # BUGFIX: text-generation pipelines echo the prompt inside
            # generated_text by default, which made json.loads fail on
            # every call; return only the completion.
            return_full_text=False
        )[0]["generated_text"]
        return {
            "score": {"total": total_score, "breakdown": scores},
            "analysis": json.loads(result),
            "raw_text": resume_text[:500]  # truncated for memory
        }
    except Exception as e:
        # Surface the error in the UI instead of crashing the Gradio worker.
        return {"error": str(e)}
# --- Gradio Interface --- #
# Layout: left column holds the inputs (PDF upload + optional job
# description), right column shows the JSON result. Analysis is triggered
# by the file-upload event rather than a submit button, so the job
# description should be filled in before uploading the PDF.
with gr.Blocks(theme=gr.themes.Soft(title_size="sm")) as demo:
    gr.Markdown("""
    # 🎯 Resume Analyzer
    *Optimized for memory efficiency*
    """)
    with gr.Row():
        with gr.Column(scale=1):
            inputs = [
                # type="binary" delivers raw bytes, matching
                # extract_text_from_pdf's expectation.
                gr.File(label="PDF Resume", type="binary"),
                gr.Textbox(label="Job Description (Optional)", lines=3)
            ]
            # NOTE(review): assumes a sample.pdf exists alongside app.py —
            # the example will 404 otherwise; confirm the file is committed.
            gr.Examples(
                examples=[["sample.pdf", "Data Scientist with Python experience"]],
                inputs=inputs
            )
        with gr.Column(scale=2):
            output = gr.JSON(label="Analysis")
    # Run analyze_resume on upload; queue=True serializes requests so only
    # one inference runs at a time on the shared model.
    inputs[0].upload(
        fn=analyze_resume,
        inputs=inputs,
        outputs=output,
        queue=True # Process one at a time
    )
demo.launch(max_threads=1) # Further reduce memory pressure