import gradio as gr import torch from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline import PyPDF2 import docx import io import re import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import nltk from collections import Counter import warnings import time import json warnings.filterwarnings("ignore") # Download required NLTK data try: nltk.data.find('tokenizers/punkt') except LookupError: try: nltk.download('punkt') except: nltk.download('punkt_tab') try: nltk.data.find('tokenizers/punkt_tab') except LookupError: nltk.download('punkt_tab') try: nltk.data.find('corpora/stopwords') except LookupError: nltk.download('stopwords') from nltk.corpus import stopwords from nltk.tokenize import word_tokenize, sent_tokenize class ModernATSAnalyzer: def __init__(self): self.progress_callback = None self.llm_pipeline = None self.embedding_model = None self.update_progress("🚀 Initializing AI models...", 5) # Initialize embedding model for semantic analysis try: from sentence_transformers import SentenceTransformer # Use latest 2025 optimized model for better understanding self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2') self.update_progress("✅ Embedding model loaded", 15) except Exception as e: self.update_progress(f"❌ Embedding model failed: {str(e)}", 15) # Initialize LLM for intelligent analysis (using 2025 small models) try: # Try to load a small but capable 2025 model model_options = [ "microsoft/DialoGPT-small", # Fallback option "HuggingFaceTB/SmolLM2-135M", # 2025 efficient model "Qwen/Qwen2.5-0.5B" # 2025 small but powerful ] for model_name in model_options: try: self.llm_pipeline = pipeline( "text-generation", model=model_name, tokenizer=model_name, device=-1, # CPU max_length=512, do_sample=True, temperature=0.7, pad_token_id=50256 ) self.update_progress(f"✅ LLM loaded: {model_name}", 25) break except: continue if not self.llm_pipeline: self.update_progress("⚠️ Using rule-based analysis (LLM unavailable)", 25) except Exception as e: self.update_progress(f"⚠️ LLM initialization failed, using backup methods", 25) self.stop_words = set(stopwords.words('english')) self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000) self.update_progress("🎯 System ready for analysis!", 30) def set_progress_callback(self, callback): self.progress_callback = callback def update_progress(self, message, progress): if self.progress_callback: self.progress_callback(message, progress) time.sleep(0.05) def extract_text_from_pdf(self, file_path): """Extract text from PDF file""" try: with open(file_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) text = "" for page in pdf_reader.pages: text += page.extract_text() + "\n" return text except Exception as e: return f"Error reading PDF: {str(e)}" def extract_text_from_docx(self, file_path): """Extract text from DOCX file""" try: doc = docx.Document(file_path) text = "" for paragraph in doc.paragraphs: text += paragraph.text + "\n" return text except Exception as e: return f"Error reading DOCX: {str(e)}" def clean_text(self, text): """Clean and normalize text""" text = re.sub(r'\s+', ' ', text) text = re.sub(r'[^\w\s.,()-]', ' ', text) return text.strip() def extract_dynamic_keywords(self, text, top_n=30): """Dynamically extract important keywords using NLP techniques""" # Clean text clean_text = self.clean_text(text.lower()) # Tokenize and filter words = word_tokenize(clean_text) words = [word for word in words if ( word.isalpha() and len(word) > 2 and word not in self.stop_words )] # Get word frequencies word_freq = Counter(words) # Extract phrases (bigrams and trigrams) sentences = sent_tokenize(text) phrases = [] for sentence in sentences: sentence_words = word_tokenize(sentence.lower()) sentence_words = [w for w in sentence_words if w.isalpha()] # Bigrams for i in range(len(sentence_words) - 1): bigram = f"{sentence_words[i]} {sentence_words[i+1]}" if len(bigram) > 6: # Avoid very short phrases phrases.append(bigram) # Trigrams for technical terms for i in range(len(sentence_words) - 2): trigram = f"{sentence_words[i]} {sentence_words[i+1]} {sentence_words[i+2]}" if len(trigram) > 10: phrases.append(trigram) phrase_freq = Counter(phrases) # Combine words and phrases keywords = [] # Add top words for word, freq in word_freq.most_common(top_n//2): keywords.append((word, freq, 'word')) # Add top phrases for phrase, freq in phrase_freq.most_common(top_n//2): if freq >= 2: # Only include phrases that appear multiple times keywords.append((phrase, freq, 'phrase')) return keywords def analyze_with_llm(self, resume_text, job_text): """Use LLM for intelligent analysis""" if not self.llm_pipeline: return self.fallback_analysis(resume_text, job_text) try: prompt = f"""Analyze this resume against the job description and provide a compatibility score out of 100. Job Description: {job_text[:500]}... Resume: {resume_text[:500]}... Provide analysis in this format: Score: [0-100] Skills Match: [description] Experience Match: [description] Key Gaps: [description] """ response = self.llm_pipeline(prompt, max_new_tokens=200, num_return_sequences=1) analysis_text = response[0]['generated_text'].split(prompt)[-1].strip() # Parse the response score_match = re.search(r'Score:\s*(\d+)', analysis_text) score = int(score_match.group(1)) if score_match else 50 return { 'overall_score': min(100, max(0, score)), 'analysis_text': analysis_text, 'method': 'LLM' } except Exception as e: return self.fallback_analysis(resume_text, job_text) def fallback_analysis(self, resume_text, job_text): """Sophisticated rule-based analysis as fallback""" # Extract keywords from both texts resume_keywords = self.extract_dynamic_keywords(resume_text) job_keywords = self.extract_dynamic_keywords(job_text) # Create keyword sets for comparison resume_terms = set([kw[0] for kw in resume_keywords]) job_terms = set([kw[0] for kw in job_keywords]) # Calculate various similarity metrics # 1. Keyword overlap overlap = len(resume_terms.intersection(job_terms)) keyword_score = (overlap / len(job_terms)) * 100 if job_terms else 0 # 2. TF-IDF Similarity try: tfidf_matrix = self.tfidf_vectorizer.fit_transform([resume_text, job_text]) tfidf_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] * 100 except: tfidf_similarity = 0 # 3. Semantic similarity using embeddings semantic_score = 0 if self.embedding_model: try: resume_embedding = self.embedding_model.encode(resume_text[:512]) job_embedding = self.embedding_model.encode(job_text[:512]) semantic_score = cosine_similarity([resume_embedding], [job_embedding])[0][0] * 100 except: semantic_score = 0 # 4. Structure and length analysis structure_score = self.analyze_resume_structure(resume_text) # Weighted combination overall_score = ( keyword_score * 0.3 + tfidf_similarity * 0.25 + semantic_score * 0.25 + structure_score * 0.2 ) return { 'overall_score': min(100, max(0, overall_score)), 'keyword_score': keyword_score, 'tfidf_score': tfidf_similarity, 'semantic_score': semantic_score, 'structure_score': structure_score, 'resume_keywords': resume_keywords[:10], 'job_keywords': job_keywords[:10], 'common_keywords': list(resume_terms.intersection(job_terms))[:10], 'method': 'Advanced Rule-based' } def analyze_resume_structure(self, resume_text): """Analyze resume structure and formatting""" score = 100 # Check for essential sections sections = { 'contact': r'(email|phone|@|linkedin|github)', 'experience': r'(experience|work|employment|career|job)', 'education': r'(education|degree|university|college|school)', 'skills': r'(skills|technical|technologies|competencies|tools)' } sections_found = 0 for section, pattern in sections.items(): if re.search(pattern, resume_text, re.IGNORECASE): sections_found += 1 # Penalize missing sections section_penalty = (4 - sections_found) * 15 score -= section_penalty # Check word count word_count = len(resume_text.split()) if word_count < 150: score -= 30 elif word_count > 1200: score -= 10 # Check for bullet points or structure if '•' in resume_text or '-' in resume_text or '*' in resume_text: score += 5 # Check for years/dates (experience indicators) years_pattern = r'(20\d{2}|19\d{2})' if re.search(years_pattern, resume_text): score += 10 return max(0, min(100, score)) def generate_intelligent_suggestions(self, analysis_result): """Generate intelligent suggestions based on analysis""" suggestions = [] if analysis_result['method'] == 'LLM' and 'analysis_text' in analysis_result: # Extract suggestions from LLM response if 'Key Gaps:' in analysis_result['analysis_text']: gaps = analysis_result['analysis_text'].split('Key Gaps:')[-1].strip() suggestions.append(f"🎯 **Key Areas to Improve**: {gaps}") # Add rule-based suggestions score = analysis_result['overall_score'] if score < 40: suggestions.append("🚨 **Critical**: Your resume needs major optimization. Consider professional resume writing services.") elif score < 60: suggestions.append("⚠️ **Moderate Compatibility**: Your resume shows potential but needs significant keyword optimization.") elif score < 80: suggestions.append("👍 **Good Foundation**: You're on the right track. Focus on fine-tuning keywords and formatting.") else: suggestions.append("✅ **Excellent**: Your resume shows strong compatibility with this job!") # Specific suggestions based on analysis components if 'keyword_score' in analysis_result and analysis_result['keyword_score'] < 40: suggestions.append("🔑 **Keywords**: Incorporate more relevant keywords from the job description naturally into your resume content.") if 'structure_score' in analysis_result and analysis_result['structure_score'] < 70: suggestions.append("📋 **Structure**: Improve resume formatting with clear sections: Contact, Experience, Education, Skills.") if 'semantic_score' in analysis_result and analysis_result['semantic_score'] < 50: suggestions.append("🎨 **Content Alignment**: Rewrite your experience descriptions to better match the job's language and requirements.") # Add common ATS tips suggestions.append("💡 **ATS Tips**: Use standard fonts, avoid images/graphics, save as PDF, and use keywords in context rather than just listing them.") return suggestions def process_resume_analysis(self, resume_file, job_description, progress=gr.Progress()): """Main analysis function""" try: def update_progress_ui(message, prog): progress(prog/100, desc=message) self.set_progress_callback(update_progress_ui) # Validation if resume_file is None: return "❌ Please upload a resume file.", "", "", "" if not job_description or len(job_description.strip()) < 50: return "❌ Please provide a detailed job description (at least 50 characters).", "", "", "" self.update_progress("📄 Extracting text from resume...", 35) # Extract resume text filename = str(resume_file).lower() if filename.endswith('.pdf'): resume_text = self.extract_text_from_pdf(resume_file) elif filename.endswith('.docx'): resume_text = self.extract_text_from_docx(resume_file) else: return f"❌ Unsupported file format. Please upload PDF or DOCX files.", "", "", "" if "Error reading" in resume_text: return resume_text, "", "", "" if len(resume_text.strip()) < 100: return "❌ Resume text is too short or couldn't be extracted. Please ensure your file contains readable text.", "", "", "" self.update_progress("🧠 Analyzing with AI...", 50) # Perform AI analysis analysis_result = self.analyze_with_llm(resume_text, job_description) self.update_progress("💡 Generating suggestions...", 80) # Generate suggestions suggestions = self.generate_intelligent_suggestions(analysis_result) self.update_progress("✅ Analysis complete!", 100) # Format results score = analysis_result['overall_score'] if score >= 85: emoji = "🟢" status = "Excellent Match" elif score >= 70: emoji = "🟡" status = "Good Compatibility" elif score >= 50: emoji = "🟠" status = "Moderate Match" else: emoji = "🔴" status = "Needs Improvement" score_text = f"# 🎯 ATS Compatibility Score: {score:.0f}/100\n\n{emoji} **{status}**" # Detailed breakdown details = f"""## 📊 Analysis Breakdown **Analysis Method**: {analysis_result['method']} **Overall Score**: {score:.1f}/100 """ if 'keyword_score' in analysis_result: details += f""" **Keyword Match**: {analysis_result['keyword_score']:.1f}/100 **Content Similarity**: {analysis_result.get('tfidf_score', 0):.1f}/100 **Semantic Match**: {analysis_result.get('semantic_score', 0):.1f}/100 **Structure Quality**: {analysis_result.get('structure_score', 0):.1f}/100 """ suggestions_text = "## 💡 Improvement Recommendations\n\n" + "\n\n".join(suggestions) # Keywords analysis keywords_text = "## 🔍 Keyword Analysis\n\n" if 'resume_keywords' in analysis_result: resume_kw = [kw[0] for kw in analysis_result['resume_keywords']] job_kw = [kw[0] for kw in analysis_result['job_keywords']] common_kw = analysis_result.get('common_keywords', []) keywords_text += f"""**Resume Keywords**: {', '.join(resume_kw)} **Job Keywords**: {', '.join(job_kw)} **Matching Keywords**: {', '.join(common_kw) if common_kw else 'Limited overlap detected'} **Recommendation**: Focus on incorporating more job-specific keywords naturally into your resume content. """ else: keywords_text += "**Dynamic keyword extraction completed.** The analysis considered context and semantic meaning rather than simple keyword matching." return score_text, details, suggestions_text, keywords_text except Exception as e: return f"❌ Analysis error: {str(e)}\n\nPlease try again or contact support.", "", "", "" # Initialize analyzer analyzer = ModernATSAnalyzer() def create_interface(): with gr.Blocks(title="Modern ATS Analyzer 2025", theme=gr.themes.Soft()) as interface: gr.HTML("""
Powered by Latest AI Models | Dynamic Keyword Extraction | Intelligent Analysis
No predefined keywords - Real ATS-like analysis using 2025 AI technology
Optimized for CPU inference • 2025 Model Architecture • Enterprise-grade Analysis