# Hugging Face Space: riteshkokam/ResAI (status: Sleeping; file size: 22,857 bytes)

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import PyPDF2
import docx
import io
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from collections import Counter
import warnings
import time
import json
warnings.filterwarnings("ignore")

# Download required NLTK data.
# Each entry pairs the lookup path checked by nltk.data.find() with the
# package name handed to nltk.download() when that lookup fails.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
)

for _resource_path, _package in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # Best effort: a failed download of one resource must not prevent the
        # remaining resources from being fetched.
        try:
            nltk.download(_package)
        except Exception as exc:
            print(f"NLTK download failed for {_package}: {exc}")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

class ModernATSAnalyzer:
    def __init__(self):
        """Load the models and helpers used by the analyzer.

        Sets up, in order: the sentence-embedding model, a text-generation
        pipeline (first candidate that loads), the English stop-word set and a
        TF-IDF vectorizer. A model that fails to load leaves the matching
        attribute as ``None``; the analysis methods check for that.
        """
        self.progress_callback = None
        self.llm_pipeline = None
        self.embedding_model = None

        self.update_progress("🚀 Initializing AI models...", 5)

        # Initialize embedding model for semantic analysis
        try:
            # Imported here so a missing sentence_transformers package only
            # disables semantic scoring instead of breaking the module import.
            from sentence_transformers import SentenceTransformer
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            self.update_progress("✅ Embedding model loaded", 15)
        except Exception as e:
            self.update_progress(f"❌ Embedding model failed: {str(e)}", 15)

        # Initialize LLM for intelligent analysis.
        # Candidates are tried in list order and the first one that loads wins.
        # NOTE(review): the entry labelled "Fallback option" is tried FIRST, so
        # the other two are only reached when it fails to load - confirm this
        # ordering is intended.
        model_options = [
            "microsoft/DialoGPT-small",  # Fallback option
            "HuggingFaceTB/SmolLM2-135M",  # 2025 efficient model
            "Qwen/Qwen2.5-0.5B"  # 2025 small but powerful
        ]

        for model_name in model_options:
            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                # Take the padding id from the tokenizer that was actually
                # loaded; a fixed 50256 is only meaningful for GPT-2 style
                # vocabularies. Fall back to EOS when no pad token is defined.
                pad_token_id = tokenizer.pad_token_id
                if pad_token_id is None:
                    pad_token_id = tokenizer.eos_token_id

                self.llm_pipeline = pipeline(
                    "text-generation",
                    model=model_name,
                    tokenizer=tokenizer,
                    device=-1,  # CPU
                    max_length=512,
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=pad_token_id
                )
                self.update_progress(f"✅ LLM loaded: {model_name}", 25)
                break
            except Exception:
                # This candidate could not be loaded; try the next one.
                continue

        if not self.llm_pipeline:
            self.update_progress("⚠️ Using rule-based analysis (LLM unavailable)", 25)

        self.stop_words = set(stopwords.words('english'))
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

        self.update_progress("🎯 System ready for analysis!", 30)
    
    def set_progress_callback(self, callback) -> None:
        """Register the callable that receives progress updates.

        Args:
            callback: Stored on ``self.progress_callback``. ``update_progress``
                invokes it as ``callback(message, progress)``; a falsy value
                (such as ``None``) disables reporting.
        """
        self.progress_callback = callback
    
    def update_progress(self, message, progress):
        if self.progress_callback:
            self.progress_callback(message, progress)
        time.sleep(0.05)
    
    def extract_text_from_pdf(self, file_path):
        """Extract text from PDF file"""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
            return text
        except Exception as e:
            return f"Error reading PDF: {str(e)}"
    
    def extract_text_from_docx(self, file_path):
        """Extract text from DOCX file"""
        try:
            doc = docx.Document(file_path)
            text = ""
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
            return text
        except Exception as e:
            return f"Error reading DOCX: {str(e)}"
    
    def clean_text(self, text):
        """Clean and normalize text"""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,()-]', ' ', text)
        return text.strip()
    
    def extract_dynamic_keywords(self, text, top_n=30):
        """Rank the most frequent single words and repeated phrases in a text.

        Args:
            text: The text to analyze.
            top_n: Budget of entries; half goes to words, half to phrases.

        Returns:
            A list of ``(term, frequency, kind)`` tuples, words first
            (``kind == 'word'``) followed by bigrams/trigrams that occur at
            least twice (``kind == 'phrase'``).
        """
        half = top_n // 2

        # Single words: lowercase, cleaned, alphabetic, longer than two
        # characters and not a stop word.
        normalized = self.clean_text(text.lower())
        word_counts = Counter(
            token
            for token in word_tokenize(normalized)
            if token.isalpha() and len(token) > 2 and token not in self.stop_words
        )

        # Phrases: per sentence, all bigrams first and then all trigrams, built
        # from the alphabetic tokens only (stop words are kept here).
        collected = []
        for sentence in sent_tokenize(text):
            alpha = [tok for tok in word_tokenize(sentence.lower()) if tok.isalpha()]

            for first, second in zip(alpha, alpha[1:]):
                pair = f"{first} {second}"
                if len(pair) > 6:  # Avoid very short phrases
                    collected.append(pair)

            for first, second, third in zip(alpha, alpha[1:], alpha[2:]):
                triple = f"{first} {second} {third}"
                if len(triple) > 10:
                    collected.append(triple)

        phrase_counts = Counter(collected)

        ranked = [(term, count, 'word') for term, count in word_counts.most_common(half)]
        # Only phrases that appear multiple times are kept.
        ranked.extend(
            (term, count, 'phrase')
            for term, count in phrase_counts.most_common(half)
            if count >= 2
        )
        return ranked
    
    def analyze_with_llm(self, resume_text, job_text):
        """Use LLM for intelligent analysis"""
        if not self.llm_pipeline:
            return self.fallback_analysis(resume_text, job_text)
        
        try:
            prompt = f"""Analyze this resume against the job description and provide a compatibility score out of 100.

Job Description:
{job_text[:500]}...

Resume:
{resume_text[:500]}...

Provide analysis in this format:
Score: [0-100]
Skills Match: [description]
Experience Match: [description]
Key Gaps: [description]
"""
            
            response = self.llm_pipeline(prompt, max_new_tokens=200, num_return_sequences=1)
            analysis_text = response[0]['generated_text'].split(prompt)[-1].strip()
            
            # Parse the response
            score_match = re.search(r'Score:\s*(\d+)', analysis_text)
            score = int(score_match.group(1)) if score_match else 50
            
            return {
                'overall_score': min(100, max(0, score)),
                'analysis_text': analysis_text,
                'method': 'LLM'
            }
            
        except Exception as e:
            return self.fallback_analysis(resume_text, job_text)
    
    def fallback_analysis(self, resume_text, job_text):
        """Sophisticated rule-based analysis as fallback"""
        # Extract keywords from both texts
        resume_keywords = self.extract_dynamic_keywords(resume_text)
        job_keywords = self.extract_dynamic_keywords(job_text)
        
        # Create keyword sets for comparison
        resume_terms = set([kw[0] for kw in resume_keywords])
        job_terms = set([kw[0] for kw in job_keywords])
        
        # Calculate various similarity metrics
        
        # 1. Keyword overlap
        overlap = len(resume_terms.intersection(job_terms))
        keyword_score = (overlap / len(job_terms)) * 100 if job_terms else 0
        
        # 2. TF-IDF Similarity
        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform([resume_text, job_text])
            tfidf_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] * 100
        except:
            tfidf_similarity = 0
        
        # 3. Semantic similarity using embeddings
        semantic_score = 0
        if self.embedding_model:
            try:
                resume_embedding = self.embedding_model.encode(resume_text[:512])
                job_embedding = self.embedding_model.encode(job_text[:512])
                semantic_score = cosine_similarity([resume_embedding], [job_embedding])[0][0] * 100
            except:
                semantic_score = 0
        
        # 4. Structure and length analysis
        structure_score = self.analyze_resume_structure(resume_text)
        
        # Weighted combination
        overall_score = (
            keyword_score * 0.3 +
            tfidf_similarity * 0.25 +
            semantic_score * 0.25 +
            structure_score * 0.2
        )
        
        return {
            'overall_score': min(100, max(0, overall_score)),
            'keyword_score': keyword_score,
            'tfidf_score': tfidf_similarity,
            'semantic_score': semantic_score,
            'structure_score': structure_score,
            'resume_keywords': resume_keywords[:10],
            'job_keywords': job_keywords[:10],
            'common_keywords': list(resume_terms.intersection(job_terms))[:10],
            'method': 'Advanced Rule-based'
        }
    
    def analyze_resume_structure(self, resume_text):
        """Analyze resume structure and formatting"""
        score = 100
        
        # Check for essential sections
        sections = {
            'contact': r'(email|phone|@|linkedin|github)',
            'experience': r'(experience|work|employment|career|job)',
            'education': r'(education|degree|university|college|school)',
            'skills': r'(skills|technical|technologies|competencies|tools)'
        }
        
        sections_found = 0
        for section, pattern in sections.items():
            if re.search(pattern, resume_text, re.IGNORECASE):
                sections_found += 1
        
        # Penalize missing sections
        section_penalty = (4 - sections_found) * 15
        score -= section_penalty
        
        # Check word count
        word_count = len(resume_text.split())
        if word_count < 150:
            score -= 30
        elif word_count > 1200:
            score -= 10
        
        # Check for bullet points or structure
        if '•' in resume_text or '-' in resume_text or '*' in resume_text:
            score += 5
        
        # Check for years/dates (experience indicators)
        years_pattern = r'(20\d{2}|19\d{2})'
        if re.search(years_pattern, resume_text):
            score += 10
        
        return max(0, min(100, score))
    
    def generate_intelligent_suggestions(self, analysis_result):
        """Generate intelligent suggestions based on analysis"""
        suggestions = []
        
        if analysis_result['method'] == 'LLM' and 'analysis_text' in analysis_result:
            # Extract suggestions from LLM response
            if 'Key Gaps:' in analysis_result['analysis_text']:
                gaps = analysis_result['analysis_text'].split('Key Gaps:')[-1].strip()
                suggestions.append(f"🎯 **Key Areas to Improve**: {gaps}")
        
        # Add rule-based suggestions
        score = analysis_result['overall_score']
        
        if score < 40:
            suggestions.append("🚨 **Critical**: Your resume needs major optimization. Consider professional resume writing services.")
        elif score < 60:
            suggestions.append("⚠️ **Moderate Compatibility**: Your resume shows potential but needs significant keyword optimization.")
        elif score < 80:
            suggestions.append("👍 **Good Foundation**: You're on the right track. Focus on fine-tuning keywords and formatting.")
        else:
            suggestions.append("✅ **Excellent**: Your resume shows strong compatibility with this job!")
        
        # Specific suggestions based on analysis components
        if 'keyword_score' in analysis_result and analysis_result['keyword_score'] < 40:
            suggestions.append("🔑 **Keywords**: Incorporate more relevant keywords from the job description naturally into your resume content.")
        
        if 'structure_score' in analysis_result and analysis_result['structure_score'] < 70:
            suggestions.append("📋 **Structure**: Improve resume formatting with clear sections: Contact, Experience, Education, Skills.")
        
        if 'semantic_score' in analysis_result and analysis_result['semantic_score'] < 50:
            suggestions.append("🎨 **Content Alignment**: Rewrite your experience descriptions to better match the job's language and requirements.")
        
        # Add common ATS tips
        suggestions.append("💡 **ATS Tips**: Use standard fonts, avoid images/graphics, save as PDF, and use keywords in context rather than just listing them.")
        
        return suggestions
    
    def process_resume_analysis(self, resume_file, job_description, progress=gr.Progress()):
        """Main analysis function"""
        try:
            def update_progress_ui(message, prog):
                progress(prog/100, desc=message)
            
            self.set_progress_callback(update_progress_ui)
            
            # Validation
            if resume_file is None:
                return "❌ Please upload a resume file.", "", "", ""
            
            if not job_description or len(job_description.strip()) < 50:
                return "❌ Please provide a detailed job description (at least 50 characters).", "", "", ""
            
            self.update_progress("📄 Extracting text from resume...", 35)
            
            # Extract resume text
            filename = str(resume_file).lower()
            
            if filename.endswith('.pdf'):
                resume_text = self.extract_text_from_pdf(resume_file)
            elif filename.endswith('.docx'):
                resume_text = self.extract_text_from_docx(resume_file)
            else:
                return f"❌ Unsupported file format. Please upload PDF or DOCX files.", "", "", ""
            
            if "Error reading" in resume_text:
                return resume_text, "", "", ""
            
            if len(resume_text.strip()) < 100:
                return "❌ Resume text is too short or couldn't be extracted. Please ensure your file contains readable text.", "", "", ""
            
            self.update_progress("🧠 Analyzing with AI...", 50)
            
            # Perform AI analysis
            analysis_result = self.analyze_with_llm(resume_text, job_description)
            
            self.update_progress("💡 Generating suggestions...", 80)
            
            # Generate suggestions
            suggestions = self.generate_intelligent_suggestions(analysis_result)
            
            self.update_progress("✅ Analysis complete!", 100)
            
            # Format results
            score = analysis_result['overall_score']
            
            if score >= 85:
                emoji = "🟢"
                status = "Excellent Match"
            elif score >= 70:
                emoji = "🟡"
                status = "Good Compatibility"
            elif score >= 50:
                emoji = "🟠"
                status = "Moderate Match"
            else:
                emoji = "🔴"
                status = "Needs Improvement"
            
            score_text = f"# 🎯 ATS Compatibility Score: {score:.0f}/100\n\n{emoji} **{status}**"
            
            # Detailed breakdown
            details = f"""## 📊 Analysis Breakdown

**Analysis Method**: {analysis_result['method']}
**Overall Score**: {score:.1f}/100
"""
            
            if 'keyword_score' in analysis_result:
                details += f"""
**Keyword Match**: {analysis_result['keyword_score']:.1f}/100
**Content Similarity**: {analysis_result.get('tfidf_score', 0):.1f}/100
**Semantic Match**: {analysis_result.get('semantic_score', 0):.1f}/100
**Structure Quality**: {analysis_result.get('structure_score', 0):.1f}/100
"""
            
            suggestions_text = "## 💡 Improvement Recommendations\n\n" + "\n\n".join(suggestions)
            
            # Keywords analysis
            keywords_text = "## 🔍 Keyword Analysis\n\n"
            
            if 'resume_keywords' in analysis_result:
                resume_kw = [kw[0] for kw in analysis_result['resume_keywords']]
                job_kw = [kw[0] for kw in analysis_result['job_keywords']]
                common_kw = analysis_result.get('common_keywords', [])
                
                keywords_text += f"""**Resume Keywords**: {', '.join(resume_kw)}

**Job Keywords**: {', '.join(job_kw)}

**Matching Keywords**: {', '.join(common_kw) if common_kw else 'Limited overlap detected'}

**Recommendation**: Focus on incorporating more job-specific keywords naturally into your resume content.
"""
            else:
                keywords_text += "**Dynamic keyword extraction completed.** The analysis considered context and semantic meaning rather than simple keyword matching."
            
            return score_text, details, suggestions_text, keywords_text
            
        except Exception as e:
            return f"❌ Analysis error: {str(e)}\n\nPlease try again or contact support.", "", "", ""

# Module-level analyzer instance. create_interface() binds its
# process_resume_analysis method to the Analyze button. Constructing it here
# runs ModernATSAnalyzer.__init__ (model loading) when the module is imported.
analyzer = ModernATSAnalyzer()

def create_interface():
    """Build and return the Gradio Blocks UI for the resume analyzer.

    Layout: a banner, then a two-column row (left: resume upload, job
    description box, analyze button and an info panel; right: four Markdown
    outputs), then a footer. The button click is wired to the module-level
    ``analyzer.process_resume_analysis``.

    Returns:
        The ``gr.Blocks`` interface, not yet launched.
    """
    with gr.Blocks(title="Modern ATS Analyzer 2025", theme=gr.themes.Soft()) as interface:
        # Page banner
        gr.HTML("""
        <div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;'>
            <h1>🤖 Modern ATS Resume Analyzer 2025</h1>
            <p style='font-size: 16px; margin: 10px 0;'>Powered by Latest AI Models | Dynamic Keyword Extraction | Intelligent Analysis</p>
            <p style='font-size: 14px; opacity: 0.9;'>No predefined keywords - Real ATS-like analysis using 2025 AI technology</p>
        </div>
        """)

        with gr.Row():
            # Left column: inputs
            with gr.Column(scale=1):
                gr.HTML("<h3>📄 Upload Resume</h3>")
                # type="filepath": the handler receives the upload as a path.
                resume_file = gr.File(
                    label="Upload Resume (PDF/DOCX)",
                    file_types=[".pdf", ".docx"],
                    type="filepath"
                )

                gr.HTML("<h3>📋 Job Description</h3>")
                job_description = gr.Textbox(
                    label="Paste Complete Job Description",
                    placeholder="Paste the full job posting including requirements, responsibilities, qualifications, and company information...",
                    lines=15,
                    max_lines=25
                )

                analyze_btn = gr.Button("🚀 Analyze with Modern AI", variant="primary", size="lg")

                # Static info panel
                gr.HTML("""
                <div style='margin-top: 15px; padding: 15px; background: #f0f8ff; border-radius: 8px; border-left: 4px solid #4CAF50;'>
                    <h4 style='margin: 0 0 10px 0; color: #2E7D32;'>🎯 What makes this different:</h4>
                    <ul style='margin: 0; padding-left: 20px; color: #424242;'>
                        <li><strong>No predefined keywords</strong> - Dynamically extracts relevant terms</li>
                        <li><strong>2025 AI models</strong> - Uses latest language understanding</li>
                        <li><strong>Context-aware</strong> - Understands meaning, not just word matching</li>
                        <li><strong>Real ATS simulation</strong> - Mimics actual hiring systems</li>
                    </ul>
                </div>
                """)

            # Right column: one Markdown output per element of the tuple
            # returned by process_resume_analysis, in the same order.
            with gr.Column(scale=1):
                score_output = gr.Markdown(label="🎯 Compatibility Score")
                details_output = gr.Markdown(label="📊 Detailed Analysis")
                suggestions_output = gr.Markdown(label="💡 AI Recommendations")
                keywords_output = gr.Markdown(label="🔍 Keyword Intelligence")

        # Wire the button to the analyzer.
        analyze_btn.click(
            fn=analyzer.process_resume_analysis,
            inputs=[resume_file, job_description],
            outputs=[score_output, details_output, suggestions_output, keywords_output]
        )

        # Static footer
        gr.HTML("""
        <div style='text-align: center; padding: 20px; margin-top: 30px; border-top: 2px solid #e0e0e0; background: #fafafa; border-radius: 8px;'>
            <h4 style='color: #333; margin-bottom: 15px;'>🧠 AI-Powered Analysis Engine</h4>
            <div style='display: flex; justify-content: space-around; flex-wrap: wrap;'>
                <div style='margin: 10px; text-align: center;'>
                    <strong style='color: #1976D2;'>🎯 Dynamic Keywords</strong><br>
                    <span style='font-size: 12px; color: #666;'>Extracts context-relevant terms</span>
                </div>
                <div style='margin: 10px; text-align: center;'>
                    <strong style='color: #388E3C;'>🧠 Semantic Analysis</strong><br>
                    <span style='font-size: 12px; color: #666;'>Understands meaning & context</span>
                </div>
                <div style='margin: 10px; text-align: center;'>
                    <strong style='color: #F57C00;'>📊 Multi-metric Scoring</strong><br>
                    <span style='font-size: 12px; color: #666;'>Comprehensive compatibility analysis</span>
                </div>
                <div style='margin: 10px; text-align: center;'>
                    <strong style='color: #7B1FA2;'>💡 AI Suggestions</strong><br>
                    <span style='font-size: 12px; color: #666;'>Personalized improvement tips</span>
                </div>
            </div>
            <p style='margin-top: 15px; font-size: 13px; color: #777;'>
                <em>Optimized for CPU inference • 2025 Model Architecture • Enterprise-grade Analysis</em>
            </p>
        </div>
        """)

    return interface

if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces, port 7860.
    # NOTE(review): share=True also requests a public Gradio share link -
    # confirm that is wanted for an app that receives uploaded resumes.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    app = create_interface()
    app.launch(**launch_options)