Spaces:

lexicalspace
/

Heavy-Tools

Paused

App Files Files Community

lexicalspace committed on Jan 30

Commit

5114bea

verified ·

1 Parent(s): a1b9a4e

Create app.py

Browse files

Files changed (1) hide show

app.py +253 -0

app.py ADDED Viewed

	@@ -0,0 +1,253 @@

+import streamlit as st
+import pandas as pd
+import pypdf
+import re
+import io
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import plotly.graph_objects as go
+import plotly.express as px
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
# --- 1. SYSTEM CONFIGURATION & SETUP ---
# Page-level settings (tab title and icon, wide layout, sidebar expanded on
# first load), passed to Streamlit in a single call.
st.set_page_config(
    **{
        "page_title": "Smart ATS Optimizer",
        "page_icon": "🎯",
        "layout": "wide",
        "initial_sidebar_state": "expanded",
    }
)
# NLTK Setup (wrapped in st.cache_resource so the lookup/download is not
# repeated on every Streamlit rerun)
@st.cache_resource
def setup_nltk():
    """Make sure the NLTK data used by the text-cleaning step is installed.

    Each resource is looked up locally first and downloaded only when the
    lookup raises ``LookupError``.

    ``punkt_tab`` is ensured in addition to ``punkt``: NLTK 3.9+ loads the
    Punkt tokenizer tables from ``tokenizers/punkt_tab``, so ``word_tokenize``
    raises ``LookupError`` on a fresh install that only has ``punkt``.
    ``punkt`` is still ensured for older NLTK releases.
    """
    required_resources = (
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
        ("corpora/stopwords", "stopwords"),
    )
    for resource_path, package_name in required_resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_name)


setup_nltk()
+# --- 2. BACKEND LOGIC (The Complex Part) ---
def extract_text_from_pdf(uploaded_file):
    """
    Parse a PDF and return the text of all its pages.

    Args:
        uploaded_file: The object handed to ``pypdf.PdfReader`` (here, the
            value returned by the Streamlit file uploader).

    Returns:
        The text of every page that yielded any text, joined with newlines;
        an empty string when no page yielded text; or ``None`` when reading
        or extraction raised, in which case the error is shown with
        ``st.error``.
    """
    try:
        pdf_reader = pypdf.PdfReader(uploaded_file)
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    except Exception as e:
        # UI boundary: surface any parser failure to the user instead of
        # crashing the app.
        st.error(f"Error reading PDF: {str(e)}")
        return None
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page.
    return "\n".join(content for content in page_texts if content)
def clean_text(text):
    """
    Normalize free text for keyword comparison.

    NLP Pipeline:
    1. Lowercase
    2. Replace every character that is not a letter, digit or whitespace
       with a space
    3. Tokenize (split into words)
    4. Drop English stopwords and tokens of one or two characters

    Args:
        text: Raw text (resume or job description).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # 1. Regex Cleaning
    # Substitute a space rather than deleting the character: deleting glues
    # neighbours together ("python/sql" -> "pythonsql", "company's" ->
    # "companys") and corrupts the keywords compared downstream.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
    # 2. Tokenization & Stopword Removal
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [w for w in words if w not in stop_words and len(w) > 2]
    return " ".join(filtered_words)
def calculate_similarity(resume_text, job_desc_text):
    """
    Score how closely a resume matches a job description.

    Both texts are vectorized together with TF-IDF and compared with cosine
    similarity.

    Args:
        resume_text: Cleaned resume text.
        job_desc_text: Cleaned job-description text.

    Returns:
        A tuple ``(match_percentage, missing_keywords, df)``:
        - ``match_percentage``: cosine similarity of the two documents
          multiplied by 100.
        - ``missing_keywords``: terms with a positive job score and a zero
          resume score, ordered by job score (highest first).
        - ``df``: DataFrame with columns ``Keyword``, ``Resume Score`` and
          ``Job Score`` for every term that scores in either document.

        When the vectorizer cannot build a vocabulary from the two texts,
        the result is ``(0.0, [], <empty DataFrame with the same columns>)``.
    """
    columns = ['Keyword', 'Resume Score', 'Job Score']
    tfidf = TfidfVectorizer()
    try:
        vectors = tfidf.fit_transform([resume_text, job_desc_text])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when neither
        # document contains a usable token; report "no match" instead of
        # crashing the caller.
        return 0.0, [], pd.DataFrame(columns=columns)
    # Result is a 2x2 matrix like [[1, 0.7], [0.7, 1]]; entry [0][1] is the
    # similarity between Doc 0 (Resume) and Doc 1 (Job).
    similarity_matrix = cosine_similarity(vectors)
    match_percentage = similarity_matrix[0][1] * 100
    # Per-term TF-IDF weights of each document, aligned with the feature names
    feature_names = tfidf.get_feature_names_out()
    dense_vector = vectors.todense()
    resume_vector = dense_vector[0].tolist()[0]
    job_vector = dense_vector[1].tolist()[0]
    df = pd.DataFrame({
        columns[0]: feature_names,
        columns[1]: resume_vector,
        columns[2]: job_vector
    })
    # Keep only terms that score in at least one of the two documents
    df = df[(df['Resume Score'] > 0) | (df['Job Score'] > 0)]
    # Missing keywords: present in the job description, absent from the resume
    missing_rows = df[(df['Job Score'] > 0) & (df['Resume Score'] == 0)]
    # Rank by job-description weight so a caller that truncates the list keeps
    # the most important terms. sorted() is stable, so equally weighted terms
    # stay in alphabetical (feature-name) order.
    ranked = sorted(
        zip(missing_rows['Job Score'].tolist(), missing_rows['Keyword'].tolist()),
        key=lambda pair: -pair[0],
    )
    missing_keywords = [keyword for _, keyword in ranked]
    return match_percentage, missing_keywords, df
def analyze_structure(text):
    """
    Check raw resume text for essential contact details and sections.

    Args:
        text: Raw (uncleaned) resume text.

    Returns:
        A list of human-readable issue strings, in this order: missing
        email, missing phone number, missing sections. The list is empty
        when nothing is missing.
    """
    issues = []
    # Email Check
    if not re.search(r'[\w\.-]+@[\w\.-]+', text):
        issues.append("❌ Missing Email Address")
    # Phone Check
    # North-American grouping: 3-3-4 digits with optional separators, or
    # (123) 456-7890.
    national_phone = r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}'
    # International form: "+" followed by 8 to 15 digits in any grouping
    # (e.g. "+91 98765 43210", "+44 20 7946 0958"). Requiring the "+" keeps
    # date ranges such as "2019 - 2021" from counting as a phone number.
    international_phone = r'\+\d(?:[\s().-]*\d){7,14}'
    has_phone = (
        re.search(national_phone, text) is not None
        or re.search(international_phone, text) is not None
    )
    if not has_phone:
        issues.append("⚠️ Missing Phone Number")
    # Section Checks (simple case-insensitive substring search)
    sections = ['experience', 'education', 'skills', 'projects']
    lowered = text.lower()  # lowercase once instead of once per section
    missing_sections = [s.capitalize() for s in sections if s not in lowered]
    if missing_sections:
        issues.append(f"⚠️ Missing Sections: {', '.join(missing_sections)}")
    return issues
# --- 3. FRONTEND UI (Streamlit) ---
def _problem_delta(count):
    """Return the st.metric delta for a problem count: ``-count``, or None for zero."""
    return -count if count else None


def _render_summary(match_score, missing_keywords, structure_issues, target):
    """Render the three headline metric cards and the match-score gauge."""
    m1, m2, m3 = st.columns(3)
    m1.metric("Match Score", f"{match_score:.1f}%", delta=f"{match_score - target:.1f}% vs Target")
    # Problem counts use a negative delta with the default ("normal")
    # colouring, which st.metric draws in red. Pairing a negative delta with
    # delta_color="inverse" would paint problems green. A zero count shows no
    # delta at all.
    m2.metric("Missing Keywords", len(missing_keywords), delta=_problem_delta(len(missing_keywords)))
    m3.metric("Structure Issues", len(structure_issues), delta=_problem_delta(len(structure_issues)))
    # Gauge Chart (Visual Appeal)
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=match_score,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "ATS Confidence Score"},
        gauge={
            'axis': {'range': [None, 100]},
            'bar': {'color': "#FF4B4B"},
            'steps': [
                {'range': [0, 50], 'color': "#fce4e4"},
                {'range': [50, 75], 'color': "#fccfcf"},
                {'range': [75, 100], 'color': "#ffb3b3"}],
        }
    ))
    st.plotly_chart(fig, use_container_width=True)


def _render_keyword_tab(missing_keywords, keyword_df):
    """Render the missing-keyword chips and the resume-vs-JD keyword bar chart."""
    st.subheader("Missing Hard Skills & Keywords")
    st.caption("These words appear frequently in the Job Description but are missing from your Resume.")
    if missing_keywords:
        # Display as chips/tags (first 20 only, with a count of the rest)
        st.markdown(" ".join([f"`{k}`" for k in missing_keywords[:20]]))
        if len(missing_keywords) > 20:
            st.info(f"...and {len(missing_keywords)-20} more.")
    else:
        st.success("Amazing! You have all the key keywords.")
    # Keyword Overlap Chart
    st.subheader("Keyword Frequency Comparison")
    # Top 15 keywords by job-description score
    top_keywords = keyword_df.sort_values(by='Job Score', ascending=False).head(15)
    bar_fig = px.bar(
        top_keywords,
        x='Keyword',
        y=['Job Score', 'Resume Score'],
        barmode='group',
        title="Top Keyword Importance (Resume vs JD)"
    )
    st.plotly_chart(bar_fig, use_container_width=True)


def _render_structure_tab(structure_issues):
    """Render the structure findings, or a success message when there are none."""
    st.subheader("Formatting & Structure Check")
    if structure_issues:
        for issue in structure_issues:
            st.error(issue)
        st.info("Tip: Ensure your resume has clear headings for Experience, Education, and Skills.")
    else:
        st.success("✅ Your resume structure looks great! Essential contact info and sections detected.")


def _render_debug_tab(clean_resume, clean_jd):
    """Render the cleaned resume and job-description text inside expanders."""
    st.subheader("Processed Text Debug")
    with st.expander("View Cleaned Resume Text"):
        st.write(clean_resume)
    with st.expander("View Cleaned JD Text"):
        st.write(clean_jd)


# Sidebar
st.sidebar.header("⚙️ Controls")
st.sidebar.info(
    "This tool uses TF-IDF Vectorization and Cosine Similarity "
    "to analyze how well your resume matches a specific job description."
)
confidence_threshold = st.sidebar.slider("Match Threshold (Target)", 0, 100, 75)
# Main Content
st.title("🎯 Smart ATS Resume Optimizer")
st.markdown("Optimize your resume for Applicant Tracking Systems (ATS) using AI-driven text analysis.")
# Layout: Two Columns for Input
col1, col2 = st.columns(2)
with col1:
    st.subheader("1. Upload Resume")
    uploaded_file = st.file_uploader("Upload PDF", type=['pdf'], help="Only PDF files are supported")
with col2:
    st.subheader("2. Job Description")
    job_description = st.text_area("Paste JD here...", height=300, placeholder="Copy text from LinkedIn/Indeed...")
# Start Analysis Button
if st.button("🚀 Analyze Resume", type="primary"):
    # A whitespace-only job description counts as missing.
    if not (uploaded_file and job_description and job_description.strip()):
        st.warning("Please upload a resume and paste a job description.")
    else:
        with st.spinner("Parsing PDF and crunching numbers..."):
            # A. Text Extraction
            resume_text = extract_text_from_pdf(uploaded_file)
            if resume_text and resume_text.strip():
                # B. NLP Cleaning
                clean_resume = clean_text(resume_text)
                clean_jd = clean_text(job_description)
                # C. Analysis Engine
                match_score, missing_keywords, keyword_df = calculate_similarity(clean_resume, clean_jd)
                structure_issues = analyze_structure(resume_text)
                # --- RESULTS DASHBOARD ---
                st.divider()
                st.markdown("### 📊 Analysis Report")
                _render_summary(match_score, missing_keywords, structure_issues, confidence_threshold)
                # Detail Tabs
                tab1, tab2, tab3 = st.tabs(["🔍 Keyword Gap", "📝 Resume Structure", "🛠️ Raw Data"])
                with tab1:
                    _render_keyword_tab(missing_keywords, keyword_df)
                with tab2:
                    _render_structure_tab(structure_issues)
                with tab3:
                    _render_debug_tab(clean_resume, clean_jd)
            elif resume_text is not None:
                # The PDF was read but contained no text (None means the
                # reader failed and already reported its own error). Tell the
                # user instead of silently showing nothing.
                st.warning(
                    "No text could be extracted from this PDF. If it is a scanned "
                    "document, please upload a text-based PDF instead."
                )