Alpha108 committed
Commit d170281 · verified · 1 Parent(s): afc5985

Update app.py

Files changed (1):
  1. app.py +261 -184

app.py CHANGED
@@ -1,206 +1,283 @@
 import streamlit as st
-import fitz # PyMuPDF
 import requests
-from sentence_transformers import SentenceTransformer, util
-import torch
 import re
-import io
-
-# --- FIX ---
-# Explicitly specify the device to 'cpu'. This resolves the 'meta tensor' error
-# that can occur on Hugging Face Spaces and other environments.
 try:
-    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
-except Exception as e:
-    st.error(f"Error loading SentenceTransformer model: {e}")
-    # Stop the app if the model can't be loaded.
-    st.stop()


-def extract_text_from_pdf(pdf_file):
-    """Extracts text from an uploaded PDF file."""
     try:
-        pdf_bytes = pdf_file.read()
-        pdf_document = fitz.open(stream=io.BytesIO(pdf_bytes), filetype="pdf")
-        text = ""
-        for page_num in range(len(pdf_document)):
-            page = pdf_document.load_page(page_num)
-            text += page.get_text()
-        return text
     except Exception as e:
-        st.error(f"Error reading PDF file: {e}")
         return None

-def extract_keywords(text):
-    """
-    A simple keyword extractor for skills, technologies, and certifications.
-    This can be replaced with a more sophisticated NLP model if needed.
-    """
     if not text:
         return []
-    # Using regex to find potential skills (e.g., Python, Java, SQL, AWS, Docker)
-    # This is a basic example and can be expanded significantly.
-    skills_pattern = r'\b(Python|Java|C\+\+|JavaScript|SQL|React|Node\.js|Angular|Vue|AWS|Azure|GCP|Docker|Kubernetes|TensorFlow|PyTorch|Scikit-learn|Pandas|NumPy|Git)\b'
-    skills = re.findall(skills_pattern, text, re.IGNORECASE)
-    return list(set(skills)) # Return unique skills
-
-def fetch_remoteok_jobs():
-    """Fetches the latest jobs from the RemoteOK API."""
     try:
-        response = requests.get('https://remoteok.com/api')
-        response.raise_for_status() # Raise an exception for bad status codes
-        jobs = response.json()
-        # The first element is often a header/legal notice, so we skip it
-        return jobs[1:] if isinstance(jobs, list) and len(jobs) > 1 else []
     except requests.exceptions.RequestException as e:
-        st.error(f"Failed to fetch jobs from RemoteOK: {e}")
-        return []
-    except ValueError:
-        st.error("Failed to parse JSON response from RemoteOK.")
         return []

-def calculate_similarity(cv_text, job_description):
-    """Calculates cosine similarity between CV text and a job description."""
-    if not cv_text or not job_description:
-        return 0.0
-
-    # Generate embeddings
-    cv_embedding = model.encode(cv_text, convert_to_tensor=True, device='cpu')
-    job_embedding = model.encode(job_description, convert_to_tensor=True, device='cpu')
-
-    # Calculate cosine similarity
-    cosine_scores = util.pytorch_cos_sim(cv_embedding, job_embedding)
-    return cosine_scores.item()
-
-def generate_match_explanation(cv_keywords, job_description):
-    """
-    Generates a brief explanation of why the job is a good match.
-    This is a simplified implementation. For more advanced explanations,
-    a generative model (like GPT or T5) could be used.
-    """
-    common_keywords = [
-        keyword for keyword in cv_keywords
-        if re.search(r'\b' + re.escape(keyword) + r'\b', job_description, re.IGNORECASE)
-    ]
-
-    if not common_keywords:
-        return "This job aligns with your general profile based on the overall text similarity."
-
-    explanation = f"This role is a strong match because it requires skills you possess, such as: **{', '.join(common_keywords[:3])}**."
-    return explanation
-
-
-def notify_user(job):
-    """Placeholder for a notification function."""
-    # In a real-world application, this would integrate with an email/messaging service.
-    log_message = f"Notification Triggered: New high-match job found - '{job['position']}' at {job['company']}. Match: {job['match_score']:.2f}%"
-    print(log_message) # For MVP, we just log to console.
-    # In HF Spaces, this will appear in the server logs.
-
-
-# --- Streamlit App UI ---
-st.set_page_config(page_title="AI Job Matcher", page_icon="🤖", layout="wide")
-
-st.title("🤖 AI-Powered Job Matcher")
-st.markdown("""
-Upload your CV, and this app will scan job platforms to find the best matches for your profile.
-It uses sentence embeddings to understand the context of your skills and the job requirements.
-""")
-
-# --- State Management ---
-if 'cv_text' not in st.session_state:
-    st.session_state.cv_text = None
-if 'cv_keywords' not in st.session_state:
-    st.session_state.cv_keywords = []
 if 'jobs' not in st.session_state:
     st.session_state.jobs = []
-if 'processed' not in st.session_state:
-    st.session_state.processed = False

-
-# --- Sidebar for CV Upload and Controls ---
 with st.sidebar:
-    st.header("1. Upload Your CV")
-    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
-
-    if uploaded_file is not None:
-        if st.button("Process CV"):
-            with st.spinner('Analyzing your CV...'):
-                st.session_state.cv_text = extract_text_from_pdf(uploaded_file)
-                if st.session_state.cv_text:
-                    st.session_state.cv_keywords = extract_keywords(st.session_state.cv_text)
-                    st.success("CV processed successfully!")
-                    st.session_state.processed = False # Reset processed state to allow re-matching
-                else:
-                    st.error("Could not extract text from the CV.")
-
-    if st.session_state.cv_text:
-        st.subheader("Detected Skills & Keywords:")
-        if st.session_state.cv_keywords:
-            st.write(", ".join(st.session_state.cv_keywords))
         else:
-            st.write("No specific keywords detected. Matching will be based on overall text.")
-
-    st.header("2. Select Job Platforms")
-    use_remoteok = st.checkbox("RemoteOK", value=True)
-    # Add other platforms here as they are implemented
-    # use_upwork = st.checkbox("Upwork (Not Implemented)", disabled=True)
-    # use_freelancer = st.checkbox("Freelancer (Not Implemented)", disabled=True)
-
-
-# --- Main Content Area for Job Matching and Display ---
-if st.session_state.cv_text:
-    if st.button("🚀 Find My Dream Job", type="primary"):
-        st.session_state.processed = False
         st.session_state.jobs = []
-        matched_jobs = []
-
-        with st.spinner("Fetching and analyzing jobs... This may take a moment."):
-            if use_remoteok:
-                fetched_jobs = fetch_remoteok_jobs()
-                if fetched_jobs:
-                    for job in fetched_jobs:
-                        # Ensure job has a description, position, and company
-                        if 'description' in job and 'position' in job and 'company' in job:
-                            description_text = job['description']
-                            similarity_score = calculate_similarity(st.session_state.cv_text, description_text)
-                            job['match_score'] = similarity_score * 100
-                            matched_jobs.append(job)
-
-        if matched_jobs:
-            # Sort jobs by match score in descending order
-            st.session_state.jobs = sorted(matched_jobs, key=lambda x: x['match_score'], reverse=True)
-            st.session_state.processed = True
-
-            # Trigger notifications for high-matching jobs (e.g., > 70%)
-            for job in st.session_state.jobs:
-                if job['match_score'] > 70:
-                    notify_user(job)
-        else:
-            st.warning("No jobs found or there was an issue with the job platforms. Please try again.")
-
-if st.session_state.processed and st.session_state.jobs:
-    st.header("🏆 Top Job Matches For You")
-    top_n = 10
-    for job in st.session_state.jobs[:top_n]:
-        with st.container(border=True):
-            col1, col2 = st.columns([4, 1])
-            with col1:
-                st.subheader(job.get('position', 'N/A'))
-                st.write(f"**Company:** {job.get('company', 'N/A')}")
-                tags = job.get('tags', [])
-                if tags:
-                    st.write(f"**Tags:** `{'`, `'.join(tags[:5])}`")
-
-                explanation = generate_match_explanation(st.session_state.cv_keywords, job.get('description', ''))
-                st.info(f"**Why it's a match:** {explanation}")
-
-                st.markdown(f"[View Job Posting]({job.get('url', '#')})", unsafe_allow_html=True)
-
-            with col2:
-                match_score = job.get('match_score', 0)
-                st.progress(int(match_score))
-                st.metric(label="Match Score", value=f"{match_score:.2f}%")
-else:
-    st.info("Upload your CV and select job platforms to get started.")
-
 
 import streamlit as st
+import os
 import requests
 import re
+from datetime import datetime
+import fitz # PyMuPDF
+from docx import Document
+from collections import Counter
+import json
+
+# --- Configuration ---
+st.set_page_config(
+    page_title="AI Job Finder",
+    page_icon="🤖",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+
+# --- Hugging Face Secrets & API Keys ---
+# Try to get the API key from Streamlit secrets (for deployed apps)
 try:
+    SCRAPINGDOG_API_KEY = st.secrets["SCRAPINGDOG_API_KEY"]
+except (KeyError, AttributeError):
+    # Fallback to environment variable (for local development)
+    SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")

+# --- Helper Functions & Classes ---

+def parse_cv(uploaded_file):
+    """Parses the uploaded CV file and returns its text content."""
     try:
+        file_type = uploaded_file.type
+        if "pdf" in file_type:
+            with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
+                text = "".join(page.get_text() for page in doc)
+            return text
+        elif "vnd.openxmlformats-officedocument.wordprocessingml.document" in file_type: # DOCX
+            doc = Document(uploaded_file)
+            text = "\n".join([para.text for para in doc.paragraphs])
+            return text
+        elif "text/plain" in file_type:
+            text = uploaded_file.getvalue().decode("utf-8")
+            return text
+        else:
+            st.error(f"Unsupported file type: {file_type}")
+            return None
     except Exception as e:
+        st.error(f"Error parsing CV: {e}")
         return None

+def extract_keywords(text, top_n=25):
+    """Extracts the most common words from text to be used as keywords."""
     if not text:
         return []
+    # Basic regex to find words, removing simple punctuation
+    words = re.findall(r'\b[a-zA-Z-]{3,}\b', text.lower())
+
+    # A simple list of common English stop words.
+    # For a more robust solution, a library like NLTK would be better.
+    stop_words = set([
+        'and', 'the', 'is', 'in', 'it', 'of', 'for', 'on', 'with', 'as', 'at', 'by',
+        'to', 'a', 'an', 'that', 'this', 'i', 'you', 'he', 'she', 'we', 'they', 'was',
+        'were', 'be', 'been', 'are', 'has', 'have', 'had', 'do', 'does', 'did', 'but',
+        'if', 'or', 'so', 'not', 'from', 'about', 'more', 'my', 'your', 'our', 'their',
+        'experience', 'work', 'skills', 'responsibilities', 'project', 'projects'
+    ])
+
+    filtered_words = [word for word in words if word not in stop_words]
+    word_counts = Counter(filtered_words)
+    return [word for word, _ in word_counts.most_common(top_n)]
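+    # (Editor's note, not in the original commit: the 3-letter minimum in the
+    # regex above means short skill names such as "Go", "R", or "C++" can never
+    # surface as keywords.)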
+
+def safe_get(data, key, default='N/A'):
+    """Safely get a value from a dictionary."""
+    return data.get(key, default) if data else default
+
+class JobDataNormalizer:
+    """Normalizes job data from different sources into a common schema."""
+    @staticmethod
+    def normalize_remoteok(job):
+        return {
+            "id": safe_get(job, 'id'),
+            "title": safe_get(job, 'position'),
+            "company": safe_get(job, 'company'),
+            "location": safe_get(job, 'location', "Remote"),
+            "description": safe_get(job, 'description'),
+            "url": safe_get(job, 'url'),
+            "date_posted": safe_get(job, 'date'),
+            "source": "RemoteOK"
+        }
+
+    @staticmethod
+    def normalize_linkedin(job):
+        return {
+            "id": hash(safe_get(job, 'link')), # Create a simple ID
+            "title": safe_get(job, 'title'),
+            "company": safe_get(job, 'company'),
+            "location": safe_get(job, 'location'),
+            "description": safe_get(job, 'description'),
+            "url": safe_get(job, 'link'),
+            "date_posted": safe_get(job, 'date'),
+            "source": "LinkedIn"
+        }
+
+# --- API Agent Functions ---
+
+def search_remoteok(keywords):
+    """Searches for jobs on RemoteOK based on keywords."""
+    all_jobs = []
     try:
+        response = requests.get("https://remoteok.com/api")
+        response.raise_for_status()
+        jobs_data = response.json()
+
+        # The first item is a legal notice, so we skip it
+        for job in jobs_data[1:]:
+            job_text = f"{job.get('position', '')} {job.get('company', '')} {' '.join(job.get('tags', []))}".lower()
+            if any(keyword.lower() in job_text for keyword in keywords):
+                all_jobs.append(JobDataNormalizer.normalize_remoteok(job))
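+            # (Editor's note, not in the original commit: the check above is
+            # plain substring matching, so a keyword like "java" also matches
+            # "javascript" in the job text.)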
     except requests.exceptions.RequestException as e:
+        st.error(f"Error fetching from RemoteOK: {e}")
+    except json.JSONDecodeError:
+        st.error("Failed to parse RemoteOK response.")
+    return all_jobs
+
+def search_linkedin(keywords, location):
+    """Searches for jobs on LinkedIn via ScrapingDog API."""
+    if not SCRAPINGDOG_API_KEY:
+        st.warning("ScrapingDog API key not found. Cannot search LinkedIn.")
+        st.info("Please add your API key to your Hugging Face secrets with the name `SCRAPINGDOG_API_KEY`.")
         return []

+    all_jobs = []
+    query = " ".join(keywords)
+    api_url = f"https://api.scrapingdog.com/linkedinjobs/?api_key={SCRAPINGDOG_API_KEY}&q={query}&geoid={location}"
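+    # (Editor's note, not in the original commit: ScrapingDog's LinkedIn Jobs
+    # endpoint appears to expect geoid to be a numeric LinkedIn geography ID,
+    # so a free-text value such as "Remote" may not be honored as a location
+    # filter; worth verifying against the API docs.)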
+
+    try:
+        response = requests.get(api_url)
+        response.raise_for_status()
+        jobs_data = response.json()
+        if isinstance(jobs_data, list):
+            for job in jobs_data:
+                all_jobs.append(JobDataNormalizer.normalize_linkedin(job))
+    except requests.exceptions.HTTPError as e:
+        st.error(f"ScrapingDog API Error: {e}. Check your API key and usage limits.")
+    except requests.exceptions.RequestException as e:
+        st.error(f"Network error while contacting ScrapingDog: {e}")
+    except json.JSONDecodeError:
+        st.error("Failed to parse LinkedIn job data. The API might have returned an invalid response.")
+
+    return all_jobs
+
+# --- UI Rendering ---
+
+def display_job(job):
+    """Renders a single job listing in a card format."""
+    source_colors = {"RemoteOK": "#ff4b4b", "LinkedIn": "#0077b5"}
+    color = source_colors.get(job['source'], "#f0f2f6")
+
+    st.markdown(f"""
+    <div style="border: 1px solid #e1e4e8; border-radius: 8px; padding: 16px; margin-bottom: 16px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
+        <h3 style="margin-bottom: 8px;"><a href="{job['url']}" target="_blank" style="text-decoration: none; color: #0366d6;">{job['title']}</a></h3>
+        <p style="margin: 0;"><strong>🏢 Company:</strong> {job['company']}</p>
+        <p style="margin: 0;"><strong>📍 Location:</strong> {job['location']}</p>
+        <p style="margin: 0; color: #586069;"><strong>🗓️ Posted:</strong> {job['date_posted']}</p>
+        <div style="margin-top: 12px; display: flex; align-items: center;">
+            <span style="background-color: {color}; color: white; padding: 4px 8px; border-radius: 12px; font-size: 12px; font-weight: bold;">{job['source']}</span>
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+    with st.expander("Show Job Description Snippet"):
+        # Strip HTML tags for cleaner display
+        clean_description = re.sub('<[^<]+?>', '', job['description'])
+        st.write(clean_description[:500] + "...")
+
+# --- Main Application Logic ---
+
+# Initialize session state
+if 'keywords' not in st.session_state:
+    st.session_state.keywords = []
 if 'jobs' not in st.session_state:
     st.session_state.jobs = []
+if 'searched' not in st.session_state:
+    st.session_state.searched = False

+# --- Sidebar ---
 with st.sidebar:
+    st.image("https://images.emojiterra.com/twitter/v14.0/512px/1f916.png", width=80)
+    st.title("AI Job Finder")
+    st.markdown("""
+    Welcome! This app helps you find relevant job postings by analyzing your CV.
+
+    **How it works:**
+    1. **Upload your CV** (PDF, DOCX, TXT).
+    2. The app **extracts key skills**.
+    3. **Select/add skills** to search for.
+    4. **Search** across multiple job platforms.
+    """)
+
+    st.header("API Key Setup")
+    st.markdown("""
+    To search on LinkedIn, you need a **ScrapingDog API key**.
+    - Get a free key at [scrapingdog.com](https://www.scrapingdog.com/).
+    - In your Hugging Face Space, go to **Settings > Secrets** and add a secret named `SCRAPINGDOG_API_KEY` with your key as the value.
+    """)
+
+# --- Main Content ---
+st.header("1. Upload Your CV")
+uploaded_file = st.file_uploader(
+    "Upload your CV to automatically extract keywords.",
+    type=["pdf", "docx", "txt"],
+    accept_multiple_files=False
+)
+
+if uploaded_file:
+    with st.spinner("Analyzing your CV... 🧠"):
+        cv_text = parse_cv(uploaded_file)
+        if cv_text:
+            st.session_state.keywords = extract_keywords(cv_text)
+            st.success("CV analyzed successfully! Keywords have been extracted below.")
+
+st.header("2. Select and Refine Your Keywords")
+manual_keywords_input = st.text_input(
+    "Add more keywords (comma-separated)",
+    placeholder="e.g., python, data science, machine learning"
+)
+
+# Combine CV keywords with manually added ones
+manual_keywords = [k.strip() for k in manual_keywords_input.split(',') if k.strip()]
+combined_keywords = sorted(list(set(st.session_state.keywords + manual_keywords)))
+
+selected_keywords = st.multiselect(
+    "Choose the keywords you want to search for:",
+    options=combined_keywords,
+    default=st.session_state.keywords
+)
+
+st.header("3. Search for Jobs")
+location = st.text_input("Enter Location (e.g., 'United States' or leave empty for remote)", "Remote")
+
+col1, col2 = st.columns(2)
+with col1:
+    if st.button("🚀 Search Jobs", type="primary", use_container_width=True):
+        if not selected_keywords:
+            st.warning("Please select at least one keyword to search.")
         else:
+            st.session_state.jobs = [] # Clear previous results
+            st.session_state.searched = True
+            with st.spinner("Searching across job platforms... This may take a moment."):
+                remoteok_jobs = search_remoteok(selected_keywords)
+                linkedin_jobs = search_linkedin(selected_keywords, location)
+
+                # Combine and deduplicate
+                all_jobs = remoteok_jobs + linkedin_jobs
+                unique_jobs = []
+                seen_jobs = set()
+
+                for job in all_jobs:
+                    identifier = (job['title'], job['company'], job['url'])
+                    if identifier not in seen_jobs:
+                        unique_jobs.append(job)
+                        seen_jobs.add(identifier)
+
+                # Sort by date
+                unique_jobs.sort(key=lambda x: x.get('date_posted', ''), reverse=True)
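+                # (Editor's note, not in the original commit: date_posted
+                # values are compared as strings here, and jobs whose date is
+                # the 'N/A' placeholder sort above ISO-dated ones in this
+                # reverse sort.)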
+                st.session_state.jobs = unique_jobs
+                st.success(f"Found {len(unique_jobs)} unique jobs!")
+
+with col2:
+    if st.button("Reset", use_container_width=True):
+        st.session_state.keywords = []
         st.session_state.jobs = []
+        st.session_state.searched = False
+        st.rerun()
+
+
+# --- Display Results ---
+if st.session_state.searched:
+    st.header(f"💼 Job Listings ({len(st.session_state.jobs)} Found)")
+    if st.session_state.jobs:
+        for job in st.session_state.jobs:
+            display_job(job)
+    else:
+        st.info("No jobs found matching your criteria. Try different keywords or broaden your search.")