Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,84 +1,198 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
| 2 |
import requests
|
| 3 |
-
import fitz # PyMuPDF for PDF extraction
|
| 4 |
from sentence_transformers import SentenceTransformer, util
|
| 5 |
import torch
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
# -
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
@st.cache_resource
|
| 11 |
-
def load_model():
|
| 12 |
-
return SentenceTransformer("all-MiniLM-L6-v2")
|
| 13 |
-
|
| 14 |
-
model = load_model()
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
"""Extract text from uploaded PDF."""
|
| 19 |
-
text = ""
|
| 20 |
-
doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
|
| 21 |
-
for page in doc:
|
| 22 |
-
text += page.get_text("text")
|
| 23 |
-
return text
|
| 24 |
-
|
| 25 |
-
def fetch_jobs():
|
| 26 |
-
"""Fetch jobs from RemoteOK API."""
|
| 27 |
-
url = "https://remoteok.com/api"
|
| 28 |
try:
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
except Exception as e:
|
| 33 |
-
st.error(f"
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def
|
| 37 |
-
"""
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
return []
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
cv_embedding = model.encode(cv_text, convert_to_tensor=True)
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import fitz # PyMuPDF
|
| 3 |
import requests
|
|
|
|
| 4 |
from sentence_transformers import SentenceTransformer, util
|
| 5 |
import torch
|
| 6 |
+
import re
|
| 7 |
+
import io
|
| 8 |
|
| 9 |
+
# Load pre-trained Sentence Transformer model.
# Using a smaller, efficient model suitable for HF Spaces.
# Cached with st.cache_resource: Streamlit reruns this whole script on every
# user interaction, and without caching the model would be reloaded from disk
# each time (the pre-diff version of this file cached it the same way).
@st.cache_resource
def _load_model():
    """Load the embedding model once and share it across Streamlit reruns."""
    return SentenceTransformer('all-MiniLM-L6-v2')

model = _load_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
+
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of an uploaded PDF file.

    Args:
        pdf_file: A binary file-like object (e.g. Streamlit's UploadedFile)
            positioned at the start of a PDF document.

    Returns:
        The concatenated text of all pages, or None if the file could not be
        parsed (the error is also surfaced in the Streamlit UI).
    """
    try:
        pdf_bytes = pdf_file.read()
        # Use the document as a context manager so the underlying MuPDF
        # resources are released deterministically; the previous version
        # never closed the document, leaking it on every upload.
        with fitz.open(stream=io.BytesIO(pdf_bytes), filetype="pdf") as pdf_document:
            text = ""
            for page in pdf_document:
                text += page.get_text()
            return text
    except Exception as e:
        st.error(f"Error reading PDF file: {e}")
        return None
|
| 26 |
+
|
| 27 |
+
def extract_keywords(text):
    """Extract a de-duplicated list of known skill keywords from free text.

    A simple keyword extractor for skills, technologies, and certifications;
    it can be replaced with a more sophisticated NLP model if needed.
    Matching is case-insensitive, and each skill is reported once using the
    spelling of its first occurrence in the text.

    Args:
        text: The text to scan (e.g. extracted CV text); may be None or empty.

    Returns:
        A list of unique skill strings; empty when nothing matches.
    """
    if not text:
        return []
    # Basic regex skill spotting. Lookarounds replace \b because a trailing
    # \b after a non-word character (the "+" in "C++") can never match when
    # the keyword is followed by whitespace or punctuation.
    skills_pattern = (
        r'(?<!\w)(Python|Java|C\+\+|JavaScript|SQL|React|Node\.js|Angular|Vue'
        r'|AWS|Azure|GCP|Docker|Kubernetes|TensorFlow|PyTorch|Scikit-learn'
        r'|Pandas|NumPy|Git)(?!\w)'
    )
    matches = re.findall(skills_pattern, text, re.IGNORECASE)
    # Bug fix: set(matches) treated "Python" and "python" as distinct skills
    # because matching is case-insensitive but set membership is not.
    unique = {}
    for match in matches:
        unique.setdefault(match.casefold(), match)
    return list(unique.values())
|
| 39 |
+
|
| 40 |
+
def fetch_remoteok_jobs():
    """Fetch the latest job listings from the RemoteOK public API.

    Returns:
        A list of job dicts (the API's leading metadata/legal element is
        stripped), or an empty list on any network or parsing failure; errors
        are surfaced in the Streamlit UI.
    """
    try:
        # A timeout is essential here: without one a slow or unreachable API
        # would hang the entire Streamlit script run indefinitely.
        response = requests.get('https://remoteok.com/api', timeout=15)
        response.raise_for_status()  # Raise an exception for bad status codes
        jobs = response.json()
        # The first element is often a header/legal notice, so we skip it
        return jobs[1:] if isinstance(jobs, list) and len(jobs) > 1 else []
    except requests.exceptions.RequestException as e:
        st.error(f"Failed to fetch jobs from RemoteOK: {e}")
        return []
    except ValueError:
        st.error("Failed to parse JSON response from RemoteOK.")
        return []
|
| 54 |
|
| 55 |
+
def calculate_similarity(cv_text, job_description):
    """Return the cosine similarity between a CV and a job description.

    Both texts are embedded with the module-level sentence-transformer model;
    empty or missing input short-circuits to 0.0.
    """
    # Nothing to compare — treat as no match at all.
    if not (cv_text and job_description):
        return 0.0

    # Embed both passages, then score them against each other.
    embeddings = [
        model.encode(passage, convert_to_tensor=True)
        for passage in (cv_text, job_description)
    ]
    score = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return score.item()
|
| 67 |
+
|
| 68 |
+
def generate_match_explanation(cv_keywords, job_description):
    """Generate a brief explanation of why the job is a good match.

    This is a simplified implementation; for more advanced explanations a
    generative model (like GPT or T5) could be used.

    Args:
        cv_keywords: Skill keywords extracted from the CV.
        job_description: The job posting text to search.

    Returns:
        A one-sentence explanation naming up to three shared skills, or a
        generic text-similarity explanation when no keyword overlaps.
    """
    common_keywords = [
        keyword for keyword in cv_keywords
        # Bug fix: a trailing r'\b' never matches after keywords that end in
        # a non-word character (e.g. "C++"), so use explicit word-character
        # lookarounds instead of word boundaries.
        if re.search(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)',
                     job_description, re.IGNORECASE)
    ]

    if not common_keywords:
        return "This job aligns with your general profile based on the overall text similarity."

    explanation = f"This role is a strong match because it requires skills you possess, such as: **{', '.join(common_keywords[:3])}**."
    return explanation
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def notify_user(job):
    """Placeholder notification hook for high-scoring job matches."""
    # A production version would integrate with an email/messaging service;
    # for the MVP we only write to stdout, which shows up in the HF Spaces
    # server logs.
    message = (
        f"Notification Triggered: New high-match job found - "
        f"'{job['position']}' at {job['company']}. "
        f"Match: {job['match_score']:.2f}%"
    )
    print(message)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# --- Streamlit App UI ---
# NOTE(review): emoji below were mojibake in the scraped source ("π€", "π");
# restored to plausible originals — confirm against the deployed Space.
st.set_page_config(page_title="AI Job Matcher", page_icon="🤖", layout="wide")

st.title("🤖 AI-Powered Job Matcher")
st.markdown("""
Upload your CV, and this app will scan job platforms to find the best matches for your profile.
It uses sentence embeddings to understand the context of your skills and the job requirements.
""")

# --- State Management ---
# Streamlit reruns this script top-to-bottom on every interaction, so all
# cross-run state must live in st.session_state.
if 'cv_text' not in st.session_state:
    st.session_state.cv_text = None
if 'cv_keywords' not in st.session_state:
    st.session_state.cv_keywords = []
if 'jobs' not in st.session_state:
    st.session_state.jobs = []
if 'processed' not in st.session_state:
    st.session_state.processed = False


# --- Sidebar for CV Upload and Controls ---
with st.sidebar:
    st.header("1. Upload Your CV")
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    if uploaded_file is not None:
        if st.button("Process CV"):
            with st.spinner('Analyzing your CV...'):
                st.session_state.cv_text = extract_text_from_pdf(uploaded_file)
                if st.session_state.cv_text:
                    st.session_state.cv_keywords = extract_keywords(st.session_state.cv_text)
                    st.success("CV processed successfully!")
                    st.session_state.processed = False  # Reset processed state to allow re-matching
                else:
                    st.error("Could not extract text from the CV.")

    if st.session_state.cv_text:
        st.subheader("Detected Skills & Keywords:")
        if st.session_state.cv_keywords:
            st.write(", ".join(st.session_state.cv_keywords))
        else:
            st.write("No specific keywords detected. Matching will be based on overall text.")

    st.header("2. Select Job Platforms")
    use_remoteok = st.checkbox("RemoteOK", value=True)
    # Add other platforms here as they are implemented
    # use_upwork = st.checkbox("Upwork (Not Implemented)", disabled=True)
    # use_freelancer = st.checkbox("Freelancer (Not Implemented)", disabled=True)


# --- Main Content Area for Job Matching and Display ---
if st.session_state.cv_text:
    if st.button("🚀 Find My Dream Job", type="primary"):
        st.session_state.processed = False
        st.session_state.jobs = []
        matched_jobs = []

        with st.spinner("Fetching and analyzing jobs... This may take a moment."):
            if use_remoteok:
                fetched_jobs = fetch_remoteok_jobs()
                if fetched_jobs:
                    for job in fetched_jobs:
                        # Ensure job has a description, position, and company
                        if 'description' in job and 'position' in job and 'company' in job:
                            description_text = job['description']
                            similarity_score = calculate_similarity(st.session_state.cv_text, description_text)
                            job['match_score'] = similarity_score * 100
                            matched_jobs.append(job)

        if matched_jobs:
            # Sort jobs by match score in descending order
            st.session_state.jobs = sorted(matched_jobs, key=lambda x: x['match_score'], reverse=True)
            st.session_state.processed = True

            # Trigger notifications for high-matching jobs (e.g., > 70%)
            for job in st.session_state.jobs:
                if job['match_score'] > 70:
                    notify_user(job)
        else:
            st.warning("No jobs found or there was an issue with the job platforms. Please try again.")

    if st.session_state.processed and st.session_state.jobs:
        st.header("🏆 Top Job Matches For You")
        top_n = 10
        for job in st.session_state.jobs[:top_n]:
            with st.container(border=True):
                col1, col2 = st.columns([4, 1])
                with col1:
                    st.subheader(job.get('position', 'N/A'))
                    st.write(f"**Company:** {job.get('company', 'N/A')}")
                    tags = job.get('tags', [])
                    if tags:
                        st.write(f"**Tags:** `{'`, `'.join(tags[:5])}`")

                    explanation = generate_match_explanation(st.session_state.cv_keywords, job.get('description', ''))
                    st.info(f"**Why it's a match:** {explanation}")

                    st.markdown(f"[View Job Posting]({job.get('url', '#')})", unsafe_allow_html=True)

                with col2:
                    match_score = job.get('match_score', 0)
                    # Bug fix: cosine similarity can be negative, and
                    # st.progress raises for values outside 0-100 — clamp.
                    st.progress(max(0, min(100, int(match_score))))
                    st.metric(label="Match Score", value=f"{match_score:.2f}%")
else:
    st.info("Upload your CV and select job platforms to get started.")
|