Alpha108 committed
Commit d170281 · verified · 1 Parent(s): afc5985

Update app.py

Files changed (1):
  1. app.py +261 -184

app.py CHANGED
@@ -1,206 +1,283 @@
 import streamlit as st
-import fitz # PyMuPDF
 import requests
-from sentence_transformers import SentenceTransformer, util
-import torch
 import re
-import io
-
-# --- FIX ---
-# Explicitly specify the device to 'cpu'. This resolves the 'meta tensor' error
-# that can occur on Hugging Face Spaces and other environments.
 try:
-    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
-except Exception as e:
-    st.error(f"Error loading SentenceTransformer model: {e}")
-    # Stop the app if the model can't be loaded.
-    st.stop()


-def extract_text_from_pdf(pdf_file):
-    """Extracts text from an uploaded PDF file."""
     try:
-        pdf_bytes = pdf_file.read()
-        pdf_document = fitz.open(stream=io.BytesIO(pdf_bytes), filetype="pdf")
-        text = ""
-        for page_num in range(len(pdf_document)):
-            page = pdf_document.load_page(page_num)
-            text += page.get_text()
-        return text
     except Exception as e:
-        st.error(f"Error reading PDF file: {e}")
         return None

-def extract_keywords(text):
-    """
-    A simple keyword extractor for skills, technologies, and certifications.
-    This can be replaced with a more sophisticated NLP model if needed.
-    """
     if not text:
         return []
-    # Using regex to find potential skills (e.g., Python, Java, SQL, AWS, Docker)
-    # This is a basic example and can be expanded significantly.
-    skills_pattern = r'\b(Python|Java|C\+\+|JavaScript|SQL|React|Node\.js|Angular|Vue|AWS|Azure|GCP|Docker|Kubernetes|TensorFlow|PyTorch|Scikit-learn|Pandas|NumPy|Git)\b'
-    skills = re.findall(skills_pattern, text, re.IGNORECASE)
-    return list(set(skills)) # Return unique skills
-
-def fetch_remoteok_jobs():
-    """Fetches the latest jobs from the RemoteOK API."""
     try:
-        response = requests.get('https://remoteok.com/api')
-        response.raise_for_status() # Raise an exception for bad status codes
-        jobs = response.json()
-        # The first element is often a header/legal notice, so we skip it
-        return jobs[1:] if isinstance(jobs, list) and len(jobs) > 1 else []
     except requests.exceptions.RequestException as e:
-        st.error(f"Failed to fetch jobs from RemoteOK: {e}")
-        return []
-    except ValueError:
-        st.error("Failed to parse JSON response from RemoteOK.")
         return []

-def calculate_similarity(cv_text, job_description):
-    """Calculates cosine similarity between CV text and a job description."""
-    if not cv_text or not job_description:
-        return 0.0
-
-    # Generate embeddings
-    cv_embedding = model.encode(cv_text, convert_to_tensor=True, device='cpu')
-    job_embedding = model.encode(job_description, convert_to_tensor=True, device='cpu')
-
-    # Calculate cosine similarity
-    cosine_scores = util.pytorch_cos_sim(cv_embedding, job_embedding)
-    return cosine_scores.item()
-
-def generate_match_explanation(cv_keywords, job_description):
-    """
-    Generates a brief explanation of why the job is a good match.
-    This is a simplified implementation. For more advanced explanations,
-    a generative model (like GPT or T5) could be used.
-    """
-    common_keywords = [
-        keyword for keyword in cv_keywords
-        if re.search(r'\b' + re.escape(keyword) + r'\b', job_description, re.IGNORECASE)
-    ]
-
-    if not common_keywords:
-        return "This job aligns with your general profile based on the overall text similarity."
-
-    explanation = f"This role is a strong match because it requires skills you possess, such as: **{', '.join(common_keywords[:3])}**."
-    return explanation
-
-
-def notify_user(job):
-    """Placeholder for a notification function."""
-    # In a real-world application, this would integrate with an email/messaging service.
-    log_message = f"Notification Triggered: New high-match job found - '{job['position']}' at {job['company']}. Match: {job['match_score']:.2f}%"
-    print(log_message) # For MVP, we just log to console.
-    # In HF Spaces, this will appear in the server logs.
-
-
-# --- Streamlit App UI ---
-st.set_page_config(page_title="AI Job Matcher", page_icon="🤖", layout="wide")
-
-st.title("🤖 AI-Powered Job Matcher")
-st.markdown("""
-Upload your CV, and this app will scan job platforms to find the best matches for your profile.
-It uses sentence embeddings to understand the context of your skills and the job requirements.
-""")
-
-# --- State Management ---
-if 'cv_text' not in st.session_state:
-    st.session_state.cv_text = None
-if 'cv_keywords' not in st.session_state:
-    st.session_state.cv_keywords = []
 if 'jobs' not in st.session_state:
     st.session_state.jobs = []
-if 'processed' not in st.session_state:
-    st.session_state.processed = False

-
-# --- Sidebar for CV Upload and Controls ---
 with st.sidebar:
-    st.header("1. Upload Your CV")
-    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
-
-    if uploaded_file is not None:
-        if st.button("Process CV"):
-            with st.spinner('Analyzing your CV...'):
-                st.session_state.cv_text = extract_text_from_pdf(uploaded_file)
-                if st.session_state.cv_text:
-                    st.session_state.cv_keywords = extract_keywords(st.session_state.cv_text)
-                    st.success("CV processed successfully!")
-                    st.session_state.processed = False # Reset processed state to allow re-matching
-                else:
-                    st.error("Could not extract text from the CV.")
-
-    if st.session_state.cv_text:
-        st.subheader("Detected Skills & Keywords:")
-        if st.session_state.cv_keywords:
-            st.write(", ".join(st.session_state.cv_keywords))
         else:
-            st.write("No specific keywords detected. Matching will be based on overall text.")
-
-    st.header("2. Select Job Platforms")
-    use_remoteok = st.checkbox("RemoteOK", value=True)
-    # Add other platforms here as they are implemented
-    # use_upwork = st.checkbox("Upwork (Not Implemented)", disabled=True)
-    # use_freelancer = st.checkbox("Freelancer (Not Implemented)", disabled=True)
-
-
-# --- Main Content Area for Job Matching and Display ---
-if st.session_state.cv_text:
-    if st.button("🚀 Find My Dream Job", type="primary"):
-        st.session_state.processed = False
         st.session_state.jobs = []
-        matched_jobs = []
-
-        with st.spinner("Fetching and analyzing jobs... This may take a moment."):
-            if use_remoteok:
-                fetched_jobs = fetch_remoteok_jobs()
-                if fetched_jobs:
-                    for job in fetched_jobs:
-                        # Ensure job has a description, position, and company
-                        if 'description' in job and 'position' in job and 'company' in job:
-                            description_text = job['description']
-                            similarity_score = calculate_similarity(st.session_state.cv_text, description_text)
-                            job['match_score'] = similarity_score * 100
-                            matched_jobs.append(job)
-
-        if matched_jobs:
-            # Sort jobs by match score in descending order
-            st.session_state.jobs = sorted(matched_jobs, key=lambda x: x['match_score'], reverse=True)
-            st.session_state.processed = True
-
-            # Trigger notifications for high-matching jobs (e.g., > 70%)
-            for job in st.session_state.jobs:
-                if job['match_score'] > 70:
-                    notify_user(job)
-        else:
-            st.warning("No jobs found or there was an issue with the job platforms. Please try again.")
-
-if st.session_state.processed and st.session_state.jobs:
-    st.header("🏆 Top Job Matches For You")
-    top_n = 10
-    for job in st.session_state.jobs[:top_n]:
-        with st.container(border=True):
-            col1, col2 = st.columns([4, 1])
-            with col1:
-                st.subheader(job.get('position', 'N/A'))
-                st.write(f"**Company:** {job.get('company', 'N/A')}")
-                tags = job.get('tags', [])
-                if tags:
-                    st.write(f"**Tags:** `{'`, `'.join(tags[:5])}`")
-
-                explanation = generate_match_explanation(st.session_state.cv_keywords, job.get('description', ''))
-                st.info(f"**Why it's a match:** {explanation}")
-
-                st.markdown(f"[View Job Posting]({job.get('url', '#')})", unsafe_allow_html=True)
-
-            with col2:
-                match_score = job.get('match_score', 0)
-                st.progress(int(match_score))
-                st.metric(label="Match Score", value=f"{match_score:.2f}%")
-else:
-    st.info("Upload your CV and select job platforms to get started.")
-
 
 import streamlit as st
+import os
 import requests
 import re
+from datetime import datetime
+import fitz # PyMuPDF
+from docx import Document
+from collections import Counter
+import json
+
+# --- Configuration ---
+st.set_page_config(
+    page_title="AI Job Finder",
+    page_icon="🤖",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+
+# --- Hugging Face Secrets & API Keys ---
+# Try to get the API key from Streamlit secrets (for deployed apps)
 try:
+    SCRAPINGDOG_API_KEY = st.secrets["SCRAPINGDOG_API_KEY"]
+except (KeyError, AttributeError):
+    # Fallback to environment variable (for local development)
+    SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")

+# --- Helper Functions & Classes ---

+def parse_cv(uploaded_file):
+    """Parses the uploaded CV file and returns its text content."""
     try:
+        file_type = uploaded_file.type
+        if "pdf" in file_type:
+            with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
+                text = "".join(page.get_text() for page in doc)
+            return text
+        elif "vnd.openxmlformats-officedocument.wordprocessingml.document" in file_type: # DOCX
+            doc = Document(uploaded_file)
+            text = "\n".join([para.text for para in doc.paragraphs])
+            return text
+        elif "text/plain" in file_type:
+            text = uploaded_file.getvalue().decode("utf-8")
+            return text
+        else:
+            st.error(f"Unsupported file type: {file_type}")
+            return None
     except Exception as e:
+        st.error(f"Error parsing CV: {e}")
         return None

+def extract_keywords(text, top_n=25):
+    """Extracts the most common words from text to be used as keywords."""
     if not text:
         return []
+    # Basic regex to find words, removing simple punctuation
+    words = re.findall(r'\b[a-zA-Z-]{3,}\b', text.lower())
+
+    # A simple list of common English stop words.
+    # For a more robust solution, a library like NLTK would be better.
+    stop_words = set([
+        'and', 'the', 'is', 'in', 'it', 'of', 'for', 'on', 'with', 'as', 'at', 'by',
+        'to', 'a', 'an', 'that', 'this', 'i', 'you', 'he', 'she', 'we', 'they', 'was',
+        'were', 'be', 'been', 'are', 'has', 'have', 'had', 'do', 'does', 'did', 'but',
+        'if', 'or', 'so', 'not', 'from', 'about', 'more', 'my', 'your', 'our', 'their',
+        'experience', 'work', 'skills', 'responsibilities', 'project', 'projects'
+    ])
+
+    filtered_words = [word for word in words if word not in stop_words]
+    word_counts = Counter(filtered_words)
+    return [word for word, _ in word_counts.most_common(top_n)]
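+    # (Editor's note, not in the original commit: the 3-letter minimum in the
+    # regex above means short skill names such as "Go", "R", or "C++" can never
+    # surface as keywords.)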
+
+def safe_get(data, key, default='N/A'):
+    """Safely get a value from a dictionary."""
+    return data.get(key, default) if data else default
+
+class JobDataNormalizer:
+    """Normalizes job data from different sources into a common schema."""
+    @staticmethod
+    def normalize_remoteok(job):
+        return {
+            "id": safe_get(job, 'id'),
+            "title": safe_get(job, 'position'),
+            "company": safe_get(job, 'company'),
+            "location": safe_get(job, 'location', "Remote"),
+            "description": safe_get(job, 'description'),
+            "url": safe_get(job, 'url'),
+            "date_posted": safe_get(job, 'date'),
+            "source": "RemoteOK"
+        }
+
+    @staticmethod
+    def normalize_linkedin(job):
+        return {
+            "id": hash(safe_get(job, 'link')), # Create a simple ID
+            "title": safe_get(job, 'title'),
+            "company": safe_get(job, 'company'),
+            "location": safe_get(job, 'location'),
+            "description": safe_get(job, 'description'),
+            "url": safe_get(job, 'link'),
+            "date_posted": safe_get(job, 'date'),
+            "source": "LinkedIn"
+        }
+
+# --- API Agent Functions ---
+
+def search_remoteok(keywords):
+    """Searches for jobs on RemoteOK based on keywords."""
+    all_jobs = []
     try:
+        response = requests.get("https://remoteok.com/api")
+        response.raise_for_status()
+        jobs_data = response.json()
+
+        # The first item is a legal notice, so we skip it
+        for job in jobs_data[1:]:
+            job_text = f"{job.get('position', '')} {job.get('company', '')} {' '.join(job.get('tags', []))}".lower()
+            if any(keyword.lower() in job_text for keyword in keywords):
+                all_jobs.append(JobDataNormalizer.normalize_remoteok(job))
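+            # (Editor's note, not in the original commit: the check above is
+            # plain substring matching, so a keyword like "java" also matches
+            # "javascript" in the job text.)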
     except requests.exceptions.RequestException as e:
+        st.error(f"Error fetching from RemoteOK: {e}")
+    except json.JSONDecodeError:
+        st.error("Failed to parse RemoteOK response.")
+    return all_jobs
+
+def search_linkedin(keywords, location):
+    """Searches for jobs on LinkedIn via ScrapingDog API."""
+    if not SCRAPINGDOG_API_KEY:
+        st.warning("ScrapingDog API key not found. Cannot search LinkedIn.")
+        st.info("Please add your API key to your Hugging Face secrets with the name `SCRAPINGDOG_API_KEY`.")
         return []

+    all_jobs = []
+    query = " ".join(keywords)
+    api_url = f"https://api.scrapingdog.com/linkedinjobs/?api_key={SCRAPINGDOG_API_KEY}&q={query}&geoid={location}"
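+    # (Editor's note, not in the original commit: ScrapingDog's LinkedIn Jobs
+    # endpoint appears to expect geoid to be a numeric LinkedIn geography ID,
+    # so a free-text value such as "Remote" may not be honored as a location
+    # filter; worth verifying against the API docs.)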
+
+    try:
+        response = requests.get(api_url)
+        response.raise_for_status()
+        jobs_data = response.json()
+        if isinstance(jobs_data, list):
+            for job in jobs_data:
+                all_jobs.append(JobDataNormalizer.normalize_linkedin(job))
+    except requests.exceptions.HTTPError as e:
+        st.error(f"ScrapingDog API Error: {e}. Check your API key and usage limits.")
+    except requests.exceptions.RequestException as e:
+        st.error(f"Network error while contacting ScrapingDog: {e}")
+    except json.JSONDecodeError:
+        st.error("Failed to parse LinkedIn job data. The API might have returned an invalid response.")
+
+    return all_jobs
+
+# --- UI Rendering ---
+
+def display_job(job):
+    """Renders a single job listing in a card format."""
+    source_colors = {"RemoteOK": "#ff4b4b", "LinkedIn": "#0077b5"}
+    color = source_colors.get(job['source'], "#f0f2f6")
+
+    st.markdown(f"""
+    <div style="border: 1px solid #e1e4e8; border-radius: 8px; padding: 16px; margin-bottom: 16px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
+        <h3 style="margin-bottom: 8px;"><a href="{job['url']}" target="_blank" style="text-decoration: none; color: #0366d6;">{job['title']}</a></h3>
+        <p style="margin: 0;"><strong>🏢 Company:</strong> {job['company']}</p>
+        <p style="margin: 0;"><strong>📍 Location:</strong> {job['location']}</p>
+        <p style="margin: 0; color: #586069;"><strong>🗓️ Posted:</strong> {job['date_posted']}</p>
+        <div style="margin-top: 12px; display: flex; align-items: center;">
+            <span style="background-color: {color}; color: white; padding: 4px 8px; border-radius: 12px; font-size: 12px; font-weight: bold;">{job['source']}</span>
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+    with st.expander("Show Job Description Snippet"):
+        # Strip HTML tags for cleaner display
+        clean_description = re.sub('<[^<]+?>', '', job['description'])
+        st.write(clean_description[:500] + "...")
+
+# --- Main Application Logic ---
+
+# Initialize session state
+if 'keywords' not in st.session_state:
+    st.session_state.keywords = []
 if 'jobs' not in st.session_state:
     st.session_state.jobs = []
+if 'searched' not in st.session_state:
+    st.session_state.searched = False

+# --- Sidebar ---
 with st.sidebar:
+    st.image("https://images.emojiterra.com/twitter/v14.0/512px/1f916.png", width=80)
+    st.title("AI Job Finder")
+    st.markdown("""
+    Welcome! This app helps you find relevant job postings by analyzing your CV.
+
+    **How it works:**
+    1. **Upload your CV** (PDF, DOCX, TXT).
+    2. The app **extracts key skills**.
+    3. **Select/add skills** to search for.
+    4. **Search** across multiple job platforms.
+    """)
+
+    st.header("API Key Setup")
+    st.markdown("""
+    To search on LinkedIn, you need a **ScrapingDog API key**.
+    - Get a free key at [scrapingdog.com](https://www.scrapingdog.com/).
+    - In your Hugging Face Space, go to **Settings > Secrets** and add a secret named `SCRAPINGDOG_API_KEY` with your key as the value.
+    """)
+
+# --- Main Content ---
+st.header("1. Upload Your CV")
+uploaded_file = st.file_uploader(
+    "Upload your CV to automatically extract keywords.",
+    type=["pdf", "docx", "txt"],
+    accept_multiple_files=False
+)
+
+if uploaded_file:
+    with st.spinner("Analyzing your CV... 🧠"):
+        cv_text = parse_cv(uploaded_file)
+        if cv_text:
+            st.session_state.keywords = extract_keywords(cv_text)
+            st.success("CV analyzed successfully! Keywords have been extracted below.")
+
+st.header("2. Select and Refine Your Keywords")
+manual_keywords_input = st.text_input(
+    "Add more keywords (comma-separated)",
+    placeholder="e.g., python, data science, machine learning"
+)
+
+# Combine CV keywords with manually added ones
+manual_keywords = [k.strip() for k in manual_keywords_input.split(',') if k.strip()]
+combined_keywords = sorted(list(set(st.session_state.keywords + manual_keywords)))
+
+selected_keywords = st.multiselect(
+    "Choose the keywords you want to search for:",
+    options=combined_keywords,
+    default=st.session_state.keywords
+)
+
+st.header("3. Search for Jobs")
+location = st.text_input("Enter Location (e.g., 'United States' or leave empty for remote)", "Remote")
+
+col1, col2 = st.columns(2)
+with col1:
+    if st.button("🚀 Search Jobs", type="primary", use_container_width=True):
+        if not selected_keywords:
+            st.warning("Please select at least one keyword to search.")
         else:
+            st.session_state.jobs = [] # Clear previous results
+            st.session_state.searched = True
+            with st.spinner("Searching across job platforms... This may take a moment."):
+                remoteok_jobs = search_remoteok(selected_keywords)
+                linkedin_jobs = search_linkedin(selected_keywords, location)
+
+                # Combine and deduplicate
+                all_jobs = remoteok_jobs + linkedin_jobs
+                unique_jobs = []
+                seen_jobs = set()
+
+                for job in all_jobs:
+                    identifier = (job['title'], job['company'], job['url'])
+                    if identifier not in seen_jobs:
+                        unique_jobs.append(job)
+                        seen_jobs.add(identifier)
+
+                # Sort by date
+                unique_jobs.sort(key=lambda x: x.get('date_posted', ''), reverse=True)
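+                # (Editor's note, not in the original commit: date_posted
+                # values are compared as strings here, and jobs whose date is
+                # the 'N/A' placeholder sort above ISO-dated ones in this
+                # reverse sort.)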
+                st.session_state.jobs = unique_jobs
+                st.success(f"Found {len(unique_jobs)} unique jobs!")
+
+with col2:
+    if st.button("Reset", use_container_width=True):
+        st.session_state.keywords = []
         st.session_state.jobs = []
+        st.session_state.searched = False
+        st.rerun()
+
+
+# --- Display Results ---
+if st.session_state.searched:
+    st.header(f"💼 Job Listings ({len(st.session_state.jobs)} Found)")
+    if st.session_state.jobs:
+        for job in st.session_state.jobs:
+            display_job(job)
+    else:
+        st.info("No jobs found matching your criteria. Try different keywords or broaden your search.")