Alpha108 committed on
Commit
c3fdb9a
·
verified ·
1 Parent(s): d170281

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -160
app.py CHANGED
@@ -2,45 +2,40 @@ import streamlit as st
2
  import os
3
  import requests
4
  import re
5
- from datetime import datetime
6
  import fitz # PyMuPDF
7
  from docx import Document
8
- from collections import Counter
9
  import json
10
 
11
  # --- Configuration ---
12
  st.set_page_config(
13
- page_title="AI Job Finder",
14
  page_icon="πŸ€–",
15
  layout="wide",
16
  initial_sidebar_state="expanded",
17
  )
18
 
19
  # --- Hugging Face Secrets & API Keys ---
20
- # Try to get the API key from Streamlit secrets (for deployed apps)
21
  try:
22
  SCRAPINGDOG_API_KEY = st.secrets["SCRAPINGDOG_API_KEY"]
23
  except (KeyError, AttributeError):
24
- # Fallback to environment variable (for local development)
25
  SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
26
 
27
- # --- Helper Functions & Classes ---
28
 
29
  def parse_cv(uploaded_file):
30
- """Parses the uploaded CV file and returns its text content."""
31
  try:
32
  file_type = uploaded_file.type
33
  if "pdf" in file_type:
34
  with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
35
- text = "".join(page.get_text() for page in doc)
36
- return text
37
- elif "vnd.openxmlformats-officedocument.wordprocessingml.document" in file_type: # DOCX
38
  doc = Document(uploaded_file)
39
- text = "\n".join([para.text for para in doc.paragraphs])
40
- return text
41
  elif "text/plain" in file_type:
42
- text = uploaded_file.getvalue().decode("utf-8")
43
- return text
44
  else:
45
  st.error(f"Unsupported file type: {file_type}")
46
  return None
@@ -48,88 +43,59 @@ def parse_cv(uploaded_file):
48
  st.error(f"Error parsing CV: {e}")
49
  return None
50
 
51
- def extract_keywords(text, top_n=25):
52
- """Extracts the most common words from text to be used as keywords."""
53
  if not text:
54
  return []
55
- # Basic regex to find words, removing simple punctuation
56
- words = re.findall(r'\b[a-zA-Z-]{3,}\b', text.lower())
 
 
 
 
 
 
 
 
 
 
57
 
58
- # A simple list of common English stop words.
59
- # For a more robust solution, a library like NLTK would be better.
60
- stop_words = set([
61
- 'and', 'the', 'is', 'in', 'it', 'of', 'for', 'on', 'with', 'as', 'at', 'by',
62
- 'to', 'a', 'an', 'that', 'this', 'i', 'you', 'he', 'she', 'we', 'they', 'was',
63
- 'were', 'be', 'been', 'are', 'has', 'have', 'had', 'do', 'does', 'did', 'but',
64
- 'if', 'or', 'so', 'not', 'from', 'about', 'more', 'my', 'your', 'our', 'their',
65
- 'experience', 'work', 'skills', 'responsibilities', 'project', 'projects'
66
- ])
67
 
68
- filtered_words = [word for word in words if word not in stop_words]
69
- word_counts = Counter(filtered_words)
70
- return [word for word, _ in word_counts.most_common(top_n)]
 
 
 
 
71
 
72
  def safe_get(data, key, default='N/A'):
73
- """Safely get a value from a dictionary."""
74
  return data.get(key, default) if data else default
75
 
76
  class JobDataNormalizer:
77
- """Normalizes job data from different sources into a common schema."""
78
- @staticmethod
79
- def normalize_remoteok(job):
80
- return {
81
- "id": safe_get(job, 'id'),
82
- "title": safe_get(job, 'position'),
83
- "company": safe_get(job, 'company'),
84
- "location": safe_get(job, 'location', "Remote"),
85
- "description": safe_get(job, 'description'),
86
- "url": safe_get(job, 'url'),
87
- "date_posted": safe_get(job, 'date'),
88
- "source": "RemoteOK"
89
- }
90
-
91
  @staticmethod
92
  def normalize_linkedin(job):
93
- return {
94
- "id": hash(safe_get(job, 'link')), # Create a simple ID
95
  "title": safe_get(job, 'title'),
96
  "company": safe_get(job, 'company'),
97
  "location": safe_get(job, 'location'),
98
  "description": safe_get(job, 'description'),
99
- "url": safe_get(job, 'link'),
100
  "date_posted": safe_get(job, 'date'),
 
101
  "source": "LinkedIn"
102
  }
103
 
104
- # --- API Agent Functions ---
105
-
106
- def search_remoteok(keywords):
107
- """Searches for jobs on RemoteOK based on keywords."""
108
- all_jobs = []
109
- try:
110
- response = requests.get("https://remoteok.com/api")
111
- response.raise_for_status()
112
- jobs_data = response.json()
113
-
114
- # The first item is a legal notice, so we skip it
115
- for job in jobs_data[1:]:
116
- job_text = f"{job.get('position', '')} {job.get('company', '')} {' '.join(job.get('tags', []))}".lower()
117
- if any(keyword.lower() in job_text for keyword in keywords):
118
- all_jobs.append(JobDataNormalizer.normalize_remoteok(job))
119
- except requests.exceptions.RequestException as e:
120
- st.error(f"Error fetching from RemoteOK: {e}")
121
- except json.JSONDecodeError:
122
- st.error("Failed to parse RemoteOK response.")
123
- return all_jobs
124
-
125
- def search_linkedin(keywords, location):
126
- """Searches for jobs on LinkedIn via ScrapingDog API."""
127
  if not SCRAPINGDOG_API_KEY:
128
- st.warning("ScrapingDog API key not found. Cannot search LinkedIn.")
129
- st.info("Please add your API key to your Hugging Face secrets with the name `SCRAPINGDOG_API_KEY`.")
130
  return []
131
 
132
- all_jobs = []
133
  query = " ".join(keywords)
134
  api_url = f"https://api.scrapingdog.com/linkedinjobs/?api_key={SCRAPINGDOG_API_KEY}&q={query}&geoid={location}"
135
 
@@ -138,45 +104,39 @@ def search_linkedin(keywords, location):
138
  response.raise_for_status()
139
  jobs_data = response.json()
140
  if isinstance(jobs_data, list):
141
- for job in jobs_data:
142
- all_jobs.append(JobDataNormalizer.normalize_linkedin(job))
143
  except requests.exceptions.HTTPError as e:
144
- st.error(f"ScrapingDog API Error: {e}. Check your API key and usage limits.")
145
  except requests.exceptions.RequestException as e:
146
- st.error(f"Network error while contacting ScrapingDog: {e}")
147
  except json.JSONDecodeError:
148
- st.error("Failed to parse LinkedIn job data. The API might have returned an invalid response.")
149
-
150
- return all_jobs
151
 
152
  # --- UI Rendering ---
153
 
154
  def display_job(job):
155
  """Renders a single job listing in a card format."""
156
- source_colors = {"RemoteOK": "#ff4b4b", "LinkedIn": "#0077b5"}
157
- color = source_colors.get(job['source'], "#f0f2f6")
158
-
159
  st.markdown(f"""
160
  <div style="border: 1px solid #e1e4e8; border-radius: 8px; padding: 16px; margin-bottom: 16px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
161
- <h3 style="margin-bottom: 8px;"><a href="{job['url']}" target="_blank" style="text-decoration: none; color: #0366d6;">{job['title']}</a></h3>
162
  <p style="margin: 0;"><strong>🏒 Company:</strong> {job['company']}</p>
163
  <p style="margin: 0;"><strong>πŸ“ Location:</strong> {job['location']}</p>
164
  <p style="margin: 0; color: #586069;"><strong>πŸ—“οΈ Posted:</strong> {job['date_posted']}</p>
165
- <div style="margin-top: 12px; display: flex; align-items: center;">
166
- <span style="background-color: {color}; color: white; padding: 4px 8px; border-radius: 12px; font-size: 12px; font-weight: bold;">{job['source']}</span>
167
  </div>
168
  </div>
169
  """, unsafe_allow_html=True)
170
  with st.expander("Show Job Description Snippet"):
171
- # Strip HTML tags for cleaner display
172
  clean_description = re.sub('<[^<]+?>', '', job['description'])
173
  st.write(clean_description[:500] + "...")
174
 
175
- # --- Main Application Logic ---
176
 
177
  # Initialize session state
178
- if 'keywords' not in st.session_state:
179
- st.session_state.keywords = []
180
  if 'jobs' not in st.session_state:
181
  st.session_state.jobs = []
182
  if 'searched' not in st.session_state:
@@ -185,99 +145,66 @@ if 'searched' not in st.session_state:
185
  # --- Sidebar ---
186
  with st.sidebar:
187
  st.image("https://images.emojiterra.com/twitter/v14.0/512px/1f916.png", width=80)
188
- st.title("AI Job Finder")
189
  st.markdown("""
190
- Welcome! This app helps you find relevant job postings by analyzing your CV.
191
 
192
- **How it works:**
193
- 1. **Upload your CV** (PDF, DOCX, TXT).
194
- 2. The app **extracts key skills**.
195
- 3. **Select/add skills** to search for.
196
- 4. **Search** across multiple job platforms.
197
- """)
198
 
199
- st.header("API Key Setup")
200
- st.markdown("""
201
- To search on LinkedIn, you need a **ScrapingDog API key**.
202
- - Get a free key at [scrapingdog.com](https://www.scrapingdog.com/).
203
- - In your Hugging Face Space, go to **Settings > Secrets** and add a secret named `SCRAPINGDOG_API_KEY` with your key as the value.
204
  """)
205
 
206
- # --- Main Content ---
207
  st.header("1. Upload Your CV")
208
  uploaded_file = st.file_uploader(
209
- "Upload your CV to automatically extract keywords.",
210
- type=["pdf", "docx", "txt"],
211
- accept_multiple_files=False
212
  )
213
 
214
  if uploaded_file:
215
- with st.spinner("Analyzing your CV... 🧠"):
216
  cv_text = parse_cv(uploaded_file)
217
  if cv_text:
218
- st.session_state.keywords = extract_keywords(cv_text)
219
- st.success("CV analyzed successfully! Keywords have been extracted below.")
220
 
221
- st.header("2. Select and Refine Your Keywords")
222
- manual_keywords_input = st.text_input(
223
- "Add more keywords (comma-separated)",
224
- placeholder="e.g., python, data science, machine learning"
225
  )
226
 
227
- # Combine CV keywords with manually added ones
228
- manual_keywords = [k.strip() for k in manual_keywords_input.split(',') if k.strip()]
229
- combined_keywords = sorted(list(set(st.session_state.keywords + manual_keywords)))
230
 
231
- selected_keywords = st.multiselect(
232
- "Choose the keywords you want to search for:",
233
- options=combined_keywords,
234
- default=st.session_state.keywords
235
  )
236
 
237
- st.header("3. Search for Jobs")
238
- location = st.text_input("Enter Location (e.g., 'United States' or leave empty for remote)", "Remote")
239
-
240
- col1, col2 = st.columns(2)
241
- with col1:
242
- if st.button("πŸš€ Search Jobs", type="primary", use_container_width=True):
243
- if not selected_keywords:
244
- st.warning("Please select at least one keyword to search.")
245
- else:
246
- st.session_state.jobs = [] # Clear previous results
247
- st.session_state.searched = True
248
- with st.spinner("Searching across job platforms... This may take a moment."):
249
- remoteok_jobs = search_remoteok(selected_keywords)
250
- linkedin_jobs = search_linkedin(selected_keywords, location)
251
-
252
- # Combine and deduplicate
253
- all_jobs = remoteok_jobs + linkedin_jobs
254
- unique_jobs = []
255
- seen_jobs = set()
256
-
257
- for job in all_jobs:
258
- identifier = (job['title'], job['company'], job['url'])
259
- if identifier not in seen_jobs:
260
- unique_jobs.append(job)
261
- seen_jobs.add(identifier)
262
-
263
- # Sort by date
264
- unique_jobs.sort(key=lambda x: x.get('date_posted', ''), reverse=True)
265
- st.session_state.jobs = unique_jobs
266
- st.success(f"Found {len(unique_jobs)} unique jobs!")
267
 
268
- with col2:
269
- if st.button("Reset", use_container_width=True):
270
- st.session_state.keywords = []
 
271
  st.session_state.jobs = []
272
- st.session_state.searched = False
273
- st.rerun()
274
-
 
275
 
276
  # --- Display Results ---
277
  if st.session_state.searched:
278
- st.header(f"πŸ’Ό Job Listings ({len(st.session_state.jobs)} Found)")
279
  if st.session_state.jobs:
280
  for job in st.session_state.jobs:
281
  display_job(job)
282
  else:
283
- st.info("No jobs found matching your criteria. Try different keywords or broaden your search.")
 
 
2
  import os
3
  import requests
4
  import re
 
5
  import fitz # PyMuPDF
6
  from docx import Document
 
7
  import json
8
 
9
# --- Configuration ---
st.set_page_config(
    page_title="LinkedIn Job Finder",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded",
)

# --- Hugging Face Secrets & API Keys ---
# Prefer Streamlit secrets (deployed app on Hugging Face); fall back to an
# environment variable for local development. `AttributeError` is caught too
# because `st.secrets` may not exist at all outside a Streamlit runtime.
try:
    SCRAPINGDOG_API_KEY = st.secrets["SCRAPINGDOG_API_KEY"]
except (KeyError, AttributeError):
    SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
24
 
25
+ # --- Core Functions ---
26
 
27
def parse_cv(uploaded_file):
    """Parses text from uploaded PDF, DOCX, or TXT files.

    Args:
        uploaded_file: A Streamlit UploadedFile; its MIME type selects
            the parser used.

    Returns:
        The extracted text as a string, or None when the file type is
        unsupported or parsing fails (an error is shown in the UI).
    """
    try:
        file_type = uploaded_file.type
        if "pdf" in file_type:
            # PyMuPDF reads the raw bytes; concatenate text of every page.
            with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
                return "".join(page.get_text() for page in doc)
        elif "vnd.openxmlformats-officedocument.wordprocessingml.document" in file_type:
            # DOCX: join paragraph texts with newlines.
            word_doc = Document(uploaded_file)
            return "\n".join(para.text for para in word_doc.paragraphs)
        elif "text/plain" in file_type:
            return uploaded_file.getvalue().decode("utf-8")
        else:
            st.error(f"Unsupported file type: {file_type}")
            return None
    except Exception as e:
        # Surface parsing problems to the user instead of crashing the app.
        st.error(f"Error parsing CV: {e}")
        return None
45
 
46
def extract_technical_skills(text):
    """Extracts technical skills from text using a predefined list and regex.

    Each known skill is matched case-insensitively as a whole token.

    Args:
        text: Raw CV text (may be empty or None).

    Returns:
        Sorted list of matched skill names in their canonical casing;
        empty list for empty input.
    """
    if not text:
        return []

    # Comprehensive list of technical skills (can be expanded)
    skills_list = [
        'Python', 'Java', 'C++', 'C#', 'JavaScript', 'TypeScript', 'Go', 'Rust', 'Ruby', 'PHP', 'Swift', 'Kotlin',
        'SQL', 'NoSQL', 'PostgreSQL', 'MySQL', 'MongoDB', 'Redis', 'Cassandra', 'GraphQL',
        'React', 'Angular', 'Vue.js', 'Node.js', 'Django', 'Flask', 'Spring Boot', 'Ruby on Rails',
        'TensorFlow', 'PyTorch', 'scikit-learn', 'Keras', 'Pandas', 'NumPy', 'Matplotlib',
        'AWS', 'Azure', 'Google Cloud', 'GCP', 'Docker', 'Kubernetes', 'Terraform', 'Ansible',
        'CI/CD', 'Jenkins', 'Git', 'GitHub', 'GitLab', 'Linux', 'Bash', 'PowerShell',
        'Agile', 'Scrum', 'JIRA', 'Data Science', 'Machine Learning', 'Deep Learning', 'NLP',
        'Big Data', 'Hadoop', 'Spark', 'Cybersecurity', 'API', 'REST', 'Microservices'
    ]

    found_skills = set()
    text_lower = text.lower()

    # Use explicit word-character lookarounds instead of r'\b': a plain \b
    # never matches next to a non-word character, so skills ending in '+'
    # or '#' (e.g. 'C++', 'C#') would otherwise never be found.
    for skill in skills_list:
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
        if re.search(pattern, text_lower):
            found_skills.add(skill)

    return sorted(found_skills)
73
 
74
def safe_get(data, key, default='N/A'):
    """Safely gets a value from a dictionary."""
    # Treat None (and any falsy mapping) as "no data available".
    if not data:
        return default
    return data.get(key, default)
77
 
78
class JobDataNormalizer:
    """Normalizes LinkedIn job data into a common schema."""

    @staticmethod
    def normalize_linkedin(job):
        """Maps a raw ScrapingDog LinkedIn record onto the app's job schema."""
        # hash() of the link gives a cheap per-process identifier.
        normalized = {"id": hash(safe_get(job, 'link'))}
        # (output key, source key) pairs; missing values fall back to 'N/A'.
        field_map = (
            ("title", "title"),
            ("company", "company"),
            ("location", "location"),
            ("description", "description"),
            ("date_posted", "date"),
            ("job_url", "link"),
        )
        for out_key, src_key in field_map:
            normalized[out_key] = safe_get(job, src_key)
        normalized["source"] = "LinkedIn"
        return normalized
92
 
93
def search_linkedin_jobs(keywords, location):
    """Searches for jobs on LinkedIn via the ScrapingDog API.

    Args:
        keywords: Iterable of skill strings; joined into one search query.
        location: Value passed as the API's `geoid` parameter.
            NOTE(review): ScrapingDog's docs describe `geoid` as a numeric
            LinkedIn geo id; free-text locations may not filter as
            intended — confirm against the API documentation.

    Returns:
        List of normalized job dicts, or [] on any error (errors are
        shown in the UI).
    """
    if not SCRAPINGDOG_API_KEY:
        st.error("Please set SCRAPINGDOG_API_KEY in Hugging Face secrets.")
        return []

    # Let requests build the query string so characters such as '#', '+',
    # '&' and spaces in skills (e.g. "C#", "C++") are percent-encoded
    # instead of corrupting the URL ('#' would start a fragment).
    params = {
        "api_key": SCRAPINGDOG_API_KEY,
        "q": " ".join(keywords),
        "geoid": location,
    }
    try:
        response = requests.get(
            "https://api.scrapingdog.com/linkedinjobs/",
            params=params,
            timeout=30,  # don't hang the Streamlit app on a stalled request
        )
        response.raise_for_status()
        jobs_data = response.json()
        if isinstance(jobs_data, list):
            return [JobDataNormalizer.normalize_linkedin(job) for job in jobs_data]
    except requests.exceptions.HTTPError as e:
        st.error(f"API Error: {e}. Check your ScrapingDog API key and usage limits.")
    except requests.exceptions.RequestException as e:
        st.error(f"Network error: {e}")
    except json.JSONDecodeError:
        st.error("Failed to parse API response. The service might be temporarily down.")
    return []
 
115
 
116
  # --- UI Rendering ---
117
 
118
def display_job(job):
    """Renders a single job listing in a card format.

    Args:
        job: Normalized job dict (keys: job_url, title, company, location,
            date_posted, source, description).
    """
    st.markdown(f"""
    <div style="border: 1px solid #e1e4e8; border-radius: 8px; padding: 16px; margin-bottom: 16px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
        <h3 style="margin-bottom: 8px;"><a href="{job['job_url']}" target="_blank" style="text-decoration: none; color: #0077b5;">{job['title']}</a></h3>
        <p style="margin: 0;"><strong>🏢 Company:</strong> {job['company']}</p>
        <p style="margin: 0;"><strong>📍 Location:</strong> {job['location']}</p>
        <p style="margin: 0; color: #586069;"><strong>🗓️ Posted:</strong> {job['date_posted']}</p>
        <div style="margin-top: 12px;">
            <span style="background-color: #0077b5; color: white; padding: 4px 8px; border-radius: 12px; font-size: 12px; font-weight: bold;">{job['source']}</span>
        </div>
    </div>
    """, unsafe_allow_html=True)
    with st.expander("Show Job Description Snippet"):
        # Strip HTML tags for a cleaner plain-text preview.
        clean_description = re.sub('<[^<]+?>', '', job['description'])
        snippet = clean_description[:500]
        # Only append an ellipsis when the text was actually truncated.
        st.write(snippet + "..." if len(clean_description) > 500 else snippet)
134
 
135
# --- Main Application ---

# Initialize session state with defaults on first run.
for _state_key, _state_default in (("skills", []), ("jobs", []), ("searched", False)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

# --- Sidebar ---
with st.sidebar:
    st.image("https://images.emojiterra.com/twitter/v14.0/512px/1f916.png", width=80)
    st.title("LinkedIn Job Finder")
    st.markdown("""
    Find your next role on LinkedIn by leveraging the power of AI.

    **How to use:**
    1. **Upload your CV** to automatically identify your technical skills.
    2. **Refine the skills list** by adding or removing keywords.
    3. **Enter a location** and hit search!

    **API Key Required:**
    This app uses the ScrapingDog API. You'll need to get a free API key and set it up in your Hugging Face Space secrets as `SCRAPINGDOG_API_KEY`.
    """)

# --- Main Content Panel ---
st.header("1. Upload Your CV")
uploaded_file = st.file_uploader(
    "Upload to extract technical skills (PDF, DOCX, TXT). Personal details are ignored.",
    type=["pdf", "docx", "txt"]
)

if uploaded_file:
    with st.spinner("Analyzing CV for technical skills... 🧠"):
        cv_text = parse_cv(uploaded_file)
        if cv_text:
            st.session_state.skills = extract_technical_skills(cv_text)
            st.success("Successfully extracted skills from your CV!")

st.header("2. Refine Skills and Search")
manual_keywords = st.text_input(
    "Add more skills or keywords (comma-separated)",
    placeholder="e.g., Go, Cybersecurity, REST"
)

# Merge CV-derived skills with manually typed ones, deduplicated and sorted.
added_skills = [k.strip() for k in manual_keywords.split(',') if k.strip()]
combined_skills = sorted(set(st.session_state.skills + added_skills))

selected_skills = st.multiselect(
    "Select the skills to search for:",
    options=combined_skills,
    default=st.session_state.skills
)

location = st.text_input("Enter Location", "Remote")

if st.button("🚀 Search Jobs on LinkedIn", type="primary", use_container_width=True):
    if not selected_skills:
        st.warning("Please select at least one skill to search.")
    else:
        st.session_state.jobs = []  # clear previous results
        st.session_state.searched = True
        with st.spinner("Searching LinkedIn... This may take a moment."):
            jobs = search_linkedin_jobs(selected_skills, location)
            # Newest postings first.
            st.session_state.jobs = sorted(jobs, key=lambda x: x.get('date_posted', ''), reverse=True)

# --- Display Results ---
if st.session_state.searched:
    st.header(f"💼 Job Results ({len(st.session_state.jobs)} Found)")
    if st.session_state.jobs:
        for job in st.session_state.jobs:
            display_job(job)
    else:
        st.info("No jobs found for the selected keywords. Try refining your search.")