# JobFinder — Hugging Face Space app.py (author: riteshkokam, commit 85c7b12)
# app.py
import os
import gradio as gr
from PyPDF2 import PdfReader
from docx import Document
import yake
import requests
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from google import genai
from google.genai.types import GenerateContentConfig, ThinkingConfig
from datetime import datetime
import math
import json
# Initialize components
# YAKE extractor: 1–2-gram phrases, top 30 candidates per document.
kw_extractor = yake.KeywordExtractor(n=2, top=30)
# Sentence-embedding model for resume <-> job-description similarity.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Gemini client; GEMINI_API_KEY must be provided via the environment (Space secret).
genai_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
# Rows shown per page in the results table.
PER_PAGE = 10
# Instructions prepended to the Gemini request in refine_with_ai().
SYSTEM_PROMPT = """
You are a job-matching assistant. Given a resume and job listings, rank and explain why each job is a good fit.
Return your output as a ranked markdown list of jobs. For each job, include the following:
- Job Title
- Company Name
- Location (if available)
- Why this job is a good match (1–2 sentences)
Keep the tone professional and concise, suitable for display in a career guidance app.
"""
# 1️⃣ Extract text from resume
def extract_text(file):
    """Return the plain text of an uploaded resume.

    Supports PDF (via PyPDF2) and DOCX (via python-docx); any other
    extension yields an empty string.
    """
    suffix = file.name.rsplit('.', 1)[-1].lower()
    if suffix == "pdf":
        pages = PdfReader(file.name).pages
        return "\n".join(page.extract_text() or "" for page in pages)
    if suffix == "docx":
        doc = Document(file.name)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    return ""
# 2️⃣ Extract keywords using YAKE
def extract_keywords(text):
    """Pull candidate skill keywords from resume text via YAKE.

    The first line (often the candidate's name/header) is dropped, and
    any phrase containing an all-caps word longer than 2 characters
    (e.g. "SUMMARY", a shouted name) is filtered out.
    """
    _, newline, remainder = text.partition("\n")
    body = remainder if newline else text
    candidates = kw_extractor.extract_keywords(body)
    return [
        phrase
        for phrase, _score in candidates
        if not any(len(token) > 2 and token.isupper() for token in phrase.split())
    ]
def on_resume_upload(file):
    """Gradio upload callback: prefill the keyword textbox from the resume."""
    keywords = extract_keywords(extract_text(file))
    return ", ".join(keywords)
# 3️⃣ Fetch jobs from free public APIs
def fetch_arbeitnow(keywords):
    """Fetch jobs from the Arbeitnow public job board, keyword-filtered.

    Best-effort source: returns [] on any network/HTTP failure instead
    of crashing the whole pipeline. A timeout is set so a stalled
    endpoint cannot hang the app indefinitely.
    """
    try:
        resp = requests.get("https://www.arbeitnow.com/api/job-board-api", timeout=15)
    except requests.RequestException:
        return []
    if not resp.ok:
        return []
    jobs = resp.json().get("data", [])
    # Keep only jobs whose title or description mentions any keyword.
    return [
        j for j in jobs
        if any(kw.lower() in (j.get("title", "") + j.get("description", "")).lower()
               for kw in keywords)
    ]
def fetch_remotive(keywords):
    """Fetch remote jobs from Remotive matching the joined keyword query.

    Best-effort source: returns [] on any network/HTTP failure instead
    of crashing; a timeout prevents an indefinite hang.
    """
    try:
        resp = requests.get(
            "https://remotive.com/api/remote-jobs",
            params={"search": " ".join(keywords)},
            timeout=15,
        )
    except requests.RequestException:
        return []
    return resp.json().get("jobs", []) if resp.ok else []
def fetch_remoteok(keywords):
    """Fetch jobs from RemoteOK, keyword-filtered.

    The RemoteOK payload mixes non-job entries into the list, hence the
    isinstance(dict) filter. Best-effort source: returns [] on any
    network/HTTP failure; a timeout prevents an indefinite hang.
    """
    try:
        resp = requests.get("https://remoteok.com/api", timeout=15)
    except requests.RequestException:
        return []
    if not resp.ok:
        return []
    listings = [item for item in resp.json() if isinstance(item, dict)]
    return [
        j for j in listings
        if any(kw.lower() in (j.get("position", "") + j.get("description", "")).lower()
               for kw in keywords)
    ]
# 4️⃣ Rank jobs by semantic similarity
def rank_jobs(resume_text, jobs):
    """Order jobs by cosine similarity between resume and description.

    Returns a list of (job, similarity) pairs, best match first;
    an empty job list yields [].
    """
    if not jobs:
        return []
    resume_vec = embedder.encode([resume_text])
    job_vecs = embedder.encode([job.get("description", "") for job in jobs])
    scores = cosine_similarity(resume_vec, job_vecs)[0]
    pairs = list(zip(jobs, scores))
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
# 5️⃣ Gemini refinement (optional)
def refine_with_ai(ranked, resume_text):
    """Ask Gemini to re-rank the matched jobs and explain each fit.

    `ranked` is a list of (job_dict, score) pairs; only job metadata
    plus the first 500 characters of the resume are sent.
    """
    def _bullet(job):
        # Field names differ per source API, hence the fallbacks.
        title = job.get("title") or job.get("position") or "N/A"
        company = job.get("company") or job.get("company_name") or ""
        loc = job.get("location") or ""
        return f"- {title} at {company} ({loc})"

    listing = "\n".join(_bullet(job) for job, _ in ranked)
    prompt = (
        f"Resume:\n{resume_text[:500]}\n\n"
        "Here are the top matched jobs:\n"
        + listing
        + "\n\nPlease rank these top to bottom and explain why each is a good match."
    )
    resp = genai_client.models.generate_content(
        model="gemini-2.5-flash",
        contents=SYSTEM_PROMPT + prompt,
    )
    return resp.text or "<No explanation>"
def format_posted(job):
    """Return the job's posting date as a 'YYYY-MM-DD' string (best effort).

    The source APIs disagree on the field name; RemoteOK supplies a Unix
    timestamp (int), the others ISO-ish date strings.
    """
    raw_date = (
        job.get("publication_date")
        or job.get("created_at")
        or job.get("date")
        or ""
    )
    if isinstance(raw_date, int):
        # Unix timestamp → local-time date string.
        return datetime.fromtimestamp(raw_date).strftime("%Y-%m-%d")
    # Truncate ISO strings like '2024-01-05T10:00:00' to the date part.
    return str(raw_date)[:10]
# 6️⃣ Main pipeline
def find_jobs(file, added_kw, use_ai):
    """Run the full pipeline: resume → keywords → fetch → rank → table.

    Args:
        file: uploaded resume file (PDF/DOCX) from Gradio.
        added_kw: comma-separated user keywords; when non-blank these
            REPLACE the auto-extracted ones.
        use_ai: when True, also ask Gemini for a ranked explanation.

    Returns:
        (table, explanation): `table` is a list of flat, all-string row
        dicts; `explanation` is markdown ("" when use_ai is False).
    """
    resume = extract_text(file) or ""
    base_kws = added_kw.split(",") if added_kw.strip() else extract_keywords(resume)
    keywords = [kw.strip() for kw in base_kws if kw.strip()]
    jobs = fetch_arbeitnow(keywords) + fetch_remotive(keywords) + fetch_remoteok(keywords)
    ranked = rank_jobs(resume, jobs)
    table = []
    for job, score in ranked:
        role = job.get("title") or job.get("position", "")
        company = job.get("company") or job.get("company_name", "")
        # `or "N/A"` also covers an explicit None value, which
        # .get("location", "N/A") would otherwise let through as "None".
        location = job.get("location") or "N/A"
        posted = format_posted(job)
        apply_url = job.get("url") or job.get("apply_url", "") or job.get("joblink", "") or ""
        # Coerce everything to str so the DataFrame never sees dicts/lists.
        table.append({
            "Role": str(role),
            "Company": str(company),
            "Location": str(location),
            "Posted": str(posted),
            "Score": f"{score*100:.1f}%",
            "Apply": str(apply_url),
        })
    explanation = refine_with_ai(ranked, resume) if use_ai else ""
    return table, explanation
# 7️⃣ Jobs in DataFrame format
def jobs_to_dataframe(table, page, per_page=None):
    """Slice `table` to one page and format rows for the Gradio DataFrame.

    Args:
        table: list of row dicts produced by find_jobs().
        page: 1-based page number; clamped into the valid range.
        per_page: rows per page; defaults to the module-level PER_PAGE.

    Returns:
        (rows, info): `rows` as a list of lists matching the DataFrame
        headers (Apply rendered as an HTML link), plus a status string.
    """
    if per_page is None:
        # Resolved at call time so later changes to PER_PAGE take effect.
        per_page = PER_PAGE
    total = len(table)
    pages = max(1, math.ceil(total / per_page))
    page = max(1, min(page, pages))
    start, end = (page - 1) * per_page, page * per_page
    df_data = []
    for row in table[start:end]:
        if row['Apply'] and row['Apply'] != 'N/A':
            # Render the URL as a styled clickable link (column datatype "html").
            apply_link = f'<a href="{row["Apply"]}" target="_blank" style="color: #2563eb; text-decoration: underline;">Apply</a>'
        else:
            apply_link = "N/A"
        df_data.append([
            row['Role'],
            row['Company'],
            row['Location'],
            row['Posted'],
            row['Score'],
            apply_link,
        ])
    # BUG FIX: the original f-string ran the two numbers together
    # ("Showing jobs 110 of 10"); restore the range separator.
    return df_data, f"Showing jobs {start+1}–{min(end, total)} of {total} (Page {page}/{pages})"
def load_jobs_and_pages(resume, added_kw, use_ai):
    """Click handler for 'Find Jobs': load results and reset pagination.

    Returns, in output-component order: the full table (kept in
    gr.State), the AI explanation, the explanation header, a slider
    update (back to page 1 with the new maximum), the first page of
    rows, and the page-info text.
    """
    full_table, explanation = find_jobs(resume, added_kw, use_ai)
    total_pages = max(1, math.ceil(len(full_table) / PER_PAGE))
    slider_update = gr.update(value=1, maximum=total_pages)
    first_page_data, page_info = jobs_to_dataframe(full_table, 1)
    # Header is shown only when an AI explanation actually exists.
    expl_header = "### AI Explanation" if explanation else ""
    return full_table, explanation, expl_header, slider_update, first_page_data, page_info
# 8️⃣ Gradio UI — component creation order and event wiring are load-bearing.
with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
    gr.Markdown("## 🌍 Global Job Finder")
    with gr.Row():
        resume = gr.File(label="Upload Resume (PDF/DOCX)")
        added = gr.Textbox(label="Add keywords (comma-separated)", placeholder="e.g. Python, ML")
    # Auto-fill the keyword textbox as soon as a resume is uploaded.
    resume.upload(on_resume_upload, inputs=[resume], outputs=[added])
    use_ai = gr.Checkbox(label="Use AI to refine explanation", value=False)
    find_btn = gr.Button("Find Jobs")
    jobs_state = gr.State([])  # holds full table across page changes
    page_sel = gr.Slider(1, 1, step=1, value=1, label="Page")
    # Page info display ("Showing jobs ... of ...").
    page_info = gr.Markdown("")
    # DataFrame for jobs display; "Apply" column is rendered as HTML links.
    jobs_df = gr.DataFrame(
        headers=["Role", "Company", "Location", "Posted", "Score", "Apply"],
        datatype=["str", "str", "str", "str", "str", "html"],
        interactive=False,
        wrap=True,
        column_widths=["20%", "20%", "20%", "15%", "15%", "10%"],
        value=[]
    )
    expl_md_h = gr.Markdown()  # "### AI Explanation" header (empty when AI off)
    expl_md = gr.Markdown()    # shows the AI explanation text
    # 1) On Find Jobs: load the jobs, explanation, slider max, and
    #    first-page data in a single callback.
    find_btn.click(
        fn=load_jobs_and_pages,
        inputs=[resume, added, use_ai],
        outputs=[jobs_state, expl_md, expl_md_h, page_sel, jobs_df, page_info]
    )
    # 2) On page change, re-render only the table data from the cached state.
    def update_page_display(tbl, pg):
        # Re-slice the cached table; no new network calls.
        df_data, info = jobs_to_dataframe(tbl, pg)
        return df_data, info
    page_sel.change(
        fn=update_page_display,
        inputs=[jobs_state, page_sel],
        outputs=[jobs_df, page_info]
    )

if __name__ == "__main__":
    demo.launch()