Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import PyPDF2 | |
| import os | |
| from google.oauth2 import service_account | |
| import gspread | |
| from pydantic import BaseModel, Field | |
| from typing import List | |
| from langchain_openai import ChatOpenAI | |
| from langchain_core.prompts import ChatPromptTemplate | |
| import time | |
| import re | |
# ──────────────────────────────────────────────────────────────────────────────
# 1) ENVIRONMENT VARIABLES / SECRETS
#
# On Huggingface Spaces:
#   - Go to your Space’s Settings → Secrets and add:
#       • OPENAI_API_KEY = your-openai-key
#       • GOOGLE_API_KEY = your-google-key   (if you use any Google LLM)
#   - If you also need a Google Service Account JSON, either:
#       a) Commit it (careful: that is public by default — only do so if it’s non-sensitive!),
#       b) Or add it as “Repository Files” via the “Files & versions” tab,
#       c) Or load it from a Secret.
#
# In code below, we’ll assume the service-account JSON is committed under:
#     └─ synapse-recruitment-34e7b48899b4.json
#
# If you instead want to load it from a single-line environment variable, you can do:
#     service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON"))
#     creds = service_account.Credentials.from_service_account_info(service_account_info, scopes=SCOPES)
#
# For now, we’ll simply use:
#     SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json"
#
# And expect that file to be present in the top-level of your repo/Space.
#
# ──────────────────────────────────────────────────────────────────────────────
# Read API keys from the environment (populated via HF Spaces Secrets).
# ChatOpenAI reads OPENAI_API_KEY from the environment itself; we only read it
# here so we can surface an early warning in the UI when it is missing.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")  # only needed for Google LLMs
if not OPENAI_API_KEY:  # idiomatic truthiness check instead of == ""
    st.warning("β οΈ OPENAI_API_KEY is not set. The LLM calls will fail unless you add it under Secrets.")
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 2) Pydantic models for structured output | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
class structure(BaseModel):
    """Structured candidate profile extracted from a resume by the LLM.

    Used as the `with_structured_output` schema in structure_resume_data;
    the per-field descriptions double as extraction instructions to the model.
    """
    name: str = Field(description="Name of the candidate")
    location: str = Field(description="The location of the candidate.")
    skills: List[str] = Field(description="List of individual skills of the candidate")
    ideal_jobs: str = Field(description="List of ideal jobs for the candidate based on past experience.")
    yoe: str = Field(description="Years of experience of the candidate.")
    experience: str = Field(description="A brief summary of the candidate's past experience.")
class Job(BaseModel):
    """One evaluated job posting plus the LLM's relevance judgment.

    Serves as the structured-output schema for the job-evaluation chain in
    eval_jobs; relevance_score is used (after skill overlap) to rank results.
    """
    job_title: str = Field(description="The title of the job.")
    company: str = Field(description="The company offering the job.")
    location: str = Field(description="The location of the job.")
    skills: List[str] = Field(description="List of skills required for the job.")
    description: str = Field(description="A brief description of the job.")
    relevance_score: float = Field(description="Relevance score of the job to the candidate's resume.")
    justification: str = Field(description = "Reason for giving this relevance score and what all areas need to be improved by the candidate")
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 3) Helper: parse a commaβseparated βTech Stackβ string into a Python set | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def parse_tech_stack(stack):
    """Normalize a "Tech Stack" cell into a lowercase set of skill strings.

    Accepts an already-parsed set (returned unchanged), a Python-set-looking
    string like "{'python','django'}", or a plain comma-separated string.
    None / NaN / empty values yield an empty set.
    """
    # Handle an already-parsed set BEFORE calling pd.isna: isna() on list-like
    # input returns an element-wise array (or raises), which would break the
    # boolean `or` chain below.
    if isinstance(stack, set):
        return stack
    if stack is None or stack == "" or pd.isna(stack):
        return set()
    try:
        # A literal Python-set string such as "{'python','django'}"
        if isinstance(stack, str) and stack.startswith("{") and stack.endswith("}"):
            items = stack.strip("{}").split(",")
            return {item.strip().strip("'\"").lower() for item in items if item.strip()}
        # Otherwise assume comma-separated values
        return {s.strip().lower() for s in str(stack).split(",") if s.strip()}
    except Exception as e:
        st.error(f"Error parsing tech stack: {e}")
        return set()
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 4) Google Sheets initialization (Service Account JSON must be present in repo) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def initialize_google_sheets():
    """Build an authorized gspread client from the committed service-account file.

    Returns the gspread client, or None (showing a Streamlit error) when the
    JSON file is missing or the credentials cannot be loaded.
    """
    SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json"
    SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
    # Guard clause: nothing to authorize without the key file on disk.
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        st.error(f"Service account file not found at '{SERVICE_ACCOUNT_FILE}'.\n"
                 "Either commit it into the repo or load from a Secret.")
        return None
    try:
        credentials = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE, scopes=SCOPES
        )
        client = gspread.authorize(credentials)
    except Exception as e:
        st.error(f"Failed to load Google Service Account credentials: {e}")
        return None
    return client
def load_jobs_data():
    """Fetch the jobs worksheet into a DataFrame with a parsed skill-set column.

    Returns None when the Sheets client cannot be built, the sheet has no data
    rows, or any Sheets API call fails (an error/warning is shown in the UI).
    """
    client = initialize_google_sheets()
    if client is None:
        return None
    try:
        # NOTE: Replace this key with your actual spreadsheet key
        SPREADSHEET_KEY = "1BZlvbtFyiQ9Pgr_lpepDJua1ZeVEqrCLjssNd6OiG9k"
        sheet = client.open_by_key(SPREADSHEET_KEY).worksheet("paraform_jobs_formatted")
        rows = sheet.get_all_values()
        # Need at least a header row plus one data row.
        if not rows or len(rows) < 2:
            st.warning("No data found in the Jobs sheet.")
            return None
        header, *records = rows
        df = pd.DataFrame(records, columns=header).fillna("")
        # Pre-compute a lowercase skill set per job for the skill-overlap pre-filter.
        df["parsed_stack"] = df["Tech Stack"].apply(parse_tech_stack)
        return df
    except Exception as e:
        st.error(f"Error loading jobs data from Google Sheets: {e}")
        return None
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 5) PDF β plain text | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def extract_text_from_pdf(pdf_file):
    """Extract plain text from an uploaded PDF file-like object.

    Returns the newline-joined text of all pages that yield any text
    (pages with no extractable text, e.g. scanned images, are skipped),
    or "" when the PDF cannot be read (an error is shown via Streamlit).
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        # Collect per-page text and join once — avoids quadratic string +=
        # growth across many pages.
        page_texts = [page.extract_text() for page in reader.pages]
        return "".join(text + "\n" for text in page_texts if text)
    except Exception as e:
        st.error(f"Failed to read PDF: {e}")
        return ""
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 6) Call GPTβ4oβmini to extract structured fields from resume text | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def structure_resume_data(resume_text: str) -> structure:
    """Extract a structured candidate profile from raw resume text.

    Runs GPT-4o-mini with `structure` as the structured-output schema.
    On any failure an error is shown and a placeholder profile (all fields
    "Unknown", empty skills) is returned so callers always get a `structure`.
    """
    extractor = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.0,
        max_retries=2,
    ).with_structured_output(structure)
    template = ChatPromptTemplate.from_messages([
        ("system", "You are a helper that extracts structured data from a resume."),
        ("human", "Extract the following fields from this resume:\n{resume_text}\n"
                  "If any field is missing, return βUnknownβ.")
    ])
    try:
        return (template | extractor).invoke({"resume_text": resume_text})
    except Exception as e:
        st.error(f"Failed to extract structure from resume: {e}")
        # Fallback placeholder so downstream code never sees None.
        return structure(
            name="Unknown",
            location="Unknown",
            skills=[],
            ideal_jobs="Unknown",
            yoe="Unknown",
            experience="Unknown",
        )
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 7) Evaluate jobs: Preβfilter by requiring at least two overlapping skills, | |
| # then run an LLM loop (with a βStopβ check on each iteration) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def eval_jobs(jobs_df: pd.DataFrame, resume_text: str) -> pd.DataFrame:
    """Score jobs against a resume and return up to the top-10 matches.

    Steps:
      1) Extract structured candidate info (skills, location, YOE, ...) via LLM.
      2) Pre-filter jobs so each job's "Tech Stack" shares >= 2 skills with
         the candidate.
      3) Run an LLM evaluation over the filtered subset, checking
         `st.session_state.evaluation_running` every iteration so a user
         "Stop" click halts the loop immediately.
      4) Return results sorted by skill overlap then relevance score
         (empty DataFrame if nothing survives or every LLM call fails).

    NOTE(review): mutates `jobs_df` in place by adding a "matching_skills"
    column — confirm callers don't reuse the frame expecting it unchanged.
    """
    response = structure_resume_data(resume_text)
    candidate_skills = set(skill.lower() for skill in response.skills)

    # Count overlapping skills between the candidate and one job's raw
    # comma-separated "Tech Stack" cell.
    def matching_skill_count(tech_stack: str) -> int:
        job_skills = set(s.strip().lower() for s in tech_stack.split(",") if s.strip())
        return len(candidate_skills & job_skills)

    jobs_df["matching_skills"] = jobs_df["Tech Stack"].apply(matching_skill_count)
    filtered = jobs_df[jobs_df["matching_skills"] >= 2].copy()
    if filtered.empty:
        st.warning("No jobs passed the 2-skill pre-filter.")
        return pd.DataFrame()
    # Flatten the structured candidate fields into one text blob for the prompt.
    candidate_text = (
        f"{response.name} {response.location} "
        f"{', '.join(response.skills)} {response.ideal_jobs} "
        f"{response.yoe} {response.experience}"
    )
    # LLM that returns one structured `Job` object per evaluation.
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.0,
        max_retries=2,
    )
    eval_llm = llm.with_structured_output(Job)
    system_msg = (
        "You are an expert recruiter. First, filter by location & experience. "
        "Then pick jobs that match the candidateβs skills & background. "
        "Finally, assign a relevance score (0β10)."
    )
    # NOTE(review): the human prompt lists job_title..relevance_score but the
    # `Job` schema also requires `justification` — presumably the structured
    # output still elicits it; confirm the model fills it reliably.
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_msg),
        ("human", "Evaluate Job: {job_text}\nCandidate: {candidate_text}\n"
                  "Return JSON with job_title, company, location, skills, description, relevance_score.")
    ])
    chain = prompt | eval_llm
    jobs_for_eval = filtered[["Company", "Role", "Locations", "parsed_stack", "YOE", "matching_skills"]]
    results = []
    progress_bar = st.progress(0)
    status_text = st.empty()
    total = len(jobs_for_eval)
    for i, row in enumerate(jobs_for_eval.itertuples(), start=1):
        # A "Stop Evaluation" click flips this flag to False mid-run.
        if not st.session_state.evaluation_running:
            status_text.text("βΈοΈ Evaluation halted by user.")
            break
        progress_bar.progress(i / total)
        status_text.text(f"Evaluating job {i}/{total}: {row.Role} at {row.Company}")
        # One whitespace-joined text blob describing the job for the prompt.
        job_text = " ".join([
            row.Role,
            row.Company,
            row.Locations,
            ", ".join(row.parsed_stack),
            str(row.YOE)
        ])
        try:
            eval_job = chain.invoke({
                "job_text": job_text,
                "candidate_text": candidate_text
            })
        except Exception as e:
            st.error(f"LLM failed on job #{i}: {e}")
            # Best-effort: skip this job and continue with the rest.
            continue
        results.append({
            "job_title": eval_job.job_title,
            "company": eval_job.company,
            "location": eval_job.location,
            "skills": eval_job.skills,
            "description": eval_job.description,
            "relevance_score": eval_job.relevance_score,
            "matching_skills": row.matching_skills
        })
        # Artificial delay so the Stop button is observable during a run.
        time.sleep(0.5)
    progress_bar.empty()
    status_text.empty()
    if not results:
        return pd.DataFrame()
    df_results = pd.DataFrame(results)
    # Rank by skill overlap first, LLM relevance second; keep the top 10.
    df_results = df_results.sort_values(
        by=["matching_skills", "relevance_score"],
        ascending=[False, False]
    ).head(10)
    return df_results
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 8) Clean rΓ©sumΓ© text (lowercase, strip special chars) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def preprocess_text(text: str) -> str:
    """Lowercase *text* and drop every character that is not an ASCII letter
    or whitespace (digits and punctuation are removed, spacing is kept)."""
    non_letters = re.compile(r"[^a-zA-Z\s]")
    return non_letters.sub("", text.lower())
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 9) Streamlit UI | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def main():
    """Streamlit page: upload a resume PDF, evaluate it against the jobs
    sheet, and display the top recommendations.

    Two session-state flags drive the UI across Streamlit's top-to-bottom
    reruns:
      - evaluation_running: True while the eval loop is active (disables
        "Generate", enables "Stop", and is polled inside eval_jobs).
      - evaluation_complete: True once results were shown (reveals the
        "Try Another Resume" reset button).
    """
    st.title("π Resume Evaluator & Job Recommender")
    # 9.1) Initialize session state flags (persist across reruns)
    if "evaluation_running" not in st.session_state:
        st.session_state.evaluation_running = False
    if "evaluation_complete" not in st.session_state:
        st.session_state.evaluation_complete = False
    # 9.2) File uploader
    uploaded_file = st.file_uploader(
        "Upload your resume (PDF)",
        type=["pdf"],
        help="After picking a PDF, click βGenerate Recommendationsβ below."
    )
    # 9.3) Show BOTH buttons side by side; each is disabled in the state
    #      where clicking it would make no sense.
    col1, col2 = st.columns(2)
    with col1:
        if st.session_state.evaluation_running:
            st.button("Generate Recommendations", disabled=True)
        else:
            if st.button("Generate Recommendations"):
                # 9.4) User clicked "Generate" — begin
                st.session_state.evaluation_running = True
                st.session_state.evaluation_complete = False
                # 9.5) Ensure a file was actually uploaded
                if uploaded_file is None:
                    st.error("β Please upload a PDF before clicking βGenerate Recommendationsβ.")
                    st.session_state.evaluation_running = False
                else:
                    # Debug: show what Streamlit handed us
                    st.write(f"βΆοΈ Received file of type: `{type(uploaded_file)}`")
                    # 9.6) Load job sheet
                    jobs_df = load_jobs_data()
                    if jobs_df is None:
                        st.session_state.evaluation_running = False
                        return
                    # 9.7) Extract text from the PDF
                    raw_text = extract_text_from_pdf(uploaded_file)
                    if not raw_text.strip():
                        st.error("β οΈ The uploaded PDF appears to contain no extractable text.")
                        st.session_state.evaluation_running = False
                        return
                    cleaned = preprocess_text(raw_text)
                    st.success("β Resume text extracted successfully!")
                    # 9.8) Run the lengthy eval loop inside a spinner.
                    # NOTE(review): Streamlit reruns the whole script per
                    # interaction, so a Stop click only lands on the rerun it
                    # triggers — confirm it can actually interrupt a loop
                    # started during this same run.
                    with st.spinner("Evaluating jobsβ¦"):
                        recommendations = eval_jobs(jobs_df, cleaned)
                    # 9.9) Show results (or warning if none)
                    if not recommendations.empty:
                        st.header("Recommended Jobs")
                        st.dataframe(recommendations)
                        st.session_state.evaluation_complete = True
                    else:
                        st.warning("No matching jobs found or evaluation was halted midβstream.")
                    # 9.10) Done (or halted)
                    st.session_state.evaluation_running = False
    with col2:
        # The "Stop Evaluation" button is only enabled while evaluation_running:
        if st.session_state.evaluation_running:
            if st.button("Stop Evaluation"):
                st.session_state.evaluation_running = False
                st.warning("βΈοΈ User requested to stop evaluation.")
        else:
            st.button("Stop Evaluation", disabled=True)
    # 9.11) Once complete, allow "Try Another Resume" to reset
    if st.session_state.evaluation_complete:
        if st.button("Try Another Resume"):
            st.session_state.evaluation_complete = False
            # NOTE(review): st.experimental_rerun was removed in newer
            # Streamlit releases in favor of st.rerun — confirm the pinned
            # streamlit version still provides it.
            st.experimental_rerun()
# Script entry point; Streamlit executes this module top-to-bottom on each rerun.
if __name__ == "__main__":
    main()