Spaces:

vernon1224
/

resume-screener

Running

App Files Community

vernon1224 committed on Apr 22

Commit

a438728

verified ·

1 Parent(s): 9db3523

Update app.py

Browse files

Modify imports: remove the `google.colab` `userdata` import

Files changed (1) hide show

app.py +248 -249

app.py CHANGED Viewed

@@ -1,250 +1,249 @@
-# PDFs
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.vectorstores import FAISS
-from langchain.embeddings import HuggingFaceEmbeddings as HFE
-from langchain.schema import Document
-# Groq
-from langchain_groq import ChatGroq
-from google.colab import userdata
-from langchain_core.messages import HumanMessage
-from langchain_community.chat_message_histories import ChatMessageHistory
-from langchain_core.chat_history import BaseChatMessageHistory
-from langchain_core.runnables.history import RunnableWithMessageHistory
-from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from groq import Groq
-# Expanded Queries
-import ast
-# Cross Encoder
-from sentence_transformers import CrossEncoder
-# BM25
-from rank_bm25 import BM25Okapi
-import numpy as np
-# Gradio
-import gradio as gr
-# GROQ_API = userdata.get('GROQ_API')
-embed_model = "sentence-transformers/all-MiniLM-L6-v2"
-cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
-prompt = ChatPromptTemplate.from_messages(
-    [
-        ("system", """
-        You are a helpful HR assistant specializing in the resume screening phase.
-        Your goal is to identify the best, most suitable, or highest-potential
-        candidates whose qualifications align well with the provided job title
-        and job description. If a question or request falls outside the scope
-        of resume screening and candidate alignment,
-        please respond with 'I don't know'.
-        """),
-        MessagesPlaceholder(variable_name="history", optional=True),
-        ("system", "Context: {context}"),
-        ("human", "{question}"),
-    ]
-)
-query_expansion_prompt = ChatPromptTemplate([
-    ("system", """
-    You are an expert HR assistant. Given a job description and a user query,
-    generate 3 alternative, diverse search queries that capture different
-    aspects of what makes a great candidate for this role. Each query should
-    focus on a different facet (e.g., skills, leadership, hands-on experience,
-                                        certifications, unique achievements).
-    If the job description is empty, generate a general job description for the role
-    mentioned in the user query and then create the 3 alternative search queries based on that.
-    Return ONLY the generated queries as a Python list of strings. Do not include
-    any other explanatory text or formatting.
-    """),
-    ("human", "Job Description: {job_description}\nUser Query: {user_query}")
-])
-JUDGE_PROMPT = """
-You are an expert recruiter. Given the job description, the user query, and the system's answer, rate:
-Faithfulness: Does the answer accurately reflect the resume(s) provided? (1-5)
-Relevance: Does the answer address the job requirements and user query? (1-5)
-Provide your feedback as follows:
-Faithfulness: <score>
-Relevance: <score>
-Justification: <brief explanation>
-Job Description:
-{job_description}
-User Query:
-{user_query}
-System Answer:
-{system_answer}
-"""
-def load_single_pdf(path):
-  loader = PyPDFLoader(path)
-  pages = loader.load()
-  full_text = "\n".join([page.page_content for page in pages])
-  return Document(page_content=full_text)
-def chunks_embed(chunks, model_name):
-  """Create embeds for doc chunks and store in FAISS"""
-  embeds = HFE(model_name=model_name)
-  # Create FAISS index
-  db = FAISS.from_documents(chunks, embeds)
-  print(f"Created FAISS Index with {len(chunks)} documents.")
-  return db
-def search_docs_mmr(db, query, k, fetch_k, lambda_mult):
-  """
-  Retrieve the most similar docs to the query using MMR
-  (Maximum Marginal Relevance)
-  """
-  if not db:
-    print("Error: No document database available")
-    return []
-  docs = db.max_marginal_relevance_search(
-      query, k=fetch_k, lambda_mult=lambda_mult
-  )
-  return docs
-def combine_results(results):
-  # Combine the content from results to create context
-  context = ""
-  for doc in results:
-    context += doc.page_content + "\n"
-  return context
-# 1. Prepare corpus for BM25
-def prepare_bm25_corpus(docs):
-  # Tokenize for BM25 (simple whitespace split, can improve)
-  return [doc.page_content.lower().split() for doc in docs]
-# 2. Initialize BM25
-def init_bm25(docs):
-  corpus = prepare_bm25_corpus(docs)
-  return BM25Okapi(corpus)
-# 3. BM25 Search
-def bm25_search(bm25, query, docs, top_k=10):
-  query_tokens = query.lower().split()
-  scores = bm25.get_scores(query_tokens)
-  top_indices = np.argsort(scores)[::-1][:top_k]
-  return [docs[i] for i in top_indices], [scores[i] for i in top_indices]
-# Hybrid Merge Function
-def hybrid_merge(semantic_results, bm25_results):
-  # Merge by union, keeping order (semantic first, then BM25 if not already present)
-  seen = set()
-  merged = []
-  for doc in semantic_results + bm25_results:
-      if doc.page_content not in seen:
-          merged.append(doc)
-          seen.add(doc.page_content)
-  return merged
-def llm_judge_groq(api_key, job_description, user_query, system_answer):
-  judge_prompt = JUDGE_PROMPT.format(
-      job_description=job_description,
-      user_query=user_query,
-      system_answer=system_answer
-  )
-  client = Groq(api_key=api_key)
-  completion = client.chat.completions.create(
-      model="deepseek-r1-distill-llama-70b",
-      messages=[{"role": "user", "content": judge_prompt}],
-      max_tokens=512
-  )
-  return completion.choices[0].message.content
-def screen_resumes(api_key, job_description, user_query, files):
-  embed_model = "sentence-transformers/all-MiniLM-L6-v2"
-  cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
-  # Model and prompt setup (inside function, using user API key)
-  model = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
-  history = {}
-  def get_session_history(session_id: str):
-      if session_id not in history:
-          history[session_id] = ChatMessageHistory()
-      return history[session_id]
-  with_message_history = RunnableWithMessageHistory(model, get_session_history)
-  chain = prompt | model
-  with_message_history = RunnableWithMessageHistory(
-      chain,
-      get_session_history,
-      input_messages_key="question",
-      history_messages_key="history"
-  )
-  # Load and process resumes
-  resume_paths = [file.name for file in files]
-  chunks = [load_single_pdf(path) for path in resume_paths]
-  embeds = chunks_embed(chunks, embed_model)
-  bm25 = init_bm25(chunks)
-  # Query Expansion
-  prompt_value = query_expansion_prompt.invoke({
-      "job_description": job_description,
-      "user_query": user_query,
-  })
-  expanded_queries_response = model.invoke(prompt_value.messages)
-  expanded_queries = ast.literal_eval(expanded_queries_response.content)
-  # Hybrid Retrieval
-  all_semantic = []
-  all_bm25 = []
-  for q in expanded_queries:
-      semantic_docs = search_docs_mmr(embeds, q, 10, 100, 0.7)
-      bm25_docs, _ = bm25_search(bm25, q, chunks, top_k=10)
-      all_semantic.extend(semantic_docs)
-      all_bm25.extend(bm25_docs)
-  merged_results = hybrid_merge(all_semantic, all_bm25)
-  unique_results_list = merged_results
-  # Cross-encoder Re-ranking
-  pairs = [(user_query, doc.page_content) for doc in unique_results_list]
-  scores = cross_encoder.predict(pairs)
-  ranked = sorted(zip(scores, unique_results_list), key=lambda x: x[0], reverse=True)
-  top_n = min(5, len(ranked))
-  ranked_top_n = [doc for score, doc in ranked[:top_n]]
-  context = "\n\n".join([doc.page_content for doc in ranked_top_n])
-  # LLM Final Reasoning
-  inputs = {
-      "context": context,
-      "question": user_query,
-  }
-  config = {"configurable": {"session_id": "GradioSession"}}
-  response = with_message_history.invoke(inputs, config=config)
-  system_output = response.content
-  # LLM-as-a-Judge Evaluation
-  judge_feedback = llm_judge_groq(api_key, job_description, user_query, system_output)
-  return system_output, context, judge_feedback
-demo = gr.Interface(
-    fn=screen_resumes,
-    inputs=[
-        gr.Textbox(label="Groq API Key", type="password", lines=1, placeholder="sk..."),
-        gr.Textbox(lines=4, label="Job Description"),
-        gr.Textbox(lines=2, label="User Query"),
-        gr.File(file_count="multiple", label="Upload Resume PDFs")
-    ],
-    outputs=[
-        gr.Textbox(label="Screening Result (LLM Output)"),
-        gr.Textbox(label="Top Ranked Resumes (Raw Text)"),
-        gr.Textbox(label="LLM-as-a-Judge Evaluation (DeepSeek)")
-    ],
-    title="Resume Screening Assistant (Hybrid + LLM-as-a-Judge)",
-    description="Enter your Groq API key, upload resumes, enter a job description and query, get the best candidates with explanations, and see an automated evaluation."
-)
 demo.launch(share=True)

+# PDFs
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings as HFE
+from langchain.schema import Document
+# Groq
+from langchain_groq import ChatGroq
+from langchain_core.messages import HumanMessage
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.chat_history import BaseChatMessageHistory
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from groq import Groq
+# Expanded Queries
+import ast
+# Cross Encoder
+from sentence_transformers import CrossEncoder
+# BM25
+from rank_bm25 import BM25Okapi
+import numpy as np
+# Gradio
+import gradio as gr
+# GROQ_API = userdata.get('GROQ_API')
+embed_model = "sentence-transformers/all-MiniLM-L6-v2"
+cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", """
+        You are a helpful HR assistant specializing in the resume screening phase.
+        Your goal is to identify the best, most suitable, or highest-potential
+        candidates whose qualifications align well with the provided job title
+        and job description. If a question or request falls outside the scope
+        of resume screening and candidate alignment,
+        please respond with 'I don't know'.
+        """),
+        MessagesPlaceholder(variable_name="history", optional=True),
+        ("system", "Context: {context}"),
+        ("human", "{question}"),
+    ]
+)
+query_expansion_prompt = ChatPromptTemplate([
+    ("system", """
+    You are an expert HR assistant. Given a job description and a user query,
+    generate 3 alternative, diverse search queries that capture different
+    aspects of what makes a great candidate for this role. Each query should
+    focus on a different facet (e.g., skills, leadership, hands-on experience,
+                                        certifications, unique achievements).
+    If the job description is empty, generate a general job description for the role
+    mentioned in the user query and then create the 3 alternative search queries based on that.
+    Return ONLY the generated queries as a Python list of strings. Do not include
+    any other explanatory text or formatting.
+    """),
+    ("human", "Job Description: {job_description}\nUser Query: {user_query}")
+])
+JUDGE_PROMPT = """
+You are an expert recruiter. Given the job description, the user query, and the system's answer, rate:
+Faithfulness: Does the answer accurately reflect the resume(s) provided? (1-5)
+Relevance: Does the answer address the job requirements and user query? (1-5)
+Provide your feedback as follows:
+Faithfulness: <score>
+Relevance: <score>
+Justification: <brief explanation>
+Job Description:
+{job_description}
+User Query:
+{user_query}
+System Answer:
+{system_answer}
+"""
+def load_single_pdf(path):
+  loader = PyPDFLoader(path)
+  pages = loader.load()
+  full_text = "\n".join([page.page_content for page in pages])
+  return Document(page_content=full_text)
+def chunks_embed(chunks, model_name):
+  """Create embeds for doc chunks and store in FAISS"""
+  embeds = HFE(model_name=model_name)
+  # Create FAISS index
+  db = FAISS.from_documents(chunks, embeds)
+  print(f"Created FAISS Index with {len(chunks)} documents.")
+  return db
+def search_docs_mmr(db, query, k, fetch_k, lambda_mult):
+  """
+  Retrieve the most similar docs to the query using MMR
+  (Maximum Marginal Relevance)
+  """
+  if not db:
+    print("Error: No document database available")
+    return []
+  docs = db.max_marginal_relevance_search(
+      query, k=fetch_k, lambda_mult=lambda_mult
+  )
+  return docs
+def combine_results(results):
+  # Combine the content from results to create context
+  context = ""
+  for doc in results:
+    context += doc.page_content + "\n"
+  return context
+# 1. Prepare corpus for BM25
+def prepare_bm25_corpus(docs):
+  # Tokenize for BM25 (simple whitespace split, can improve)
+  return [doc.page_content.lower().split() for doc in docs]
+# 2. Initialize BM25
+def init_bm25(docs):
+  corpus = prepare_bm25_corpus(docs)
+  return BM25Okapi(corpus)
+# 3. BM25 Search
+def bm25_search(bm25, query, docs, top_k=10):
+  query_tokens = query.lower().split()
+  scores = bm25.get_scores(query_tokens)
+  top_indices = np.argsort(scores)[::-1][:top_k]
+  return [docs[i] for i in top_indices], [scores[i] for i in top_indices]
+# Hybrid Merge Function
+def hybrid_merge(semantic_results, bm25_results):
+  # Merge by union, keeping order (semantic first, then BM25 if not already present)
+  seen = set()
+  merged = []
+  for doc in semantic_results + bm25_results:
+      if doc.page_content not in seen:
+          merged.append(doc)
+          seen.add(doc.page_content)
+  return merged
+def llm_judge_groq(api_key, job_description, user_query, system_answer):
+  judge_prompt = JUDGE_PROMPT.format(
+      job_description=job_description,
+      user_query=user_query,
+      system_answer=system_answer
+  )
+  client = Groq(api_key=api_key)
+  completion = client.chat.completions.create(
+      model="deepseek-r1-distill-llama-70b",
+      messages=[{"role": "user", "content": judge_prompt}],
+      max_tokens=512
+  )
+  return completion.choices[0].message.content
+def screen_resumes(api_key, job_description, user_query, files):
+  embed_model = "sentence-transformers/all-MiniLM-L6-v2"
+  cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+  # Model and prompt setup (inside function, using user API key)
+  model = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
+  history = {}
+  def get_session_history(session_id: str):
+      if session_id not in history:
+          history[session_id] = ChatMessageHistory()
+      return history[session_id]
+  with_message_history = RunnableWithMessageHistory(model, get_session_history)
+  chain = prompt | model
+  with_message_history = RunnableWithMessageHistory(
+      chain,
+      get_session_history,
+      input_messages_key="question",
+      history_messages_key="history"
+  )
+  # Load and process resumes
+  resume_paths = [file.name for file in files]
+  chunks = [load_single_pdf(path) for path in resume_paths]
+  embeds = chunks_embed(chunks, embed_model)
+  bm25 = init_bm25(chunks)
+  # Query Expansion
+  prompt_value = query_expansion_prompt.invoke({
+      "job_description": job_description,
+      "user_query": user_query,
+  })
+  expanded_queries_response = model.invoke(prompt_value.messages)
+  expanded_queries = ast.literal_eval(expanded_queries_response.content)
+  # Hybrid Retrieval
+  all_semantic = []
+  all_bm25 = []
+  for q in expanded_queries:
+      semantic_docs = search_docs_mmr(embeds, q, 10, 100, 0.7)
+      bm25_docs, _ = bm25_search(bm25, q, chunks, top_k=10)
+      all_semantic.extend(semantic_docs)
+      all_bm25.extend(bm25_docs)
+  merged_results = hybrid_merge(all_semantic, all_bm25)
+  unique_results_list = merged_results
+  # Cross-encoder Re-ranking
+  pairs = [(user_query, doc.page_content) for doc in unique_results_list]
+  scores = cross_encoder.predict(pairs)
+  ranked = sorted(zip(scores, unique_results_list), key=lambda x: x[0], reverse=True)
+  top_n = min(5, len(ranked))
+  ranked_top_n = [doc for score, doc in ranked[:top_n]]
+  context = "\n\n".join([doc.page_content for doc in ranked_top_n])
+  # LLM Final Reasoning
+  inputs = {
+      "context": context,
+      "question": user_query,
+  }
+  config = {"configurable": {"session_id": "GradioSession"}}
+  response = with_message_history.invoke(inputs, config=config)
+  system_output = response.content
+  # LLM-as-a-Judge Evaluation
+  judge_feedback = llm_judge_groq(api_key, job_description, user_query, system_output)
+  return system_output, context, judge_feedback
+demo = gr.Interface(
+    fn=screen_resumes,
+    inputs=[
+        gr.Textbox(label="Groq API Key", type="password", lines=1, placeholder="sk..."),
+        gr.Textbox(lines=4, label="Job Description"),
+        gr.Textbox(lines=2, label="User Query"),
+        gr.File(file_count="multiple", label="Upload Resume PDFs")
+    ],
+    outputs=[
+        gr.Textbox(label="Screening Result (LLM Output)"),
+        gr.Textbox(label="Top Ranked Resumes (Raw Text)"),
+        gr.Textbox(label="LLM-as-a-Judge Evaluation (DeepSeek)")
+    ],
+    title="Resume Screening Assistant (Hybrid + LLM-as-a-Judge)",
+    description="Enter your Groq API key, upload resumes, enter a job description and query, get the best candidates with explanations, and see an automated evaluation."
+)
 demo.launch(share=True)