vernon1224 committed on
Commit
a438728
·
verified ·
1 Parent(s): 9db3523

Update app.py

Browse files

modify import api

Files changed (1) hide show
  1. app.py +248 -249
app.py CHANGED
@@ -1,250 +1,249 @@
1
- # PDFs
2
- from langchain_community.document_loaders import PyPDFLoader
3
- from langchain.vectorstores import FAISS
4
- from langchain.embeddings import HuggingFaceEmbeddings as HFE
5
- from langchain.schema import Document
6
-
7
- # Groq
8
- from langchain_groq import ChatGroq
9
- from google.colab import userdata
10
- from langchain_core.messages import HumanMessage
11
- from langchain_community.chat_message_histories import ChatMessageHistory
12
- from langchain_core.chat_history import BaseChatMessageHistory
13
- from langchain_core.runnables.history import RunnableWithMessageHistory
14
- from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
15
- from groq import Groq
16
-
17
- # Expanded Queries
18
- import ast
19
-
20
- # Cross Encoder
21
- from sentence_transformers import CrossEncoder
22
-
23
- # BM25
24
- from rank_bm25 import BM25Okapi
25
- import numpy as np
26
-
27
- # Gradio
28
- import gradio as gr
29
-
30
- # GROQ_API = userdata.get('GROQ_API')
31
- embed_model = "sentence-transformers/all-MiniLM-L6-v2"
32
- cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
33
-
34
- prompt = ChatPromptTemplate.from_messages(
35
- [
36
- ("system", """
37
- You are a helpful HR assistant specializing in the resume screening phase.
38
- Your goal is to identify the best, most suitable, or highest-potential
39
- candidates whose qualifications align well with the provided job title
40
- and job description. If a question or request falls outside the scope
41
- of resume screening and candidate alignment,
42
- please respond with 'I don't know'.
43
- """),
44
- MessagesPlaceholder(variable_name="history", optional=True),
45
- ("system", "Context: {context}"),
46
- ("human", "{question}"),
47
- ]
48
- )
49
-
50
- query_expansion_prompt = ChatPromptTemplate([
51
- ("system", """
52
- You are an expert HR assistant. Given a job description and a user query,
53
- generate 3 alternative, diverse search queries that capture different
54
- aspects of what makes a great candidate for this role. Each query should
55
- focus on a different facet (e.g., skills, leadership, hands-on experience,
56
- certifications, unique achievements).
57
-
58
- If the job description is empty, generate a general job description for the role
59
- mentioned in the user query and then create the 3 alternative search queries based on that.
60
-
61
- Return ONLY the generated queries as a Python list of strings. Do not include
62
- any other explanatory text or formatting.
63
- """),
64
- ("human", "Job Description: {job_description}\nUser Query: {user_query}")
65
- ])
66
-
67
- JUDGE_PROMPT = """
68
- You are an expert recruiter. Given the job description, the user query, and the system's answer, rate:
69
-
70
- Faithfulness: Does the answer accurately reflect the resume(s) provided? (1-5)
71
- Relevance: Does the answer address the job requirements and user query? (1-5)
72
-
73
- Provide your feedback as follows:
74
-
75
- Faithfulness: <score>
76
- Relevance: <score>
77
- Justification: <brief explanation>
78
-
79
- Job Description:
80
- {job_description}
81
-
82
- User Query:
83
- {user_query}
84
-
85
- System Answer:
86
- {system_answer}
87
- """
88
-
89
- def load_single_pdf(path):
90
- loader = PyPDFLoader(path)
91
- pages = loader.load()
92
- full_text = "\n".join([page.page_content for page in pages])
93
- return Document(page_content=full_text)
94
-
95
- def chunks_embed(chunks, model_name):
96
- """Create embeds for doc chunks and store in FAISS"""
97
- embeds = HFE(model_name=model_name)
98
- # Create FAISS index
99
- db = FAISS.from_documents(chunks, embeds)
100
- print(f"Created FAISS Index with {len(chunks)} documents.")
101
- return db
102
-
103
- def search_docs_mmr(db, query, k, fetch_k, lambda_mult):
104
- """
105
- Retrieve the most similar docs to the query using MMR
106
- (Maximum Marginal Relevance)
107
- """
108
- if not db:
109
- print("Error: No document database available")
110
- return []
111
-
112
- docs = db.max_marginal_relevance_search(
113
- query, k=fetch_k, lambda_mult=lambda_mult
114
- )
115
- return docs
116
-
117
- def combine_results(results):
118
- # Combine the content from results to create context
119
- context = ""
120
- for doc in results:
121
- context += doc.page_content + "\n"
122
- return context
123
-
124
- # 1. Prepare corpus for BM25
125
- def prepare_bm25_corpus(docs):
126
- # Tokenize for BM25 (simple whitespace split, can improve)
127
- return [doc.page_content.lower().split() for doc in docs]
128
-
129
- # 2. Initialize BM25
130
- def init_bm25(docs):
131
- corpus = prepare_bm25_corpus(docs)
132
- return BM25Okapi(corpus)
133
-
134
- # 3. BM25 Search
135
- def bm25_search(bm25, query, docs, top_k=10):
136
- query_tokens = query.lower().split()
137
- scores = bm25.get_scores(query_tokens)
138
- top_indices = np.argsort(scores)[::-1][:top_k]
139
- return [docs[i] for i in top_indices], [scores[i] for i in top_indices]
140
-
141
- # Hybrid Merge Function
142
- def hybrid_merge(semantic_results, bm25_results):
143
- # Merge by union, keeping order (semantic first, then BM25 if not already present)
144
- seen = set()
145
- merged = []
146
- for doc in semantic_results + bm25_results:
147
- if doc.page_content not in seen:
148
- merged.append(doc)
149
- seen.add(doc.page_content)
150
- return merged
151
-
152
- def llm_judge_groq(api_key, job_description, user_query, system_answer):
153
- judge_prompt = JUDGE_PROMPT.format(
154
- job_description=job_description,
155
- user_query=user_query,
156
- system_answer=system_answer
157
- )
158
- client = Groq(api_key=api_key)
159
- completion = client.chat.completions.create(
160
- model="deepseek-r1-distill-llama-70b",
161
- messages=[{"role": "user", "content": judge_prompt}],
162
- max_tokens=512
163
- )
164
- return completion.choices[0].message.content
165
-
166
- def screen_resumes(api_key, job_description, user_query, files):
167
- embed_model = "sentence-transformers/all-MiniLM-L6-v2"
168
- cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
169
-
170
- # Model and prompt setup (inside function, using user API key)
171
- model = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
172
- history = {}
173
- def get_session_history(session_id: str):
174
- if session_id not in history:
175
- history[session_id] = ChatMessageHistory()
176
- return history[session_id]
177
- with_message_history = RunnableWithMessageHistory(model, get_session_history)
178
- chain = prompt | model
179
- with_message_history = RunnableWithMessageHistory(
180
- chain,
181
- get_session_history,
182
- input_messages_key="question",
183
- history_messages_key="history"
184
- )
185
-
186
- # Load and process resumes
187
- resume_paths = [file.name for file in files]
188
- chunks = [load_single_pdf(path) for path in resume_paths]
189
- embeds = chunks_embed(chunks, embed_model)
190
- bm25 = init_bm25(chunks)
191
-
192
- # Query Expansion
193
- prompt_value = query_expansion_prompt.invoke({
194
- "job_description": job_description,
195
- "user_query": user_query,
196
- })
197
- expanded_queries_response = model.invoke(prompt_value.messages)
198
- expanded_queries = ast.literal_eval(expanded_queries_response.content)
199
-
200
- # Hybrid Retrieval
201
- all_semantic = []
202
- all_bm25 = []
203
- for q in expanded_queries:
204
- semantic_docs = search_docs_mmr(embeds, q, 10, 100, 0.7)
205
- bm25_docs, _ = bm25_search(bm25, q, chunks, top_k=10)
206
- all_semantic.extend(semantic_docs)
207
- all_bm25.extend(bm25_docs)
208
- merged_results = hybrid_merge(all_semantic, all_bm25)
209
- unique_results_list = merged_results
210
-
211
- # Cross-encoder Re-ranking
212
- pairs = [(user_query, doc.page_content) for doc in unique_results_list]
213
- scores = cross_encoder.predict(pairs)
214
- ranked = sorted(zip(scores, unique_results_list), key=lambda x: x[0], reverse=True)
215
- top_n = min(5, len(ranked))
216
- ranked_top_n = [doc for score, doc in ranked[:top_n]]
217
- context = "\n\n".join([doc.page_content for doc in ranked_top_n])
218
-
219
- # LLM Final Reasoning
220
- inputs = {
221
- "context": context,
222
- "question": user_query,
223
- }
224
- config = {"configurable": {"session_id": "GradioSession"}}
225
- response = with_message_history.invoke(inputs, config=config)
226
- system_output = response.content
227
-
228
- # LLM-as-a-Judge Evaluation
229
- judge_feedback = llm_judge_groq(api_key, job_description, user_query, system_output)
230
-
231
- return system_output, context, judge_feedback
232
-
233
- demo = gr.Interface(
234
- fn=screen_resumes,
235
- inputs=[
236
- gr.Textbox(label="Groq API Key", type="password", lines=1, placeholder="sk..."),
237
- gr.Textbox(lines=4, label="Job Description"),
238
- gr.Textbox(lines=2, label="User Query"),
239
- gr.File(file_count="multiple", label="Upload Resume PDFs")
240
- ],
241
- outputs=[
242
- gr.Textbox(label="Screening Result (LLM Output)"),
243
- gr.Textbox(label="Top Ranked Resumes (Raw Text)"),
244
- gr.Textbox(label="LLM-as-a-Judge Evaluation (DeepSeek)")
245
- ],
246
- title="Resume Screening Assistant (Hybrid + LLM-as-a-Judge)",
247
- description="Enter your Groq API key, upload resumes, enter a job description and query, get the best candidates with explanations, and see an automated evaluation."
248
- )
249
-
250
  demo.launch(share=True)
 
1
+ # PDFs
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from langchain.vectorstores import FAISS
4
+ from langchain.embeddings import HuggingFaceEmbeddings as HFE
5
+ from langchain.schema import Document
6
+
7
+ # Groq
8
+ from langchain_groq import ChatGroq
9
+ from langchain_core.messages import HumanMessage
10
+ from langchain_community.chat_message_histories import ChatMessageHistory
11
+ from langchain_core.chat_history import BaseChatMessageHistory
12
+ from langchain_core.runnables.history import RunnableWithMessageHistory
13
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
14
+ from groq import Groq
15
+
16
+ # Expanded Queries
17
+ import ast
18
+
19
+ # Cross Encoder
20
+ from sentence_transformers import CrossEncoder
21
+
22
+ # BM25
23
+ from rank_bm25 import BM25Okapi
24
+ import numpy as np
25
+
26
+ # Gradio
27
+ import gradio as gr
28
+
29
+ # GROQ_API = userdata.get('GROQ_API')
30
+ embed_model = "sentence-transformers/all-MiniLM-L6-v2"
31
+ cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
32
+
33
+ prompt = ChatPromptTemplate.from_messages(
34
+ [
35
+ ("system", """
36
+ You are a helpful HR assistant specializing in the resume screening phase.
37
+ Your goal is to identify the best, most suitable, or highest-potential
38
+ candidates whose qualifications align well with the provided job title
39
+ and job description. If a question or request falls outside the scope
40
+ of resume screening and candidate alignment,
41
+ please respond with 'I don't know'.
42
+ """),
43
+ MessagesPlaceholder(variable_name="history", optional=True),
44
+ ("system", "Context: {context}"),
45
+ ("human", "{question}"),
46
+ ]
47
+ )
48
+
49
+ query_expansion_prompt = ChatPromptTemplate([
50
+ ("system", """
51
+ You are an expert HR assistant. Given a job description and a user query,
52
+ generate 3 alternative, diverse search queries that capture different
53
+ aspects of what makes a great candidate for this role. Each query should
54
+ focus on a different facet (e.g., skills, leadership, hands-on experience,
55
+ certifications, unique achievements).
56
+
57
+ If the job description is empty, generate a general job description for the role
58
+ mentioned in the user query and then create the 3 alternative search queries based on that.
59
+
60
+ Return ONLY the generated queries as a Python list of strings. Do not include
61
+ any other explanatory text or formatting.
62
+ """),
63
+ ("human", "Job Description: {job_description}\nUser Query: {user_query}")
64
+ ])
65
+
66
+ JUDGE_PROMPT = """
67
+ You are an expert recruiter. Given the job description, the user query, and the system's answer, rate:
68
+
69
+ Faithfulness: Does the answer accurately reflect the resume(s) provided? (1-5)
70
+ Relevance: Does the answer address the job requirements and user query? (1-5)
71
+
72
+ Provide your feedback as follows:
73
+
74
+ Faithfulness: <score>
75
+ Relevance: <score>
76
+ Justification: <brief explanation>
77
+
78
+ Job Description:
79
+ {job_description}
80
+
81
+ User Query:
82
+ {user_query}
83
+
84
+ System Answer:
85
+ {system_answer}
86
+ """
87
+
88
+ def load_single_pdf(path):
89
+ loader = PyPDFLoader(path)
90
+ pages = loader.load()
91
+ full_text = "\n".join([page.page_content for page in pages])
92
+ return Document(page_content=full_text)
93
+
94
+ def chunks_embed(chunks, model_name):
95
+ """Create embeds for doc chunks and store in FAISS"""
96
+ embeds = HFE(model_name=model_name)
97
+ # Create FAISS index
98
+ db = FAISS.from_documents(chunks, embeds)
99
+ print(f"Created FAISS Index with {len(chunks)} documents.")
100
+ return db
101
+
102
+ def search_docs_mmr(db, query, k, fetch_k, lambda_mult):
103
+ """
104
+ Retrieve the most similar docs to the query using MMR
105
+ (Maximum Marginal Relevance)
106
+ """
107
+ if not db:
108
+ print("Error: No document database available")
109
+ return []
110
+
111
+ docs = db.max_marginal_relevance_search(
112
+ query, k=fetch_k, lambda_mult=lambda_mult
113
+ )
114
+ return docs
115
+
116
+ def combine_results(results):
117
+ # Combine the content from results to create context
118
+ context = ""
119
+ for doc in results:
120
+ context += doc.page_content + "\n"
121
+ return context
122
+
123
+ # 1. Prepare corpus for BM25
124
+ def prepare_bm25_corpus(docs):
125
+ # Tokenize for BM25 (simple whitespace split, can improve)
126
+ return [doc.page_content.lower().split() for doc in docs]
127
+
128
+ # 2. Initialize BM25
129
+ def init_bm25(docs):
130
+ corpus = prepare_bm25_corpus(docs)
131
+ return BM25Okapi(corpus)
132
+
133
+ # 3. BM25 Search
134
+ def bm25_search(bm25, query, docs, top_k=10):
135
+ query_tokens = query.lower().split()
136
+ scores = bm25.get_scores(query_tokens)
137
+ top_indices = np.argsort(scores)[::-1][:top_k]
138
+ return [docs[i] for i in top_indices], [scores[i] for i in top_indices]
139
+
140
+ # Hybrid Merge Function
141
+ def hybrid_merge(semantic_results, bm25_results):
142
+ # Merge by union, keeping order (semantic first, then BM25 if not already present)
143
+ seen = set()
144
+ merged = []
145
+ for doc in semantic_results + bm25_results:
146
+ if doc.page_content not in seen:
147
+ merged.append(doc)
148
+ seen.add(doc.page_content)
149
+ return merged
150
+
151
+ def llm_judge_groq(api_key, job_description, user_query, system_answer):
152
+ judge_prompt = JUDGE_PROMPT.format(
153
+ job_description=job_description,
154
+ user_query=user_query,
155
+ system_answer=system_answer
156
+ )
157
+ client = Groq(api_key=api_key)
158
+ completion = client.chat.completions.create(
159
+ model="deepseek-r1-distill-llama-70b",
160
+ messages=[{"role": "user", "content": judge_prompt}],
161
+ max_tokens=512
162
+ )
163
+ return completion.choices[0].message.content
164
+
165
+ def screen_resumes(api_key, job_description, user_query, files):
166
+ embed_model = "sentence-transformers/all-MiniLM-L6-v2"
167
+ cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
168
+
169
+ # Model and prompt setup (inside function, using user API key)
170
+ model = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
171
+ history = {}
172
+ def get_session_history(session_id: str):
173
+ if session_id not in history:
174
+ history[session_id] = ChatMessageHistory()
175
+ return history[session_id]
176
+ with_message_history = RunnableWithMessageHistory(model, get_session_history)
177
+ chain = prompt | model
178
+ with_message_history = RunnableWithMessageHistory(
179
+ chain,
180
+ get_session_history,
181
+ input_messages_key="question",
182
+ history_messages_key="history"
183
+ )
184
+
185
+ # Load and process resumes
186
+ resume_paths = [file.name for file in files]
187
+ chunks = [load_single_pdf(path) for path in resume_paths]
188
+ embeds = chunks_embed(chunks, embed_model)
189
+ bm25 = init_bm25(chunks)
190
+
191
+ # Query Expansion
192
+ prompt_value = query_expansion_prompt.invoke({
193
+ "job_description": job_description,
194
+ "user_query": user_query,
195
+ })
196
+ expanded_queries_response = model.invoke(prompt_value.messages)
197
+ expanded_queries = ast.literal_eval(expanded_queries_response.content)
198
+
199
+ # Hybrid Retrieval
200
+ all_semantic = []
201
+ all_bm25 = []
202
+ for q in expanded_queries:
203
+ semantic_docs = search_docs_mmr(embeds, q, 10, 100, 0.7)
204
+ bm25_docs, _ = bm25_search(bm25, q, chunks, top_k=10)
205
+ all_semantic.extend(semantic_docs)
206
+ all_bm25.extend(bm25_docs)
207
+ merged_results = hybrid_merge(all_semantic, all_bm25)
208
+ unique_results_list = merged_results
209
+
210
+ # Cross-encoder Re-ranking
211
+ pairs = [(user_query, doc.page_content) for doc in unique_results_list]
212
+ scores = cross_encoder.predict(pairs)
213
+ ranked = sorted(zip(scores, unique_results_list), key=lambda x: x[0], reverse=True)
214
+ top_n = min(5, len(ranked))
215
+ ranked_top_n = [doc for score, doc in ranked[:top_n]]
216
+ context = "\n\n".join([doc.page_content for doc in ranked_top_n])
217
+
218
+ # LLM Final Reasoning
219
+ inputs = {
220
+ "context": context,
221
+ "question": user_query,
222
+ }
223
+ config = {"configurable": {"session_id": "GradioSession"}}
224
+ response = with_message_history.invoke(inputs, config=config)
225
+ system_output = response.content
226
+
227
+ # LLM-as-a-Judge Evaluation
228
+ judge_feedback = llm_judge_groq(api_key, job_description, user_query, system_output)
229
+
230
+ return system_output, context, judge_feedback
231
+
232
+ demo = gr.Interface(
233
+ fn=screen_resumes,
234
+ inputs=[
235
+ gr.Textbox(label="Groq API Key", type="password", lines=1, placeholder="sk..."),
236
+ gr.Textbox(lines=4, label="Job Description"),
237
+ gr.Textbox(lines=2, label="User Query"),
238
+ gr.File(file_count="multiple", label="Upload Resume PDFs")
239
+ ],
240
+ outputs=[
241
+ gr.Textbox(label="Screening Result (LLM Output)"),
242
+ gr.Textbox(label="Top Ranked Resumes (Raw Text)"),
243
+ gr.Textbox(label="LLM-as-a-Judge Evaluation (DeepSeek)")
244
+ ],
245
+ title="Resume Screening Assistant (Hybrid + LLM-as-a-Judge)",
246
+ description="Enter your Groq API key, upload resumes, enter a job description and query, get the best candidates with explanations, and see an automated evaluation."
247
+ )
248
+
 
249
  demo.launch(share=True)