# Scraped from a Hugging Face Space file view (Space status: Sleeping).
# File size: 13,081 bytes; revision 0ad22ad.
import streamlit as st
import pandas as pd
import os
import logging
import re
import uuid
from chromadb import PersistentClient
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from rag_utils_updated import extract_text, preprocess_text, get_embeddings, is_image_pdf, assess_cv, extract_job_requirements
import plotly.graph_objects as go
from dotenv import load_dotenv
# --- Logging and environment -------------------------------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
load_dotenv()

# Surface missing configuration in the UI; the script keeps running either way.
_REQUIRED_ENV_MESSAGES = (
    ("LLM_PROMPT", "LLM_PROMPT is missing. Check your .env file!"),
    ("ADMIN_PASSWORD", "ADMIN_PASSWORD is missing. Check your .env file!"),
)
for env_name, missing_message in _REQUIRED_ENV_MESSAGES:
    if os.environ.get(env_name) is None:
        st.error(missing_message)

st.title("CV Assessment and Ranking App")

# --- Session state -----------------------------------------------------------
# Short random ID; it names this session's Chroma collection and tags its records.
if "session_id" not in st.session_state:
    st.session_state.session_id = str(uuid.uuid4())[:8]

# Defaults are only written for keys that are absent, so values set on an
# earlier run of the script are left untouched.
_SESSION_DEFAULTS = {
    "job_description": None,
    "requirements": None,
    "detailed_assessments": {},
    "cvs": {},
    "job_description_embedding": None,
    "assessment_completed": False,
    "admin_logged_in": False,
}
for state_key, default_value in _SESSION_DEFAULTS.items():
    if state_key not in st.session_state:
        st.session_state[state_key] = default_value

# --- Persistent storage for embeddings ---------------------------------------
PERMANENT_DB_PATH = "./cv_db"
db_client = PersistentClient(path=PERMANENT_DB_PATH)
session_collection_name = f"cv_embeddings_{st.session_state.session_id}"
st.session_state.collection = db_client.get_or_create_collection(session_collection_name)

# --- Models (created once and kept in session state) -------------------------
if "embedding_model" not in st.session_state:
    st.session_state.embedding_model = SentenceTransformer('all-mpnet-base-v2')
if "groq_client" not in st.session_state:
    groq_api_key = os.environ.get("GROQ_API_KEY")
    st.session_state.groq_client = ChatGroq(api_key=groq_api_key)
def clear_chroma_db():
    """Delete the records tagged with the current session ID from its collection.

    Shows an info message on success. On any exception an error message is
    shown and the Streamlit script run is stopped via ``st.stop()``.
    """
    try:
        session_filter = {"session_id": st.session_state.session_id}
        # Only records whose metadata carries this session's ID are removed.
        st.session_state.collection.delete(where=session_filter)
        st.info("Session-specific embeddings cleared. Starting fresh!")
    except Exception as e:
        st.error(f"Error clearing session embeddings: {e}")
        st.stop()
# Ensure the session clears its own embeddings on startup.
# Streamlit re-executes this script on every widget interaction, so the clear
# is guarded by a session flag; without it, embeddings stored by process_cv()
# would be wiped again on the very next rerun.
if not st.session_state.get("startup_embeddings_cleared", False):
    clear_chroma_db()
    st.session_state.startup_embeddings_cleared = True
import shutil
def clear_all_sessions_data():
    """Admin action: delete every stored collection except this session's own.

    Only the collection named after the current ``session_id`` is kept; this
    function has no knowledge of other sessions, so their collections are
    removed as well. Shows a success message when done, or an error message
    if any step raises.

    The database directory itself is deliberately left in place: removing it
    would also destroy the current session's collection and delete files that
    the open ``db_client`` is still using.
    NOTE(review): dropping collections may not shrink the files on disk;
    confirm whether a separate compaction step is wanted.
    """
    current_collection = f"cv_embeddings_{st.session_state.session_id}"
    try:
        for entry in db_client.list_collections():
            # Depending on the installed Chroma version this yields either
            # plain names or Collection objects; normalise to the name so the
            # comparison below is always string-to-string.
            collection_name = entry if isinstance(entry, str) else entry.name
            if collection_name == current_collection:
                continue
            db_client.delete_collection(collection_name)
        st.success("Old session embeddings deleted. Current session retained.")
    except Exception as e:
        st.error(f"Error deleting old session data: {e}")
# Admin panel (sidebar): login form plus the maintenance action it unlocks.
with st.sidebar:
    st.subheader("Admin Login")
    admin_user = st.text_input("Username", key="admin_user")
    admin_pass = st.text_input("Password", type="password", key="admin_pass")

    login_clicked = st.button("Login as Admin")
    if login_clicked:
        # The expected password comes from the ADMIN_PASSWORD environment variable.
        expected_password = os.environ.get("ADMIN_PASSWORD")
        credentials_ok = admin_user == "admin" and admin_pass == expected_password
        if not credentials_ok:
            st.error("Invalid credentials. Access denied.")
        else:
            st.session_state.admin_logged_in = True
            st.success("Admin login successful!")

    # The login flag lives in session state, so these actions stay visible on
    # later reruns once a login has succeeded.
    if st.session_state.admin_logged_in:
        st.subheader("Admin Actions")
        clear_requested = st.button("Clear All Stored Embeddings")
        if clear_requested:
            clear_all_sessions_data()
def process_cv(uploaded_file):
    """Extract, preprocess, embed and store one uploaded CV.

    The record is added to the session's Chroma collection under an ID made
    of the session ID and the file name.

    Args:
        uploaded_file: Uploaded file object; its ``name`` attribute is read
            and the object is passed to ``is_image_pdf`` and ``extract_text``.

    Returns:
        A dict with keys ``"text"``, ``"embedding"`` and ``"session_filename"``,
        or ``None`` when the file is reported as an image-based PDF or any
        step raises (a warning or error is shown in the UI in those cases).
    """
    cv_name = uploaded_file.name
    session_id = st.session_state.session_id
    record_id = f"{session_id}_{cv_name}"  # unique per session

    try:
        if is_image_pdf(uploaded_file):
            st.warning(f"{cv_name} appears to be an image-based PDF and cannot be processed.")
            return None

        cleaned_text = preprocess_text(extract_text(uploaded_file))
        vector = get_embeddings(cleaned_text, st.session_state.embedding_model)

        record_metadata = {"session_id": session_id, "filename": cv_name}
        st.session_state.collection.add(
            embeddings=[vector],
            documents=[cleaned_text],
            ids=[record_id],
            metadatas=[record_metadata],
        )
    except Exception as e:
        st.error(f"Failed to process {cv_name}: {e}")
        return None

    return {"text": cleaned_text, "embedding": vector, "session_filename": record_id}
_SECTION_FLAGS = re.IGNORECASE | re.DOTALL

# One entry per scored section: the result-key prefix and a pattern whose
# group 1 is the free-text assessment and group 2 the integer score.
_SCORED_SECTION_PATTERNS = (
    ("technical_lead", re.compile(
        r"Technical Lead Assessment:\s*(.*?)\s*Technical Lead Score:\s*(\d+)", _SECTION_FLAGS)),
    ("hr_specialist", re.compile(
        r"HR Specialist Assessment:\s*(.*?)\s*HR Specialist Score:\s*(\d+)", _SECTION_FLAGS)),
    ("project_manager", re.compile(
        r"Project Manager Assessment:\s*(.*?)\s*Project Manager Score:\s*(\d+)", _SECTION_FLAGS)),
    ("final_assessment", re.compile(
        r"Final Assessment:\s*(.*?)\s*Final Assessment Score:\s*(\d+)", _SECTION_FLAGS)),
)

# Without re.MULTILINE, "$" anchors at the end of the text, so this captures
# everything after "Recommendation:".
_RECOMMENDATION_PATTERN = re.compile(r"Recommendation:\s*(.*?)$", _SECTION_FLAGS)


def parse_assessment(raw_response, requirements):
    """Split an LLM assessment reply into per-section texts and scores.

    Args:
        raw_response: Reply text containing "<Section> Assessment:" /
            "<Section> Score:" pairs and a trailing "Recommendation:" part.
        requirements: Unused; kept so existing callers keep working.

    Returns:
        A dict with the keys ``technical_lead``, ``hr_specialist``,
        ``project_manager``, ``final_assessment``, ``recommendation`` and a
        ``<section>_score`` key for each of the four scored sections. Scores
        are the matched digit strings. Any section that is not found keeps
        the placeholder ``"Not Found"``; a non-text ``raw_response`` yields
        all placeholders.
    """
    matches = {
        "technical_lead": "Not Found",
        "hr_specialist": "Not Found",
        "project_manager": "Not Found",
        "final_assessment": "Not Found",
        "recommendation": "Not Found",
        "technical_lead_score": "Not Found",
        "hr_specialist_score": "Not Found",
        "project_manager_score": "Not Found",
        "final_assessment_score": "Not Found",
    }

    # A missing or non-text reply cannot be searched; report it through the
    # logging setup instead of printing, and fall back to the placeholders.
    if not isinstance(raw_response, str):
        logging.getLogger(__name__).warning(
            "Error parsing assessment: expected text, got %s",
            type(raw_response).__name__,
        )
        return matches

    for section_key, section_pattern in _SCORED_SECTION_PATTERNS:
        found = section_pattern.search(raw_response)
        if found:
            matches[section_key] = found.group(1).strip()
            matches[f"{section_key}_score"] = found.group(2)

    recommendation = _RECOMMENDATION_PATTERN.search(raw_response)
    if recommendation:
        matches["recommendation"] = recommendation.group(1).strip()

    return matches
# 1. Input Job Description
st.subheader("Enter Job Description")
source_choice = st.radio("Source:", ("File Upload", "Web Page Link", "Text Input"))

# NOTE: "Web Page Link" is offered above but has no branch here, so choosing
# it leaves the stored job description unchanged.
if source_choice == "Text Input":
    st.session_state.job_description = st.text_area("Enter Job Requirements", height=200)
elif source_choice == "File Upload":
    job_file = st.file_uploader("Upload Job Requirements (PDF/DOCX)", type=["pdf", "docx"])
    if job_file:
        st.session_state.job_description = extract_text(job_file)

if st.session_state.job_description:
    st.success("Job description uploaded successfully!")

    # Embedding and requirements are each computed only while unset/empty,
    # so reruns of the script reuse the stored values.
    if st.session_state.job_description_embedding is None:
        st.session_state.job_description_embedding = get_embeddings(
            st.session_state.job_description,
            st.session_state.embedding_model,
        )
    if not st.session_state.requirements:
        st.session_state.requirements = extract_job_requirements(
            st.session_state.job_description,
            st.session_state.groq_client,
        )

    extracted_requirements = st.session_state.requirements
    if extracted_requirements:
        with st.expander("Extracted Job Requirements:"):
            for requirement in extracted_requirements:
                st.write(f"- {requirement}")
# 2. Upload CVs
st.subheader("Upload CVs (Folder)")
uploaded_files = st.file_uploader("Choose CV files", accept_multiple_files=True)

# Uploads are processed only until the assessment flag is set; afterwards this
# section is skipped on every rerun.
should_process_uploads = bool(uploaded_files) and not st.session_state.assessment_completed
if should_process_uploads:
    with st.spinner("Processing uploaded CVs, please wait..."):
        st.write(f"{len(uploaded_files)} CV(s) uploaded.")
        st.session_state.cvs = {}
        for cv_file in uploaded_files:
            processed = process_cv(cv_file)
            if not processed:
                # process_cv already reported the problem in the UI.
                continue
            st.session_state.cvs[processed["session_filename"]] = processed
    st.success("CV embeddings created successfully!")
    st.session_state.assessment_completed = True
def _score_as_int(raw_score):
    """Return ``raw_score`` as an int, or 0 when it cannot be converted."""
    try:
        return int(raw_score)
    except (TypeError, ValueError):
        return 0


# (chart label, DataFrame column) for each score shown in the per-CV bar chart.
_SCORE_COLUMNS = (
    ("Technical Lead", "technical_lead_score"),
    ("HR Specialist", "hr_specialist_score"),
    ("Project Manager", "project_manager_score"),
    ("Final Assessment", "final_assessment_score"),
)

# Perform detailed assessments automatically
if st.session_state.assessment_completed:
    st.write("Performing detailed assessments...")
    # Same dict object as the one held in session state, so results written
    # here persist across reruns.
    detailed_assessments = st.session_state.detailed_assessments
    if not detailed_assessments:
        with st.spinner("Assessing CVs..."):
            for filename, cv_data in st.session_state.cvs.items():
                try:
                    detailed_assessments[filename] = assess_cv(
                        cv_data["text"],
                        st.session_state.requirements,
                        filename,
                        st.session_state.groq_client,
                    )
                except Exception as e:
                    st.error(f"Error assessing {filename}: {e}")
        st.success("Detailed assessments complete!")

    st.subheader("Candidates Assessment and Ranking")
    parsed_rows = [
        {**parse_assessment(a["raw_response"], st.session_state.requirements), "filename": f}
        for f, a in st.session_state.detailed_assessments.items()
    ]

    if not parsed_rows:
        # Every assessment failed (or there were no CVs): an empty frame has
        # no score column to sort by, so report instead of raising.
        st.warning("No CV could be assessed, so there is nothing to rank.")
    else:
        assessments_df = pd.DataFrame(parsed_rows)
        # Scores are stored as strings (digits or "Not Found"). Rank by their
        # numeric value so "100" sorts above "85"; unparsed scores become NaN
        # and are placed last.
        assessments_df = assessments_df.sort_values(
            by="final_assessment_score",
            ascending=False,
            key=lambda column: pd.to_numeric(column, errors="coerce"),
        )
        st.dataframe(assessments_df)

        st.subheader("Detailed Assessment Results")
        # One chart plus a set of collapsed panels per assessed CV, in rank order.
        for index, row in assessments_df.iterrows():
            st.write(f"**Filename:** {row['filename']}")

            expert_labels = [label for label, _ in _SCORE_COLUMNS]
            raw_scores = [row[column] for _, column in _SCORE_COLUMNS]

            # Bar height falls back to 0 for a score that was not parsed; the
            # bar label still shows the stored value (e.g. "Not Found").
            fig = go.Figure(data=[go.Bar(
                x=expert_labels,
                y=[_score_as_int(value) for value in raw_scores],
                text=[str(value) for value in raw_scores],
                textposition='auto',
            )])
            fig.update_layout(yaxis_range=[0, 100])

            col1, col2 = st.columns([1, 3])
            # The key is derived from the row index so each chart gets a
            # distinct element key.
            with col1:
                st.plotly_chart(fig, use_container_width=True, key=f"chart_{index}")
            with col2:
                with st.expander("Technical Lead Assessment"):
                    st.write(f"{row['technical_lead']}")
                    st.write(f"**Technical Lead Score:** {row['technical_lead_score']}")
                with st.expander("HR Specialist Assessment"):
                    st.write(f"{row['hr_specialist']}")
                    st.write(f"**HR Specialist Score:** {row['hr_specialist_score']}")
                with st.expander("Project Manager Assessment"):
                    st.write(f"{row['project_manager']}")
                    st.write(f"**Project Manager Score:** {row['project_manager_score']}")
                with st.expander("Final Assessment"):
                    st.write(f"{row['final_assessment']}")
                    st.write(f"**Final Assessment Score:** {row['final_assessment_score']}")
                with st.expander("Recommendation"):
                    st.write(f"{row['recommendation']}")
            st.write("---")
|