# CV-Automation-Ranking / rag_utils_updated_v2.py
# Author: umarch
# Renamed from rag_utils_updated.py (commit 9bff1c9, verified)
import os
import logging
import requests
import json
import PyPDF2
import docx
from bs4 import BeautifulSoup
from chromadb import PersistentClient
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, ValidationError
from typing import List
from sentence_transformers import SentenceTransformer # Import SentenceTransformer
from dotenv import load_dotenv
# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# --- Text Extraction ---
def extract_text(uploaded_file):
    """Extract raw text from an uploaded file or a URL string.

    Dispatches on the input:
      - str             -> treated as a URL, scraped via extract_text_from_webpage
      - PDF upload      -> text layer extracted; image-only PDFs are skipped
      - DOCX upload     -> paragraph text extracted
    Anything else, or any failure, yields "".
    """
    DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    try:
        if isinstance(uploaded_file, str):
            return extract_text_from_webpage(uploaded_file)
        # Streamlit-style uploads expose a MIME string in .type; anything
        # without one falls through to the empty-string default.
        mime = getattr(uploaded_file, 'type', None)
        if mime == "application/pdf":
            if is_image_pdf(uploaded_file):
                # No selectable text layer; OCR is out of scope, so skip.
                logger.warning(f"Image-based PDF detected: {uploaded_file.name}")
                return ""
            return extract_text_from_pdf(uploaded_file)
        if mime == DOCX_MIME:
            return extract_text_from_docx(uploaded_file)
        return ""
    except Exception as e:
        # Best-effort extraction: never propagate; callers treat "" as "no text".
        logger.error(f"Error extracting text: {e}")
        return ""
def is_image_pdf(uploaded_file):
    """Return True when the PDF exposes no selectable text (likely scanned images).

    Parsing errors are also reported as True so the caller skips the file.
    """
    try:
        pages = PyPDF2.PdfReader(uploaded_file).pages
        # Any page with a non-empty text layer means this is a text PDF.
        return not any(page.extract_text() for page in pages)
    except Exception as e:
        logger.error(f"Error checking if PDF is image-based: {e}")
        return True
def extract_text_from_pdf(uploaded_file):
    """Concatenate the text of every PDF page, newline-separated; "" on error."""
    try:
        reader = PyPDF2.PdfReader(uploaded_file)
        # extract_text() may return None for empty pages; substitute "".
        page_texts = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_texts)
    except Exception as e:
        logger.error(f"Error reading PDF {uploaded_file.name}: {e}")
        return ""
def extract_text_from_docx(uploaded_file):
    """Return all paragraph text from a DOCX file, one paragraph per line; "" on error."""
    try:
        paragraphs = docx.Document(uploaded_file).paragraphs
        return "\n".join(para.text for para in paragraphs)
    except Exception as e:
        logger.error(f"Error reading DOCX: {e}")
        return ""
def extract_text_from_webpage(url):
    """Fetch *url* and return its visible text, newline-separated.

    Returns "" on any network/HTTP failure.
    """
    try:
        # Explicit timeout: requests.get with no timeout can block forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator='\n')
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching webpage: {e}")
        return ""
def preprocess_text(text):
    """Normalize text before embedding by lowercasing it."""
    lowered = text.lower()
    return lowered
def get_embeddings(text, model):
    """Encode *text* into an embedding vector with the given encoder model."""
    embedding = model.encode(text)
    return embedding
def get_similar_cvs(cvs, job_description_embedding, collection):
    """Rank stored CVs by similarity to the job-description embedding.

    Queries the Chroma collection for up to len(cvs) nearest neighbours and
    returns a list of {"filename", "initial_score"} dicts sorted best-first.
    Results with missing metadata or a missing 'filename' key are logged and
    dropped rather than raising.
    """
    results = collection.query(
        query_embeddings=[job_description_embedding],
        n_results=len(cvs),
        include=["distances", "metadatas"],
    )
    similar_cvs = []
    # Chroma returns one result list per query embedding; we sent exactly one,
    # so index [0]. Pair each metadata entry with its distance directly.
    for i, (metadata, distance) in enumerate(
        zip(results['metadatas'][0], results['distances'][0])
    ):
        if not metadata:
            logger.warning(f"No metadata found for CV at index {i}.")
            continue
        filename = metadata.get('filename')
        if not filename:
            logger.warning(f"Metadata for CV at index {i} is missing 'filename'.")
            continue
        similar_cvs.append({
            "filename": filename,
            # Chroma distances: smaller is closer — invert into a score.
            "initial_score": 1 - distance,
        })
    similar_cvs.sort(key=lambda x: x['initial_score'], reverse=True)
    return similar_cvs
# ... (CV Assessment & Ranking functions)
class RequirementAssessment(BaseModel):
    """LLM verdict for a single job requirement evaluated against one CV."""
    # The requirement text as extracted from the job description.
    requirement: str
    # Constrained to the four allowed verdict labels.
    match: str = Field(pattern="^(Yes|No|Partial|Not Applicable)$")
    # Quoted or paraphrased CV content supporting the verdict.
    evidence: str
    # Free-form explanation of why the verdict was chosen.
    justification: str
class CandidateAssessment(BaseModel):
    """Aggregate LLM assessment of one candidate's CV across all requirements."""
    # The CV file the assessment refers to.
    filename: str
    # Per-requirement verdicts.
    requirements: List[RequirementAssessment]
    # Constrained to the four allowed overall grades.
    overall_assessment: str = Field(pattern="^(Excellent|Good|Fair|Poor)$")
    # Constrained to the three allowed hiring actions.
    recommendation: str = Field(pattern="^(Interview|Reject|Maybe)$")
    # Free-form explanation of the overall grade and recommendation.
    justification: str
import time
import requests
import json
from pydantic import ValidationError
def assess_cv(cv_text, requirements, filename, groq_client, max_retries=3, retry_delay=2):
    """Assess a CV against job requirements via the Groq LLM, with retries.

    Args:
        cv_text: Raw CV text to evaluate.
        requirements: Iterable of requirement strings.
        filename: CV filename, echoed back in the result for downstream ranking.
        groq_client: LangChain ChatGroq client (anything exposing .invoke()).
        max_retries: Attempts before giving up on the LLM call.
        retry_delay: Initial back-off in seconds; doubles after each failure.

    Returns:
        {"filename", "raw_response"} on success, {"filename", "error"} on failure.
    """
    template = os.environ.get("LLM_PROMPT")
    if not template:
        # Fail fast with the function's normal error shape instead of crashing
        # inside ChatPromptTemplate.from_template(None).
        logger.error("LLM_PROMPT environment variable is not set.")
        return {"filename": filename, "error": "LLM prompt template is not configured."}
    requirements_str = "\n".join(f"- {req}" for req in requirements)
    prompt_template = ChatPromptTemplate.from_template(template=template)
    prompt = prompt_template.format_messages(requirements=requirements_str, cv_text=cv_text)
    for attempt in range(max_retries):
        try:
            response = groq_client.invoke(prompt, timeout=30)
            return {"filename": filename, "raw_response": response.content}
        except requests.exceptions.RequestException as e:
            # Transient network failure: retry with exponential back-off.
            logger.error(f"Network error during Groq API call: {e}")
            if attempt == max_retries - 1:
                return {"filename": filename, "error": "Network error during LLM call"}
            logger.warning(f"Network error, retrying in {retry_delay} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(retry_delay)
            retry_delay *= 2
        except Exception as e:
            # Any other client/API failure: same retry policy, different message.
            logger.error(f"Groq API error (attempt {attempt + 1}/{max_retries}): {e}")
            if attempt == max_retries - 1:
                return {"filename": filename, "error": "General LLM failure"}
            logger.warning(f"General LLM error, retrying in {retry_delay} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(retry_delay)
            retry_delay *= 2
    # Defensive: unreachable when max_retries >= 1, kept for max_retries == 0.
    return {"filename": filename, "error": "LLM call failed after multiple retries."}
def extract_job_requirements(job_description, groq_client):
    """Ask the LLM to pull key requirements out of a job description.

    Returns one requirement string per non-blank response line, or [] on failure.
    """
    prompt_template = ChatPromptTemplate.from_template(
        template="Extract the key job requirements from the following job description:\n\n{job_description}\n\nRequirements:"
    )
    messages = prompt_template.format_messages(job_description=job_description)
    try:
        reply = groq_client.invoke(messages, timeout=30)
        lines = reply.content.strip().split('\n')
        return [line.strip() for line in lines if line.strip()]
    except Exception as e:
        logger.error(f"Error extracting job requirements: {e}")
        return []