import os
import logging
import requests
import json
import PyPDF2
import docx
from bs4 import BeautifulSoup
from chromadb import PersistentClient
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, ValidationError
from typing import List
from sentence_transformers import SentenceTransformer  # Import SentenceTransformer
from dotenv import load_dotenv

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Timeout (seconds) for outbound HTTP requests so a stalled server cannot
# hang the pipeline indefinitely.
REQUEST_TIMEOUT = 30


# --- Text Extraction ---
def extract_text(uploaded_file):
    """Dispatch text extraction based on the input type.

    Args:
        uploaded_file: either a URL string, or an uploaded file object
            exposing ``.type`` (MIME) and ``.name`` (PDF / DOCX uploads).

    Returns:
        Extracted text, or "" for unsupported types, image-only PDFs,
        or any extraction error.
    """
    try:
        if isinstance(uploaded_file, str):
            return extract_text_from_webpage(uploaded_file)
        elif hasattr(uploaded_file, 'type') and uploaded_file.type == "application/pdf":
            if is_image_pdf(uploaded_file):
                logger.warning(f"Image-based PDF detected: {uploaded_file.name}")
                return ""  # Skip processing
            # BUGFIX: is_image_pdf() consumed the file stream via PdfReader;
            # rewind before re-reading, otherwise the second parse sees an
            # exhausted stream and extracts nothing.
            if hasattr(uploaded_file, 'seek'):
                uploaded_file.seek(0)
            return extract_text_from_pdf(uploaded_file)
        elif hasattr(uploaded_file, 'type') and uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            return extract_text_from_docx(uploaded_file)
        return ""
    except Exception as e:
        logger.error(f"Error extracting text: {e}")
        return ""


def is_image_pdf(uploaded_file):
    """Check if a PDF is image-based (contains no selectable text).

    NOTE: this consumes the file stream; callers that re-read the file
    afterwards must seek(0) first (see extract_text).
    """
    try:
        reader = PyPDF2.PdfReader(uploaded_file)
        for page in reader.pages:
            if page.extract_text():
                return False  # Text is present, so not an image PDF
        return True  # No text detected, likely an image-based PDF
    except Exception as e:
        logger.error(f"Error checking if PDF is image-based: {e}")
        return True  # Assume image PDF if error occurs


def extract_text_from_pdf(uploaded_file):
    """Return the concatenated text of every page in the PDF, "" on error."""
    try:
        reader = PyPDF2.PdfReader(uploaded_file)
        # extract_text() may return None for empty pages; coerce to "".
        return "\n".join([page.extract_text() or "" for page in reader.pages])
    except Exception as e:
        logger.error(f"Error reading PDF {uploaded_file.name}: {e}")
        return ""


def extract_text_from_docx(uploaded_file):
    """Return the newline-joined paragraph text of a DOCX file, "" on error."""
    try:
        doc = docx.Document(uploaded_file)
        return "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        logger.error(f"Error reading DOCX: {e}")
        return ""


def extract_text_from_webpage(url):
    """Fetch *url* and return its visible text, "" on any network error."""
    try:
        # BUGFIX: a request without a timeout can block forever on a
        # non-responsive server.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator='\n')
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching webpage: {e}")
        return ""


def preprocess_text(text):
    """Normalize text before embedding (currently just lowercasing)."""
    return text.lower()


def get_embeddings(text, model):
    """Encode *text* with the given SentenceTransformer-style model."""
    return model.encode(text)


def get_similar_cvs(cvs, job_description_embedding, collection):
    """Rank stored CVs by similarity to the job-description embedding.

    Queries the ChromaDB *collection* and converts each returned cosine
    distance into a similarity score (1 - distance).

    Returns:
        List of {"filename", "initial_score"} dicts, sorted by score
        descending; entries with missing metadata/filename are skipped
        with a warning.
    """
    results = collection.query(
        query_embeddings=[job_description_embedding],
        n_results=len(cvs),
        include=["distances", "metadatas"],
    )
    similar_cvs = []
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]
    for i, (metadata, distance) in enumerate(zip(metadatas, distances)):
        if not metadata:  # Check if metadata exists
            logger.warning(f"No metadata found for CV at index {i}.")
            continue
        filename = metadata.get('filename')  # Use .get to handle missing keys
        if not filename:  # Check if filename exists in metadata
            logger.warning(f"Metadata for CV at index {i} is missing 'filename'.")
            continue
        similar_cvs.append({
            "filename": filename,
            "initial_score": 1 - distance,  # cosine distance -> similarity
        })
    similar_cvs.sort(key=lambda x: x['initial_score'], reverse=True)
    return similar_cvs
# ... (CV Assessment & Ranking functions)
import time  # only genuinely new import here; requests/json/ValidationError are already imported at the top of the file


class RequirementAssessment(BaseModel):
    """Structured LLM assessment of one job requirement against a CV."""
    requirement: str
    match: str = Field(pattern="^(Yes|No|Partial|Not Applicable)$")
    evidence: str
    justification: str


class CandidateAssessment(BaseModel):
    """Structured overall LLM assessment of a single candidate."""
    filename: str
    requirements: List[RequirementAssessment]
    overall_assessment: str = Field(pattern="^(Excellent|Good|Fair|Poor)$")
    recommendation: str = Field(pattern="^(Interview|Reject|Maybe)$")
    justification: str


def assess_cv(cv_text, requirements, filename, groq_client, max_retries=3, retry_delay=2):
    """Assess CV against specific job requirements with Tree-of-Thoughts.

    Args:
        cv_text: extracted CV text.
        requirements: list of requirement strings to assess against.
        filename: CV filename, echoed back in the result dict.
        groq_client: LLM client exposing ``.invoke(prompt, timeout=...)``.
        max_retries: attempts before giving up.
        retry_delay: initial backoff in seconds; doubled after each failure.

    Returns:
        {"filename", "raw_response"} on success, or {"filename", "error"}
        on configuration/network/LLM failure.
    """
    requirements_str = "\n".join([f"- {req}" for req in requirements])
    # BUGFIX: os.environ.get() returns None when LLM_PROMPT is unset, and
    # ChatPromptTemplate.from_template(None) would crash. Fail fast with an
    # error dict consistent with the other failure paths below.
    prompt_source = os.environ.get("LLM_PROMPT")
    if not prompt_source:
        logger.error("LLM_PROMPT environment variable is not set.")
        return {"filename": filename, "error": "Missing LLM_PROMPT configuration"}
    prompt_template = ChatPromptTemplate.from_template(template=prompt_source)
    prompt = prompt_template.format_messages(requirements=requirements_str, cv_text=cv_text)
    for attempt in range(max_retries):
        try:
            response = groq_client.invoke(prompt, timeout=30)
            return {"filename": filename, "raw_response": response.content}
        except requests.exceptions.RequestException as e:
            # Transient network failure: retry with exponential backoff.
            logger.error(f"Network error during Groq API call: {e}")
            if attempt == max_retries - 1:
                return {"filename": filename, "error": "Network error during LLM call"}
            logger.warning(f"Network error, retrying in {retry_delay} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(retry_delay)
            retry_delay *= 2
        except Exception as e:
            # Any other LLM/client failure: same backoff policy.
            logger.error(f"Groq API error (attempt {attempt + 1}/{max_retries}): {e}")
            if attempt == max_retries - 1:
                return {"filename": filename, "error": "General LLM failure"}
            logger.warning(f"General LLM error, retrying in {retry_delay} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(retry_delay)
            retry_delay *= 2
    return {"filename": filename, "error": "LLM call failed after multiple retries."}


def extract_job_requirements(job_description, groq_client):
    """Extracts job requirements from the job description using the LLM.

    Returns one requirement per non-blank line of the LLM response, or []
    on any error.
    """
    prompt_template = ChatPromptTemplate.from_template(
        template="Extract the key job requirements from the following job description:\n\n{job_description}\n\nRequirements:"
    )
    prompt = prompt_template.format_messages(job_description=job_description)
    try:
        response = groq_client.invoke(prompt, timeout=30)
        requirements_text = response.content.strip()
        return [req.strip() for req in requirements_text.split('\n') if req.strip()]
    except Exception as e:
        logger.error(f"Error extracting job requirements: {e}")
        return []