File size: 7,348 Bytes
0ad22ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import os
import logging
import requests
import json
import PyPDF2
import docx
from bs4 import BeautifulSoup
from chromadb import PersistentClient
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, ValidationError
from typing import List
from sentence_transformers import SentenceTransformer  # Import SentenceTransformer
from dotenv import load_dotenv

# Setup logging
# Configures the root logger once at import time; `logger` is this module's
# named logger, used by every function below for error/warning reporting.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# --- Text Extraction ---
def extract_text(uploaded_file):
    """Dispatch text extraction based on the input's kind.

    A plain string is treated as a URL and fetched; file-like objects are
    routed by their MIME ``type`` attribute (PDF or DOCX). Image-only PDFs
    are skipped with a warning. Anything unrecognized — or any failure —
    yields an empty string.
    """
    pdf_mime = "application/pdf"
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    try:
        if isinstance(uploaded_file, str):
            return extract_text_from_webpage(uploaded_file)
        mime = getattr(uploaded_file, 'type', None)
        if mime == pdf_mime:
            if is_image_pdf(uploaded_file):
                logger.warning(f"Image-based PDF detected: {uploaded_file.name}")
                return ""  # Skip processing
            return extract_text_from_pdf(uploaded_file)
        if mime == docx_mime:
            return extract_text_from_docx(uploaded_file)
        return ""
    except Exception as e:
        logger.error(f"Error extracting text: {e}")
        return ""

def is_image_pdf(uploaded_file):
    """Return True when the PDF exposes no selectable text (likely a scan)."""
    try:
        pages = PyPDF2.PdfReader(uploaded_file).pages
        # Any page with extractable text means this is a normal text PDF;
        # any() short-circuits on the first such page, like the explicit loop.
        return not any(page.extract_text() for page in pages)
    except Exception as e:
        logger.error(f"Error checking if PDF is image-based: {e}")
        return True  # Fail closed: treat unreadable files as image-based

def extract_text_from_pdf(uploaded_file):
    """Concatenate the text of every PDF page, newline-separated; "" on error."""
    try:
        pdf = PyPDF2.PdfReader(uploaded_file)
        # extract_text() may return None for empty pages; substitute "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(page_texts)
    except Exception as e:
        logger.error(f"Error reading PDF {uploaded_file.name}: {e}")
        return ""

def extract_text_from_docx(uploaded_file):
    """Join the text of all paragraphs in a DOCX document with newlines; "" on error."""
    try:
        document = docx.Document(uploaded_file)
        paragraph_texts = (para.text for para in document.paragraphs)
        return "\n".join(paragraph_texts)
    except Exception as e:
        logger.error(f"Error reading DOCX: {e}")
        return ""

def extract_text_from_webpage(url):
    """Fetch *url* and return its visible text (one node per line); "" on HTTP errors."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        parsed = BeautifulSoup(response.content, 'html.parser')
        return parsed.get_text(separator='\n')
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching webpage: {e}")
        return ""

def preprocess_text(text):
    """Normalize raw text before embedding; currently just case-folds to lowercase."""
    lowered = text.lower()
    return lowered

def get_embeddings(text, model):
    """Encode *text* into an embedding vector via the model's ``encode`` method."""
    embedding = model.encode(text)
    return embedding

def get_similar_cvs(cvs, job_description_embedding, collection):
    """Rank stored CVs by similarity to the job description.

    Queries the collection for the ``len(cvs)`` nearest entries and converts
    each returned distance into a similarity score (1 - distance). Entries
    whose metadata is missing or lacks a 'filename' are skipped with a
    warning. Returns a list of {"filename", "initial_score"} dicts sorted
    best-first.
    """
    results = collection.query(
        query_embeddings=[job_description_embedding],
        n_results=len(cvs),
        include=["distances", "metadatas"]
    )

    # Chroma nests results one level per query; we issued a single query.
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    ranked = []
    for idx, (metadata, distance) in enumerate(zip(metadatas, distances)):
        if not metadata:
            logger.warning(f"No metadata found for CV at index {idx}.")
            continue
        filename = metadata.get('filename')
        if not filename:
            logger.warning(f"Metadata for CV at index {idx} is missing 'filename'.")
            continue
        ranked.append({
            "filename": filename,
            "initial_score": 1 - distance
        })

    ranked.sort(key=lambda item: item['initial_score'], reverse=True)
    return ranked

# ... (CV Assessment & Ranking functions)

class RequirementAssessment(BaseModel):
    """Structured verdict for a single job requirement evaluated against one CV."""
    # The requirement text (as extracted from the job description).
    requirement: str
    # Constrained verdict label; the regex pattern rejects any other value.
    match: str = Field(pattern="^(Yes|No|Partial|Not Applicable)$")
    # Supporting excerpt/evidence from the CV for the verdict.
    evidence: str
    # Free-text explanation of why this verdict was chosen.
    justification: str

class CandidateAssessment(BaseModel):
    """Aggregate assessment of one candidate's CV across all job requirements."""
    # Source CV filename, used to tie the assessment back to the upload.
    filename: str
    # Per-requirement verdicts for this candidate.
    requirements: List[RequirementAssessment]
    # Overall grade; the regex pattern restricts it to four labels.
    overall_assessment: str = Field(pattern="^(Excellent|Good|Fair|Poor)$")
    # Hiring recommendation, restricted to three actions.
    recommendation: str = Field(pattern="^(Interview|Reject|Maybe)$")
    # Free-text rationale for the overall grade and recommendation.
    justification: str


# NOTE: `requests`, `json`, and `ValidationError` were re-imported here but are
# already imported at the top of this file; only `time` is newly needed (for
# the retry backoff in assess_cv below). Ideally this would also live at the
# top of the file.
import time


def assess_cv(cv_text, requirements, filename, groq_client, max_retries=3, retry_delay=2):
    """Assess a CV against job requirements via the Groq LLM.

    Builds a prompt from the ``LLM_PROMPT`` environment variable (which is
    expected to contain ``{requirements}`` and ``{cv_text}`` placeholders),
    then invokes the model with exponential-backoff retries.

    Returns ``{"filename", "raw_response"}`` on success, or
    ``{"filename", "error"}`` on a missing template or after exhausting
    retries. `retry_delay` doubles after each failed attempt.
    """
    template = os.environ.get("LLM_PROMPT")
    if not template:
        # Fail fast with the same error-dict shape the retry paths use,
        # instead of letting from_template(None) raise an opaque error.
        logger.error("LLM_PROMPT environment variable is not set or empty.")
        return {"filename": filename, "error": "Missing LLM_PROMPT template"}

    requirements_str = "\n".join(f"- {req}" for req in requirements)
    prompt_template = ChatPromptTemplate.from_template(template=template)
    prompt = prompt_template.format_messages(requirements=requirements_str, cv_text=cv_text)

    for attempt in range(max_retries):
        try:
            response = groq_client.invoke(prompt, timeout=30)
            # NOTE(review): the raw LLM text is returned unparsed; presumably
            # the pydantic assessment models above are applied by the caller —
            # confirm.
            return {"filename": filename, "raw_response": response.content}

        except requests.exceptions.RequestException as e:
            # Transient network failure: retry with exponential backoff.
            logger.error(f"Network error during Groq API call: {e}")
            if attempt == max_retries - 1:
                return {"filename": filename, "error": "Network error during LLM call"}
            logger.warning(f"Network error, retrying in {retry_delay} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(retry_delay)
            retry_delay *= 2

        except Exception as e:
            # Any other failure (rate limits, malformed response, SDK errors).
            logger.error(f"Groq API error (attempt {attempt + 1}/{max_retries}): {e}")
            if attempt == max_retries - 1:
                return {"filename": filename, "error": "General LLM failure"}
            logger.warning(f"General LLM error, retrying in {retry_delay} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(retry_delay)
            retry_delay *= 2

    return {"filename": filename, "error": "LLM call failed after multiple retries."}

def extract_job_requirements(job_description, groq_client):
    """Ask the LLM to pull out the key requirements from a job description.

    Returns one requirement per non-empty line of the model's reply, or an
    empty list if the call fails.
    """
    prompt_template = ChatPromptTemplate.from_template(
        template="Extract the key job requirements from the following job description:\n\n{job_description}\n\nRequirements:"
    )
    prompt = prompt_template.format_messages(job_description=job_description)

    try:
        response = groq_client.invoke(prompt, timeout=30)
        raw = response.content.strip()
        # One requirement per non-blank line of the reply.
        return [line.strip() for line in raw.split('\n') if line.strip()]
    except Exception as e:
        logger.error(f"Error extracting job requirements: {e}")
        return []