# NOTE: This file was scraped from a Hugging Face Space page; the original
# header ("Spaces: / Sleeping / Sleeping") was page-status residue, not code.
| import os | |
| import logging | |
| import requests | |
| import json | |
| import PyPDF2 | |
| import docx | |
| from bs4 import BeautifulSoup | |
| from chromadb import PersistentClient | |
| from langchain_groq import ChatGroq | |
| from langchain.prompts import ChatPromptTemplate | |
| from langchain.output_parsers import PydanticOutputParser | |
| from pydantic import BaseModel, Field, ValidationError | |
| from typing import List | |
| from sentence_transformers import SentenceTransformer # Import SentenceTransformer | |
| from dotenv import load_dotenv | |
# Setup logging
# Module-wide configuration: timestamped INFO-level messages to stderr.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)  # Shared by every helper in this module.
| # --- Text Extraction --- | |
def extract_text(uploaded_file):
    """Extract plain text from a URL string, a PDF upload, or a DOCX upload.

    Args:
        uploaded_file: Either a URL string, or a file-like upload object
            exposing ``type`` (MIME string) and ``name`` attributes
            (presumably a Streamlit UploadedFile — TODO confirm at caller).

    Returns:
        The extracted text, or "" when the input is unsupported,
        image-based, or extraction fails.
    """
    try:
        if isinstance(uploaded_file, str):
            # A plain string is treated as a webpage URL.
            return extract_text_from_webpage(uploaded_file)

        # Dispatch on MIME type; a missing attribute means "unsupported".
        mime = getattr(uploaded_file, 'type', None)
        if mime == "application/pdf":
            if is_image_pdf(uploaded_file):
                logger.warning(f"Image-based PDF detected: {uploaded_file.name}")
                return ""  # No selectable text to extract.
            # is_image_pdf has already read the stream; rewind so the PDF
            # is parsed again from the start instead of from EOF.
            if hasattr(uploaded_file, 'seek'):
                uploaded_file.seek(0)
            return extract_text_from_pdf(uploaded_file)
        if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            return extract_text_from_docx(uploaded_file)
        return ""
    except Exception as e:
        logger.error(f"Error extracting text: {e}")
        return ""
def is_image_pdf(uploaded_file):
    """Return True when the PDF exposes no selectable text on any page."""
    try:
        pages = PyPDF2.PdfReader(uploaded_file).pages
        # One page with extractable text is enough to rule out an image PDF;
        # any() short-circuits exactly like the page-by-page check would.
        return not any(page.extract_text() for page in pages)
    except Exception as exc:
        logger.error(f"Error checking if PDF is image-based: {exc}")
        return True  # Treat unreadable files as image PDFs.
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page from a PDF, newline-joined.

    Args:
        uploaded_file: A file-like object containing PDF data.

    Returns:
        Page texts joined by newlines, or "" on any parse error.
    """
    try:
        reader = PyPDF2.PdfReader(uploaded_file)
        # extract_text() can return None for pages without a text layer.
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # getattr: raw streams may lack .name — the old handler raised
        # AttributeError here, masking the real parse error.
        logger.error(f"Error reading PDF {getattr(uploaded_file, 'name', '<stream>')}: {e}")
        return ""
def extract_text_from_docx(uploaded_file):
    """Return the text of every paragraph in a DOCX file, newline-joined."""
    try:
        document = docx.Document(uploaded_file)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts)
    except Exception as error:
        logger.error(f"Error reading DOCX: {error}")
        return ""
def extract_text_from_webpage(url, timeout=30):
    """Download a webpage and return its visible text.

    Args:
        url: The page URL to fetch.
        timeout: Seconds to wait for the HTTP response. New parameter with a
            default, so existing callers are unaffected; previously the
            request had no timeout and could hang forever on a stalled host.

    Returns:
        The page text with elements separated by newlines, or "" on any
        network/HTTP error.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator='\n')
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching webpage: {e}")
        return ""
def preprocess_text(text):
    """Normalize document text before embedding: lowercase everything."""
    lowered = text.lower()
    return lowered
def get_embeddings(text, model):
    """Encode *text* into an embedding vector with the supplied model."""
    embedding = model.encode(text)
    return embedding
def get_similar_cvs(cvs, job_description_embedding, collection):
    """Rank stored CVs by similarity to the job-description embedding.

    Args:
        cvs: The uploaded CVs; only their count is used (as n_results).
        job_description_embedding: Embedding vector for the job description.
        collection: A chromadb collection supporting .query().

    Returns:
        A list of {"filename", "initial_score"} dicts, highest score first.
    """
    results = collection.query(
        query_embeddings=[job_description_embedding],
        n_results=len(cvs),
        include=["distances", "metadatas"],
    )

    # Chroma nests results per query; we issued a single query, hence [0].
    metadatas = results['metadatas'][0]
    ranked = []
    for index, metadata in enumerate(metadatas):
        if not metadata:
            logger.warning(f"No metadata found for CV at index {index}.")
            continue
        filename = metadata.get('filename')
        if not filename:
            logger.warning(f"Metadata for CV at index {index} is missing 'filename'.")
            continue
        # Chroma reports distances; flip into a similarity score.
        score = 1 - results['distances'][0][index]
        ranked.append({"filename": filename, "initial_score": score})

    ranked.sort(key=lambda entry: entry['initial_score'], reverse=True)
    return ranked
| # ... (CV Assessment & Ranking functions) | |
class RequirementAssessment(BaseModel):
    """Structured LLM verdict for a single job requirement against one CV."""

    # The requirement text being assessed.
    requirement: str
    # Regex-constrained to one of the four allowed verdict values.
    match: str = Field(pattern="^(Yes|No|Partial|Not Applicable)$")
    # Supporting quote/summary pulled from the CV.
    evidence: str
    # The model's reasoning for this verdict.
    justification: str
class CandidateAssessment(BaseModel):
    """Full structured assessment of one CV against all job requirements."""

    # Name of the CV file this assessment refers to.
    filename: str
    # One RequirementAssessment per extracted job requirement.
    requirements: List[RequirementAssessment]
    # Regex-constrained overall grade.
    overall_assessment: str = Field(pattern="^(Excellent|Good|Fair|Poor)$")
    # Regex-constrained hiring recommendation.
    recommendation: str = Field(pattern="^(Interview|Reject|Maybe)$")
    # The model's reasoning for the overall grade and recommendation.
    justification: str
| import time | |
| import requests | |
| import json | |
| from pydantic import ValidationError | |
def assess_cv(cv_text, requirements, filename, groq_client, max_retries=3, retry_delay=2):
    """Assess a CV against job requirements via the LLM, with retries.

    Args:
        cv_text: Plain text extracted from the candidate's CV.
        requirements: Requirement strings the CV is assessed against.
        filename: CV filename, echoed back in the result dict.
        groq_client: LangChain chat client exposing .invoke().
        max_retries: Attempts before giving up on the LLM call.
        retry_delay: Initial back-off in seconds; doubles after each retry.

    Returns:
        {"filename", "raw_response"} on success, or {"filename", "error"}
        describing the failure.
    """
    prompt_text = os.environ.get("LLM_PROMPT")
    if not prompt_text:
        # Fail fast in the function's usual error shape instead of letting
        # ChatPromptTemplate.from_template(None) raise an opaque exception.
        return {"filename": filename, "error": "LLM prompt template is not configured."}

    requirements_str = "\n".join(f"- {req}" for req in requirements)
    prompt_template = ChatPromptTemplate.from_template(template=prompt_text)
    prompt = prompt_template.format_messages(requirements=requirements_str, cv_text=cv_text)

    for attempt in range(max_retries):
        try:
            response = groq_client.invoke(prompt, timeout=30)
            return {"filename": filename, "raw_response": response.content}
        except Exception as e:
            # Network failures and other LLM errors share one retry policy;
            # only the logged/reported wording differs.
            is_network = isinstance(e, requests.exceptions.RequestException)
            if is_network:
                logger.error(f"Network error during Groq API call: {e}")
            else:
                logger.error(f"Groq API error (attempt {attempt + 1}/{max_retries}): {e}")
            if attempt == max_retries - 1:
                error = "Network error during LLM call" if is_network else "General LLM failure"
                return {"filename": filename, "error": error}
            kind = "Network error" if is_network else "General LLM error"
            logger.warning(f"{kind}, retrying in {retry_delay} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(retry_delay)
            retry_delay *= 2  # Exponential back-off.
    return {"filename": filename, "error": "LLM call failed after multiple retries."}
def extract_job_requirements(job_description, groq_client):
    """Ask the LLM to pull the key requirements out of a job description.

    Args:
        job_description: Free-text job posting.
        groq_client: LangChain chat client exposing .invoke().

    Returns:
        A list of non-empty requirement strings (one per response line),
        or [] when the LLM call fails.
    """
    prompt_template = ChatPromptTemplate.from_template(
        template="Extract the key job requirements from the following job description:\n\n{job_description}\n\nRequirements:"
    )
    prompt = prompt_template.format_messages(job_description=job_description)
    try:
        response = groq_client.invoke(prompt, timeout=30)
        # Keep only the non-blank lines of the model's answer.
        lines = response.content.strip().split('\n')
        return [line.strip() for line in lines if line.strip()]
    except Exception as error:
        logger.error(f"Error extracting job requirements: {error}")
        return []