"""Download a CV from a URL and extract its text content."""

import os
import tempfile
from io import BytesIO

import requests
from dotenv import load_dotenv
from langchain_community.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    UnstructuredFileLoader,
)

from yolo_text_extraction import pdf_to_text

load_dotenv()


class CV:
    """A CV document reachable over HTTP whose text can be extracted.

    Attributes are plain instance attributes:
      * ``file_url``   -- the URL the document is downloaded from.
      * ``doc_loader`` -- maps a lower-case file extension to the loader
        class used to parse that file type.
    """

    # Seconds to wait for the server (connect / between reads) before
    # giving up; without a timeout ``requests`` can block indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, file_url):
        """Store the document URL and the extension -> loader mapping.

        Args:
            file_url: URL of the CV file. Any query string is ignored when
                the file extension is determined.
        """
        self.file_url = file_url
        self.doc_loader = {
            ".pdf": PyPDFLoader,
            ".docx": Docx2txtLoader,
            ".txt": UnstructuredFileLoader,
        }

    def get_cv_text(self):
        """Download the CV and return its text.

        The file is fetched from ``self.file_url`` and parsed with the loader
        registered for its extension. If the extension has no registered
        loader, or the loader produces no non-blank text, the raw bytes are
        handed to ``pdf_to_text`` instead.

        Returns:
            The page contents joined by newlines, or whatever
            ``pdf_to_text`` returns on the fallback path.

        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status.
        """
        # Download the file (the original code comment names Supabase as
        # the storage backend).
        response = requests.get(self.file_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx rather than parsing an error page as a CV.
        response.raise_for_status()
        file_content = BytesIO(response.content)

        # Determine the extension from the URL path, dropping query parameters.
        _, ext = os.path.splitext(self.file_url.split("?")[0])
        loader_cls = self.doc_loader.get(ext.lower())
        if loader_cls is None:
            return pdf_to_text(file_content)

        # The loaders take a filesystem path, not an in-memory buffer, so the
        # download is spilled to a temporary file for every supported type.
        # delete=False lets the loader reopen the file by name after the
        # handle is closed; the file is removed explicitly below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
            temp_file.write(response.content)
            temp_file_path = temp_file.name

        try:
            data = loader_cls(temp_file_path).load()
        finally:
            # Always remove the temporary file, even when parsing fails.
            os.unlink(temp_file_path)

        # An empty ``data`` list joins to "", which also takes the fallback.
        text = "\n".join(page.page_content for page in data)
        return text if text.strip() else pdf_to_text(file_content)

    def analyse_cv_quality(self):
        """Extract the CV text and return the result of ``analyze_cv`` on it.

        Returns:
            Whatever ``cv_analyzer.analyze_cv`` returns for the extracted text.
        """
        # Imported at call time, as in the original code
        # (NOTE(review): presumably to avoid a circular import -- confirm).
        from cv_analyzer import analyze_cv

        cv_text = self.get_cv_text()
        return analyze_cv(cv_text)