Spaces:
Sleeping
Sleeping
| import os | |
| import requests | |
| from io import BytesIO | |
| from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredFileLoader | |
| from dotenv import load_dotenv | |
| from yolo_text_extraction import pdf_to_text | |
| load_dotenv() | |
class CV:
    """A CV document fetched from a URL and converted to plain text.

    The file is downloaded over HTTP, parsed with the loader registered for
    its extension in ``doc_loader``, and handed to ``pdf_to_text`` whenever no
    loader is registered or the loader yields no usable text.
    """

    # Seconds passed to ``requests.get`` as ``timeout`` so a stalled server
    # cannot block the caller indefinitely.
    DOWNLOAD_TIMEOUT_SECONDS = 30

    def __init__(self, file_url):
        """Store the download URL and the extension -> loader registry.

        Args:
            file_url: URL of the CV file. Anything after ``?`` or ``#`` is
                ignored when the file extension is determined.
        """
        self.file_url = file_url
        self.doc_loader = {
            ".pdf": PyPDFLoader,
            ".docx": Docx2txtLoader,
            ".txt": UnstructuredFileLoader,
        }

    @staticmethod
    def _extension_of(url):
        """Return the lower-cased file extension of *url* ('' if none)."""
        # Drop the query string and fragment first so that signed URLs such as
        # ``cv.pdf?token=...`` still resolve to ``.pdf``.
        path = url.split("?", 1)[0].split("#", 1)[0]
        return os.path.splitext(path)[1].lower()

    def get_cv_text(self):
        """Download the CV and return its text content.

        Returns:
            The page contents produced by the matching loader joined with
            newlines, or the result of ``pdf_to_text`` on the downloaded bytes
            when the extension has no loader or the loader produced only
            whitespace.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            requests.RequestException: on connection failures or timeouts.
        """
        # Imported locally so this method does not depend on the module's
        # top-level import list, which does not include ``tempfile``.
        import tempfile

        response = requests.get(
            self.file_url, timeout=self.DOWNLOAD_TIMEOUT_SECONDS
        )
        # Fail loudly instead of parsing an HTML error page as if it were a CV.
        response.raise_for_status()
        file_content = BytesIO(response.content)

        ext = self._extension_of(self.file_url)
        loader_cls = self.doc_loader.get(ext)
        if loader_cls is None:
            # Unknown extension: let the fallback extractor try the raw bytes.
            return pdf_to_text(file_content)

        # The loaders are constructed from a filesystem path (PyPDFLoader
        # included), so the downloaded bytes are written to a temporary file.
        # The file is closed before loading and removed manually afterwards,
        # which also works on platforms that forbid reopening an open
        # temporary file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
            temp_file.write(file_content.getvalue())
            temp_file_path = temp_file.name
        try:
            data = loader_cls(temp_file_path).load()
        finally:
            # Always clean up, even when the loader raises.
            os.unlink(temp_file_path)

        text = "\n".join(page.page_content for page in data)
        # An empty result usually means the loader found no text layer;
        # defer to the fallback extractor in that case.
        return text if text.strip() else pdf_to_text(file_content)

    def analyse_cv_quality(self):
        """Extract the CV text and return ``analyze_cv``'s verdict on it."""
        # Kept as a function-scope import, as in the original code
        # (presumably to avoid a circular or costly import — TODO confirm).
        from cv_analyzer import analyze_cv

        return analyze_cv(self.get_cv_text())