import base64
import logging
import os
from typing import Optional

from mistralai import Mistral, DocumentURLChunk
from mistralai.extra import response_format_from_pydantic_model

from model import ApplicantDocument
from .constant import MISTRAL_OCR_MODEL

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class CVProcessor:
    """Send CV documents to the Mistral OCR endpoint and collect annotations.

    Each OCR request asks for a document annotation formatted after the
    ``ApplicantDocument`` pydantic model and is limited to the first
    ``OCR_PAGE_COUNT`` page indices of the document.
    """

    # Number of leading page indices (0-based) sent with every OCR request.
    # NOTE(review): documents shorter than this are not special-cased here;
    # confirm how the OCR API treats out-of-range page indices.
    OCR_PAGE_COUNT = 2

    def __init__(self, api_key: Optional[str] = None):
        """Create the Mistral client.

        Args:
            api_key: Mistral API key. When ``None``, the value of the
                ``MISTRAL_API_KEY`` environment variable is used instead.

        Raises:
            ValueError: If no non-empty API key is available.
        """
        if api_key is None:
            # Use the default API key from environment variable
            api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("API key for Mistral is not set.")
        self.client = Mistral(api_key=api_key)

    @staticmethod
    def encode_base64(file_path: str) -> Optional[str]:
        """Encode the pdf/image file to base64.

        This is deliberately best-effort: failures are logged, not raised.

        Args:
            file_path: Path of the file to read in binary mode.

        Returns:
            The base64 text (decoded as UTF-8) of the file's bytes, or
            ``None`` if the file is missing or any other error occurs.
        """
        try:
            with open(file_path, "rb") as _file:
                return base64.b64encode(_file.read()).decode("utf-8")
        except FileNotFoundError:
            logger.error("Error: The file %s was not found.", file_path)
            return None
        except Exception as e:
            # Kept broad on purpose so callers keep receiving None on failure.
            logger.error("Error: %s", e)
            return None

    def _extract_annotation(self, document):
        """Run the OCR model on ``document`` and return its annotation.

        Args:
            document: Document descriptor accepted by ``client.ocr.process``
                (callers in this class pass either a ``document_url`` dict
                or a ``DocumentURLChunk``).

        Returns:
            The ``document_annotation`` attribute of the OCR response.
        """
        ocr_response = self.client.ocr.process(
            model=MISTRAL_OCR_MODEL,
            pages=list(range(self.OCR_PAGE_COUNT)),
            document=document,
            document_annotation_format=response_format_from_pydantic_model(
                ApplicantDocument
            ),
            include_image_base64=False,
        )
        return ocr_response.document_annotation

    def get_cv_content_from_base64(self, base64_pdf: str) -> dict:
        """Annotate a CV supplied as base64-encoded PDF content.

        The content is sent inline as an ``application/pdf`` data URL.

        Args:
            base64_pdf: Base64 text of the PDF file.

        Returns:
            ``{"cv": {"file_content": <base64_pdf>, "annotation": <...>}}``
            where ``annotation`` is the OCR response's document annotation.

        Raises:
            ValueError: If ``base64_pdf`` is empty or ``None``.
        """
        if not base64_pdf:
            raise ValueError("Base64 PDF content is empty.")

        response = {
            "cv": {
                "file_content": base64_pdf,
            }
        }

        # Use the OCR model to extract text from the PDF
        logger.info("Processing OCR for CV file from base64 content")
        response["cv"]["annotation"] = self._extract_annotation(
            {
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}",
            }
        )
        return response

    def upload_cv(self, pdf_path: str) -> dict:
        """Upload a CV PDF file and return its upload details.

        Args:
            pdf_path: Path of the PDF file to upload.

        Returns:
            A dict with the keys ``file_id``, ``file_name`` and ``file_url``
            (``file_url`` being the signed URL obtained for the upload).

        Raises:
            OSError: If ``pdf_path`` cannot be opened for reading.
        """
        with open(pdf_path, "rb") as pdf_file:
            uploaded_pdf = self.client.files.upload(
                file={
                    "file_name": os.path.basename(pdf_path),
                    "content": pdf_file,
                },
                purpose="ocr",
            )
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
        return {
            "file_id": uploaded_pdf.id,
            "file_name": uploaded_pdf.filename,
            "file_url": signed_url.url,
        }

    def get_cv_content(self, pdf_path: str) -> dict:
        """Upload a CV PDF from disk and annotate it through OCR.

        Args:
            pdf_path: Path of the PDF file to process.

        Returns:
            ``{"cv": {...}}`` where the inner dict holds ``file_path``, the
            ``file_id`` / ``file_name`` / ``file_url`` entries produced by
            ``upload_cv``, and ``annotation`` (the OCR document annotation).

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"The file {pdf_path} does not exist.")

        response = {
            "cv": {
                "file_path": pdf_path,
            }
        }

        # Upload the CV PDF file and get the signed URL
        logger.info("Uploading CV PDF file: %s", pdf_path)
        upload_info = self.upload_cv(pdf_path)
        response["cv"].update(upload_info)

        # Use the OCR model to extract text from the PDF
        logger.info("Processing OCR for cv file: %s", pdf_path)
        response["cv"]["annotation"] = self._extract_annotation(
            DocumentURLChunk(document_url=response["cv"]["file_url"])
        )
        return response


if __name__ == "__main__":
    pass