|
|
import base64
import logging
import os
from typing import Optional

from mistralai import Mistral, DocumentURLChunk
from mistralai.extra import response_format_from_pydantic_model

from model import ApplicantDocument
from .constant import MISTRAL_OCR_MODEL
|
|
|
|
|
# Module-level logger, named after this module, used by CVProcessor below.
logger = logging.getLogger(__name__)

# NOTE(review): this configures the root logger at import time, which affects
# every application importing this module - confirm that is intended.
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
class CVProcessor:
    """Upload CV documents to Mistral and extract structured annotations via OCR.

    The annotation schema is derived from the ``ApplicantDocument`` model and
    the OCR model name comes from ``MISTRAL_OCR_MODEL``.
    """

    # Zero-based indices of the pages submitted to OCR (the first two pages).
    # Kept as a class attribute so subclasses can override it without
    # touching the request code.
    OCR_PAGES = (0, 1)

    def __init__(self, api_key: Optional[str] = None):
        """Create the Mistral client.

        Args:
            api_key: Mistral API key. When ``None``, the ``MISTRAL_API_KEY``
                environment variable is used instead.

        Raises:
            ValueError: If no non-empty API key is available.
        """
        if api_key is None:
            api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("API key for Mistral is not set.")
        self.client = Mistral(api_key=api_key)

    @staticmethod
    def encode_base64(file_path: str) -> Optional[str]:
        """Encode the pdf/image file to base64.

        Args:
            file_path: Path of the file to read in binary mode.

        Returns:
            The base64 text of the file content, or ``None`` when the file
            could not be read (the failure is logged, not raised).
        """
        try:
            with open(file_path, "rb") as _file:
                return base64.b64encode(_file.read()).decode("utf-8")
        except FileNotFoundError:
            logger.error("Error: The file %s was not found.", file_path)
            return None
        except Exception as e:
            # Best-effort contract: callers receive None instead of an
            # exception. logger.exception keeps the traceback in the log so
            # the swallowed failure can still be diagnosed.
            logger.exception("Error: %s", e)
            return None

    def _annotate(self, document):
        """Run OCR on ``document`` and return the response's document annotation."""
        ocr_response = self.client.ocr.process(
            model=MISTRAL_OCR_MODEL,
            pages=list(self.OCR_PAGES),
            document=document,
            document_annotation_format=response_format_from_pydantic_model(
                ApplicantDocument
            ),
            include_image_base64=False,
        )
        return ocr_response.document_annotation

    def get_cv_content_from_base64(self, base64_pdf: str) -> dict:
        """Extract the CV annotation from base64-encoded PDF content.

        The content is sent inline as an ``application/pdf`` data URL.

        Args:
            base64_pdf: Base64 text of the PDF file.

        Returns:
            ``{"cv": {"file_content": <base64_pdf>, "annotation": <annotation>}}``
            where the annotation is whatever the OCR response exposes as
            ``document_annotation``.

        Raises:
            ValueError: If ``base64_pdf`` is empty.
        """
        if not base64_pdf:
            raise ValueError("Base64 PDF content is empty.")

        response = {
            "cv": {
                "file_content": base64_pdf,
            }
        }

        logger.info("Processing OCR for CV file from base64 content")
        response["cv"]["annotation"] = self._annotate(
            {
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}",
            }
        )
        return response

    def upload_cv(self, pdf_path: str) -> dict:
        """Upload a CV PDF file and return its identifiers and signed URL.

        Args:
            pdf_path: Path of the PDF file to upload.

        Returns:
            A dict with the keys ``file_id``, ``file_name`` and ``file_url``
            (the signed URL of the uploaded file).
        """
        with open(pdf_path, "rb") as pdf_file:
            uploaded_pdf = self.client.files.upload(
                file={
                    "file_name": os.path.basename(pdf_path),
                    "content": pdf_file,
                },
                purpose="ocr",
            )
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
        return {
            "file_id": uploaded_pdf.id,
            "file_name": uploaded_pdf.filename,
            "file_url": signed_url.url,
        }

    def get_cv_content(self, pdf_path: str) -> dict:
        """Upload a CV PDF file and extract its annotation.

        Args:
            pdf_path: Path of the PDF file on disk.

        Returns:
            ``{"cv": {...}}`` where the inner dict holds ``file_path``, the
            ``file_id`` / ``file_name`` / ``file_url`` returned by
            ``upload_cv``, and ``annotation`` (the OCR response's
            ``document_annotation``).

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"The file {pdf_path} does not exist.")

        cv_info = {"file_path": pdf_path}
        response = {"cv": cv_info}

        logger.info("Uploading CV PDF file: %s", pdf_path)
        cv_info.update(self.upload_cv(pdf_path))

        logger.info("Processing OCR for cv file: %s", pdf_path)
        cv_info["annotation"] = self._annotate(
            DocumentURLChunk(document_url=cv_info["file_url"])
        )
        return response
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # No script behaviour is defined here; the guard body is a no-op.
    pass
|
|
|