# File: AIRecruiterAgent/processing/cv_processing.py
# Author: vankhieu
# Commit: 4db8ed6 ("upload MCP app")
import base64
import logging
import os
from typing import Optional

from mistralai import Mistral, DocumentURLChunk
from mistralai.extra import response_format_from_pydantic_model

from model import ApplicantDocument
from .constant import MISTRAL_OCR_MODEL
# Set up INFO-level logging on the root logger at import time (this has no
# effect if the root logger already has handlers), then create the logger
# used throughout this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class CVProcessor:
    """Extract structured applicant data from CV files with the Mistral OCR API.

    A CV can be supplied as base64-encoded PDF content
    (``get_cv_content_from_base64``) or as a path to a PDF on disk
    (``get_cv_content``), in which case the file is uploaded first. In both
    cases the OCR request asks for a document annotation whose format is
    derived from the ``ApplicantDocument`` model.
    """

    # Number of leading pages sent to OCR (page indices 0 .. MAX_OCR_PAGES - 1).
    MAX_OCR_PAGES = 2

    def __init__(self, api_key: Optional[str] = None):
        """Create the Mistral client used by every other method.

        Args:
            api_key: Mistral API key. When ``None``, the value of the
                ``MISTRAL_API_KEY`` environment variable is used instead.

        Raises:
            ValueError: If no non-empty API key is available.
        """
        if api_key is None:
            # Use the default API key from the environment variable.
            api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("API key for Mistral is not set.")
        self.client = Mistral(api_key=api_key)

    @staticmethod
    def encode_base64(file_path: str) -> Optional[str]:
        """Encode the pdf/image file to base64.

        Args:
            file_path: Path of the file to read in binary mode.

        Returns:
            The file content as a base64 string, or ``None`` when the file
            could not be read (the error is logged, not raised).
        """
        try:
            with open(file_path, "rb") as _file:
                return base64.b64encode(_file.read()).decode("utf-8")
        except FileNotFoundError:
            logger.error("Error: The file %s was not found.", file_path)
            return None
        except Exception as e:
            # Deliberately best-effort: callers get None instead of an
            # exception for any read/encode failure.
            logger.error("Error: %s", e)
            return None

    def _annotate_document(self, document):
        """Run OCR on ``document`` and return the response's document annotation.

        ``document`` is passed unchanged to ``client.ocr.process``; only the
        first ``MAX_OCR_PAGES`` pages are requested and page images are not
        included in the response.
        """
        ocr_response = self.client.ocr.process(
            model=MISTRAL_OCR_MODEL,
            document=document,
            pages=list(range(self.MAX_OCR_PAGES)),
            document_annotation_format=response_format_from_pydantic_model(
                ApplicantDocument
            ),
            include_image_base64=False,
        )
        return ocr_response.document_annotation

    def get_cv_content_from_base64(self, base64_pdf: str) -> dict:
        """Annotate a CV supplied as base64-encoded PDF content.

        Args:
            base64_pdf: Base64 text of the PDF; it is embedded in a
                ``data:application/pdf;base64,`` URL for the OCR request.

        Returns:
            ``{"cv": {"file_content": <base64_pdf>, "annotation": <annotation>}}``.

        Raises:
            ValueError: If ``base64_pdf`` is empty.
        """
        if not base64_pdf:
            raise ValueError("Base64 PDF content is empty.")

        cv_info = {"file_content": base64_pdf}

        # Use the OCR model to extract text from the PDF.
        logger.info("Processing OCR for CV file from base64 content")
        cv_info["annotation"] = self._annotate_document(
            {
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}",
            }
        )
        return {"cv": cv_info}

    def upload_cv(self, pdf_path: str) -> dict:
        """Upload a CV PDF file and return its identifiers and signed URL.

        Args:
            pdf_path: Path of the PDF file to upload.

        Returns:
            A dict with the keys ``file_id``, ``file_name`` and ``file_url``
            (the signed URL of the uploaded file).
        """
        with open(pdf_path, "rb") as pdf_file:
            uploaded_pdf = self.client.files.upload(
                file={
                    "file_name": os.path.basename(pdf_path),
                    "content": pdf_file,
                },
                purpose="ocr",
            )
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
        return {
            "file_id": uploaded_pdf.id,
            "file_name": uploaded_pdf.filename,
            "file_url": signed_url.url,
        }

    def get_cv_content(self, pdf_path: str) -> dict:
        """Upload a CV PDF from disk and annotate it with OCR.

        Args:
            pdf_path: Path of the PDF file to process.

        Returns:
            ``{"cv": {...}}`` where the inner dict holds ``file_path``, the
            ``file_id`` / ``file_name`` / ``file_url`` returned by
            ``upload_cv``, and ``annotation``.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"The file {pdf_path} does not exist.")

        # Upload the CV PDF file and get the signed URL.
        logger.info("Uploading CV PDF file: %s", pdf_path)
        cv_info = {"file_path": pdf_path, **self.upload_cv(pdf_path)}

        # Use the OCR model to extract text from the PDF.
        logger.info("Processing OCR for cv file: %s", pdf_path)
        cv_info["annotation"] = self._annotate_document(
            DocumentURLChunk(document_url=cv_info["file_url"])
        )
        return {"cv": cv_info}
# Running this module as a script intentionally does nothing; it is meant to
# be imported.
if __name__ == "__main__":
    pass