# Source: Hugging Face Spaces - Agents-MCP-Hackathon / AIRecruiterAgent (status: Sleeping)
# File size: 3,892 bytes
# Revision: 4db8ed6

import base64
import logging
import os
from typing import Optional

from mistralai import Mistral, DocumentURLChunk
from mistralai.extra import response_format_from_pydantic_model

from model import ApplicantDocument
from .constant import MISTRAL_OCR_MODEL

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# NOTE(review): this configures the root logger (INFO level) as a side effect
# of importing the module; confirm that is intended for applications that
# set up their own logging.
logging.basicConfig(level=logging.INFO)

class CVProcessor:
    """Run Mistral OCR on CV documents and collect the structured annotation.

    A CV can be supplied either as base64 text
    (:meth:`get_cv_content_from_base64`) or as a path to a local PDF that is
    first uploaded to Mistral (:meth:`get_cv_content`). In both cases the OCR
    request asks for a document annotation shaped by ``ApplicantDocument``.
    """

    # Number of leading pages sent to OCR; the request uses the zero-based
    # page indices ``0 .. MAX_OCR_PAGES - 1``.
    MAX_OCR_PAGES = 2

    def __init__(self, api_key: Optional[str] = None):
        """Create the Mistral client.

        Args:
            api_key: Mistral API key. When omitted (or empty), the
                ``MISTRAL_API_KEY`` environment variable is used instead.

        Raises:
            ValueError: If no key was passed and the environment variable is
                unset or empty.
        """
        # An empty string is as unusable as None, so both fall back to the
        # environment instead of building a client that can never authenticate.
        api_key = api_key or os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("API key for Mistral is not set.")
        self.client = Mistral(api_key=api_key)

    @staticmethod
    def encode_base64(file_path: str) -> Optional[str]:
        """Encode the pdf/image file to base64.

        Args:
            file_path: Path of the file to read in binary mode.

        Returns:
            The base64 text of the file content, or ``None`` when the file
            could not be read (the error is logged, not raised).
        """
        try:
            with open(file_path, "rb") as _file:
                return base64.b64encode(_file.read()).decode("utf-8")
        except FileNotFoundError:
            logger.error("Error: The file %s was not found.", file_path)
        except Exception as e:
            # Deliberately best-effort: callers receive None for any failure.
            logger.error("Error: %s", e)
        return None

    def _annotate_document(self, document):
        """Send *document* to the OCR model and return its document annotation."""
        ocr_response = self.client.ocr.process(
            model=MISTRAL_OCR_MODEL,
            document=document,
            pages=list(range(self.MAX_OCR_PAGES)),
            document_annotation_format=response_format_from_pydantic_model(
                ApplicantDocument
            ),
            include_image_base64=False,
        )
        return ocr_response.document_annotation

    def get_cv_content_from_base64(self, base64_pdf: str) -> dict:
        """Run OCR on a CV given as base64-encoded PDF content.

        Args:
            base64_pdf: Base64 text of the PDF, e.g. the result of
                :meth:`encode_base64`.

        Returns:
            ``{"cv": {"file_content": <base64_pdf>, "annotation": <OCR
            document annotation>}}``.

        Raises:
            ValueError: If ``base64_pdf`` is empty or ``None``.
        """
        if not base64_pdf:
            raise ValueError("Base64 PDF content is empty.")

        response = {
            "cv": {
                "file_content": base64_pdf,
            }
        }

        # The PDF is passed inline as a data URL rather than uploaded first.
        logger.info("Processing OCR for CV file from base64 content")
        response["cv"]["annotation"] = self._annotate_document(
            {
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}",
            }
        )
        return response

    def upload_cv(self, pdf_path: str) -> dict:
        """Upload a CV PDF file and request a signed URL for it.

        Args:
            pdf_path: Path of the PDF file to upload.

        Returns:
            A dict with the keys ``file_id``, ``file_name`` and ``file_url``
            (the signed URL).
        """
        with open(pdf_path, "rb") as pdf_file:
            uploaded_pdf = self.client.files.upload(
                file={
                    "file_name": os.path.basename(pdf_path),
                    "content": pdf_file,
                },
                purpose="ocr",
            )
        # The local file is no longer needed once the upload has returned, so
        # the signed URL is requested after the handle is closed.
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
        return {
            "file_id": uploaded_pdf.id,
            "file_name": uploaded_pdf.filename,
            "file_url": signed_url.url,
        }

    def get_cv_content(self, pdf_path: str) -> dict:
        """Upload a local CV PDF and run OCR on the uploaded file.

        Args:
            pdf_path: Path of the PDF file on disk.

        Returns:
            ``{"cv": {...}}`` where the inner dict holds ``file_path``, the
            keys returned by :meth:`upload_cv` (``file_id``, ``file_name``,
            ``file_url``) and ``annotation`` (the OCR document annotation).

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"The file {pdf_path} does not exist.")

        # Upload the CV PDF file and get the signed URL
        logger.info("Uploading CV PDF file: %s", pdf_path)
        upload_info = self.upload_cv(pdf_path)
        response = {"cv": {"file_path": pdf_path, **upload_info}}

        # Use the OCR model to extract text from the PDF
        logger.info("Processing OCR for cv file: %s", pdf_path)
        response["cv"]["annotation"] = self._annotate_document(
            DocumentURLChunk(document_url=response["cv"]["file_url"])
        )
        return response
    
    
# Running this module directly is currently a no-op; there is no
# command-line entry point.
if __name__ == "__main__":
    pass