File size: 3,892 Bytes
4db8ed6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import base64
import logging
import os
from typing import Optional

from mistralai import Mistral, DocumentURLChunk
from mistralai.extra import response_format_from_pydantic_model

from model import ApplicantDocument
from .constant import MISTRAL_OCR_MODEL

# Configure the root logger on import so INFO-level records are emitted by
# default. Per the stdlib, basicConfig does nothing if the root logger already
# has handlers configured.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by everything below.
logger = logging.getLogger(__name__)

class CVProcessor:
    """Extract structured applicant data from CV documents via Mistral OCR.

    The processor wraps a ``Mistral`` client and offers two entry points:
    :meth:`get_cv_content_from_base64` for base64-encoded PDF content held in
    memory, and :meth:`get_cv_content` for a PDF on disk (uploaded first, then
    processed through its signed URL). Both request a document annotation
    shaped by the ``ApplicantDocument`` model.
    """

    # Number of leading pages sent to OCR unless a caller overrides it.
    DEFAULT_MAX_PAGES = 2

    def __init__(self, api_key: Optional[str] = None):
        """Create the Mistral client.

        Args:
            api_key: Mistral API key. When omitted or empty, the
                ``MISTRAL_API_KEY`` environment variable is used instead.

        Raises:
            ValueError: If no non-empty key is available from either source.
        """
        # Treat an empty string like a missing key so it is never forwarded
        # to the client; fall back to the environment in both cases.
        api_key = api_key or os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("API key for Mistral is not set.")
        self.client = Mistral(api_key=api_key)

    @staticmethod
    def encode_base64(file_path: str) -> Optional[str]:
        """Encode the pdf/image file at *file_path* to a base64 string.

        Args:
            file_path: Path of the file to read in binary mode.

        Returns:
            The base64 text, or ``None`` when the file is missing or reading
            fails. Failures are logged rather than raised.
        """
        try:
            with open(file_path, "rb") as _file:
                return base64.b64encode(_file.read()).decode("utf-8")
        except FileNotFoundError:
            logger.error("Error: The file %s was not found.", file_path)
            return None
        except Exception as e:
            # Kept best-effort for backward compatibility: callers receive
            # None instead of an exception.
            logger.error("Error: %s", e)
            return None

    @staticmethod
    def _page_indices(max_pages: int) -> list[int]:
        """Return the page indices ``0 .. max_pages - 1`` to send to OCR."""
        if max_pages < 1:
            raise ValueError("max_pages must be at least 1.")
        return list(range(max_pages))

    def _run_ocr(self, document, pages: list[int]):
        """Run the OCR model on *document* and return its document annotation."""
        ocr_response = self.client.ocr.process(
            model=MISTRAL_OCR_MODEL,
            pages=pages,
            document=document,
            document_annotation_format=response_format_from_pydantic_model(
                ApplicantDocument
            ),
            include_image_base64=False,
        )
        return ocr_response.document_annotation

    def get_cv_content_from_base64(
        self, base64_pdf: str, max_pages: int = DEFAULT_MAX_PAGES
    ) -> dict:
        """Run OCR on a base64-encoded PDF and return the annotated result.

        Args:
            base64_pdf: Base64 text of the PDF. It is embedded in a
                ``data:application/pdf;base64,`` URL for the OCR request.
            max_pages: How many leading pages to process (default 2).

        Returns:
            ``{"cv": {"file_content": <base64_pdf>, "annotation": <annotation>}}``
            where the annotation is whatever the OCR response exposes as
            ``document_annotation``.

        Raises:
            ValueError: If *base64_pdf* is empty or *max_pages* is below 1.
        """
        if not base64_pdf:
            raise ValueError("Base64 PDF content is empty.")
        pages = self._page_indices(max_pages)

        response = {
            "cv": {
                "file_content": base64_pdf,
            }
        }

        # Use the OCR model to extract text from the PDF
        logger.info("Processing OCR for CV file from base64 content")
        response["cv"]["annotation"] = self._run_ocr(
            {
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}",
            },
            pages,
        )
        return response

    def upload_cv(self, pdf_path: str) -> dict:
        """Upload a CV PDF file for OCR and fetch its signed URL.

        Args:
            pdf_path: Path of the PDF to upload.

        Returns:
            A dict with ``file_id``, ``file_name`` and ``file_url`` (the
            signed URL) of the uploaded file.
        """
        with open(pdf_path, "rb") as pdf_file:
            uploaded_pdf = self.client.files.upload(
                file={
                    "file_name": os.path.basename(pdf_path),
                    "content": pdf_file,
                },
                purpose="ocr",
            )
        # The local file is no longer needed once the upload has returned, so
        # the handle is released before the second network call.
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
        return {
            "file_id": uploaded_pdf.id,
            "file_name": uploaded_pdf.filename,
            "file_url": signed_url.url,
        }

    def get_cv_content(
        self, pdf_path: str, max_pages: int = DEFAULT_MAX_PAGES
    ) -> dict:
        """Upload a CV PDF from disk, run OCR on it and return the result.

        Args:
            pdf_path: Path of the PDF file to process.
            max_pages: How many leading pages to process (default 2).

        Returns:
            ``{"cv": {...}}`` containing ``file_path``, the upload details
            (``file_id``, ``file_name``, ``file_url``) and ``annotation``,
            the OCR response's ``document_annotation``.

        Raises:
            FileNotFoundError: If *pdf_path* does not exist.
            ValueError: If *max_pages* is below 1.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"The file {pdf_path} does not exist.")
        # Validate before uploading so a bad argument never costs an upload.
        pages = self._page_indices(max_pages)

        response = {
            "cv": {
                "file_path": pdf_path,
            }
        }
        # Upload the CV PDF file and get the signed URL
        logger.info("Uploading CV PDF file: %s", pdf_path)
        response["cv"] |= self.upload_cv(pdf_path)

        # Use the OCR model to extract text from the PDF
        logger.info("Processing OCR for cv file: %s", pdf_path)
        response["cv"]["annotation"] = self._run_ocr(
            DocumentURLChunk(document_url=response["cv"]["file_url"]),
            pages,
        )
        return response
    
    
# Script entry point: currently a no-op, so running this module directly
# does nothing beyond the import-time setup above.
if __name__ == "__main__":
    pass