Spaces:

Agents-MCP-Hackathon
/

AIRecruiterAgent

Sleeping

App Files Files Community

AIRecruiterAgent / processing /cv_processing.py

vankhieu

upload MCP app

4db8ed6 7 months ago

raw

history blame contribute delete

3.89 kB

	import os
	import logging
	import base64
	from mistralai import Mistral, DocumentURLChunk
	from mistralai.extra import response_format_from_pydantic_model

	from model import ApplicantDocument
	from .constant import MISTRAL_OCR_MODEL

	# Importing this module configures root logging at INFO level (a no-op when
	# the root logger already has handlers) and creates the module's own logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	class CVProcessor:
	    """Send CV documents to the Mistral OCR endpoint and collect annotations.

	    Each ``get_cv_content*`` method returns a dict of the form
	    ``{"cv": {...}}`` holding the input reference plus an ``"annotation"``
	    entry taken from the OCR response's ``document_annotation`` attribute.
	    The annotation format is derived from ``ApplicantDocument``.
	    """

	    # Zero-based indices of the pages submitted to OCR (the first two pages).
	    # Kept as a class attribute so both OCR entry points stay in sync.
	    OCR_PAGES = (0, 1)

	    def __init__(self, api_key: "str | None" = None):
	        """Create the Mistral client.

	        Args:
	            api_key: Mistral API key. When ``None``, the value of the
	                ``MISTRAL_API_KEY`` environment variable is used.

	        Raises:
	            ValueError: If no non-empty API key is available.
	        """
	        if api_key is None:
	            # Use the default API key from environment variable
	            api_key = os.environ.get("MISTRAL_API_KEY")
	        if not api_key:
	            raise ValueError("API key for Mistral is not set.")
	        self.client = Mistral(api_key=api_key)

	    @staticmethod
	    def encode_base64(file_path: str) -> "str | None":
	        """Encode the pdf/image file to base64.

	        Args:
	            file_path: Path of the file to read in binary mode.

	        Returns:
	            The base64 text of the file's bytes, or ``None`` when the file
	            cannot be read (the failure is logged, not raised).
	        """
	        try:
	            with open(file_path, "rb") as _file:
	                return base64.b64encode(_file.read()).decode("utf-8")
	        except FileNotFoundError:
	            logger.error("Error: The file %s was not found.", file_path)
	            return None
	        except Exception as e:
	            # Deliberately best-effort: any other read failure is reported
	            # through the log and signalled to the caller with ``None``.
	            logger.error("Error: %s", e)
	            return None

	    def _annotate(self, document):
	        """Run OCR on *document* and return the response's document annotation."""
	        ocr_response = self.client.ocr.process(
	            model=MISTRAL_OCR_MODEL,
	            document=document,
	            pages=list(self.OCR_PAGES),
	            document_annotation_format=response_format_from_pydantic_model(
	                ApplicantDocument
	            ),
	            include_image_base64=False,
	        )
	        return ocr_response.document_annotation

	    def get_cv_content_from_base64(self, base64_pdf: str) -> dict:
	        """Annotate a CV supplied as base64-encoded content.

	        The content is embedded in an ``application/pdf`` data URL, so it is
	        submitted as a PDF regardless of what the bytes actually are.

	        Args:
	            base64_pdf: Base64 text of the CV file.

	        Returns:
	            ``{"cv": {"file_content": base64_pdf, "annotation": ...}}``.

	        Raises:
	            ValueError: If ``base64_pdf`` is empty.
	        """
	        if not base64_pdf:
	            raise ValueError("Base64 PDF content is empty.")

	        response = {
	            "cv": {
	                "file_content": base64_pdf,
	            }
	        }

	        # Use the OCR model to extract text from the PDF
	        logger.info("Processing OCR for CV file from base64 content")
	        response["cv"]["annotation"] = self._annotate(
	            {
	                "type": "document_url",
	                "document_url": f"data:application/pdf;base64,{base64_pdf}",
	            }
	        )
	        return response

	    def upload_cv(self, pdf_path: str) -> dict:
	        """Upload a CV PDF file and request a signed URL for it.

	        Args:
	            pdf_path: Path of the file to upload; its base name is sent as
	                the file name.

	        Returns:
	            A dict with the keys ``"file_id"``, ``"file_name"`` and
	            ``"file_url"`` (the signed URL).
	        """
	        with open(pdf_path, "rb") as pdf_file:
	            uploaded_pdf = self.client.files.upload(
	                file={
	                    "file_name": os.path.basename(pdf_path),
	                    "content": pdf_file,
	                },
	                purpose="ocr",
	            )
	        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
	        return {
	            "file_id": uploaded_pdf.id,
	            "file_name": uploaded_pdf.filename,
	            "file_url": signed_url.url,
	        }

	    def get_cv_content(self, pdf_path: str) -> dict:
	        """Upload a CV file from disk and annotate it.

	        Args:
	            pdf_path: Path of the CV file.

	        Returns:
	            ``{"cv": {...}}`` containing ``"file_path"``, the entries
	            returned by :meth:`upload_cv`, and ``"annotation"``.

	        Raises:
	            FileNotFoundError: If ``pdf_path`` does not exist.
	        """
	        if not os.path.exists(pdf_path):
	            raise FileNotFoundError(f"The file {pdf_path} does not exist.")

	        response = {
	            "cv": {
	                "file_path": pdf_path,
	            }
	        }
	        # Upload the CV PDF file and get the signed URL
	        logger.info(f"Uploading CV PDF file: {pdf_path}")
	        upload_info = self.upload_cv(pdf_path)
	        # Merge file_id / file_name / file_url into the "cv" entry.
	        response["cv"] |= upload_info

	        # Use the OCR model to extract text from the PDF
	        logger.info(f"Processing OCR for cv file: {pdf_path}")
	        response["cv"]["annotation"] = self._annotate(
	            DocumentURLChunk(document_url=response["cv"]["file_url"])
	        )
	        return response


	# Running this module as a script is intentionally a no-op.
	if __name__ == "__main__":
	    ...