Spaces:

mimoha
/

mistral_pdf_ocr

Sleeping

mistral_pdf_ocr / app.py

Update app.py

6f4a157 verified 8 months ago

1.63 kB

	import json
	import os
	from pathlib import Path

	import gradio as gr
	from mistralai import Mistral, DocumentURLChunk
	from mistralai.models import OCRResponse

	# Read the Mistral API key from the environment (for example a Hugging Face
	# Space secret) instead of embedding it in the source. The key that was
	# previously committed on this line is public and should be revoked.
	_api_key = os.environ.get("MISTRAL_API_KEY")
	if not _api_key:
	    # Fail at startup with a clear message rather than on the first request.
	    raise RuntimeError("MISTRAL_API_KEY environment variable is not set")
	client = Mistral(api_key=_api_key)

	def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
	    """Inline images into Markdown by replacing image references with data URIs.

	    Every occurrence of ``![name](name)`` in ``markdown_str`` is rewritten so
	    that the link target is a ``data:`` URI carrying the image payload.

	    Args:
	        markdown_str: Markdown text containing ``![name](name)`` references.
	        images_dict: Mapping of image name to its base64 payload. A payload
	            may be bare base64 text or an already complete ``data:`` URI.

	    Returns:
	        The Markdown text with the matching references replaced. References
	        whose payload is missing (``None`` or empty) are left unchanged.
	    """
	    for img_name, base64_str in images_dict.items():
	        if not base64_str:
	            # No payload available: keep the original reference instead of
	            # emitting a broken "data:...,None" link.
	            continue
	        if base64_str.startswith("data:"):
	            # Already a full data URI; adding another prefix would corrupt it.
	            data_uri = base64_str
	        else:
	            data_uri = f"data:image/png;base64,{base64_str}"
	        markdown_str = markdown_str.replace(
	            f"![{img_name}]({img_name})", f"![{img_name}]({data_uri})"
	        )
	    return markdown_str

	def get_combined_markdown(ocr_response: OCRResponse) -> str:
	    """Join the Markdown of every OCR page into one document.

	    For each page, the page's images are collected into an ``id -> base64``
	    mapping and passed to ``replace_images_in_markdown`` together with the
	    page's Markdown.

	    Args:
	        ocr_response: OCR result exposing ``pages``; each page exposes
	            ``markdown`` and ``images`` (items with ``id`` and
	            ``image_base64``).

	    Returns:
	        The processed pages, in order, separated by a blank line.
	    """
	    return "\n\n".join(
	        replace_images_in_markdown(
	            page.markdown,
	            {img.id: img.image_base64 for img in page.images},
	        )
	        for page in ocr_response.pages
	    )

	def process_pdf(pdf_path):
	    """Upload a PDF to Mistral, run OCR on it and return the result as Markdown.

	    Args:
	        pdf_path: Filesystem path of the PDF to process. Falsy (``None`` or
	            empty) when the form is submitted without a file.

	    Returns:
	        The combined Markdown of all pages, as produced by
	        ``get_combined_markdown``.

	    Raises:
	        gr.Error: If no file was provided.
	    """
	    if not pdf_path:
	        # Without this guard a submit with no file fails inside open() with
	        # an opaque TypeError; surface a readable message in the UI instead.
	        raise gr.Error("يرجى رفع ملف PDF أولاً")

	    pdf_file = Path(pdf_path)
	    uploaded_file = client.files.upload(
	        file={
	            "file_name": pdf_file.stem,
	            "content": pdf_file.read_bytes(),
	        },
	        purpose="ocr",
	    )

	    # The OCR endpoint is given a URL, so request a signed link to the upload.
	    # NOTE(review): expiry=1 is presumably in hours - confirm against the SDK.
	    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
	    pdf_response = client.ocr.process(
	        document=DocumentURLChunk(document_url=signed_url.url),
	        model="mistral-ocr-latest",
	        include_image_base64=True,
	    )

	    return get_combined_markdown(pdf_response)

	# Build the single-function UI (PDF file in, rendered Markdown out), then
	# start serving it with a share link requested and errors shown in the UI.
	demo = gr.Interface(
	    fn=process_pdf,
	    inputs=gr.File(type="filepath", label="ارفع ملف PDF"),
	    outputs=gr.Markdown(label="الناتج"),
	)
	demo.launch(share=True, show_error=True)