# Source: Hugging Face Space "mimoha/mistral_pdf_ocr" (status: Sleeping; file size: 1,634 bytes)

import json
import os
from pathlib import Path

import gradio as gr
from mistralai import Mistral, DocumentURLChunk
from mistralai.models import OCRResponse

# Shared Mistral API client used by process_pdf.
# The API key is read from the MISTRAL_API_KEY environment variable so the
# secret never lives in source control. Any key that was previously committed
# in this file must be treated as leaked and revoked.
_api_key = os.environ.get("MISTRAL_API_KEY")
if not _api_key:
    # Fail at startup with an actionable message instead of an opaque
    # authentication error on the first OCR request.
    raise RuntimeError(
        "The MISTRAL_API_KEY environment variable is not set; "
        "configure it before starting the app."
    )
client = Mistral(api_key=_api_key)

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """Inline images into markdown by replacing image references with data URIs.

    Every occurrence of ``![<name>](<name>)`` whose ``<name>`` is a key of
    ``images_dict`` is rewritten so that its link target is a data URI.

    Args:
        markdown_str: Markdown text containing ``![name](name)`` references.
        images_dict: Mapping of image name to its image data. A value may be
            a bare base64 payload or an already complete ``data:`` URI.

    Returns:
        The markdown with all matching image references inlined.
    """
    for img_name, base64_str in images_dict.items():
        if not base64_str:
            # No image data available: leave the reference untouched rather
            # than emitting a broken "data:" link (or the text "None").
            continue
        if base64_str.startswith("data:"):
            # Already a complete data URI; prefixing it again would yield an
            # invalid "data:...;base64,data:..." target.
            data_uri = base64_str
        else:
            data_uri = f"data:image/png;base64,{base64_str}"
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({data_uri})"
        )
    return markdown_str

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """Return the markdown of all OCR pages, images inlined, as one string.

    For each page in ``ocr_response.pages`` the page's images are mapped by
    ``id`` to their ``image_base64`` value and substituted into the page's
    markdown via ``replace_images_in_markdown``. The per-page results are
    joined with a blank line between pages.
    """
    return "\n\n".join(
        replace_images_in_markdown(
            page.markdown,
            {image.id: image.image_base64 for image in page.images},
        )
        for page in ocr_response.pages
    )

def process_pdf(pdf_path):
    """Run OCR on the PDF at ``pdf_path`` and return the result as markdown.

    Steps: read the file's bytes, upload them through the module-level
    ``client`` with purpose "ocr" (named after the path's stem), request a
    signed URL for the upload, submit that URL to the "mistral-ocr-latest"
    model with base64 images included, and combine the pages into a single
    markdown string.

    Args:
        pdf_path: Filesystem path of the PDF to process.

    Returns:
        The combined markdown produced by ``get_combined_markdown``.
    """
    # Read the whole document up front so the file handle is released
    # before any network call is made.
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    upload_payload = {
        "file_name": Path(pdf_path).stem,
        "content": pdf_bytes,
    }
    uploaded = client.files.upload(file=upload_payload, purpose="ocr")

    # The OCR endpoint is given a URL, not the bytes, hence the signed URL.
    signed = client.files.get_signed_url(file_id=uploaded.id, expiry=1)
    document = DocumentURLChunk(document_url=signed.url)
    ocr_result = client.ocr.process(
        document=document,
        model="mistral-ocr-latest",
        include_image_base64=True,
    )

    return get_combined_markdown(ocr_result)

# Build the web UI: a PDF file upload (passed to process_pdf as a filesystem
# path) whose OCR result is rendered as markdown, then start serving it.
pdf_input = gr.File(type="filepath", label="ارفع ملف PDF")
markdown_output = gr.Markdown(label="الناتج")
demo = gr.Interface(fn=process_pdf, inputs=pdf_input, outputs=markdown_output)
demo.launch(share=True, show_error=True)