Spaces:
Sleeping
Sleeping
File size: 1,634 Bytes
f7be05d 6f4a157 f7be05d 6f4a157 f7be05d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import json
import os
from pathlib import Path

import gradio as gr
from mistralai import Mistral, DocumentURLChunk
from mistralai.models import OCRResponse
# Shared Mistral API client used by process_pdf.
# The API key is read from the MISTRAL_API_KEY environment variable rather
# than being embedded in the source, so the credential is never committed
# with the code. A missing variable raises KeyError at import time.
# NOTE(review): any key that was previously hard-coded here should be treated
# as exposed and rotated.
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """Point markdown image references at their inline image data.

    For every ``name -> data`` entry in ``images_dict``, each occurrence of
    ``![name](name)`` in ``markdown_str`` is rewritten to ``![name](data)``.

    Args:
        markdown_str: Markdown text that may contain image references.
        images_dict: Mapping of image name to the string that should become
            the image target (the caller in this file passes base64 data).

    Returns:
        The markdown text with all matching references rewritten. Entries
        whose value is empty or ``None`` leave the text untouched.
    """
    for img_name, base64_str in images_dict.items():
        if not base64_str:
            # No payload for this image: keep the original reference instead
            # of rewriting it to an empty or literal "None" target.
            continue
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str
def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """Merge the markdown of every OCR page into one document.

    For each page in ``ocr_response.pages`` a mapping of image ``id`` to
    ``image_base64`` is built from ``page.images`` and handed, together with
    ``page.markdown``, to ``replace_images_in_markdown``. The per-page
    results are joined with a blank line between consecutive pages.
    """
    rendered_pages = []
    for page in ocr_response.pages:
        # Lookup table for this page only: image id -> base64 payload.
        images_by_id = {image.id: image.image_base64 for image in page.images}
        page_markdown = replace_images_in_markdown(page.markdown, images_by_id)
        rendered_pages.append(page_markdown)
    separator = "\n\n"
    return separator.join(rendered_pages)
def process_pdf(pdf_path):
    """Run Mistral OCR on a PDF file and return its content as markdown.

    The file's bytes are uploaded through ``client.files.upload`` with
    purpose ``"ocr"``, a signed URL is requested for the upload, the
    ``mistral-ocr-latest`` model processes that URL with base64 images
    included, and the pages are merged by ``get_combined_markdown``.

    Args:
        pdf_path: Filesystem path of the PDF to process.

    Returns:
        The combined markdown for all pages of the document.

    Raises:
        gr.Error: If ``pdf_path`` is empty or ``None``.
    """
    if not pdf_path:
        # Fail with a readable UI message instead of a TypeError from
        # open(None). The text means "Please upload a PDF file first."
        raise gr.Error("الرجاء رفع ملف PDF أولاً.")

    source = Path(pdf_path)
    # Read the bytes up front so the local file handle is closed before any
    # network call is made.
    with source.open("rb") as f:
        pdf_bytes = f.read()

    uploaded_file = client.files.upload(
        file={
            "file_name": source.stem,
            "content": pdf_bytes,
        },
        purpose="ocr",
    )
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
    pdf_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=True,
    )
    return get_combined_markdown(pdf_response)
# Web UI: a file input whose path is passed to process_pdf, and a markdown
# pane that displays the returned text. The labels are Arabic for
# "Upload a PDF file" and "Output".
pdf_input = gr.File(type="filepath", label="ارفع ملف PDF")
markdown_output = gr.Markdown(label="الناتج")
demo = gr.Interface(
    fn=process_pdf,
    inputs=pdf_input,
    outputs=markdown_output,
)
# Start the app with a share link requested and errors shown in the UI.
demo.launch(share=True, show_error=True)
|