mistral_ocr2

Sleeping

File size: 13,196 Bytes

import streamlit as st
import streamlit.components.v1 as components
import base64
import tempfile
import os
from mistralai import Mistral
from PIL import Image
import io
from dotenv import load_dotenv
from pdf2image import convert_from_bytes

# Page configuration - MUST be the first Streamlit command
st.set_page_config(page_title="OCR Facture avec Mistral", layout="wide")


# Load variables from a .env file (if one exists) into the process environment.
load_dotenv()

# API key taken from the environment; None when unset, in which case main()
# asks for it through a sidebar text input.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

# Bare string expression (not a module docstring, since it follows other
# statements). With Streamlit's "magic" feature enabled it is presumably
# rendered as Markdown at the top of the page.
# NOTE(review): the text still contains the generic Streamlit template wording
# ("Edit `/streamlit_app.py` ...") - confirm whether it should be shown to users.
"""
![image](https://www.osfarm.org/assets/img/logo_white.png)

# Welcome to Gaia OCR Template by OSFarm!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# System prompt for purchase-invoice extraction (supplier, invoice and item
# lines, dates as DD/MM/YYYY).
# NOTE(review): not referenced by any function visible in this file;
# extract_json_from_doc() sends SYSTEM_PROMPT2 instead - confirm which one is intended.
SYSTEM_PROMPT = """From the user prompt coming from purchase invoice below, extract informations strictly as instructed.
    Most of the time, the pattern of a purchase invoice is composed of supplier informations, invoice informations and one or many invoice lines.
    Information come from France in french language.
    Return the purchase informations in JSON format like an API according to the schema.
    Do not return 'description', 'type' or 'format' attributes in the response.
    Use it only to detect correct value of each attributes.
    example of a response : { supplier: { name: "AXA", address: "10 rue du Bouil bleu", postal_code: "17250", ... }, invoice: {number: "FA25632", ... }, items: [{number: '1', ... }, {number: '2', ... }, ...]}.
    for the items, try to detect the role of the item in 'merchandise' or 'service' in role attribute.
    for all the date, try to convert it in the following format : 'DD/MM/YYYY'.
    for the items, try to classify it like an accountant in nature attribute.
"""

# System prompt for delivery-note extraction; this is the prompt that
# extract_json_from_doc() sends as the "system" message.
# NOTE(review): the prompt refers to a schema and to per-item "description"
# fields, but no schema is attached to the request in the visible code.
SYSTEM_PROMPT2 = """From this delivery note document, extract the following information by following these instructions.
    The information is in French. Return the information in JSON format according to the schema.
    The details of each piece of information to be extracted are found in the description field of each item.
"""

# Required fields

# Code that uniquely identifies the feed
# Feed type, one of: forage, compound feed, raw materials.
# Name or company name of the party responsible for labelling
# Address of the party responsible for labelling, to be completed with street / city / postal code
# Packaging used to sell the product: bulk or non-bulk. If bulk: net quantity delivered, by weight (kg or tonne) or volume (litre), unit to be specified. If non-bulk: quantity of feed in one sales unit, excluding the weight of the packaging (= net content)
# Country of origin or geographical area as an ISO code. If several origins are mixed, the origin that encompasses all of them is used. For example, a batch of wheat containing 97% French wheat and 3% EU wheat is considered to be of EU origin
# Whether the feed is organic or not
# Whether the feed is GMO-free (<0.9%) or not


# Optional fields (try passing the optional fields so that the model can come up with them)
# + moisture content


# JSON-schema payload shaped for a structured-output "json_schema" response format.
# NOTE(review): the entry is named "PurchaseInvoice", but the schema body is
# titled "MathDemonstration" and describes "steps" / "final_answer" - it does
# not describe an invoice. Its only visible use is commented out in
# extract_json_from_doc(), so it currently has no effect on requests.
JSON_SCHEMA = {
                "name": "PurchaseInvoice",
                "schema_definition": {
                    "$defs": {
                        # One reasoning step: free-text explanation plus its output.
                        "Explanation": {
                            "properties": {
                                "explanation": {
                                    "title": "Explanation",
                                    "type": "string",
                                },
                                "output": {"title": "Output", "type": "string"},
                            },
                            "required": ["explanation", "output"],
                            "title": "Explanation",
                            "type": "object",
                            "additionalProperties": False,
                        }
                    },
                    "properties": {
                        # Ordered list of Explanation objects.
                        "steps": {
                            "items": {"$ref": "#/$defs/Explanation"},
                            "title": "Steps",
                            "type": "array",
                        },
                        "final_answer": {"title": "Final Answer", "type": "string"},
                    },
                    "required": ["steps", "final_answer"],
                    "title": "MathDemonstration",
                    "type": "object",
                    "additionalProperties": False,
                }, 
                "description": None,
                "strict": True
}

def upload_pdf(client, content, filename):
    """
    Upload a PDF to Mistral's API and return a signed URL for processing.

    The bytes are sent straight from memory. No temporary file is written, so
    a ``filename`` containing path separators (or an absolute path) can no
    longer make the upload fail or touch the local filesystem.

    Args:
        client (Mistral): Mistral API client instance.
        content (bytes): The content of the PDF file.
        filename (str): The name of the PDF file, forwarded as ``file_name``.

    Returns:
        str: Signed URL for the uploaded PDF.
    """
    file_upload = client.files.upload(
        file={"file_name": filename, "content": content},
        purpose="ocr",
    )

    # The upload only yields a file id; a signed URL is what the OCR/chat
    # endpoints are given as the document location.
    signed_url = client.files.get_signed_url(file_id=file_upload.id)
    return signed_url.url

def extract_json_from_doc(client, document_source):
    """
    Ask a Mistral chat model to extract data from a document as JSON.

    The request payload and the model's answer are no longer printed to
    stdout: the payload can contain a signed URL or a multi-megabyte base64
    image, and the answer contains the document's data.

    Args:
        client (Mistral): Mistral API client instance.
        document_source (dict): Content chunk describing the document
            (a ``document_url`` or ``image_url`` entry); it is appended to
            the user message as-is.

    Returns:
        str: The message content of the first choice. JSON-object mode is
        requested, so this is presumably JSON text; it is returned unparsed.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Specify model
    model = "mistral-small-latest"

    messages = [
        {
            "role": "system",
            "content": SYSTEM_PROMPT2,
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    # NOTE(review): this user prompt asks for the last sentence
                    # of the document, while the system prompt asks for a
                    # structured extraction - confirm which one is intended.
                    "text": "what is the last sentence in the document"
                },
                document_source
            ]
        }
    ]

    # Log only metadata; never the message bodies (see docstring).
    logger.debug("Requesting JSON extraction with model %s", model)

    # Generic JSON-object mode only: JSON_SCHEMA is not attached to the request.
    chat_response = client.chat.complete(
        model=model,
        messages=messages,
        response_format={"type": "json_object"},
    )

    logger.debug("JSON extraction response received")

    return chat_response.choices[0].message.content

def process_ocr(client, document_source):
    """
    Run Mistral's OCR model on a document and hand back the raw response.

    Args:
        client (Mistral): Mistral API client instance.
        document_source (dict): The source of the document (URL or image),
            forwarded unchanged as the ``document`` argument.

    Returns:
        OCRResponse: Whatever ``client.ocr.process`` returns.
    """
    ocr_request = {
        "model": "mistral-ocr-latest",
        "document": document_source,
        # Ask for embedded images to be returned as base64 too.
        "include_image_base64": True,
    }
    return client.ocr.process(**ocr_request)

def display_pdf(content: bytes):
    """
    Show every page of a PDF as an image in the Streamlit page.

    If converting or rendering raises, an error message is shown and the
    original bytes are offered through a download button instead.

    Args:
        content (bytes): Raw bytes of the PDF file.
    """
    try:
        pages = convert_from_bytes(content)
        # Captions are 1-based page numbers.
        for page_number, page_image in enumerate(pages, start=1):
            st.image(
                page_image,
                caption=f"Page {page_number}",
                use_container_width=True,
            )
    except Exception as err:
        # Fallback: the preview failed, so let the user fetch the file itself.
        st.error(f"Impossible d'afficher le PDF : {err}")
        fallback_download = {
            "label": "📥 Télécharger le PDF",
            "data": content,
            "file_name": "document.pdf",
            "mime": "application/pdf",
        }
        st.download_button(**fallback_download)

def _cached_signed_url(client, content, filename):
    """Return a signed URL for a PDF, uploading it at most once per session.

    Streamlit re-runs the whole script on every widget interaction, so the
    URL is kept in ``st.session_state`` under a key derived from the file
    name and a hash of its bytes.
    NOTE(review): signed URLs presumably expire - confirm their lifetime
    comfortably exceeds a typical session.
    """
    import hashlib

    cache_key = f"signed_url:{filename}:{hashlib.sha256(content).hexdigest()}"
    if cache_key not in st.session_state:
        st.session_state[cache_key] = upload_pdf(client, content, filename)
    return st.session_state[cache_key]


def _select_document_source(client):
    """Render the input widgets and return the document payload, or None."""
    # Input method selection: URL, PDF Upload, or Image Upload
    input_method = st.radio("Format de la facture:", ["URL", "PDF", "Image"])

    if input_method == "URL":
        url = st.text_input("Document URL:")
        if not url:
            return None
        return {"type": "document_url", "document_url": url}

    if input_method == "PDF":
        uploaded_file = st.file_uploader("Choisissez un PDF", type=["pdf"])
        if not uploaded_file:
            return None

        # getvalue() returns the full buffer regardless of the read position.
        content = uploaded_file.getvalue()

        # Display the uploaded PDF
        display_pdf(content)

        # The upload is a network call: report a failure in the page rather
        # than letting it surface as an unhandled exception.
        try:
            signed_url = _cached_signed_url(client, content, uploaded_file.name)
        except Exception as e:
            st.error(f"Processing error: {e}")
            return None
        return {"type": "document_url", "document_url": signed_url}

    if input_method == "Image":
        uploaded_image = st.file_uploader(
            "Choisissez une image", type=["png", "jpg", "jpeg"]
        )
        if not uploaded_image:
            return None

        # Display the uploaded image
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_container_width=True)

        # Re-encode as PNG and embed it as a base64 data URL
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        return {
            "type": "image_url",
            "image_url": f"data:image/png;base64,{img_str}",
        }

    return None


def _render_json_extraction(client, document_source):
    """Run the chat-based JSON extraction and show the result in an expander."""
    with st.spinner("Extracting JSON content..."):
        try:
            ocr_response = extract_json_from_doc(client, document_source)

            with st.expander("Response"):
                st.json(ocr_response)

        except Exception as e:
            # Display an error message if processing fails
            st.error(f"Processing error: {e}")


def _render_ocr_extraction(client, document_source):
    """Run OCR and show the extracted Markdown, download buttons and raw response."""
    with st.spinner("Extracting content..."):
        try:
            ocr_response = process_ocr(client, document_source)

            if not (ocr_response and ocr_response.pages):
                st.warning("No content extracted.")
                return

            # Combine extracted text from all pages into one string
            extracted_content = "\n\n".join(
                f"**Page {number}**\n{page.markdown}"
                for number, page in enumerate(ocr_response.pages, start=1)
            )

            # Display extracted content in Markdown format
            st.subheader("Extracted Content")
            st.markdown(extracted_content)

            # Plain-text variant: same pages, page headings without bold markers
            plain_text_content = "\n\n".join(
                f"Page {number}\n{page.markdown}"
                for number, page in enumerate(ocr_response.pages, start=1)
            )

            # Add download buttons for both text and Markdown formats
            col1, col2 = st.columns(2)
            with col1:
                st.download_button(
                    label="Download Text",
                    data=plain_text_content,
                    file_name="extracted_content.txt",
                    mime="text/plain",
                )
            with col2:
                st.download_button(
                    label="Download Markdown",
                    data=extracted_content,
                    file_name="extracted_content.md",
                    mime="text/markdown",
                )

            # Optional: Show raw response for debugging purposes
            with st.expander("Réponse API"):
                st.json(ocr_response.model_dump())

        except Exception as e:
            # Display an error message if processing fails
            st.error(f"Processing error: {e}")


def main():
    """
    Main function to run the Streamlit app.

    Resolves the Mistral API key (environment first, sidebar input otherwise),
    lets the user supply a document as a URL, a PDF upload or an image upload,
    and offers two actions on it: JSON extraction through the chat model and
    OCR to Markdown.
    """
    # Sidebar: Authentication for Mistral API (only asked when no key is configured)
    api_key = MISTRAL_API_KEY or st.sidebar.text_input(
        "Mistral API Key", type="password"
    )

    if not api_key:
        st.warning("Enter API key to continue")
        return

    # Initialize Mistral API client
    client = Mistral(api_key=api_key)

    # Main app interface
    st.header("OCR Facture avec Mistral")

    document_source = _select_document_source(client)

    # The action buttons are only rendered once a document is available.
    if document_source and st.button("Générer les données au format JSON"):
        _render_json_extraction(client, document_source)

    if document_source and st.button("Générer un Document"):
        _render_ocr_extraction(client, document_source)

if __name__ == "__main__":
    main()