File size: 13,196 Bytes
5f01f50
5adde2d
e05e6ee
 
 
 
 
 
 
5adde2d
e05e6ee
6b989e1
 
 
 
e05e6ee
 
 
5f01f50
 
e05e6ee
 
 
5f01f50
 
 
 
 
 
 
 
e05e6ee
 
 
 
 
 
 
 
 
 
 
 
6169f7e
 
 
5dae67f
 
20f2ffe
5dae67f
20f2ffe
 
 
 
5dae67f
 
 
 
 
20f2ffe
 
 
 
 
e05e6ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dae67f
e05e6ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5adde2d
 
 
 
 
 
 
 
 
 
 
 
 
e05e6ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b989e1
 
 
e05e6ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
import streamlit as st
import streamlit.components.v1 as components
import base64
import tempfile
import os
from mistralai import Mistral
from PIL import Image
import io
from dotenv import load_dotenv
from pdf2image import convert_from_bytes

# Page configuration - MUST be the first Streamlit command
st.set_page_config(page_title="OCR Facture avec Mistral", layout="wide")


# Load settings from a .env file (python-dotenv) before the key is read below.
load_dotenv()

# API key taken from the environment; None when the variable is unset, in
# which case main() asks for the key in the sidebar instead.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

# Bare string expression; it is not the module docstring because other
# statements precede it.
# NOTE(review): presumably rendered on the page as Markdown by Streamlit's
# "magic" feature - confirm. The text still looks like template boilerplate.
"""
![image](https://www.osfarm.org/assets/img/logo_white.png)

# Welcome to Gaia OCR Template by OSFarm!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# System prompt for purchase-invoice extraction (supplier, invoice header and
# invoice lines, returned as JSON).
# NOTE(review): not referenced by any function in the visible code.
SYSTEM_PROMPT = """From the user prompt coming from purchase invoice below, extract informations strictly as instructed.
    Most of the time, the pattern of a purchase invoice is composed of supplier informations, invoice informations and one or many invoice lines.
    Information come from France in french language.
    Return the purchase informations in JSON format like an API according to the schema.
    Do not return 'description', 'type' or 'format' attributes in the response.
    Use it only to detect correct value of each attributes.
    example of a response : { supplier: { name: "AXA", address: "10 rue du Bouil bleu", postal_code: "17250", ... }, invoice: {number: "FA25632", ... }, items: [{number: '1', ... }, {number: '2', ... }, ...]}.
    for the items, try to detect the role of the item in 'merchandise' or 'service' in role attribute.
    for all the date, try to convert it in the following format : 'DD/MM/YYYY'.
    for the items, try to classify it like an accountant in nature attribute.
"""

# System prompt for delivery-note extraction; this is the one sent as the
# system message by extract_json_from_doc().
SYSTEM_PROMPT2 = """From this delivery note document, extract the following information by following these instructions.
    The information is in French. Return the information in JSON format according to the schema.
    The details of each piece of information to be extracted are found in the description field of each item.
"""

# Required

# Code that uniquely identifies the feed
# Feed type, one of: forage, compound feed, raw materials.
# Name or company name of the party responsible for labelling
# Address of the party responsible for labelling, to be completed with street / city / postal code
# Packaging used to sell the product: bulk or non-bulk. If bulk: net quantity delivered, by weight (kg or tonne) or volume (litre), unit to be specified. If non-bulk: quantity of feed in one sales unit, excluding the weight of the packaging = net content
# Country of origin or geographical area, as an ISO code. If several origins are mixed, the origin that encompasses all of them is used. For example, a batch of wheat containing 97% wheat of French origin and 3% wheat of EU origin is considered to be of EU origin
# Whether the feed is organic or not
# Whether the feed is GMO-free (<0.9%) or not


# Optional (try passing the optional fields so that it can invent them)
# + moisture content


# Named JSON-schema descriptor (name / schema_definition / description / strict).
# NOTE(review): despite the "PurchaseInvoice" name, the schema body describes
# a list of explanation steps plus a final answer (title "MathDemonstration"),
# so it looks like a placeholder. In the visible code it is only mentioned in
# a commented-out response_format option - confirm before relying on it.
JSON_SCHEMA = {
    "name": "PurchaseInvoice",
    "schema_definition": {
        "$defs": {
            "Explanation": {
                "properties": {
                    "explanation": {"title": "Explanation", "type": "string"},
                    "output": {"title": "Output", "type": "string"},
                },
                "required": ["explanation", "output"],
                "title": "Explanation",
                "type": "object",
                "additionalProperties": False,
            },
        },
        "properties": {
            "steps": {
                "items": {"$ref": "#/$defs/Explanation"},
                "title": "Steps",
                "type": "array",
            },
            "final_answer": {"title": "Final Answer", "type": "string"},
        },
        "required": ["steps", "final_answer"],
        "title": "MathDemonstration",
        "type": "object",
        "additionalProperties": False,
    },
    "description": None,
    "strict": True,
}

def upload_pdf(client, content, filename):
    """
    Uploads a PDF to Mistral's API and retrieves a signed URL for processing.

    The bytes are handed to the client directly rather than being staged in a
    temporary file. ``filename`` is therefore only used as the name of the
    upload and never as a local filesystem path, so names containing path
    separators can neither fail nor write outside a temporary directory.

    Args:
        client (Mistral): Mistral API client instance.
        content (bytes): The content of the PDF file.
        filename (str): The name of the PDF file.

    Returns:
        str: Signed URL for the uploaded PDF.
    """
    file_upload = client.files.upload(
        file={"file_name": filename, "content": content},
        purpose="ocr",
    )

    # The signed URL is what the OCR / chat endpoints receive as document_url.
    signed_url = client.files.get_signed_url(file_id=file_upload.id)
    return signed_url.url

def extract_json_from_doc(
    client,
    document_source,
    *,
    model="mistral-small-latest",
    user_prompt="what is the last sentence in the document",
):
    """
    Asks a Mistral chat model to answer about a document in JSON mode.

    The request is a chat completion (not the OCR endpoint): SYSTEM_PROMPT2 is
    sent as the system message, followed by a user message made of
    ``user_prompt`` and the document chunk.

    Args:
        client (Mistral): Mistral API client instance.
        document_source (dict): The document chunk to attach to the user
            message (URL or image).
        model (str): Chat model to call. Defaults to the previously
            hard-coded "mistral-small-latest".
        user_prompt (str): Text part of the user message. Defaults to the
            previously hard-coded question.

    Returns:
        The ``content`` of the first completion choice, as returned by the
        client. With the ``json_object`` response format this is expected to
        be a JSON-encoded string rather than a parsed dict.
    """
    import logging

    logger = logging.getLogger(__name__)

    # NOTE(review): the default user prompt asks for the last sentence of the
    # document, which does not match the extraction task described in the
    # system prompt - it looks like a leftover from an API example. Confirm
    # the intended prompt and pass it via ``user_prompt``.
    messages = [
        {
            "role": "system",
            "content": SYSTEM_PROMPT2,
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": user_prompt,
                },
                document_source,
            ],
        },
    ]

    # The request payload is deliberately not logged: it can carry a whole
    # base64-encoded image or a signed document URL.
    logger.debug("Requesting JSON extraction (model=%s)", model)

    chat_response = client.chat.complete(
        model=model,
        messages=messages,
        # Plain JSON mode; the schema-constrained variant built from
        # JSON_SCHEMA is not enabled.
        response_format={"type": "json_object"},
    )

    content = chat_response.choices[0].message.content
    logger.debug("JSON extraction response: %s", content)

    return content

def process_ocr(client, document_source):
    """
    Sends a document to the client's OCR endpoint and returns the result.

    Args:
        client (Mistral): Mistral API client instance.
        document_source (dict): The source of the document (URL or image).

    Returns:
        OCRResponse: Whatever ``client.ocr.process`` returns for the request.
    """
    # The request always uses the "latest" OCR model alias and asks for
    # embedded images to be returned as base64.
    ocr_request = {
        "model": "mistral-ocr-latest",
        "document": document_source,
        "include_image_base64": True,
    }
    return client.ocr.process(**ocr_request)

def display_pdf(content: bytes):
    """
    Shows a PDF in the page as one image per page.

    If conversion or rendering raises, an error message is displayed together
    with a button that lets the user download the raw PDF instead.

    Args:
        content (bytes): The raw bytes of the PDF file.
    """
    try:
        for i, image in enumerate(convert_from_bytes(content)):
            caption = f"Page {i+1}"
            st.image(image, caption=caption, use_container_width=True)
    except Exception as e:
        # Best-effort preview: report the problem and fall back to a download.
        message = f"Impossible d'afficher le PDF : {e}"
        st.error(message)
        fallback_download = {
            "label": "📥 Télécharger le PDF",
            "data": content,
            "file_name": "document.pdf",
            "mime": "application/pdf",
        }
        st.download_button(**fallback_download)

def _select_document_source(client):
    """
    Renders the input widgets and returns a deferred document-source builder.

    Args:
        client (Mistral): Mistral API client instance, used for PDF uploads.

    Returns:
        callable | None: A zero-argument function producing the
        ``document_source`` dict for the Mistral helpers, or ``None`` while
        the user has not provided a document. The builder is deferred so that
        the PDF upload only happens when an action button is clicked, not on
        every script rerun.
    """
    # Input method selection: URL, PDF Upload, or Image Upload
    input_method = st.radio("Format de la facture:", ["URL", "PDF", "Image"])

    if input_method == "URL":
        url = st.text_input("Document URL:")
        if not url:
            return None
        return lambda: {"type": "document_url", "document_url": url}

    if input_method == "PDF":
        uploaded_file = st.file_uploader("Choisissez un PDF", type=["pdf"])
        if not uploaded_file:
            return None

        # getvalue() returns the whole buffer regardless of the current
        # stream position, unlike read().
        content = uploaded_file.getvalue()

        # Display the uploaded PDF
        display_pdf(content)

        return lambda: {
            "type": "document_url",
            "document_url": upload_pdf(client, content, uploaded_file.name),
        }

    # Remaining choice: image upload
    uploaded_image = st.file_uploader("Choisissez une image", type=["png", "jpg", "jpeg"])
    if not uploaded_image:
        return None

    # Display the uploaded image
    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Send the original bytes with their own MIME type instead of re-encoding
    # to PNG, which fails for image modes PNG cannot store (e.g. CMYK JPEGs).
    img_str = base64.b64encode(uploaded_image.getvalue()).decode()
    mime_type = uploaded_image.type or "image/png"
    data_url = f"data:{mime_type};base64,{img_str}"
    return lambda: {"type": "image_url", "image_url": data_url}


def _render_ocr_result(ocr_response):
    """
    Displays OCR pages as Markdown, with download buttons and the raw response.

    Args:
        ocr_response (OCRResponse): OCR result exposing a non-empty ``pages``
            list (each page having a ``markdown`` attribute) and
            ``model_dump()``.
    """
    pages = ocr_response.pages

    # Combine extracted text from all pages into one string
    extracted_content = "\n\n".join(
        f"**Page {i+1}**\n{page.markdown}" for i, page in enumerate(pages)
    )

    # Display extracted content in Markdown format
    st.subheader("Extracted Content")
    st.markdown(extracted_content)

    # Same content without the bold page headings
    plain_text_content = "\n\n".join(
        f"Page {i+1}\n{page.markdown}" for i, page in enumerate(pages)
    )

    # NOTE(review): these widgets are created inside a button-click branch;
    # presumably clicking a download button reruns the script without
    # re-entering that branch, making the results disappear - confirm, and
    # persist the result in st.session_state if so.
    col1, col2 = st.columns(2)
    with col1:
        st.download_button(
            label="Download Text",
            data=plain_text_content,
            file_name="extracted_content.txt",
            mime="text/plain"
        )
    with col2:
        st.download_button(
            label="Download Markdown",
            data=extracted_content,
            file_name="extracted_content.md",
            mime="text/markdown"
        )

    # Optional: Show raw response for debugging purposes
    with st.expander("Réponse API"):
        st.json(ocr_response.model_dump())


def main():
    """
    Main function to run the Streamlit app.

    Resolves the API key (environment first, sidebar input otherwise), lets
    the user pick a document (URL, PDF or image) and offers two actions: JSON
    extraction through the chat model and Markdown extraction through OCR.
    Errors raised while preparing or processing the document are shown with
    ``st.error`` instead of crashing the page.
    """
    # Sidebar: only ask for the key when the environment does not provide it
    api_key = MISTRAL_API_KEY or st.sidebar.text_input("Mistral API Key", type="password")

    if not api_key:
        st.warning("Enter API key to continue")
        return

    # Initialize Mistral API client
    client = Mistral(api_key=api_key)

    # Main app interface
    st.header("OCR Facture avec Mistral")

    build_document_source = _select_document_source(client)
    if build_document_source is None:
        # Nothing to process yet, so the action buttons are not shown.
        return

    if st.button("Générer les données au format JSON"):
        with st.spinner("Extracting JSON content..."):
            try:
                # Building the source here keeps any PDF upload failure inside
                # the error handling below.
                json_response = extract_json_from_doc(client, build_document_source())

                with st.expander("Response"):
                    st.json(json_response)

            except Exception as e:
                # Display an error message if processing fails
                st.error(f"Processing error: {str(e)}")

    if st.button("Générer un Document"):
        with st.spinner("Extracting content..."):
            try:
                ocr_response = process_ocr(client, build_document_source())

                if ocr_response and ocr_response.pages:
                    _render_ocr_result(ocr_response)
                else:
                    st.warning("No content extracted.")

            except Exception as e:
                # Display an error message if processing fails
                st.error(f"Processing error: {str(e)}")


if __name__ == "__main__":
    main()