import base64
import hashlib
import io
import logging
import os
import tempfile

import streamlit as st
import streamlit.components.v1 as components
from dotenv import load_dotenv
from mistralai import Mistral
from pdf2image import convert_from_bytes
from PIL import Image

# Page configuration - MUST be the first Streamlit command.
st.set_page_config(page_title="OCR Facture avec Mistral", layout="wide")

load_dotenv()
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

logger = logging.getLogger(__name__)

# Key under which signed URLs of already-uploaded PDFs are kept in
# st.session_state, so a Streamlit rerun does not upload the same file again.
_SIGNED_URL_CACHE_KEY = "signed_pdf_urls"

# NOTE(review): this is a bare string expression, not a module docstring (it
# does not come first in the file). With Streamlit's "magic" feature enabled it
# is presumably rendered as Markdown on the page - confirm before removing.
"""
![image](https://www.osfarm.org/assets/img/logo_white.png)

# Welcome to Gaia OCR Template by OSFarm!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# System prompt for purchase invoices.
# NOTE(review): not referenced by any function in this file; only
# SYSTEM_PROMPT2 is sent to the model.
SYSTEM_PROMPT = """From the user prompt coming from purchase invoice below, extract informations strictly as instructed.
Most of the time, the pattern of a purchase invoice is composed of supplier informations, invoice informations and one or many invoice lines.
Information come from France in french language.
Return the purchase informations in JSON format like an API according to the schema.
Do not return 'description', 'type' or 'format' attributes in the response.
Use it only to detect correct value of each attributes.
example of a response : { supplier: { name: "AXA", address: "10 rue du Bouil bleu", postal_code: "17250", ... }, invoice: {number: "FA25632", ... }, items: [{number: '1', ... }, {number: '2', ... }, ...]}.
for the items, try to detect the role of the item in 'merchandise' or 'service' in role attribute.
for all the date, try to convert it in the following format : 'DD/MM/YYYY'.
for the items, try to classify it like an accountant in nature attribute.
"""

# System prompt for delivery notes; this is the one used by
# extract_json_from_doc().
SYSTEM_PROMPT2 = """From this delivery note document, extract the following information by following these instructions.
The information is in French.
Return the information in JSON format according to the schema.
The details of each piece of information to be extracted are found in the description field of each item.
"""

# Fields to extract (notes for the future schema).
#
# Required
# - Code that uniquely identifies the feed.
# - Feed type, one of: forage, compound feed, raw materials.
# - Name or company name of the party responsible for labelling.
# - Address of the party responsible for labelling; to be completed with
#   street / city / postal code.
# - Packaging used to sell the product: bulk or non-bulk. If bulk: net
#   quantity delivered, as weight (kg or tonne) or volume (litre), unit to be
#   specified. If non-bulk: quantity of feed in one sales unit, excluding the
#   weight of the packaging (= net content).
# - Country of origin or geographical zone as an ISO code. If several origins
#   are mixed, the origin that encompasses all of them is used. For example, a
#   batch of wheat made of 97% French wheat and 3% EU wheat is considered to
#   be of EU origin.
# - Whether the feed is organic or not.
# - Whether the feed is GMO-free (<0.9%) or not.
#
# Optional (try passing the optional fields so that it can invent them)
# - + water content.

# NOTE(review): apart from "name", this content is a generic step-by-step
# explanation schema (title "MathDemonstration"), not an invoice or delivery
# note schema, and it is not currently passed to the API (see the commented-out
# "json_schema" entry in extract_json_from_doc). Looks like a placeholder.
JSON_SCHEMA = {
    "name": "PurchaseInvoice",
    "schema_definition": {
        "$defs": {
            "Explanation": {
                "properties": {
                    "explanation": {
                        "title": "Explanation",
                        "type": "string",
                    },
                    "output": {"title": "Output", "type": "string"},
                },
                "required": ["explanation", "output"],
                "title": "Explanation",
                "type": "object",
                "additionalProperties": False,
            }
        },
        "properties": {
            "steps": {
                "items": {"$ref": "#/$defs/Explanation"},
                "title": "Steps",
                "type": "array",
            },
            "final_answer": {"title": "Final Answer", "type": "string"},
        },
        "required": ["steps", "final_answer"],
        "title": "MathDemonstration",
        "type": "object",
        "additionalProperties": False,
    },
    "description": None,
    "strict": True,
}


def upload_pdf(client, content, filename):
    """
    Uploads a PDF to Mistral's API and retrieves a signed URL for processing.

    The bytes are first written to a file inside a temporary directory, and
    that file is handed to the upload call. The directory (and the file in
    it) is removed when the function exits, whether or not the upload
    succeeded.

    Args:
        client (Mistral): Mistral API client instance.
        content (bytes): The content of the PDF file.
        filename (str): The name of the PDF file.

    Returns:
        str: Signed URL for the uploaded PDF.
    """
    # Only the final path component is used on disk, so a name containing
    # directory separators cannot point outside the temporary directory.
    safe_name = os.path.basename(filename) or "document.pdf"

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = os.path.join(temp_dir, safe_name)
        with open(temp_path, "wb") as tmp:
            tmp.write(content)

        with open(temp_path, "rb") as file_obj:
            file_upload = client.files.upload(
                file={"file_name": filename, "content": file_obj},
                purpose="ocr",
            )
        signed_url = client.files.get_signed_url(file_id=file_upload.id)
        return signed_url.url


def extract_json_from_doc(client, document_source):
    """
    Asks a Mistral chat model for a JSON answer about a document.

    Sends SYSTEM_PROMPT2 as the system message, and a user message made of a
    text part plus ``document_source``, requesting a JSON-object response.

    Args:
        client (Mistral): Mistral API client instance.
        document_source (dict): The source of the document (URL or image).

    Returns:
        The ``content`` of the first choice's message, as returned by the API
        (the JSON is not parsed here).
    """
    # Specify model
    model = "mistral-small-latest"

    messages = [
        {
            "role": "system",
            "content": SYSTEM_PROMPT2,
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    # NOTE(review): this question does not match the
                    # extraction task described by the system prompt; it looks
                    # like a leftover from an API example - confirm intent.
                    "text": "what is the last sentence in the document",
                },
                document_source,
            ],
        },
    ]

    # Debug-level logging instead of print(): the request may embed the whole
    # document as base64, which should not be dumped to stdout by default.
    logger.debug("Chat request messages: %s", messages)

    chat_response = client.chat.complete(
        model=model,
        messages=messages,
        response_format={
            "type": "json_object"
            # , "json_schema": JSON_SCHEMA
        },
    )

    answer = chat_response.choices[0].message.content
    logger.debug("Chat response content: %s", answer)
    return answer


def process_ocr(client, document_source):
    """
    Processes a document using Mistral's OCR API.

    Args:
        client (Mistral): Mistral API client instance.
        document_source (dict): The source of the document (URL or image).

    Returns:
        OCRResponse: The response from Mistral's OCR API.
    """
    return client.ocr.process(
        model="mistral-ocr-latest",
        document=document_source,
        include_image_base64=True,
    )


def display_pdf(content: bytes):
    """
    Renders every page of a PDF as an image in the Streamlit page.

    Args:
        content (bytes): The raw PDF bytes.

    If the conversion or rendering fails, an error message is shown together
    with a button to download the PDF instead.
    """
    try:
        images = convert_from_bytes(content)
        for i, image in enumerate(images):
            st.image(image, caption=f"Page {i+1}", use_container_width=True)
    except Exception as e:
        st.error(f"Impossible d'afficher le PDF : {e}")
        # NOTE(review): the original indentation was lost; the download button
        # is assumed to belong to this failure fallback - confirm.
        st.download_button(
            label="📥 Télécharger le PDF",
            data=content,
            file_name="document.pdf",
            mime="application/pdf",
        )


def _signed_url_for_pdf(client, content, filename):
    """Return a signed URL for the PDF, uploading it at most once per session."""
    if _SIGNED_URL_CACHE_KEY not in st.session_state:
        st.session_state[_SIGNED_URL_CACHE_KEY] = {}
    cache = st.session_state[_SIGNED_URL_CACHE_KEY]

    # Keyed by content so that re-selecting the same file reuses the upload,
    # while a different file with the same name does not.
    digest = hashlib.sha256(content).hexdigest()
    if digest not in cache:
        cache[digest] = upload_pdf(client, content, filename)
    return cache[digest]


def _image_to_data_uri(image):
    """Encode a PIL image as a base64 PNG ``data:`` URI."""
    # PNG cannot store CMYK pixels (possible with JPEG uploads); convert first
    # so that saving does not fail.
    if image.mode == "CMYK":
        image = image.convert("RGB")
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"


def _select_document_source(client):
    """Render the input widgets and return the chosen document source, or None."""
    # Input method selection: URL, PDF Upload, or Image Upload
    input_method = st.radio("Format de la facture:", ["URL", "PDF", "Image"])

    if input_method == "URL":
        # Handle document URL input
        url = st.text_input("Document URL:")
        if url:
            return {"type": "document_url", "document_url": url}
        return None

    if input_method == "PDF":
        # Handle PDF file upload
        uploaded_file = st.file_uploader("Choisissez un PDF", type=["pdf"])
        if not uploaded_file:
            return None
        content = uploaded_file.read()

        # Display the uploaded PDF
        display_pdf(content)

        # Prepare document source for OCR processing. This code runs on every
        # Streamlit rerun, hence the cached upload.
        try:
            signed_url = _signed_url_for_pdf(client, content, uploaded_file.name)
        except Exception as e:
            st.error(f"Processing error: {str(e)}")
            return None
        return {"type": "document_url", "document_url": signed_url}

    if input_method == "Image":
        # Handle image file upload
        uploaded_image = st.file_uploader(
            "Choisissez une image", type=["png", "jpg", "jpeg"]
        )
        if not uploaded_image:
            return None

        # Display the uploaded image
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_container_width=True)

        # Prepare document source for OCR processing
        return {"type": "image_url", "image_url": _image_to_data_uri(image)}

    return None


def _run_json_extraction(client, document_source):
    """Call the chat model on the document and show its JSON answer."""
    with st.spinner("Extracting JSON content..."):
        try:
            ocr_response = extract_json_from_doc(client, document_source)
            with st.expander("Response"):
                st.json(ocr_response)
        except Exception as e:
            # Display an error message if processing fails
            st.error(f"Processing error: {str(e)}")


def _run_ocr(client, document_source):
    """Run OCR on the document, then show and offer the extracted text."""
    with st.spinner("Extracting content..."):
        try:
            ocr_response = process_ocr(client, document_source)

            if ocr_response and ocr_response.pages:
                # Combine extracted text from all pages into one string
                extracted_content = "\n\n".join(
                    f"**Page {i+1}**\n{page.markdown}"
                    for i, page in enumerate(ocr_response.pages)
                )

                # Display extracted content in Markdown format
                st.subheader("Extracted Content")
                st.markdown(extracted_content)

                # Prepare plain text version (same text, page titles not bold)
                plain_text_content = "\n\n".join(
                    f"Page {i+1}\n{page.markdown}"
                    for i, page in enumerate(ocr_response.pages)
                )

                # Add download buttons for both text and Markdown formats
                col1, col2 = st.columns(2)
                with col1:
                    st.download_button(
                        label="Download Text",
                        data=plain_text_content,
                        file_name="extracted_content.txt",
                        mime="text/plain",
                    )
                with col2:
                    st.download_button(
                        label="Download Markdown",
                        data=extracted_content,
                        file_name="extracted_content.md",
                        mime="text/markdown",
                    )

                # Optional: Show raw response for debugging purposes
                with st.expander("Réponse API"):
                    st.json(ocr_response.model_dump())
            else:
                st.warning("No content extracted.")

        except Exception as e:
            # Display an error message if processing fails
            st.error(f"Processing error: {str(e)}")


def main():
    """
    Main function to run the Streamlit app.

    Resolves the Mistral API key (environment first, sidebar input otherwise),
    lets the user pick a document (URL, PDF or image), and offers two actions
    on it: a JSON answer from the chat model, and an OCR extraction.
    """
    # Sidebar: Authentication for Mistral API
    if not MISTRAL_API_KEY:
        api_key = st.sidebar.text_input("Mistral API Key", type="password")
    else:
        api_key = MISTRAL_API_KEY

    if not api_key:
        st.warning("Enter API key to continue")
        return

    # Initialize Mistral API client
    client = Mistral(api_key=api_key)

    # Main app interface
    st.header("OCR Facture avec Mistral")

    document_source = _select_document_source(client)

    # Each button is only drawn once a document is available. The first
    # action's output is rendered before the second button, as in the
    # original layout.
    if document_source and st.button("Générer les données au format JSON"):
        _run_json_extraction(client, document_source)

    if document_source and st.button("Générer un Document"):
        _run_ocr(client, document_source)


if __name__ == "__main__":
    main()