Spaces:

gaia-space
/

mistral_ocr

Sleeping

App Files Files Community

mistral_ocr / src /streamlit_app.py

cdupland

Display PDF page per page

5adde2d verified 4 months ago

raw

history blame contribute delete

11.8 kB

	import streamlit as st
	import streamlit.components.v1 as components
	import base64
	import tempfile
	import os
	from mistralai import Mistral
	from PIL import Image
	import io
	from dotenv import load_dotenv
	from pdf2image import convert_from_bytes

	# Page configuration - MUST be the first Streamlit command executed
	st.set_page_config(page_title="OCR Facture avec Mistral", layout="wide")


	# Presumably loads variables from a local .env file into the environment
	# so that the lookup below can find the key during local development.
	load_dotenv()

	# API key taken from the environment; None when the variable is not set,
	# in which case main() asks for the key in the sidebar instead.
	MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

	# NOTE(review): this is a bare string expression, not a module docstring
	# (other statements precede it). Streamlit's "magic" feature presumably
	# renders it as Markdown at the top of the page - confirm. The text still
	# contains boilerplate from the Streamlit starter template.
	"""
	![image](https://www.osfarm.org/assets/img/logo_white.png)

	# Welcome to Gaia OCR Template by OSFarm!

	Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
	If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
	forums](https://discuss.streamlit.io).

	In the meantime, below is an example of what you can do with just a few lines of code:
	"""

	# System message sent by extract_json_from_doc: it tells the model to pull
	# supplier, invoice and line-item data out of a French purchase invoice and
	# to answer in JSON.
	# NOTE(review): the prompt refers to "the schema", but the visible request
	# does not pass one (JSON_SCHEMA is commented out of response_format).
	SYSTEM_PROMPT = """From the user prompt coming from purchase invoice below, extract informations strictly as instructed.
	Most of the time, the pattern of a purchase invoice is composed of supplier informations, invoice informations and one or many invoice lines.
	Information come from France in french language.
	Return the purchase informations in JSON format like an API according to the schema.
	Do not return 'description', 'type' or 'format' attributes in the response.
	Use it only to detect correct value of each attributes.
	example of a response : { supplier: { name: "AXA", address: "10 rue du Bouil bleu", postal_code: "17250", ... }, invoice: {number: "FA25632", ... }, items: [{number: '1', ... }, {number: '2', ... }, ...]}.
	for the items, try to detect the role of the item in 'merchandise' or 'service' in role attribute.
	for all the date, try to convert it in the following format : 'DD/MM/YYYY'.
	for the items, try to classify it like an accountant in nature attribute.
	"""

	# Schema descriptor apparently intended for the "json_schema" entry of a chat
	# response_format. In the visible code its only reference is commented out
	# (see the response_format argument in extract_json_from_doc), so it is
	# currently unused.
	# NOTE(review): despite the name "PurchaseInvoice", the schema body describes
	# a "MathDemonstration" (a list of explanation steps plus a final answer) and
	# has no supplier / invoice / items fields. It looks like a placeholder that
	# must be replaced before being enabled - confirm with the author.
	JSON_SCHEMA = {
	    "name": "PurchaseInvoice",
	    "schema_definition": {
	        "$defs": {
	            # One reasoning step: free-text explanation and its output.
	            "Explanation": {
	                "properties": {
	                    "explanation": {
	                        "title": "Explanation",
	                        "type": "string",
	                    },
	                    "output": {"title": "Output", "type": "string"},
	                },
	                "required": ["explanation", "output"],
	                "title": "Explanation",
	                "type": "object",
	                "additionalProperties": False,
	            }
	        },
	        "properties": {
	            # Ordered list of Explanation objects.
	            "steps": {
	                "items": {"$ref": "#/$defs/Explanation"},
	                "title": "Steps",
	                "type": "array",
	            },
	            "final_answer": {"title": "Final Answer", "type": "string"},
	        },
	        "required": ["steps", "final_answer"],
	        "title": "MathDemonstration",
	        "type": "object",
	        "additionalProperties": False,
	    },
	    "description": None,
	    "strict": True
	}

	def upload_pdf(client, content, filename):
	    """
	    Upload a PDF through the Mistral client and return a signed URL for it.

	    The bytes are staged in a file inside a temporary directory, uploaded
	    with purpose "ocr", and the signed URL of the uploaded file is returned.

	    Args:
	        client (Mistral): Mistral API client instance.
	        content (bytes): The content of the PDF file.
	        filename (str): The name of the PDF file. Only its final path
	            component is used; an empty or degenerate name falls back to
	            "document.pdf".

	    Returns:
	        str: Signed URL for the uploaded PDF.
	    """
	    # The name originates from the upload widget, i.e. from the user. Keep
	    # only the last path component so that a separator or an absolute path
	    # cannot make the staged file land outside the temporary directory (or
	    # fail because an intermediate directory does not exist).
	    safe_name = os.path.basename(filename or "")
	    if safe_name in ("", ".", ".."):
	        safe_name = "document.pdf"

	    # The directory and its contents are deleted when the block exits, also
	    # on error, so the staged file needs no manual removal.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        temp_path = os.path.join(temp_dir, safe_name)

	        with open(temp_path, "wb") as tmp:
	            tmp.write(content)

	        with open(temp_path, "rb") as file_obj:
	            file_upload = client.files.upload(
	                file={"file_name": safe_name, "content": file_obj},
	                purpose="ocr",
	            )

	    signed_url = client.files.get_signed_url(file_id=file_upload.id)
	    return signed_url.url

	def extract_json_from_doc(client, document_source):
	    """
	    Ask a Mistral chat model for a JSON answer about a document.

	    Sends SYSTEM_PROMPT as the system message, followed by a user message
	    made of a text instruction and the document reference, and requests a
	    JSON-object response. Note that this goes through the chat-completion
	    endpoint, not the OCR endpoint.

	    Args:
	        client (Mistral): Mistral API client instance.
	        document_source (dict): The source of the document (URL or image),
	            passed through unchanged as a content chunk of the user message.

	    Returns:
	        str: The content of the first choice's message exactly as returned
	        by the API (expected to be JSON text because of the requested
	        response format). It is not parsed here.
	    """
	    # Specify model
	    model = "mistral-small-latest"

	    messages = [
	        {
	            "role": "system",
	            "content": SYSTEM_PROMPT,
	        },
	        {
	            "role": "user",
	            "content": [
	                {
	                    "type": "text",
	                    # NOTE(review): this instruction asks for the last
	                    # sentence of the document, which conflicts with the
	                    # invoice-extraction task set by SYSTEM_PROMPT. It looks
	                    # like a leftover from an API example - confirm and
	                    # replace with an extraction instruction.
	                    "text": "what is the last sentence in the document"
	                },
	                document_source
	            ]
	        }
	    ]

	    # The request and the response are deliberately not printed: they carry
	    # the document reference (signed URL or full base64 image) and the
	    # extracted invoice data, which should not end up in server logs.
	    chat_response = client.chat.complete(
	        model=model,
	        messages=messages,
	        # Plain JSON mode; JSON_SCHEMA is not passed to the API.
	        response_format={"type": "json_object"},
	    )

	    return chat_response.choices[0].message.content

	def process_ocr(client, document_source):
	    """
	    Run the Mistral OCR model on a document.

	    Args:
	        client (Mistral): Mistral API client instance.
	        document_source (dict): The source of the document (URL or image),
	            forwarded unchanged as the `document` argument.

	    Returns:
	        Whatever `client.ocr.process` returns for the request; images are
	        requested as base64 alongside the text.
	    """
	    ocr_request = {
	        "model": "mistral-ocr-latest",
	        "document": document_source,
	        "include_image_base64": True,
	    }
	    return client.ocr.process(**ocr_request)

	def display_pdf(content: bytes):
	    """
	    Show a PDF in the page, one image per PDF page.

	    Args:
	        content (bytes): Raw bytes of the PDF.

	    If the conversion or the rendering raises, an error message is shown
	    together with a button to download the PDF instead.
	    """
	    # NOTE(review): the original nesting was lost in this view; the download
	    # button is assumed to be the fallback of the error branch - confirm.
	    try:
	        page_images = convert_from_bytes(content)
	        for page_number, page_image in enumerate(page_images, start=1):
	            st.image(
	                page_image,
	                caption=f"Page {page_number}",
	                use_container_width=True,
	            )
	    except Exception as exc:
	        st.error(f"Impossible d'afficher le PDF : {exc}")
	        st.download_button(
	            label="📥 Télécharger le PDF",
	            data=content,
	            file_name="document.pdf",
	            mime="application/pdf",
	        )

	def _select_document_source(client):
	    """Render the input widgets and return the chosen document chunk, or None."""
	    # Input method selection: URL, PDF upload, or image upload
	    input_method = st.radio("Format de la facture:", ["URL", "PDF", "Image"])

	    if input_method == "URL":
	        url = st.text_input("Document URL:")
	        if not url:
	            return None
	        return {"type": "document_url", "document_url": url}

	    if input_method == "PDF":
	        uploaded_file = st.file_uploader("Choisissez un PDF", type=["pdf"])
	        if not uploaded_file:
	            return None

	        # getvalue() returns the whole buffer regardless of the current
	        # stream position, unlike read().
	        content = uploaded_file.getvalue()

	        # Display the uploaded PDF
	        display_pdf(content)

	        # NOTE(review): Streamlit re-executes the script on each interaction,
	        # so this upload presumably happens again on every rerun while a PDF
	        # is selected - consider caching the signed URL per file.
	        try:
	            signed_url = upload_pdf(client, content, uploaded_file.name)
	        except Exception as e:
	            # Report the failure in the page instead of letting the script
	            # abort with a traceback; without a URL nothing can be processed.
	            st.error(f"Upload error: {e}")
	            return None
	        return {"type": "document_url", "document_url": signed_url}

	    if input_method == "Image":
	        uploaded_image = st.file_uploader(
	            "Choisissez une image", type=["png", "jpg", "jpeg"]
	        )
	        if not uploaded_image:
	            return None

	        # Display the uploaded image
	        image = Image.open(uploaded_image)
	        st.image(image, caption="Uploaded Image", use_container_width=True)

	        # The image is re-encoded as PNG, which cannot store CMYK pixels
	        # (some JPEGs use them); convert those first so save() does not raise.
	        if image.mode == "CMYK":
	            image = image.convert("RGB")

	        # Convert image to a base64 data URL
	        buffered = io.BytesIO()
	        image.save(buffered, format="PNG")
	        img_str = base64.b64encode(buffered.getvalue()).decode()
	        return {
	            "type": "image_url",
	            "image_url": f"data:image/png;base64,{img_str}",
	        }

	    return None


	def _render_json_extraction(client, document_source):
	    """Run the JSON extraction and show the answer, or an error message."""
	    with st.spinner("Extracting JSON content..."):
	        try:
	            json_response = extract_json_from_doc(client, document_source)

	            with st.expander("Response"):
	                st.json(json_response)

	        except Exception as e:
	            # Display an error message if processing fails
	            st.error(f"Processing error: {str(e)}")


	def _render_ocr_document(client, document_source):
	    """Run OCR and show the extracted text with download buttons, or an error."""
	    with st.spinner("Extracting content..."):
	        try:
	            ocr_response = process_ocr(client, document_source)

	            if not (ocr_response and ocr_response.pages):
	                st.warning("No content extracted.")
	                return

	            # Combine the Markdown of all pages into one string
	            extracted_content = "\n\n".join(
	                f"Page {i+1}\n{page.markdown}"
	                for i, page in enumerate(ocr_response.pages)
	            )

	            # Display extracted content in Markdown format
	            st.subheader("Extracted Content")
	            st.markdown(extracted_content)

	            # Both downloads carry the same text; only the file name and the
	            # MIME type differ.
	            col1, col2 = st.columns(2)
	            with col1:
	                st.download_button(
	                    label="Download Text",
	                    data=extracted_content,
	                    file_name="extracted_content.txt",
	                    mime="text/plain",
	                )
	            with col2:
	                st.download_button(
	                    label="Download Markdown",
	                    data=extracted_content,
	                    file_name="extracted_content.md",
	                    mime="text/markdown",
	                )

	            # Optional: show raw response for debugging purposes
	            with st.expander("Réponse API"):
	                st.json(ocr_response.model_dump())

	        except Exception as e:
	            # Display an error message if processing fails
	            st.error(f"Processing error: {str(e)}")


	def main():
	    """
	    Main function to run the Streamlit app.

	    Obtains the API key (environment first, sidebar input otherwise), lets
	    the user pick a document by URL, PDF upload or image upload, and offers
	    two actions on it: JSON extraction through the chat model and text
	    extraction through the OCR model.
	    """
	    # Sidebar: ask for the key only when the environment did not provide one
	    api_key = MISTRAL_API_KEY or st.sidebar.text_input(
	        "Mistral API Key", type="password"
	    )

	    if not api_key:
	        st.warning("Enter API key to continue")
	        return

	    # Initialize Mistral API client
	    client = Mistral(api_key=api_key)

	    # Main app interface
	    st.header("OCR Facture avec Mistral")

	    document_source = _select_document_source(client)

	    # The action buttons are only rendered once a document is available.
	    if document_source and st.button("Générer les données au format JSON"):
	        _render_json_extraction(client, document_source)

	    if document_source and st.button("Générer un Document"):
	        _render_ocr_document(client, document_source)

	# Entry point: build the page when this file is run as the main module.
	if __name__ == "__main__":
	    main()