Spaces:
Sleeping
Sleeping
File size: 13,196 Bytes
5f01f50 5adde2d e05e6ee 5adde2d e05e6ee 6b989e1 e05e6ee 5f01f50 e05e6ee 5f01f50 e05e6ee 6169f7e 5dae67f 20f2ffe 5dae67f 20f2ffe 5dae67f 20f2ffe e05e6ee 5dae67f e05e6ee 5adde2d e05e6ee 6b989e1 e05e6ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 |
import streamlit as st
import streamlit.components.v1 as components
import base64
import tempfile
import os
from mistralai import Mistral
from PIL import Image
import io
from dotenv import load_dotenv
from pdf2image import convert_from_bytes
# Page configuration -- this MUST be the first Streamlit command executed.
st.set_page_config(
    layout="wide",
    page_title="OCR Facture avec Mistral",
)

# Call load_dotenv() before looking up the API key in the process environment.
load_dotenv()
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
# Bare string expression: it is not assigned to any name.
# NOTE(review): under Streamlit's "magic" feature a standalone literal like
# this is presumably rendered on the page as Markdown -- confirm before
# editing or removing it, since that would change what the app displays.
"""

# Welcome to Gaia OCR Template by OSFarm!
Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).
In the meantime, below is an example of what you can do with just a few lines of code:
"""
# System prompt describing how to extract supplier, invoice and line-item
# data from a French purchase invoice and return it as JSON.
# NOTE(review): no function visible in this file reads SYSTEM_PROMPT
# (extract_json_from_doc sends SYSTEM_PROMPT2) -- confirm whether it is still
# needed. The text is sent verbatim to the model, so edit it with care.
SYSTEM_PROMPT = """From the user prompt coming from purchase invoice below, extract informations strictly as instructed.
Most of the time, the pattern of a purchase invoice is composed of supplier informations, invoice informations and one or many invoice lines.
Information come from France in french language.
Return the purchase informations in JSON format like an API according to the schema.
Do not return 'description', 'type' or 'format' attributes in the response.
Use it only to detect correct value of each attributes.
example of a response : { supplier: { name: "AXA", address: "10 rue du Bouil bleu", postal_code: "17250", ... }, invoice: {number: "FA25632", ... }, items: [{number: '1', ... }, {number: '2', ... }, ...]}.
for the items, try to detect the role of the item in 'merchandise' or 'service' in role attribute.
for all the date, try to convert it in the following format : 'DD/MM/YYYY'.
for the items, try to classify it like an accountant in nature attribute.
"""
# System prompt for delivery-note extraction; extract_json_from_doc sends it
# as the "system" message of its chat request.
# NOTE(review): the prompt refers to "the schema" and to per-item
# "description" fields, but the request built in extract_json_from_doc does
# not attach a schema (the json_schema option there is commented out).
SYSTEM_PROMPT2 = """From this delivery note document, extract the following information by following these instructions.
The information is in French. Return the information in JSON format according to the schema.
The details of each piece of information to be extracted are found in the description field of each item.
"""
# Required
# Code that uniquely identifies the feed
# Feed type, one of: forages, compound feeds, raw materials.
# Name or company name of the party responsible for labelling
# Address of the party responsible for labelling, to be completed with street / city / postal code
# Packaging used to sell the product: bulk or non-bulk. If bulk: net quantity delivered, by weight (kg or tonne) or by volume (litre), unit to be specified. If non-bulk: quantity of feed in one sales unit, excluding the weight of the packaging (= net content)
# Country of origin or geographical area, as an ISO code. If several origins are mixed, the origin retained is the one that encompasses all of them. For example, a batch of wheat containing 97% wheat of French origin and 3% wheat of EU origin is considered to be of EU origin
# Indicates whether the feed is organic or not
# Indicates whether the feed is GMO-free (<0.9%) or not
# Optional (try passing the optional fields so that it can make them up)
# + moisture content
# Schema payload assembled from its parts, innermost first.
# NOTE(review): the payload is named "PurchaseInvoice" but the schema it
# carries is titled "MathDemonstration" (steps / final_answer) -- presumably a
# placeholder. In the visible code it is only mentioned in a commented-out
# response_format option.

# One entry of the "steps" array: an explanation plus its output.
_EXPLANATION_DEF = {
    "properties": {
        "explanation": {"title": "Explanation", "type": "string"},
        "output": {"title": "Output", "type": "string"},
    },
    "required": ["explanation", "output"],
    "title": "Explanation",
    "type": "object",
    "additionalProperties": False,
}

# Top-level object: a list of steps followed by a final answer.
_SCHEMA_DEFINITION = {
    "$defs": {"Explanation": _EXPLANATION_DEF},
    "properties": {
        "steps": {
            "items": {"$ref": "#/$defs/Explanation"},
            "title": "Steps",
            "type": "array",
        },
        "final_answer": {"title": "Final Answer", "type": "string"},
    },
    "required": ["steps", "final_answer"],
    "title": "MathDemonstration",
    "type": "object",
    "additionalProperties": False,
}

JSON_SCHEMA = dict(
    name="PurchaseInvoice",
    schema_definition=_SCHEMA_DEFINITION,
    description=None,
    strict=True,
)
def upload_pdf(client, content, filename):
    """
    Upload a PDF through the client's file API and return a signed URL for it.

    The bytes are written to a file inside a fresh temporary directory, that
    file is passed to ``client.files.upload`` with ``purpose="ocr"``, and a
    signed URL is then requested for the uploaded file id. The temporary
    directory (and the file in it) is removed when the function returns or
    raises.

    Args:
        client (Mistral): Mistral API client instance.
        content (bytes): The content of the PDF file.
        filename (str): The name of the PDF file. Only its final path
            component is used, so names such as ``"../x.pdf"`` or
            ``"/abs/x.pdf"`` cannot place the temporary file outside the
            temporary directory. An empty or unusable name falls back to
            ``"document.pdf"``.

    Returns:
        str: Signed URL for the uploaded PDF.
    """
    # The name is supplied by the caller (ultimately by whoever uploaded the
    # file), so strip any directory part -- with either separator -- before
    # using it to build a path.
    safe_name = os.path.basename((filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "document.pdf"

    # TemporaryDirectory deletes its whole tree on exit, so no explicit
    # os.remove() of the file is needed.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = os.path.join(temp_dir, safe_name)
        with open(temp_path, "wb") as tmp:
            tmp.write(content)

        with open(temp_path, "rb") as file_obj:
            file_upload = client.files.upload(
                file={"file_name": safe_name, "content": file_obj},
                purpose="ocr",
            )

        signed_url = client.files.get_signed_url(file_id=file_upload.id)
        return signed_url.url
def extract_json_from_doc(client, document_source, *, model="mistral-small-latest"):
    """
    Ask a Mistral chat model for a JSON answer about a document.

    Sends SYSTEM_PROMPT2 as the system message and a user message made of a
    fixed text question plus ``document_source``, with the response format
    set to ``json_object``.

    Args:
        client (Mistral): Mistral API client instance.
        document_source (dict): The source of the document (URL or image),
            passed through unchanged as a content part of the user message.
        model (str): Name of the chat model to call. Keyword-only; defaults
            to ``"mistral-small-latest"``.

    Returns:
        The ``message.content`` of the first choice in the response, returned
        as-is (no JSON parsing is done here). With the ``json_object``
        response format this is presumably a JSON-encoded string.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT2},
        {
            "role": "user",
            "content": [
                # NOTE(review): this question looks like a leftover example;
                # it does not mention the extraction described by the system
                # prompt -- confirm the intended user instruction.
                {
                    "type": "text",
                    "text": "what is the last sentence in the document",
                },
                document_source,
            ],
        },
    ]

    # The request and response are deliberately not printed: the request can
    # embed a whole base64-encoded image or a signed file URL.
    chat_response = client.chat.complete(
        model=model,
        messages=messages,
        # Only the generic JSON mode is requested; JSON_SCHEMA is not passed.
        response_format={"type": "json_object"},
    )
    return chat_response.choices[0].message.content
def process_ocr(client, document_source):
    """
    Run the client's OCR endpoint on a document.

    Args:
        client (Mistral): Mistral API client instance.
        document_source (dict): The source of the document (URL or image).

    Returns:
        OCRResponse: Whatever ``client.ocr.process`` returns for the request.
    """
    # Fixed request options: latest OCR model, with base64 image data
    # requested in the response.
    ocr_request = {
        "model": "mistral-ocr-latest",
        "document": document_source,
        "include_image_base64": True,
    }
    return client.ocr.process(**ocr_request)
# Show a PDF in the Streamlit page: the bytes are converted to images with
# convert_from_bytes and each one is displayed with a "Page N" caption
# (N starting at 1). Any exception raised while doing so is reported with
# st.error instead of propagating.
#
# content: raw bytes of the PDF; also used as the payload of the download
#          button below.
#
# NOTE(review): indentation is lost in this copy of the file, so it cannot be
# told from here whether the download button is part of the except branch
# (a fallback when rendering fails) or is always shown -- confirm against the
# original before restructuring.
def display_pdf(content: bytes):
try:
images = convert_from_bytes(content)
for i, image in enumerate(images):
st.image(image, caption=f"Page {i+1}", use_container_width=True)
# Broad catch: any conversion or display failure becomes an on-page error.
except Exception as e:
st.error(f"Impossible d'afficher le PDF : {e}")
st.download_button(
label="📥 Télécharger le PDF",
data=content,
file_name="document.pdf",
mime="application/pdf"
)
def _resolve_api_key():
    """Return the API key from MISTRAL_API_KEY, else from a sidebar password input."""
    if MISTRAL_API_KEY:
        return MISTRAL_API_KEY
    return st.sidebar.text_input("Mistral API Key", type="password")


def _source_from_url():
    """Ask for a document URL; return a ``document_url`` source, or None if empty."""
    url = st.text_input("Document URL:")
    if not url:
        return None
    return {"type": "document_url", "document_url": url}


def _source_from_pdf(client):
    """Ask for a PDF, preview it, upload it; return a ``document_url`` source or None."""
    uploaded_file = st.file_uploader("Choisissez un PDF", type=["pdf"])
    if not uploaded_file:
        return None

    content = uploaded_file.read()
    display_pdf(content)

    # NOTE(review): Streamlit re-executes the script on each interaction, so
    # this upload presumably runs again on every rerun while a PDF is
    # selected -- consider caching the signed URL if that proves costly.
    try:
        signed_url = upload_pdf(client, content, uploaded_file.name)
    except Exception as e:
        # The upload is a network call; report a failure on the page the same
        # way the processing steps do, rather than aborting the script.
        st.error(f"Upload error: {e}")
        return None
    return {"type": "document_url", "document_url": signed_url}


def _source_from_image():
    """Ask for an image, preview it; return an ``image_url`` data-URL source or None."""
    uploaded_image = st.file_uploader("Choisissez une image", type=["png", "jpg", "jpeg"])
    if not uploaded_image:
        return None

    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Whatever the upload format, the image is re-encoded as PNG so that the
    # data URL below can always declare image/png.
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return {"type": "image_url", "image_url": f"data:image/png;base64,{img_str}"}


def _run_json_extraction(client, document_source):
    """Call extract_json_from_doc and show the result in a "Response" expander."""
    with st.spinner("Extracting JSON content..."):
        try:
            ocr_response = extract_json_from_doc(client, document_source)
            with st.expander("Response"):
                st.json(ocr_response)
        except Exception as e:
            # Display an error message if processing fails
            st.error(f"Processing error: {str(e)}")


def _run_ocr(client, document_source):
    """Call process_ocr, then show the page texts, two downloads and the raw response."""
    with st.spinner("Extracting content..."):
        try:
            ocr_response = process_ocr(client, document_source)
            if not (ocr_response and ocr_response.pages):
                st.warning("No content extracted.")
                return

            numbered_pages = list(enumerate(ocr_response.pages, start=1))

            # Markdown version: page headings in bold.
            extracted_content = "\n\n".join(
                f"**Page {number}**\n{page.markdown}"
                for number, page in numbered_pages
            )
            st.subheader("Extracted Content")
            st.markdown(extracted_content)

            # Text version: same content, plain page headings.
            plain_text_content = "\n\n".join(
                f"Page {number}\n{page.markdown}"
                for number, page in numbered_pages
            )

            col1, col2 = st.columns(2)
            with col1:
                st.download_button(
                    label="Download Text",
                    data=plain_text_content,
                    file_name="extracted_content.txt",
                    mime="text/plain",
                )
            with col2:
                st.download_button(
                    label="Download Markdown",
                    data=extracted_content,
                    file_name="extracted_content.md",
                    mime="text/markdown",
                )

            # Raw response, for debugging purposes.
            with st.expander("Réponse API"):
                st.json(ocr_response.model_dump())
        except Exception as e:
            # Display an error message if processing fails
            st.error(f"Processing error: {str(e)}")


def main():
    """
    Main function to run the Streamlit app.

    Flow, in on-page order:
      1. Obtain the API key (environment first, sidebar input otherwise);
         show a warning and stop if none is available.
      2. Let the user pick the input format (URL, PDF or Image) and build the
         matching document source.
      3. Once a document source exists, offer two actions: JSON extraction
         and OCR to text/Markdown. Each button is only rendered when a
         document source is available.
    """
    api_key = _resolve_api_key()
    if not api_key:
        st.warning("Enter API key to continue")
        return

    # Initialize Mistral API client
    client = Mistral(api_key=api_key)

    # Main app interface
    st.header("OCR Facture avec Mistral")

    # Input method selection: URL, PDF Upload, or Image Upload
    input_method = st.radio("Format de la facture:", ["URL", "PDF", "Image"])
    if input_method == "URL":
        document_source = _source_from_url()
    elif input_method == "PDF":
        document_source = _source_from_pdf(client)
    elif input_method == "Image":
        document_source = _source_from_image()
    else:
        document_source = None

    # `and` short-circuits, so a button is not even drawn without a source.
    if document_source and st.button("Générer les données au format JSON"):
        _run_json_extraction(client, document_source)

    if document_source and st.button("Générer un Document"):
        _run_ocr(client, document_source)
# Start the app only when this file is run as the main script, not on import.
if __name__ == "__main__":
    main()