Spaces:

omkar-surve126
/

Document_OCR

Sleeping

App Files Files Community

omkar-surve126 committed on Dec 29, 2025

Commit

109676f

verified ·

1 Parent(s): 568477c

Upload app.py

Browse files

Files changed (1) hide show

app.py +157 -0

app.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import streamlit as st
+import pandas as pd
+from mistralai import Mistral
+import os
+import io
+import zipfile
+# --- Page Config ---
+st.set_page_config(page_title="Mistral OCR Processor", layout="wide")
+st.title("📄 OCR Extraction")
+st.markdown("""
+This tool allows you to upload multiple PDF files, process them using **Mistral's OCR**.
+**Output:**
+1. A ZIP file containing full text files for each PDF.
+2. A CSV with every sentence having its metadata.
+""")
+# --- Sidebar: Configuration ---
+with st.sidebar:
+    st.header("Configuration")
+    api_key = st.text_input("Enter Mistral API Key", type="password")
+    st.caption("Your key is not stored and is only used for this session.")
+    st.divider()
+    st.info("Ensure you have access to `mistral-ocr-latest`.")
+# --- Helper Function: Process Single PDF ---
+def process_pdf(client, uploaded_file):
+    """
+    Uploads file to Mistral, gets a signed URL, runs OCR, and extracts text.
+    """
+    try:
+        # 1. Upload the file to Mistral
+        # using .getvalue() to satisfy Pydantic strict typing
+        uploaded_mistral_file = client.files.upload(
+            file={
+                "file_name": uploaded_file.name,
+                "content": uploaded_file.getvalue(),
+            },
+            purpose="ocr"
+        )
+        # 2. Get a signed URL (temporary)
+        signed_url = client.files.get_signed_url(file_id=uploaded_mistral_file.id)
+        # 3. Process with Mistral OCR
+        ocr_response = client.ocr.process(
+            model="mistral-ocr-latest",
+            document={
+                "type": "document_url",
+                "document_url": signed_url.url
+            },
+            include_image_base64=False
+        )
+        # 4. Extract text from the response
+        full_text = ""
+        if hasattr(ocr_response, 'pages'):
+            for page in ocr_response.pages:
+                full_text += page.markdown + " " # Adding space to ensure sentences don't merge
+        return full_text
+    except Exception as e:
+        st.error(f"Error processing {uploaded_file.name}: {e}")
+        return None
+# --- Main Interface ---
+uploaded_files = st.file_uploader(
+    "Upload PDF files",
+    type=["pdf"],
+    accept_multiple_files=True
+)
+if st.button("Start OCR Processing", type="primary"):
+    if not api_key:
+        st.error("Please provide your Mistral API Key in the sidebar.")
+    elif not uploaded_files:
+        st.warning("Please upload at least one PDF file.")
+    else:
+        # Initialize Mistral Client
+        client = Mistral(api_key=api_key)
+        # Containers for results
+        processed_data = [] # For CSV
+        zip_buffer = io.BytesIO() # For ZIP of TXT files
+        # Progress Bar
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
+            for idx, file in enumerate(uploaded_files):
+                status_text.text(f"Processing file {idx + 1} of {len(uploaded_files)}: {file.name}...")
+                # Perform OCR
+                extracted_text = process_pdf(client, file)
+                if extracted_text:
+                    # --- LOGIC CHANGE FOR CSV ---
+                    # Split text by full stop to create rows per sentence
+                    sentences = extracted_text.split('.')
+                    for sentence in sentences:
+                        # Clean whitespace
+                        clean_sentence = sentence.strip()
+                        # Only add if the sentence is not empty
+                        if clean_sentence:
+                            processed_data.append({
+                                "text": clean_sentence,
+                                "source_file": file.name
+                            })
+                    # --- LOGIC FOR ZIP (Keep Full Text) ---
+                    txt_filename = f"{os.path.splitext(file.name)[0]}.txt"
+                    zf.writestr(txt_filename, extracted_text)
+                # Update progress
+                progress_bar.progress((idx + 1) / len(uploaded_files))
+        status_text.text("Processing Complete!")
+        # --- Display Results & Downloads ---
+        if processed_data:
+            st.success(f"Successfully processed {len(uploaded_files)} files.")
+            # Create DataFrame
+            df = pd.DataFrame(processed_data)
+            # Layout for download buttons
+            col1, col2 = st.columns(2)
+            with col1:
+                st.subheader("1. Download Sentence-wise CSV")
+                csv = df.to_csv(index=False).encode('utf-8')
+                st.download_button(
+                    label="Download CSV (Sentences)",
+                    data=csv,
+                    file_name="ocr_sentences.csv",
+                    mime="text/csv",
+                )
+            with col2:
+                st.subheader("2. Download Full Text Files")
+                st.download_button(
+                    label="Download All .txt Files (ZIP)",
+                    data=zip_buffer.getvalue(),
+                    file_name="individual_ocr_files.zip",
+                    mime="application/zip"
+                )
+            # Preview Data
+            with st.expander("Preview Extracted Sentences"):
+                st.dataframe(df)