# Document_OCR / src /streamlit_app.py
# omkar-surve126's picture
# Update src/streamlit_app.py
# a7f6a81 verified
import streamlit as st
import pandas as pd
from mistralai import Mistral
import os
import io
import zipfile
# --- Page Config ---
# Set the browser-tab title and use the wide page layout.
st.set_page_config(
    page_title="PDF Loader",
    layout="wide",
)

# Page heading followed by a short description of the two outputs.
st.title("📄 PDF Loader")
st.markdown(
    """
This tool allows you to upload multiple PDF files, process them using **Mistral's OCR**.
**Output:**
1. A ZIP file containing full text files for each PDF.
2. A CSV with every sentence having its metadata.
"""
)
# --- Sidebar: Configuration ---
# Every widget below is placed in the sidebar through `st.sidebar.<element>`.
st.sidebar.header("Configuration")

# The key is masked while typing; `api_key` is read later when the Mistral
# client is created.
api_key = st.sidebar.text_input("Enter Mistral API Key", type="password")
st.sidebar.caption("Your key is not stored and is only used for this session.")

st.sidebar.divider()
st.sidebar.info("Ensure you have access to `mistral-ocr-latest`.")
# --- Helper Function: Process Single PDF ---
def process_pdf(client, uploaded_file):
    """Run Mistral OCR on one uploaded PDF and return its text.

    The file is uploaded to Mistral, a signed URL is requested for it, the
    `mistral-ocr-latest` model is run against that URL, and the markdown of
    every page is concatenated. The uploaded copy is removed from Mistral
    file storage afterwards (best effort), whether or not OCR succeeded.

    Args:
        client: Mistral client exposing `files.upload`,
            `files.get_signed_url`, `files.delete` and `ocr.process`.
        uploaded_file: Uploaded file object providing `.name` and
            `.getvalue()`.

    Returns:
        The page markdown of the document, each page followed by a single
        space; an empty string when the response has no pages; `None` when
        any step failed (the error is shown with `st.error`).
    """
    file_id = None
    try:
        # 1. Upload the file to Mistral.
        #    .getvalue() passes raw bytes to satisfy Pydantic strict typing.
        uploaded_mistral_file = client.files.upload(
            file={
                "file_name": uploaded_file.name,
                "content": uploaded_file.getvalue(),
            },
            purpose="ocr",
        )
        file_id = uploaded_mistral_file.id

        # 2. Get a signed URL (temporary) for the uploaded file.
        signed_url = client.files.get_signed_url(file_id=file_id)

        # 3. Process with Mistral OCR.
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )

        # 4. Extract text from the response. Pages without markdown are
        #    skipped so one empty page cannot fail the whole document. The
        #    space after each page keeps sentences on adjacent pages from
        #    merging.
        pages = getattr(ocr_response, "pages", None) or []
        page_texts = (getattr(page, "markdown", None) for page in pages)
        return "".join(f"{text} " for text in page_texts if text)
    except Exception as e:
        st.error(f"Error processing {uploaded_file.name}: {e}")
        return None
    finally:
        # 5. Remove the uploaded copy so user documents do not pile up in
        #    Mistral file storage. A cleanup failure must not discard text
        #    that was already extracted, so it is only reported.
        if file_id is not None:
            try:
                client.files.delete(file_id=file_id)
            except Exception as cleanup_error:
                st.warning(
                    f"Could not delete the uploaded copy of "
                    f"{uploaded_file.name} from Mistral: {cleanup_error}"
                )
# --- Main Interface ---
uploaded_files = st.file_uploader(
    "Upload PDF files",
    type=["pdf"],
    accept_multiple_files=True,
)

if st.button("Start PDF Processing", type="primary"):
    # Forget the previous run so stale downloads are never shown next to a
    # validation error or a run that produced nothing.
    st.session_state.pop("ocr_results", None)

    if not api_key:
        st.error("Please provide your Mistral API Key in the sidebar.")
    elif not uploaded_files:
        st.warning("Please upload at least one PDF file.")
    else:
        # Initialize Mistral Client
        client = Mistral(api_key=api_key)

        # Containers for results
        processed_data = []  # One dict per sentence; becomes the CSV
        zip_buffer = io.BytesIO()  # In-memory ZIP of the .txt files
        used_txt_names = set()  # Case-folded entry names already in the ZIP
        succeeded = 0  # Files that actually yielded text
        total = len(uploaded_files)

        # Progress Bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
            for position, file in enumerate(uploaded_files, start=1):
                status_text.text(
                    f"Processing file {position} of {total}: {file.name}..."
                )

                # Perform OCR; a falsy result means failure or no text.
                extracted_text = process_pdf(client, file)

                if extracted_text:
                    succeeded += 1

                    # --- CSV: one row per sentence ---
                    # Sentences are delimited by a plain full stop; empty
                    # fragments are dropped.
                    for sentence in extracted_text.split("."):
                        clean_sentence = sentence.strip()
                        if clean_sentence:
                            processed_data.append(
                                {
                                    "text": clean_sentence,
                                    "source_file": file.name,
                                }
                            )

                    # --- ZIP: keep the full text ---
                    # Two uploads can share a base name; add a numeric
                    # suffix so neither entry replaces the other when the
                    # archive is extracted. Names are compared case-folded
                    # for case-insensitive file systems.
                    stem = os.path.splitext(file.name)[0]
                    txt_filename = f"{stem}.txt"
                    duplicate = 1
                    while txt_filename.casefold() in used_txt_names:
                        duplicate += 1
                        txt_filename = f"{stem}_{duplicate}.txt"
                    used_txt_names.add(txt_filename.casefold())
                    zf.writestr(txt_filename, extracted_text)

                # Update progress
                progress_bar.progress(position / total)

        status_text.text("Processing Complete!")

        if processed_data:
            # Keep the results in session state: clicking a download button
            # reruns the script, and on that rerun st.button() is False, so
            # anything rendered only inside this branch would disappear.
            # The ZIP bytes are read after the archive has been closed.
            st.session_state["ocr_results"] = {
                "rows": processed_data,
                "zip_bytes": zip_buffer.getvalue(),
                "succeeded": succeeded,
                "total": total,
            }
        else:
            st.warning("No text could be extracted from the uploaded files.")

# --- Display Results & Downloads ---
# Rendered from session state so both downloads survive script reruns.
results = st.session_state.get("ocr_results")
if results:
    # Report only the files that produced text, not every uploaded file.
    st.success(
        f"Successfully processed {results['succeeded']} "
        f"of {results['total']} files."
    )

    # Create DataFrame
    df = pd.DataFrame(results["rows"])

    # Layout for download buttons
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("1. Download CSV")
        csv = df.to_csv(index=False).encode("utf-8")
        st.download_button(
            label="Download CSV (Sentences)",
            data=csv,
            file_name="pdf_sentences.csv",
            mime="text/csv",
        )

    with col2:
        st.subheader("2. Download Full Text Files")
        st.download_button(
            label="Download All .txt Files (ZIP)",
            data=results["zip_bytes"],
            file_name="individual_pdf_files.zip",
            mime="application/zip",
        )

    # Preview Data
    with st.expander("Preview Extracted Sentences"):
        st.dataframe(df)