# Streamlit app: upload PDFs, run them through Mistral OCR, and offer
#   1. a ZIP with one full-text .txt file per PDF, and
#   2. a CSV with one row per sentence plus its source file name.
#
# NOTE: plain comments (not a module docstring) are used here on purpose so
# that nothing but explicit `st.*` calls can ever be rendered on the page.

import io
import os
import zipfile

import pandas as pd
import streamlit as st
from mistralai import Mistral

# Model name passed to the Mistral OCR endpoint.
OCR_MODEL = "mistral-ocr-latest"

# Session-state key under which the last batch result is kept. Streamlit
# reruns the whole script on every widget interaction (including a click on
# a download button), so anything that must survive a rerun lives here.
RESULTS_KEY = "pdf_loader_results"

# --- Page Config ---
st.set_page_config(page_title="PDF Loader", layout="wide")

st.title("📄 PDF Loader")
st.markdown("""
This tool allows you to upload multiple PDF files, process them using **Mistral's OCR**.

**Output:**
1. A ZIP file containing full text files for each PDF.
2. A CSV with every sentence having its metadata.
""")

# --- Sidebar: Configuration ---
with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input("Enter Mistral API Key", type="password")
    st.caption("Your key is not stored and is only used for this session.")
    st.divider()
    st.info("Ensure you have access to `mistral-ocr-latest`.")


# --- Helper Function: Process Single PDF ---
def process_pdf(client, uploaded_file):
    """Run Mistral OCR on one uploaded PDF and return its text.

    The file is uploaded to Mistral, a signed URL is requested for it, the
    OCR endpoint is called with that URL, and the markdown of every page in
    the response is concatenated (each page followed by a single space so
    sentences on adjacent pages do not merge).

    Args:
        client: A ``Mistral`` client instance.
        uploaded_file: A Streamlit uploaded file (must provide ``.name`` and
            ``.getvalue()``).

    Returns:
        The concatenated page text (possibly empty when the response has no
        pages), or ``None`` when any step failed. Failures are reported to
        the user through ``st.error`` rather than raised.
    """
    remote_file = None
    try:
        # 1. Upload the file to Mistral.
        #    .getvalue() hands over raw bytes (satisfies Pydantic strict typing).
        remote_file = client.files.upload(
            file={
                "file_name": uploaded_file.name,
                "content": uploaded_file.getvalue(),
            },
            purpose="ocr",
        )

        # 2. Get a signed URL (temporary).
        signed_url = client.files.get_signed_url(file_id=remote_file.id)

        # 3. Process with Mistral OCR.
        ocr_response = client.ocr.process(
            model=OCR_MODEL,
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )

        # 4. Extract text from the response. A missing `pages` attribute is
        #    treated like an empty document; a page without markdown
        #    contributes only its separator.
        pages = getattr(ocr_response, "pages", None) or []
        return "".join(f"{page.markdown or ''} " for page in pages)

    except Exception as e:  # UI boundary: report the failure, keep the batch going.
        st.error(f"Error processing {uploaded_file.name}: {e}")
        return None

    finally:
        # Do not leave the document behind in the user's Mistral account.
        if remote_file is not None:
            _delete_remote_file(client, remote_file.id, uploaded_file.name)


def _delete_remote_file(client, file_id, display_name):
    """Best-effort removal of an uploaded file; a failure only warns."""
    try:
        client.files.delete(file_id=file_id)
    except Exception as e:
        st.warning(f"Could not delete the uploaded copy of {display_name}: {e}")


def _split_sentences(text):
    """Split text on full stops and return the non-empty, stripped pieces.

    NOTE: this is a plain ``str.split('.')``, so decimals and abbreviations
    are split as well.
    """
    stripped = (piece.strip() for piece in text.split("."))
    return [piece for piece in stripped if piece]


def _unique_txt_name(pdf_name, used_names):
    """Return a ``.txt`` archive name for *pdf_name* not yet in *used_names*.

    Two uploads sharing a stem would otherwise produce duplicate ZIP
    entries. The chosen name is added to *used_names*.
    """
    stem = os.path.splitext(pdf_name)[0]
    candidate = f"{stem}.txt"
    counter = 1
    while candidate in used_names:
        counter += 1
        candidate = f"{stem}_{counter}.txt"
    used_names.add(candidate)
    return candidate


def _run_batch(client, files):
    """OCR every file, updating a progress bar, and collect the outputs.

    Returns:
        A dict with keys ``rows`` (list of ``{"text", "source_file"}`` dicts,
        one per sentence), ``zip_bytes`` (ZIP archive of the full texts),
        ``succeeded`` (number of files that yielded text) and ``total``
        (number of files submitted).
    """
    rows = []  # For CSV
    zip_buffer = io.BytesIO()  # For ZIP of TXT files
    used_names = set()
    succeeded = 0
    total = len(files)

    progress_bar = st.progress(0)
    status_text = st.empty()

    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
        for idx, file in enumerate(files):
            status_text.text(f"Processing file {idx + 1} of {total}: {file.name}...")

            extracted_text = process_pdf(client, file)

            if extracted_text is None:
                # process_pdf already reported the error to the user.
                pass
            elif not extracted_text.strip():
                st.warning(f"No text was extracted from {file.name}.")
            else:
                succeeded += 1

                # CSV: one row per sentence.
                rows.extend(
                    {"text": sentence, "source_file": file.name}
                    for sentence in _split_sentences(extracted_text)
                )

                # ZIP: keep the full text, one entry per PDF.
                zf.writestr(_unique_txt_name(file.name, used_names), extracted_text)

            progress_bar.progress((idx + 1) / total)

    status_text.text("Processing Complete!")

    # getvalue() is read only after the ZipFile is closed so the archive's
    # central directory has been written.
    return {
        "rows": rows,
        "zip_bytes": zip_buffer.getvalue(),
        "succeeded": succeeded,
        "total": total,
    }


def _render_results(results):
    """Show the summary, both download buttons and a preview for a batch."""
    rows = results["rows"]
    if not rows:
        st.warning("No text could be extracted from the uploaded files.")
        return

    succeeded = results["succeeded"]
    total = results["total"]
    if succeeded == total:
        st.success(f"Successfully processed {total} files.")
    else:
        st.success(f"Successfully processed {succeeded} of {total} files.")

    df = pd.DataFrame(rows)

    # Layout for download buttons
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("1. Download CSV")
        csv = df.to_csv(index=False).encode("utf-8")
        st.download_button(
            label="Download CSV (Sentences)",
            data=csv,
            file_name="pdf_sentences.csv",
            mime="text/csv",
        )

    with col2:
        st.subheader("2. Download Full Text Files")
        st.download_button(
            label="Download All .txt Files (ZIP)",
            data=results["zip_bytes"],
            file_name="individual_pdf_files.zip",
            mime="application/zip",
        )

    # Preview Data
    with st.expander("Preview Extracted Sentences"):
        st.dataframe(df)


# --- Main Interface ---
uploaded_files = st.file_uploader(
    "Upload PDF files",
    type=["pdf"],
    accept_multiple_files=True,
)

if st.button("Start PDF Processing", type="primary"):
    if not api_key:
        st.error("Please provide your Mistral API Key in the sidebar.")
    elif not uploaded_files:
        st.warning("Please upload at least one PDF file.")
    else:
        # Store the outcome in session state: the button is True only on the
        # run triggered by its click, and clicking a download button starts
        # a new run in which the results must still be available.
        st.session_state[RESULTS_KEY] = _run_batch(
            Mistral(api_key=api_key), uploaded_files
        )

# --- Display Results & Downloads ---
# Rendered outside the button branch so the downloads persist across reruns.
batch_results = st.session_state.get(RESULTS_KEY)
if batch_results is not None:
    _render_results(batch_results)