Spaces:
Sleeping
Sleeping
import io
import os
import re
import zipfile

import pandas as pd
import streamlit as st
from mistralai import Mistral
# --- Page Config ---
# Page-level settings followed by the title and the introductory blurb.
st.set_page_config(layout="wide", page_title="PDF Loader")
st.title("📄 PDF Loader")
st.markdown(
    """
This tool allows you to upload multiple PDF files, process them using **Mistral's OCR**.
**Output:**
1. A ZIP file containing full text files for each PDF.
2. A CSV with every sentence having its metadata.
"""
)

# --- Sidebar: Configuration ---
# Each element is attached to the sidebar through the `st.sidebar` object.
st.sidebar.header("Configuration")
# The key typed here is read later by the main interface as `api_key`.
api_key = st.sidebar.text_input("Enter Mistral API Key", type="password")
st.sidebar.caption("Your key is not stored and is only used for this session.")
st.sidebar.divider()
st.sidebar.info("Ensure you have access to `mistral-ocr-latest`.")
# --- Helper Function: Process Single PDF ---
def process_pdf(client, uploaded_file):
    """Run Mistral OCR on one uploaded PDF and return its text.

    The file is uploaded to Mistral, a signed URL is requested for it, the
    ``mistral-ocr-latest`` model is run against that URL, and the markdown of
    every returned page is concatenated. The temporary upload is deleted
    afterwards so processed documents do not pile up in the account's file
    storage.

    Args:
        client: Object exposing ``files.upload``, ``files.get_signed_url``,
            ``files.delete`` and ``ocr.process`` (the ``Mistral`` client
            created by the main interface).
        uploaded_file: Object with a ``name`` attribute and a ``getvalue()``
            method returning the PDF bytes (an item returned by
            ``st.file_uploader``).

    Returns:
        The markdown of all pages joined together, each page followed by a
        single space so sentences on adjacent pages do not merge; ``""`` when
        the response carries no pages; ``None`` when any step fails (the
        error is reported with ``st.error``).
    """
    remote_file_id = None
    try:
        # 1. Upload the file to Mistral.
        remote_file = client.files.upload(
            file={
                "file_name": uploaded_file.name,
                # .getvalue() is used to satisfy Pydantic strict typing.
                "content": uploaded_file.getvalue(),
            },
            purpose="ocr",
        )
        remote_file_id = remote_file.id

        # 2. Get a signed (temporary) URL for the uploaded file.
        signed_url = client.files.get_signed_url(file_id=remote_file_id)

        # 3. Process with Mistral OCR.
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )

        # 4. Extract text from the response. A page without markdown
        # contributes an empty string instead of aborting the whole document.
        pages = getattr(ocr_response, "pages", None) or []
        return "".join(f"{page.markdown or ''} " for page in pages)
    except Exception as e:
        # UI boundary: one failing PDF must not stop the rest of the batch,
        # so the error is shown to the user and signalled with None.
        st.error(f"Error processing {uploaded_file.name}: {e}")
        return None
    finally:
        # Best-effort cleanup of the temporary upload; a cleanup failure must
        # not discard an OCR result that was already obtained.
        if remote_file_id is not None:
            try:
                client.files.delete(file_id=remote_file_id)
            except Exception as cleanup_error:
                st.warning(
                    f"Could not delete the temporary upload for "
                    f"{uploaded_file.name}: {cleanup_error}"
                )
# --- Main Interface ---
# Sentence boundary: one or more full stops followed by whitespace or the end
# of the text, so decimals ("3.14"), URLs and file names are not cut apart.
_SENTENCE_END = re.compile(r"\.+(?:\s+|$)")

# Results are kept in session state because every widget interaction
# (including a click on a download button) reruns the script, and st.button
# is only True on the run triggered by its own click. Without this the
# results would vanish after the first download.
_RESULTS_KEY = "pdf_loader_results"

uploaded_files = st.file_uploader(
    "Upload PDF files",
    type=["pdf"],
    accept_multiple_files=True,
)

if st.button("Start PDF Processing", type="primary"):
    if not api_key:
        st.error("Please provide your Mistral API Key in the sidebar.")
    elif not uploaded_files:
        st.warning("Please upload at least one PDF file.")
    else:
        # Initialize Mistral client.
        client = Mistral(api_key=api_key)

        # Containers for results.
        processed_data = []  # One dict per sentence, for the CSV.
        zip_buffer = io.BytesIO()  # In-memory ZIP of the full-text files.
        total = len(uploaded_files)
        succeeded = 0  # Files for which OCR returned non-empty text.

        # Progress feedback.
        progress_bar = st.progress(0)
        status_text = st.empty()

        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as archive:
            for position, pdf in enumerate(uploaded_files, start=1):
                status_text.text(
                    f"Processing file {position} of {total}: {pdf.name}..."
                )

                # process_pdf reports its own errors and returns None on failure.
                extracted_text = process_pdf(client, pdf)

                if extracted_text:
                    succeeded += 1

                    # CSV: one row per non-empty sentence.
                    for chunk in _SENTENCE_END.split(extracted_text):
                        sentence = chunk.strip()
                        if sentence:
                            processed_data.append(
                                {"text": sentence, "source_file": pdf.name}
                            )

                    # ZIP: keep the full, unsplit text of each PDF.
                    txt_filename = f"{os.path.splitext(pdf.name)[0]}.txt"
                    archive.writestr(txt_filename, extracted_text)

                progress_bar.progress(position / total)

        status_text.text("Processing Complete!")

        # The ZIP bytes are read only after the archive is closed, so the
        # stored bytes form a complete archive.
        st.session_state[_RESULTS_KEY] = {
            "rows": processed_data,
            "zip_bytes": zip_buffer.getvalue(),
            "succeeded": succeeded,
            "total": total,
        }

# --- Display Results & Downloads ---
# Rendered on every run from the stored results, so both downloads stay
# available without repeating the OCR calls.
results = st.session_state.get(_RESULTS_KEY)
if results and results["rows"]:
    st.success(
        f"Successfully processed {results['succeeded']} of {results['total']} files."
    )

    df = pd.DataFrame(results["rows"])

    # Layout for download buttons.
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("1. Download CSV")
        csv = df.to_csv(index=False).encode("utf-8")
        st.download_button(
            label="Download CSV (Sentences)",
            data=csv,
            file_name="pdf_sentences.csv",
            mime="text/csv",
        )
    with col2:
        st.subheader("2. Download Full Text Files")
        st.download_button(
            label="Download All .txt Files (ZIP)",
            data=results["zip_bytes"],
            file_name="individual_pdf_files.zip",
            mime="application/zip",
        )

    # Preview data.
    with st.expander("Preview Extracted Sentences"):
        st.dataframe(df)