Spaces:

omkar-surve126
/

Document_OCR

Sleeping

App Files Files Community

Document_OCR / src /streamlit_app.py

omkar-surve126

Update src/streamlit_app.py

a7f6a81 verified about 2 months ago

raw

history blame contribute delete

5.52 kB

	import streamlit as st
	import pandas as pd
	from mistralai import Mistral
	import os
	import io
	import zipfile

	# --- Page setup ---
	# Wide layout so the sentence preview table has room to breathe.
	st.set_page_config(layout="wide", page_title="PDF Loader")
	st.title("📄 PDF Loader")

	# Landing blurb describing the two artefacts the app produces.
	intro_markdown = """
	This tool allows you to upload multiple PDF files, process them using Mistral's OCR.
	Output:
	1. A ZIP file containing full text files for each PDF.
	2. A CSV with every sentence having its metadata.
	"""
	st.markdown(intro_markdown)

	# --- Sidebar: configuration widgets ---
	# Object notation (st.sidebar.<element>) places each widget in the sidebar.
	st.sidebar.header("Configuration")
	# Masked input; the value is read later when the Mistral client is created.
	api_key = st.sidebar.text_input("Enter Mistral API Key", type="password")
	st.sidebar.caption("Your key is not stored and is only used for this session.")

	st.sidebar.divider()
	st.sidebar.info("Ensure you have access to `mistral-ocr-latest`.")

	# --- Helper Function: Process Single PDF ---
	def process_pdf(client, uploaded_file):
	    """Run Mistral OCR on one uploaded PDF and return its text.

	    The file is uploaded with purpose ``"ocr"``, a signed URL is requested
	    for it, and that URL is passed to the ``mistral-ocr-latest`` model. The
	    markdown of every returned page is concatenated, each page followed by
	    a single space so sentences on adjacent pages do not merge.

	    The remote copy of the upload is deleted afterwards (best effort) so
	    user documents do not accumulate in the Mistral file store.

	    Args:
	        client: Mistral client exposing ``files.upload``,
	            ``files.get_signed_url``, ``files.delete`` and ``ocr.process``.
	        uploaded_file: Object with a ``name`` attribute and a ``getvalue()``
	            method returning the PDF bytes (e.g. a Streamlit upload).

	    Returns:
	        The concatenated page text (possibly ``""`` when OCR yields no
	        pages), or ``None`` if any step failed; failures are reported to
	        the user through ``st.error`` rather than raised.
	    """
	    remote_file_id = None
	    try:
	        # 1. Upload the file to Mistral.
	        # .getvalue() hands over raw bytes to satisfy Pydantic strict typing.
	        uploaded_mistral_file = client.files.upload(
	            file={
	                "file_name": uploaded_file.name,
	                "content": uploaded_file.getvalue(),
	            },
	            purpose="ocr",
	        )
	        remote_file_id = uploaded_mistral_file.id

	        # 2. Get a signed URL the OCR endpoint can read the document from.
	        signed_url = client.files.get_signed_url(file_id=remote_file_id)

	        # 3. Process with Mistral OCR (images are not needed for text output).
	        ocr_response = client.ocr.process(
	            model="mistral-ocr-latest",
	            document={
	                "type": "document_url",
	                "document_url": signed_url.url,
	            },
	            include_image_base64=False,
	        )

	        # 4. Extract text. A missing/None `pages` or a page without markdown
	        # is treated as empty instead of failing the whole document.
	        pages = getattr(ocr_response, "pages", None) or []
	        return "".join(f"{page.markdown or ''} " for page in pages)

	    except Exception as e:
	        # UI boundary: surface the problem and let the caller skip this file.
	        st.error(f"Error processing {uploaded_file.name}: {e}")
	        return None

	    finally:
	        # Best-effort cleanup of the remote upload; a failure here must not
	        # discard text that was already extracted successfully.
	        if remote_file_id is not None:
	            try:
	                client.files.delete(file_id=remote_file_id)
	            except Exception as cleanup_error:
	                st.warning(
	                    f"Could not delete the uploaded copy of "
	                    f"{uploaded_file.name} from Mistral: {cleanup_error}"
	                )

	# --- Main Interface ---

	uploaded_files = st.file_uploader(
	    "Upload PDF files",
	    type=["pdf"],
	    accept_multiple_files=True,
	)

	# Streamlit reruns the whole script on every widget interaction, including a
	# click on a download button, and st.button() is only True for the run
	# triggered by its own click. Results are therefore stored in session state
	# and rendered outside the button branch; otherwise downloading one file
	# would make the other download disappear and force a second OCR run.
	_RESULTS_KEY = "ocr_results"

	if st.button("Start PDF Processing", type="primary"):
	    if not api_key:
	        st.error("Please provide your Mistral API Key in the sidebar.")
	    elif not uploaded_files:
	        st.warning("Please upload at least one PDF file.")
	    else:
	        # Initialize Mistral Client
	        client = Mistral(api_key=api_key)

	        processed_data = []  # One row per sentence, for the CSV
	        zip_buffer = io.BytesIO()  # In-memory ZIP of the full-text files
	        used_txt_names = set()  # Archive names already written
	        succeeded = 0  # Files that actually yielded text
	        total = len(uploaded_files)

	        progress_bar = st.progress(0)
	        status_text = st.empty()

	        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
	            for idx, file in enumerate(uploaded_files, start=1):
	                status_text.text(f"Processing file {idx} of {total}: {file.name}...")

	                # process_pdf reports its own errors and returns a falsy
	                # value when there is nothing to store for this file.
	                extracted_text = process_pdf(client, file)

	                if extracted_text:
	                    succeeded += 1

	                    # CSV: one row per non-empty fragment between full stops.
	                    # NOTE: this is a plain split on '.', so decimals and
	                    # abbreviations are split too.
	                    fragments = (part.strip() for part in extracted_text.split('.'))
	                    processed_data.extend(
	                        {"text": fragment, "source_file": file.name}
	                        for fragment in fragments
	                        if fragment
	                    )

	                    # ZIP: keep the full text, one .txt per PDF. Uploads that
	                    # share a stem get a numeric suffix so no archive entry
	                    # shadows another on extraction.
	                    stem = os.path.splitext(file.name)[0]
	                    txt_filename = f"{stem}.txt"
	                    duplicate_no = 1
	                    while txt_filename in used_txt_names:
	                        duplicate_no += 1
	                        txt_filename = f"{stem}_{duplicate_no}.txt"
	                    used_txt_names.add(txt_filename)
	                    zf.writestr(txt_filename, extracted_text)

	                # Update progress
	                progress_bar.progress(idx / total)

	        status_text.text("Processing Complete!")

	        # Read the buffer only after the ZipFile context has closed, so the
	        # archive's central directory has been written.
	        st.session_state[_RESULTS_KEY] = {
	            "rows": processed_data,
	            "zip_bytes": zip_buffer.getvalue(),
	            "succeeded": succeeded,
	            "total": total,
	        }

	# --- Display Results & Downloads ---
	# Rendered on every run from the stored results of the last processing pass.
	results = st.session_state.get(_RESULTS_KEY)
	if results and results["rows"]:
	    st.success(
	        f"Successfully processed {results['succeeded']} of {results['total']} files."
	    )

	    # Create DataFrame
	    df = pd.DataFrame(results["rows"])

	    # Layout for download buttons
	    col1, col2 = st.columns(2)

	    with col1:
	        st.subheader("1. Download CSV")
	        csv = df.to_csv(index=False).encode('utf-8')
	        st.download_button(
	            label="Download CSV (Sentences)",
	            data=csv,
	            file_name="pdf_sentences.csv",
	            mime="text/csv",
	        )

	    with col2:
	        st.subheader("2. Download Full Text Files")
	        st.download_button(
	            label="Download All .txt Files (ZIP)",
	            data=results["zip_bytes"],
	            file_name="individual_pdf_files.zip",
	            mime="application/zip",
	        )

	    # Preview Data
	    with st.expander("Preview Extracted Sentences"):
	        st.dataframe(df)