Spaces:

arghya007
/

bengali_OCR_LLM

Build error

App Files Files Community

bengali_OCR_LLM / src /streamlit_app.py

arghya007

Update src/streamlit_app.py

b4b646a verified 11 months ago

raw

history blame contribute delete

9.23 kB

	import streamlit as st
	from PIL import Image, UnidentifiedImageError
	import io
	import zipfile
	from pypdf import PdfReader
	from pdf2image import convert_from_bytes
	import pytesseract
	import os

	# Point Streamlit at an explicit config directory through an environment variable.
	# NOTE(review): `streamlit` is already imported at the top of this file, so this
	# assignment runs after that import, not before it. Confirm the setting is still
	# picked up, or move the assignment above the import.
	os.environ["STREAMLIT_CONFIG_PATH"] = "/app/.streamlit"

	# Hook for the TituLLM integration (not wired in yet).
	def process_text_with_titullm(text):
	    """Return *text* unchanged.

	    Stand-in for a TituLLM post-processing step: the OCR output is passed
	    through here so a real model call can later be added in one place.
	    """
	    processed_text = text
	    return processed_text


	def perform_ocr_and_postprocess(image, lang='ben'):
	    """Run Tesseract OCR on *image* and pass the text through the TituLLM hook.

	    Args:
	        image: Image handed to ``pytesseract.image_to_string`` (callers in
	            this file pass PIL images).
	        lang: Tesseract language code; defaults to ``'ben'``.

	    Returns:
	        The post-processed text, or ``None`` if OCR or post-processing
	        raised. Failures are reported in the Streamlit UI via ``st.error``.
	    """
	    try:
	        raw_text = pytesseract.image_to_string(image, lang=lang)
	        print(f"OCR Output:\n---\n{raw_text}\n---\n")
	        cleaned_text = process_text_with_titullm(raw_text)
	        print(f"LLM Post-processed Output:\n---\n{cleaned_text}\n---\n")
	    except pytesseract.TesseractError as tess_err:
	        # Tesseract-specific failures get an installation hint for the user.
	        hint = f"Tesseract Error: {tess_err}. Ensure Tesseract and Bengali language pack are installed in your environment."
	        st.error(hint)
	        return None
	    except Exception as err:
	        # Anything else is logged to stdout and surfaced in the UI.
	        print(f"An error occurred during OCR/post-processing: {err}")
	        st.error(f"An error occurred during OCR: {err}")
	        return None
	    return cleaned_text


	def process_pdf_to_images(file_bytes):
	    """Render the pages of a PDF, given as raw bytes, with pdf2image.

	    Returns:
	        A list of the images produced by ``convert_from_bytes``. If the
	        conversion raises, the error is shown via ``st.error`` and whatever
	        was collected so far (normally nothing) is returned.
	    """
	    pages = []
	    try:
	        pages.extend(convert_from_bytes(file_bytes))
	    except Exception as err:
	        st.error(f"Error processing PDF with pdf2image: {err}")
	    return pages


	def process_zip_to_images(file_bytes):
	    """Open every image member of a ZIP archive given as raw bytes.

	    Members are selected by file extension (case-insensitive); entries under
	    ``__MACOSX/`` are ignored. A member that cannot be opened is reported
	    with ``st.warning`` and skipped; an unreadable archive is reported with
	    ``st.error``.

	    Returns:
	        A list of the images opened with ``Image.open``, in archive order
	        (possibly empty).
	    """
	    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
	    collected = []
	    try:
	        with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
	            for entry in archive.namelist():
	                # Guard clauses: only image-like names outside __MACOSX/ qualify.
	                if not entry.lower().endswith(image_suffixes):
	                    continue
	                if entry.startswith('__MACOSX/'):
	                    continue
	                try:
	                    payload = archive.read(entry)
	                    collected.append(Image.open(io.BytesIO(payload)))
	                except UnidentifiedImageError:
	                    st.warning(
	                        f"Could not identify or open image file '{entry}' in the ZIP. Skipping."
	                    )
	                except Exception as err:
	                    st.warning(
	                        f"Error processing image '{entry}' in ZIP: {err}. Skipping."
	                    )
	    except zipfile.BadZipFile:
	        st.error("Invalid or corrupted ZIP file.")
	    except Exception as err:
	        st.error(f"Error processing ZIP file: {err}")
	    return collected


	def main():
	    """Render the Streamlit page: upload a file, browse its pages, show OCR text.

	    Flow:
	      1. Configure the page and seed the session-state keys
	         (``processed_images``, ``current_page_index``, ``file_id``).
	      2. Accept an image, PDF or ZIP upload and convert it to a list of
	         images, re-converting only when the upload changes or no images
	         are cached.
	      3. Show the selected page on the left and its OCR text on the right.

	    When no file is present in the uploader the cached pages are cleared and
	    nothing else is rendered. When the upload yields no images a single
	    warning is shown.
	    """
	    # NOTE(review): Streamlit's st.text_area documents a minimum height of
	    # 68 px; smaller values are rejected. Confirm against the installed version.
	    min_text_area_height = 68

	    st.set_page_config(layout="wide")
	    st.title("Bengali OCR with LLM Post-processing (PDF & ZIP Support)")

	    # Initialize session state variables (they persist across reruns).
	    if 'processed_images' not in st.session_state:
	        st.session_state.processed_images = []
	    if 'current_page_index' not in st.session_state:
	        st.session_state.current_page_index = 0
	    if 'file_id' not in st.session_state:  # To track if a new file is uploaded
	        st.session_state.file_id = None

	    uploaded_file = st.file_uploader(
	        "Upload a Bengali image, multi-page PDF, or a ZIP file with images",
	        type=["png", "jpg", "jpeg", "pdf", "zip"],
	        key="file_uploader_key",
	    )

	    if uploaded_file is None:
	        # Nothing uploaded (or the uploader was cleared): drop cached pages so
	        # images from a previous upload are never shown without a file.
	        st.session_state.processed_images = []
	        st.session_state.current_page_index = 0
	        st.session_state.file_id = None
	        return

	    # Identify this upload. Prefer a per-upload identifier when the object
	    # exposes one; otherwise fall back to name + size.
	    # NOTE(review): which of `file_id` / `id` exists depends on the Streamlit
	    # version; both are probed defensively.
	    current_file_id = getattr(uploaded_file, 'file_id', None)
	    if current_file_id is None:
	        current_file_id = getattr(
	            uploaded_file, 'id', uploaded_file.name + str(uploaded_file.size)
	        )

	    new_file_uploaded = st.session_state.file_id != current_file_id
	    if new_file_uploaded:
	        st.session_state.file_id = current_file_id
	        st.session_state.processed_images = []  # Reset images
	        st.session_state.current_page_index = 0  # Reset page index
	        print(f"New file uploaded: {uploaded_file.name}")

	    # Convert only for a new file or when no images are cached yet.
	    if new_file_uploaded or not st.session_state.processed_images:
	        with st.spinner(f"Processing {uploaded_file.name}..."):
	            file_bytes = uploaded_file.getvalue()
	            file_extension = os.path.splitext(uploaded_file.name)[1].lower()

	            if file_extension == ".pdf":
	                st.session_state.processed_images = process_pdf_to_images(file_bytes)
	            elif file_extension == ".zip":
	                st.session_state.processed_images = process_zip_to_images(file_bytes)
	            else:  # Single image file
	                try:
	                    img = Image.open(io.BytesIO(file_bytes))
	                    st.session_state.processed_images = [img]
	                except UnidentifiedImageError:
	                    st.error(
	                        f"Could not identify or open the uploaded image file: {uploaded_file.name}"
	                    )
	                    st.session_state.processed_images = []
	                except Exception as e:
	                    st.error(f"Error processing image file: {e}")
	                    st.session_state.processed_images = []

	    if not st.session_state.processed_images:
	        # Single warning for a failed upload; nothing to display or OCR.
	        st.warning("No images were successfully processed from the uploaded file.")
	        return

	    images = st.session_state.processed_images
	    num_images = len(images)
	    col1, col2 = st.columns(2)

	    with col1:
	        st.subheader("Uploaded Image Viewer")

	        if num_images > 1:
	            # Page labels are 1-based for display; the stored index is 0-based.
	            page_options = [f"Page {i + 1}" for i in range(num_images)]
	            selected_page_str = st.selectbox(
	                "Select Page:",
	                options=page_options,
	                index=st.session_state.current_page_index,
	                key="page_selector",
	            )
	            # Map the chosen label back to its 0-based index.
	            st.session_state.current_page_index = page_options.index(selected_page_str)

	        # Display the current image
	        current_image_to_display = images[st.session_state.current_page_index]
	        st.image(
	            current_image_to_display,
	            use_container_width=True,
	            caption=f"Displaying image {st.session_state.current_page_index + 1} of {num_images}",
	        )

	    with col2:
	        st.subheader("Extracted Text")
	        # Perform OCR on the currently displayed image
	        if current_image_to_display:
	            extracted_text = perform_ocr_and_postprocess(
	                current_image_to_display, lang='ben')
	            if extracted_text is not None:
	                # Size the text area from the image's original pixel height,
	                # but never below the minimum the widget accepts.
	                image_height_px = current_image_to_display.size[1]
	                st.text_area(
	                    "OCR Result",
	                    value=extracted_text,
	                    height=max(image_height_px, min_text_area_height),
	                    key="extracted_text_area",
	                )
	            else:
	                st.info(
	                    "OCR could not be performed or returned no text for the current page.")
	        else:
	            st.info("No image selected or available for OCR.")



	if __name__ == "__main__":
	    # --- Tesseract availability check ---
	    # Tesseract is expected to be on the system PATH (installed by the Space's
	    # Dockerfile). The result is only recorded here: the on-page error is
	    # emitted after main(), because main() calls st.set_page_config(), which
	    # Streamlit documents as having to be the first Streamlit command.
	    tesseract_missing = False
	    try:
	        pytesseract.get_tesseract_version()
	        print("Tesseract found by Pytesseract.")
	    except pytesseract.TesseractNotFoundError:
	        tesseract_missing = True
	        print("Pytesseract could not find Tesseract.")

	    # --- Streamlit App Execution ---
	    main()

	    if tesseract_missing:
	        st.error(
	            "Pytesseract could not find Tesseract. Ensure it's installed correctly in the Space (check your Dockerfile)."
	        )