kab-ocr-tanti / app.py
AitBAD's picture
Update app.py
49a6789 verified
# app.py script that converts pdf or png txt image to UTF8 text
# Kabyle OCR tool
import streamlit as st
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image, ImageEnhance, ImageFilter
import os
# --- Tesseract configuration ---
# Command-line options handed to every pytesseract call: only the language
# is selected ("kab"); the matching .traineddata file is located through
# TESSDATA_PREFIX, set just below.
custom_config = "-l kab"
# Docker deployment: point Tesseract at the tessdata folder shipped inside
# the container. This deliberately overrides any value already present in
# the environment.
os.environ["TESSDATA_PREFIX"] = "/app/tessdata/"
# --- Function Definition ---
def enhance_image(image, for_display=False):
    """Return a grayscale version of *image* with extra contrast and sharpness.

    The image is converted to mode 'L' when it is not already in that mode,
    then contrast is scaled by 1.5 and sharpness by 1.2 (factors above 1.0
    strengthen the effect).

    Args:
        image: PIL image to enhance.
        for_display: Marks the call as being for the on-screen preview rather
            than for OCR. It is accepted for call-site clarity only: the same
            factors are applied in both cases.

    Returns:
        The enhanced PIL image.
    """
    grayscale = image if image.mode == 'L' else image.convert('L')
    # Contrast first, then sharpen the contrast-adjusted result.
    contrasted = ImageEnhance.Contrast(grayscale).enhance(1.5)
    return ImageEnhance.Sharpness(contrasted).enhance(1.2)
def process_image(image):
    """Run OCR on a single image and return the recognised text.

    The image is passed through ``enhance_image`` before being handed to
    pytesseract with the module-level ``custom_config`` options.

    Args:
        image: PIL image of one page or picture.

    Returns:
        The recognised text. If anything raises during enhancement or OCR,
        the exception is not propagated; the string
        ``"Error during OCR: <message>"`` is returned instead.
    """
    try:
        prepared = enhance_image(image, for_display=False)
        return pytesseract.image_to_string(prepared, config=custom_config)
    except Exception as exc:
        return f"Error during OCR: {exc}"
# --- Page Setup ---
st.set_page_config(page_title="Kabyle OCR", layout="wide")
st.title("Asemmezdey Asekdan n Teqbaylit - Kabyle OCR")

# --- Sidebar ---
# All input controls live in the sidebar: the file to process, the font size
# of the OCR text editor, and the DPI used when rasterising PDF pages.
with st.sidebar:
    st.header("Isefka")
    uploaded_file = st.file_uploader(
        "Ssali-d Afaylu - Smenyif amerkid n 300+ DPI",
        type=["png", "jpg", "jpeg", "pdf"],
    )
    # Font size (px) applied to the OCR text area through the injected CSS.
    font_size = st.slider(
        "Tiddi n Yisekkilen n Uḍris seg OCR (px)",
        min_value=10,
        max_value=30,
        value=18,
        step=1,
    )
    # DPI passed to the PDF-to-image conversion; also scales the preview width.
    preview_dpi = st.slider(
        "Amerkid n Uskan (DPI)",
        min_value=150,
        max_value=700,
        value=300,
        step=50,
    )
# --- Inject Custom CSS for Font Size, Text Color, and Image Layout ---
# Emits a <style> element through st.markdown (unsafe_allow_html=True is passed
# so the raw HTML is not escaped). The template is an f-string, which is why
# every literal CSS brace is doubled ({{ }}) and only {font_size} is substituted.
# The rules:
#   * set the sidebar-selected font size on the text area, with forced black
#     text on a white background and 1.5 line height;
#   * force a white background on the text area's container;
#   * cap preview images at 100% of their container width, keep their aspect
#     ratio, centre them and add a 10px bottom margin.
# NOTE(review): the selectors depend on Streamlit's generated DOM
# (data-testid values, .stTextArea) - confirm they still match the deployed
# Streamlit version.
st.markdown(
f"""
<style>
/* --- Text Area Styling --- */
/* Target the main text area input field */
textarea[data-testid="stText"] {{
font-size: {font_size}px;
color: #000000 !important; /* Force black text */
background-color: #FFFFFF !important; /* Force white background */
line-height: 1.5; /* Improve readability with line spacing */
}}
/* Target the text area inside stTextArea component */
.stTextArea textarea {{
font-size: {font_size}px;
color: #000000 !important; /* Force black text */
background-color: #FFFFFF !important; /* Force white background */
line-height: 1.5;
}}
/* Target the container of the text area for potential background issues */
.stTextArea > div > div {{
background-color: #FFFFFF !important; /* Ensure container background is white */
}}
/* --- Image Styling (Kept as is) --- */
section[data-testid="stSidebar"] ~ div > div:has(div[data-testid="stColumn"] > div:nth-child(1)) > div:nth-child(1) img {{
max-width: 100%; /* Ensures image doesn't exceed the column width */
height: auto; /* Maintains aspect ratio when width is constrained */
display: block; /* Makes the image a block element, necessary for max-width */
margin-left: auto; /* Center the image horizontally within its container */
margin-right: auto; /* Center the image horizontally within its container */
margin-bottom: 10px; /* Add some space below the image */
}}
</style>
""",
unsafe_allow_html=True
)
# --- Main App Logic ---
def _preview_width(dpi, base_dpi=300, base_width=600):
    """Return the preview width in pixels, scaled linearly with *dpi*.

    A 300 DPI preview is 600 px wide; other DPI values scale proportionally.
    """
    return int((dpi / base_dpi) * base_width)


def _goto_page(index):
    """Button callback: make *index* the page shown in the PDF preview."""
    st.session_state.current_page_index = index


if uploaded_file is not None:
    # A file is identified by (name, size, MIME type). When that changes, a
    # new file was selected and everything derived from the old one is reset.
    current_file_info = (uploaded_file.name, uploaded_file.size, uploaded_file.type)
    if 'current_file_info' not in st.session_state or st.session_state.current_file_info != current_file_info:
        st.session_state.current_file_info = current_file_info
        st.session_state.ocr_text = ""
        st.session_state.display_image = None
        st.session_state.all_pdf_images = []  # Rendered PDF pages for the preview
        st.session_state.current_page_index = 0  # Page shown in the preview

    st.info(f"Afaylu i d-yulin: {uploaded_file.name}")

    # Defensive defaults, in case a key is missing without a file change.
    if 'ocr_text' not in st.session_state:
        st.session_state.ocr_text = ""
    if 'display_image' not in st.session_state:
        st.session_state.display_image = None
    if 'all_pdf_images' not in st.session_state:
        st.session_state.all_pdf_images = []
    if 'current_page_index' not in st.session_state:
        st.session_state.current_page_index = 0

    # Side-by-side layout: preview on the left, OCR text editor on the right.
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Askan n Ufaylu deg Talɣa-s Tamenzut")
        if "pdf" in uploaded_file.type:
            if st.button("Sekker PDF (Askan n Yisebtar)"):
                try:
                    # getvalue() returns the complete upload regardless of the
                    # current stream position; read() would return b"" if the
                    # buffer had already been consumed.
                    pdf_content = uploaded_file.getvalue()
                    # Rasterise every page at the DPI chosen in the sidebar.
                    images = convert_from_bytes(pdf_content, dpi=preview_dpi)
                    if images:
                        # Keep un-enhanced copies; enhancement happens at display time.
                        st.session_state.all_pdf_images = [img.copy() for img in images]
                        st.session_state.current_page_index = 0
                        # First page is also stored on its own (kept for compatibility).
                        st.session_state.display_image = images[0]
                        st.success(f"Yuli-d uPDF (DPI n Uskan: {preview_dpi}). Tekki ɣef 'Sekker OCR' i Uselket.")
                    else:
                        st.error("Ulac isebtare deg ufaylu PDF.")
                except Exception as e:
                    st.error(f"Ugul deg uselket n PDF: {e}")
                    st.warning("Senked ma yuli poppler (MD. 'conda install -c conda-forge poppler').")

            # Single-page viewer with previous/next navigation, shown once
            # pages have been rendered.
            if st.session_state.all_pdf_images:
                st.subheader("Askan n Yisebtar n Ufaylu PDF")
                num_pages = len(st.session_state.all_pdf_images)
                current_idx = st.session_state.current_page_index

                col_nav1, col_nav2, col_nav3 = st.columns([1, 2, 1])
                with col_nav1:
                    # Disabled on the first page; target index is clamped at 0.
                    st.button(
                        "Ɣer deffir",
                        disabled=(current_idx == 0),
                        on_click=_goto_page,
                        args=(max(0, current_idx - 1),),
                        key='prev_btn_slideshow',
                    )
                with col_nav2:
                    st.text(f"Asebter {current_idx + 1} n {num_pages}")
                with col_nav3:
                    # Disabled on the last page; target index is clamped at the end.
                    st.button(
                        "Ɣer zdat",
                        disabled=(current_idx == num_pages - 1),
                        on_click=_goto_page,
                        args=(min(num_pages - 1, current_idx + 1),),
                        key='next_btn_slideshow',
                    )

                # Enhance a copy for display only; the stored page stays original.
                current_img = st.session_state.all_pdf_images[current_idx]
                display_image_enhanced = enhance_image(current_img.copy(), for_display=True)
                st.image(
                    display_image_enhanced,
                    caption=f"Asebter {current_idx + 1} ({preview_dpi} DPI)",
                    width=_preview_width(preview_dpi),
                )
        else:  # It's an image file
            image = Image.open(uploaded_file)
            # Enhance a copy for display only.
            display_image_enhanced = enhance_image(image.copy(), for_display=True)
            st.image(display_image_enhanced, caption=uploaded_file.name, width=_preview_width(preview_dpi))
            # Keep the original image in session state.
            st.session_state.display_image = image

    with col2:
        st.subheader("Asezṛeg n Uḍris seg OCR")
        if st.button("Sekker OCR"):
            full_text = ""
            # Set when the PDF could not be converted, so that a failure is
            # not followed by a success message or by wiping existing text.
            ocr_failed = False
            progress_text = st.empty()
            progress_bar = st.progress(0)
            with st.spinner("Asekker n OCR..."):
                if "pdf" in uploaded_file.type:
                    try:
                        # See the preview branch: getvalue() is independent of
                        # the stream position.
                        pdf_content = uploaded_file.getvalue()
                        images = convert_from_bytes(pdf_content, dpi=preview_dpi)
                        num_pages = len(images)
                        if not images:
                            ocr_failed = True
                            st.error("Ulac isebtare deg ufaylu PDF.")
                        page_texts = []
                        for i, page_image in enumerate(images):
                            progress_text.text(f"Yeɣɣar asebter {i+1} n {num_pages}...")
                            progress_bar.progress((i + 1) / num_pages)
                            txt = process_image(page_image)
                            # Each page is introduced by a separator line.
                            page_texts.append(f"\n--- Asebter {i+1} ---\n{txt}\n")
                        full_text = "".join(page_texts)
                    except Exception as e:
                        ocr_failed = True
                        st.error(f"Ugul deg uPDF deg OCR: {e}")
                        st.warning("Ma twalaḍ ugul yeɛnan 'poppler_path', senked ma yuli poppler (MD. 'conda install -c conda-forge poppler').")
                else:  # It's an image file
                    # process_image applies the enhancement itself, so the
                    # uploaded image is used as-is.
                    progress_text.text("Yeɣɣar tugna...")
                    image = Image.open(uploaded_file)
                    full_text = process_image(image)
                    progress_bar.progress(100)

            # Remove the progress widgets once processing is over.
            progress_text.empty()
            progress_bar.empty()
            if not ocr_failed:
                st.session_state.ocr_text = full_text
                st.success("OCR Yemmed!")

        # Editor is always visible; it shows the OCR result and accepts edits.
        edited_text = st.text_area("Zṛeg Aḍris, Seɣti Tira-s da", value=st.session_state.ocr_text, height=600, key="text_editor")
        # Persist user edits so the download below reflects them.
        if edited_text != st.session_state.ocr_text:
            st.session_state.ocr_text = edited_text

        # Offer the download only when there is text to save.
        if st.session_state.ocr_text:
            st.download_button(
                label="Zdem Aḍris",
                data=st.session_state.ocr_text.encode('utf-8'),
                file_name=f"{uploaded_file.name.replace('.', '_')}_ocr.txt",
                mime="text/plain"
            )
        else:
            st.info("Seddu OCR, Selket s Aḍris.")
else:
    # No file selected (initial load or file removed): drop all per-file state.
    for key in ["ocr_text", "display_image", "current_file_info", "all_pdf_images", "current_page_index"]:
        if key in st.session_state:
            del st.session_state[key]
    st.write("Ldi afaylu PDF, PNG, JPG, or JPEG seg ufeggad n yisefka.")