import streamlit as st
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch
import cv2
import numpy as np
import tempfile
# Load the processor and model
@st.cache_resource
def load_model():
    """Load and cache the TrOCR processor and model.

    Cached via Streamlit so the heavy download/initialisation happens only
    once per server session, not on every script rerun.
    """
    checkpoint = "microsoft/trocr-base-handwritten"
    return (
        TrOCRProcessor.from_pretrained(checkpoint),
        VisionEncoderDecoderModel.from_pretrained(checkpoint),
    )
# Module-level load: cached by @st.cache_resource, so reruns reuse the same objects.
processor, model = load_model()
# Helper function to preprocess the image and detect text lines
def detect_lines(image, min_height=20, min_width=100):
    """Segment *image* into per-line crops for line-level OCR.

    Parameters
    ----------
    image : PIL.Image.Image or np.ndarray
        Page image. RGB (3-channel) or already-grayscale (2-D) input is
        accepted; previously a grayscale array would crash cv2.cvtColor.
    min_height, min_width : int
        Bounding boxes smaller than either dimension are discarded as noise.

    Returns
    -------
    list[np.ndarray]
        Cropped line images (same channel layout as the input), ordered
        top-to-bottom by their y coordinate.
    """
    image_np = np.asarray(image)
    # Generalization: accept grayscale input directly instead of requiring RGB.
    if image_np.ndim == 3:
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    else:
        gray = image_np
    # Otsu chooses the threshold automatically; INV makes ink white so
    # findContours treats text strokes as foreground.
    _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # Dilate so nearby strokes within a line fuse into one connected blob.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    dilated = cv2.dilate(binary, kernel, iterations=1)
    # External contours only: each blob is assumed to be one text line.
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Sort by y so lines come out in reading order, then drop boxes too small
    # to plausibly be a text line.
    boxes = sorted((cv2.boundingRect(c) for c in contours), key=lambda b: b[1])
    return [
        image_np[y:y + h, x:x + w]
        for x, y, w, h in boxes
        if h >= min_height and w >= min_width
    ]
# Streamlit app
# Page title and the upload widget; everything below only runs once a file is provided.
st.title("Multiline Handwritten OCR with Hugging Face and OpenCV")
uploaded_file = st.file_uploader("Upload an Image (JPG, JPEG, PNG)", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
    try:
        # Load and display the uploaded image; force RGB so downstream
        # cv2/TrOCR code always sees a 3-channel layout.
        image = Image.open(uploaded_file).convert("RGB")
        st.image(image, caption="Uploaded Image", use_column_width=True)
        st.write("Processing the image...")
        # Segment the page into individual text lines (TrOCR expects one line per call).
        line_images = detect_lines(image, min_height=30, min_width=100)
        st.write(f"Detected {len(line_images)} lines in the image.")
        # Robustness: if segmentation found nothing, fall back to OCR-ing the
        # whole page instead of silently producing empty output.
        if not line_images:
            line_images = [np.array(image)]
        extracted_text = ""
        # Pure inference: no_grad avoids building autograd state, cutting
        # memory and time without changing the generated text.
        with torch.no_grad():
            for idx, line_img in enumerate(line_images):
                # Convert the crop back to PIL for the processor.
                line_pil = Image.fromarray(line_img)
                pixel_values = processor(images=line_pil, return_tensors="pt").pixel_values
                generated_ids = model.generate(pixel_values)
                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                extracted_text += f"Line {idx + 1}: {generated_text}\n"
        # Display the extracted text
        st.subheader("Extracted Text:")
        st.text_area("Output Text", extracted_text, height=200)
        # Provide an option to download the extracted text
        st.download_button(
            label="Download Text",
            data=extracted_text,
            file_name="extracted_text.txt",
            mime="text/plain",
        )
    except Exception as e:
        # Top-level UI boundary: surface the failure to the user rather than
        # crashing the Streamlit script run.
        st.error(f"An error occurred while processing the image: {e}")
else:
    st.info("Please upload an image to start the OCR process.")