File size: 2,144 Bytes
535cfc5
d421cca
b862961
d421cca
 
 
 
a68a28a
d421cca
 
535cfc5
d421cca
 
 
f08df26
d421cca
 
535cfc5
d421cca
 
 
 
 
 
535cfc5
d421cca
 
 
 
 
 
 
 
535cfc5
d421cca
 
 
 
 
 
b862961
d421cca
 
 
535cfc5
d421cca
 
ddd6099
d421cca
 
535cfc5
d421cca
 
535cfc5
d421cca
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import html
import re
import subprocess

import numpy as np
import streamlit as st
from paddleocr import PaddleOCR
from PIL import Image
from pyngrok import ngrok

# Initialize PaddleOCR once and reuse it. Streamlit re-executes this script on
# every widget interaction; st.cache_resource hands back the already-built
# engine on reruns instead of constructing a new one each time.
# NOTE(review): the cached engine is shared by all sessions of the server
# process - confirm that concurrent ocr.ocr() calls are acceptable for your
# deployment.
@st.cache_resource
def _load_ocr():
    """Build the PaddleOCR engine (English, angle classification enabled)."""
    return PaddleOCR(use_angle_cls=True, lang='en')  # Enable angle classification for better accuracy


ocr = _load_ocr()

# Page header
st.title("Real-Time Text Extraction from Images (PaddleOCR)")
st.markdown("Upload or capture an image to extract text using PaddleOCR.")

# Two ways to supply an image: a file upload widget, then a camera widget.
uploaded_file = st.file_uploader("Upload Image", type=['png', 'jpg', 'jpeg'])
st.markdown("OR")
captured_image = st.camera_input("Capture Image")

# Decide which input to use; an uploaded file wins over a camera capture.
image = None  # Stays None when neither widget has provided an image
if uploaded_file is not None:
    _picked, _caption = uploaded_file, "Uploaded Image"
elif captured_image is not None:
    _picked, _caption = captured_image, "Captured Image"
else:
    _picked = None

# Open the chosen input with PIL and show it back to the user.
if _picked is not None:
    image = Image.open(_picked)
    st.image(image, caption=_caption, use_container_width=True)

def _collect_text(results):
    """Join the recognized strings of the first OCR result page with spaces.

    Each entry of ``results[0]`` is indexed as ``entry[1][0]`` to obtain the
    recognized string. Returns an empty string when ``results`` is empty or
    its first page is ``None``/empty (nothing detected), instead of raising.
    """
    if not results or not results[0]:
        return ""
    return " ".join(line[1][0] for line in results[0])


def _format_as_html(text):
    """Turn raw OCR text into an HTML fragment with ``<br>`` line breaks.

    Whitespace runs are collapsed to single spaces, the text is HTML-escaped,
    and a ``<br>`` is inserted after every standalone number and in place of
    every ``|`` character.
    """
    # Collapse tabs, newlines and repeated spaces into single spaces.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()

    # Escape before adding our own tags: the result is rendered with
    # unsafe_allow_html=True, so markup read from the image must not be
    # interpreted as HTML. quote=False keeps the entities limited to
    # &amp; &lt; &gt;, none of which contain digits the number rule could hit.
    escaped_text = html.escape(cleaned_text, quote=False)

    # Add HTML <br> tags for line breaks after numbers.
    formatted_text = re.sub(r'(\b\d+\b)', r'\1<br>', escaped_text)

    # Add line breaks for table columns or box-separated text. Tabs were
    # already collapsed to spaces above, so only '|' can still occur here.
    return formatted_text.replace('|', '<br>')


if image is not None:
    # Convert image to numpy array
    image_np = np.array(image)

    # Perform OCR with PaddleOCR
    with st.spinner("Extracting text..."):
        try:
            # Extract text from the image
            results = ocr.ocr(image_np, cls=True)
            extracted_text = _collect_text(results)

            if extracted_text:
                # Display the formatted text with HTML rendering
                st.subheader("Extracted Text:")
                st.markdown(_format_as_html(extracted_text), unsafe_allow_html=True)
            else:
                # Nothing recognized is a normal outcome, not an error.
                st.info("No text was detected in the image.")
        except Exception as e:
            # Top-level UI boundary: report the failure in the page.
            st.error(f"Error during text extraction: {e}")