Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

App Files Files Community

PRIYANSHUDHAKED committed on Sep 28, 2024

Commit

e50af9e

verified ·

1 Parent(s): 806527c

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -79

app.py CHANGED Viewed

@@ -1,79 +1,54 @@
-import streamlit as st
-import cv2
-import pytesseract
-import numpy as np
-from PIL import Image
-import io
-import re
-# ANSI escape codes used to colour matches in console (terminal) output
-YELLOW_HIGHLIGHT = '\033[43m'  # start of a yellow-background run
-RESET_COLOR = '\033[0m'  # end of the highlighted run; resets terminal attributes
-# Function for OCR processing
-def process_image(image_bytes):
-    """Run OCR on an encoded image and return the recognized text.
-
-    Args:
-        image_bytes: Raw contents of an image file (for example an uploaded
-            JPG or PNG) as a bytes object.
-
-    Returns:
-        The string produced by ``pytesseract.image_to_string``.
-    """
-    image = Image.open(io.BytesIO(image_bytes))
-    # Normalize to three-channel RGB first: grayscale and palette images
-    # decode to arrays without three colour channels, which the
-    # RGB -> BGR conversion below cannot handle.
-    rgb_image = image.convert("RGB")
-    opencv_image = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
-    return pytesseract.image_to_string(opencv_image)
-# Function for search and highlight
-def search_and_highlight(full_text, keyword):
-    """Find every case-insensitive occurrence of *keyword* in *full_text*.
-
-    Args:
-        full_text: The text to search.
-        keyword: Literal string to look for (regex metacharacters are escaped).
-
-    Returns:
-        A ``(results, html_text)`` tuple. ``results`` holds one snippet per
-        match, in document order: up to 50 characters of context on each side
-        with the match wrapped in ``YELLOW_HIGHLIGHT`` / ``RESET_COLOR``.
-        ``html_text`` is the whole text with every match wrapped in a yellow
-        ``<span>``. When nothing matches, or *keyword* is empty, the result is
-        ``([], full_text)``.
-    """
-    # An empty pattern matches at every position, which is never a useful hit.
-    if not keyword:
-        return [], full_text
-    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
-    results = []
-    html_parts = []
-    cursor = 0  # end of the previous match within full_text
-    for match in pattern.finditer(full_text):
-        start, end = match.span()
-        context_start = max(0, start - 50)
-        context_end = min(len(full_text), end + 50)
-        # Highlight for console output
-        results.append(
-            full_text[context_start:start]
-            + YELLOW_HIGHLIGHT + full_text[start:end] + RESET_COLOR
-            + full_text[end:context_end]
-        )
-        # Highlight for HTML output: copy the untouched gap, then the wrapped
-        # match. Building forward keeps the snippets in reading order.
-        html_parts.append(full_text[cursor:start])
-        html_parts.append(
-            f'<span style="background-color: yellow;">{full_text[start:end]}</span>'
-        )
-        cursor = end
-    if not results:
-        return [], full_text
-    html_parts.append(full_text[cursor:])
-    return results, "".join(html_parts)
-# Streamlit app layout
-st.title("Image Text Search App")
-uploaded_file = st.file_uploader(
-    "Upload an Image (JPG or PNG)",
-    type=["jpg", "jpeg", "png"],
-)
-if uploaded_file is not None:
-    # Show the upload, then OCR it and print whatever text was recognized.
-    image_bytes = uploaded_file.read()
-    st.image(image_bytes)
-    extracted_text = process_image(image_bytes)
-    st.write("Extracted Text:")
-    st.write(extracted_text)
-    # Optional keyword search over the recognized text.
-    search_keyword = st.text_input("Enter a keyword to search:")
-    if search_keyword:
-        results, highlighted_text = search_and_highlight(extracted_text, search_keyword)
-        if not results:
-            st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
-        else:
-            st.write(f"Keyword '{search_keyword}' found in the extracted text:")
-            # Number the context snippets starting from 1.
-            for position, snippet in enumerate(results, start=1):
-                st.write(f"{position}. ...{snippet}...")

+# app.py
+import html
+import re
+
+import cv2
+import numpy as np
+import pytesseract
+import streamlit as st
+from PIL import Image
+# Streamlit page: upload an image, OCR it with Tesseract, then optionally
+# search the recognized text and show each hit with surrounding context.
+# Set the title of the webpage
+st.title("OCR Text Extraction Tool")
+# Uploading an image
+uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
+if uploaded_file is not None:
+    try:
+        # Decoding and colour conversion live inside the try block so that a
+        # corrupt or unsupported upload is reported instead of crashing the app.
+        image = Image.open(uploaded_file)
+        # Normalize to three-channel RGB before the RGB -> BGR conversion;
+        # grayscale and palette images do not decode to three colour channels.
+        opencv_image = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
+        # Display the image
+        st.image(image, caption='Uploaded Image', use_column_width=True)
+        # Perform OCR
+        text = pytesseract.image_to_string(opencv_image)
+    except Exception as e:
+        st.error(f"An error occurred while processing the image: {str(e)}")
+    else:
+        st.subheader("Extracted Text:")
+        st.write(text)
+        # Search functionality
+        search_keyword = st.text_input("Enter a keyword to search in the extracted text:")
+        if search_keyword:
+            # The keyword is matched literally and case-insensitively.
+            pattern = re.compile(re.escape(search_keyword), re.IGNORECASE)
+            matches = list(pattern.finditer(text))
+            if matches:
+                st.markdown("### Keyword Found:")
+                for match in matches:
+                    start, end = match.span()
+                    # Up to 50 characters of context on each side of the hit.
+                    context_start = max(0, start - 50)
+                    context_end = min(len(text), end + 50)
+                    # OCR output is untrusted text: escape every piece before
+                    # embedding it in markup that is rendered as raw HTML.
+                    before = html.escape(text[context_start:start])
+                    hit = html.escape(text[start:end])
+                    after = html.escape(text[end:context_end])
+                    highlighted_text = (
+                        f"{before}"
+                        f"<span style='background-color: yellow;'>{hit}</span>"
+                        f"{after}"
+                    )
+                    # Without unsafe_allow_html the <span> is not rendered, so
+                    # the yellow highlight would never appear.
+                    st.markdown(f"...{highlighted_text}...", unsafe_allow_html=True)
+            else:
+                st.write(f"Keyword '{search_keyword}' not found in the extracted text.")