Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

App Files Community

PRIYANSHUDHAKED committed on Sep 29, 2024

Commit

d1a52ca

verified ·

1 Parent(s): 35f540b

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -30

app.py CHANGED Viewed

@@ -17,9 +17,12 @@ model = genai.GenerativeModel("gemini-1.5-flash")
 def extract_text_with_gemini(image, keyword=None):
     if keyword:
         prompt = f"""
-        Extract all text from this image. Provide the output as HTML, maintaining the general layout and structure of the document. Include all visible text, headings, and any important information.
-        Highlight all instances of the keyword '{keyword}' (case-insensitive) with a yellow background using HTML span tags.
         For example: <span style="background-color: yellow;">keyword</span>
         """
     else:
         prompt = """
@@ -35,6 +38,18 @@ def extract_text_with_gemini(image, keyword=None):
     return text
 def search_and_highlight(full_text, keyword):
     pattern = re.compile(re.escape(keyword), re.IGNORECASE)
     matches = list(pattern.finditer(full_text))
@@ -67,7 +82,7 @@ def search_and_highlight(full_text, keyword):
     return results, highlighted_text
 def app():
-    st.title("Image OCR and Search")
     uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
     if uploaded_file is not None:
@@ -78,36 +93,45 @@ def app():
         # Select search method
         search_method = st.radio("Choose search method:",
                                  ("Extract text first, then search",
-                                  "Search while extracting text"))
         search_keyword = st.text_input("Enter a keyword to search (or press Enter to exit)")
-        if st.button("Process Image"):
-            if search_method == "Extract text first, then search":
-                print("Extracting text from the image...")
-                extracted_text = extract_text_with_gemini(image)
-                st.subheader("Extracted Text:")
-                st.write(extracted_text)
-                if search_keyword:
-                    results, highlighted_text = search_and_highlight(extracted_text, search_keyword)
-                    if results:
-                        st.subheader(f"Keyword '{search_keyword}' found in the extracted text:")
-                        for i, result in enumerate(results, 1):
-                            st.markdown(f"{i}. ...{result}...", unsafe_allow_html=True)
-                        st.subheader("Full Text with Highlighted Keywords:")
-                        st.markdown(highlighted_text, unsafe_allow_html=True)
-                    else:
-                        st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
-            else:  # Search while extracting text
-                print("Extracting text and highlighting keyword...")
-                highlighted_text = extract_text_with_gemini(image, search_keyword)
-                st.subheader("Extracted Text with Highlighted Keyword:")
-                st.markdown(highlighted_text, unsafe_allow_html=True)
-            st.write("OCR and search completed.")
 if __name__ == "__main__":
     app()

 def extract_text_with_gemini(image, keyword=None):
     if keyword:
         prompt = f"""
+        1. Extract all text from this image.
+        2. Search for the keyword '{keyword}' (case-insensitive) in the extracted text.
+        3. Provide the output as HTML, maintaining the general layout and structure of the document.
+        4. Highlight all instances of the keyword '{keyword}' with a yellow background using HTML span tags.
         For example: <span style="background-color: yellow;">keyword</span>
+        5. If the keyword is not found, simply return the extracted text without highlighting.
         """
     else:
         prompt = """
     return text
+def extract_ner_with_gemini(image):
+    prompt = """
+    Analyze this image and extract all Named Entities (NER) present in the text.
+    Categorize them into types such as Person, Organization, Location, Date, etc.
+    Provide the output as a formatted list with categories and entities.
+    """
+    response = model.generate_content([prompt, image])
+    ner_text = response.text
+    return ner_text
 def search_and_highlight(full_text, keyword):
     pattern = re.compile(re.escape(keyword), re.IGNORECASE)
     matches = list(pattern.finditer(full_text))
     return results, highlighted_text
 def app():
+    st.title("Image OCR, Search, and NER Extraction")
     uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
     if uploaded_file is not None:
         # Select search method
         search_method = st.radio("Choose search method:",
                                  ("Extract text first, then search",
+                                  "Search while extracting text (using Gemini API)"))
         search_keyword = st.text_input("Enter a keyword to search (or press Enter to exit)")
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("Process Image"):
+                if search_method == "Extract text first, then search":
+                    print("Extracting text from the image...")
+                    extracted_text = extract_text_with_gemini(image)
+                    st.subheader("Extracted Text:")
+                    st.write(extracted_text)
+                    if search_keyword:
+                        results, highlighted_text = search_and_highlight(extracted_text, search_keyword)
+                        if results:
+                            st.subheader(f"Keyword '{search_keyword}' found in the extracted text:")
+                            for i, result in enumerate(results, 1):
+                                st.markdown(f"{i}. ...{result}...", unsafe_allow_html=True)
+                            st.subheader("Full Text with Highlighted Keywords:")
+                            st.markdown(highlighted_text, unsafe_allow_html=True)
+                        else:
+                            st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
+                else:  # Search while extracting text using Gemini API
+                    print("Extracting text and searching keyword using Gemini API...")
+                    highlighted_text = extract_text_with_gemini(image, search_keyword)
+                    st.subheader("Extracted Text with Highlighted Keyword:")
+                    st.markdown(highlighted_text, unsafe_allow_html=True)
+                st.write("OCR and search completed.")
+        with col2:
+            if st.button("Extract NER"):
+                print("Extracting Named Entities...")
+                ner_results = extract_ner_with_gemini(image)
+                st.subheader("Named Entities Extracted:")
+                st.write(ner_results)
 if __name__ == "__main__":
     app()