Spaces:

kirchik47
/

ocr_task

Sleeping

kirchik47 committed on Sep 29, 2024

Commit

89a3f00

1 Parent(s): 3153dd0

Upgraded search feature

Files changed (2) hide show

app.py CHANGED Viewed

@@ -3,6 +3,22 @@ from main_got import extract_text
 import re
 # Streamlit UI
 st.title("OCR and Document Search Web App")
@@ -18,12 +34,5 @@ if uploaded_image is not None:
         # Search functionality
         search_query = st.text_input("Enter a keyword to search within the text")
-        if search_query:
-            results = [match.start() for match in re.finditer(search_query, extracted_text)]
-            if results:
-                st.subheader("Search Results")
-                for result in results:
-                    st.write(f"Keyword found at index: {result}")
-            else:
-                st.write("No results found.")

 import re
def highlight_keywords(text: str, keyword: str) -> str:
    """Return *text* as an HTML fragment with keyword matches highlighted.

    The text is split into sentences on ``". "``. Every sentence that
    contains *keyword* (compared case-insensitively) is wrapped in a red
    ``<span>``, and each occurrence of the keyword inside it is wrapped in
    a yellow ``<mark>``, keeping the casing found in the text. Sentences
    are re-joined with ``". "``, so text without a match round-trips
    unchanged apart from HTML escaping.

    Args:
        text: Plain text to search. It is HTML-escaped before being
            embedded, so markup-like characters are shown literally.
        keyword: Literal string to look for (not a regular expression).
            An empty keyword highlights nothing.

    Returns:
        An HTML string intended for a renderer that allows raw HTML.
    """
    import html  # local import: only this helper needs it

    separator = ". "
    sentences = text.split(separator)

    # An empty keyword is "contained" in every string, so treat it as
    # "no search" instead of highlighting everything.
    pattern = (
        re.compile(f"({re.escape(keyword)})", re.IGNORECASE) if keyword else None
    )

    last_index = len(sentences) - 1
    rendered = []
    for index, sentence in enumerate(sentences):
        # Restore the period consumed by the split; the final piece never
        # had one removed, so nothing is appended to it.
        stop = "." if index < last_index else ""

        # With a capturing group, split() alternates non-match / match, so
        # odd positions are the keyword occurrences in their original case.
        parts = pattern.split(sentence) if pattern else [sentence]
        if len(parts) == 1:
            rendered.append(html.escape(sentence, quote=False) + stop)
            continue

        # Escape each piece separately so the keyword is matched against
        # the raw text and can never match inside an entity such as "&amp;".
        marked = "".join(
            f'<mark style="background-color: yellow">'
            f'{html.escape(part, quote=False)}</mark>'
            if position % 2
            else html.escape(part, quote=False)
            for position, part in enumerate(parts)
        )
        rendered.append(f'<span style="color: red">{marked}{stop}</span>')

    return " ".join(rendered)
 # Streamlit UI
 st.title("OCR and Document Search Web App")
         # Search functionality
         search_query = st.text_input("Enter a keyword to search within the text")
+        st.markdown(highlight_keywords(extracted_text, search_query), unsafe_allow_html=True)  # pass the OCR text to search in, then the query

main_got.py CHANGED Viewed

@@ -19,7 +19,7 @@ def extract_text(image_path):
                                                             # and provides safe deserialization unlike pickle-based one
                                     pad_token_id=tokenizer.eos_token_id, # Set the pad token from tokenizer
             )
     image_file = image_path
     # Extract text
     res = model.chat(tokenizer, image_file, ocr_type='ocr')

                                                             # and provides safe deserialization unlike pickle-based one
                                     pad_token_id=tokenizer.eos_token_id, # Set the pad token from tokenizer
             )
+    model.to(device)
     image_file = image_path
     # Extract text
     res = model.chat(tokenizer, image_file, ocr_type='ocr')