Upgraded search feature
Browse files- app.py +17 -8
- main_got.py +1 -1
app.py
CHANGED
|
@@ -3,6 +3,22 @@ from main_got import extract_text
|
|
| 3 |
import re
|
| 4 |
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
# Streamlit UI
|
| 7 |
st.title("OCR and Document Search Web App")
|
| 8 |
|
|
@@ -18,12 +34,5 @@ if uploaded_image is not None:
|
|
| 18 |
|
| 19 |
# Search functionality
|
| 20 |
search_query = st.text_input("Enter a keyword to search within the text")
|
| 21 |
-
|
| 22 |
-
results = [match.start() for match in re.finditer(search_query, extracted_text)]
|
| 23 |
-
if results:
|
| 24 |
-
st.subheader("Search Results")
|
| 25 |
-
for result in results:
|
| 26 |
-
st.write(f"Keyword found at index: {result}")
|
| 27 |
-
else:
|
| 28 |
-
st.write("No results found.")
|
| 29 |
|
|
|
|
| 3 |
import re
|
| 4 |
|
| 5 |
|
| 6 |
+
def highlight_keywords(text: str, keyword: str) -> str:
    """Return *text* as HTML with sentences containing *keyword* highlighted.

    The text is split into sentences on '. '. Sentences that contain the
    keyword (matched case-insensitively) are wrapped in a red <span>, and
    every occurrence of the keyword inside them is wrapped in a yellow
    <mark>. Sentences without the keyword pass through unchanged.

    Args:
        text: The document text to search and annotate.
        keyword: The search term; matching is case-insensitive.

    Returns:
        An HTML string suitable for st.markdown(..., unsafe_allow_html=True).
    """
    if not keyword:
        # An empty keyword would "match" every sentence, and str.replace("")
        # would inject markup between every character — return text untouched.
        return text

    # Bug fix: the original detected matches case-insensitively but then used
    # case-sensitive str.replace(keyword, ...), so e.g. searching "cats" in a
    # sentence containing "Cats" flagged the sentence red yet marked nothing.
    # A compiled IGNORECASE pattern marks every occurrence while preserving
    # the original casing; re.escape guards against regex metacharacters.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

    pieces = []
    for sentence in text.split('. '):
        if pattern.search(sentence):
            marked = pattern.sub(
                lambda m: f'<mark style="background-color: yellow">{m.group(0)}</mark>',
                sentence,
            )
            pieces.append(f'<span style="color: red">{marked}.</span> ')
        else:
            pieces.append(sentence + ". ")
    return "".join(pieces)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
# Streamlit UI
|
| 23 |
st.title("OCR and Document Search Web App")
|
| 24 |
|
|
|
|
| 34 |
|
| 35 |
# Search functionality: take a keyword and render the OCR'd document text
# with matching sentences/keywords highlighted as HTML.
search_query = st.text_input("Enter a keyword to search within the text")
if search_query:
    # Bug fix: highlight_keywords takes (text, keyword); the original passed
    # only the query, which raises TypeError and never searches the extracted
    # document text. Guarding on a non-empty query avoids rendering noise
    # before the user has typed anything.
    st.markdown(
        highlight_keywords(extracted_text, search_query),
        unsafe_allow_html=True,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
main_got.py
CHANGED
|
@@ -19,7 +19,7 @@ def extract_text(image_path):
|
|
| 19 |
# and provides safe deserialization unlike pickle-based one
|
| 20 |
pad_token_id=tokenizer.eos_token_id, # Set the pad token from tokenizer
|
| 21 |
)
|
| 22 |
-
|
| 23 |
image_file = image_path
|
| 24 |
# Extract text
|
| 25 |
res = model.chat(tokenizer, image_file, ocr_type='ocr')
|
|
|
|
| 19 |
# and provides safe deserialization unlike pickle-based one
|
| 20 |
pad_token_id=tokenizer.eos_token_id, # Set the pad token from tokenizer
|
| 21 |
)
|
| 22 |
+
model.to(device)
|
| 23 |
image_file = image_path
|
| 24 |
# Extract text
|
| 25 |
res = model.chat(tokenizer, image_file, ocr_type='ocr')
|