kirchik47 committed on
Commit
89a3f00
·
1 Parent(s): 3153dd0

Upgraded search feature

Browse files
Files changed (2) hide show
  1. app.py +17 -8
  2. main_got.py +1 -1
app.py CHANGED
@@ -3,6 +3,22 @@ from main_got import extract_text
3
  import re
4
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  # Streamlit UI
7
  st.title("OCR and Document Search Web App")
8
 
@@ -18,12 +34,5 @@ if uploaded_image is not None:
18
 
19
  # Search functionality
20
  search_query = st.text_input("Enter a keyword to search within the text")
21
- if search_query:
22
- results = [match.start() for match in re.finditer(search_query, extracted_text)]
23
- if results:
24
- st.subheader("Search Results")
25
- for result in results:
26
- st.write(f"Keyword found at index: {result}")
27
- else:
28
- st.write("No results found.")
29
 
 
3
  import re
4
 
5
 
6
def highlight_keywords(text: str, keyword: str) -> str:
    """Return *text* as HTML with every sentence containing *keyword* highlighted.

    The text is split into sentences on ``". "``. A sentence that contains the
    keyword (compared case-insensitively) is wrapped in a red ``<span>``, and
    each occurrence of the keyword inside it is wrapped in a yellow ``<mark>``
    using the casing found in the text. All text is HTML-escaped, so the
    result can be rendered as raw HTML.

    Args:
        text: The text to search.
        keyword: The literal keyword to look for. An empty keyword highlights
            nothing.

    Returns:
        The HTML string. When nothing matches it is just the escaped text.
    """
    import html  # local import: the module is only needed by this function

    if not keyword:
        # "" is contained in every string; without this guard every sentence
        # would be flagged and a <mark> inserted between all characters.
        return html.escape(text)

    # The capturing group makes split() keep the matched text, so the
    # original casing of each occurrence is preserved in the output.
    pattern = re.compile(f"({re.escape(keyword)})", re.IGNORECASE)

    sentences = text.split(". ")
    last_index = len(sentences) - 1
    rendered = []
    for index, sentence in enumerate(sentences):
        # Only sentences that were followed by ". " lost their period in the
        # split; the final one keeps whatever ending it already had.
        terminator = "." if index < last_index else ""
        pieces = pattern.split(sentence)
        if len(pieces) == 1:
            # No occurrence of the keyword in this sentence.
            rendered.append(html.escape(sentence) + terminator)
            continue
        # With one capturing group, odd positions hold the matched keyword.
        marked = "".join(
            f'<mark style="background-color: yellow">{html.escape(piece)}</mark>'
            if position % 2
            else html.escape(piece)
            for position, piece in enumerate(pieces)
        )
        rendered.append(f'<span style="color: red">{marked}{terminator}</span>')

    return " ".join(rendered)
20
+
21
+
22
  # Streamlit UI
23
  st.title("OCR and Document Search Web App")
24
 
 
34
 
35
  # Search functionality
36
  search_query = st.text_input("Enter a keyword to search within the text")
37
+ st.markdown(highlight_keywords(search_query), unsafe_allow_html=True)
 
 
 
 
 
 
 
38
 
main_got.py CHANGED
@@ -19,7 +19,7 @@ def extract_text(image_path):
19
  # and provides safe deserialization unlike pickle-based one
20
  pad_token_id=tokenizer.eos_token_id, # Set the pad token from tokenizer
21
  )
22
-
23
  image_file = image_path
24
  # Extract text
25
  res = model.chat(tokenizer, image_file, ocr_type='ocr')
 
19
  # and provides safe deserialization unlike pickle-based one
20
  pad_token_id=tokenizer.eos_token_id, # Set the pad token from tokenizer
21
  )
22
+ model.to(device)
23
  image_file = image_path
24
  # Extract text
25
  res = model.chat(tokenizer, image_file, ocr_type='ocr')