Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -107,9 +107,9 @@ def extract_text_qwen(image_file, model, processor):
|
|
| 107 |
# Function to highlight the keyword in the text
|
| 108 |
|
| 109 |
|
| 110 |
-
def highlight_text(
|
| 111 |
text_highlighter(
|
| 112 |
-
text=
|
| 113 |
labels=[("KEYWORD", "#0000FF")],
|
| 114 |
annotations=[
|
| 115 |
{"start": start, "end": end, "tag": "KEYWORD"},
|
|
@@ -165,7 +165,7 @@ if uploaded_file:
|
|
| 165 |
images_dir = 'images'
|
| 166 |
os.makedirs(images_dir, exist_ok=True)
|
| 167 |
image_path = os.path.join(
|
| 168 |
-
images_dir, "temp_file.
|
| 169 |
with open(image_path, 'wb') as f:
|
| 170 |
f.write(uploaded_file.getvalue())
|
| 171 |
|
|
@@ -175,12 +175,21 @@ if uploaded_file:
|
|
| 175 |
result_path = os.path.join(
|
| 176 |
results_dir, "temp_file_result.json" if clipboard_use else f"{uploaded_file.name}_result.json")
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
# Handle predictions
|
| 179 |
if predict_button:
|
| 180 |
if os.path.exists(result_path):
|
| 181 |
with open(result_path, 'r') as f:
|
| 182 |
result_data = json.load(f)
|
| 183 |
-
extracted_text = result_data["
|
|
|
|
|
|
|
| 184 |
else:
|
| 185 |
with st.spinner("Processing..."):
|
| 186 |
if model_choice == "GOT_CPU":
|
|
@@ -199,30 +208,35 @@ if uploaded_file:
|
|
| 199 |
image_path, qwen_model, qwen_processor)
|
| 200 |
|
| 201 |
# Clean and polish extracted text
|
| 202 |
-
cleaned_text
|
| 203 |
-
|
| 204 |
-
|
|
|
|
| 205 |
|
| 206 |
# Save results to JSON file
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
st.
|
| 215 |
-
st.
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
# Function to highlight the keyword in the text
|
| 108 |
|
| 109 |
|
| 110 |
+
def highlight_text(cleaned_text, start, end):
|
| 111 |
text_highlighter(
|
| 112 |
+
text=cleaned_text,
|
| 113 |
labels=[("KEYWORD", "#0000FF")],
|
| 114 |
annotations=[
|
| 115 |
{"start": start, "end": end, "tag": "KEYWORD"},
|
|
|
|
| 165 |
images_dir = 'images'
|
| 166 |
os.makedirs(images_dir, exist_ok=True)
|
| 167 |
image_path = os.path.join(
|
| 168 |
+
images_dir, "temp_file.png" if clipboard_use else uploaded_file.name)
|
| 169 |
with open(image_path, 'wb') as f:
|
| 170 |
f.write(uploaded_file.getvalue())
|
| 171 |
|
|
|
|
| 175 |
result_path = os.path.join(
|
| 176 |
results_dir, "temp_file_result.json" if clipboard_use else f"{uploaded_file.name}_result.json")
|
| 177 |
|
| 178 |
+
# Display extracted text
|
| 179 |
+
st.subheader("Extracted Text (Cleaned & Polished)")
|
| 180 |
+
if 'cleaned_text' not in st.session_state:
|
| 181 |
+
st.session_state.cleaned_text = ""
|
| 182 |
+
if 'polished_text' not in st.session_state:
|
| 183 |
+
st.session_state.polished_text = ""
|
| 184 |
+
|
| 185 |
# Handle predictions
|
| 186 |
if predict_button:
|
| 187 |
if os.path.exists(result_path):
|
| 188 |
with open(result_path, 'r') as f:
|
| 189 |
result_data = json.load(f)
|
| 190 |
+
extracted_text = result_data["extracted_text"]
|
| 191 |
+
cleaned_text = result_data["cleaned_text"]
|
| 192 |
+
polished_text = result_data["polished_text"]
|
| 193 |
else:
|
| 194 |
with st.spinner("Processing..."):
|
| 195 |
if model_choice == "GOT_CPU":
|
|
|
|
| 208 |
image_path, qwen_model, qwen_processor)
|
| 209 |
|
| 210 |
# Clean and polish extracted text
|
| 211 |
+
if not cleaned_text and polished_text:
|
| 212 |
+
cleaned_text = clean_extracted_text(extracted_text)
|
| 213 |
+
polished_text = polish_text_with_ai(cleaned_text) if model_choice in [
|
| 214 |
+
"GOT_CPU", "GOT_GPU"] else cleaned_text
|
| 215 |
|
| 216 |
# Save results to JSON file
|
| 217 |
+
if not os.path.exists(result_path):
|
| 218 |
+
result_data = {"extracted_text": extracted_text,
|
| 219 |
+
"cleaned_text": cleaned_text, "polished_text": polished_text}
|
| 220 |
+
with open(result_path, 'w') as f:
|
| 221 |
+
json.dump(result_data, f)
|
| 222 |
+
|
| 223 |
+
# Save results to session state
|
| 224 |
+
st.session_state.cleaned_text = cleaned_text
|
| 225 |
+
st.session_state.polished_text = polished_text
|
| 226 |
+
|
| 227 |
+
# Display extracted text
|
| 228 |
+
st.markdown(st.session_state.cleaned_text, unsafe_allow_html=True)
|
| 229 |
+
st.markdown(st.session_state.polished_text, unsafe_allow_html=True)
|
| 230 |
+
|
| 231 |
+
# Input search term with real-time update on key press
search_query = st_keyup("Search in extracted text:")

if search_query:
    # str.find returns the index of the first occurrence, or -1 when the
    # query does not appear in the cleaned text.
    start = st.session_state.cleaned_text.find(search_query)
    if start != -1:
        # Use the builtin len(): Python strings have no `.length` attribute,
        # and binding a variable named `len` would shadow the builtin for
        # the remainder of the script.
        end = start + len(search_query)
        highlight_text(st.session_state.cleaned_text, start, end)
    else:
        st.write("No Search Found.")
|