Spaces:

UniquePratham
/

DualTextOCRFusion

Sleeping

App Files Files Community

UniquePratham committed on Sep 30, 2024

Commit

8308624

verified ·

1 Parent(s): 7297505

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -29

app.py CHANGED Viewed

@@ -107,9 +107,9 @@ def extract_text_qwen(image_file, model, processor):
 # Function to highlight the keyword in the text
-def highlight_text(text_sentence, start, end):
     text_highlighter(
-        text=text_sentence,
         labels=[("KEYWORD", "#0000FF")],
         annotations=[
             {"start": start, "end": end, "tag": "KEYWORD"},
@@ -165,7 +165,7 @@ if uploaded_file:
     images_dir = 'images'
     os.makedirs(images_dir, exist_ok=True)
     image_path = os.path.join(
-        images_dir, "temp_file.jpg" if clipboard_use else uploaded_file.name)
     with open(image_path, 'wb') as f:
         f.write(uploaded_file.getvalue())
@@ -175,12 +175,21 @@ if uploaded_file:
     result_path = os.path.join(
         results_dir, "temp_file_result.json" if clipboard_use else f"{uploaded_file.name}_result.json")
     # Handle predictions
     if predict_button:
         if os.path.exists(result_path):
             with open(result_path, 'r') as f:
                 result_data = json.load(f)
-            extracted_text = result_data["polished_text"]
         else:
             with st.spinner("Processing..."):
                 if model_choice == "GOT_CPU":
@@ -199,30 +208,35 @@ if uploaded_file:
                         image_path, qwen_model, qwen_processor)
         # Clean and polish extracted text
-        cleaned_text = clean_extracted_text(extracted_text)
-        polished_text = polish_text_with_ai(cleaned_text) if model_choice in [
-            "GOT_CPU", "GOT_GPU"] else cleaned_text
         # Save results to JSON file
-        result_data = {"extracted_text": extracted_text,
-                       "cleaner_text": cleaned_text, "polished_text": polished_text}
-        with open(result_path, 'w') as f:
-            json.dump(result_data, f)
-        # Display extracted text
-        st.subheader("Extracted Text (Cleaned & Polished)")
-        st.markdown(cleaned_text, unsafe_allow_html=True)
-        st.markdown(polished_text, unsafe_allow_html=True)
-        # Input search term with real-time update on key press
-        search_query = st_keyup("Search in extracted text:")
-        if search_query:
-            index = extracted_text.find(search_query)
-            start = index
-            len = search_query.length
-            end = index + len
-            if index != -1:
-                highlight_text(extracted_text, start, end)
-            else:
-                st.write("No Search Found.")

 # Function to highlight the keyword in the text
+def highlight_text(cleaned_text, start, end):
     text_highlighter(
+        text=cleaned_text,
         labels=[("KEYWORD", "#0000FF")],
         annotations=[
             {"start": start, "end": end, "tag": "KEYWORD"},
     images_dir = 'images'
     os.makedirs(images_dir, exist_ok=True)
     image_path = os.path.join(
+        images_dir, "temp_file.png" if clipboard_use else uploaded_file.name)
     with open(image_path, 'wb') as f:
         f.write(uploaded_file.getvalue())
     result_path = os.path.join(
         results_dir, "temp_file_result.json" if clipboard_use else f"{uploaded_file.name}_result.json")
+    # Display extracted text
+    st.subheader("Extracted Text (Cleaned & Polished)")
+    if 'cleaned_text' not in st.session_state:
+        st.session_state.cleaned_text = ""
+    if 'polished_text' not in st.session_state:
+        st.session_state.polished_text = ""
     # Handle predictions
     if predict_button:
         if os.path.exists(result_path):
             with open(result_path, 'r') as f:
                 result_data = json.load(f)
+            extracted_text = result_data["extracted_text"]
+            cleaned_text = result_data["cleaned_text"]
+            polished_text = result_data["polished_text"]
         else:
             with st.spinner("Processing..."):
                 if model_choice == "GOT_CPU":
                         image_path, qwen_model, qwen_processor)
         # Clean and polish extracted text
+        if not cleaned_text and polished_text:
+            cleaned_text = clean_extracted_text(extracted_text)
+            polished_text = polish_text_with_ai(cleaned_text) if model_choice in [
+                "GOT_CPU", "GOT_GPU"] else cleaned_text
         # Save results to JSON file
+        if not os.path.exists(result_path):
+            result_data = {"extracted_text": extracted_text,
+                           "cleaned_text": cleaned_text, "polished_text": polished_text}
+            with open(result_path, 'w') as f:
+                json.dump(result_data, f)
+         # Save results to session state
+        st.session_state.cleaned_text = cleaned_text
+        st.session_state.polished_text = polished_text
+# Display extracted text
+st.markdown(st.session_state.cleaned_text, unsafe_allow_html=True)
+st.markdown(st.session_state.polished_text, unsafe_allow_html=True)
+# Input search term with real-time update on key press
+search_query = st_keyup("Search in extracted text:")
+if search_query:
+    # str.find returns the index of the first occurrence, or -1 if absent.
+    index = st.session_state.cleaned_text.find(search_query)
+    if index != -1:
+        start = index
+        # Python strings have no `.length` attribute; use the len() builtin.
+        # Do not bind the result to a variable named `len`, which would
+        # shadow the builtin for the rest of the script.
+        end = index + len(search_query)
+        highlight_text(st.session_state.cleaned_text, start, end)
+    else:
+        st.write("No Search Found.")