Spaces:

raahinaez
/

doc_classify_lora

Sleeping

App Files Files Community

raahinaez committed on Feb 2

Commit

4aec666

verified ·

1 Parent(s): 32bfddd

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -31

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import os
 import io
 import re
 import html as _html
 import streamlit as st
 import streamlit.components.v1 as components
 import torch
@@ -332,7 +334,7 @@ with st.sidebar:
         "OCR quality (DPI)",
         min_value=150,
         max_value=400,
-        value=300,
         step=25,
         help="Higher DPI improves OCR but increases processing time.",
     )
@@ -364,42 +366,84 @@ with tab_upload:
     st.markdown("</div>", unsafe_allow_html=True)
     if uploaded_file:
-        pdf_bytes = uploaded_file.read()
-        # `text=` was added to st.progress in later Streamlit versions; keep compatible with 1.27.0.
-        prog = st.progress(0)
-        prog_text = st.empty()
-        prog_text.caption("Running OCR…")
-        def _cb(done, total):
-            pct = int((done / total) * 100)
-            prog.progress(pct)
-            prog_text.caption(f"Running OCR… {done}/{total} pages")
-        with st.spinner("Extracting text with Tesseract…"):
-            extracted_text = extract_text_from_pdf(pdf_bytes, dpi=ocr_dpi, progress_cb=_cb)
-        prog.empty()
-        prog_text.empty()
-        with st.expander("Extracted text preview", expanded=False):
-            st.text_area("OCR Output", extracted_text[:preview_chars], height=260, label_visibility="collapsed")
-        col_a, col_b = st.columns([1, 2.2])
-        with col_a:
-            run = st.button("Classify document", use_container_width=True)
-        with col_b:
             st.markdown(
                 "<div class='glass fade-up' style='padding: 14px 16px;'>"
-                "<b>What happens next?</b><br/>"
-                "We tokenize the OCR text and run your LoRA-adapted TinyBERT classifier."
                 "</div>",
                 unsafe_allow_html=True,
             )
-        if run:
-            with st.spinner("Running model inference…"):
-                predictions = predict(extracted_text, top_k=top_k, max_length=MAX_LENGTH)
-            st.markdown("### Results")
-            _render_predictions(predictions)
 with tab_paste:
     st.markdown("<div class='glass fade-up'>", unsafe_allow_html=True)

 import io
 import re
 import html as _html
+import hashlib
+import time
 import streamlit as st
 import streamlit.components.v1 as components
 import torch
         "OCR quality (DPI)",
         min_value=150,
         max_value=400,
+        value=250,
         step=25,
         help="Higher DPI improves OCR but increases processing time.",
     )
     st.markdown("</div>", unsafe_allow_html=True)
     if uploaded_file:
+        # IMPORTANT: Streamlit reruns the script on every interaction. If we OCR inside this block,
+        # clicking "Classify" would re-trigger OCR and feel like it's stuck. So we store OCR output
+        # in session_state keyed by (file hash + DPI).
+        pdf_bytes = uploaded_file.getvalue()
+        file_hash = hashlib.sha256(pdf_bytes).hexdigest()[:16]
+        ocr_key = f"{file_hash}:{int(ocr_dpi)}"
+        if st.session_state.get("ocr_key") != ocr_key:
+            st.session_state["ocr_key"] = ocr_key
+            st.session_state["ocr_text"] = None
+            st.session_state["ocr_seconds"] = None
+        extracted_text = st.session_state.get("ocr_text")
+        col_run, col_hint = st.columns([1, 2.2])
+        with col_run:
+            run_ocr = st.button("Run OCR", use_container_width=True, key="run_ocr_btn")
+        with col_hint:
             st.markdown(
                 "<div class='glass fade-up' style='padding: 14px 16px;'>"
+                "<b>Tip</b><br/>"
+                "OCR is the slowest part. Run it once, then classify instantly. "
+                "Lower DPI = faster OCR."
                 "</div>",
                 unsafe_allow_html=True,
             )
+        if run_ocr or (extracted_text is None and st.session_state.get("auto_ocr_once") is None):
+            # Auto-run OCR once on first upload to keep UX smooth, but never re-run on button clicks.
+            st.session_state["auto_ocr_once"] = True
+            # `text=` was added to st.progress in later Streamlit versions; keep compatible with 1.27.0.
+            prog = st.progress(0)
+            prog_text = st.empty()
+            prog_text.caption("Running OCR…")
+            def _cb(done, total):
+                pct = int((done / total) * 100)
+                prog.progress(pct)
+                prog_text.caption(f"Running OCR… {done}/{total} pages")
+            t0 = time.time()
+            with st.spinner("Extracting text with Tesseract…"):
+                extracted_text = extract_text_from_pdf(pdf_bytes, dpi=ocr_dpi, progress_cb=_cb)
+            st.session_state["ocr_text"] = extracted_text
+            st.session_state["ocr_seconds"] = max(0.0, time.time() - t0)
+            prog.empty()
+            prog_text.empty()
+        extracted_text = st.session_state.get("ocr_text")
+        if extracted_text:
+            secs = st.session_state.get("ocr_seconds")
+            if secs is not None:
+                st.caption(f"OCR completed in {secs:.1f}s • DPI {int(ocr_dpi)}")
+            with st.expander("Extracted text preview", expanded=False):
+                st.text_area("OCR Output", extracted_text[:preview_chars], height=260, label_visibility="collapsed")
+            col_a, col_b = st.columns([1, 2.2])
+            with col_a:
+                run = st.button("Classify document", use_container_width=True, key="classify_doc_btn")
+            with col_b:
+                st.markdown(
+                    "<div class='glass fade-up' style='padding: 14px 16px;'>"
+                    "<b>What happens next?</b><br/>"
+                    "We tokenize the OCR text and run your LoRA-adapted TinyBERT classifier."
+                    "</div>",
+                    unsafe_allow_html=True,
+                )
+            if run:
+                with st.spinner("Running model inference…"):
+                    predictions = predict(extracted_text, top_k=top_k, max_length=MAX_LENGTH)
+                st.markdown("### Results")
+                _render_predictions(predictions)
+        else:
+            st.info("Click **Run OCR** to extract text, then you can classify the document.")
 with tab_paste:
     st.markdown("<div class='glass fade-up'>", unsafe_allow_html=True)