FocusFlow Assistant committed on
Commit ·
2c32e38
1
Parent(s): 4adfbc0
Add OCR fallback for scanned/image PDFs using pytesseract + pdf2image
Browse files- Dockerfile +3 -0
- app.py +12 -1
- backend/rag_engine.py +55 -10
- packages.txt +3 -0
- requirements.txt +3 -0
Dockerfile
CHANGED
|
@@ -6,6 +6,9 @@ WORKDIR /app
|
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
| 8 |
curl \
|
|
|
|
|
|
|
|
|
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
# Copy requirements and install Python dependencies
|
|
|
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
| 8 |
curl \
|
| 9 |
+
tesseract-ocr \
|
| 10 |
+
tesseract-ocr-eng \
|
| 11 |
+
poppler-utils \
|
| 12 |
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
|
| 14 |
# Copy requirements and install Python dependencies
|
app.py
CHANGED
|
@@ -1013,7 +1013,18 @@ if not st.session_state.focus_mode:
|
|
| 1013 |
time.sleep(1)
|
| 1014 |
st.rerun()
|
| 1015 |
else:
|
| 1016 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1017 |
except Exception as e:
|
| 1018 |
st.error(f"Error: {e}")
|
| 1019 |
|
|
|
|
| 1013 |
time.sleep(1)
|
| 1014 |
st.rerun()
|
| 1015 |
else:
|
| 1016 |
+
# Parse error for user-friendly message
|
| 1017 |
+
try:
|
| 1018 |
+
error_detail = resp.json().get("detail", resp.text)
|
| 1019 |
+
except Exception:
|
| 1020 |
+
error_detail = resp.text
|
| 1021 |
+
|
| 1022 |
+
if "OCR" in str(error_detail) or "scan" in str(error_detail).lower():
|
| 1023 |
+
st.error(f"📸 {error_detail}")
|
| 1024 |
+
elif "No readable text" in str(error_detail):
|
| 1025 |
+
st.error("📄 This PDF appears to be scanned/image-only. OCR could not extract text. Please try a clearer scan or a text-based PDF.")
|
| 1026 |
+
else:
|
| 1027 |
+
st.error(f"Upload failed: {error_detail}")
|
| 1028 |
except Exception as e:
|
| 1029 |
st.error(f"Error: {e}")
|
| 1030 |
|
backend/rag_engine.py
CHANGED
|
@@ -26,24 +26,70 @@ CACHE_DIR = "./chroma_db"
|
|
| 26 |
def ingest_document(file_path: str):
|
| 27 |
"""
|
| 28 |
Ingests a PDF document into the vector database.
|
|
|
|
| 29 |
"""
|
| 30 |
if not os.path.exists(file_path):
|
| 31 |
raise FileNotFoundError(f"File not found: {file_path}")
|
| 32 |
|
| 33 |
-
#
|
| 34 |
loader = PyPDFLoader(file_path)
|
| 35 |
docs = loader.load()
|
| 36 |
|
| 37 |
# Filter out pages with no real text content
|
| 38 |
docs = [d for d in docs if d.page_content.strip()]
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 48 |
splits = splitter.split_documents(docs)
|
| 49 |
|
|
@@ -53,8 +99,7 @@ def ingest_document(file_path: str):
|
|
| 53 |
"It may be a scanned/image-only document."
|
| 54 |
)
|
| 55 |
|
| 56 |
-
# Store in ChromaDB
|
| 57 |
-
# Note: Chroma will automatically persist to disk in newer versions when persist_directory is set
|
| 58 |
Chroma.from_documents(
|
| 59 |
documents=splits,
|
| 60 |
embedding=get_embeddings(),
|
|
|
|
| 26 |
def ingest_document(file_path: str):
|
| 27 |
"""
|
| 28 |
Ingests a PDF document into the vector database.
|
| 29 |
+
Falls back to OCR (pytesseract) if standard text extraction yields little/no text.
|
| 30 |
"""
|
| 31 |
if not os.path.exists(file_path):
|
| 32 |
raise FileNotFoundError(f"File not found: {file_path}")
|
| 33 |
|
| 34 |
+
# --- Step 1: Try standard text extraction ---
|
| 35 |
loader = PyPDFLoader(file_path)
|
| 36 |
docs = loader.load()
|
| 37 |
|
| 38 |
# Filter out pages with no real text content
|
| 39 |
docs = [d for d in docs if d.page_content.strip()]
|
| 40 |
|
| 41 |
+
# Check total extracted text length
|
| 42 |
+
total_text = "".join(d.page_content.strip() for d in docs)
|
| 43 |
+
|
| 44 |
+
# --- Step 2: OCR fallback if text is too short ---
|
| 45 |
+
if len(total_text) < 50:
|
| 46 |
+
logger.info(f"Standard extraction found only {len(total_text)} chars, attempting OCR fallback...")
|
| 47 |
+
try:
|
| 48 |
+
from pdf2image import convert_from_path
|
| 49 |
+
import pytesseract
|
| 50 |
+
|
| 51 |
+
# Convert PDF pages to images at 300 DPI
|
| 52 |
+
images = convert_from_path(file_path, dpi=300)
|
| 53 |
+
ocr_pages = []
|
| 54 |
+
|
| 55 |
+
for page_num, image in enumerate(images):
|
| 56 |
+
page_text = pytesseract.image_to_string(image)
|
| 57 |
+
if page_text.strip():
|
| 58 |
+
ocr_pages.append(Document(
|
| 59 |
+
page_content=page_text,
|
| 60 |
+
metadata={"source": file_path, "page": page_num}
|
| 61 |
+
))
|
| 62 |
+
|
| 63 |
+
if ocr_pages:
|
| 64 |
+
ocr_total = "".join(d.page_content.strip() for d in ocr_pages)
|
| 65 |
+
if len(ocr_total) < 50:
|
| 66 |
+
raise ValueError(
|
| 67 |
+
"Could not extract text even after OCR. "
|
| 68 |
+
"Please upload a clearer scan."
|
| 69 |
+
)
|
| 70 |
+
docs = ocr_pages
|
| 71 |
+
logger.info(f"OCR extracted {len(ocr_total)} chars from {len(ocr_pages)} pages")
|
| 72 |
+
else:
|
| 73 |
+
raise ValueError(
|
| 74 |
+
"Could not extract text even after OCR. "
|
| 75 |
+
"Please upload a clearer scan."
|
| 76 |
+
)
|
| 77 |
+
except ImportError:
|
| 78 |
+
logger.warning("pytesseract/pdf2image not installed, cannot OCR")
|
| 79 |
+
raise ValueError(
|
| 80 |
+
"No readable text found and OCR libraries are not available. "
|
| 81 |
+
"Please upload a text-based PDF."
|
| 82 |
+
)
|
| 83 |
+
except ValueError:
|
| 84 |
+
raise # Re-raise our own clear errors
|
| 85 |
+
except Exception as e:
|
| 86 |
+
logger.error(f"OCR fallback failed: {e}")
|
| 87 |
+
raise ValueError(
|
| 88 |
+
f"OCR processing failed: {str(e)}. "
|
| 89 |
+
"Please try a clearer scan or a text-based PDF."
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
# --- Step 3: Split text (unchanged) ---
|
| 93 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 94 |
splits = splitter.split_documents(docs)
|
| 95 |
|
|
|
|
| 99 |
"It may be a scanned/image-only document."
|
| 100 |
)
|
| 101 |
|
| 102 |
+
# --- Step 4: Store in ChromaDB (unchanged) ---
|
|
|
|
| 103 |
Chroma.from_documents(
|
| 104 |
documents=splits,
|
| 105 |
embedding=get_embeddings(),
|
packages.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tesseract-ocr
|
| 2 |
+
tesseract-ocr-eng
|
| 3 |
+
poppler-utils
|
requirements.txt
CHANGED
|
@@ -17,6 +17,9 @@ plotly>=5.18.0
|
|
| 17 |
beautifulsoup4>=4.12.0
|
| 18 |
youtube-transcript-api>=0.6.0
|
| 19 |
pypdf>=3.17.0
|
|
|
|
|
|
|
|
|
|
| 20 |
python-dotenv>=1.0.0
|
| 21 |
|
| 22 |
# Hugging Face dependencies for cloud deployment
|
|
|
|
| 17 |
beautifulsoup4>=4.12.0
|
| 18 |
youtube-transcript-api>=0.6.0
|
| 19 |
pypdf>=3.17.0
|
| 20 |
+
pytesseract>=0.3.10
|
| 21 |
+
pdf2image>=1.16.0
|
| 22 |
+
Pillow>=10.0.0
|
| 23 |
python-dotenv>=1.0.0
|
| 24 |
|
| 25 |
# Hugging Face dependencies for cloud deployment
|