FocusFlow Assistant committed on
Commit ·
2c32e38
1
Parent(s): 4adfbc0
Add OCR fallback for scanned/image PDFs using pytesseract + pdf2image
Browse files- Dockerfile +3 -0
- app.py +12 -1
- backend/rag_engine.py +55 -10
- packages.txt +3 -0
- requirements.txt +3 -0
Dockerfile
CHANGED
|
@@ -6,6 +6,9 @@ WORKDIR /app
|
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
| 8 |
curl \
|
|
|
|
|
|
|
|
|
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
# Copy requirements and install Python dependencies
|
|
|
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
| 8 |
curl \
|
| 9 |
+
tesseract-ocr \
|
| 10 |
+
tesseract-ocr-eng \
|
| 11 |
+
poppler-utils \
|
| 12 |
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
|
| 14 |
# Copy requirements and install Python dependencies
|
app.py
CHANGED
|
@@ -1013,7 +1013,18 @@ if not st.session_state.focus_mode:
|
|
| 1013 |
time.sleep(1)
|
| 1014 |
st.rerun()
|
| 1015 |
else:
|
| 1016 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1017 |
except Exception as e:
|
| 1018 |
st.error(f"Error: {e}")
|
| 1019 |
|
|
|
|
| 1013 |
time.sleep(1)
|
| 1014 |
st.rerun()
|
| 1015 |
else:
|
| 1016 |
+
# Parse error for user-friendly message
|
| 1017 |
+
try:
|
| 1018 |
+
error_detail = resp.json().get("detail", resp.text)
|
| 1019 |
+
except Exception:
|
| 1020 |
+
error_detail = resp.text
|
| 1021 |
+
|
| 1022 |
+
if "OCR" in str(error_detail) or "scan" in str(error_detail).lower():
|
| 1023 |
+
st.error(f"📸 {error_detail}")
|
| 1024 |
+
elif "No readable text" in str(error_detail):
|
| 1025 |
+
st.error("📄 This PDF appears to be scanned/image-only. OCR could not extract text. Please try a clearer scan or a text-based PDF.")
|
| 1026 |
+
else:
|
| 1027 |
+
st.error(f"Upload failed: {error_detail}")
|
| 1028 |
except Exception as e:
|
| 1029 |
st.error(f"Error: {e}")
|
| 1030 |
|
backend/rag_engine.py
CHANGED
|
@@ -26,24 +26,70 @@ CACHE_DIR = "./chroma_db"
|
|
| 26 |
def ingest_document(file_path: str):
|
| 27 |
"""
|
| 28 |
Ingests a PDF document into the vector database.
|
|
|
|
| 29 |
"""
|
| 30 |
if not os.path.exists(file_path):
|
| 31 |
raise FileNotFoundError(f"File not found: {file_path}")
|
| 32 |
|
| 33 |
-
#
|
| 34 |
loader = PyPDFLoader(file_path)
|
| 35 |
docs = loader.load()
|
| 36 |
|
| 37 |
# Filter out pages with no real text content
|
| 38 |
docs = [d for d in docs if d.page_content.strip()]
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 48 |
splits = splitter.split_documents(docs)
|
| 49 |
|
|
@@ -53,8 +99,7 @@ def ingest_document(file_path: str):
|
|
| 53 |
"It may be a scanned/image-only document."
|
| 54 |
)
|
| 55 |
|
| 56 |
-
# Store in ChromaDB
|
| 57 |
-
# Note: Chroma will automatically persist to disk in newer versions when persist_directory is set
|
| 58 |
Chroma.from_documents(
|
| 59 |
documents=splits,
|
| 60 |
embedding=get_embeddings(),
|
|
|
|
| 26 |
def ingest_document(file_path: str):
|
| 27 |
"""
|
| 28 |
Ingests a PDF document into the vector database.
|
| 29 |
+
Falls back to OCR (pytesseract) if standard text extraction yields little/no text.
|
| 30 |
"""
|
| 31 |
if not os.path.exists(file_path):
|
| 32 |
raise FileNotFoundError(f"File not found: {file_path}")
|
| 33 |
|
| 34 |
+
# --- Step 1: Try standard text extraction ---
|
| 35 |
loader = PyPDFLoader(file_path)
|
| 36 |
docs = loader.load()
|
| 37 |
|
| 38 |
# Filter out pages with no real text content
|
| 39 |
docs = [d for d in docs if d.page_content.strip()]
|
| 40 |
|
| 41 |
+
# Check total extracted text length
|
| 42 |
+
total_text = "".join(d.page_content.strip() for d in docs)
|
| 43 |
+
|
| 44 |
+
# --- Step 2: OCR fallback if text is too short ---
|
| 45 |
+
if len(total_text) < 50:
|
| 46 |
+
logger.info(f"Standard extraction found only {len(total_text)} chars, attempting OCR fallback...")
|
| 47 |
+
try:
|
| 48 |
+
from pdf2image import convert_from_path
|
| 49 |
+
import pytesseract
|
| 50 |
+
|
| 51 |
+
# Convert PDF pages to images at 300 DPI
|
| 52 |
+
images = convert_from_path(file_path, dpi=300)
|
| 53 |
+
ocr_pages = []
|
| 54 |
+
|
| 55 |
+
for page_num, image in enumerate(images):
|
| 56 |
+
page_text = pytesseract.image_to_string(image)
|
| 57 |
+
if page_text.strip():
|
| 58 |
+
ocr_pages.append(Document(
|
| 59 |
+
page_content=page_text,
|
| 60 |
+
metadata={"source": file_path, "page": page_num}
|
| 61 |
+
))
|
| 62 |
+
|
| 63 |
+
if ocr_pages:
|
| 64 |
+
ocr_total = "".join(d.page_content.strip() for d in ocr_pages)
|
| 65 |
+
if len(ocr_total) < 50:
|
| 66 |
+
raise ValueError(
|
| 67 |
+
"Could not extract text even after OCR. "
|
| 68 |
+
"Please upload a clearer scan."
|
| 69 |
+
)
|
| 70 |
+
docs = ocr_pages
|
| 71 |
+
logger.info(f"OCR extracted {len(ocr_total)} chars from {len(ocr_pages)} pages")
|
| 72 |
+
else:
|
| 73 |
+
raise ValueError(
|
| 74 |
+
"Could not extract text even after OCR. "
|
| 75 |
+
"Please upload a clearer scan."
|
| 76 |
+
)
|
| 77 |
+
except ImportError:
|
| 78 |
+
logger.warning("pytesseract/pdf2image not installed, cannot OCR")
|
| 79 |
+
raise ValueError(
|
| 80 |
+
"No readable text found and OCR libraries are not available. "
|
| 81 |
+
"Please upload a text-based PDF."
|
| 82 |
+
)
|
| 83 |
+
except ValueError:
|
| 84 |
+
raise # Re-raise our own clear errors
|
| 85 |
+
except Exception as e:
|
| 86 |
+
logger.error(f"OCR fallback failed: {e}")
|
| 87 |
+
raise ValueError(
|
| 88 |
+
f"OCR processing failed: {str(e)}. "
|
| 89 |
+
"Please try a clearer scan or a text-based PDF."
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
# --- Step 3: Split text (unchanged) ---
|
| 93 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 94 |
splits = splitter.split_documents(docs)
|
| 95 |
|
|
|
|
| 99 |
"It may be a scanned/image-only document."
|
| 100 |
)
|
| 101 |
|
| 102 |
+
# --- Step 4: Store in ChromaDB (unchanged) ---
|
|
|
|
| 103 |
Chroma.from_documents(
|
| 104 |
documents=splits,
|
| 105 |
embedding=get_embeddings(),
|
packages.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tesseract-ocr
|
| 2 |
+
tesseract-ocr-eng
|
| 3 |
+
poppler-utils
|
requirements.txt
CHANGED
|
@@ -17,6 +17,9 @@ plotly>=5.18.0
|
|
| 17 |
beautifulsoup4>=4.12.0
|
| 18 |
youtube-transcript-api>=0.6.0
|
| 19 |
pypdf>=3.17.0
|
|
|
|
|
|
|
|
|
|
| 20 |
python-dotenv>=1.0.0
|
| 21 |
|
| 22 |
# Hugging Face dependencies for cloud deployment
|
|
|
|
| 17 |
beautifulsoup4>=4.12.0
|
| 18 |
youtube-transcript-api>=0.6.0
|
| 19 |
pypdf>=3.17.0
|
| 20 |
+
pytesseract>=0.3.10
|
| 21 |
+
pdf2image>=1.16.0
|
| 22 |
+
Pillow>=10.0.0
|
| 23 |
python-dotenv>=1.0.0
|
| 24 |
|
| 25 |
# Hugging Face dependencies for cloud deployment
|