FocusFlow Assistant committed on
Commit
2c32e38
·
1 Parent(s): 4adfbc0

Add OCR fallback for scanned/image PDFs using pytesseract + pdf2image

Browse files
Files changed (5) hide show
  1. Dockerfile +3 -0
  2. app.py +12 -1
  3. backend/rag_engine.py +55 -10
  4. packages.txt +3 -0
  5. requirements.txt +3 -0
Dockerfile CHANGED
@@ -6,6 +6,9 @@ WORKDIR /app
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  curl \
 
 
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
  # Copy requirements and install Python dependencies
 
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  curl \
9
+ tesseract-ocr \
10
+ tesseract-ocr-eng \
11
+ poppler-utils \
12
  && rm -rf /var/lib/apt/lists/*
13
 
14
  # Copy requirements and install Python dependencies
app.py CHANGED
@@ -1013,7 +1013,18 @@ if not st.session_state.focus_mode:
1013
  time.sleep(1)
1014
  st.rerun()
1015
  else:
1016
- st.error(f"Upload failed: {resp.text}")
 
 
 
 
 
 
 
 
 
 
 
1017
  except Exception as e:
1018
  st.error(f"Error: {e}")
1019
 
 
1013
  time.sleep(1)
1014
  st.rerun()
1015
  else:
1016
+ # Parse error for user-friendly message
1017
+ try:
1018
+ error_detail = resp.json().get("detail", resp.text)
1019
+ except Exception:
1020
+ error_detail = resp.text
1021
+
1022
+ if "OCR" in str(error_detail) or "scan" in str(error_detail).lower():
1023
+ st.error(f"📸 {error_detail}")
1024
+ elif "No readable text" in str(error_detail):
1025
+ st.error("📄 This PDF appears to be scanned/image-only. OCR could not extract text. Please try a clearer scan or a text-based PDF.")
1026
+ else:
1027
+ st.error(f"Upload failed: {error_detail}")
1028
  except Exception as e:
1029
  st.error(f"Error: {e}")
1030
 
backend/rag_engine.py CHANGED
@@ -26,24 +26,70 @@ CACHE_DIR = "./chroma_db"
26
  def ingest_document(file_path: str):
27
  """
28
  Ingests a PDF document into the vector database.
 
29
  """
30
  if not os.path.exists(file_path):
31
  raise FileNotFoundError(f"File not found: {file_path}")
32
 
33
- # Load PDF
34
  loader = PyPDFLoader(file_path)
35
  docs = loader.load()
36
 
37
  # Filter out pages with no real text content
38
  docs = [d for d in docs if d.page_content.strip()]
39
 
40
- if not docs:
41
- raise ValueError(
42
- "No readable text found in this PDF. "
43
- "It may be a scanned/image-only document."
44
- )
45
-
46
- # Split text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
48
  splits = splitter.split_documents(docs)
49
 
@@ -53,8 +99,7 @@ def ingest_document(file_path: str):
53
  "It may be a scanned/image-only document."
54
  )
55
 
56
- # Store in ChromaDB
57
- # Note: Chroma will automatically persist to disk in newer versions when persist_directory is set
58
  Chroma.from_documents(
59
  documents=splits,
60
  embedding=get_embeddings(),
 
26
  def ingest_document(file_path: str):
27
  """
28
  Ingests a PDF document into the vector database.
29
+ Falls back to OCR (pytesseract) if standard text extraction yields little/no text.
30
  """
31
  if not os.path.exists(file_path):
32
  raise FileNotFoundError(f"File not found: {file_path}")
33
 
34
+ # --- Step 1: Try standard text extraction ---
35
  loader = PyPDFLoader(file_path)
36
  docs = loader.load()
37
 
38
  # Filter out pages with no real text content
39
  docs = [d for d in docs if d.page_content.strip()]
40
 
41
+ # Check total extracted text length
42
+ total_text = "".join(d.page_content.strip() for d in docs)
43
+
44
+ # --- Step 2: OCR fallback if text is too short ---
45
+ if len(total_text) < 50:
46
+ logger.info(f"Standard extraction found only {len(total_text)} chars, attempting OCR fallback...")
47
+ try:
48
+ from pdf2image import convert_from_path
49
+ import pytesseract
50
+
51
+ # Convert PDF pages to images at 300 DPI
52
+ images = convert_from_path(file_path, dpi=300)
53
+ ocr_pages = []
54
+
55
+ for page_num, image in enumerate(images):
56
+ page_text = pytesseract.image_to_string(image)
57
+ if page_text.strip():
58
+ ocr_pages.append(Document(
59
+ page_content=page_text,
60
+ metadata={"source": file_path, "page": page_num}
61
+ ))
62
+
63
+ if ocr_pages:
64
+ ocr_total = "".join(d.page_content.strip() for d in ocr_pages)
65
+ if len(ocr_total) < 50:
66
+ raise ValueError(
67
+ "Could not extract text even after OCR. "
68
+ "Please upload a clearer scan."
69
+ )
70
+ docs = ocr_pages
71
+ logger.info(f"OCR extracted {len(ocr_total)} chars from {len(ocr_pages)} pages")
72
+ else:
73
+ raise ValueError(
74
+ "Could not extract text even after OCR. "
75
+ "Please upload a clearer scan."
76
+ )
77
+ except ImportError:
78
+ logger.warning("pytesseract/pdf2image not installed, cannot OCR")
79
+ raise ValueError(
80
+ "No readable text found and OCR libraries are not available. "
81
+ "Please upload a text-based PDF."
82
+ )
83
+ except ValueError:
84
+ raise # Re-raise our own clear errors
85
+ except Exception as e:
86
+ logger.error(f"OCR fallback failed: {e}")
87
+ raise ValueError(
88
+ f"OCR processing failed: {str(e)}. "
89
+ "Please try a clearer scan or a text-based PDF."
90
+ )
91
+
92
+ # --- Step 3: Split text (unchanged) ---
93
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
94
  splits = splitter.split_documents(docs)
95
 
 
99
  "It may be a scanned/image-only document."
100
  )
101
 
102
+ # --- Step 4: Store in ChromaDB (unchanged) ---
 
103
  Chroma.from_documents(
104
  documents=splits,
105
  embedding=get_embeddings(),
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ tesseract-ocr
2
+ tesseract-ocr-eng
3
+ poppler-utils
requirements.txt CHANGED
@@ -17,6 +17,9 @@ plotly>=5.18.0
17
  beautifulsoup4>=4.12.0
18
  youtube-transcript-api>=0.6.0
19
  pypdf>=3.17.0
 
 
 
20
  python-dotenv>=1.0.0
21
 
22
  # Hugging Face dependencies for cloud deployment
 
17
  beautifulsoup4>=4.12.0
18
  youtube-transcript-api>=0.6.0
19
  pypdf>=3.17.0
20
+ pytesseract>=0.3.10
21
+ pdf2image>=1.16.0
22
+ Pillow>=10.0.0
23
  python-dotenv>=1.0.0
24
 
25
  # Hugging Face dependencies for cloud deployment