pluto90 committed on
Commit
0f97f06
·
verified ·
1 Parent(s): 248fcbb

Update app/core/pdf_processor.py

Browse files
Files changed (1) hide show
  1. app/core/pdf_processor.py +120 -52
app/core/pdf_processor.py CHANGED
@@ -1,52 +1,120 @@
1
- # pdf_preprocessor.py
2
-
3
- import os
4
- from pypdf import PdfReader
5
- from pdf2image import convert_from_path
6
- import pytesseract
7
-
8
- # Optional: Set Tesseract path manually on Windows
9
- # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
10
-
11
- def extract_text_from_pdf(file_path: str) -> str:
12
- """
13
- Extract text from both text-based and image-based PDFs.
14
- Falls back to OCR using pytesseract if no embedded text is found.
15
- """
16
- text_output = []
17
- reader = PdfReader(file_path)
18
- total_pages = len(reader.pages)
19
-
20
- print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")
21
-
22
- for page_num, page in enumerate(reader.pages, start=1):
23
- try:
24
- # Try normal text extraction
25
- extracted_text = page.extract_text()
26
- if extracted_text and extracted_text.strip():
27
- text_output.append(extracted_text)
28
- print(f"✅ Page {page_num}: Extracted embedded text.")
29
- else:
30
- # Run OCR if no text found
31
- print(f"🔍 Page {page_num}: No text found, running OCR...")
32
- images = convert_from_path(
33
- file_path, first_page=page_num, last_page=page_num
34
- )
35
- ocr_text = ""
36
- for img in images:
37
- ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
38
- if ocr_text.strip():
39
- text_output.append(ocr_text)
40
- print(f"🧠 Page {page_num}: OCR extraction complete.")
41
- else:
42
- print(f"⚠️ Page {page_num}: OCR found no readable text.")
43
- except Exception as e:
44
- print(f"❌ Error processing page {page_num}: {e}")
45
-
46
- full_text = "\n".join(text_output)
47
- if not full_text.strip():
48
- print("⚠️ Warning: No text extracted from this PDF at all.")
49
- else:
50
- print(f"✅ Done! Extracted {len(full_text.split())} words total.")
51
-
52
- return full_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # pdf_preprocessor.py
2
+
3
+ # import os
4
+ # from pypdf import PdfReader
5
+ # from pdf2image import convert_from_path
6
+ # import pytesseract
7
+
8
+ # # Optional: Set Tesseract path manually on Windows
9
+ # # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
10
+
11
+ # def extract_text_from_pdf(file_path: str) -> str:
12
+ # """
13
+ # Extract text from both text-based and image-based PDFs.
14
+ # Falls back to OCR using pytesseract if no embedded text is found.
15
+ # """
16
+ # text_output = []
17
+ # reader = PdfReader(file_path)
18
+ # total_pages = len(reader.pages)
19
+
20
+ # print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")
21
+
22
+ # for page_num, page in enumerate(reader.pages, start=1):
23
+ # try:
24
+ # # Try normal text extraction
25
+ # extracted_text = page.extract_text()
26
+ # if extracted_text and extracted_text.strip():
27
+ # text_output.append(extracted_text)
28
+ # print(f"✅ Page {page_num}: Extracted embedded text.")
29
+ # else:
30
+ # # Run OCR if no text found
31
+ # print(f"🔍 Page {page_num}: No text found, running OCR...")
32
+ # images = convert_from_path(
33
+ # file_path, first_page=page_num, last_page=page_num
34
+ # )
35
+ # ocr_text = ""
36
+ # for img in images:
37
+ # ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
38
+ # if ocr_text.strip():
39
+ # text_output.append(ocr_text)
40
+ # print(f"🧠 Page {page_num}: OCR extraction complete.")
41
+ # else:
42
+ # print(f"⚠️ Page {page_num}: OCR found no readable text.")
43
+ # except Exception as e:
44
+ # print(f"❌ Error processing page {page_num}: {e}")
45
+
46
+ # full_text = "\n".join(text_output)
47
+ # if not full_text.strip():
48
+ # print("⚠️ Warning: No text extracted from this PDF at all.")
49
+ # else:
50
+ # print(f"✅ Done! Extracted {len(full_text.split())} words total.")
51
+
52
+ # return full_text
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+ # pdf_preprocessor.py
70
+
71
+ import os
72
+ from pypdf import PdfReader
73
+ from pdf2image import convert_from_path
74
+ import pytesseract
75
+
76
+ # Optional: Set Tesseract path manually on Windows
77
+ # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
78
+
79
+ def extract_text_from_pdf(file_path: str) -> str:
80
+ """
81
+ Extract text from both text-based and image-based PDFs.
82
+ Falls back to OCR using pytesseract if no embedded text is found.
83
+ """
84
+ text_output = []
85
+ reader = PdfReader(file_path)
86
+ total_pages = len(reader.pages)
87
+
88
+ print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")
89
+
90
+ for page_num, page in enumerate(reader.pages, start=1):
91
+ try:
92
+ # Try normal text extraction
93
+ extracted_text = page.extract_text()
94
+ if extracted_text and extracted_text.strip():
95
+ text_output.append(extracted_text)
96
+ print(f"✅ Page {page_num}: Extracted embedded text.")
97
+ else:
98
+ # Run OCR if no text found
99
+ print(f"🔍 Page {page_num}: No text found, running OCR...")
100
+ images = convert_from_path(
101
+ file_path, first_page=page_num, last_page=page_num
102
+ )
103
+ ocr_text = ""
104
+ for img in images:
105
+ ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
106
+ if ocr_text.strip():
107
+ text_output.append(ocr_text)
108
+ print(f"🧠 Page {page_num}: OCR extraction complete.")
109
+ else:
110
+ print(f"⚠️ Page {page_num}: OCR found no readable text.")
111
+ except Exception as e:
112
+ print(f"❌ Error processing page {page_num}: {e}")
113
+
114
+ full_text = "\n".join(text_output)
115
+ if not full_text.strip():
116
+ print("⚠️ Warning: No text extracted from this PDF at all.")
117
+ else:
118
+ print(f"✅ Done! Extracted {len(full_text.split())} words total.")
119
+
120
+ return full_text