Cassius1Morbant committed on
Commit
593231c
·
verified ·
1 Parent(s): da25573

Upload extract.py

Browse files
Files changed (1) hide show
  1. extract.py +84 -0
extract.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pymupdf # PyMuPDF
2
+ import easyocr
3
+ import io
4
+ import os
5
+
6
+ # Initialise EasyOCR once (French + English) at module level, outside the function, so the same reader is reused by every call.
7
+ # First run will download models (~400 MB); subsequent runs are fast.
8
+ reader = easyocr.Reader(['fr', 'en'], gpu=False) # Set gpu=True if you have CUDA
9
+
10
def extract_pdf_text_with_easyocr(
    file_path: str,
    zoom: float = 3.0,  # Render scale; PDF base resolution is 72 DPI, so 3.0 = 216 DPI
    min_text_length: int = 50,  # Skip pages whose OCR text is not longer than this (likely blank)
    save_to_file: "str | None" = None,  # Optional: path to save extracted text
) -> str:
    """Extract text from a PDF by rasterising each page and running EasyOCR on it.

    Every page is rendered to an RGB PNG at ``zoom`` times the PDF's native
    72 DPI and passed to the module-level ``reader``. OCR is applied to every
    page unconditionally; any embedded text layer is not consulted.

    Args:
        file_path: Path of the PDF to read.
        zoom: Scale factor applied on both axes when rendering a page.
        min_text_length: A page is kept only if its OCR text is strictly
            longer than this many characters. Pages whose OCR step failed
            are always kept so the error marker is visible in the output.
        save_to_file: If given, the extracted text is also written to this
            path as UTF-8. A failure to write is reported on stdout and does
            not abort the call.

    Returns:
        The kept pages concatenated, each introduced by a
        ``--- Page N ---`` header (1-based), with surrounding whitespace
        stripped. Empty string if no page was kept.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        RuntimeError: PyMuPDF could not open the file (original error chained).
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDF file not found: {file_path}")

    try:
        doc = pymupdf.open(file_path)
    except Exception as e:
        # Chain the cause so the underlying PyMuPDF traceback is preserved.
        raise RuntimeError(f"Failed to open PDF with PyMuPDF: {e}") from e

    sections = []
    # The render matrix does not depend on the page, so build it once.
    mat = pymupdf.Matrix(zoom, zoom)

    try:
        for page_num, page in enumerate(doc, start=1):
            # Render page to a high-resolution pixmap and encode it as PNG bytes.
            pix = page.get_pixmap(matrix=mat, colorspace=pymupdf.csRGB)
            img_bytes = pix.tobytes("png")

            # Perform OCR; a failure on one page must not abort the whole document.
            ocr_failed = False
            try:
                result = reader.readtext(
                    img_bytes,
                    detail=0,        # Return only text strings
                    paragraph=True,  # Group into paragraphs
                    width_ths=0.7,   # Adjust for better line grouping
                    height_ths=0.7,
                )
                page_text = "\n".join(
                    line.strip() for line in result if line.strip()
                )
            except Exception as ocr_error:
                ocr_failed = True
                page_text = f"[OCR Error on page {page_num}: {ocr_error}]"

            # Keep the page if it has meaningful text, or if OCR failed.
            # An explicit flag is used instead of searching the text for
            # "OCR Error", which a genuine short page could contain.
            if ocr_failed or len(page_text) > min_text_length:
                sections.append(f"--- Page {page_num} ---\n{page_text}\n\n")
    finally:
        # Always release the document, even if rendering raised.
        doc.close()

    full_text = "".join(sections)

    # Optional: save to file (best effort; errors are reported, not raised).
    if save_to_file:
        try:
            with open(save_to_file, "w", encoding="utf-8") as f:
                f.write(full_text)
            print(f"Extracted text saved to: {save_to_file}")
        except (OSError, UnicodeError) as e:
            print(f"Failed to save file: {e}")

    return full_text.strip()
70
+
71
+ # ———————— TEST USAGE ————————
72
if __name__ == "__main__":
    # Script entry point: OCR one PDF, save the text, and print a short preview.
    # Usage: python extract.py [PDF_PATH] [OUTPUT_TXT]
    import sys  # local import: only the script entry point needs it

    cli_args = sys.argv[1:]
    # Fall back to the original demo paths when no arguments are given.
    pdf_file = cli_args[0] if len(cli_args) > 0 else "Kbis.pdf"
    output_file = cli_args[1] if len(cli_args) > 1 else "kbis_extracted.txt"

    extracted_text = extract_pdf_text_with_easyocr(
        file_path=pdf_file,
        zoom=3.5,  # Slightly higher for very small text
        save_to_file=output_file,
    )

    # Print preview (first 1000 chars)
    print("\n=== EXTRACTED TEXT PREVIEW ===\n")
    print(extracted_text[:1000])
    # Only mention truncation when it actually happened (no stray blank line otherwise).
    if len(extracted_text) > 1000:
        print("\n... (truncated)")