redhairedshanks1 committed on
Commit
7d5ea10
·
verified ·
1 Parent(s): 65398f9

Update services/extract_text.py

Browse files
Files changed (1) hide show
  1. services/extract_text.py +133 -121
services/extract_text.py CHANGED
@@ -1,122 +1,134 @@
1
- import os
2
- import logging
3
- import fitz # PyMuPDF
4
- import numpy as np
5
- from PIL import Image
6
- import tempfile
7
- import cv2
8
- import re
9
-
10
- # OCR
11
- from paddleocr import PaddleOCR
12
-
13
- # Mistral OCR (optional)
14
- try:
15
- from doctr.models import ocr_predictor
16
- from doctr.io import DocumentFile
17
- mistral_ocr = ocr_predictor(pretrained=True)
18
- use_mistral_ocr = True
19
- except ImportError:
20
- mistral_ocr = None
21
- use_mistral_ocr = False
22
-
23
- # Ensure OCR environment paths
24
- os.environ.setdefault("HOME", "/app")
25
- os.environ.setdefault("PADDLEOCR_HOME", "/app/.paddleocr")
26
-
27
- # Logging
28
- logging.basicConfig(level=logging.INFO)
29
- logger = logging.getLogger(__name__)
30
-
31
- # Load PaddleOCR
32
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
33
-
34
- def clean_text(text):
35
- return re.sub(r'\s+', ' ', text).strip()
36
-
37
- def auto_rotate_image(pil_img):
38
- img_cv = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
39
- coords = np.column_stack(np.where(img_cv > 0))
40
- angle = cv2.minAreaRect(coords)[-1]
41
- angle = -(90 + angle) if angle < -45 else -angle
42
- (h, w) = img_cv.shape[:2]
43
- M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
44
- rotated = cv2.warpAffine(img_cv, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
45
- return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
46
-
47
- def extract_images_with_fitz(pdf_path):
48
- doc = fitz.open(pdf_path)
49
- images = []
50
- for page in doc:
51
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
52
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
53
- images.append(img)
54
- doc.close()
55
- return images
56
-
57
- def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
58
- ext = os.path.splitext(filename or "")[-1].lower()
59
- result = []
60
-
61
- if ext == ".pdf":
62
- doc = fitz.open(file.name)
63
- images = extract_images_with_fitz(file.name)
64
- total_pages = len(doc)
65
- start = max(start_page or 1, 1)
66
- end = min(end_page or total_pages, total_pages)
67
-
68
- for i, page in enumerate(doc):
69
- page_num = i + 1
70
- if not (start <= page_num <= end):
71
- continue
72
-
73
- text = page.get_text()
74
- if text.strip():
75
- result.append(f"Page {page_num} (Extracted):\n{clean_text(text)}")
76
- else:
77
- if i < len(images):
78
- img = auto_rotate_image(images[i])
79
- img_np = np.array(img)
80
- try:
81
- ocr_result = ocr.ocr(img_np, cls=True)
82
- ocr_text = "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
83
- if not ocr_text and use_mistral_ocr:
84
- doc_img = DocumentFile.from_images(img)
85
- ocr_text = mistral_ocr(doc_img).render()
86
- except Exception as e:
87
- logger.warning(f"OCR error on page {page_num}: {e}")
88
- ocr_text = "[OCR Error]"
89
- result.append(f"Page {page_num} (OCR):\n{clean_text(ocr_text) or '[No OCR Text]'}")
90
- else:
91
- result.append(f"Page {page_num}: [No text or image]")
92
-
93
- doc.close()
94
- return "\n\n".join(result)
95
-
96
- elif ext == ".docx":
97
- from docx.api import Document
98
- doc = Document(file.name)
99
- paras = [p.text for p in doc.paragraphs if p.text.strip()]
100
- page_texts = []
101
- page_size = 500
102
- for i in range(0, len(paras), page_size):
103
- page_texts.append("\n".join(paras[i:i + page_size]))
104
-
105
- selected_pages = page_texts
106
- if start_page and end_page:
107
- selected_pages = page_texts[start_page - 1:end_page]
108
- return clean_text("\n\n".join(selected_pages))
109
-
110
- elif ext == ".csv":
111
- import pandas as pd
112
- return pd.read_csv(file.name).to_string(index=False)
113
-
114
- elif ext in [".xls", ".xlsx"]:
115
- import pandas as pd
116
- xl = pd.ExcelFile(file.name)
117
- return "\n\n".join([
118
- f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}"
119
- for s in xl.sheet_names
120
- ])
121
-
 
 
 
 
 
 
 
 
 
 
 
 
122
  return "Unsupported file type"
 
1
+ import os
2
+ import logging
3
+ import fitz # PyMuPDF
4
+ import numpy as np
5
+ from PIL import Image
6
+ import tempfile
7
+ import cv2
8
+ import re
9
+
10
+ # OCR
11
+ from paddleocr import PaddleOCR
12
+
13
# Logging (configured first so the optional-engine setup below can report problems)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Ensure OCR environment paths are set before any OCR engine is constructed.
# NOTE(review): paddleocr is imported above this point; if it resolves its
# cache directory at import time these defaults arrive too late - confirm.
os.environ.setdefault("HOME", "/app")
os.environ.setdefault("PADDLEOCR_HOME", "/app/.paddleocr")

# Optional fallback OCR engine. This is docTR (the `doctr` package), not a
# Mistral product; the `mistral_ocr` / `use_mistral_ocr` names are kept only
# because the rest of the module refers to them.
try:
    from doctr.models import ocr_predictor
    from doctr.io import DocumentFile
    mistral_ocr = ocr_predictor(pretrained=True)
    use_mistral_ocr = True
except ImportError:
    # docTR is not installed: run with PaddleOCR only.
    mistral_ocr = None
    use_mistral_ocr = False
except Exception as e:
    # Building the pretrained predictor can fail for reasons other than a
    # missing package (e.g. the weights cannot be fetched). The engine is
    # optional, so disable it instead of failing the whole module import.
    logger.warning(f"docTR fallback OCR disabled: {e}")
    mistral_ocr = None
    use_mistral_ocr = False

# Load PaddleOCR (primary OCR engine, with the angle classifier enabled)
ocr = PaddleOCR(use_angle_cls=True, lang='en')
33
+
34
def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
36
+
37
def auto_rotate_image(pil_img):
    """Deskew a page image.

    The skew is estimated from the minimum-area rectangle that encloses every
    non-black pixel of the grayscale page; the page is then rotated about its
    centre by the opposite angle.

    Args:
        pil_img: PIL image of the page. It is converted to RGB first, so
            RGBA or grayscale inputs are accepted as well.

    Returns:
        A new RGB ``PIL.Image`` with the same width and height, holding the
        rotated grayscale content (colour information is not preserved).
    """
    img_cv = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2GRAY)

    # NOTE(review): "> 0" selects every non-black pixel. On a white-background
    # scan that is almost the whole page, so the rectangle follows the page
    # border rather than the text - consider thresholding/inverting first.
    coords = np.column_stack(np.where(img_cv > 0))
    if coords.size == 0:
        # Completely black image: there is nothing to fit a rectangle to.
        return Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_GRAY2RGB))

    angle = cv2.minAreaRect(coords)[-1]
    # Fold the reported angle into a correction within [-45, 45]. Older
    # OpenCV releases report angles in [-90, 0), newer ones in (0, 90];
    # handling only the first convention would turn an upright page
    # (reported as 90) into a -90 degree rotation.
    if angle < -45:
        angle = -(90 + angle)
    elif angle > 45:
        angle = 90 - angle
    else:
        angle = -angle

    (h, w) = img_cv.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    # BORDER_REPLICATE avoids black wedges in the corners after rotation.
    rotated = cv2.warpAffine(
        img_cv, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
    )
    return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
46
+
47
def _render_page(page):
    """Rasterize one PyMuPDF page at 2x zoom and return it as a PIL image."""
    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
    mode = "RGBA" if pix.alpha else "RGB"
    return Image.frombytes(mode, (pix.width, pix.height), pix.samples)


def _ocr_page_image(img, page_num):
    """OCR one rendered page and return the recognized text.

    PaddleOCR is tried first; if it yields no text and the optional docTR
    engine is available, docTR is used as a fallback. Any failure is logged
    and reported as the literal ``"[OCR Error]"`` so one bad page does not
    abort the whole document.
    """
    try:
        if img.mode != "RGB":
            # auto_rotate_image and the OCR engines work on 3-channel input.
            img = img.convert("RGB")
        img_np = np.array(auto_rotate_image(img))

        ocr_result = ocr.ocr(img_np, cls=True)
        # The per-image entry can be None/empty when no text is detected.
        lines = ocr_result[0] if ocr_result and ocr_result[0] else []
        ocr_text = "\n".join(line[1][0] for line in lines)

        if not ocr_text and use_mistral_ocr:
            # The docTR predictor takes a list of page arrays directly.
            ocr_text = mistral_ocr([img_np]).render()
        return ocr_text
    except Exception as e:
        logger.warning(f"OCR error on page {page_num}: {e}")
        return "[OCR Error]"


def _extract_pdf_text(pdf_path, start_page, end_page):
    """Return the text of a PDF page range, using OCR for pages without a text layer."""
    result = []
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        start = max(start_page or 1, 1)
        end = min(end_page or total_pages, total_pages)

        for page_num in range(start, end + 1):
            page = doc[page_num - 1]
            text = page.get_text()
            if text.strip():
                result.append(f"Page {page_num} (Extracted):\n{clean_text(text)}")
                continue

            # No embedded text: render this page only now, then OCR it.
            try:
                img = _render_page(page)
            except Exception as e:
                logger.error(f"Error processing page {page_num}: {e}")
                result.append(f"Page {page_num}: [No text or image]")
                continue

            ocr_text = _ocr_page_image(img, page_num)
            result.append(f"Page {page_num} (OCR):\n{clean_text(ocr_text) or '[No OCR Text]'}")
    finally:
        doc.close()
    return "\n\n".join(result)


def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
    """Render a range of PDF pages to PIL images.

    Args:
        pdf_path: Path of the PDF file.
        start_page: 1-based first page to render; ``None`` or a value below 1
            means the first page.
        end_page: 1-based last page to render (inclusive); ``None`` or a value
            past the end means the last page.

    Returns:
        A list of ``(page_number, image)`` tuples with 1-based page numbers.
        Pages that fail to render are logged and skipped; if the file cannot
        be opened the error is logged and the list is empty.
    """
    images = []
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logger.error(f"Failed to open PDF file: {e}")
        return images

    try:
        total_pages = len(doc)
        first = max(start_page or 1, 1)
        last = min(end_page or total_pages, total_pages)
        for page_num in range(first, last + 1):
            try:
                images.append((page_num, _render_page(doc[page_num - 1])))
            except Exception as e:
                logger.error(f"Error processing page {page_num}: {e}")
    finally:
        doc.close()
    return images


def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
    """Extract plain text from an uploaded PDF, DOCX, CSV or Excel file.

    Args:
        file: Object whose ``name`` attribute is the path of the file on disk.
        start_page: Optional 1-based first page (PDF) or pseudo-page (DOCX).
        end_page: Optional 1-based last page, inclusive.
        filename: Original file name; only its extension is used, to pick the
            extraction strategy.

    Returns:
        The extracted text. PDF output is one ``Page N (...)`` section per
        page, with OCR used for pages that have no text layer. DOCX output is
        whitespace-normalized text of the selected pseudo-pages (500
        paragraphs each). CSV/Excel output is the tables rendered as text.
        Any other extension yields ``"Unsupported file type"``.
    """
    ext = os.path.splitext(filename or "")[-1].lower()

    if ext == ".pdf":
        return _extract_pdf_text(file.name, start_page, end_page)

    if ext == ".docx":
        from docx.api import Document
        doc = Document(file.name)
        paras = [p.text for p in doc.paragraphs if p.text.strip()]

        # DOCX has no fixed pages; treat every 500 paragraphs as one "page".
        page_size = 500
        page_texts = [
            "\n".join(paras[i:i + page_size])
            for i in range(0, len(paras), page_size)
        ]

        # Same defaulting as the PDF branch, so a lone start_page or
        # end_page is honoured instead of being silently ignored.
        first = max(start_page or 1, 1)
        selected_pages = page_texts[first - 1:end_page or None]
        return clean_text("\n\n".join(selected_pages))

    if ext == ".csv":
        import pandas as pd
        return pd.read_csv(file.name).to_string(index=False)

    if ext in (".xls", ".xlsx"):
        import pandas as pd
        # Context manager releases the workbook handle even if parsing fails.
        with pd.ExcelFile(file.name) as xl:
            return "\n\n".join(
                f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}"
                for s in xl.sheet_names
            )

    return "Unsupported file type"