Do committed on
Commit
fe05a55
·
verified ·
1 Parent(s): 2c97904

Upload Project_Text_Detection_OCR.py

Browse files
Files changed (1) hide show
  1. src/Project_Text_Detection_OCR.py +209 -0
src/Project_Text_Detection_OCR.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import platform
import shutil

import cv2
import numpy as np
import pytesseract
import streamlit as st
from langdetect import detect
from pdf2image import convert_from_bytes
from PIL import Image

# ===============================
# ⚙️ Tesseract OCR / Poppler configuration (selected per OS)
# ===============================
# Merge-conflict resolution: the OS-aware branch is kept; the older branch
# unconditionally pointed Tesseract at a Windows path, which cannot work on
# Linux / Streamlit Cloud.
#
# Either location can be overridden without editing this file by setting the
# TESSERACT_CMD / POPPLER_PATH environment variables.
if platform.system() == "Windows":
    # Local Windows machine: default install locations.
    _default_tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    _default_poppler_path = r"C:\poppler-23.12.0\bin"
else:
    # Linux / Streamlit Cloud: prefer the binary found on PATH and fall back
    # to the usual package-manager location.
    _default_tesseract_cmd = shutil.which("tesseract") or "/usr/bin/tesseract"
    # None makes pdf2image resolve the Poppler tools from PATH.
    _default_poppler_path = None

# `or` (rather than a .get() default) so that an empty variable is ignored.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD") or _default_tesseract_cmd
)
poppler_path = os.environ.get("POPPLER_PATH") or _default_poppler_path

# Tesseract flags: --oem 3 = default engine mode,
# --psm 6 = treat the image as a single uniform block of text.
custom_config = r'--oem 3 --psm 6'

# ===============================
# 🖼️ Streamlit page layout
# ===============================
# Merge-conflict resolution: the conflicting branches each contributed their
# own set_page_config/title/markdown calls. Only one set is kept because
# st.set_page_config may be called a single time per script run, and the
# duplicated st.markdown(""" opener left the string literal unterminated.
st.set_page_config(page_title="Smart OCR Premium", page_icon="🧠", layout="wide")
st.title("🧠 Smart OCR Pro - Vietnamese + English Text Recognition")
st.markdown("""
Ứng dụng OCR dùng **Tesseract + Streamlit**
Hỗ trợ ảnh, PDF, song ngữ (🇻🇳 + 🇺🇸), xuất văn bản và bounding box.
""")

# Single upload widget; the value is None until the user picks a file.
uploaded_file = st.file_uploader("📤 Tải ảnh hoặc PDF", type=["png", "jpg", "jpeg", "pdf"])

# st.cache_data (not st.cache_resource): this function computes data from its
# arguments; it does not create a shared resource such as a model or a
# connection, and cache_data hands each caller its own copy of the result.
@st.cache_data
def ocr_process(image_np, lang):
    """Run Tesseract OCR on an image and draw the recognised character boxes.

    The image is converted to grayscale, smoothed with a bilateral filter and
    binarised with an adaptive Gaussian threshold before being passed to
    Tesseract (using the module-level ``custom_config``).

    Args:
        image_np: Colour image array; it is passed to
            ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``, so BGR channel order
            is expected.
        lang: Tesseract language string, e.g. ``"vie"``, ``"eng"`` or
            ``"vie+eng"``.

    Returns:
        A ``(annotated_image, text)`` tuple: a copy of ``image_np`` with a
        green rectangle and label drawn for every character box, and the
        recognised text.
    """
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    gray = cv2.bilateralFilter(gray, 9, 75, 75)
    gray = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2
    )

    img_height = gray.shape[0]
    boxes = pytesseract.image_to_boxes(gray, config=custom_config, lang=lang)
    img_copy = image_np.copy()

    for line in boxes.splitlines():
        fields = line.split(' ')
        if len(fields) < 5:
            continue
        char = fields[0]
        # Box lines are "<char> <left> <bottom> <right> <top> ..." with the
        # origin at the bottom-left corner; OpenCV's origin is the top-left,
        # hence the `img_height - value` flips below.
        left, bottom, right, top = (int(value) for value in fields[1:5])
        cv2.rectangle(
            img_copy,
            (left, img_height - bottom),
            (right, img_height - top),
            (0, 255, 0),
            2,
        )
        # NOTE: OpenCV's Hershey fonts cannot render non-ASCII glyphs, so
        # accented Vietnamese characters are drawn as '?'.
        cv2.putText(
            img_copy,
            char,
            (left, img_height - bottom + 25),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.8,
            (0, 255, 0),
            2,
        )

    text = pytesseract.image_to_string(gray, config=custom_config, lang=lang)
    return img_copy, text


def _detect_ocr_lang(img_bgr):
    """Pick the Tesseract language for an image from a quick bilingual OCR pass.

    Returns ``"vie"`` when langdetect classifies the sampled text as
    Vietnamese, ``"eng"`` for any other detected language, and ``"vie+eng"``
    when detection fails.
    """
    sample_text = pytesseract.image_to_string(img_bgr, config=custom_config, lang="vie+eng")
    try:
        return "vie" if detect(sample_text) == "vi" else "eng"
    except Exception:
        # Detection can fail (e.g. on empty text); fall back to both
        # languages instead of aborting. `Exception` rather than a bare
        # `except:` so KeyboardInterrupt/SystemExit still propagate.
        return "vie+eng"


# ===============================
# 🔍 Handle the uploaded file (PDF or image)
# ===============================
if uploaded_file is not None:
    st.subheader("📄 Kết quả OCR:")

    if uploaded_file.name.lower().endswith(".pdf"):
        # Uses the module-level, OS-aware `poppler_path`; re-hard-coding a
        # Windows path here would break PDF conversion on Linux.
        pages = convert_from_bytes(uploaded_file.read(), poppler_path=poppler_path)

        page_texts = []
        for i, page in enumerate(pages):
            st.write(f"### Trang {i+1}")
            img_np = np.array(page)
            img_bgr = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)

            lang = _detect_ocr_lang(img_bgr)
            result_img, page_text = ocr_process(img_bgr, lang)
            st.image(cv2.cvtColor(result_img, cv2.COLOR_BGR2RGB), caption=f"OCR Trang {i+1}", use_container_width=True)
            st.text_area(f"📘 Văn bản Trang {i+1}", page_text, height=200)
            page_texts.append(page_text)

        # Export every page, not only the last one; this also keeps
        # `result_text` defined when the PDF yields no pages.
        result_text = "\n\n".join(page_texts)
    else:
        # Normalise to 3-channel RGB so grayscale and palette images do not
        # make cv2.cvtColor fail on an unexpected channel count.
        image = Image.open(uploaded_file).convert("RGB")
        img_np = np.array(image)
        img_bgr = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)

        lang = _detect_ocr_lang(img_bgr)
        result_img, result_text = ocr_process(img_bgr, lang)
        st.image(cv2.cvtColor(result_img, cv2.COLOR_BGR2RGB), caption="Ảnh OCR", use_container_width=True)
        st.text_area("📘 Văn bản nhận dạng được", result_text, height=250)

    st.download_button("📥 Tải kết quả OCR (.txt)",
                       data=result_text,
                       file_name="ocr_output.txt",
                       mime="text/plain")
else:
    st.info("⬆️ Tải lên ảnh hoặc PDF để bắt đầu nhận dạng.")