Spaces:

HoagMin
/

FoodDetection

Sleeping

App Files Files Community

HoagMin committed on Mar 12

Commit

237d929

verified ·

1 Parent(s): b02ec6d

Update services/ocr_service.py

Browse files

Files changed (1) hide show

services/ocr_service.py +19 -12

services/ocr_service.py CHANGED Viewed

@@ -14,42 +14,47 @@ class OCRService:
         self.ocr = PaddleOCR(lang='vi', use_textline_orientation=False, use_doc_orientation_classify=False,
     use_doc_unwarping=False)
-    def extract_text(self, img_path: str) -> List[str]:
         """
         Trích xuất văn bản từ ảnh và lọc rác.
         """
         if not os.path.exists(img_path):
             print(f"❌ Không tìm thấy file ảnh: {img_path}")
             return []
         try:
             results = self.ocr.ocr(img_path, cls=True)
             raw_texts = []
-            for res in results:
-                if hasattr(res, 'rec_texts'):
-                    if res.rec_texts:
-                        for text in res.rec_texts:
-                            if text:
-                                raw_texts.append(str(text))
-                elif isinstance(res, dict) and 'rec_texts' in res:
-                    raw_texts.extend(res['rec_texts'])
             clean_texts = []
             for text in raw_texts:
                 text = text.strip()
                 if not text: continue
                 if text.lower() in ['k', 'đ', 'd', '$', 'vnd', 'xu']:
                     clean_texts.append(text)
                     continue
                 if len(text) < 2 and not re.search(r'\d', text):
                     continue
                 if not re.search(r'[a-zA-Z0-9àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', text):
                     continue
@@ -63,6 +68,8 @@ class OCRService:
             traceback.print_exc()
             return []
  #--- TEST CODE ---
 #if __name__ == "__main__":

         self.ocr = PaddleOCR(lang='vi', use_textline_orientation=False, use_doc_orientation_classify=False,
     use_doc_unwarping=False)
+    def extract_text(self, img_path: str) -> list[str]: # Đã sửa List thành list hoặc giữ nguyên List của bạn
         """
         Trích xuất văn bản từ ảnh và lọc rác.
         """
+        import os
+        import re
         if not os.path.exists(img_path):
             print(f"❌ Không tìm thấy file ảnh: {img_path}")
             return []
         try:
+            # 1. Chạy OCR
             results = self.ocr.ocr(img_path, cls=True)
             raw_texts = []
+            # 2. Bóc tách dữ liệu từ mảng lồng nhau của PaddleOCR
+            # Kết quả cho 1 ảnh thường nằm ở results[0]
+            if results and results[0] is not None:
+                for line in results[0]:
+                    # Cấu trúc mỗi line: [[tọa_độ_4_góc], ('Văn bản', độ_tự_tin)]
+                    # Phần chữ nằm ở phần tử thứ 2 của tuple -> line[1][0]
+                    text = line[1][0]
+                    raw_texts.append(str(text))
+            # 3. Dọn dẹp văn bản (Phần logic cực kỳ tốt của bạn giữ nguyên)
             clean_texts = []
             for text in raw_texts:
                 text = text.strip()
                 if not text: continue
+                # Giữ lại các ký hiệu tiền tệ
                 if text.lower() in ['k', 'đ', 'd', '$', 'vnd', 'xu']:
                     clean_texts.append(text)
                     continue
+                # Bỏ qua chuỗi quá ngắn không có số
                 if len(text) < 2 and not re.search(r'\d', text):
                     continue
+                # Bỏ qua chuỗi không có ký tự alphabet/số nào (rác hoàn toàn)
                 if not re.search(r'[a-zA-Z0-9àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', text):
                     continue
             traceback.print_exc()
             return []
  #--- TEST CODE ---
 #if __name__ == "__main__":