Spaces:

HoagMin
/

FoodDetection

Sleeping

File size: 3,069 Bytes

a16cceb
 
 
 
 
 
 
 
 
 
 
 
1415771
 
 
a16cceb
237d929
a16cceb
 
 
237d929
 
 
a16cceb
 
 
 
 
237d929
b02ec6d
a16cceb
 
 
237d929
 
 
 
 
 
 
 
1415771
237d929
a16cceb
 
 
 
 
237d929
a16cceb
 
 
 
237d929
a16cceb
 
237d929
a16cceb
 
 
 
 
 
 
 
 
 
 
 
 
237d929
 
1415771

import os
import re
from paddleocr import PaddleOCR
from typing import List
import logging

# Silence noisy library output: the "ppocr" and "paddle" loggers only emit
# records at ERROR level or above.
for _noisy_logger in ("ppocr", "paddle"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)

class OCRService:
    """Wrap a PaddleOCR engine and return cleaned text lines for one image.

    The public surface is ``OCRService()`` plus :meth:`extract_text`; the
    underscore-prefixed members are implementation details.
    """

    # Short currency markers that are kept verbatim even though they would
    # otherwise be discarded as too short / noise.
    _CURRENCY_TOKENS = frozenset({'k', 'đ', 'd', '$', 'vnd', 'xu'})

    # A line is considered meaningful when it contains at least one ASCII
    # letter, digit, or Vietnamese letter. IGNORECASE is required because the
    # class only lists lowercase accented letters; without it an all-uppercase
    # accented line such as "ĐÁ" would be thrown away as garbage.
    _MEANINGFUL_CHAR = re.compile(
        r'[a-zA-Z0-9àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]',
        re.IGNORECASE,
    )

    def __init__(self):
        """Create the PaddleOCR engine for Vietnamese text.

        Text-line orientation, document orientation classification and
        document unwarping are all switched off.
        """
        self.ocr = PaddleOCR(
            lang='vi',
            use_textline_orientation=False,
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
        )

    def extract_text(self, img_path: str) -> list[str]:
        """Run OCR on an image file and return the filtered text lines.

        Args:
            img_path: Path of the image to read.

        Returns:
            The recognized lines, stripped of surrounding whitespace and with
            noise removed (see :meth:`_clean_texts`). An empty list is
            returned when the file does not exist or when OCR fails; failures
            are reported on stdout/stderr instead of being raised.
        """
        if not os.path.exists(img_path):
            print(f"❌ Không tìm thấy file ảnh: {img_path}")
            return []

        try:
            results = self._run_ocr(img_path)
            raw_texts = self._collect_raw_texts(results)
            return self._clean_texts(raw_texts)
        except Exception as e:
            # Deliberate best-effort boundary: callers always get a list back.
            print(f"❌ Lỗi OCR Service: {e}")
            import traceback
            traceback.print_exc()
            return []

    def _run_ocr(self, img_path):
        """Invoke the engine through whichever entry point it provides."""
        # Engines exposing ``predict`` (the pipeline-style API that the
        # constructor flags above belong to) are called without ``cls``:
        # passing that legacy keyword there makes the call fail, which
        # previously caused every request to return an empty list.
        predict = getattr(self.ocr, "predict", None)
        if callable(predict):
            return predict(img_path)
        # Legacy engines only offer ``ocr``; keep the original call unchanged.
        return self.ocr.ocr(img_path, cls=True)

    @staticmethod
    def _collect_raw_texts(results) -> list[str]:
        """Pull the recognized strings out of the first result entry."""
        # The result for a single image is the first element of ``results``.
        if not results or results[0] is None:
            return []

        first = results[0]

        # Pipeline-style result: a dict-like object carrying the recognized
        # strings under "rec_texts".
        # NOTE(review): key name taken from the PaddleOCR 3.x docs — confirm
        # against the installed version.
        if isinstance(first, dict):
            return [str(text) for text in (first.get("rec_texts") or [])]

        # Legacy result: each line is [[four corner points], (text, confidence)],
        # so the text sits at line[1][0].
        return [str(line[1][0]) for line in first]

    @classmethod
    def _clean_texts(cls, raw_texts) -> list[str]:
        """Strip each line and drop the ones that are empty or pure noise."""
        clean_texts = []
        for raw in raw_texts:
            text = raw.strip()
            if not text:
                continue

            # Keep currency markers as they are.
            if text.lower() in cls._CURRENCY_TOKENS:
                clean_texts.append(text)
                continue

            # Skip single characters unless they are a digit.
            if len(text) < 2 and not re.search(r'\d', text):
                continue

            # Skip lines without any letter or digit (pure garbage).
            if not cls._MEANINGFUL_CHAR.search(text):
                continue

            clean_texts.append(text)

        return clean_texts

        

 #--- TEST CODE ---

#if __name__ == "__main__":
 #   service = OCRService()
  #  test_path = r"static/debug/bunbohue.png" 
  #  print(f" Đang đọc: {test_path}")
  #  texts = service.extract_text(test_path)
  #  
  #  print("-" * 30)
  #  print(f" KẾT QUẢ ({len(texts)} dòng):")
  #  for t in texts:
  #      print(f"  - {t}") ##