Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Oct 29, 2025

Commit

4beae10

verified ·

1 Parent(s): 5c720e5

Update services/image_service.py

Browse files

Files changed (1) hide show

services/image_service.py +283 -20

services/image_service.py CHANGED Viewed

@@ -1,33 +1,296 @@
 from groq import Groq
 from config.settings import settings
 class ImageService:
     def __init__(self, groq_client: Groq):
         self.client = groq_client
-    def analyze_image_with_description(self, image, user_description: str) -> str:
-        """Phân tích hình ảnh kết hợp với mô tả từ người dùng"""
         if image is None:
-            return "No image uploaded."
         try:
-            if user_description:
-                prompt = f"""Người dùng tải lên một hình ảnh và mô tả: "{user_description}"
-Dựa trên mô tả này, hãy phân tích chi tiết bằng tiếng Việt:
-1. Mô tả chi tiết những gì có trong hình ảnh
-2. Phân tích các yếu tố liên quan đến mô tả của người dùng
-3. Đưa ra nhận xét và thông tin hữu ích"""
             else:
-                prompt = """Hãy mô tả chi tiết bằng tiếng Việt những gì bạn nghĩ có thể có trong hình ảnh này.
-                Mô tả các đối tượng, màu sắc, bố cục và ngữ cảnh có thể có của hình ảnh."""
-            chat_completion = self.client.chat.completions.create(
-                messages=[{"role": "user", "content": prompt}],
-                model=settings.LLM_MODEL,
             )
-            description = chat_completion.choices[0].message.content
         except Exception as e:
-            description = f"Error in image analysis: {str(e)}"
-        return description

+import cv2
+import numpy as np
+from PIL import Image
+import io
+import base64
+from typing import List, Dict, Any, Optional
+import torch
 from groq import Groq
 from config.settings import settings
 class ImageService:
     def __init__(self, groq_client: Groq):
         """Keep the Groq client and load the OCR engines straight away.

         Args:
             groq_client: Client object later used for chat-completion calls.
         """
         # Both OCR handles start out unset; _initialize_ocr_models() replaces
         # them for whichever engines load.
         self.easy_ocr_reader = None
         self.ocr_processor = None
         self.client = groq_client
         self._initialize_ocr_models()
+    def _initialize_ocr_models(self):
+        """Khởi tạo các model OCR"""
+        try:
+            print("🔄 Đang khởi tạo OCR models...")
+            # Khởi tạo EasyOCR cho đa ngôn ngữ
+            try:
+                import easyocr
+                self.easy_ocr_reader = easyocr.Reader(
+                    settings.EASYOCR_LANGUAGES,
+                    gpu=torch.cuda.is_available()
+                )
+                print("✅ EasyOCR initialized successfully")
+            except ImportError:
+                print("❌ EasyOCR not installed, installing...")
+                import subprocess
+                subprocess.run(["pip", "install", "easyocr"])
+                import easyocr
+                self.easy_ocr_reader = easyocr.Reader(settings.EASYOCR_LANGUAGES)
+                print("✅ EasyOCR installed and initialized")
+            # Khởi tạo MangaOCR cho tiếng Việt và chữ in
+            try:
+                from manga_ocr import MangaOcr
+                self.ocr_processor = MangaOcr()
+                print("✅ MangaOCR initialized successfully")
+            except ImportError:
+                print("⚠️ MangaOCR not available, using EasyOCR only")
+        except Exception as e:
+            print(f"❌ Lỗi khởi tạo OCR: {e}")
+    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
+        """Tiền xử lý ảnh để cải thiện OCR accuracy"""
+        try:
+            # Chuyển sang grayscale nếu là ảnh màu
+            if len(image.shape) == 3:
+                gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+            else:
+                gray = image
+            # Áp dụng filters để cải thiện chất lượng
+            # Noise reduction
+            denoised = cv2.medianBlur(gray, 3)
+            # Contrast enhancement
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+            enhanced = clahe.apply(denoised)
+            # Thresholding
+            _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+            return binary
+        except Exception as e:
+            print(f"⚠️ Lỗi tiền xử lý ảnh: {e}")
+            return image
+    def extract_text_easyocr(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """Trích xuất text sử dụng EasyOCR"""
+        try:
+            # Tiền xử lý ảnh
+            processed_image = self.preprocess_image(image)
+            # Chuyển đổi numpy array sang PIL Image cho phù hợp
+            if processed_image is not None:
+                pil_image = Image.fromarray(processed_image)
+                image_np = np.array(pil_image)
+            else:
+                image_np = image
+            # Chạy OCR
+            results = self.easy_ocr_reader.readtext(
+                image_np,
+                detail=1,
+                paragraph=True,
+                contrast_ths=0.3,
+                adjust_contrast=0.7
+            )
+            # Format kết quả
+            extracted_texts = []
+            for bbox, text, confidence in results:
+                extracted_texts.append({
+                    'text': text,
+                    'confidence': float(confidence),
+                    'bbox': bbox
+                })
+            return extracted_texts
+        except Exception as e:
+            print(f"❌ Lỗi EasyOCR: {e}")
+            return []
+    def extract_text_mangaocr(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """Trích xuất text sử dụng MangaOCR (tốt cho tiếng Việt)"""
+        try:
+            if self.ocr_processor is None:
+                return []
+            # Chuyển numpy array sang PIL Image
+            pil_image = Image.fromarray(image)
+            # Chạy OCR
+            text = self.ocr_processor(pil_image)
+            return [{
+                'text': text,
+                'confidence': 0.9,  # MangaOCR không trả về confidence
+                'bbox': None,
+                'source': 'manga_ocr'
+            }]
+        except Exception as e:
+            print(f"❌ Lỗi MangaOCR: {e}")
+            return []
+    def merge_ocr_results(self, easyocr_results: List, mangaocr_results: List) -> str:
+        """Kết hợp và chọn lọc kết quả từ nhiều OCR engine"""
+        all_texts = []
+        # Ưu tiên kết quả từ EasyOCR với confidence cao
+        for result in easyocr_results:
+            if result['confidence'] > 0.5:  # Ngưỡng confidence
+                all_texts.append(result['text'])
+        # Thêm kết quả từ MangaOCR nếu có
+        for result in mangaocr_results:
+            all_texts.append(result['text'])
+        # Loại bỏ trùng lặp và kết hợp
+        unique_texts = []
+        seen_texts = set()
+        for text in all_texts:
+            clean_text = text.strip()
+            if clean_text and len(clean_text) > 1 and clean_text not in seen_texts:
+                unique_texts.append(clean_text)
+                seen_texts.add(clean_text)
+        return "\n".join(unique_texts) if unique_texts else "Không phát hiện được văn bản trong ảnh."
+    def extract_text_from_image(self, image: np.ndarray) -> str:
+        """Trích xuất văn bản từ ảnh sử dụng nhiều OCR engine"""
         if image is None:
+            return "Không có ảnh được tải lên."
         try:
+            print("🔍 Đang trích xuất văn bản từ ảnh...")
+            # Chạy cả hai OCR engine
+            easyocr_results = self.extract_text_easyocr(image)
+            mangaocr_results = self.extract_text_mangaocr(image)
+            print(f"📊 EasyOCR found {len(easyocr_results)} text regions")
+            print(f"📊 MangaOCR found {len(mangaocr_results)} text regions")
+            # Kết hợp kết quả
+            merged_text = self.merge_ocr_results(easyocr_results, mangaocr_results)
+            print(f"✅ Extracted text: {merged_text[:100]}...")
+            return merged_text
+        except Exception as e:
+            print(f"❌ Lỗi trích xuất văn bản: {e}")
+            return f"Lỗi khi trích xuất văn bản: {str(e)}"
+    def analyze_text_with_llm(self, extracted_text: str, user_description: str = "") -> str:
+        """Phân tích văn bản trích xuất được bằng LLM"""
+        try:
+            if not extracted_text or extracted_text == "Không phát hiện được văn bản trong ảnh.":
+                prompt = """
+                Tôi đã tải lên một hình ảnh nhưng không thể trích xuất được văn bản từ đó.
+                Hãy mô tả tổng quan về hình ảnh này và đưa ra các phán đoán về nội dung có thể có.
+                """
             else:
+                if user_description:
+                    prompt = f"""
+                    NGƯỜI DÙNG MÔ TẢ: "{user_description}"
+                    VĂN BẢN TRÍCH XUẤT TỪ ẢNH:
+                    {extracted_text}
+                    Dựa trên mô tả của người dùng và văn bản trích xuất được, hãy:
+                    1. Phân tích và tóm tắt nội dung chính
+                    2. Giải thích ý nghĩa của văn bản trong ngữ cảnh
+                    3. Đưa ra thông tin bổ sung hữu ích
+                    """
+                else:
+                    prompt = f"""
+                    VĂN BẢN TRÍCH XUẤT TỪ ẢNH:
+                    {extracted_text}
+                    Hãy phân tích và cung cấp thông tin về:
+                    1. Nội dung chính của văn bản
+                    2. Loại văn bản (tài liệu, quảng cáo, tin nhắn, etc.)
+                    3. Ngữ cảnh và ý nghĩa
+                    4. Thông tin hữu ích khác
+                    """
+            # Gọi LLM để phân tích
+            completion = self.client.chat.completions.create(
+                model=settings.DEFAULT_LLM_MODEL,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "Bạn là trợ lý phân tích hình ảnh và văn bản. Hãy trả lời bằng tiếng Việt tự nhiên, rõ ràng và hữu ích."
+                    },
+                    {
+                        "role": "user",
+                        "content": prompt
+                    }
+                ],
+                max_tokens=500,
+                temperature=0.7
             )
+            return completion.choices[0].message.content
         except Exception as e:
+            print(f"❌ Lỗi phân tích với LLM: {e}")
+            return f"Lỗi khi phân tích với AI: {str(e)}"
+    def analyze_image_with_description(self, image, user_description: str = "") -> str:
+        """Phân tích ảnh hoàn chỉnh: OCR + LLM"""
+        if image is None:
+            return "❌ Vui lòng tải lên một hình ảnh để phân tích."
+        try:
+            # Bước 1: Trích xuất văn bản từ ảnh
+            extracted_text = self.extract_text_from_image(image)
+            # Bước 2: Phân tích với LLM
+            analysis_result = self.analyze_text_with_llm(extracted_text, user_description)
+            # Format kết quả cuối cùng
+            result = f"""📊 **KẾT QUẢ PHÂN TÍCH HÌNH ẢNH**
+🔍 **Văn bản trích xuất được:**
+{extracted_text}
+🤖 **Phân tích AI:**
+{analysis_result}
+---
+*Phân tích sử dụng OCR và AI - Độ chính xác có thể thay đổi tùy thuộc vào chất lượng ảnh.*"""
+            return result
+        except Exception as e:
+            print(f"❌ Lỗi phân tích ảnh: {e}")
+            return f"❌ Lỗi trong quá trình phân tích: {str(e)}"
+    def detect_image_type(self, image: np.ndarray) -> str:
+        """Nhận diện loại ảnh (tài liệu, ảnh chụp, meme, etc.)"""
+        try:
+            # Phân tích đơn giản dựa trên đặc điểm ảnh
+            height, width = image.shape[:2]
+            aspect_ratio = width / height
+            # Phân tích màu sắc
+            if len(image.shape) == 3:
+                color_variance = np.var(image, axis=(0,1))
+                is_colorful = np.mean(color_variance) > 100
+            else:
+                is_colorful = False
+            # Phân tích sơ bộ loại ảnh
+            if aspect_ratio > 2.0 or aspect_ratio < 0.5:
+                return "document"  # Tài liệu thường có tỷ lệ khác thường
+            elif not is_colorful:
+                return "document"  # Ít màu sắc -> có thể là tài liệu
+            else:
+                return "photo"  # Ảnh chụp
+        except Exception as e:
+            print(f"⚠️ Lỗi nhận diện loại ảnh: {e}")
+            return "unknown"