File size: 3,347 Bytes
291afdf
 
 
 
 
 
 
 
 
 
 
 
 
 
efa7e0c
 
 
291afdf
 
 
 
efa7e0c
 
 
291afdf
 
 
 
 
 
 
 
 
 
 
efa7e0c
 
 
 
 
 
291afdf
 
 
 
 
 
 
efa7e0c
291afdf
 
efa7e0c
 
 
 
291afdf
 
 
 
 
 
efa7e0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291afdf
 
efa7e0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291afdf
 
 
 
efa7e0c
291afdf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import base64
import cv2
import numpy as np
import fitz  # PyMuPDF
from dotenv import load_dotenv
from groq import Groq

# Run python-dotenv's loader before the key is looked up, then read the
# Groq credential from the process environment.
load_dotenv()
API_KEY = os.environ.get("GROQ_API_KEY")  # None when the variable is not set

class OCRProcessor:
    """Prepare images / PDF pages and send them to a Groq vision model.

    The instance holds the model name (``self.model``) and a ``Groq`` client
    (``self.client``). The remaining methods are file helpers (copying an
    image through OpenCV, rasterising PDF pages, base64-encoding a file) plus
    the request that submits an encoded image together with a text prompt.
    """

    # MIME type used in the data URL when the image format is not recognised.
    # JPEG is kept as the fallback because it was the previously hard-coded
    # value.
    _DEFAULT_MIME_TYPE = "image/jpeg"

    # (magic-byte prefix, MIME type) pairs checked against the decoded header.
    _MIME_SIGNATURES = (
        (b"\x89PNG\r\n\x1a\n", "image/png"),
        (b"\xff\xd8\xff", "image/jpeg"),
        (b"GIF87a", "image/gif"),
        (b"GIF89a", "image/gif"),
    )

    def __init__(self, model="llama-3.2-90b-vision-preview", api_key=None):
        """Create the Groq client.

        Args:
            model: Name of the model passed to every completion request.
            api_key: Optional explicit API key. When omitted (or empty) the
                module-level ``API_KEY`` read from ``GROQ_API_KEY`` is used.

        Raises:
            ValueError: If no API key is available from either source.
        """
        resolved_key = api_key or API_KEY
        if not resolved_key:
            raise ValueError("GROQ_API_KEY is missing! Please set it as an environment variable.")

        self.model = model
        self.client = Groq(api_key=resolved_key)

    @staticmethod
    def _detect_mime_type(encoded_img, default="image/jpeg"):
        """Guess the image MIME type from the first bytes of a base64 payload.

        Only the first 16 base64 characters (12 raw bytes) are decoded, which
        is enough to compare against the signatures listed in
        ``_MIME_SIGNATURES`` and the RIFF/WEBP header.

        Args:
            encoded_img: Base64-encoded image data.
            default: Value returned when the header matches no known format
                or cannot be decoded.

        Returns:
            A MIME type string such as ``"image/png"``.
        """
        try:
            header = base64.b64decode(encoded_img[:16])
        except ValueError:  # binascii.Error is a ValueError subclass
            return default

        for signature, mime_type in OCRProcessor._MIME_SIGNATURES:
            if header.startswith(signature):
                return mime_type

        # WebP: "RIFF" <4-byte size> "WEBP"
        if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
            return "image/webp"

        return default

    def enhance_image(self, input_path, output_path):
        """Read an image with OpenCV and write it to ``output_path``.

        NOTE: despite the name, no enhancement step is applied at present;
        the image returned by ``cv2.imread`` is passed straight to
        ``cv2.imwrite``.

        Args:
            input_path: Path of the image to read.
            output_path: Path the image is written to.

        Returns:
            ``output_path``.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            ValueError: If OpenCV cannot decode the input file.
            OSError: If OpenCV reports that the output could not be written.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"File not found: {input_path}")

        image = cv2.imread(input_path)
        if image is None:
            raise ValueError(f"Could not process image: {input_path}")

        # cv2.imwrite signals failure through its return value, so it must be
        # checked; otherwise a path to a file that was never written is
        # returned.
        if not cv2.imwrite(output_path, image):
            raise OSError(f"Failed to write image: {output_path}")
        return output_path

    def convert_pdf_to_images(self, pdf_path, save_dir="./uploads"):
        """Render every page of a PDF to a PNG file.

        Files are named ``page_<n>.png`` (1-based) inside ``save_dir``, which
        is created if needed. Because the names depend only on the page
        number, converting a second PDF into the same directory overwrites
        the earlier pages.

        Args:
            pdf_path: Path of the PDF to convert.
            save_dir: Directory that receives the page images.

        Returns:
            List of the written image paths, in page order.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            RuntimeError: If a page image is missing after being saved.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        os.makedirs(save_dir, exist_ok=True)
        image_paths = []

        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                image_file = os.path.join(save_dir, f"page_{page_number}.png")
                page.get_pixmap().save(image_file)

                if not os.path.exists(image_file):
                    raise RuntimeError(f"Failed to save image: {image_file}")

                image_paths.append(image_file)

        return image_paths

    def encode_image(self, img_path):
        """Return the base64 text encoding of the file at ``img_path``.

        Args:
            img_path: Path of the file to encode.

        Returns:
            The file content as a base64 string.

        Raises:
            FileNotFoundError: If ``img_path`` does not exist.
            RuntimeError: If the file cannot be read.
            ValueError: If the encoded data is shorter than 50 characters,
                which is treated as a sign of a corrupted file.
        """
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"File not found: {img_path}")

        # Only the read itself is guarded, so the validation error below is
        # not swallowed and re-wrapped.
        try:
            with open(img_path, "rb") as img_file:
                raw_bytes = img_file.read()
        except OSError as err:
            raise RuntimeError(f"Failed to encode image: {err}") from err

        encoded_data = base64.b64encode(raw_bytes).decode("utf-8")
        if len(encoded_data) < 50:
            raise ValueError("Encoded image data is too short, possibly corrupted.")

        return encoded_data

    def extract_text_from_image(self, encoded_img, prompt_text):
        """Send an encoded image and a prompt to the configured model.

        The image is embedded as a ``data:`` URL whose MIME type is detected
        from the payload (PNG, JPEG, GIF, WebP), falling back to JPEG.

        Args:
            encoded_img: Base64-encoded image data (at least 50 characters).
            prompt_text: Instruction text sent alongside the image.

        Returns:
            The ``message`` attribute of the first choice in the API
            response (the message object, not a plain string).

        Raises:
            ValueError: If ``encoded_img`` is empty or shorter than 50
                characters.
            RuntimeError: If the API request or response handling fails.
        """
        if not encoded_img or len(encoded_img) < 50:  # Ensures valid base64 string
            raise ValueError("Invalid base64-encoded image data!")

        mime_type = self._detect_mime_type(encoded_img, self._DEFAULT_MIME_TYPE)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{encoded_img}"}},
                ],
            }
        ]

        try:
            response = self.client.chat.completions.create(model=self.model, messages=messages)
            return response.choices[0].message
        except Exception as err:
            raise RuntimeError(f"OCR extraction failed: {err}") from err