Spaces:
Sleeping
Sleeping
Update ocr_processing.py
Browse files- ocr_processing.py +50 -3
ocr_processing.py
CHANGED
|
@@ -12,10 +12,16 @@ API_KEY = os.getenv("GROQ_API_KEY") # Fetch API key from environment
|
|
| 12 |
|
| 13 |
class OCRProcessor:
|
| 14 |
def __init__(self, model="llama-3.2-90b-vision-preview"):
|
|
|
|
|
|
|
|
|
|
| 15 |
self.model = model
|
| 16 |
self.client = Groq(api_key=API_KEY)
|
| 17 |
|
| 18 |
def enhance_image(self, input_path, output_path):
|
|
|
|
|
|
|
|
|
|
| 19 |
if not os.path.exists(input_path):
|
| 20 |
raise FileNotFoundError(f"File not found: {input_path}")
|
| 21 |
|
|
@@ -27,6 +33,12 @@ class OCRProcessor:
|
|
| 27 |
return output_path
|
| 28 |
|
| 29 |
def convert_pdf_to_images(self, pdf_path, save_dir="./uploads"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
os.makedirs(save_dir, exist_ok=True)
|
| 31 |
doc = fitz.open(pdf_path)
|
| 32 |
image_paths = []
|
|
@@ -34,22 +46,57 @@ class OCRProcessor:
|
|
| 34 |
for page_idx in range(len(doc)):
|
| 35 |
page = doc.load_page(page_idx)
|
| 36 |
img = page.get_pixmap()
|
|
|
|
| 37 |
image_file = os.path.join(save_dir, f"page_{page_idx + 1}.png")
|
| 38 |
img.save(image_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
image_paths.append(image_file)
|
| 40 |
|
| 41 |
doc.close()
|
| 42 |
return image_paths
|
| 43 |
|
| 44 |
def encode_image(self, img_path):
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def extract_text_from_image(self, encoded_img, prompt_text):
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
try:
|
| 52 |
response = self.client.chat.completions.create(model=self.model, messages=messages)
|
| 53 |
return response.choices[0].message
|
|
|
|
| 54 |
except Exception as err:
|
| 55 |
raise Exception(f"OCR extraction failed: {err}")
|
|
|
|
| 12 |
|
| 13 |
class OCRProcessor:
|
| 14 |
def __init__(self, model="llama-3.2-90b-vision-preview"):
    """
    Set up the processor with a model name and a Groq client.

    Args:
        model: Identifier of the model passed to the chat completion call.

    Raises:
        ValueError: If the module-level API_KEY is empty or unset.
    """
    if API_KEY:
        self.model = model
        self.client = Groq(api_key=API_KEY)
    else:
        # Fail at construction time instead of on the first API request.
        raise ValueError("GROQ_API_KEY is missing! Please set it as an environment variable.")
|
| 20 |
|
| 21 |
def enhance_image(self, input_path, output_path):
|
| 22 |
+
"""
|
| 23 |
+
Enhances the quality of an image for OCR processing.
|
| 24 |
+
"""
|
| 25 |
if not os.path.exists(input_path):
|
| 26 |
raise FileNotFoundError(f"File not found: {input_path}")
|
| 27 |
|
|
|
|
| 33 |
return output_path
|
| 34 |
|
| 35 |
def convert_pdf_to_images(self, pdf_path, save_dir="./uploads"):
|
| 36 |
+
"""
|
| 37 |
+
Converts a PDF to images and returns the image file paths.
|
| 38 |
+
"""
|
| 39 |
+
if not os.path.exists(pdf_path):
|
| 40 |
+
raise FileNotFoundError(f"PDF file not found: {pdf_path}")
|
| 41 |
+
|
| 42 |
os.makedirs(save_dir, exist_ok=True)
|
| 43 |
doc = fitz.open(pdf_path)
|
| 44 |
image_paths = []
|
|
|
|
| 46 |
for page_idx in range(len(doc)):
|
| 47 |
page = doc.load_page(page_idx)
|
| 48 |
img = page.get_pixmap()
|
| 49 |
+
|
| 50 |
image_file = os.path.join(save_dir, f"page_{page_idx + 1}.png")
|
| 51 |
img.save(image_file)
|
| 52 |
+
|
| 53 |
+
if not os.path.exists(image_file):
|
| 54 |
+
raise Exception(f"Failed to save image: {image_file}")
|
| 55 |
+
|
| 56 |
image_paths.append(image_file)
|
| 57 |
|
| 58 |
doc.close()
|
| 59 |
return image_paths
|
| 60 |
|
| 61 |
def encode_image(self, img_path):
    """
    Read an image file and return its contents as a base64 string.

    Args:
        img_path: Path of the image file to encode.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 ``str``.

    Raises:
        FileNotFoundError: If ``img_path`` does not exist.
        Exception: If the file cannot be read, or the encoded payload is
            shorter than 50 characters. The underlying error is attached
            as ``__cause__``.
    """
    if not os.path.exists(img_path):
        raise FileNotFoundError(f"File not found: {img_path}")

    try:
        with open(img_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")

        # Payloads under 50 base64 characters (including empty ones) are
        # rejected as likely corrupted.
        if len(encoded_data) < 50:
            raise ValueError("Encoded image data is too short, possibly corrupted.")

        return encoded_data
    except Exception as e:
        # Chain explicitly so the root cause survives the generic wrapper.
        raise Exception(f"Failed to encode image: {e}") from e
|
| 79 |
|
| 80 |
def extract_text_from_image(self, encoded_img, prompt_text):
    """
    Send a base64-encoded image and a text prompt to the chat model.

    Args:
        encoded_img: Base64 string of the image; must be at least 50
            characters long.
        prompt_text: Instruction text sent alongside the image.

    Returns:
        The ``message`` object of the first choice in the API response.

    Raises:
        ValueError: If ``encoded_img`` is empty or shorter than 50 characters.
        Exception: If the API call fails. The underlying error is attached
            as ``__cause__``.
    """
    if not encoded_img or len(encoded_img) < 50:  # Ensures valid base64 string
        raise ValueError("Invalid base64-encoded image data!")

    # The 8-byte PNG signature always encodes to this base64 prefix. Label
    # such payloads as PNG so the data URL matches the pages written by
    # convert_pdf_to_images; anything else keeps the previous JPEG label.
    is_png = isinstance(encoded_img, str) and encoded_img.startswith("iVBORw0KGg")
    mime_type = "image/png" if is_png else "image/jpeg"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{encoded_img}"}},
            ],
        }
    ]

    try:
        response = self.client.chat.completions.create(model=self.model, messages=messages)
        return response.choices[0].message
    except Exception as err:
        # Chain explicitly so the API error is preserved as the cause.
        raise Exception(f"OCR extraction failed: {err}") from err
|