import io
import os

import numpy as np
from dotenv import load_dotenv
from google import genai
from langchain_core.messages import HumanMessage, SystemMessage
from PIL import Image

from src.utils.models_loader import ocr_llm, reply_llm, reader
from .prompts import prompt

load_dotenv()

# Fail fast if the key is missing: assigning None to os.environ raises a TypeError.
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
    raise RuntimeError('GOOGLE_API_KEY is not set')
os.environ['GOOGLE_API_KEY'] = api_key

class ConversationExtractor:
    """Extracts a chat conversation from a screenshot via OCR plus an LLM cleanup pass."""

    def __init__(self):
        self.client = genai.Client()
        self.model_name = ocr_llm
        self.prompt = prompt

    def complete_ocr(self, image_bytes: bytes):
        # The EasyOCR reader's languages are configured in models_loader, so no
        # lang_list parameter is needed here.
        # Force RGB so EasyOCR receives a 3-channel array (screenshots are
        # often RGBA or palette-mode).
        img = Image.open(io.BytesIO(image_bytes)).convert('RGB')
        w_img = img.width
        # Pass a numpy array directly instead of a file path.
        results = reader.readtext(np.array(img))
        conversation = []
        for bbox, text, conf in results:
            # bbox: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
            x_coords = [p[0] for p in bbox]
            y_coords = [p[1] for p in bbox]
            x, y = int(min(x_coords)), int(min(y_coords))
            w = int(max(x_coords) - min(x_coords))
            h = int(max(y_coords) - min(y_coords))
            # A bubble whose horizontal centre falls in the left half of the
            # screenshot is attributed to speaker A, otherwise to speaker B.
            speaker = "A" if (x + w / 2) < w_img / 2 else "B"
            conversation.append({
                "speaker": speaker,
                "text": text.strip(),
                "box": [x, y, w, h],
            })
        # Sort bubbles top to bottom to recover reading order.
        raw_ocr = sorted(conversation, key=lambda turn: turn["box"][1])
        raw_ocr_text = "Detected Conversation:\n" + "\n".join(
            f"Line {i}: {turn['text']}" for i, turn in enumerate(raw_ocr, start=1)
        )
        return raw_ocr_text

    def extract_conversation(self, image_bytes: bytes):
        raw_ocr_text = self.complete_ocr(image_bytes)
        # Ask the LLM to reassemble the raw OCR lines into a clean transcript.
        messages = [SystemMessage(content=self.prompt),
                    HumanMessage(content=raw_ocr_text)]
        response = reply_llm.invoke(messages)
        print('The cleaned OCR:', response.content)
        return response.content
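
# A minimal usage sketch, assuming a chat screenshot saved on disk. The file
# name and module path are hypothetical, and because this file uses a relative
# import it must be run as a package module (e.g. `python -m src.agents.extractor`),
# not as a standalone script.
if __name__ == "__main__":
    extractor = ConversationExtractor()
    with open("sample_chat_screenshot.png", "rb") as f:  # hypothetical test image
        image_bytes = f.read()
    print(extractor.extract_conversation(image_bytes))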