|
|
|
|
|
from google import genai |
|
|
from google.genai import types |
|
|
from src.utils.models_loader import ocr_llm , reply_llm , reader |
|
|
from .prompts import prompt |
|
|
import easyocr |
|
|
from PIL import Image |
|
|
import io |
|
|
import numpy as np |
|
|
from dotenv import load_dotenv |
|
|
import os |
|
|
from langchain_core.messages import HumanMessage , SystemMessage |
|
|
|
|
|
# Load variables from a local .env file into the process environment.
load_dotenv()

# Propagate the Gemini key only when it is actually set: assigning None to
# os.environ raises TypeError, which would crash the import of this module.
_google_api_key = os.getenv('GOOGLE_API_KEY')
if _google_api_key is not None:
    os.environ['GOOGLE_API_KEY'] = _google_api_key
|
|
|
|
|
class ConversationExtractor:
    """Extract a two-speaker chat conversation from a screenshot.

    Pipeline: EasyOCR reads text boxes from the image, speakers are inferred
    from horizontal position (left half = "A", right half = "B"), and an LLM
    cleans the raw OCR transcript into the final conversation text.
    """

    def __init__(self):
        # Gemini client kept for direct API use; model name and prompt come
        # from the shared loaders so all components stay in sync.
        self.client = genai.Client()
        self.model_name = ocr_llm
        self.prompt = prompt

    def complete_ocr(self, image_bytes: bytes, lang_list=('en',)) -> str:
        """OCR the image and return a numbered, top-to-bottom transcript.

        Args:
            image_bytes: Raw encoded image (e.g. PNG/JPEG) as bytes.
            lang_list: Reserved for OCR language selection — currently unused
                (the shared `reader` is pre-configured). Immutable default to
                avoid the mutable-default-argument pitfall.

        Returns:
            A string starting with "Detected Conversation:" followed by one
            "Line i: <text>" entry per detected text box, sorted by vertical
            position.
        """
        img = Image.open(io.BytesIO(image_bytes))
        img_width = img.width

        # EasyOCR yields (bbox, text, confidence) per detected region,
        # where bbox is a list of 4 corner points.
        results = reader.readtext(np.array(img))

        conversation = []
        for bbox, text, conf in results:
            xs = [point[0] for point in bbox]
            ys = [point[1] for point in bbox]
            x = int(min(xs))
            y = int(min(ys))
            w = int(max(xs) - min(xs))
            h = int(max(ys) - min(ys))

            # Heuristic speaker assignment: a box whose horizontal center
            # lies in the left half of the image belongs to speaker "A",
            # otherwise "B" (typical chat-bubble layout).
            speaker = "A" if (x + w / 2) < img_width / 2 else "B"

            conversation.append({
                "speaker": speaker,
                "text": text.strip(),
                "box": [x, y, w, h]
            })

        # Sort by top edge (y) so the transcript reads top-to-bottom.
        raw_ocr = sorted(conversation, key=lambda turn: turn["box"][1])
        raw_ocr_text = "Detected Conversation:\n" + "\n".join(
            [f"Line {i}: {turn['text']}" for i, turn in enumerate(raw_ocr, start=1)]
        )
        return raw_ocr_text

    def extract_conversation(self, image_bytes: bytes) -> str:
        """Run OCR then clean the transcript with the reply LLM.

        Args:
            image_bytes: Raw encoded image as bytes.

        Returns:
            The LLM's cleaned conversation text.
        """
        raw_ocr_text = self.complete_ocr(image_bytes)
        # Use the instance's prompt (set in __init__) rather than the module
        # global, so per-instance prompt customization is respected.
        messages = [SystemMessage(content=self.prompt),
                    HumanMessage(content=raw_ocr_text)]
        response = reply_llm.invoke(messages)
        print('The cleaned ocr:', response.content)
        return response.content
|
|
|
|
|
|