PARTHA181098 committed on
Commit
bab5a4c
·
verified ·
1 Parent(s): fea04e5

Update agents/document_agent.py

Browse files
Files changed (1) hide show
  1. agents/document_agent.py +43 -3
agents/document_agent.py CHANGED
@@ -13,6 +13,10 @@ import google.generativeai as genai
13
  from dotenv import load_dotenv
14
  from datetime import datetime
15
 
 
 
 
 
16
  from agents.base_agent import BaseAgent
17
  from state import (
18
  InvoiceProcessingState, InvoiceData, ItemDetail,
@@ -114,6 +118,8 @@ class DocumentAgent(BaseAgent):
114
  genai.configure(api_key=self.api_key)
115
  # genai.configure(api_key=os.getenv("GEMINI_API_KEY_7"))
116
  self.model = genai.GenerativeModel("gemini-2.5-flash")
 
 
117
 
118
  def generate(self, prompt):
119
  try:
@@ -175,6 +181,38 @@ class DocumentAgent(BaseAgent):
175
  self._should_escalate(state, reason=str(e))
176
  return state
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  async def _extract_text_from_pdf(self, file_name: str) -> str:
180
  # pass
@@ -192,9 +230,11 @@ class DocumentAgent(BaseAgent):
192
  with pdfplumber.open(file_name) as pdf:
193
  for page in pdf.pages:
194
  text += page.extract_text() or ""
195
- except Exception as e2:
196
- self.logger.logger.error("[DocumentAgent] PDFPlumber failed :{e2}")
197
- text = ""
 
 
198
  return text
199
 
200
  async def _parse_invoice_with_ai(self, text: str) -> InvoiceData:
 
13
  from dotenv import load_dotenv
14
  from datetime import datetime
15
 
16
+ from paddleocr import PaddleOCR
17
+ import io
18
+ from PIL import Image
19
+
20
  from agents.base_agent import BaseAgent
21
  from state import (
22
  InvoiceProcessingState, InvoiceData, ItemDetail,
 
118
  genai.configure(api_key=self.api_key)
119
  # genai.configure(api_key=os.getenv("GEMINI_API_KEY_7"))
120
  self.model = genai.GenerativeModel("gemini-2.5-flash")
121
+ # Initialize PaddleOCR (English example)
122
+ self.ocr = PaddleOCR(use_angle_cls=True, lang="en")
123
 
124
  def generate(self, prompt):
125
  try:
 
181
  self._should_escalate(state, reason=str(e))
182
  return state
183
 
184
+ async def _extract_with_paddle_ocr(self, file_name: str) -> str:
185
+ full_text = ""
186
+
187
+ try:
188
+ doc = fitz.open(file_name)
189
+
190
+ for page_num in range(len(doc)):
191
+ page = doc[page_num]
192
+
193
+ # Convert PDF page to image
194
+ pix = page.get_pixmap(dpi=300)
195
+ img = Image.open(io.BytesIO(pix.tobytes("png")))
196
+ img_np = np.array(img)
197
+
198
+ # Run OCR
199
+ result = self.ocr.ocr(img_np, cls=True)
200
+
201
+ # Extract text lines
202
+ for line in result:
203
+ for word_info in line:
204
+ full_text += word_info[1][0] + " "
205
+
206
+ full_text += "\n"
207
+
208
+ doc.close()
209
+ self.logger.logger.info("[DocumentAgent] PaddleOCR extraction completed.")
210
+
211
+ except Exception as e:
212
+ self.logger.logger.error(f"[DocumentAgent] PaddleOCR failed: {e}")
213
+ print('text from ocr........', full_text)
214
+ return full_text
215
+
216
 
217
  async def _extract_text_from_pdf(self, file_name: str) -> str:
218
  # pass
 
230
  with pdfplumber.open(file_name) as pdf:
231
  for page in pdf.pages:
232
  text += page.extract_text() or ""
233
+ except Exception:
234
+ self.logger.logger.info("[DocumentAgent] Falling back to PaddleOCR...")
235
+
236
+ # Final fallback → PaddleOCR
237
+ text = await self._extract_with_paddle_ocr(file_name)
238
  return text
239
 
240
  async def _parse_invoice_with_ai(self, text: str) -> InvoiceData: