manabb committed on
Commit
d538593
·
verified ·
1 Parent(s): b9350f0

Update manabCQgenetaion.py

Browse files
Files changed (1) hide show
  1. manabCQgenetaion.py +13 -5
manabCQgenetaion.py CHANGED
@@ -7,6 +7,9 @@ import os
7
  import re
8
  import pandas as pd
9
 
 
 
 
10
 
11
 
12
  def normalize_text(s: str) -> str:
@@ -31,12 +34,17 @@ def NRLimportRules1():
31
  return NRLimportRules
32
  manual_rules=NRLimportRules1()
33
  def compliance_import_OEM(manabfile: str, client):
 
 
 
 
 
34
  # Extract full PDF text (handles layout/tables well)
35
- loader = PyMuPDFLoader(manabfile)
36
- docs = loader.load()
37
- for d in docs:
38
- d.page_content = normalize_text(d.page_content)
39
- doc_text = "\n\n".join(doc.page_content for doc in docs) # Flatten to string[cite:5]
40
 
41
  #==================
42
  #modified prompt with items value
 
7
  import re
8
  import pandas as pd
9
 
10
+ from langchain_core.documents import Document # Correct current import
11
+ from pdf2image import convert_from_path
12
+ import pytesseract
13
 
14
 
15
  def normalize_text(s: str) -> str:
 
34
  return NRLimportRules
35
  manual_rules=NRLimportRules1()
36
  def compliance_import_OEM(manabfile: str, client):
37
+
38
+ pages = convert_from_path(manabfile, dpi=300)
39
+ doc_text = ""
40
+ for page in pages:
41
+ doc_text += pytesseract.image_to_string(page) + "\n"
42
  # Extract full PDF text (handles layout/tables well)
43
+ #loader = PyMuPDFLoader(manabfile)
44
+ #docs = loader.load()
45
+ #for d in docs:
46
+ #d.page_content = normalize_text(d.page_content)
47
+ #doc_text = "\n\n".join(doc.page_content for doc in docs) # Flatten to string[cite:5]
48
 
49
  #==================
50
  #modified prompt with items value