NRLCommercialAI-dev

Sleeping

manabb committed on Feb 21

Commit

d538593

verified ·

1 Parent(s): b9350f0

Update manabCQgenetaion.py

Files changed (1) hide show

manabCQgenetaion.py CHANGED Viewed

@@ -7,6 +7,9 @@ import os
 import re
 import pandas as pd
 def normalize_text(s: str) -> str:
@@ -31,12 +34,17 @@ def NRLimportRules1():
     return NRLimportRules
 manual_rules=NRLimportRules1()
 def compliance_import_OEM(manabfile: str, client):
     # Extract full PDF text (handles layout/tables well)
-    loader = PyMuPDFLoader(manabfile)
-    docs = loader.load()
-    for d in docs:
-        d.page_content = normalize_text(d.page_content)
-    doc_text = "\n\n".join(doc.page_content for doc in docs)  # Flatten to string[cite:5]
     #==================
     #modified prompt with items value

 import re
 import pandas as pd
+from langchain_core.documents import Document  # Correct current import
+from pdf2image import convert_from_path
+import pytesseract
 def normalize_text(s: str) -> str:
     return NRLimportRules
 manual_rules=NRLimportRules1()
 def compliance_import_OEM(manabfile: str, client):
+    pages = convert_from_path(manabfile, dpi=300)
+    doc_text = ""
+    for page in pages:
+        doc_text += pytesseract.image_to_string(page) + "\n"
     # Extract full PDF text (handles layout/tables well)
+    #loader = PyMuPDFLoader(manabfile)
+    #docs = loader.load()
+    #for d in docs:
+        #d.page_content = normalize_text(d.page_content)
+    #doc_text = "\n\n".join(doc.page_content for doc in docs)  # Flatten to string[cite:5]
     #==================
     #modified prompt with items value