Spaces:
Sleeping
Sleeping
Update manabCQgenetaion.py
Browse files - manabCQgenetaion.py +13 -5
manabCQgenetaion.py
CHANGED
|
@@ -7,6 +7,9 @@ import os
|
|
| 7 |
import re
|
| 8 |
import pandas as pd
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def normalize_text(s: str) -> str:
|
|
@@ -31,12 +34,17 @@ def NRLimportRules1():
|
|
| 31 |
return NRLimportRules
|
| 32 |
manual_rules=NRLimportRules1()
|
| 33 |
def compliance_import_OEM(manabfile: str, client):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# Extract full PDF text (handles layout/tables well)
|
| 35 |
-
loader = PyMuPDFLoader(manabfile)
|
| 36 |
-
docs = loader.load()
|
| 37 |
-
for d in docs:
|
| 38 |
-
d.page_content = normalize_text(d.page_content)
|
| 39 |
-
doc_text = "\n\n".join(doc.page_content for doc in docs) # Flatten to string[cite:5]
|
| 40 |
|
| 41 |
#==================
|
| 42 |
#modified prompt with items value
|
|
|
|
| 7 |
import re
|
| 8 |
import pandas as pd
|
| 9 |
|
| 10 |
+
from langchain_core.documents import Document # Correct current import
|
| 11 |
+
from pdf2image import convert_from_path
|
| 12 |
+
import pytesseract
|
| 13 |
|
| 14 |
|
| 15 |
def normalize_text(s: str) -> str:
|
|
|
|
| 34 |
return NRLimportRules
|
| 35 |
manual_rules=NRLimportRules1()
|
| 36 |
def compliance_import_OEM(manabfile: str, client):
|
| 37 |
+
|
| 38 |
+
pages = convert_from_path(manabfile, dpi=300)
|
| 39 |
+
doc_text = ""
|
| 40 |
+
for page in pages:
|
| 41 |
+
doc_text += pytesseract.image_to_string(page) + "\n"
|
| 42 |
# Extract full PDF text (handles layout/tables well)
|
| 43 |
+
#loader = PyMuPDFLoader(manabfile)
|
| 44 |
+
#docs = loader.load()
|
| 45 |
+
#for d in docs:
|
| 46 |
+
#d.page_content = normalize_text(d.page_content)
|
| 47 |
+
#doc_text = "\n\n".join(doc.page_content for doc in docs) # Flatten to string[cite:5]
|
| 48 |
|
| 49 |
#==================
|
| 50 |
#modified prompt with items value
|