NRLCommercialAI-dev

Running

App Files Files Community

manabb committed on Feb 19

Commit

9d02537

verified ·

1 Parent(s): 2b4672b

Update technicalDocCompliance.py

Browse files

Files changed (1) hide show

technicalDocCompliance.py +32 -17

technicalDocCompliance.py CHANGED Viewed

@@ -1,11 +1,37 @@
 #technicalDocCompliance.py
 from openai import OpenAI  # Core import for client[web:30][web:32]
 def compliance_tech(file: str, client, MANUAL_RULES):
     PROMPT = f"""
     You are a strict procurement compliance auditor.
     Your task is to check whether the uploaded file FULLY complies against each heading of the MANUAL RULES.
@@ -32,25 +58,14 @@ def compliance_tech(file: str, client, MANUAL_RULES):
     {MANUAL_RULES}
     """
-    with open(file, "rb") as f:
-        uploaded_file = client.files.create(file=f, purpose="vision")  # Fixed var name & method[web:27][web:34]
-    response = client.chat.completions.create(  # Fixed: chat.completions.create()[web:30][web:38]
         model="gpt-4o-mini",
-        messages=[  # Fixed structure: messages list of dicts[web:38]
-            {
-                "role": "user",
-                "content": [  # Fixed: content is list of dicts
-                    {"type": "text", "text": PROMPT},  # Fixed: "text" not "input_text"
-                    {
-                        "type": "input_image",  # Fixed: "input_image" for vision/PDFs[web:27]
-                        "file_id": uploaded_file.id  # Reference uploaded file ID
-                    }
-                ]
-            }
-        ],
-        temperature=0,                 # 👈 VERY IMPORTANT
-        max_tokens=1200  # Fixed: max_tokens (not max_output_tokens)[web:38]
     )
     return response.choices[0].message.content  # Fixed: access output text[web:32]

 #technicalDocCompliance.py
 from openai import OpenAI  # Core import for client[web:30][web:32]
+from openai import OpenAI
+from langchain_community.document_loaders import PyMuPDFLoader  # pip install pymupdf[web:42]
+import os
+import re
def normalize_text(s: str) -> str:
    """Normalize whitespace and newlines in extracted page text.

    Steps, in order:
      1. Convert CRLF / CR line endings to LF.
      2. Replace tabs with spaces.
      3. Collapse every run of spaces / non-breaking spaces (including a
         single non-breaking space) to one ordinary space.
      4. Remove spaces that sit directly before a newline, so lines that
         contain only whitespace become truly empty.
      5. Collapse three or more consecutive newlines to exactly two.
      6. Strip leading and trailing whitespace from the whole string.

    Args:
        s: Raw text to clean.

    Returns:
        The cleaned text.
    """
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = s.replace("\t", " ")
    # `+` rather than `{2,}` so that a lone NBSP is also turned into a space.
    s = re.sub(r"[ \u00A0]+", " ", s)
    # Trailing spaces must go BEFORE the newline collapse: otherwise
    # "\n \n \n" is not a run of consecutive newlines and survives intact.
    s = re.sub(r" +\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()
 def compliance_tech(file: str, client, MANUAL_RULES):
+    # Extract full PDF text (handles layout/tables well)
+    loader = PyMuPDFLoader(file)
+    docs = loader.load()
+    for d in docs:
+        d.page_content = normalize_text(d.page_content)
+    doc_text = "\n\n".join(doc.page_content for doc in docs)  # Flatten to string[cite:5]
     PROMPT = f"""
+    Document content (complete extracted text):
+    {doc_text[:16000]}  # Truncate if needed for token limits
     You are a strict procurement compliance auditor.
     Your task is to check whether the uploaded file FULLY complies against each heading of the MANUAL RULES.
     {MANUAL_RULES}
     """
+    #with open(file, "rb") as f:
+        #uploaded_file = client.files.create(file=f, purpose="vision")  # Fixed var name & method[web:27][web:34]
+    response = client.chat.completions.create(
         model="gpt-4o-mini",
+        messages=[{"role": "user", "content": PROMPT}],
+        temperature=0,
+        max_tokens=1200
     )
     return response.choices[0].message.content  # Fixed: access output text[web:32]