# Spaces: manabb / NRLCommercialAI
# Running
# File size: 4,237 Bytes

#technicalDocCompliance.py

from openai import OpenAI  # Core import for client[web:30][web:32]
from openai import OpenAI
from langchain_community.document_loaders import PyMuPDFLoader  # pip install pymupdf[web:42]
import os
import re

def normalize_text(s: str) -> str:
    """Return *s* with line endings, tabs, blank lines and spaces tidied up.

    Steps, applied in this order:
      1. CRLF and lone CR become LF.
      2. Each tab becomes a single space.
      3. Runs of three or more newlines shrink to exactly two.
      4. Runs of two or more spaces / non-breaking spaces become one space.
      5. Leading and trailing whitespace is stripped.
    """
    # Unify line endings first so the newline regex only has to know "\n".
    unified = s.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " ")

    # At most one blank line between paragraphs.
    fewer_breaks = re.sub(r"\n{3,}", "\n\n", unified)

    # Squeeze horizontal whitespace runs (tabs were turned into spaces above).
    fewer_spaces = re.sub(r"[ \u00A0]{2,}", " ", fewer_breaks)

    return fewer_spaces.strip()

def compliance_tech(
    file: str,
    client,
    MANUAL_RULES,
    *,
    model: str = "gpt-4o-mini",
    max_doc_chars: int = 16000,
    max_tokens: int = 1200,
):
    """Audit the extracted text of a PDF against MANUAL_RULES with a chat model.

    The PDF is loaded with PyMuPDFLoader, every page's text is passed through
    normalize_text, and the pages are joined with blank lines. The first
    ``max_doc_chars`` characters of that text are embedded in the prompt
    together with the rules, and the prompt is sent as a single user message
    through ``client.chat.completions.create``.

    Args:
        file: Path of the PDF to audit.
        client: Object exposing ``chat.completions.create`` (presumably an
            ``openai.OpenAI`` instance -- confirm against the caller).
        MANUAL_RULES: Rules text interpolated verbatim into the prompt.
        model: Chat model name passed to the API.
        max_doc_chars: Number of leading characters of the document text that
            are sent. Anything beyond this is silently dropped, so a long
            document is only audited on its first part.
        max_tokens: Upper bound on the tokens generated for the answer.

    Returns:
        The message content of the first completion choice.
    """
    # Extract the text page by page and normalise whitespace on each page.
    pages = PyMuPDFLoader(file).load()
    doc_text = "\n\n".join(normalize_text(page.page_content) for page in pages)

    # Truncate here, in code, to stay inside the model's token limit. The note
    # about truncation used to live inside the f-string and was therefore sent
    # to the model as if it were part of the document.
    doc_excerpt = doc_text[:max_doc_chars]

    prompt = f"""
    Document content (complete extracted text):
    
    {doc_excerpt}
    
    You are a strict procurement compliance auditor.

    Your task is to check whether the uploaded file FULLY complies against each point of each heading of the MANUAL RULES.

    MANDATORY INSTRUCTIONS:

    1. Do NOT assume anything.
    2. Do NOT interpret beyond what is written.
    3. If information is missing → mark as NON-COMPLIANT.
    4. If partially satisfied → mark as NON-COMPLIANT.
    5. Only explicit written evidence is valid.
    6. Quote exact sentence from document as evidence.
    7. Do not provide explanation beyond required format.
    8. Include E File No in the response.

    Summarise the response only on rule headingwise and not pointwise mentioned under each heading.
    
    OUTPUT FORMAT (STRICT):

    Rule heading: Heading of the rule
    Status: COMPLIANT / NON-COMPLIANT
    Evidence: "<Exact quoted sentence from document>" OR "Not found in document"    
    Deviations: <short bullet-style description or 'None'>
    COMPLIANCE ANALYSIS: <2–4 sentences explaining reasoning>

    MANUAL RULES:
    {MANUAL_RULES}
    """

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # keep the audit output as repeatable as possible
        max_tokens=max_tokens,
    )

    return response.choices[0].message.content

def compliance_tech_pdf(
    file_name: str,
    client,
    MANUAL_RULES,
    *,
    model: str = "gpt-4o-mini",
    max_output_tokens: int = 1200,
    delete_uploaded_file: bool = True,
):
    """Audit a PDF against MANUAL_RULES by uploading the file itself.

    The file is uploaded with ``client.files.create`` and then referenced as
    an ``input_file`` part, next to the prompt text, in one user message sent
    through ``client.responses.create``.

    Args:
        file_name: Path of the file to upload; it is opened in binary mode.
        client: Object exposing ``files.create``, ``files.delete`` and
            ``responses.create`` (presumably an ``openai.OpenAI`` instance --
            confirm against the caller).
        MANUAL_RULES: Rules text interpolated verbatim into the prompt.
        model: Model name passed to the API.
        max_output_tokens: Upper bound on the tokens generated for the answer.
        delete_uploaded_file: When true (the default) the uploaded file is
            deleted once the request has finished, whether it succeeded or
            not. The file id is never returned to the caller, so without this
            every call would leave an unreachable file in remote storage.

    Returns:
        ``response.output_text`` of the created response.
    """
    import logging  # local import: only needed for the cleanup warning

    prompt = f"""
    
    You are a strict procurement compliance auditor.

    Your task is to check whether the uploaded file FULLY complies against each point of each heading of the MANUAL RULES.

    MANDATORY INSTRUCTIONS:

    1. Do NOT assume anything.
    2. Do NOT interpret beyond what is written.
    3. If information is missing → mark as NON-COMPLIANT.
    4. If partially satisfied → mark as PARTIALLY-COMPLIANT.
    5. Only explicit written evidence is valid.
    6. Quote exact sentence from document as evidence.
    7. Do not provide explanation beyond required format.
    8. Include subject in the response.

    Response against each point.
    
    OUTPUT FORMAT (STRICT):

    Rule heading: (text)
    Status: COMPLIANT / NON-COMPLIANT / PARTIALLY-COMPLIANT
    Evidence: "<Exact quoted sentence from document>" OR "Not found in document"    
    COMPLIANCE ANALYSIS: <2–4 sentences explaining reasoning>
    

    MANUAL RULES:
    {MANUAL_RULES}
    """

    with open(file_name, "rb") as handle:
        uploaded = client.files.create(file=handle, purpose="assistants")

    try:
        response = client.responses.create(
            model=model,
            input=[
                {
                    "role": "user",
                    "content": [
                        {"type": "input_text", "text": prompt},
                        {"type": "input_file", "file_id": uploaded.id},
                    ],
                }
            ],
            temperature=0,  # keep the audit output as repeatable as possible
            max_output_tokens=max_output_tokens,
        )
        return response.output_text
    finally:
        if delete_uploaded_file:
            # Best-effort cleanup: a failed delete must not hide the audit
            # result or replace the original exception, so it is only logged.
            try:
                client.files.delete(uploaded.id)
            except Exception:
                logging.getLogger(__name__).warning(
                    "Could not delete uploaded file %s", uploaded.id, exc_info=True
                )