Spaces:
Running
Running
File size: 4,237 Bytes
# technicalDocCompliance.py
from openai import OpenAI # Core import for client[web:30][web:32]
from openai import OpenAI
from langchain_community.document_loaders import PyMuPDFLoader # pip install pymupdf[web:42]
import os
import re
def normalize_text(s: str) -> str:
    """Normalize whitespace and line breaks in extracted page text.

    The steps run in this order:

    1. Windows (CRLF) and bare CR line endings become LF.
    2. Tabs and non-breaking spaces (U+00A0) become ordinary spaces, and
       every run of spaces is collapsed to a single space.
    3. Spaces left dangling at the end of a line are removed, so lines that
       hold nothing but whitespace count as blank lines.
    4. Three or more consecutive newlines are collapsed to two (one blank
       line).
    5. Leading and trailing whitespace of the whole string is stripped.

    Args:
        s: Raw text, e.g. the ``page_content`` of a loaded PDF page.

    Returns:
        The normalized text.
    """
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    # Map NBSP to a plain space up front so that a *single* NBSP is
    # normalized too, not only runs of two or more.
    s = s.replace("\t", " ").replace("\u00A0", " ")
    s = re.sub(r" {2,}", " ", s)
    # Trailing spaces must go before the newline collapse: "\n \n \n" would
    # otherwise never match r"\n{3,}".
    s = re.sub(r" +\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()
def compliance_tech(
    file: str,
    client,
    MANUAL_RULES,
    *,
    model: str = "gpt-4o-mini",
    max_chars=16000,
    max_tokens: int = 1200,
):
    """Audit a PDF against MANUAL_RULES using its extracted text.

    The PDF is loaded with ``PyMuPDFLoader``, each page is passed through
    ``normalize_text``, the pages are joined with blank lines, and the
    result is embedded in a prompt sent through
    ``client.chat.completions.create``.

    NOTE: only the first ``max_chars`` characters of the document reach the
    model. Anything after that point is invisible to the audit, so a rule
    whose evidence appears later in the document can be reported as
    NON-COMPLIANT. Raise ``max_chars`` (or pass ``None``) for long files.

    Args:
        file: Path of the PDF to audit.
        client: Client object exposing ``chat.completions.create``
            (the OpenAI client in this file).
        MANUAL_RULES: Rule text interpolated verbatim into the prompt.
        model: Chat model name.
        max_chars: Maximum number of document characters placed in the
            prompt; ``None`` disables truncation.
        max_tokens: Upper bound on tokens generated for the report.

    Returns:
        The message content of the first completion choice.
    """
    pages = PyMuPDFLoader(file).load()
    for page in pages:
        page.page_content = normalize_text(page.page_content)
    doc_text = "\n\n".join(page.page_content for page in pages)

    # Truncate to respect token limits. This is done *outside* the f-string:
    # a "# ..." note written inside the template is not a comment, it is
    # literal text that would be sent to the model as part of the document.
    excerpt = doc_text[:max_chars]

    # The template lines are deliberately flush-left so that no indentation
    # leaks into the prompt text.
    prompt = f"""
Document content (complete extracted text):
{excerpt}
You are a strict procurement compliance auditor.
Your task is to check whether the uploaded file FULLY complies against each point of each heading of the MANUAL RULES.
MANDATORY INSTRUCTIONS:
1. Do NOT assume anything.
2. Do NOT interpret beyond what is written.
3. If information is missing -> mark as NON-COMPLIANT.
4. If partially satisfied -> mark as NON-COMPLIANT.
5. Only explicit written evidence is valid.
6. Quote exact sentence from document as evidence.
7. Do not provide explanation beyond required format.
8. Include E File No in the response.
Summarise the response only on rule headingwise and not pointwise mentioned under each heading.
OUTPUT FORMAT (STRICT):
Rule heading: Heading of the rule
Status: COMPLIANT / NON-COMPLIANT
Evidence: "<Exact quoted sentence from document>" OR "Not found in document"
Deviations: <short bullet-style description or 'None'>
COMPLIANCE ANALYSIS: <2-4 sentences explaining reasoning>
MANUAL RULES:
{MANUAL_RULES}
"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        # temperature=0 keeps the audit as repeatable as the API allows.
        temperature=0,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content
def compliance_tech_pdf(
    file_name: str,
    client,
    MANUAL_RULES,
    *,
    model: str = "gpt-4o-mini",
    max_output_tokens: int = 1200,
):
    """Audit a PDF against MANUAL_RULES by uploading the file itself.

    The file is uploaded with ``client.files.create``, referenced as an
    ``input_file`` part next to the prompt in ``client.responses.create``,
    and deleted again once the request has finished (whether it succeeded
    or raised), so repeated calls do not pile up files in API storage.

    Args:
        file_name: Path of the PDF to upload.
        client: Client object exposing ``files.create``, ``files.delete``
            and ``responses.create`` (the OpenAI client in this file).
        MANUAL_RULES: Rule text interpolated verbatim into the prompt.
        model: Model name passed to the Responses API.
        max_output_tokens: Upper bound on tokens generated for the report.

    Returns:
        ``response.output_text`` of the created response.
    """
    import logging

    # The template lines are deliberately flush-left so that no indentation
    # leaks into the prompt text.
    prompt = f"""
You are a strict procurement compliance auditor.
Your task is to check whether the uploaded file FULLY complies against each point of each heading of the MANUAL RULES.
MANDATORY INSTRUCTIONS:
1. Do NOT assume anything.
2. Do NOT interpret beyond what is written.
3. If information is missing -> mark as NON-COMPLIANT.
4. If partially satisfied -> mark as PARTIALLY-COMPLIANT.
5. Only explicit written evidence is valid.
6. Quote exact sentence from document as evidence.
7. Do not provide explanation beyond required format.
8. Include subject in the response.
Response against each point.
OUTPUT FORMAT (STRICT):
Rule heading: (text)
Status: COMPLIANT / NON-COMPLIANT / PARTIALLY-COMPLIANT
Evidence: "<Exact quoted sentence from document>" OR "Not found in document"
COMPLIANCE ANALYSIS: <2-4 sentences explaining reasoning>
MANUAL RULES:
{MANUAL_RULES}
"""
    with open(file_name, "rb") as f:
        uploaded = client.files.create(file=f, purpose="assistants")

    try:
        response = client.responses.create(
            model=model,
            input=[
                {
                    "role": "user",
                    "content": [
                        {"type": "input_text", "text": prompt},
                        {"type": "input_file", "file_id": uploaded.id},
                    ],
                }
            ],
            # temperature=0 keeps the audit as repeatable as the API allows.
            temperature=0,
            max_output_tokens=max_output_tokens,
        )
        return response.output_text
    finally:
        # Cleanup is best-effort: a failed delete must neither discard a
        # report that was generated successfully nor hide the original
        # error from the request, so it is logged instead of raised.
        try:
            client.files.delete(uploaded.id)
        except Exception:
            logging.getLogger(__name__).warning(
                "Could not delete uploaded file %s", uploaded.id, exc_info=True
            )
|