# NRLCommercialAI-dev / manabCQgenetaion.py
# manabb's picture
# Update manabCQgenetaion.py
# 5d69f2a verified
#manabCQgenetaion.py
import json
from openai import OpenAI
from langchain_community.document_loaders import PyMuPDFLoader # pip install pymupdf[web:42]
import os
import re
import pandas as pd
from langchain_core.documents import Document # Correct current import
from pdf2image import convert_from_path
import pytesseract
def normalize_text(s: str) -> str:
    """Normalize whitespace / newlines in page_content.

    Steps, in order:
      1. Convert CRLF / CR line endings and form feeds to "\\n".
      2. Collapse every run of spaces, tabs and non-breaking spaces to one space.
      3. Drop trailing spaces at the end of each line.
      4. Collapse three or more consecutive newlines to exactly two.
      5. Strip leading / trailing whitespace from the whole string.

    Leading spaces on a line are reduced to a single space, not removed.
    """
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    # OCR output commonly carries a form feed as a page separator; treat it
    # as a line break so it does not survive into the normalized text.
    s = s.replace("\f", "\n")
    # One-or-more (not two-or-more) so that a lone tab or NBSP also becomes
    # a plain space.
    s = re.sub(r"[ \t\u00A0]+", " ", s)
    # Whitespace-only lines would otherwise hide consecutive newlines from
    # the collapse below.
    s = re.sub(r" +\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()
def NRLimportRules1():
    """Return the manual rules text used for the compliance check.

    The result is a numbered list, one rule per line, wrapped in a leading
    and a trailing newline.
    """
    rule_lines = (
        "1. Main rule: All other rules of shall be as per General Purchase Condition (GPC) of NRL",
        "2. Firm offer: The Quoted price shall remain firm and fixed till complete execution of the order.",
    )
    return "\n" + "\n".join(rule_lines) + "\n"


# Rules text evaluated once at import time and shared at module level.
manual_rules = NRLimportRules1()
# Bootstrap classes applied to every table rendered by compliance_import_OEM.
_TABLE_CLASSES = 'table table-striped table-bordered table-hover'

# Maximum number of OCR characters placed in the user prompt; anything past
# this point in the document is not sent to the model.
_MAX_DOC_CHARS = 16000

_COMPLIANCE_SYSTEM_PROMPT = """
You are a strict procurement compliance auditor with dual responsibilities:
1. **EXTRACTION**: Extract specific fields from vendor documents with zero hallucination
2. **COMPLIANCE**: Perform strict point-by-point compliance checking against manual rules
MANDATORY INSTRUCTIONS (APPLY TO BOTH TASKS):
1. Do NOT assume anything beyond explicitly written text
2. Do NOT interpret or infer missing information
3. Missing information = null (extraction) OR NON-COMPLIANT (rules)
4. Partial matches = NON-COMPLIANT
5. Only explicit written evidence is valid
6. Quote exact sentences from document as evidence
OUTPUT ONLY valid JSON. No explanations, commentary, or additional text.
EXACT JSON SCHEMA:
{
"extraction": {
"items": [
{
"item_description": "string or null",
"rate": "string or null",
"qty": "string or null",
"value": "string or null"
}
],
"Value of the offer": "string or null",
"Company name":"string or null",
"Contact person":"string or null",
"Contact person email id":"string or null",
"Subject of the offer":"string or null",
"offer reference Number":"string or null",
"offer date":"string or null",
"Freight":"string or null",
"Transit Insurance":"string or null",
"Dispatch point": "string or null",
"mode of dispatch": "string or null",
"Weight and dimension of item": "string or null",
"Incoterm": "string or null",
"Packing & forwarding": "string or null",
"Cost of transportation/delivery from Ex work to the nearest port": "string or null",
"Charges outside India": "string or null",
"Third party Inspection or TPI": "string or null",
"Currency of quoted price": "string or null",
"Charges within India": "string or null",
"Payment terms": "string or null",
"Delivery period": "string or null",
"OFFER VALIDITY": "string or null",
"PRICE REDUCTION CLAUSE FOR DELAYED DELIVERY (LD)": "string or null",
"GUARANTEE / WARRANTEE": "string or null",
"interchangeability certificate": "string or null",
"test report / inspection report": "string or null",
"Certificate of origin": "string or null"
},
"compliance_check": [
{
"rule_heading": "Exact heading from manual",
"status": "COMPLIANT or NON-COMPLIANT",
"as_per_vendor": "Exact quoted sentence OR Not found in document"
}
]
}
"""


def _ocr_pdf_text(pdf_path: str) -> str:
    """Render every page of the PDF at 300 DPI and return the concatenated OCR text."""
    pages = convert_from_path(pdf_path, dpi=300)
    # Each page's text is followed by a newline, joined in a single pass.
    return "".join(pytesseract.image_to_string(page) + "\n" for page in pages)


def _to_bootstrap_html(df, table_id: str) -> str:
    """Render a DataFrame as a Bootstrap-styled HTML table with the given id."""
    # Cell values originate from a vendor document via the model, so they are
    # HTML-escaped rather than injected verbatim into the page.
    return df.to_html(
        index=False,
        escape=True,
        classes=_TABLE_CLASSES,
        table_id=table_id,
    )


def compliance_import_OEM(manabfile: str, client):
    """OCR a vendor offer PDF, then extract fields and check rule compliance.

    The PDF is rasterised and OCR'd, the text is normalized with
    ``normalize_text``, and its first ``_MAX_DOC_CHARS`` characters are sent,
    together with the module-level ``manual_rules``, to the ``gpt-4o-mini``
    chat model in JSON mode (temperature 0, at most 3000 completion tokens).

    Args:
        manabfile: Path to the vendor offer PDF.
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client is imported by this module).

    Returns:
        A 7-tuple:
            0. HTML table of the extracted line items (id ``extraction-items-table``),
            1. HTML Field/Value table of the remaining extracted fields
               (id ``extraction-table``),
            2. HTML table of the compliance results (id ``compliance-table``),
            3. prompt token count,
            4. completion token count,
            5. total token count,
            6. the parsed JSON response as a dict, e.g.
               ``result["extraction"]["Dispatch point"]`` or
               ``result["compliance_check"][0]["rule_heading"]``.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON (for
            example when it was cut off by the completion-token limit).
    """
    # NOTE: text is obtained via OCR; PyMuPDFLoader (imported by this module)
    # is not used here.
    doc_text = normalize_text(_ocr_pdf_text(manabfile))

    user_prompt = f"""
Document content (complete extracted text from vendor offer):
{doc_text[:_MAX_DOC_CHARS]}
MANUAL RULES (check compliance against each point exactly):
{manual_rules}
INSTRUCTIONS:
1. FIRST extract all specified fields from document (use null if missing)
2. SECOND check each manual rule point-by-point
3. For each rule: quote exact evidence OR "Not found in document"
4. Output ONLY the exact JSON schema from system prompt
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": _COMPLIANCE_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,
        max_tokens=3000,
        response_format={"type": "json_object"},
    )

    input_tokens = response.usage.prompt_tokens
    output_tokens = response.usage.completion_tokens
    total_tokens = response.usage.total_tokens

    result_dic = json.loads(response.choices[0].message.content)

    # Tolerate a reply that omits a section: render an empty table instead of
    # raising KeyError after the API call has already been paid for.
    extraction = result_dic.get('extraction') or {}
    items = extraction.get('items') or []
    compliance = result_dic.get('compliance_check') or []

    items_df = pd.DataFrame(items)
    # 'items' has its own table above; keeping it here would print the raw
    # list-of-dicts repr as a single cell.
    extraction_df = pd.DataFrame(
        [(field, value) for field, value in extraction.items() if field != 'items'],
        columns=['Field', 'Value'],
    )
    compliance_df = pd.DataFrame(compliance)

    # Each table gets a distinct id so the combined page has no duplicate ids.
    extraction_item_value_html = _to_bootstrap_html(items_df, 'extraction-items-table')
    extraction_html = _to_bootstrap_html(extraction_df, 'extraction-table')
    compliance_html = _to_bootstrap_html(compliance_df, 'compliance-table')

    return (
        extraction_item_value_html,
        extraction_html,
        compliance_html,
        input_tokens,
        output_tokens,
        total_tokens,
        result_dic,
    )