NRLCommercialAI-dev

Running

App Files Files Community

NRLCommercialAI-dev / manabCQgenetaion.py

manabb

Update manabCQgenetaion.py

5d69f2a verified about 2 months ago

raw

history blame contribute delete

6.96 kB

	#manabCQgenetaion.py

	import json
	from openai import OpenAI
	from langchain_community.document_loaders import PyMuPDFLoader # pip install pymupdf
	import os
	import re
	import pandas as pd

	from langchain_core.documents import Document # Correct current import
	from pdf2image import convert_from_path
	import pytesseract


	def normalize_text(s: str) -> str:
	    """Normalize whitespace and newlines in extracted page text.

	    Steps, in order:
	      1. Convert Windows (CRLF) and old-Mac (CR) line endings to LF.
	      2. Replace every tab with a single space.
	      3. Collapse runs of three or more newlines down to exactly two.
	      4. Collapse every run of spaces / non-breaking spaces (U+00A0) to a
	         single regular space.
	      5. Strip leading and trailing whitespace.

	    Args:
	        s: Raw text, e.g. OCR output or a PDF page's text content.

	    Returns:
	        The cleaned text.
	    """
	    unified = s.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " ")

	    # Keep at most one blank line between paragraphs.
	    unified = re.sub(r"\n{3,}", "\n\n", unified)

	    # `+` rather than `{2,}`: a lone NBSP must also become a plain space,
	    # otherwise "a\u00A0b" stays untouched while "a\u00A0\u00A0b" is rewritten.
	    unified = re.sub(r"[ \u00A0]+", " ", unified)

	    return unified.strip()
	def NRLimportRules1():
	    """Return the manual NRL import rules used for the compliance check.

	    The text is a numbered list (one rule per line). It is a plain string
	    literal, not an f-string: it has no placeholders, and a plain literal
	    keeps any braces that may later appear in a rule from being parsed as
	    format fields.

	    Returns:
	        The rules as a single multi-line string.
	    """
	    rules_text = """
	1. Main rule: All other rules of shall be as per General Purchase Condition (GPC) of NRL
	2. Firm offer: The Quoted price shall remain firm and fixed till complete execution of the order.

	"""
	    return rules_text


	# Module-level copy of the rules, built once at import time.
	manual_rules = NRLimportRules1()
	# System prompt: fixes the auditor role and the exact JSON schema the model must return.
	_SYSTEM_PROMPT = """
	You are a strict procurement compliance auditor with dual responsibilities:

	1. EXTRACTION: Extract specific fields from vendor documents with zero hallucination
	2. COMPLIANCE: Perform strict point-by-point compliance checking against manual rules

	MANDATORY INSTRUCTIONS (APPLY TO BOTH TASKS):
	1. Do NOT assume anything beyond explicitly written text
	2. Do NOT interpret or infer missing information
	3. Missing information = null (extraction) OR NON-COMPLIANT (rules)
	4. Partial matches = NON-COMPLIANT
	5. Only explicit written evidence is valid
	6. Quote exact sentences from document as evidence

	OUTPUT ONLY valid JSON. No explanations, commentary, or additional text.

	EXACT JSON SCHEMA:
	{
	"extraction": {
	"items": [
	{
	"item_description": "string or null",
	"rate": "string or null",
	"qty": "string or null",
	"value": "string or null"
	}
	],
	"Value of the offer": "string or null",
	"Company name":"string or null",
	"Contact person":"string or null",
	"Contact person email id":"string or null",
	"Subject of the offer":"string or null",
	"offer reference Number":"string or null",
	"offer date":"string or null",
	"Freight":"string or null",
	"Transit Insurance":"string or null",
	"Dispatch point": "string or null",
	"mode of dispatch": "string or null",
	"Weight and dimension of item": "string or null",
	"Incoterm": "string or null",
	"Packing & forwarding": "string or null",
	"Cost of transportation/delivery from Ex work to the nearest port": "string or null",
	"Charges outside India": "string or null",
	"Third party Inspection or TPI": "string or null",
	"Currency of quoted price": "string or null",
	"Charges within India": "string or null",
	"Payment terms": "string or null",
	"Delivery period": "string or null",
	"OFFER VALIDITY": "string or null",
	"PRICE REDUCTION CLAUSE FOR DELAYED DELIVERY (LD)": "string or null",
	"GUARANTEE / WARRANTEE": "string or null",
	"interchangeability certificate": "string or null",
	"test report / inspection report": "string or null",
	"Certificate of origin": "string or null"
	},
	"compliance_check": [
	{
	"rule_heading": "Exact heading from manual",
	"status": "COMPLIANT or NON-COMPLIANT",
	"as_per_vendor": "Exact quoted sentence OR Not found in document"
	}
	]
	}
	"""

	# Only this many leading characters of the OCR text are sent to the model.
	_MAX_DOC_CHARS = 16000

	# Bootstrap classes applied to every generated table.
	_TABLE_CLASSES = 'table table-striped table-bordered table-hover'


	def _ocr_pdf_text(pdf_path: str) -> str:
	    """Rasterize the PDF at 300 dpi, OCR every page, return the normalized text."""
	    page_images = convert_from_path(pdf_path, dpi=300)
	    # One join instead of repeated `+=`; each page's text is still followed
	    # by a newline, exactly as before.
	    raw_text = "".join(
	        pytesseract.image_to_string(image) + "\n" for image in page_images
	    )
	    return normalize_text(raw_text)


	def _table_html(frame: pd.DataFrame, table_id: str) -> str:
	    """Render a DataFrame as a Bootstrap-styled HTML table with escaped cells."""
	    # escape=True: cell text comes from an OCR'd vendor document via the
	    # model, so it is untrusted and must not be emitted as raw HTML.
	    return frame.to_html(
	        index=False,
	        escape=True,
	        classes=_TABLE_CLASSES,
	        table_id=table_id,
	    )


	def compliance_import_OEM(manabfile: str, client):
	    """Extract offer fields from a vendor PDF and check it against the manual rules.

	    The PDF is OCR'd page by page (pdf2image + pytesseract), the first
	    16000 characters of the normalized text are sent together with the
	    module-level ``manual_rules`` to the ``gpt-4o-mini`` chat model, and
	    the JSON reply is turned into three HTML tables.

	    Args:
	        manabfile: Path of the PDF, passed to ``pdf2image.convert_from_path``.
	        client: Object exposing ``chat.completions.create`` (presumably an
	            ``openai.OpenAI`` instance; verify against the caller).

	    Returns:
	        A 7-tuple ``(items_html, extraction_html, compliance_html,
	        input_tokens, output_tokens, total_tokens, result_dic)`` where the
	        first three are HTML ``<table>`` strings, the token counts are read
	        from ``response.usage``, and ``result_dic`` is the parsed JSON reply.

	    Raises:
	        json.JSONDecodeError: If the model reply is not valid JSON (for
	            example when it was cut off by the ``max_tokens`` limit).
	    """
	    doc_text = _ocr_pdf_text(manabfile)

	    user_prompt = f"""
	Document content (complete extracted text from vendor offer):
	{doc_text[:_MAX_DOC_CHARS]}
	MANUAL RULES (check compliance against each point exactly):
	{manual_rules}
	INSTRUCTIONS:
	1. FIRST extract all specified fields from document (use null if missing)
	2. SECOND check each manual rule point-by-point
	3. For each rule: quote exact evidence OR "Not found in document"
	4. Output ONLY the exact JSON schema from system prompt
	"""

	    response = client.chat.completions.create(
	        model="gpt-4o-mini",
	        messages=[
	            {"role": "system", "content": _SYSTEM_PROMPT},
	            {"role": "user", "content": user_prompt},
	        ],
	        temperature=0.0,
	        max_tokens=3000,
	        response_format={"type": "json_object"},
	    )
	    usage = response.usage
	    input_tokens = usage.prompt_tokens
	    output_tokens = usage.completion_tokens
	    total_tokens = usage.total_tokens

	    result_dic = json.loads(response.choices[0].message.content)

	    # The model may omit a section or return null for it; render an empty
	    # table in that case instead of failing with KeyError.
	    extraction = result_dic.get('extraction') or {}
	    line_items = extraction.get('items') or []
	    compliance_rows = result_dic.get('compliance_check') or []

	    items_df = pd.DataFrame(line_items)
	    # One row per extraction key. This includes the 'items' key itself,
	    # as in the original output.
	    extraction_df = pd.DataFrame(
	        list(extraction.items()), columns=['Field', 'Value']
	    )
	    compliance_df = pd.DataFrame(compliance_rows)

	    # NOTE(review): the items table and the field table share the id
	    # 'extraction-table'. Kept as-is because external CSS/JS may target it,
	    # but duplicate ids are invalid if both land on one page; confirm.
	    extraction_item_value_html = _table_html(items_df, 'extraction-table')
	    extraction_html = _table_html(extraction_df, 'extraction-table')
	    compliance_html = _table_html(compliance_df, 'compliance-table')

	    return (
	        extraction_item_value_html,
	        extraction_html,
	        compliance_html,
	        input_tokens,
	        output_tokens,
	        total_tokens,
	        result_dic,
	    )