Spaces:
Running
Running
| #manabCQgenetaion.py | |
| import json | |
| from openai import OpenAI | |
| from langchain_community.document_loaders import PyMuPDFLoader # pip install pymupdf[web:42] | |
| import os | |
| import re | |
| import pandas as pd | |
| from langchain_core.documents import Document # Correct current import | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
def normalize_text(s: str) -> str:
    """Return *s* with line endings unified and redundant whitespace collapsed.

    Steps, applied in this order:

    1. ``\\r\\n`` and bare ``\\r`` become ``\\n``; each tab becomes one space.
    2. Runs of three or more newlines shrink to exactly two.
    3. Runs of two or more spaces / non-breaking spaces shrink to one space.
    4. Leading and trailing whitespace is stripped.
    """
    # Plain substring swaps first; "\r\n" must be handled before the lone "\r"
    # so a Windows line ending does not turn into two newlines.
    for old, new in (("\r\n", "\n"), ("\r", "\n"), ("\t", " ")):
        s = s.replace(old, new)

    # Regex passes: cap blank-line runs, then squeeze horizontal whitespace.
    for pattern, replacement in ((r"\n{3,}", "\n\n"), (r"[ \u00A0]{2,}", " ")):
        s = re.sub(pattern, replacement, s)

    return s.strip()
def NRLimportRules1():
    """Return the numbered NRL import rules as one newline-delimited string.

    The text starts with a newline, lists one rule per line, and ends with a
    newline after the last rule.
    """
    rule_lines = (
        "1. Main rule: All other rules of shall be as per General Purchase Condition (GPC) of NRL",
        "2. Firm offer: The Quoted price shall remain firm and fixed till complete execution of the order.",
    )
    # Frame the rules with a newline on each side so the block begins and ends
    # on its own line.
    return "\n" + "\n".join(rule_lines) + "\n"
# Rule text built once when the module is imported and kept as a module-level value.
manual_rules = NRLimportRules1()
# System prompt sent with every request: sets the auditor persona, the
# strictness rules, and the exact JSON schema the model is asked to return.
_COMPLIANCE_SYSTEM_PROMPT = """
You are a strict procurement compliance auditor with dual responsibilities:
1. **EXTRACTION**: Extract specific fields from vendor documents with zero hallucination
2. **COMPLIANCE**: Perform strict point-by-point compliance checking against manual rules
MANDATORY INSTRUCTIONS (APPLY TO BOTH TASKS):
1. Do NOT assume anything beyond explicitly written text
2. Do NOT interpret or infer missing information
3. Missing information = null (extraction) OR NON-COMPLIANT (rules)
4. Partial matches = NON-COMPLIANT
5. Only explicit written evidence is valid
6. Quote exact sentences from document as evidence
OUTPUT ONLY valid JSON. No explanations, commentary, or additional text.
EXACT JSON SCHEMA:
{
"extraction": {
"items": [
{
"item_description": "string or null",
"rate": "string or null",
"qty": "string or null",
"value": "string or null"
}
],
"Value of the offer": "string or null",
"Company name":"string or null",
"Contact person":"string or null",
"Contact person email id":"string or null",
"Subject of the offer":"string or null",
"offer reference Number":"string or null",
"offer date":"string or null",
"Freight":"string or null",
"Transit Insurance":"string or null",
"Dispatch point": "string or null",
"mode of dispatch": "string or null",
"Weight and dimension of item": "string or null",
"Incoterm": "string or null",
"Packing & forwarding": "string or null",
"Cost of transportation/delivery from Ex work to the nearest port": "string or null",
"Charges outside India": "string or null",
"Third party Inspection or TPI": "string or null",
"Currency of quoted price": "string or null",
"Charges within India": "string or null",
"Payment terms": "string or null",
"Delivery period": "string or null",
"OFFER VALIDITY": "string or null",
"PRICE REDUCTION CLAUSE FOR DELAYED DELIVERY (LD)": "string or null",
"GUARANTEE / WARRANTEE": "string or null",
"interchangeability certificate": "string or null",
"test report / inspection report": "string or null",
"Certificate of origin": "string or null"
},
"compliance_check": [
{
"rule_heading": "Exact heading from manual",
"status": "COMPLIANT or NON-COMPLIANT",
"as_per_vendor": "Exact quoted sentence OR Not found in document"
}
]
}
"""


def _ocr_pdf_text(pdf_path: str) -> str:
    """Rasterize every page of *pdf_path* at 300 dpi and return the OCR text."""
    page_images = convert_from_path(pdf_path, dpi=300)
    # One join instead of repeated ``+=``; each page's text is followed by a
    # newline so pages never run into each other.
    return "".join(pytesseract.image_to_string(image) + "\n" for image in page_images)


def _bootstrap_table(frame, table_id: str) -> str:
    """Render *frame* as a Bootstrap-styled HTML table carrying *table_id*."""
    return frame.to_html(
        index=False,
        # Cell text comes from an OCR'd vendor document by way of the model,
        # so it is untrusted: let pandas escape <, > and & instead of emitting
        # them as raw markup.
        escape=True,
        classes="table table-striped table-bordered table-hover",
        table_id=table_id,
    )


def compliance_import_OEM(manabfile: str, client):
    """OCR a vendor offer PDF, ask the model to extract fields and check rules.

    The PDF is read through OCR (page images -> Tesseract), normalized with
    ``normalize_text``, and sent together with the module-level
    ``manual_rules`` to the ``gpt-4o-mini`` chat model, which is asked to
    reply with a single JSON object.

    Args:
        manabfile: Path of the PDF file to analyse.
        client: Object exposing ``chat.completions.create(...)`` in the style
            of the OpenAI client.

    Returns:
        A 7-tuple ``(items_html, fields_html, compliance_html, input_tokens,
        output_tokens, total_tokens, result_dic)`` where

        * ``items_html`` is the table of ``extraction["items"]`` rows
          (element id ``extraction-items-table``),
        * ``fields_html`` is a Field/Value table of every other
          ``extraction`` entry (element id ``extraction-table``),
        * ``compliance_html`` is the table of ``compliance_check`` rows
          (element id ``compliance-table``),
        * the three token counts are taken from ``response.usage``,
        * ``result_dic`` is the parsed JSON reply, unmodified.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    doc_text = normalize_text(_ocr_pdf_text(manabfile))

    # NOTE: only the first 16000 characters of the document are sent; anything
    # after that point is never seen by the model.
    user_prompt = f"""
Document content (complete extracted text from vendor offer):
{doc_text[:16000]}
MANUAL RULES (check compliance against each point exactly):
{manual_rules}
INSTRUCTIONS:
1. FIRST extract all specified fields from document (use null if missing)
2. SECOND check each manual rule point-by-point
3. For each rule: quote exact evidence OR "Not found in document"
4. Output ONLY the exact JSON schema from system prompt
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": _COMPLIANCE_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,
        # NOTE(review): a reply cut off at this limit would presumably be
        # incomplete JSON and fail in json.loads below - confirm the limit is
        # large enough for long offers.
        max_tokens=3000,
        response_format={"type": "json_object"},
    )

    usage = response.usage
    result_dic = json.loads(response.choices[0].message.content)

    # Treat a missing or null section as empty so a partial reply still yields
    # (empty) tables instead of raising KeyError.
    extraction = result_dic.get("extraction") or {}
    item_rows = extraction.get("items") or []
    # "items" already has its own table; leaving it in here would dump the
    # whole list of dicts into a single Value cell.
    field_rows = [(field, value) for field, value in extraction.items() if field != "items"]
    compliance_rows = result_dic.get("compliance_check") or []

    # Each table gets its own element id, so they can coexist on one page.
    extraction_item_value_html = _bootstrap_table(
        pd.DataFrame(item_rows), "extraction-items-table"
    )
    extraction_html = _bootstrap_table(
        pd.DataFrame(field_rows, columns=["Field", "Value"]), "extraction-table"
    )
    compliance_html = _bootstrap_table(pd.DataFrame(compliance_rows), "compliance-table")

    return (
        extraction_item_value_html,
        extraction_html,
        compliance_html,
        usage.prompt_tokens,
        usage.completion_tokens,
        usage.total_tokens,
        result_dic,
    )