manabb committed on
Commit
9d02537
·
verified ·
1 Parent(s): 2b4672b

Update technicalDocCompliance.py

Browse files
Files changed (1) hide show
  1. technicalDocCompliance.py +32 -17
technicalDocCompliance.py CHANGED
@@ -1,11 +1,37 @@
1
  #technicalDocCompliance.py
2
 
3
  from openai import OpenAI # Core import for client[web:30][web:32]
 
 
 
 
4
 
 
 
 
 
5
 
 
 
 
 
 
 
 
 
6
 
7
  def compliance_tech(file: str, client, MANUAL_RULES):
 
 
 
 
 
 
8
  PROMPT = f"""
 
 
 
 
9
  You are a strict procurement compliance auditor.
10
 
11
  Your task is to check whether the uploaded file FULLY complies against each heading of the MANUAL RULES.
@@ -32,25 +58,14 @@ def compliance_tech(file: str, client, MANUAL_RULES):
32
  {MANUAL_RULES}
33
  """
34
 
35
- with open(file, "rb") as f:
36
- uploaded_file = client.files.create(file=f, purpose="vision") # Fixed var name & method[web:27][web:34]
37
 
38
- response = client.chat.completions.create( # Fixed: chat.completions.create()[web:30][web:38]
39
  model="gpt-4o-mini",
40
- messages=[ # Fixed structure: messages list of dicts[web:38]
41
- {
42
- "role": "user",
43
- "content": [ # Fixed: content is list of dicts
44
- {"type": "text", "text": PROMPT}, # Fixed: "text" not "input_text"
45
- {
46
- "type": "input_image", # Fixed: "input_image" for vision/PDFs[web:27]
47
- "file_id": uploaded_file.id # Reference uploaded file ID
48
- }
49
- ]
50
- }
51
- ],
52
- temperature=0, # 👈 VERY IMPORTANT
53
- max_tokens=1200 # Fixed: max_tokens (not max_output_tokens)[web:38]
54
  )
55
 
56
  return response.choices[0].message.content # Fixed: access output text[web:32]
 
1
  #technicalDocCompliance.py
2
 
3
  from openai import OpenAI # Core import for client[web:30][web:32]
4
+ from openai import OpenAI
5
+ from langchain_community.document_loaders import PyMuPDFLoader # pip install pymupdf[web:42]
6
+ import os
7
+ import re
8
 
9
def normalize_text(s: str) -> str:
    """Normalize whitespace and newlines in extracted page text.

    Steps, in order:
      1. Convert Windows (CRLF) and old Mac (CR) line endings to LF.
      2. Turn tabs and non-breaking spaces into regular spaces.
      3. Collapse runs of spaces into a single space.
      4. Drop trailing spaces at the end of each line.
      5. Collapse three or more consecutive newlines into two.
      6. Strip leading and trailing whitespace from the whole string.

    Args:
        s: Raw text to clean up.

    Returns:
        The normalized text.
    """
    # Unify line endings first so the later newline rules see only "\n".
    s = s.replace("\r\n", "\n").replace("\r", "\n")

    # Treat tabs and non-breaking spaces as ordinary spaces, including
    # isolated ones, so the result never mixes space characters.
    s = s.replace("\t", " ").replace("\u00A0", " ")

    # Multiple spaces -> one.
    s = re.sub(r" {2,}", " ", s)

    # Remove trailing spaces before a newline; otherwise "blank" lines that
    # hold only spaces would prevent the newline collapse below.
    s = re.sub(r" +\n", "\n", s)

    # Collapse 3+ newlines to 2 (at most one empty line between paragraphs).
    s = re.sub(r"\n{3,}", "\n\n", s)

    return s.strip()
22
 
23
  def compliance_tech(file: str, client, MANUAL_RULES):
24
+ # Extract full PDF text (handles layout/tables well)
25
+ loader = PyMuPDFLoader(file)
26
+ docs = loader.load()
27
+ for d in docs:
28
+ d.page_content = normalize_text(d.page_content)
29
+ doc_text = "\n\n".join(doc.page_content for doc in docs) # Flatten to string[cite:5]
30
  PROMPT = f"""
31
+ Document content (complete extracted text):
32
+
33
+ {doc_text[:16000]} # Truncate if needed for token limits
34
+
35
  You are a strict procurement compliance auditor.
36
 
37
  Your task is to check whether the uploaded file FULLY complies against each heading of the MANUAL RULES.
 
58
  {MANUAL_RULES}
59
  """
60
 
61
+ #with open(file, "rb") as f:
62
+ #uploaded_file = client.files.create(file=f, purpose="vision") # Fixed var name & method[web:27][web:34]
63
 
64
+ response = client.chat.completions.create(
65
  model="gpt-4o-mini",
66
+ messages=[{"role": "user", "content": PROMPT}],
67
+ temperature=0,
68
+ max_tokens=1200
 
 
 
 
 
 
 
 
 
 
 
69
  )
70
 
71
  return response.choices[0].message.content # Fixed: access output text[web:32]