MakPr016 committed on
Commit
a77318b
·
1 Parent(s): d11a562

Dynamic Parser

Browse files
Files changed (5) hide show
  1. .gitignore +17 -0
  2. Dockerfile +16 -0
  3. main.py +41 -0
  4. requirements.txt +6 -0
  5. rfq_parser.py +313 -0
.gitignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.pyo
4
+ *.pyd
5
+ .venv/
6
+ venv/
7
+ .env
8
+ .env.*
9
+ .python-version
10
+ .DS_Store
11
+ .pytest_cache/
12
+ .mypy_cache/
13
+ .ruff_cache/
14
+ .ipynb_checkpoints/
15
+ build/
16
+ dist/
17
+ *.egg-info/
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PORT=7860

# Install dependencies first so this layer is reused when only source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

# A shell is needed to expand ${PORT}; `exec` then replaces that shell with
# uvicorn so the server is PID 1 and receives SIGTERM directly on shutdown.
CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
main.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from fastapi import FastAPI, UploadFile, File, HTTPException
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ import uvicorn
6
+ from rfq_parser import parse_rfq_pdf
7
+
8
# Pull settings from a .env file (when one exists) before the environment is read.
load_dotenv()

app = FastAPI(title="Dynamic RFQ Parser")

# The API is open to every origin and none of the routes in this file use
# cookies or HTTP authentication, so credentialed cross-origin requests are
# switched off: a wildcard origin must not be combined with
# allow_credentials=True (see the FastAPI CORS documentation).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
19
+
20
@app.post("/parse-rfq")
async def parse_rfq(file: UploadFile = File(...)):
    """Parse an uploaded RFQ PDF and return the structured result.

    Raises:
        HTTPException(400): the upload is not named ``*.pdf`` or is empty.
        HTTPException(500): ``GOOGLE_API_KEY`` is not set, or parsing failed
            (the underlying error message is returned as the detail).
    """
    # Local import: only this handler needs the thread-pool helper.
    from fastapi.concurrency import run_in_threadpool

    # The filename is optional on an upload and its extension may be
    # upper-case, so normalise it before checking.
    filename = (file.filename or "").lower()
    if not filename.endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    if not os.getenv("GOOGLE_API_KEY"):
        raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not configured")

    contents = await file.read()
    if not contents:
        # An empty body is a client error, not a parser failure.
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    try:
        # parse_rfq_pdf is a plain synchronous function; running it in a
        # worker thread keeps the event loop free for other requests.
        return await run_in_threadpool(parse_rfq_pdf, contents)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
34
+
35
@app.get("/health")
def health():
    """Liveness probe: always answers with an "ok" status."""
    payload = {"status": "ok"}
    return payload
38
+
39
if __name__ == "__main__":
    # Direct execution: serve on all interfaces, port taken from $PORT
    # (falls back to 7860).
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=int(os.environ.get("PORT", "7860")),
    )
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ python-multipart
4
+ pdfplumber
5
+ python-dotenv
6
+ google-genai
rfq_parser.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import os
3
+ import json
4
+ import io
5
+ import re
6
+ from google import genai
7
+ from google.genai import types
8
+
9
# Model name passed to every generate_content call in this module.
GEMINI_MODEL = "gemini-2.5-pro"

# Shared Gemini client; stays None until _get_genai_client() creates it.
_client = None

# Lower-case fragments that mark a cell as unfilled form boilerplate.
PLACEHOLDER_PATTERNS = [
    "click or tap",
    "click here",
    "enter text",
    "type here",
]

# Lower-cased descriptions that are totals/charges/header captions rather
# than real line items; rows whose description equals one of these are skipped.
SKIP_DESCS = {
    "total",
    "subtotal",
    "grand total",
    "amount",
    "description",
    "item description",
    "transportation price",
    "insurance price",
    "installation price",
    "training price",
    "other charges (specify)",
    "other charges",
    "total price",
    "total final and all-inclusive price",
}

# Header-cell matchers; they are applied to text that is already lower-cased.
DESC_RE = re.compile(r'(description|specifications|commodity|item\s*name|item\s*desc)')
QTY_RE = re.compile(r'(qty|quant|quantity|total\s*qty|total\s*quantity)')
SR_RE = re.compile(r'\b(sr|item\s*no|pos\.?)\b|^no\.?$')
UNIT_RE = re.compile(r'(unit|uom|pack\s*size|measure)')
25
+
26
+
27
def _get_genai_client():
    """Return the module-wide Gemini client, building it on the first call.

    Raises:
        ValueError: if GOOGLE_API_KEY is unset or empty when the client
            has to be created.
    """
    global _client
    if _client is not None:
        return _client

    key = os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise ValueError("GOOGLE_API_KEY is not configured")

    _client = genai.Client(api_key=key)
    return _client
35
+
36
+
37
def _clean(cell):
    """Return a table cell as single-line, trimmed text ("" for any falsy cell)."""
    if not cell:
        return ""
    text = str(cell)
    return text.replace("\n", " ").strip()
39
+
40
+
41
def _is_placeholder(text):
    """Tell whether text contains any PLACEHOLDER_PATTERNS fragment (case-insensitive)."""
    lowered = text.lower()
    for fragment in PLACEHOLDER_PATTERNS:
        if fragment in lowered:
            return True
    return False
44
+
45
+
46
# First number in a cell: either digit groups separated by "," or a single
# space ("1,000", "1 000"), or a plain integer/decimal, or a bare ".5".
_QTY_TOKEN_RE = re.compile(
    r"\d{1,3}(?:[, ]\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?|\.\d+"
)


def _parse_qty(s):
    """Parse the quantity at the start of a cell's text.

    Only the first numeric token is used, so trailing numbers or punctuation
    ("10 pcs (2 boxes)", "12.5 kg.") no longer get merged into the value.

    Args:
        s: cell text; None is treated like an empty string.

    Returns:
        An int when the value is whole, otherwise a float; 0 when the text
        contains no number.
    """
    match = _QTY_TOKEN_RE.search(s or "")
    if match is None:
        return 0
    # Drop thousands separators before converting.
    value = float(match.group().replace(",", "").replace(" ", ""))
    return int(value) if value.is_integer() else value
55
+
56
+
57
def _detect_header(table):
    """Find the header row of a line-item table within its first six rows.

    A row qualifies when its combined text mentions a description column and
    either a quantity or a unit column, and at least one individual cell
    matches the description pattern.

    Returns:
        (row_index, column_map, cell_count) for the first qualifying row,
        where column_map maps "sr"/"desc"/"unit"/"qty" to a column index or
        -1; (-1, None, 0) when no row qualifies.
    """
    # Tried in this order for each cell; a role that is already taken is
    # skipped so the cell can still claim a later role.
    matchers = (
        ("sr", SR_RE),
        ("desc", DESC_RE),
        ("qty", QTY_RE),
        ("unit", UNIT_RE),
    )

    for row_no, row in enumerate(table[:6]):
        headers = [_clean(cell).lower() for cell in row]
        joined = " ".join(headers)

        mentions_desc = DESC_RE.search(joined)
        mentions_amount = QTY_RE.search(joined) or UNIT_RE.search(joined)
        if not mentions_desc or not mentions_amount:
            continue

        columns = dict.fromkeys(("sr", "desc", "unit", "qty"), -1)
        for col_no, header in enumerate(headers):
            if not header:
                continue
            for role, pattern in matchers:
                if columns[role] == -1 and pattern.search(header):
                    columns[role] = col_no
                    break

        if columns["desc"] != -1:
            return row_no, columns, len(row)

    return -1, None, 0
78
+
79
+
80
def _remap_by_data_row(idx_map, table, header_idx):
    """Adjust the header column map to where data rows actually hold values.

    The first row after the header that has any non-None cell is used as a
    sample. When that row contains None cells, roles are assigned by
    position among the present cells: first -> sr, second -> desc,
    last -> qty, second-to-last -> unit.

    The header-derived map is returned unchanged when there is no sample
    row, when the sample has no None cells (data lines up with the header),
    or when fewer than two cells are present.

    Args:
        idx_map: column map produced from the header row.
        table: full table as a list of rows.
        header_idx: index of the header row inside table.

    Returns:
        A dict with "sr", "desc", "unit", "qty" column indices (-1 = absent).
    """
    sample = next(
        (row for row in table[header_idx + 1:] if any(c is not None for c in row)),
        None,
    )
    if not sample:
        return idx_map

    present = [i for i, cell in enumerate(sample) if cell is not None]
    if len(present) < 2 or len(present) == len(sample):
        return idx_map

    # Each role needs its own column: with three present cells there is no
    # room for a unit, and with two there is no room for a quantity either.
    return {
        "sr": present[0],
        "desc": present[1],
        "unit": present[-2] if len(present) > 3 else -1,
        "qty": present[-1] if len(present) > 2 else -1,
    }
99
+
100
+
101
def _looks_like_item_continuation(table):
    """Guess whether a header-less table continues a list of numbered items.

    Looks at the first eight rows and returns True when at least two of them
    start with a number (optionally followed by ".") and have a second
    non-empty cell longer than three characters.
    """
    def is_item_row(row):
        filled = [text for text in (_clean(c) for c in row if c is not None) if text]
        if len(filled) < 2:
            return False
        return bool(re.match(r'^\d+\.?$', filled[0])) and len(filled[1]) > 3

    matching = sum(1 for row in table[:8] if is_item_row(row))
    return matching >= 2
108
+
109
+
110
def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
    """Turn table rows into line-item dicts, appending them to items in place.

    Args:
        rows: table rows to process.
        idx_map: column indices for "sr", "desc", "unit", "qty" (-1 = absent).
        num_cols: every row is padded/truncated to this many cells.
        seen_srs: set of serial numbers (or descriptions, when a row has no
            serial) already emitted; updated in place to drop duplicates.
        items: output list; updated in place.
    """
    def mapped(cells, role):
        # Cell text for a role, or None when the role has no usable column.
        pos = idx_map[role]
        if pos == -1 or pos >= len(cells):
            return None
        return cells[pos]

    for raw_row in rows:
        cells = [_clean(cell) for cell in raw_row]
        cells = (cells + [""] * num_cols)[:num_cols]

        # Skip blank rows and rows that still contain form placeholder text.
        if not any(cells) or any(_is_placeholder(cell) for cell in cells):
            continue

        # Serial number: mapped column first, then a leading "12" / "12." cell.
        serial = None
        sr_text = mapped(cells, "sr")
        if sr_text is not None:
            digits = re.search(r'\d+', sr_text)
            if digits:
                serial = int(digits.group())
        if serial is None:
            filled = [cell for cell in cells if cell]
            if filled and re.match(r'^\d+\.?$', filled[0]):
                serial = int(re.sub(r'\D', '', filled[0]))

        # Description: mapped column first, then the first non-numeric cell.
        desc = mapped(cells, "desc") or ""
        if not desc:
            desc = next(
                (
                    cell for cell in cells
                    if cell
                    and not re.match(r'^[\d.,]+$', cell)
                    and not _is_placeholder(cell)
                ),
                "",
            )
        desc = desc.strip()
        if len(desc) < 3 or desc.lower() in SKIP_DESCS or _is_placeholder(desc):
            continue

        unit = mapped(cells, "unit") or ""
        qty_text = mapped(cells, "qty")
        qty = _parse_qty(qty_text) if qty_text is not None else 0

        dedupe_key = desc if serial is None else serial
        if dedupe_key in seen_srs:
            continue
        seen_srs.add(dedupe_key)

        items.append({
            "sr": len(items) + 1 if serial is None else serial,
            "description": desc,
            "unit": unit,
            "qty": qty,
            "unit_price": None,
            "total_price": None,
            "brand": "",
            "expiry_date": "",
            "remarks": "",
        })
167
+
168
+
169
def extract_line_items(pdf_bytes):
    """Extract line items from every table in a PDF.

    Tables with a recognisable header are parsed with the column map derived
    from that header. A later header-less table that looks like a run of
    numbered items is treated as a continuation of the most recent headed
    table.

    Args:
        pdf_bytes: raw bytes of the PDF document.

    Returns:
        A list of line-item dicts in the order they were found.
    """
    items = []
    seen_srs = set()
    active_schema = None  # column layout of the most recent headed table

    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables() or []:
                if len(table) < 2:
                    continue

                h_idx, idx_map, num_cols = _detect_header(table)
                if h_idx != -1 and idx_map and idx_map["desc"] != -1:
                    remapped = _remap_by_data_row(idx_map, table, h_idx)
                    active_schema = {"idx": remapped, "num_cols": num_cols}
                    _extract_rows(table[h_idx + 1:], remapped, num_cols, seen_srs, items)
                    continue

                if not (active_schema and _looks_like_item_continuation(table)):
                    continue

                actual_cols = max(len(r) for r in table)
                sample = next((r for r in table if any(c is not None for c in r)), None)
                none_ratio = (
                    sum(1 for c in sample if c is None) / len(sample) if sample else 0.0
                )

                if none_ratio > 0.4:
                    # Many None cells: assign roles by position among the
                    # cells that are present, keeping each role on its own
                    # column.
                    present = [i for i, c in enumerate(sample) if c is not None]
                    remapped = {
                        "sr": present[0] if len(present) > 0 else -1,
                        "desc": present[1] if len(present) > 1 else -1,
                        "unit": present[-2] if len(present) > 3 else -1,
                        "qty": present[-1] if len(present) > 2 else -1,
                    }
                elif actual_cols == active_schema["num_cols"]:
                    # Same width as the headed table: reuse its column layout
                    # rather than guessing.
                    remapped = active_schema["idx"]
                else:
                    remapped = {"sr": 0, "desc": 1, "unit": 2, "qty": 3}

                _extract_rows(table, remapped, actual_cols, seen_srs, items)

    return items
211
+
212
+
213
def _extract_line_items_from_llm(full_text):
    """Ask the Gemini model for line items when table extraction found none.

    Only the first 30000 characters of full_text are sent. This is a
    best-effort fallback: any failure is logged and yields an empty list.

    Args:
        full_text: text extracted from the RFQ document.

    Returns:
        A list of dicts as returned by the model (entries that are not JSON
        objects are dropped), or [] on failure or when the reply is not a
        JSON array.
    """
    import logging

    system_prompt = (
        "You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
        "Return a JSON array only. Each object must have exactly these keys: "
        '{"sr": integer, "description": "string", "unit": "string or empty string", "qty": number or 0, '
        '"unit_price": null, "total_price": null, "brand": "", "expiry_date": "", "remarks": ""}. '
        "If no line items are found, return []. RETURN JSON ARRAY ONLY, no markdown, no preamble."
    )
    try:
        client = _get_genai_client()
        response = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=full_text[:30000],
            config=types.GenerateContentConfig(
                system_instruction=system_prompt,
                response_mime_type="application/json",
                temperature=0,
            ),
        )
        result = json.loads(response.text)
    except Exception:
        # Keep the fallback non-fatal, but leave a trace of what went wrong.
        logging.getLogger(__name__).exception("LLM line-item extraction failed")
        return []

    if not isinstance(result, list):
        return []
    # Callers treat every entry as a dict; discard anything else.
    return [entry for entry in result if isinstance(entry, dict)]
238
+
239
+
240
def parse_rfq_pdf(pdf_bytes):
    """Parse an RFQ PDF into form metadata plus line items.

    Text is read from every page of documents with at most 10 pages,
    otherwise from the first five and last five pages. The first 30000
    characters are sent to the Gemini model to obtain the title, description,
    sections and fields. Line items come from table extraction, with an LLM
    fallback when no table yields a valid item.

    Args:
        pdf_bytes: raw bytes of the PDF document.

    Returns:
        A dict with keys "title", "description", "sections", "line_items"
        and "fields". When the model call fails or does not return a JSON
        object, the title is "Error Parsing" and the other model-derived
        values are empty.
    """
    import logging

    page_texts = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        total_pages = len(pdf.pages)
        if total_pages <= 10:
            pages_to_read = range(total_pages)
        else:
            pages_to_read = [*range(5), *range(total_pages - 5, total_pages)]
        for p_idx in pages_to_read:
            text = pdf.pages[p_idx].extract_text()
            if text:
                page_texts.append(f"\n--- Page {p_idx + 1} ---\n{text}")
    full_text = "".join(page_texts)

    system_prompt = """You are an expert RFQ Parser. Extract data from the RFQ text into the exact JSON structure below.

JSON OUTPUT STRUCTURE:
{
"title": "string",
"description": "string",
"sections": [
"Quotation Submission",
"Vendor Information",
"Declaration of Conformity",
"Schedule of Requirements",
"Technical & Financial Offer",
"Compliance & Delivery"
],
"fields": [
{
"id": "snake_case_id",
"label": "Human Readable Label",
"type": "file" | "text" | "number" | "date" | "dropdown" | "checkbox" | "email" | "phone" | "textarea",
"section": "Quotation Submission" | "Vendor Information" | "Declaration of Conformity" | "Schedule of Requirements" | "Technical & Financial Offer" | "Compliance & Delivery",
"required": boolean,
"default_value": null,
"placeholder": "Helpful hint",
"options": ["Option1", "Option2"],
"validation": {"min": null, "max": null, "pattern": null}
}
]
}
"""

    fallback = {"title": "Error Parsing", "description": "", "sections": [], "fields": []}
    try:
        client = _get_genai_client()
        response = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=full_text[:30000],
            config=types.GenerateContentConfig(
                system_instruction=system_prompt + "\nRETURN JSON ONLY.",
                response_mime_type="application/json",
                temperature=0,
            ),
        )
        llm_data = json.loads(response.text)
    except Exception:
        # Metadata is best-effort: log the cause and continue with line items.
        logging.getLogger(__name__).exception("LLM metadata extraction failed")
        llm_data = fallback

    if not isinstance(llm_data, dict):
        # Valid JSON that is not an object (array, null, ...) would make the
        # .get() calls below raise.
        llm_data = fallback

    line_items = extract_line_items(pdf_bytes)

    valid_items = [
        item for item in line_items
        if item.get("description") and not _is_placeholder(item["description"])
    ]

    if not valid_items:
        valid_items = _extract_line_items_from_llm(full_text)

    return {
        "title": llm_data.get("title", "RFQ Document"),
        "description": llm_data.get("description", ""),
        "sections": llm_data.get("sections", []),
        "line_items": valid_items,
        "fields": llm_data.get("fields", []),
    }