Spaces:
Sleeping
Sleeping
MakPr016 committed on
Commit ·
a3f9e7d
1
Parent(s): 265e719
Updated
Browse files- main.py +8 -5
- rfq_parser.py +191 -20
main.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
-
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 4 |
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
import uvicorn
|
| 6 |
from rfq_parser import parse_rfq_pdf
|
|
@@ -18,16 +18,19 @@ app.add_middleware(
|
|
| 18 |
)
|
| 19 |
|
| 20 |
@app.post("/parse-rfq")
|
| 21 |
-
async def parse_rfq(
|
|
|
|
|
|
|
|
|
|
| 22 |
if not file.filename.endswith(".pdf"):
|
| 23 |
raise HTTPException(status_code=400, detail="Only PDF files are supported")
|
| 24 |
-
|
| 25 |
-
if not os.getenv("GOOGLE_API_KEY"):
|
| 26 |
raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not configured")
|
| 27 |
|
| 28 |
contents = await file.read()
|
| 29 |
try:
|
| 30 |
-
result = parse_rfq_pdf(contents)
|
| 31 |
return result
|
| 32 |
except Exception as e:
|
| 33 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
| 1 |
import os
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
|
| 4 |
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
import uvicorn
|
| 6 |
from rfq_parser import parse_rfq_pdf
|
|
|
|
| 18 |
)
|
| 19 |
|
| 20 |
@app.post("/parse-rfq")
async def parse_rfq(
    file: UploadFile = File(...),
    use_gemini: bool = Form(True),
):
    """Parse an uploaded RFQ PDF and return the extracted structure.

    Args:
        file: The uploaded document; only names ending in ".pdf"
            (case-insensitive) are accepted.
        use_gemini: Form flag forwarded to ``parse_rfq_pdf``. When true, a
            ``GOOGLE_API_KEY`` environment variable must be set.

    Returns:
        Whatever ``parse_rfq_pdf`` returns for the uploaded bytes.

    Raises:
        HTTPException: 400 when the upload is not named like a PDF; 500 when
            the API key is required but missing, or when parsing fails.
    """
    # The multipart filename is optional, so guard against None before the
    # suffix test; compare case-insensitively so "REPORT.PDF" is accepted.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    if use_gemini and not os.getenv("GOOGLE_API_KEY"):
        raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not configured")

    contents = await file.read()
    try:
        # NOTE(review): parse_rfq_pdf is called synchronously from this
        # coroutine; confirm whether it should be offloaded to a worker thread.
        return parse_rfq_pdf(contents, use_gemini=use_gemini)
    except Exception as e:
        # Chain the cause so the original traceback survives in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
rfq_parser.py
CHANGED
|
@@ -468,7 +468,10 @@ def extract_line_items(pdf_bytes):
|
|
| 468 |
return items
|
| 469 |
|
| 470 |
|
| 471 |
-
def _extract_line_items_from_llm(full_text):
|
|
|
|
|
|
|
|
|
|
| 472 |
system_prompt = (
|
| 473 |
"You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
|
| 474 |
"Return a JSON array only. Each object must have exactly these keys: "
|
|
@@ -505,7 +508,169 @@ def _extract_line_items_from_llm(full_text):
|
|
| 505 |
return []
|
| 506 |
|
| 507 |
|
| 508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
full_text = ""
|
| 510 |
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
| 511 |
total_pages = len(pdf.pages)
|
|
@@ -517,7 +682,9 @@ def parse_rfq_pdf(pdf_bytes):
|
|
| 517 |
if text:
|
| 518 |
full_text += f"\n--- Page {p_idx + 1} ---\n{text}"
|
| 519 |
|
| 520 |
-
|
|
|
|
|
|
|
| 521 |
|
| 522 |
JSON OUTPUT STRUCTURE:
|
| 523 |
{
|
|
@@ -546,22 +713,24 @@ def parse_rfq_pdf(pdf_bytes):
|
|
| 546 |
]
|
| 547 |
}
|
| 548 |
"""
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
)
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
|
|
|
|
|
|
| 565 |
line_items = extract_line_items(pdf_bytes)
|
| 566 |
|
| 567 |
valid_items = [
|
|
@@ -570,7 +739,8 @@ def parse_rfq_pdf(pdf_bytes):
|
|
| 570 |
]
|
| 571 |
|
| 572 |
if not valid_items:
|
| 573 |
-
|
|
|
|
| 574 |
|
| 575 |
return {
|
| 576 |
"title": llm_data.get("title", "RFQ Document"),
|
|
@@ -578,4 +748,5 @@ def parse_rfq_pdf(pdf_bytes):
|
|
| 578 |
"sections": llm_data.get("sections", []),
|
| 579 |
"line_items": valid_items,
|
| 580 |
"fields": llm_data.get("fields", []),
|
|
|
|
| 581 |
}
|
|
|
|
| 468 |
return items
|
| 469 |
|
| 470 |
|
| 471 |
+
def _extract_line_items_from_llm(full_text, use_gemini: bool = True):
|
| 472 |
+
if not use_gemini:
|
| 473 |
+
return []
|
| 474 |
+
|
| 475 |
system_prompt = (
|
| 476 |
"You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
|
| 477 |
"Return a JSON array only. Each object must have exactly these keys: "
|
|
|
|
| 508 |
return []
|
| 509 |
|
| 510 |
|
| 511 |
+
|
| 512 |
+
# ---------------------------------------------------------------------------
|
| 513 |
+
# RULE-BASED STRUCTURE EXTRACTOR (no LLM)
|
| 514 |
+
# ---------------------------------------------------------------------------
|
| 515 |
+
|
| 516 |
+
# Ordered (compiled pattern, section name) pairs. A line of document text that
# matches a pattern signals that the named section is present. All patterns
# are compiled case-insensitively.
_SECTION_SIGNALS = [
    (re.compile(_signal_expr, re.IGNORECASE), _signal_section)
    for _signal_expr, _signal_section in (
        (
            r"(quotation|quote|rfq|tender)\s*(submission|instruction|guideline)",
            "Quotation Submission",
        ),
        (
            r"vendor|supplier|company\s*info|bidder\s*info",
            "Vendor Information",
        ),
        (
            r"declaration|conformity|compliance\s*statement|certif",
            "Declaration of Conformity",
        ),
        (
            r"schedule\s*of\s*req|item\s*list|line\s*item|bill\s*of\s*material",
            "Schedule of Requirements",
        ),
        (
            r"technical\s*(offer|proposal|spec)|financial\s*(offer|proposal)",
            "Technical & Financial Offer",
        ),
        (
            r"delivery|compliance|lead\s*time|incoterm|warranty",
            "Compliance & Delivery",
        ),
    )
]
|
| 524 |
+
|
| 525 |
+
# Ordered (compiled pattern, field definition) pairs. When a pattern matches
# the document text, the paired definition describes the form field to emit.
# Every definition carries id/label/type/section/required; 'placeholder' and
# 'options' are present only where given. All patterns are compiled
# case-insensitively.
_FIELD_RULES = [
    (re.compile(_rule_expr, re.IGNORECASE), _rule_spec)
    for _rule_expr, _rule_spec in (
        # --- Quotation Submission ---
        (
            r"rfq\s*(number|no\.?|ref)",
            {"id": "rfq_number", "label": "RFQ Number", "type": "text",
             "section": "Quotation Submission", "required": True,
             "placeholder": "e.g. RFQ-2024-001"},
        ),
        (
            r"(submission|closing|deadline|due)\s*(date|by)",
            {"id": "submission_date", "label": "Submission Deadline", "type": "date",
             "section": "Quotation Submission", "required": True,
             "placeholder": "DD/MM/YYYY"},
        ),
        (
            r"validity\s*(period|days|of\s*offer)",
            {"id": "validity_period", "label": "Validity Period (days)", "type": "number",
             "section": "Quotation Submission", "required": True,
             "placeholder": "e.g. 90"},
        ),
        (
            r"(submit|send|deliver).{0,30}(email|electronically|portal)",
            {"id": "submission_method", "label": "Submission Method", "type": "dropdown",
             "section": "Quotation Submission", "required": True,
             "options": ["Email", "Portal", "Hard Copy"]},
        ),
        (
            r"\bcurrency\b",
            {"id": "currency", "label": "Currency", "type": "dropdown",
             "section": "Quotation Submission", "required": True,
             "options": ["USD", "EUR", "GBP", "LYD", "AED", "SAR"]},
        ),
        (
            r"(price|quote|quotation).{0,20}(all.inclusive|include.*vat|include.*tax)",
            {"id": "price_inclusive", "label": "Price Inclusive of All Taxes", "type": "checkbox",
             "section": "Quotation Submission", "required": False},
        ),
        (
            r"payment\s*(terms?|condition|method)",
            {"id": "payment_terms", "label": "Payment Terms", "type": "text",
             "section": "Quotation Submission", "required": False,
             "placeholder": "e.g. Net 30"},
        ),

        # --- Vendor Information ---
        (
            r"(company|vendor|supplier|bidder|firm)\s*(name|full\s*name)",
            {"id": "company_name", "label": "Company Name", "type": "text",
             "section": "Vendor Information", "required": True,
             "placeholder": "Legal registered name"},
        ),
        (
            r"(company|vendor|business|registered)\s*(address|location|headquarter)",
            {"id": "company_address", "label": "Company Address", "type": "textarea",
             "section": "Vendor Information", "required": True,
             "placeholder": "Full postal address"},
        ),
        (
            r"country\s*(of\s*)?(origin|registration|incorporation)",
            {"id": "country", "label": "Country", "type": "text",
             "section": "Vendor Information", "required": True,
             "placeholder": "e.g. Libya"},
        ),
        (
            r"contact\s*(person|name|individual|representative)",
            {"id": "contact_person", "label": "Contact Person", "type": "text",
             "section": "Vendor Information", "required": True,
             "placeholder": "Full name"},
        ),
        (
            r"(phone|telephone|mobile|tel)\s*(number|no\.?)?",
            {"id": "phone", "label": "Phone Number", "type": "phone",
             "section": "Vendor Information", "required": True,
             "placeholder": "+xxx-xxx-xxxxxxx"},
        ),
        (
            r"(email|e-mail)\s*(address)?",
            {"id": "email", "label": "Email Address", "type": "email",
             "section": "Vendor Information", "required": True,
             "placeholder": "vendor@company.com"},
        ),
        (
            r"(vat|tax|gst|tin)\s*(number|no\.?|registration|id)",
            {"id": "vat_number", "label": "VAT / Tax Number", "type": "text",
             "section": "Vendor Information", "required": False,
             "placeholder": "Tax registration number"},
        ),
        (
            r"(commercial|trade|business)\s*(registr|licen|certif)",
            {"id": "trade_license", "label": "Trade License / Registration", "type": "file",
             "section": "Vendor Information", "required": False},
        ),
        (
            r"bank\s*(name|details?|account|information)",
            {"id": "bank_name", "label": "Bank Name", "type": "text",
             "section": "Vendor Information", "required": False,
             "placeholder": "Bank name"},
        ),
        (
            r"iban|account\s*(number|no\.?)",
            {"id": "iban", "label": "IBAN / Account Number", "type": "text",
             "section": "Vendor Information", "required": False,
             "placeholder": "IBAN or account number"},
        ),

        # --- Declaration of Conformity ---
        (
            r"(authorized|authorised)\s*(signator|representative|person)",
            {"id": "authorized_signatory", "label": "Authorized Signatory Name", "type": "text",
             "section": "Declaration of Conformity", "required": True,
             "placeholder": "Full name of signing authority"},
        ),
        (
            r"(signature|sign\s*here|signed\s*by)",
            {"id": "signature", "label": "Signature", "type": "file",
             "section": "Declaration of Conformity", "required": True},
        ),
        (
            r"(stamp|seal|company\s*stamp)",
            {"id": "company_stamp", "label": "Company Stamp", "type": "file",
             "section": "Declaration of Conformity", "required": False},
        ),
        (
            r"(date\s*of\s*(sign|submission)|signed\s*on|date\s*signed)",
            {"id": "declaration_date", "label": "Date of Declaration", "type": "date",
             "section": "Declaration of Conformity", "required": True,
             "placeholder": "DD/MM/YYYY"},
        ),

        # --- Technical & Financial Offer ---
        (
            r"(brand|manufacturer|make)\s*(name|proposed|offered)?",
            {"id": "brand_offered", "label": "Brand / Manufacturer", "type": "text",
             "section": "Technical & Financial Offer", "required": False,
             "placeholder": "Proposed brand name"},
        ),
        (
            r"(catalogue|catalog|model|part)\s*(number|no\.?|ref)",
            {"id": "catalogue_number", "label": "Catalogue / Model Number", "type": "text",
             "section": "Technical & Financial Offer", "required": False,
             "placeholder": "e.g. CAT-12345"},
        ),
        (
            r"(unit|item)\s*price",
            {"id": "unit_price", "label": "Unit Price", "type": "number",
             "section": "Technical & Financial Offer", "required": True,
             "placeholder": "Price per unit"},
        ),
        (
            r"(total|overall)\s*(price|amount|value)",
            {"id": "total_price", "label": "Total Price", "type": "number",
             "section": "Technical & Financial Offer", "required": True,
             "placeholder": "Total quoted amount"},
        ),
        (
            r"(country|place)\s*of\s*(manufacture|origin|production)",
            {"id": "country_of_origin", "label": "Country of Origin", "type": "text",
             "section": "Technical & Financial Offer", "required": False,
             "placeholder": "e.g. Germany"},
        ),
        (
            r"(registration|approval|certif).{0,20}(ministry|moh|fda|ce\b|iso)",
            {"id": "registration_cert", "label": "Regulatory Registration Certificate", "type": "file",
             "section": "Technical & Financial Offer", "required": True},
        ),
        (
            r"(shelf\s*life|expiry|expiration)",
            {"id": "shelf_life", "label": "Shelf Life / Expiry Date", "type": "text",
             "section": "Technical & Financial Offer", "required": False,
             "placeholder": "e.g. min. 18 months upon delivery"},
        ),

        # --- Compliance & Delivery ---
        (
            r"(delivery\s*(date|time|schedule)|lead\s*time)",
            {"id": "delivery_lead_time", "label": "Delivery Lead Time", "type": "text",
             "section": "Compliance & Delivery", "required": True,
             "placeholder": "e.g. 4-6 weeks after PO"},
        ),
        (
            r"(delivery\s*(term|condition|location|address)|destination|ship\s*to)",
            {"id": "delivery_address", "label": "Delivery Address / Terms", "type": "textarea",
             "section": "Compliance & Delivery", "required": True,
             "placeholder": "Delivery destination and Incoterms"},
        ),
        (
            r"\bincoterm",
            {"id": "incoterms", "label": "Incoterms", "type": "dropdown",
             "section": "Compliance & Delivery", "required": False,
             "options": ["EXW", "FOB", "CIF", "DDP", "DAP", "CPT"]},
        ),
        (
            r"warranty\s*(period|term|duration)?",
            {"id": "warranty", "label": "Warranty Period", "type": "text",
             "section": "Compliance & Delivery", "required": False,
             "placeholder": "e.g. 12 months"},
        ),
        (
            r"(after.?sales?|technical\s*support|maintenance\s*support)",
            {"id": "after_sales_support", "label": "After-Sales Support", "type": "textarea",
             "section": "Compliance & Delivery", "required": False,
             "placeholder": "Describe support offered"},
        ),
        (
            r"(packing|packaging)\s*(standard|requirement|specification)?",
            {"id": "packing_standard", "label": "Packing Standard", "type": "text",
             "section": "Compliance & Delivery", "required": False,
             "placeholder": "e.g. Original manufacturer packaging"},
        ),
    )
]
|
| 604 |
+
|
| 605 |
+
# Template for a field's 'validation' entry; every constraint starts unset.
_DEFAULT_FIELD_VALIDATION = dict.fromkeys(("min", "max", "pattern"))

# Section names in the order they should be presented.
_KNOWN_SECTIONS = [
    "Quotation Submission",
    "Vendor Information",
    "Declaration of Conformity",
    "Schedule of Requirements",
    "Technical & Financial Offer",
    "Compliance & Delivery",
]
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
def _extract_structure_rule_based(full_text: str) -> dict:
    """
    Parse title, sections, and fields from raw PDF text without an LLM.

    Produces a best-effort result; quality depends on how legible the PDF text is.

    Args:
        full_text: Extracted document text. Lines starting with '---' are
            treated as page markers and skipped when choosing the title.

    Returns:
        A dict with keys:
          - 'title': first of the leading 15 content lines longer than 5
            characters (truncated to 150), else 'RFQ Document'.
          - 'description': always ''.
          - 'sections': detected section names plus 'Schedule of
            Requirements', ordered as in _KNOWN_SECTIONS.
          - 'fields': one dict per matched rule in _FIELD_RULES, in the order
            the rules first matched.
    """
    lines = [raw.strip() for raw in full_text.splitlines()]
    content_lines = [ln for ln in lines if ln and not ln.startswith('---')]

    # Title: first substantive non-page-marker line
    title = next(
        (ln[:150] for ln in content_lines[:15] if len(ln) > 5),
        'RFQ Document',
    )

    # Sections: scan every line for signals; at most one new section per line
    found_sections = []
    for line in lines:
        for pattern, section_name in _SECTION_SIGNALS:
            if section_name not in found_sections and pattern.search(line):
                found_sections.append(section_name)
                break

    # The schedule section is always reported. Add it BEFORE sorting so it
    # takes its canonical position instead of being tacked on at the end.
    if 'Schedule of Requirements' not in found_sections:
        found_sections.append('Schedule of Requirements')
    section_order = {name: idx for idx, name in enumerate(_KNOWN_SECTIONS)}
    found_sections.sort(key=lambda name: section_order.get(name, 99))

    # Fields: slide a 3-line window so labels split across lines still match
    windows = [' '.join(lines[i:i + 3]) for i in range(len(lines))]
    seen_ids = set()
    fields = []
    for window in windows:
        for pattern, field_def in _FIELD_RULES:
            if field_def['id'] in seen_ids or not pattern.search(window):
                continue
            # Optional fields are only emitted when their section was
            # detected; required fields are emitted regardless.
            if field_def['section'] in found_sections or field_def['required']:
                seen_ids.add(field_def['id'])
                fields.append({
                    'id': field_def['id'],
                    'label': field_def['label'],
                    'type': field_def['type'],
                    'section': field_def['section'],
                    'required': field_def.get('required', False),
                    'default_value': None,
                    'placeholder': field_def.get('placeholder', ''),
                    'options': field_def.get('options', []),
                    # Copy so callers can mutate per-field validation safely.
                    'validation': _DEFAULT_FIELD_VALIDATION.copy(),
                })

    return {
        'title': title,
        'description': '',
        'sections': found_sections,
        'fields': fields,
    }
|
| 671 |
+
|
| 672 |
+
|
| 673 |
+
def parse_rfq_pdf(pdf_bytes, use_gemini: bool = True):
|
| 674 |
full_text = ""
|
| 675 |
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
| 676 |
total_pages = len(pdf.pages)
|
|
|
|
| 682 |
if text:
|
| 683 |
full_text += f"\n--- Page {p_idx + 1} ---\n{text}"
|
| 684 |
|
| 685 |
+
# --- Main document structure extraction ---
|
| 686 |
+
if use_gemini:
|
| 687 |
+
system_prompt = """You are an expert RFQ Parser. Extract data from the RFQ text into the exact JSON structure below.
|
| 688 |
|
| 689 |
JSON OUTPUT STRUCTURE:
|
| 690 |
{
|
|
|
|
| 713 |
]
|
| 714 |
}
|
| 715 |
"""
|
| 716 |
+
try:
|
| 717 |
+
client = _get_genai_client()
|
| 718 |
+
response = client.models.generate_content(
|
| 719 |
+
model=GEMINI_MODEL,
|
| 720 |
+
contents=full_text[:30000],
|
| 721 |
+
config=types.GenerateContentConfig(
|
| 722 |
+
system_instruction=system_prompt + "\nRETURN JSON ONLY.",
|
| 723 |
+
response_mime_type="application/json",
|
| 724 |
+
temperature=0,
|
| 725 |
+
),
|
| 726 |
+
)
|
| 727 |
+
llm_data = json.loads(response.text)
|
| 728 |
+
except Exception:
|
| 729 |
+
llm_data = {"title": "Error Parsing", "description": "", "sections": [], "fields": []}
|
| 730 |
+
else:
|
| 731 |
+
llm_data = _extract_structure_rule_based(full_text)
|
| 732 |
+
|
| 733 |
+
# --- Line item extraction ---
|
| 734 |
line_items = extract_line_items(pdf_bytes)
|
| 735 |
|
| 736 |
valid_items = [
|
|
|
|
| 739 |
]
|
| 740 |
|
| 741 |
if not valid_items:
|
| 742 |
+
# use_gemini=False makes this return [] immediately (no API call)
|
| 743 |
+
valid_items = _extract_line_items_from_llm(full_text, use_gemini=use_gemini)
|
| 744 |
|
| 745 |
return {
|
| 746 |
"title": llm_data.get("title", "RFQ Document"),
|
|
|
|
| 748 |
"sections": llm_data.get("sections", []),
|
| 749 |
"line_items": valid_items,
|
| 750 |
"fields": llm_data.get("fields", []),
|
| 751 |
+
"gemini_used": use_gemini,
|
| 752 |
}
|