MakPr016 committed on
Commit
a3f9e7d
·
1 Parent(s): 265e719
Files changed (2) hide show
  1. main.py +8 -5
  2. rfq_parser.py +191 -20
main.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  from dotenv import load_dotenv
3
- from fastapi import FastAPI, UploadFile, File, HTTPException
4
  from fastapi.middleware.cors import CORSMiddleware
5
  import uvicorn
6
  from rfq_parser import parse_rfq_pdf
@@ -18,16 +18,19 @@ app.add_middleware(
18
  )
19
 
20
  @app.post("/parse-rfq")
21
- async def parse_rfq(file: UploadFile = File(...)):
 
 
 
22
  if not file.filename.endswith(".pdf"):
23
  raise HTTPException(status_code=400, detail="Only PDF files are supported")
24
-
25
- if not os.getenv("GOOGLE_API_KEY"):
26
  raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not configured")
27
 
28
  contents = await file.read()
29
  try:
30
- result = parse_rfq_pdf(contents)
31
  return result
32
  except Exception as e:
33
  raise HTTPException(status_code=500, detail=str(e))
 
1
  import os
2
  from dotenv import load_dotenv
3
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
4
  from fastapi.middleware.cors import CORSMiddleware
5
  import uvicorn
6
  from rfq_parser import parse_rfq_pdf
 
18
  )
19
 
20
  @app.post("/parse-rfq")
21
async def parse_rfq(
    file: UploadFile = File(...),
    use_gemini: bool = Form(True),
):
    """Parse an uploaded RFQ PDF and return the extracted structure.

    Args:
        file: The uploaded document; only names ending in ``.pdf``
            (case-insensitive) are accepted.
        use_gemini: Forwarded to ``parse_rfq_pdf``. When true, the
            ``GOOGLE_API_KEY`` environment variable must be set.

    Returns:
        Whatever ``parse_rfq_pdf`` returns for the uploaded bytes.

    Raises:
        HTTPException: 400 when the upload is not named like a PDF; 500 when
            the API key is required but missing, or when parsing fails.
    """
    # An upload may arrive without a filename; treat that as "not a PDF"
    # (400) instead of crashing on None.endswith (which surfaced as a 500).
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # The key is only needed when the Gemini path is requested.
    if use_gemini and not os.getenv("GOOGLE_API_KEY"):
        raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not configured")

    contents = await file.read()
    try:
        return parse_rfq_pdf(contents, use_gemini=use_gemini)
    except Exception as e:
        # Top-level boundary: report the failure to the client and keep the
        # original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
rfq_parser.py CHANGED
@@ -468,7 +468,10 @@ def extract_line_items(pdf_bytes):
468
  return items
469
 
470
 
471
- def _extract_line_items_from_llm(full_text):
 
 
 
472
  system_prompt = (
473
  "You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
474
  "Return a JSON array only. Each object must have exactly these keys: "
@@ -505,7 +508,169 @@ def _extract_line_items_from_llm(full_text):
505
  return []
506
 
507
 
508
- def parse_rfq_pdf(pdf_bytes):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  full_text = ""
510
  with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
511
  total_pages = len(pdf.pages)
@@ -517,7 +682,9 @@ def parse_rfq_pdf(pdf_bytes):
517
  if text:
518
  full_text += f"\n--- Page {p_idx + 1} ---\n{text}"
519
 
520
- system_prompt = """You are an expert RFQ Parser. Extract data from the RFQ text into the exact JSON structure below.
 
 
521
 
522
  JSON OUTPUT STRUCTURE:
523
  {
@@ -546,22 +713,24 @@ def parse_rfq_pdf(pdf_bytes):
546
  ]
547
  }
548
  """
549
-
550
- try:
551
- client = _get_genai_client()
552
- response = client.models.generate_content(
553
- model=GEMINI_MODEL,
554
- contents=full_text[:30000],
555
- config=types.GenerateContentConfig(
556
- system_instruction=system_prompt + "\nRETURN JSON ONLY.",
557
- response_mime_type="application/json",
558
- temperature=0,
559
- ),
560
- )
561
- llm_data = json.loads(response.text)
562
- except Exception:
563
- llm_data = {"title": "Error Parsing", "description": "", "sections": [], "fields": []}
564
-
 
 
565
  line_items = extract_line_items(pdf_bytes)
566
 
567
  valid_items = [
@@ -570,7 +739,8 @@ def parse_rfq_pdf(pdf_bytes):
570
  ]
571
 
572
  if not valid_items:
573
- valid_items = _extract_line_items_from_llm(full_text)
 
574
 
575
  return {
576
  "title": llm_data.get("title", "RFQ Document"),
@@ -578,4 +748,5 @@ def parse_rfq_pdf(pdf_bytes):
578
  "sections": llm_data.get("sections", []),
579
  "line_items": valid_items,
580
  "fields": llm_data.get("fields", []),
 
581
  }
 
468
  return items
469
 
470
 
471
+ def _extract_line_items_from_llm(full_text, use_gemini: bool = True):
472
+ if not use_gemini:
473
+ return []
474
+
475
  system_prompt = (
476
  "You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
477
  "Return a JSON array only. Each object must have exactly these keys: "
 
508
  return []
509
 
510
 
511
+
512
+ # ---------------------------------------------------------------------------
513
+ # RULE-BASED STRUCTURE EXTRACTOR (no LLM)
514
+ # ---------------------------------------------------------------------------
515
+
516
+ _SECTION_SIGNALS = [
517
+ (re.compile(r'(quotation|quote|rfq|tender)\s*(submission|instruction|guideline)', re.I), 'Quotation Submission'),
518
+ (re.compile(r'vendor|supplier|company\s*info|bidder\s*info', re.I), 'Vendor Information'),
519
+ (re.compile(r'declaration|conformity|compliance\s*statement|certif', re.I), 'Declaration of Conformity'),
520
+ (re.compile(r'schedule\s*of\s*req|item\s*list|line\s*item|bill\s*of\s*material', re.I), 'Schedule of Requirements'),
521
+ (re.compile(r'technical\s*(offer|proposal|spec)|financial\s*(offer|proposal)', re.I), 'Technical & Financial Offer'),
522
+ (re.compile(r'delivery|compliance|lead\s*time|incoterm|warranty', re.I), 'Compliance & Delivery'),
523
+ ]
524
+
525
+ _FIELD_RULES = [
526
+ # --- Quotation Submission ---
527
+ (re.compile(r'rfq\s*(number|no\.?|ref)', re.I),
528
+ dict(id='rfq_number', label='RFQ Number', type='text', section='Quotation Submission', required=True, placeholder='e.g. RFQ-2024-001')),
529
+ (re.compile(r'(submission|closing|deadline|due)\s*(date|by)', re.I),
530
+ dict(id='submission_date', label='Submission Deadline', type='date', section='Quotation Submission', required=True, placeholder='DD/MM/YYYY')),
531
+ (re.compile(r'validity\s*(period|days|of\s*offer)', re.I),
532
+ dict(id='validity_period', label='Validity Period (days)', type='number', section='Quotation Submission', required=True, placeholder='e.g. 90')),
533
+ (re.compile(r'(submit|send|deliver).{0,30}(email|electronically|portal)', re.I),
534
+ dict(id='submission_method', label='Submission Method', type='dropdown', section='Quotation Submission', required=True, options=['Email', 'Portal', 'Hard Copy'])),
535
+ (re.compile(r'\bcurrency\b', re.I),
536
+ dict(id='currency', label='Currency', type='dropdown', section='Quotation Submission', required=True, options=['USD', 'EUR', 'GBP', 'LYD', 'AED', 'SAR'])),
537
+ (re.compile(r'(price|quote|quotation).{0,20}(all.inclusive|include.*vat|include.*tax)', re.I),
538
+ dict(id='price_inclusive', label='Price Inclusive of All Taxes', type='checkbox', section='Quotation Submission', required=False)),
539
+ (re.compile(r'payment\s*(terms?|condition|method)', re.I),
540
+ dict(id='payment_terms', label='Payment Terms', type='text', section='Quotation Submission', required=False, placeholder='e.g. Net 30')),
541
+
542
+ # --- Vendor Information ---
543
+ (re.compile(r'(company|vendor|supplier|bidder|firm)\s*(name|full\s*name)', re.I),
544
+ dict(id='company_name', label='Company Name', type='text', section='Vendor Information', required=True, placeholder='Legal registered name')),
545
+ (re.compile(r'(company|vendor|business|registered)\s*(address|location|headquarter)', re.I),
546
+ dict(id='company_address', label='Company Address', type='textarea', section='Vendor Information', required=True, placeholder='Full postal address')),
547
+ (re.compile(r'country\s*(of\s*)?(origin|registration|incorporation)', re.I),
548
+ dict(id='country', label='Country', type='text', section='Vendor Information', required=True, placeholder='e.g. Libya')),
549
+ (re.compile(r'contact\s*(person|name|individual|representative)', re.I),
550
+ dict(id='contact_person', label='Contact Person', type='text', section='Vendor Information', required=True, placeholder='Full name')),
551
+ (re.compile(r'(phone|telephone|mobile|tel)\s*(number|no\.?)?', re.I),
552
+ dict(id='phone', label='Phone Number', type='phone', section='Vendor Information', required=True, placeholder='+xxx-xxx-xxxxxxx')),
553
+ (re.compile(r'(email|e-mail)\s*(address)?', re.I),
554
+ dict(id='email', label='Email Address', type='email', section='Vendor Information', required=True, placeholder='vendor@company.com')),
555
+ (re.compile(r'(vat|tax|gst|tin)\s*(number|no\.?|registration|id)', re.I),
556
+ dict(id='vat_number', label='VAT / Tax Number', type='text', section='Vendor Information', required=False, placeholder='Tax registration number')),
557
+ (re.compile(r'(commercial|trade|business)\s*(registr|licen|certif)', re.I),
558
+ dict(id='trade_license', label='Trade License / Registration', type='file', section='Vendor Information', required=False)),
559
+ (re.compile(r'bank\s*(name|details?|account|information)', re.I),
560
+ dict(id='bank_name', label='Bank Name', type='text', section='Vendor Information', required=False, placeholder='Bank name')),
561
+ (re.compile(r'iban|account\s*(number|no\.?)', re.I),
562
+ dict(id='iban', label='IBAN / Account Number', type='text', section='Vendor Information', required=False, placeholder='IBAN or account number')),
563
+
564
+ # --- Declaration of Conformity ---
565
+ (re.compile(r'(authorized|authorised)\s*(signator|representative|person)', re.I),
566
+ dict(id='authorized_signatory', label='Authorized Signatory Name', type='text', section='Declaration of Conformity', required=True, placeholder='Full name of signing authority')),
567
+ (re.compile(r'(signature|sign\s*here|signed\s*by)', re.I),
568
+ dict(id='signature', label='Signature', type='file', section='Declaration of Conformity', required=True)),
569
+ (re.compile(r'(stamp|seal|company\s*stamp)', re.I),
570
+ dict(id='company_stamp', label='Company Stamp', type='file', section='Declaration of Conformity', required=False)),
571
+ (re.compile(r'(date\s*of\s*(sign|submission)|signed\s*on|date\s*signed)', re.I),
572
+ dict(id='declaration_date', label='Date of Declaration', type='date', section='Declaration of Conformity', required=True, placeholder='DD/MM/YYYY')),
573
+
574
+ # --- Technical & Financial Offer ---
575
+ (re.compile(r'(brand|manufacturer|make)\s*(name|proposed|offered)?', re.I),
576
+ dict(id='brand_offered', label='Brand / Manufacturer', type='text', section='Technical & Financial Offer', required=False, placeholder='Proposed brand name')),
577
+ (re.compile(r'(catalogue|catalog|model|part)\s*(number|no\.?|ref)', re.I),
578
+ dict(id='catalogue_number', label='Catalogue / Model Number',type='text', section='Technical & Financial Offer', required=False, placeholder='e.g. CAT-12345')),
579
+ (re.compile(r'(unit|item)\s*price', re.I),
580
+ dict(id='unit_price', label='Unit Price', type='number', section='Technical & Financial Offer', required=True, placeholder='Price per unit')),
581
+ (re.compile(r'(total|overall)\s*(price|amount|value)', re.I),
582
+ dict(id='total_price', label='Total Price', type='number', section='Technical & Financial Offer', required=True, placeholder='Total quoted amount')),
583
+ (re.compile(r'(country|place)\s*of\s*(manufacture|origin|production)', re.I),
584
+ dict(id='country_of_origin', label='Country of Origin', type='text', section='Technical & Financial Offer', required=False, placeholder='e.g. Germany')),
585
+ (re.compile(r'(registration|approval|certif).{0,20}(ministry|moh|fda|ce\b|iso)', re.I),
586
+ dict(id='registration_cert', label='Regulatory Registration Certificate', type='file', section='Technical & Financial Offer', required=True)),
587
+ (re.compile(r'(shelf\s*life|expiry|expiration)', re.I),
588
+ dict(id='shelf_life', label='Shelf Life / Expiry Date',type='text', section='Technical & Financial Offer', required=False, placeholder='e.g. min. 18 months upon delivery')),
589
+
590
+ # --- Compliance & Delivery ---
591
+ (re.compile(r'(delivery\s*(date|time|schedule)|lead\s*time)', re.I),
592
+ dict(id='delivery_lead_time',label='Delivery Lead Time', type='text', section='Compliance & Delivery', required=True, placeholder='e.g. 4-6 weeks after PO')),
593
+ (re.compile(r'(delivery\s*(term|condition|location|address)|destination|ship\s*to)', re.I),
594
+ dict(id='delivery_address', label='Delivery Address / Terms',type='textarea', section='Compliance & Delivery', required=True, placeholder='Delivery destination and Incoterms')),
595
+ (re.compile(r'\bincoterm', re.I),
596
+ dict(id='incoterms', label='Incoterms', type='dropdown', section='Compliance & Delivery', required=False, options=['EXW', 'FOB', 'CIF', 'DDP', 'DAP', 'CPT'])),
597
+ (re.compile(r'warranty\s*(period|term|duration)?', re.I),
598
+ dict(id='warranty', label='Warranty Period', type='text', section='Compliance & Delivery', required=False, placeholder='e.g. 12 months')),
599
+ (re.compile(r'(after.?sales?|technical\s*support|maintenance\s*support)', re.I),
600
+ dict(id='after_sales_support',label='After-Sales Support', type='textarea', section='Compliance & Delivery', required=False, placeholder='Describe support offered')),
601
+ (re.compile(r'(packing|packaging)\s*(standard|requirement|specification)?', re.I),
602
+ dict(id='packing_standard', label='Packing Standard', type='text', section='Compliance & Delivery', required=False, placeholder='e.g. Original manufacturer packaging')),
603
+ ]
604
+
605
+ _DEFAULT_FIELD_VALIDATION = {'min': None, 'max': None, 'pattern': None}
606
+
607
+ _KNOWN_SECTIONS = [
608
+ 'Quotation Submission',
609
+ 'Vendor Information',
610
+ 'Declaration of Conformity',
611
+ 'Schedule of Requirements',
612
+ 'Technical & Financial Offer',
613
+ 'Compliance & Delivery',
614
+ ]
615
+
616
+
617
+ def _extract_structure_rule_based(full_text: str) -> dict:
618
+ """
619
+ Parse title, sections, and fields from raw PDF text without an LLM.
620
+ Produces a best-effort result; quality depends on how legible the PDF text is.
621
+ """
622
+ lines = [l.strip() for l in full_text.splitlines()]
623
+ non_empty = [l for l in lines if l and not l.startswith('---')]
624
+
625
+ # Title: first substantive non-page-marker line
626
+ title = 'RFQ Document'
627
+ for line in non_empty[:15]:
628
+ if len(line) > 5:
629
+ title = line[:150]
630
+ break
631
+
632
+ # Sections: scan every line for signals
633
+ found_sections = []
634
+ section_order = {s: i for i, s in enumerate(_KNOWN_SECTIONS)}
635
+ for line in lines:
636
+ for pattern, section_name in _SECTION_SIGNALS:
637
+ if pattern.search(line) and section_name not in found_sections:
638
+ found_sections.append(section_name)
639
+ break
640
+ found_sections.sort(key=lambda s: section_order.get(s, 99))
641
+ if 'Schedule of Requirements' not in found_sections:
642
+ found_sections.append('Schedule of Requirements')
643
+
644
+ # Fields: slide a 3-line window and match rules
645
+ windows = [' '.join(lines[i:i + 3]) for i in range(len(lines))]
646
+ seen_ids = set()
647
+ fields = []
648
+ for window in windows:
649
+ for pattern, field_def in _FIELD_RULES:
650
+ if pattern.search(window) and field_def['id'] not in seen_ids:
651
+ if field_def['section'] in found_sections or field_def['required']:
652
+ seen_ids.add(field_def['id'])
653
+ fields.append({
654
+ 'id': field_def['id'],
655
+ 'label': field_def['label'],
656
+ 'type': field_def['type'],
657
+ 'section': field_def['section'],
658
+ 'required': field_def.get('required', False),
659
+ 'default_value': None,
660
+ 'placeholder': field_def.get('placeholder', ''),
661
+ 'options': field_def.get('options', []),
662
+ 'validation': _DEFAULT_FIELD_VALIDATION.copy(),
663
+ })
664
+
665
+ return {
666
+ 'title': title,
667
+ 'description': '',
668
+ 'sections': found_sections,
669
+ 'fields': fields,
670
+ }
671
+
672
+
673
+ def parse_rfq_pdf(pdf_bytes, use_gemini: bool = True):
674
  full_text = ""
675
  with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
676
  total_pages = len(pdf.pages)
 
682
  if text:
683
  full_text += f"\n--- Page {p_idx + 1} ---\n{text}"
684
 
685
+ # --- Main document structure extraction ---
686
+ if use_gemini:
687
+ system_prompt = """You are an expert RFQ Parser. Extract data from the RFQ text into the exact JSON structure below.
688
 
689
  JSON OUTPUT STRUCTURE:
690
  {
 
713
  ]
714
  }
715
  """
716
+ try:
717
+ client = _get_genai_client()
718
+ response = client.models.generate_content(
719
+ model=GEMINI_MODEL,
720
+ contents=full_text[:30000],
721
+ config=types.GenerateContentConfig(
722
+ system_instruction=system_prompt + "\nRETURN JSON ONLY.",
723
+ response_mime_type="application/json",
724
+ temperature=0,
725
+ ),
726
+ )
727
+ llm_data = json.loads(response.text)
728
+ except Exception:
729
+ llm_data = {"title": "Error Parsing", "description": "", "sections": [], "fields": []}
730
+ else:
731
+ llm_data = _extract_structure_rule_based(full_text)
732
+
733
+ # --- Line item extraction ---
734
  line_items = extract_line_items(pdf_bytes)
735
 
736
  valid_items = [
 
739
  ]
740
 
741
  if not valid_items:
742
+ # use_gemini=False makes this return [] immediately (no API call)
743
+ valid_items = _extract_line_items_from_llm(full_text, use_gemini=use_gemini)
744
 
745
  return {
746
  "title": llm_data.get("title", "RFQ Document"),
 
748
  "sections": llm_data.get("sections", []),
749
  "line_items": valid_items,
750
  "fields": llm_data.get("fields", []),
751
+ "gemini_used": use_gemini,
752
  }