Spaces:

blessedpug
/

Demo_Build

Sleeping

App Files Files Community

blessedpug committed on May 25, 2025

Commit

6cf8871

1 Parent(s): 720d849

added files

Browse files

Files changed (4) hide show

.gitignore +2 -0
form_fill.py +42 -0
models.py +24 -0
pipeline.py +157 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .env
2	+ .venv/

form_fill.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from pdfrw import PdfReader, PdfWriter
+from datetime import datetime
+def fill_child_fee_pdf(
+    template_pdf_path,
+    output_pdf_path,
+    emp_name,
+    emp_code,
+    department,
+    bill_month,
+    items,   # List of dicts: [{'bill_date': ..., 'description': ..., 'amount': ...}]
+    total
+):
+    data_dict = {
+        'emp_name': emp_name,
+        'emp_code': emp_code,
+        'department': department,
+        'bill_month': bill_month,
+        'total': str(total),
+        'current_date': datetime.now().strftime("%d-%b-%Y"),  # e.g. "25-May-2025"
+    }
+    # Map each row of items to field names
+    for idx, item in enumerate(items, start=1):
+        data_dict[f'date_{idx}'] = item.get('bill_date', '')
+        data_dict[f'description_{idx}'] = item.get('description', '')
+        data_dict[f'amount_{idx}'] = str(item.get('amount', ''))
+    # Fill the PDF
+    template_pdf = PdfReader(template_pdf_path)
+    for page in template_pdf.pages:
+        if not hasattr(page, 'Annots') or not page.Annots:
+            continue
+        for annotation in page.Annots:
+            if annotation.T:
+                key = annotation.T[1:-1]  # Remove parentheses
+                if key in data_dict:
+                    annotation.V = str(data_dict[key])
+                    annotation.AP = None  # Remove old appearance so new value appears
+    PdfWriter().write(output_pdf_path, template_pdf)
+    return output_pdf_path

models.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from pydantic import BaseModel
+from typing import List, Optional
+class ReceiptItem(BaseModel):
+    """One line item of a receipt: a description and its amount."""
+    description: str
+    amount: float
+class ReceiptData(BaseModel):
+    """Structured data for one receipt."""
+    merchant: str
+    date: str  # Kept as a plain string; no date parsing is applied by this model
+    total_amount: float
+    items: Optional[List[ReceiptItem]] = None  # Line items; may be omitted
+class FeeItem(BaseModel):
+    """One line item of a fee bill."""
+    bill_date: Optional[str] = None  # Some bills may not have per-item date
+    description: str
+    amount: float
+    bill_month: Optional[str] = None  # Some bills may not have a billing month
+class ChildFeeForm(BaseModel):
+    """A fee bill as a list of fee items plus their total."""
+    items: List[FeeItem]
+    total: float  # Calculated after parsing

pipeline.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import openai
+from dotenv import load_dotenv
+from io import BytesIO
+import os
+from PIL import Image
+import base64
+import json
+from models import ReceiptData, ChildFeeForm
+from form_fill import fill_child_fee_pdf
+from pdf2image import convert_from_path
+# Load variables from a .env file (if present) before the API key is read.
+load_dotenv()
+# A missing OPENAI_API_KEY falls back to ""; surrounding whitespace is stripped.
+openai.api_key = os.getenv("OPENAI_API_KEY", "").strip()
+# System prompt for receipt extraction. The schema is embedded as text and
+# describes ReceiptItem / ReceiptData fields; the model is told to answer
+# with a JSON object only.
+reciept_system_prompt = (
+    "You are an expert at extracting data from receipts. "
+    "Read the provided image of a receipt and return a JSON object that matches the following Pydantic model:\n"
+    "from typing import List, Optional\n"
+    "class ReceiptItem(BaseModel):\n"
+    "    description: str\n"
+    "    amount: float\n\n"
+    "class ReceiptData(BaseModel):\n"
+    "    merchant: str\n"
+    "    date: str\n"
+    "    total_amount: float\n"
+    "    items: Optional[List[ReceiptItem]] = None\n"
+    "- Extract only the above given information.\n"
+    "- If a value is missing, set it to null, \"\", or an empty list as appropriate.\n"
+    "- For the items field, provide a list of objects with description and amount.\n"
+    "- Only return a valid JSON object matching the model above.\n"
+    "- Do not add any explanation or extra text—only the JSON."
+)
+fee_bill_system_prompt = (
+    "You are an expert at extracting data from fee bills. "
+    "Read the provided image of a child fee bill and return a JSON object that matches the following Pydantic model:\n"
+    "from typing import List, Optional\n"
+    "class FeeItem(BaseModel):\n"
+    "    bill_date: Optional[str] = None  # Bill Date Field, leave null if not found\n"
+    "    description: str\n"
+    "    amount: float\n\n"
+    "    bill_month: Optional[str] = None  # Bill Month Field, leave null if not found\n"
+    "class FeeBillData(BaseModel):\n"
+    "    items: List[FeeItem]\n"
+    "    total: float\n"
+    "- Extract only the above given information.\n"
+    "- If a value is missing, set it to null, \"\", or an empty list as appropriate.\n"
+    "- For the items field, provide a list of objects with date, description, and amount.\n"
+    "- The total field must be the sum of all amount values in items.\n"
+    "- Only return a valid JSON object matching the model above.\n"
+    "- Do not add any explanation or extra text—only the JSON."
+)
+def pil_to_bytes(pil_img, quality=60):
+    buf = BytesIO()
+    pil_img.save(buf, format='JPEG', quality=quality)
+    buf.seek(0)
+    return buf
+def preprocess_image(pil_img, max_size=512):
+    return pil_img.resize((max_size, max_size), Image.LANCZOS)
+def extract_info(pil_img):
+    processed_image = preprocess_image(pil_img)
+    img_bytes = pil_to_bytes(processed_image)
+    img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+    response = openai.chat.completions.create(
+        model="gpt-4o",
+        messages=[
+            {
+                "role": "system",
+                "content": reciept_system_prompt
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Here is a receipt image:"},
+                    {"type": "image_url", "image_url": {"url": "data:image/png;base64," + img_base64}}
+                ]
+            }
+        ]
+    )
+    raw_output = response.choices[0].message.content
+    # print(raw_output)
+    try:
+        if raw_output.startswith("```"):
+            raw_output = raw_output.strip("` \n")
+            if raw_output.startswith("json"):
+                raw_output = raw_output[4:].strip()
+        data = json.loads(raw_output)
+        print(data)
+        validated = ReceiptData(**data)
+        json_block = json.dumps(validated.dict(), indent=2, ensure_ascii=False)
+        return f"```json\n{json_block}\n```"
+    except Exception as e:
+        return f"```json\n{json.dumps({'error': str(e), 'raw_output': raw_output}, indent=2)}\n```"
+def extract_child_fee_info(img_input, emp_name, emp_code, department):
+    print(emp_name, emp_code, department)
+    processed_image = preprocess_image(img_input)
+    img_bytes = pil_to_bytes(processed_image)
+    img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+    response = openai.chat.completions.create(
+        model="gpt-4o",
+        messages=[
+            {"role": "system", "content": fee_bill_system_prompt},
+            {"role": "user",
+             "content": [
+                {"type": "text", "text": "Here is a child fee bill image:"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64," + img_base64}}
+             ]}
+        ]
+    )
+    raw_output = response.choices[0].message.content
+    try:
+        if raw_output.startswith("```"):
+            raw_output = raw_output.strip("` \n")
+            if raw_output.startswith("json"):
+                raw_output = raw_output[4:].strip()
+        data = json.loads(raw_output)
+        print(data)
+        # Validate if needed:
+        # ChildFeeForm(**data)
+        # Extract bill_month from first item if available, else use empty string
+        items = data.get("items", [])
+        bill_month = ""
+        if items and "bill_month" in items[0]:
+            bill_month = items[0]["bill_month"]
+        # Use a temp file for output so Gradio can return it
+        import tempfile
+        temp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+        filled_pdf_path = fill_child_fee_pdf(
+            template_pdf_path="CHILD FEE REIMBURSEMENT FORM.pdf",
+            output_pdf_path=temp.name,
+            emp_name=emp_name,
+            emp_code=emp_code,
+            department=department,
+            bill_month=bill_month,
+            items=items,
+            total=data.get("total", "")
+        )
+        return filled_pdf_path # Return path to Gradio for download
+    except Exception as e:
+        print("ERROR:", e)
+        return None  # or f"Error: {str(e)}"