blessedpug committed on
Commit
6cf8871
·
1 Parent(s): 720d849

added files

Browse files
Files changed (4) hide show
  1. .gitignore +2 -0
  2. form_fill.py +42 -0
  3. models.py +24 -0
  4. pipeline.py +157 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .env
2
+ .venv/
form_fill.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfrw import PdfReader, PdfWriter
2
+ from datetime import datetime
3
+
4
def fill_child_fee_pdf(
    template_pdf_path,
    output_pdf_path,
    emp_name,
    emp_code,
    department,
    bill_month,
    items,  # List of dicts: [{'bill_date': ..., 'description': ..., 'amount': ...}]
    total
):
    """Fill the child-fee reimbursement PDF form and write it to disk.

    Form fields are matched by name: ``emp_name``, ``emp_code``,
    ``department``, ``bill_month``, ``total``, ``current_date`` and, for the
    n-th entry of ``items`` (1-based), ``date_n``, ``description_n`` and
    ``amount_n``. Fields in the template that have no matching key are left
    untouched.

    Args:
        template_pdf_path: Path of the fillable PDF template.
        output_pdf_path: Path the filled PDF is written to.
        emp_name: Employee name.
        emp_code: Employee code.
        department: Department name.
        bill_month: Billing month text.
        items: Iterable of dicts with optional ``bill_date``,
            ``description`` and ``amount`` keys; ``None`` is treated as empty.
        total: Total amount; converted to text.

    Returns:
        ``output_pdf_path``.
    """
    # Local import keeps this block self-contained; pdfrw is already a
    # dependency of this module.
    from pdfrw import PdfDict, PdfObject

    def _text(value):
        """Render a value as field text; None becomes '' instead of 'None'."""
        return '' if value is None else str(value)

    data_dict = {
        'emp_name': _text(emp_name),
        'emp_code': _text(emp_code),
        'department': _text(department),
        'bill_month': _text(bill_month),
        'total': _text(total),
        'current_date': datetime.now().strftime("%d-%b-%Y"),  # e.g. "25-May-2025"
    }

    # Map each row of items to its numbered field names. Keys that are
    # present but null (as JSON parsers produce) must not print as "None".
    for idx, item in enumerate(items or [], start=1):
        data_dict[f'date_{idx}'] = _text(item.get('bill_date'))
        data_dict[f'description_{idx}'] = _text(item.get('description'))
        data_dict[f'amount_{idx}'] = _text(item.get('amount'))

    # Fill the PDF
    template_pdf = PdfReader(template_pdf_path)
    for page in template_pdf.pages:
        for annotation in page.Annots or []:
            if not annotation.T:
                continue
            key = annotation.T[1:-1]  # Remove the enclosing parentheses
            if key in data_dict:
                annotation.V = data_dict[key]
                annotation.AP = None  # Remove old appearance so new value appears

    # With the appearance streams removed, viewers only regenerate them when
    # the form asks for it; without this flag many viewers show blank fields.
    acro_form = template_pdf.Root.AcroForm
    if acro_form is not None:
        acro_form.update(PdfDict(NeedAppearances=PdfObject('true')))

    PdfWriter().write(output_pdf_path, template_pdf)
    return output_pdf_path
models.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Optional
3
+
4
class ReceiptItem(BaseModel):
    """One line item extracted from a receipt."""

    # Text describing the item.
    description: str
    # Monetary amount for the item.
    amount: float
7
+
8
class ReceiptData(BaseModel):
    """Structured data extracted from a single receipt."""

    # Merchant name.
    merchant: str
    # Receipt date, kept as text (no format is enforced here).
    date: str
    # Receipt total.
    total_amount: float
    # Individual line items; None when no items were extracted.
    items: Optional[List[ReceiptItem]] = None
13
+
14
+
15
+
16
class FeeItem(BaseModel):
    """One fee entry extracted from a child fee bill."""

    # Some bills may not have a per-item date.
    bill_date: Optional[str] = None
    # Text describing the fee.
    description: str
    # Monetary amount of the fee.
    amount: float
    # Some bills may not have a billing month.
    bill_month: Optional[str] = None
21
+
22
class ChildFeeForm(BaseModel):
    """All fee entries of a child fee bill together with their total."""

    # Fee entries found on the bill.
    items: List[FeeItem]
    # Calculated after parsing.
    total: float
pipeline.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ from dotenv import load_dotenv
3
+ from io import BytesIO
4
+ import os
5
+ from PIL import Image
6
+ import base64
7
+ import json
8
+ from models import ReceiptData, ChildFeeForm
9
+ from form_fill import fill_child_fee_pdf
10
+ from pdf2image import convert_from_path
11
+
12
# Load variables from a local .env file (if any) into the process
# environment, then hand the API key to the OpenAI client. A missing key
# becomes an empty string rather than None.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY", "").strip()
14
+
15
+
16
# System prompt for receipt extraction: describes the target schema and
# instructs the model to answer with JSON only.
reciept_system_prompt = "\n".join([
    "You are an expert at extracting data from receipts. "
    "Read the provided image of a receipt and return a JSON object that matches the following Pydantic model:",
    "from typing import List, Optional",
    "class ReceiptItem(BaseModel):",
    " description: str",
    " amount: float",
    "",
    "class ReceiptData(BaseModel):",
    " merchant: str",
    " date: str",
    " total_amount: float",
    " items: Optional[List[ReceiptItem]] = None",
    "- Extract only the above given information.",
    "- If a value is missing, set it to null, \"\", or an empty list as appropriate.",
    "- For the items field, provide a list of objects with description and amount.",
    "- Only return a valid JSON object matching the model above.",
    "- Do not add any explanation or extra text—only the JSON.",
])
34
+
35
# System prompt for child-fee-bill extraction: describes the target schema
# and instructs the model to answer with JSON only.
# The blank line separating the two classes sits after bill_month so that
# the field reads as part of FeeItem, and the instruction names the actual
# field names of the schema.
fee_bill_system_prompt = (
    "You are an expert at extracting data from fee bills. "
    "Read the provided image of a child fee bill and return a JSON object that matches the following Pydantic model:\n"
    "from typing import List, Optional\n"
    "class FeeItem(BaseModel):\n"
    " bill_date: Optional[str] = None # Bill Date Field, leave null if not found\n"
    " description: str\n"
    " amount: float\n"
    " bill_month: Optional[str] = None # Bill Month Field, leave null if not found\n\n"
    "class FeeBillData(BaseModel):\n"
    " items: List[FeeItem]\n"
    " total: float\n"
    "- Extract only the above given information.\n"
    "- If a value is missing, set it to null, \"\", or an empty list as appropriate.\n"
    "- For the items field, provide a list of objects with bill_date, description, amount, and bill_month.\n"
    "- The total field must be the sum of all amount values in items.\n"
    "- Only return a valid JSON object matching the model above.\n"
    "- Do not add any explanation or extra text—only the JSON."
)
54
+
55
+
56
+
57
+
58
def pil_to_bytes(pil_img, quality=60):
    """Encode an image as JPEG into an in-memory buffer.

    Args:
        pil_img: Image object exposing ``mode``, ``convert`` and ``save``.
        quality: JPEG quality passed to ``save``.

    Returns:
        A ``BytesIO`` positioned at the start of the encoded data.
    """
    # JPEG has no alpha channel or palette; images in modes such as RGBA, LA
    # or P (typical for PNG uploads) would make save() raise, so flatten
    # them to RGB first.
    if pil_img.mode not in ("RGB", "L"):
        pil_img = pil_img.convert("RGB")
    buf = BytesIO()
    pil_img.save(buf, format='JPEG', quality=quality)
    buf.seek(0)
    return buf
63
+
64
+
65
def preprocess_image(pil_img, max_size=512):
    """Resize an image so its longest side equals ``max_size``.

    The aspect ratio is preserved: stretching a tall receipt or bill into a
    square distorts the text and hurts extraction quality.

    Args:
        pil_img: Source PIL image.
        max_size: Target length, in pixels, of the longest side.

    Returns:
        The resized image (LANCZOS resampling), or the input unchanged when
        it has a zero-length side.
    """
    width, height = pil_img.size
    longest = max(width, height)
    if longest <= 0:
        return pil_img
    scale = max_size / longest
    # Clamp to 1 px so extreme aspect ratios never round down to zero.
    new_size = (max(1, round(width * scale)), max(1, round(height * scale)))
    return pil_img.resize(new_size, Image.LANCZOS)
67
+
68
+
69
def extract_info(pil_img):
    """Extract structured receipt data from an image using the OpenAI chat API.

    The image is resized, JPEG-encoded and sent as a base64 data URL together
    with ``reciept_system_prompt``. The reply is parsed as JSON and validated
    against ``ReceiptData``.

    Args:
        pil_img: PIL image of the receipt.

    Returns:
        A Markdown ```json fenced string. On success it holds the validated
        receipt data; if parsing or validation fails it holds an object with
        ``error`` and ``raw_output`` keys.
    """
    processed_image = preprocess_image(pil_img)
    img_base64 = base64.b64encode(pil_to_bytes(processed_image).getvalue()).decode("utf-8")
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": reciept_system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Here is a receipt image:"},
                    # pil_to_bytes produces JPEG, so the data URL must say so.
                    {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + img_base64}},
                ],
            },
        ],
    )

    raw_output = response.choices[0].message.content
    try:
        # The content may be None, and the model sometimes wraps its JSON in
        # a Markdown code fence, possibly preceded by whitespace.
        payload = (raw_output or "").strip()
        if payload.startswith("```"):
            payload = payload.strip("` \n\r\t")
            if payload.startswith("json"):
                payload = payload[4:].strip()
        data = json.loads(payload)
        validated = ReceiptData(**data)
        # Pydantic v2 renamed .dict() to .model_dump(); support both.
        if hasattr(validated, "model_dump"):
            as_dict = validated.model_dump()
        else:
            as_dict = validated.dict()
        json_block = json.dumps(as_dict, indent=2, ensure_ascii=False)
        return f"```json\n{json_block}\n```"
    except Exception as e:
        # Boundary handler: the caller expects a displayable string either way.
        return f"```json\n{json.dumps({'error': str(e), 'raw_output': raw_output}, indent=2)}\n```"
105
+
106
+
107
def extract_child_fee_info(img_input, emp_name, emp_code, department):
    """Extract fee items from a bill image and produce a filled reimbursement PDF.

    The image is resized, JPEG-encoded and sent to the OpenAI chat API with
    ``fee_bill_system_prompt``. The JSON reply supplies the items, total and
    billing month that are written into the
    "CHILD FEE REIMBURSEMENT FORM.pdf" template via ``fill_child_fee_pdf``.

    Args:
        img_input: PIL image of the fee bill.
        emp_name: Employee name for the form.
        emp_code: Employee code for the form.
        department: Department for the form.

    Returns:
        Path of the filled PDF (a temporary file the caller owns), or None
        if the reply could not be parsed or the form could not be filled.
    """
    import tempfile

    processed_image = preprocess_image(img_input)
    img_base64 = base64.b64encode(pil_to_bytes(processed_image).getvalue()).decode("utf-8")
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": fee_bill_system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Here is a child fee bill image:"},
                    # pil_to_bytes produces JPEG, so the data URL must say so.
                    {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + img_base64}},
                ],
            },
        ],
    )
    raw_output = response.choices[0].message.content

    output_path = None
    try:
        # The content may be None, and the model sometimes wraps its JSON in
        # a Markdown code fence, possibly preceded by whitespace.
        payload = (raw_output or "").strip()
        if payload.startswith("```"):
            payload = payload.strip("` \n\r\t")
            if payload.startswith("json"):
                payload = payload[4:].strip()
        data = json.loads(payload)

        # The prompt tells the model to use null for missing values, so keys
        # can be present with a None value; normalise those to empty.
        items = data.get("items") or []
        # Bill month is taken from the first item if available.
        bill_month = (items[0].get("bill_month") or "") if items else ""
        total = data.get("total")

        # Reserve a temp path for the output so Gradio can return it. The
        # handle is closed before writing: an open handle leaks and blocks
        # the write on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp:
            output_path = temp.name

        # Return the path to Gradio for download.
        return fill_child_fee_pdf(
            template_pdf_path="CHILD FEE REIMBURSEMENT FORM.pdf",
            output_pdf_path=output_path,
            emp_name=emp_name,
            emp_code=emp_code,
            department=department,
            bill_month=bill_month,
            items=items,
            total="" if total is None else total,
        )
    except Exception as e:
        # Boundary handler: report and signal failure with None.
        print("ERROR:", e)
        # Do not leave an empty or partial temp file behind.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None