Sathvik-kota committed on
Commit
9439b9f
·
verified ·
1 Parent(s): 0bfaa94

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +215 -86
app.py CHANGED
@@ -1,45 +1,57 @@
 
 
 
 
 
 
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
- from io import BytesIO
4
- from pdf2image import convert_from_bytes
5
  from PIL import Image
6
- import pytesseract, requests, re
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  app = FastAPI()
9
 
 
10
  class BillRequest(BaseModel):
 
 
 
 
11
  document: str
12
 
13
 
14
- def parse_text(text):
15
- """Extract bill items using a simple numeric line pattern."""
16
- lines = [l.strip() for l in text.splitlines() if l.strip()]
17
- pattern = re.compile(r"^(.*\D)?(\d+(?:\.\d+)?)$")
18
-
19
- items=[]
20
- for line in lines:
21
- m=pattern.match(line)
22
- if not m: continue
23
- name=(m.group(1) or "").strip()
24
- if not name: continue
25
- try: amount=float(m.group(2))
26
- except: continue
27
- items.append({"item_name":name,"item_amount":amount,"item_rate":0.0,"item_quantity":0.0})
28
- return items
29
 
30
  def extract_items_from_text(text: str):
31
  """
32
- Looser heuristic:
33
- - Take any line that has at least one numeric token
34
- - Use the last numeric token as item_amount
35
- - Everything before that token is item_name
36
- - Skip obvious total/summary lines
 
 
 
 
37
  """
38
  lines = [line.strip() for line in text.splitlines() if line.strip()]
39
  bill_items = []
40
 
41
  for line in lines:
42
- # Skip totals / summary lines
43
  if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
44
  continue
45
 
@@ -47,7 +59,7 @@ def extract_items_from_text(text: str):
47
  if not tokens:
48
  continue
49
 
50
- # Find all purely numeric tokens (e.g. 123, 45.67)
51
  numeric_indices = [
52
  i for i, tok in enumerate(tokens)
53
  if re.fullmatch(r"\d+(\.\d+)?", tok)
@@ -60,7 +72,6 @@ def extract_items_from_text(text: str):
60
  amount_str = tokens[last_idx]
61
  name_tokens = tokens[:last_idx]
62
 
63
- # If there's no text before the amount, skip
64
  if not name_tokens:
65
  continue
66
 
@@ -75,24 +86,128 @@ def extract_items_from_text(text: str):
75
  {
76
  "item_name": item_name,
77
  "item_amount": amount_val,
78
- "item_rate": 0.0,
79
- "item_quantity": 0.0,
80
  }
81
  )
82
 
83
  return bill_items
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  @app.post("/extract-bill-data")
86
  async def extract_bill_data(payload: BillRequest):
87
  """
88
  Main Datathon endpoint.
89
 
90
- Current flow:
91
- - Download the document from the provided URL
92
- - If it's a PDF, convert pages to images and run OCR per page
93
- - If it's an image (png/jpg/jpeg), run OCR on the image
94
- - Extract line items using a simple text heuristic
95
- - Return data in the required JSON format
 
 
96
  """
97
  doc_url = payload.document
98
 
@@ -104,7 +219,6 @@ async def extract_bill_data(payload: BillRequest):
104
  response = requests.get(doc_url, headers=headers, timeout=20)
105
 
106
  if response.status_code != 200:
107
- # URL not reachable → graceful failure
108
  return {
109
  "is_success": False,
110
  "token_usage": {
@@ -121,7 +235,6 @@ async def extract_bill_data(payload: BillRequest):
121
  file_bytes = response.content
122
 
123
  except Exception:
124
- # Network or other error
125
  return {
126
  "is_success": False,
127
  "token_usage": {
@@ -135,50 +248,42 @@ async def extract_bill_data(payload: BillRequest):
135
  }
136
  }
137
 
138
- pagewise_line_items = []
139
- total_item_count = 0
 
140
 
141
- # ---- Step 2: OCR + extraction ----
142
  try:
143
- lower_url = doc_url.lower()
144
-
145
- # PDF handling
146
  if lower_url.endswith(".pdf"):
147
  pages = convert_from_bytes(file_bytes)
148
  for idx, page_img in enumerate(pages, start=1):
149
- ocr_text = pytesseract.image_to_string(page_img)
150
- bill_items = extract_items_from_text(ocr_text)
151
-
152
- if bill_items:
153
- pagewise_line_items.append(
154
- {
155
- "page_no": str(idx),
156
- "page_type": "Bill Detail", # can refine later
157
- "bill_items": bill_items,
158
- }
159
- )
160
- total_item_count += len(bill_items)
161
-
162
- # Image handling
163
- elif any(lower_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
164
- image = Image.open(BytesIO(file_bytes))
165
- ocr_text = pytesseract.image_to_string(image)
166
- bill_items = extract_items_from_text(ocr_text)
167
-
168
- if bill_items:
169
- pagewise_line_items.append(
170
  {
171
- "page_no": "1",
172
- "page_type": "Bill Detail",
173
- "bill_items": bill_items,
174
  }
175
  )
176
- total_item_count = len(bill_items)
177
 
178
- # Other types (json, txt, etc.) → currently no extraction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  except Exception:
181
- # OCR / parsing failure → keep schema, mark as failure
182
  return {
183
  "is_success": False,
184
  "token_usage": {
@@ -192,14 +297,41 @@ async def extract_bill_data(payload: BillRequest):
192
  }
193
  }
194
 
195
- # ---- Step 3: Final response ----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  return {
197
  "is_success": True,
198
- "token_usage": {
199
- "total_tokens": 0, # update when LLMs are added
200
- "input_tokens": 0,
201
- "output_tokens": 0
202
- },
203
  "data": {
204
  "pagewise_line_items": pagewise_line_items,
205
  "total_item_count": total_item_count
@@ -207,16 +339,13 @@ async def extract_bill_data(payload: BillRequest):
207
  }
208
 
209
 
210
- def bad_response():
211
- return {
212
- "is_success":False,
213
- "token_usage":{"total_tokens":0,"input_tokens":0,"output_tokens":0},
214
- "data":{"pagewise_line_items":[],"total_item_count":0}
215
- }
216
-
217
- def success(data,count):
218
  return {
219
- "is_success":True,
220
- "token_usage":{"total_tokens":0,"input_tokens":0,"output_tokens":0},
221
- "data":{"pagewise_line_items":data,"total_item_count":count}
222
  }
 
1
+ # app.py
2
+ import os
3
+ import re
4
+ import json
5
+ from io import BytesIO
6
+
7
  from fastapi import FastAPI
8
  from pydantic import BaseModel
9
+ import requests
 
10
  from PIL import Image
11
+ from pdf2image import convert_from_bytes
12
+ import pytesseract
13
+ import google.generativeai as genai
14
+
15
+ # ---------------- LLM CONFIG (Gemini) ----------------
16
+
17
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
18
+ GEMINI_MODEL_NAME = "gemini-1.5-flash"
19
+
20
+ if GEMINI_API_KEY:
21
+ genai.configure(api_key=GEMINI_API_KEY)
22
+
23
+ # ---------------- FASTAPI APP ----------------
24
 
25
  app = FastAPI()
26
 
27
+
28
  class BillRequest(BaseModel):
29
+ """
30
+ Request body model.
31
+ Expects a public URL to a bill document (image/PDF).
32
+ """
33
  document: str
34
 
35
 
36
+ # ---------------- FALLBACK REGEX EXTRACTOR ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  def extract_items_from_text(text: str):
39
  """
40
+ Very simple rule-based extractor used as a fallback
41
+ when LLM is not available or fails.
42
+
43
+ Logic:
44
+ - Split OCR text into lines
45
+ - For each line, if it has at least one numeric token,
46
+ treat the last numeric token as item_amount
47
+ - Everything before that is item_name
48
+ - Skip lines that look like totals
49
  """
50
  lines = [line.strip() for line in text.splitlines() if line.strip()]
51
  bill_items = []
52
 
53
  for line in lines:
54
+ # Skip obvious total lines
55
  if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
56
  continue
57
 
 
59
  if not tokens:
60
  continue
61
 
62
+ # Numeric tokens like 123 or 45.67
63
  numeric_indices = [
64
  i for i, tok in enumerate(tokens)
65
  if re.fullmatch(r"\d+(\.\d+)?", tok)
 
72
  amount_str = tokens[last_idx]
73
  name_tokens = tokens[:last_idx]
74
 
 
75
  if not name_tokens:
76
  continue
77
 
 
86
  {
87
  "item_name": item_name,
88
  "item_amount": amount_val,
89
+ "item_rate": 0.0, # to be improved later
90
+ "item_quantity": 0.0, # to be improved later
91
  }
92
  )
93
 
94
  return bill_items
95
 
96
+
97
+ # ---------------- LLM CALL (GEMINI) ----------------
98
+
99
def call_gemini_for_items(pages_ocr):
    """
    Ask Gemini to turn page-wise OCR text into structured bill line items.

    Args:
        pages_ocr: list of dicts, one per page, shaped like
            { "page_no": "1", "page_type": "Bill Detail", "text": "<ocr_text>" }

    Returns:
        (pagewise_line_items, token_usage_dict) on success, or
        (None, zero_token_usage) when no API key is configured, there is
        nothing to send, the request fails, or the reply is not JSON of the
        requested shape. A None first element signals the caller to fall
        back to the regex extractor.
    """
    # Local import so this block does not depend on a file-level import.
    import logging

    zero_usage = {
        "total_tokens": 0,
        "input_tokens": 0,
        "output_tokens": 0
    }

    if not GEMINI_API_KEY or not pages_ocr:
        # No key configured (or no pages) → skip LLM and let caller fallback
        return None, zero_usage

    # Build a concise representation of pages for the prompt
    pages_repr = [
        {
            "page_no": p["page_no"],
            "page_type": p["page_type"],
            "text": p["text"],
        }
        for p in pages_ocr
    ]

    system_instruction = (
        "You are a medical bill extraction engine. "
        "Given OCR text from each page of a bill, extract individual line items.\n\n"
        "For each page, you must return bill_items with fields:\n"
        "- item_name (string, as close as possible to bill text)\n"
        "- item_rate (float; 0.0 if not clearly present)\n"
        "- item_quantity (float; 1.0 if implicit; 0.0 if unknown)\n"
        "- item_amount (float; net amount for that line)\n\n"
        "Do NOT include grand totals, sub-totals, or net payable rows as separate items.\n"
        "Only include the per-service / per-medicine lines.\n\n"
        "Return ONLY valid JSON in this exact shape (no comments, no extra keys):\n"
        "{\n"
        "  \"pagewise_line_items\": [\n"
        "    {\n"
        "      \"page_no\": \"1\",\n"
        "      \"page_type\": \"Bill Detail\",\n"
        "      \"bill_items\": [\n"
        "        {\n"
        "          \"item_name\": \"...\",\n"
        "          \"item_amount\": 123.45,\n"
        "          \"item_rate\": 61.72,\n"
        "          \"item_quantity\": 2.0\n"
        "        }\n"
        "      ]\n"
        "    }\n"
        "  ]\n"
        "}\n"
    )

    user_prompt = (
        "Use the following OCR text per page to extract line items into the required schema.\n"
        "The data is provided as a JSON array under the key 'pages_ocr'.\n\n"
        f"pages_ocr = {json.dumps(pages_repr, ensure_ascii=False)}"
    )

    try:
        # The message list only accepts "user" and "model" roles; a
        # {"role": "system"} entry is rejected by the API. The system prompt
        # therefore has to be attached to the model object itself.
        model = genai.GenerativeModel(
            GEMINI_MODEL_NAME,
            system_instruction=system_instruction,
        )
        response = model.generate_content(user_prompt)

        raw_text = response.text.strip()

        # Strip possible ```json ... ``` wrappers
        if raw_text.startswith("```"):
            raw_text = re.sub(r"^```[a-zA-Z]*", "", raw_text)
            raw_text = re.sub(r"```$", "", raw_text)
            raw_text = raw_text.strip()

        parsed = json.loads(raw_text)

    except Exception:
        # Any LLM error → caller will fallback to regex. Log it so a
        # permanently failing LLM path is visible instead of silent.
        logging.getLogger(__name__).exception(
            "Gemini extraction failed; falling back to regex extraction"
        )
        return None, zero_usage

    # Validate the reply shape: the caller calls .get("bill_items") on every
    # page entry outside any try block, so malformed entries must not leak out.
    if not isinstance(parsed, dict):
        return None, zero_usage

    pagewise = parsed.get("pagewise_line_items", [])
    if not isinstance(pagewise, list):
        return None, zero_usage

    for page in pagewise:
        if not isinstance(page, dict):
            return None, zero_usage
        if not isinstance(page.get("bill_items", []), list):
            return None, zero_usage

    # We are on free tier, so we keep token_usage as zeros (schema only)
    token_usage = zero_usage

    return pagewise, token_usage
194
+
195
+
196
+ # ---------------- MAIN ENDPOINT ----------------
197
+
198
  @app.post("/extract-bill-data")
199
  async def extract_bill_data(payload: BillRequest):
200
  """
201
  Main Datathon endpoint.
202
 
203
+ Flow:
204
+ - Download document from URL
205
+ - If PDF: convert each page to an image and run OCR
206
+ - If image: run OCR directly
207
+ - Build page-wise OCR text
208
+ - Try LLM (Gemini) to extract structured line items
209
+ - If LLM fails or key missing → fallback to regex-only extraction
210
+ - Return JSON in the exact schema expected by the evaluators
211
  """
212
  doc_url = payload.document
213
 
 
219
  response = requests.get(doc_url, headers=headers, timeout=20)
220
 
221
  if response.status_code != 200:
 
222
  return {
223
  "is_success": False,
224
  "token_usage": {
 
235
  file_bytes = response.content
236
 
237
  except Exception:
 
238
  return {
239
  "is_success": False,
240
  "token_usage": {
 
248
  }
249
  }
250
 
251
+ # ---- Step 2: OCR (PDF + images) ----
252
+ pagewise_ocr = [] # list of {page_no, page_type, text}
253
+ lower_url = doc_url.lower()
254
 
 
255
  try:
256
+ # PDF case
 
 
257
  if lower_url.endswith(".pdf"):
258
  pages = convert_from_bytes(file_bytes)
259
  for idx, page_img in enumerate(pages, start=1):
260
+ text = pytesseract.image_to_string(page_img)
261
+ pagewise_ocr.append(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  {
263
+ "page_no": str(idx),
264
+ "page_type": "Bill Detail", # can refine later
265
+ "text": text,
266
  }
267
  )
 
268
 
269
+ # Image case
270
+ elif any(lower_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
271
+ image = Image.open(BytesIO(file_bytes))
272
+ text = pytesseract.image_to_string(image)
273
+ pagewise_ocr.append(
274
+ {
275
+ "page_no": "1",
276
+ "page_type": "Bill Detail",
277
+ "text": text,
278
+ }
279
+ )
280
+
281
+ # Other file types → currently not handled
282
+ else:
283
+ pagewise_ocr = []
284
 
285
  except Exception:
286
+ # OCR failure
287
  return {
288
  "is_success": False,
289
  "token_usage": {
 
297
  }
298
  }
299
 
300
+ # ---- Step 3: LLM extraction + fallback ----
301
+ pagewise_line_items = []
302
+ token_usage = {
303
+ "total_tokens": 0,
304
+ "input_tokens": 0,
305
+ "output_tokens": 0
306
+ }
307
+
308
+ if pagewise_ocr:
309
+ # Try Gemini first (if key is set)
310
+ pagewise_llm, token_usage = call_gemini_for_items(pagewise_ocr)
311
+
312
+ if pagewise_llm:
313
+ pagewise_line_items = pagewise_llm
314
+ else:
315
+ # Fallback: regex-based extraction
316
+ for p in pagewise_ocr:
317
+ items = extract_items_from_text(p["text"])
318
+ if items:
319
+ pagewise_line_items.append(
320
+ {
321
+ "page_no": p["page_no"],
322
+ "page_type": p["page_type"],
323
+ "bill_items": items,
324
+ }
325
+ )
326
+
327
+ total_item_count = sum(
328
+ len(p.get("bill_items", [])) for p in pagewise_line_items
329
+ )
330
+
331
+ # ---- Step 4: Final response ----
332
  return {
333
  "is_success": True,
334
+ "token_usage": token_usage,
 
 
 
 
335
  "data": {
336
  "pagewise_line_items": pagewise_line_items,
337
  "total_item_count": total_item_count
 
339
  }
340
 
341
 
342
@app.get("/")
def health_check():
    """
    Liveness probe: returns a fixed JSON payload so callers can confirm
    the API is up, plus a hint on how to call the extraction endpoint.
    """
    payload = {}
    payload["status"] = "ok"
    payload["message"] = "Bajaj Datathon bill extraction API is live."
    payload["hint"] = "Use POST /extract-bill-data with { 'document': '<url>' }"
    return payload