Abhisesh7 committed on
Commit
8d748b8
·
verified ·
1 Parent(s): cbb2ae3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -5
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # main.py
2
  from fastapi import FastAPI, HTTPException, File, UploadFile
3
  from paddleocr import PaddleOCR
4
  import fitz
@@ -8,12 +7,22 @@ import os
8
  import requests
9
  import sqlite3
10
  from datetime import datetime
 
11
 
12
  app = FastAPI()
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
16
 
 
 
 
 
 
 
 
 
 
 
17
  HF_API_URL = "https://api-inference.huggingface.co/models/Abhisesh7/Invoice-Fraud-Detection"
18
  HF_API_KEY = os.getenv("HF_API_KEY")
19
  if not HF_API_KEY:
@@ -21,6 +30,7 @@ if not HF_API_KEY:
21
  raise RuntimeError("Hugging Face API key not set")
22
  HEADERS = {"Authorization": f"Bearer {HF_API_KEY}", "Content-Type": "application/json"}
23
 
 
24
  conn = sqlite3.connect("invoices.db")
25
  cursor = conn.cursor()
26
  cursor.execute("""
@@ -37,14 +47,18 @@ conn.commit()
37
  @app.post("/process_invoice/")
38
  async def process_invoice(file: UploadFile = File(...)):
39
  try:
 
40
  if not file.filename.lower().endswith(('.pdf', '.png', '.jpg', '.jpeg')):
41
  raise HTTPException(status_code=400, detail="Invalid file type. Use PDF or image (PNG, JPG, JPEG).")
42
 
 
43
  with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
44
  content = await file.read()
45
  temp_file.write(content)
46
  temp_file_path = temp_file.name
47
 
 
 
48
  extracted_text = ""
49
  if temp_file_path.lower().endswith('.pdf'):
50
  pdf_document = fitz.open(temp_file_path)
@@ -53,11 +67,11 @@ async def process_invoice(file: UploadFile = File(...)):
53
  pix = page.get_pixmap()
54
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as img_file:
55
  pix.save(img_file.name)
56
- result = ocr.ocr(img_file.name, cls=True)
57
  extracted_text += "\n".join([line[1][0] for line in result[0]]) + "\n"
58
  pdf_document.close()
59
  else:
60
- result = ocr.ocr(temp_file_path, cls=True)
61
  extracted_text = "\n".join([line[1][0] for line in result[0]])
62
 
63
  os.unlink(temp_file_path)
@@ -65,6 +79,7 @@ async def process_invoice(file: UploadFile = File(...)):
65
  if not extracted_text.strip():
66
  raise HTTPException(status_code=400, detail="No text extracted from file.")
67
 
 
68
  payload = {"text": extracted_text}
69
  response = requests.post(HF_API_URL, headers=HEADERS, json=payload)
70
 
@@ -77,11 +92,13 @@ async def process_invoice(file: UploadFile = File(...)):
77
  fraud_reasoning = result.get("fraud_reasoning", "")
78
  flagged = result.get("flagged", False)
79
 
 
80
  vendor = entities.get("vendor", "Unknown")
81
  amount = float(entities.get("amount", 0))
82
  date_str = entities.get("date", "")
83
  invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date().isoformat() if date_str else ""
84
 
 
85
  cursor.execute("""
86
  SELECT id, timestamp FROM invoices
87
  WHERE vendor = ? AND amount = ? AND date = ?
@@ -93,6 +110,7 @@ async def process_invoice(file: UploadFile = File(...)):
93
  fraud_reasoning += f" | {duplicate_info}"
94
  flagged = True
95
 
 
96
  timestamp = datetime.now().isoformat()
97
  cursor.execute("""
98
  INSERT INTO invoices (vendor, amount, date, timestamp)
@@ -116,4 +134,7 @@ async def process_invoice(file: UploadFile = File(...)):
116
 
117
  @app.on_event("shutdown")
118
  def shutdown_event():
119
- conn.close()
 
 
 
 
 
1
  from fastapi import FastAPI, HTTPException, File, UploadFile
2
  from paddleocr import PaddleOCR
3
  import fitz
 
7
  import requests
8
  import sqlite3
9
  from datetime import datetime
10
+ import uvicorn
11
 
12
  app = FastAPI()
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
 
15
 
16
+ # Initialize PaddleOCR lazily to reduce startup memory usage
17
+ ocr = None
18
+ def get_ocr():
19
+ global ocr
20
+ if ocr is None:
21
+ logger.info("Initializing PaddleOCR")
22
+ ocr = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=False) # Disable angle classification and GPU
23
+ return ocr
24
+
25
+ # Hugging Face API configuration
26
  HF_API_URL = "https://api-inference.huggingface.co/models/Abhisesh7/Invoice-Fraud-Detection"
27
  HF_API_KEY = os.getenv("HF_API_KEY")
28
  if not HF_API_KEY:
 
30
  raise RuntimeError("Hugging Face API key not set")
31
  HEADERS = {"Authorization": f"Bearer {HF_API_KEY}", "Content-Type": "application/json"}
32
 
33
+ # Initialize SQLite database
34
  conn = sqlite3.connect("invoices.db")
35
  cursor = conn.cursor()
36
  cursor.execute("""
 
47
  @app.post("/process_invoice/")
48
  async def process_invoice(file: UploadFile = File(...)):
49
  try:
50
+ # Validate file type
51
  if not file.filename.lower().endswith(('.pdf', '.png', '.jpg', '.jpeg')):
52
  raise HTTPException(status_code=400, detail="Invalid file type. Use PDF or image (PNG, JPG, JPEG).")
53
 
54
+ # Save uploaded file
55
  with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
56
  content = await file.read()
57
  temp_file.write(content)
58
  temp_file_path = temp_file.name
59
 
60
+ # Extract text using PaddleOCR
61
+ ocr_instance = get_ocr()
62
  extracted_text = ""
63
  if temp_file_path.lower().endswith('.pdf'):
64
  pdf_document = fitz.open(temp_file_path)
 
67
  pix = page.get_pixmap()
68
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as img_file:
69
  pix.save(img_file.name)
70
+ result = ocr_instance.ocr(img_file.name, cls=False)
71
  extracted_text += "\n".join([line[1][0] for line in result[0]]) + "\n"
72
  pdf_document.close()
73
  else:
74
+ result = ocr_instance.ocr(temp_file_path, cls=False)
75
  extracted_text = "\n".join([line[1][0] for line in result[0]])
76
 
77
  os.unlink(temp_file_path)
 
79
  if not extracted_text.strip():
80
  raise HTTPException(status_code=400, detail="No text extracted from file.")
81
 
82
+ # Call Hugging Face API
83
  payload = {"text": extracted_text}
84
  response = requests.post(HF_API_URL, headers=HEADERS, json=payload)
85
 
 
92
  fraud_reasoning = result.get("fraud_reasoning", "")
93
  flagged = result.get("flagged", False)
94
 
95
+ # Extract invoice metadata
96
  vendor = entities.get("vendor", "Unknown")
97
  amount = float(entities.get("amount", 0))
98
  date_str = entities.get("date", "")
99
  invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date().isoformat() if date_str else ""
100
 
101
+ # Check for duplicates
102
  cursor.execute("""
103
  SELECT id, timestamp FROM invoices
104
  WHERE vendor = ? AND amount = ? AND date = ?
 
110
  fraud_reasoning += f" | {duplicate_info}"
111
  flagged = True
112
 
113
+ # Store invoice metadata
114
  timestamp = datetime.now().isoformat()
115
  cursor.execute("""
116
  INSERT INTO invoices (vendor, amount, date, timestamp)
 
134
 
135
  @app.on_event("shutdown")
136
  def shutdown_event():
137
+ conn.close()
138
+
139
+ if __name__ == "__main__":
140
+ uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))