vachaspathi committed on
Commit
39bc8a4
·
verified ·
1 Parent(s): 050d9ff

Update ai_engine.py

Browse files
Files changed (1) hide show
  1. ai_engine.py +73 -19
ai_engine.py CHANGED
@@ -5,6 +5,7 @@ from pdf2image import convert_from_path
5
  from PIL import Image
6
  import os
7
  import json
 
8
  import config
9
 
10
  # Load Model
@@ -15,42 +16,78 @@ try:
15
  except:
16
  model = None
17
 
 
 
 
 
 
 
 
 
 
 
18
  def perform_ocr(file_obj):
19
  if file_obj is None: return "", None
20
  try:
21
- filename = os.path.basename(file_obj)
22
- if filename.lower().endswith(".pdf"):
 
 
23
  image = convert_from_path(file_obj, first_page=1, last_page=1)[0]
24
  else:
25
  image = Image.open(file_obj).convert("RGB")
26
- return pytesseract.image_to_string(image), image
27
- except: return "", None
 
 
28
 
29
- def extract_intelligent_json(text):
 
 
30
  """
31
- Classifies the document and extracts relevant fields.
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
  if not model: return {}
34
 
35
- # Robust prompt instructing the AI to classify and format
36
  prompt = f"""<|im_start|>system
37
- Analyze the document text.
38
- 1. CLASSIFY the type as one of: ["invoice", "estimate", "credit_note", "expense", "contact", "purchase_order"].
39
- 2. EXTRACT data based on the type.
 
 
 
 
 
40
 
41
- OUTPUT FORMAT (JSON ONLY):
42
  {{
43
  "doc_type": "invoice",
 
44
  "data": {{
45
- "vendor_name": "...",
46
  "date": "YYYY-MM-DD",
47
  "reference_number": "...",
48
  "total": 0.00,
49
- "line_items": [ {{"name": "...", "rate": 0, "quantity": 1}} ]
50
  }}
51
  }}
52
  <|im_end|>
53
  <|im_start|>user
 
54
  DOCUMENT TEXT:
55
  {text[:1500]}
56
  <|im_end|>
@@ -59,12 +96,29 @@ def extract_intelligent_json(text):
59
  """
60
 
61
  inputs = tokenizer(prompt, return_tensors="pt")
62
- out = model.generate(**inputs, max_new_tokens=350, temperature=0.1)
63
 
64
  try:
65
- json_str = tokenizer.decode(out[0]).split("```json")[1].split("```")[0].strip()
66
- return json.loads(json_str)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  except Exception as e:
68
- print(f"AI Error: {e}")
69
- # Fallback default
70
- return {"doc_type": "unknown", "data": {}}
 
 
5
  from PIL import Image
6
  import os
7
  import json
8
+ import re
9
  import config
10
 
11
  # Load Model
 
16
  except:
17
  model = None
18
 
19
def get_metadata(file_obj):
    """Collect basic file-system clues about a document on disk.

    Args:
        file_obj: Path to the file (str or path-like).

    Returns:
        A dict with three keys:
          * ``filename``  - base name of the path.
          * ``extension`` - lower-cased extension without the leading dot;
            empty string when the name has no extension.
          * ``size_kb``   - file size in bytes divided by 1024 (float).
        If the path cannot be inspected (missing file, ``None``, wrong type),
        the placeholder ``{"filename": "unknown", "extension": "", "size_kb": 0}``
        is returned instead of raising.
    """
    try:
        name = os.path.basename(file_obj)
        size = os.path.getsize(file_obj)
        # splitext yields "" for names without a dot; splitting on "." and
        # taking the last piece would report the whole name as the extension.
        ext = os.path.splitext(name)[1].lstrip(".").lower()
    except (OSError, TypeError, ValueError):
        # Best-effort helper: an unreadable path degrades to a placeholder,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return {"filename": "unknown", "extension": "", "size_kb": 0}
    return {"filename": name, "extension": ext, "size_kb": size / 1024}
28

def perform_ocr(file_obj):
    """Run OCR on an image file or on the first page of a PDF.

    Args:
        file_obj: Path to the document, or ``None`` when nothing was supplied.

    Returns:
        A 3-tuple ``(text, image, meta)``:
          * ``text``  - string returned by ``pytesseract.image_to_string``.
          * ``image`` - the image that was passed to the OCR call.
          * ``meta``  - the dict returned by ``get_metadata(file_obj)``.
        On ``None`` input or any processing failure the result is
        ``("", None, {})``. Every path returns three values, so callers can
        always unpack the result the same way.
    """
    if file_obj is None:
        # Same arity as the success path; a 2-tuple here would break callers
        # that unpack three values.
        return "", None, {}
    try:
        # Extract metadata before processing; the filename decides the loader.
        meta = get_metadata(file_obj)

        if meta["filename"].lower().endswith(".pdf"):
            # Only the first page of a PDF is rendered and OCR'd.
            image = convert_from_path(file_obj, first_page=1, last_page=1)[0]
        else:
            image = Image.open(file_obj).convert("RGB")

        text = pytesseract.image_to_string(image)
        return text, image, meta
    except Exception as e:
        # Best-effort: report the cause instead of swallowing it silently,
        # and let KeyboardInterrupt/SystemExit propagate.
        print(f"OCR Error: {e}")
        return "", None, {}
43
 
44
def fallback_classifier(text, filename):
    """Rule-based document classifier used when the AI result is unusable.

    The OCR text and the filename are joined and lower-cased, then checked
    against keyword rules in priority order; the first rule that matches wins.
    Matching is plain substring search.

    Args:
        text: OCR text of the document (``None`` is treated as empty).
        filename: Name of the source file (``None`` is treated as empty).

    Returns:
        One of ``"invoice"``, ``"estimate"``, ``"credit_note"``,
        ``"purchase_order"``, ``"bill"``, ``"expense"``, or ``"unknown"``
        when no rule matches.
    """
    # This is the last-resort path, so it must not raise on missing input.
    combined = f"{text or ''} {filename or ''}".lower()

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("invoice", ("invoice", "inv-")),
        ("estimate", ("estimate", "quote")),
        ("credit_note", ("credit note",)),
        ("purchase_order", ("purchase order", "po-")),
        ("bill", ("bill", "payment due")),
        ("expense", ("receipt",)),
    )
    for doc_type, keywords in rules:
        if any(keyword in combined for keyword in keywords):
            return doc_type

    return "unknown"
58
+
59
+ def extract_intelligent_json(text, metadata):
60
+ """
61
+ Combines OCR + Metadata -> AI -> JSON
62
  """
63
  if not model: return {}
64
 
65
+ # Inject Metadata into System Prompt
66
  prompt = f"""<|im_start|>system
67
+ You are a Document Classifier. Use the Filename and Text to identify the document type.
68
+
69
+ VALID TYPES: ["invoice", "bill", "estimate", "credit_note", "purchase_order", "expense"]
70
+
71
+ RULES:
72
+ 1. If filename contains 'INV', it is an 'invoice'.
73
+ 2. If text mentions 'Purchase Order', it is a 'purchase_order'.
74
+ 3. Extract the Vendor/Customer Name and Dates carefully.
75
 
76
+ OUTPUT JSON FORMAT:
77
  {{
78
  "doc_type": "invoice",
79
+ "confidence": "high",
80
  "data": {{
81
+ "contact_name": "...",
82
  "date": "YYYY-MM-DD",
83
  "reference_number": "...",
84
  "total": 0.00,
85
+ "line_items": [ {{"name": "...", "description": "...", "rate": 0, "quantity": 1}} ]
86
  }}
87
  }}
88
  <|im_end|>
89
  <|im_start|>user
90
+ METADATA: {json.dumps(metadata)}
91
  DOCUMENT TEXT:
92
  {text[:1500]}
93
  <|im_end|>
 
96
  """
97
 
98
  inputs = tokenizer(prompt, return_tensors="pt")
99
+ out = model.generate(**inputs, max_new_tokens=400, temperature=0.1)
100
 
101
  try:
102
+ # Extract JSON block using Regex (More robust than split)
103
+ full_response = tokenizer.decode(out[0])
104
+ json_match = re.search(r"```json\s*(\{.*?\})\s*```", full_response, re.DOTALL)
105
+
106
+ if json_match:
107
+ data = json.loads(json_match.group(1))
108
+ else:
109
+ # Fallback: Try finding the first { and last }
110
+ start = full_response.find("{")
111
+ end = full_response.rfind("}") + 1
112
+ data = json.loads(full_response[start:end])
113
+
114
+ # Double Check Classification
115
+ if data.get("doc_type") == "unknown":
116
+ data["doc_type"] = fallback_classifier(text, metadata.get("filename", ""))
117
+
118
+ return data
119
+
120
  except Exception as e:
121
+ print(f"AI Parsing Error: {e}")
122
+ # Hard Fallback
123
+ guessed_type = fallback_classifier(text, metadata.get("filename", ""))
124
+ return {"doc_type": guessed_type, "data": {}}