Seth committed on
Commit
9ac95db
·
1 Parent(s): 14d81e7

Improve document classification with hybrid keyword + semantic approach and add more document types

Browse files
Files changed (1) hide show
  1. backend/app/classifier.py +164 -41
backend/app/classifier.py CHANGED
@@ -19,28 +19,100 @@ else:
19
  MODELS_DIR = Path(__file__).resolve().parent.parent.parent / "Model"
20
  MODEL_PATH = MODELS_DIR / "bert-tiny"
21
 
22
- # Common document types with descriptions for better classification
23
  DOCUMENT_TYPES = {
24
- "invoice": "A document requesting payment for goods or services provided, containing itemized charges, totals, and payment terms.",
25
- "receipt": "A document confirming payment has been received, showing transaction details and proof of purchase.",
26
- "contract": "A legally binding agreement between parties outlining terms, conditions, obligations, and signatures.",
27
- "resume": "A document summarizing a person's work experience, education, skills, and qualifications for job applications.",
28
- "letter": "A formal or informal written correspondence addressed to a recipient with greetings and closing.",
29
- "report": "A structured document presenting analysis, findings, conclusions, and recommendations on a specific topic.",
30
- "memo": "An internal business communication document with headers like To, From, Subject, and Date.",
31
- "email": "Electronic mail correspondence with headers showing sender, recipient, subject, and message content.",
32
- "form": "A structured document with fields to be filled out, often requiring signatures and dates.",
33
- "certificate": "An official document certifying completion, achievement, or qualification with certification details.",
34
- "license": "An official document granting permission to perform certain activities, with license numbers and expiration dates.",
35
- "passport": "An official government document for international travel containing personal identification and nationality information.",
36
- "medical record": "Healthcare documentation containing patient information, diagnoses, treatments, and medical history.",
37
- "bank statement": "A financial document from a bank showing account transactions, balances, deposits, and withdrawals.",
38
- "tax document": "Tax-related paperwork such as W-2 forms, 1099 forms, tax returns, or IRS correspondence.",
39
- "legal document": "Court documents, legal filings, contracts, or other documents related to legal proceedings or matters.",
40
- "academic paper": "A scholarly document with abstract, introduction, methodology, results, references, and citations.",
41
- "presentation": "A document with slides, bullet points, or structured content for presenting information to an audience.",
42
- "manual": "An instructional document providing step-by-step procedures, guidelines, or how-to information.",
43
- "other": "A document that does not clearly fit into any of the above categories."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  }
45
 
46
 
@@ -106,17 +178,41 @@ class DocumentClassifier:
106
  print("Precomputing document type embeddings...")
107
  self.type_embeddings = {}
108
 
109
- for doc_type, description in DOCUMENT_TYPES.items():
110
- # Combine type name and description for better representation
111
- text = f"{doc_type}: {description}"
 
 
112
  embedding = self._get_embedding(text)
113
  self.type_embeddings[doc_type] = embedding
114
 
115
  print("Document type embeddings computed!")
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  def classify_document(self, text: str, max_length: int = 512) -> Dict[str, any]:
118
  """
119
- Classify a document based on its text content using BERT-tiny embeddings.
120
 
121
  Args:
122
  text: Document text content
@@ -143,32 +239,59 @@ class DocumentClassifier:
143
  # Get embedding for the document text
144
  doc_embedding = self._get_embedding(text, max_length)
145
 
146
- # Calculate cosine similarity with each document type
147
  scores = {}
148
- for doc_type, type_embedding in self.type_embeddings.items():
149
- # Calculate cosine similarity
 
 
 
 
 
150
  similarity = F.cosine_similarity(doc_embedding, type_embedding, dim=1)
151
- scores[doc_type] = similarity.item()
 
 
 
 
 
152
 
153
- # Normalize scores to 0-1 range using softmax
154
- score_values = torch.tensor(list(scores.values()))
155
- normalized_scores = F.softmax(score_values, dim=0)
156
 
157
- # Update scores with normalized values
158
- normalized_dict = {}
159
- for i, doc_type in enumerate(scores.keys()):
160
- normalized_dict[doc_type] = normalized_scores[i].item()
 
 
 
 
 
 
 
 
161
 
162
- # Find the best match
163
- best_type = max(normalized_dict.items(), key=lambda x: x[1])
164
 
165
  # Get top 5 classifications
166
- top_5 = sorted(normalized_dict.items(), key=lambda x: x[1], reverse=True)[:5]
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  return {
169
  "document_type": best_type[0],
170
- "confidence": round(best_type[1], 3),
171
- "all_scores": {k: round(v, 3) for k, v in top_5},
172
  "text_preview": text[:200] + "..." if len(text) > 200 else text
173
  }
174
 
 
19
  MODELS_DIR = Path(__file__).resolve().parent.parent.parent / "Model"
20
  MODEL_PATH = MODELS_DIR / "bert-tiny"
21
 
22
+ # Common document types with descriptions and keywords for better classification
23
  DOCUMENT_TYPES = {
24
+ "invoice": {
25
+ "description": "A document requesting payment for goods or services provided, containing itemized charges, totals, and payment terms.",
26
+ "keywords": ["invoice", "bill", "amount due", "total", "subtotal", "tax", "payment terms", "invoice number", "invoice date", "due date", "itemized", "charges", "balance", "payable", "vendor", "billing"]
27
+ },
28
+ "receipt": {
29
+ "description": "A document confirming payment has been received, showing transaction details and proof of purchase.",
30
+ "keywords": ["receipt", "payment received", "paid", "thank you", "transaction", "purchase", "payment confirmation", "receipt number", "date of purchase", "amount paid"]
31
+ },
32
+ "contract": {
33
+ "description": "A legally binding agreement between parties outlining terms, conditions, obligations, and signatures.",
34
+ "keywords": ["contract", "agreement", "terms", "party", "signature", "effective date", "parties", "whereas", "hereby", "obligations", "rights", "termination", "breach"]
35
+ },
36
+ "resume": {
37
+ "description": "A document summarizing a person's work experience, education, skills, and qualifications for job applications.",
38
+ "keywords": ["resume", "cv", "curriculum vitae", "experience", "education", "skills", "employment", "work history", "qualifications", "objective", "references", "contact information"]
39
+ },
40
+ "letter": {
41
+ "description": "A formal or informal written correspondence addressed to a recipient with greetings and closing.",
42
+ "keywords": ["dear", "sincerely", "yours", "letter", "correspondence", "regards", "best regards", "yours truly", "to whom it may concern", "date:", "subject:"]
43
+ },
44
+ "report": {
45
+ "description": "A structured document presenting analysis, findings, conclusions, and recommendations on a specific topic.",
46
+ "keywords": ["report", "summary", "findings", "conclusion", "analysis", "recommendations", "executive summary", "introduction", "methodology", "results", "discussion"]
47
+ },
48
+ "memo": {
49
+ "description": "An internal business communication document with headers like To, From, Subject, and Date.",
50
+ "keywords": ["memo", "memorandum", "to:", "from:", "subject:", "date:", "re:", "internal", "interoffice"]
51
+ },
52
+ "email": {
53
+ "description": "Electronic mail correspondence with headers showing sender, recipient, subject, and message content.",
54
+ "keywords": ["from:", "to:", "subject:", "sent:", "email", "cc:", "bcc:", "reply to", "message id", "date sent"]
55
+ },
56
+ "form": {
57
+ "description": "A structured document with fields to be filled out, often requiring signatures and dates.",
58
+ "keywords": ["form", "application", "please fill", "signature", "date", "please print", "complete", "fill out", "applicant", "fields"]
59
+ },
60
+ "certificate": {
61
+ "description": "An official document certifying completion, achievement, or qualification with certification details.",
62
+ "keywords": ["certificate", "certified", "awarded", "this certifies", "certification", "certificate of", "issued", "certificate number"]
63
+ },
64
+ "license": {
65
+ "description": "An official document granting permission to perform certain activities, with license numbers and expiration dates.",
66
+ "keywords": ["license", "licensed", "expires", "license number", "licensee", "licensing authority", "valid until", "license type", "permit"]
67
+ },
68
+ "passport": {
69
+ "description": "An official government document for international travel containing personal identification and nationality information.",
70
+ "keywords": ["passport", "nationality", "date of birth", "passport number", "passport no", "country of issue", "expiry date", "place of birth", "issuing authority"]
71
+ },
72
+ "medical record": {
73
+ "description": "Healthcare documentation containing patient information, diagnoses, treatments, and medical history.",
74
+ "keywords": ["medical", "diagnosis", "patient", "treatment", "prescription", "doctor", "physician", "symptoms", "medication", "health", "medical history", "patient id"]
75
+ },
76
+ "bank statement": {
77
+ "description": "A financial document from a bank showing account transactions, balances, deposits, and withdrawals.",
78
+ "keywords": ["account", "balance", "transaction", "deposit", "withdrawal", "bank statement", "account number", "account balance", "statement period", "debit", "credit", "checking", "savings"]
79
+ },
80
+ "tax document": {
81
+ "description": "Tax-related paperwork such as W-2 forms, 1099 forms, tax returns, or IRS correspondence.",
82
+ "keywords": ["tax", "irs", "income", "deduction", "w-2", "1099", "tax return", "federal tax", "social security", "withholding", "adjusted gross income", "taxable income"]
83
+ },
84
+ "legal document": {
85
+ "description": "Court documents, legal filings, contracts, or other documents related to legal proceedings or matters.",
86
+ "keywords": ["legal", "court", "plaintiff", "defendant", "attorney", "lawyer", "case number", "filing", "petition", "motion", "order", "judgment", "legal counsel"]
87
+ },
88
+ "academic paper": {
89
+ "description": "A scholarly document with abstract, introduction, methodology, results, references, and citations.",
90
+ "keywords": ["abstract", "introduction", "methodology", "references", "citation", "research", "study", "literature review", "hypothesis", "data analysis", "conclusion", "bibliography"]
91
+ },
92
+ "presentation": {
93
+ "description": "A document with slides, bullet points, or structured content for presenting information to an audience.",
94
+ "keywords": ["slide", "presentation", "agenda", "overview", "bullet points", "powerpoint", "key points", "summary slide", "title slide"]
95
+ },
96
+ "manual": {
97
+ "description": "An instructional document providing step-by-step procedures, guidelines, or how-to information.",
98
+ "keywords": ["manual", "instructions", "how to", "procedure", "steps", "guide", "tutorial", "user guide", "operation", "setup", "installation"]
99
+ },
100
+ "quote": {
101
+ "description": "A document providing a price estimate or quotation for goods or services before purchase.",
102
+ "keywords": ["quote", "quotation", "estimate", "pricing", "quote number", "valid until", "quote date", "estimated cost", "price quote", "proposal"]
103
+ },
104
+ "purchase order": {
105
+ "description": "A commercial document issued by a buyer to a seller indicating types, quantities, and agreed prices for products or services.",
106
+ "keywords": ["purchase order", "po number", "po#", "order number", "purchase", "order date", "ship to", "bill to", "quantity", "unit price", "po"]
107
+ },
108
+ "insurance policy": {
109
+ "description": "A document outlining insurance coverage, terms, premiums, and policy details.",
110
+ "keywords": ["insurance", "policy", "policy number", "premium", "coverage", "insured", "beneficiary", "policyholder", "deductible", "claim", "insurance company"]
111
+ },
112
+ "other": {
113
+ "description": "A document that does not clearly fit into any of the above categories.",
114
+ "keywords": []
115
+ }
116
  }
117
 
118
 
 
178
  print("Precomputing document type embeddings...")
179
  self.type_embeddings = {}
180
 
181
+ for doc_type, doc_info in DOCUMENT_TYPES.items():
182
+ # Combine type name, description, and keywords for better representation
183
+ description = doc_info["description"]
184
+ keywords = " ".join(doc_info.get("keywords", []))
185
+ text = f"{doc_type}: {description} Keywords: {keywords}"
186
  embedding = self._get_embedding(text)
187
  self.type_embeddings[doc_type] = embedding
188
 
189
  print("Document type embeddings computed!")
190
 
191
+ def _calculate_keyword_score(self, text: str, doc_type: str) -> float:
192
+ """Calculate keyword matching score for a document type."""
193
+ text_lower = text.lower()
194
+ doc_info = DOCUMENT_TYPES.get(doc_type, {})
195
+ keywords = doc_info.get("keywords", [])
196
+
197
+ if not keywords:
198
+ return 0.0
199
+
200
+ # Count keyword matches
201
+ matches = sum(1 for keyword in keywords if keyword.lower() in text_lower)
202
+
203
+ # Calculate score: matches / total keywords, with bonus for multiple matches
204
+ base_score = matches / len(keywords) if keywords else 0.0
205
+
206
+ # Boost score if multiple keywords found (indicates stronger match)
207
+ if matches > 0:
208
+ boost = min(0.3, matches * 0.05) # Up to 30% boost
209
+ base_score = min(1.0, base_score + boost)
210
+
211
+ return base_score
212
+
213
  def classify_document(self, text: str, max_length: int = 512) -> Dict[str, any]:
214
  """
215
+ Classify a document based on its text content using hybrid keyword + semantic similarity.
216
 
217
  Args:
218
  text: Document text content
 
239
  # Get embedding for the document text
240
  doc_embedding = self._get_embedding(text, max_length)
241
 
242
+ # Calculate scores using hybrid approach
243
  scores = {}
244
+
245
+ for doc_type in DOCUMENT_TYPES.keys():
246
+ # 1. Keyword matching score (0-1)
247
+ keyword_score = self._calculate_keyword_score(text, doc_type)
248
+
249
+ # 2. Semantic similarity score (0-1, normalized)
250
+ type_embedding = self.type_embeddings[doc_type]
251
  similarity = F.cosine_similarity(doc_embedding, type_embedding, dim=1)
252
+ semantic_score = (similarity.item() + 1) / 2 # Normalize from [-1, 1] to [0, 1]
253
+
254
+ # 3. Combine scores: 60% keyword, 40% semantic
255
+ # This gives more weight to explicit keyword matches
256
+ combined_score = (keyword_score * 0.6) + (semantic_score * 0.4)
257
+ scores[doc_type] = combined_score
258
 
259
+ # Find the best match
260
+ best_type = max(scores.items(), key=lambda x: x[1])
 
261
 
262
+ # Normalize confidence to percentage (scale to make it more meaningful)
263
+ # Use sigmoid-like scaling for better confidence representation
264
+ max_score = best_type[1]
265
+ if max_score > 0.5:
266
+ # High confidence: scale from 0.5-1.0 to 50%-95%
267
+ confidence = 50 + (max_score - 0.5) * 90
268
+ elif max_score > 0.3:
269
+ # Medium confidence: scale from 0.3-0.5 to 30%-50%
270
+ confidence = 30 + (max_score - 0.3) * 100
271
+ else:
272
+ # Low confidence: scale from 0-0.3 to 0%-30%
273
+ confidence = max_score * 100
274
 
275
+ confidence = min(95, max(5, confidence)) # Clamp between 5% and 95%
 
276
 
277
  # Get top 5 classifications
278
+ top_5 = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]
279
+
280
+ # Convert scores to percentages for display
281
+ top_5_percentages = {}
282
+ for doc_type, score in top_5:
283
+ if score > 0.5:
284
+ percent = 50 + (score - 0.5) * 90
285
+ elif score > 0.3:
286
+ percent = 30 + (score - 0.3) * 100
287
+ else:
288
+ percent = score * 100
289
+ top_5_percentages[doc_type] = min(95, max(5, percent))
290
 
291
  return {
292
  "document_type": best_type[0],
293
+ "confidence": round(confidence / 100, 3), # Return as 0-1 for consistency
294
+ "all_scores": {k: round(v / 100, 3) for k, v in top_5_percentages.items()},
295
  "text_preview": text[:200] + "..." if len(text) > 200 else text
296
  }
297