Spaces:

Seth0330
/

DocClassify

Sleeping

App Files Files Community

Seth committed on Feb 2

Commit

9ac95db

1 Parent(s): 14d81e7

Improve document classification with hybrid keyword + semantic approach and add more document types

Browse files

Files changed (1) hide show

backend/app/classifier.py +164 -41

backend/app/classifier.py CHANGED Viewed

@@ -19,28 +19,100 @@ else:
     MODELS_DIR = Path(__file__).resolve().parent.parent.parent / "Model"
 MODEL_PATH = MODELS_DIR / "bert-tiny"
-# Common document types with descriptions for better classification
 DOCUMENT_TYPES = {
-    "invoice": "A document requesting payment for goods or services provided, containing itemized charges, totals, and payment terms.",
-    "receipt": "A document confirming payment has been received, showing transaction details and proof of purchase.",
-    "contract": "A legally binding agreement between parties outlining terms, conditions, obligations, and signatures.",
-    "resume": "A document summarizing a person's work experience, education, skills, and qualifications for job applications.",
-    "letter": "A formal or informal written correspondence addressed to a recipient with greetings and closing.",
-    "report": "A structured document presenting analysis, findings, conclusions, and recommendations on a specific topic.",
-    "memo": "An internal business communication document with headers like To, From, Subject, and Date.",
-    "email": "Electronic mail correspondence with headers showing sender, recipient, subject, and message content.",
-    "form": "A structured document with fields to be filled out, often requiring signatures and dates.",
-    "certificate": "An official document certifying completion, achievement, or qualification with certification details.",
-    "license": "An official document granting permission to perform certain activities, with license numbers and expiration dates.",
-    "passport": "An official government document for international travel containing personal identification and nationality information.",
-    "medical record": "Healthcare documentation containing patient information, diagnoses, treatments, and medical history.",
-    "bank statement": "A financial document from a bank showing account transactions, balances, deposits, and withdrawals.",
-    "tax document": "Tax-related paperwork such as W-2 forms, 1099 forms, tax returns, or IRS correspondence.",
-    "legal document": "Court documents, legal filings, contracts, or other documents related to legal proceedings or matters.",
-    "academic paper": "A scholarly document with abstract, introduction, methodology, results, references, and citations.",
-    "presentation": "A document with slides, bullet points, or structured content for presenting information to an audience.",
-    "manual": "An instructional document providing step-by-step procedures, guidelines, or how-to information.",
-    "other": "A document that does not clearly fit into any of the above categories."
 }
@@ -106,17 +178,41 @@ class DocumentClassifier:
         print("Precomputing document type embeddings...")
         self.type_embeddings = {}
-        for doc_type, description in DOCUMENT_TYPES.items():
-            # Combine type name and description for better representation
-            text = f"{doc_type}: {description}"
             embedding = self._get_embedding(text)
             self.type_embeddings[doc_type] = embedding
         print("Document type embeddings computed!")
     def classify_document(self, text: str, max_length: int = 512) -> Dict[str, any]:
         """
-        Classify a document based on its text content using BERT-tiny embeddings.
         Args:
             text: Document text content
@@ -143,32 +239,59 @@ class DocumentClassifier:
             # Get embedding for the document text
             doc_embedding = self._get_embedding(text, max_length)
-            # Calculate cosine similarity with each document type
             scores = {}
-            for doc_type, type_embedding in self.type_embeddings.items():
-                # Calculate cosine similarity
                 similarity = F.cosine_similarity(doc_embedding, type_embedding, dim=1)
-                scores[doc_type] = similarity.item()
-            # Normalize scores to 0-1 range using softmax
-            score_values = torch.tensor(list(scores.values()))
-            normalized_scores = F.softmax(score_values, dim=0)
-            # Update scores with normalized values
-            normalized_dict = {}
-            for i, doc_type in enumerate(scores.keys()):
-                normalized_dict[doc_type] = normalized_scores[i].item()
-            # Find the best match
-            best_type = max(normalized_dict.items(), key=lambda x: x[1])
             # Get top 5 classifications
-            top_5 = sorted(normalized_dict.items(), key=lambda x: x[1], reverse=True)[:5]
             return {
                 "document_type": best_type[0],
-                "confidence": round(best_type[1], 3),
-                "all_scores": {k: round(v, 3) for k, v in top_5},
                 "text_preview": text[:200] + "..." if len(text) > 200 else text
             }

     MODELS_DIR = Path(__file__).resolve().parent.parent.parent / "Model"
 MODEL_PATH = MODELS_DIR / "bert-tiny"
+# Common document types with descriptions and keywords for better classification
 DOCUMENT_TYPES = {
+    "invoice": {
+        "description": "A document requesting payment for goods or services provided, containing itemized charges, totals, and payment terms.",
+        "keywords": ["invoice", "bill", "amount due", "total", "subtotal", "tax", "payment terms", "invoice number", "invoice date", "due date", "itemized", "charges", "balance", "payable", "vendor", "billing"]
+    },
+    "receipt": {
+        "description": "A document confirming payment has been received, showing transaction details and proof of purchase.",
+        "keywords": ["receipt", "payment received", "paid", "thank you", "transaction", "purchase", "payment confirmation", "receipt number", "date of purchase", "amount paid"]
+    },
+    "contract": {
+        "description": "A legally binding agreement between parties outlining terms, conditions, obligations, and signatures.",
+        "keywords": ["contract", "agreement", "terms", "party", "signature", "effective date", "parties", "whereas", "hereby", "obligations", "rights", "termination", "breach"]
+    },
+    "resume": {
+        "description": "A document summarizing a person's work experience, education, skills, and qualifications for job applications.",
+        "keywords": ["resume", "cv", "curriculum vitae", "experience", "education", "skills", "employment", "work history", "qualifications", "objective", "references", "contact information"]
+    },
+    "letter": {
+        "description": "A formal or informal written correspondence addressed to a recipient with greetings and closing.",
+        "keywords": ["dear", "sincerely", "yours", "letter", "correspondence", "regards", "best regards", "yours truly", "to whom it may concern", "date:", "subject:"]
+    },
+    "report": {
+        "description": "A structured document presenting analysis, findings, conclusions, and recommendations on a specific topic.",
+        "keywords": ["report", "summary", "findings", "conclusion", "analysis", "recommendations", "executive summary", "introduction", "methodology", "results", "discussion"]
+    },
+    "memo": {
+        "description": "An internal business communication document with headers like To, From, Subject, and Date.",
+        "keywords": ["memo", "memorandum", "to:", "from:", "subject:", "date:", "re:", "internal", "interoffice"]
+    },
+    "email": {
+        "description": "Electronic mail correspondence with headers showing sender, recipient, subject, and message content.",
+        "keywords": ["from:", "to:", "subject:", "sent:", "email", "cc:", "bcc:", "reply to", "message id", "date sent"]
+    },
+    "form": {
+        "description": "A structured document with fields to be filled out, often requiring signatures and dates.",
+        "keywords": ["form", "application", "please fill", "signature", "date", "please print", "complete", "fill out", "applicant", "fields"]
+    },
+    "certificate": {
+        "description": "An official document certifying completion, achievement, or qualification with certification details.",
+        "keywords": ["certificate", "certified", "awarded", "this certifies", "certification", "certificate of", "issued", "certificate number"]
+    },
+    "license": {
+        "description": "An official document granting permission to perform certain activities, with license numbers and expiration dates.",
+        "keywords": ["license", "licensed", "expires", "license number", "licensee", "licensing authority", "valid until", "license type", "permit"]
+    },
+    "passport": {
+        "description": "An official government document for international travel containing personal identification and nationality information.",
+        "keywords": ["passport", "nationality", "date of birth", "passport number", "passport no", "country of issue", "expiry date", "place of birth", "issuing authority"]
+    },
+    "medical record": {
+        "description": "Healthcare documentation containing patient information, diagnoses, treatments, and medical history.",
+        "keywords": ["medical", "diagnosis", "patient", "treatment", "prescription", "doctor", "physician", "symptoms", "medication", "health", "medical history", "patient id"]
+    },
+    "bank statement": {
+        "description": "A financial document from a bank showing account transactions, balances, deposits, and withdrawals.",
+        "keywords": ["account", "balance", "transaction", "deposit", "withdrawal", "bank statement", "account number", "account balance", "statement period", "debit", "credit", "checking", "savings"]
+    },
+    "tax document": {
+        "description": "Tax-related paperwork such as W-2 forms, 1099 forms, tax returns, or IRS correspondence.",
+        "keywords": ["tax", "irs", "income", "deduction", "w-2", "1099", "tax return", "federal tax", "social security", "withholding", "adjusted gross income", "taxable income"]
+    },
+    "legal document": {
+        "description": "Court documents, legal filings, contracts, or other documents related to legal proceedings or matters.",
+        "keywords": ["legal", "court", "plaintiff", "defendant", "attorney", "lawyer", "case number", "filing", "petition", "motion", "order", "judgment", "legal counsel"]
+    },
+    "academic paper": {
+        "description": "A scholarly document with abstract, introduction, methodology, results, references, and citations.",
+        "keywords": ["abstract", "introduction", "methodology", "references", "citation", "research", "study", "literature review", "hypothesis", "data analysis", "conclusion", "bibliography"]
+    },
+    "presentation": {
+        "description": "A document with slides, bullet points, or structured content for presenting information to an audience.",
+        "keywords": ["slide", "presentation", "agenda", "overview", "bullet points", "powerpoint", "key points", "summary slide", "title slide"]
+    },
+    "manual": {
+        "description": "An instructional document providing step-by-step procedures, guidelines, or how-to information.",
+        "keywords": ["manual", "instructions", "how to", "procedure", "steps", "guide", "tutorial", "user guide", "operation", "setup", "installation"]
+    },
+    "quote": {
+        "description": "A document providing a price estimate or quotation for goods or services before purchase.",
+        "keywords": ["quote", "quotation", "estimate", "pricing", "quote number", "valid until", "quote date", "estimated cost", "price quote", "proposal"]
+    },
+    "purchase order": {
+        "description": "A commercial document issued by a buyer to a seller indicating types, quantities, and agreed prices for products or services.",
+        "keywords": ["purchase order", "po number", "po#", "order number", "purchase", "order date", "ship to", "bill to", "quantity", "unit price", "po"]
+    },
+    "insurance policy": {
+        "description": "A document outlining insurance coverage, terms, premiums, and policy details.",
+        "keywords": ["insurance", "policy", "policy number", "premium", "coverage", "insured", "beneficiary", "policyholder", "deductible", "claim", "insurance company"]
+    },
+    "other": {
+        "description": "A document that does not clearly fit into any of the above categories.",
+        "keywords": []
+    }
 }
         print("Precomputing document type embeddings...")
         self.type_embeddings = {}
+        for doc_type, doc_info in DOCUMENT_TYPES.items():
+            # Combine type name, description, and keywords for better representation
+            description = doc_info["description"]
+            keywords = " ".join(doc_info.get("keywords", []))
+            text = f"{doc_type}: {description} Keywords: {keywords}"
             embedding = self._get_embedding(text)
             self.type_embeddings[doc_type] = embedding
         print("Document type embeddings computed!")
+    def _calculate_keyword_score(self, text: str, doc_type: str) -> float:
+        """Calculate keyword matching score for a document type."""
+        text_lower = text.lower()
+        doc_info = DOCUMENT_TYPES.get(doc_type, {})
+        keywords = doc_info.get("keywords", [])
+        if not keywords:
+            return 0.0
+        # Count keyword matches
+        matches = sum(1 for keyword in keywords if keyword.lower() in text_lower)
+        # Calculate score: matches / total keywords, with bonus for multiple matches
+        base_score = matches / len(keywords) if keywords else 0.0
+        # Boost score if multiple keywords found (indicates stronger match)
+        if matches > 0:
+            boost = min(0.3, matches * 0.05)  # Up to 30% boost
+            base_score = min(1.0, base_score + boost)
+        return base_score
     def classify_document(self, text: str, max_length: int = 512) -> Dict[str, any]:
         """
+        Classify a document based on its text content using hybrid keyword + semantic similarity.
         Args:
             text: Document text content
             # Get embedding for the document text
             doc_embedding = self._get_embedding(text, max_length)
+            # Calculate scores using hybrid approach
             scores = {}
+            for doc_type in DOCUMENT_TYPES.keys():
+                # 1. Keyword matching score (0-1)
+                keyword_score = self._calculate_keyword_score(text, doc_type)
+                # 2. Semantic similarity score (0-1, normalized)
+                type_embedding = self.type_embeddings[doc_type]
                 similarity = F.cosine_similarity(doc_embedding, type_embedding, dim=1)
+                semantic_score = (similarity.item() + 1) / 2  # Normalize from [-1, 1] to [0, 1]
+                # 3. Combine scores: 60% keyword, 40% semantic
+                # This gives more weight to explicit keyword matches
+                combined_score = (keyword_score * 0.6) + (semantic_score * 0.4)
+                scores[doc_type] = combined_score
+            # Find the best match
+            best_type = max(scores.items(), key=lambda x: x[1])
+            # Normalize confidence to percentage (scale to make it more meaningful)
+            # Use piecewise-linear scaling (three segments) for better confidence representation
+            max_score = best_type[1]
+            if max_score > 0.5:
+                # High confidence: scale from 0.5-1.0 to 50%-95%
+                confidence = 50 + (max_score - 0.5) * 90
+            elif max_score > 0.3:
+                # Medium confidence: scale from 0.3-0.5 to 30%-50%
+                confidence = 30 + (max_score - 0.3) * 100
+            else:
+                # Low confidence: scale from 0-0.3 to 0%-30%
+                confidence = max_score * 100
+            confidence = min(95, max(5, confidence))  # Clamp between 5% and 95%
             # Get top 5 classifications
+            top_5 = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]
+            # Convert scores to percentages for display
+            top_5_percentages = {}
+            for doc_type, score in top_5:
+                if score > 0.5:
+                    percent = 50 + (score - 0.5) * 90
+                elif score > 0.3:
+                    percent = 30 + (score - 0.3) * 100
+                else:
+                    percent = score * 100
+                top_5_percentages[doc_type] = min(95, max(5, percent))
             return {
                 "document_type": best_type[0],
+                "confidence": round(confidence / 100, 3),  # Return as 0-1 for consistency
+                "all_scores": {k: round(v / 100, 3) for k, v in top_5_percentages.items()},
                 "text_preview": text[:200] + "..." if len(text) > 200 else text
             }