Seth committed on
Commit
f6e574f
·
1 Parent(s): b434cd3
.gitignore ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ .venv
11
+ *.egg-info/
12
+ dist/
13
+ build/
14
+
15
+ # Virtual environment
16
+ venv/
17
+
18
+ # Models (large files)
19
+ Model/
20
+ models/
21
+ *.bin
22
+ *.safetensors
23
+
24
+ # Node
25
+ node_modules/
26
+ npm-debug.log*
27
+ yarn-debug.log*
28
+ yarn-error.log*
29
+
30
+ # Frontend build
31
+ frontend/dist/
32
+ frontend/.vite/
33
+
34
+ # IDE
35
+ .vscode/
36
+ .idea/
37
+ *.swp
38
+ *.swo
39
+ *~
40
+
41
+ # OS
42
+ .DS_Store
43
+ Thumbs.db
44
+
45
+ # Logs
46
+ *.log
README.md CHANGED
@@ -1,10 +1,98 @@
1
  ---
2
  title: DocClassify
3
- emoji: 🐨
4
  colorFrom: yellow
5
  colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: DocClassify
3
+ emoji: 📄
4
  colorFrom: yellow
5
  colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
+ # Document Classifier
11
+
12
+ A web application that uses BERT-tiny to classify PDF documents by type. Upload a PDF file and get instant classification results.
13
+
14
+ ## Features
15
+
16
+ - 📄 PDF file upload and processing
17
+ - 🤖 BERT-tiny model for document classification
18
+ - 🎯 Classifies 20+ document types including:
19
+ - Invoice, Receipt, Contract, Resume
20
+ - Letter, Report, Memo, Email
21
+ - Form, Certificate, License, Passport
22
+ - Medical records, Bank statements, Tax documents
23
+ - Legal documents, Academic papers, and more
24
+ - 💾 Model is downloaded and cached locally on first use
25
+ - 🎨 Modern, user-friendly interface
26
+
27
+ ## How It Works
28
+
29
+ 1. The app uses the `prajjwal1/bert-tiny` model from Hugging Face
30
+ 2. On first run, the model is automatically downloaded to the `models/` directory
31
+ 3. PDF text is extracted using PyPDF2
32
+ 4. Document embeddings are computed using BERT-tiny
33
+ 5. Similarity scores are calculated against pre-computed document type embeddings
34
+ 6. The document is classified with confidence scores
35
+
36
+ ## Setup
37
+
38
+ ### Local Development
39
+
40
+ 1. **Backend Setup:**
41
+ ```bash
42
+ cd backend
43
+ pip install -r requirements.txt
44
+ ```
45
+
46
+ 2. **Frontend Setup:**
47
+ ```bash
48
+ cd frontend
49
+ npm install
50
+ ```
51
+
52
+ 3. **Run Backend:**
53
+ ```bash
54
+ cd backend
55
+ uvicorn app.main:app --reload --port 8000
56
+ ```
57
+
58
+ 4. **Run Frontend:**
59
+ ```bash
60
+ cd frontend
61
+ npm run dev
62
+ ```
63
+
64
+ 5. Open `http://localhost:5173` in your browser
65
+
66
+ ### Docker Deployment
67
+
68
+ ```bash
69
+ docker build -t docclassify .
70
+ docker run -p 7860:7860 docclassify
71
+ ```
72
+
73
+ ## Usage
74
+
75
+ 1. Click "Select PDF File" to choose a PDF document
76
+ 2. Click "Classify Document" to process the file
77
+ 3. View the classification result with confidence scores
78
+ 4. See top 5 document type predictions
79
+
80
+ ## Model Information
81
+
82
+ - **Model:** `prajjwal1/bert-tiny`
83
+ - **Size:** ~4.4M parameters
84
+ - **Architecture:** BERT (L=2, H=128)
85
+ - **Source:** [Hugging Face Model Card](https://huggingface.co/prajjwal1/bert-tiny)
86
+
87
+ ## Technical Stack
88
+
89
+ - **Backend:** FastAPI, PyTorch, Transformers, PyPDF2
90
+ - **Frontend:** React, Vite
91
+ - **Model:** BERT-tiny (prajjwal1/bert-tiny)
92
+
93
+ ## Notes
94
+
95
+ - The model will be automatically downloaded on first use (~17MB)
96
+ - Classification works best with text-based PDFs
97
+ - Image-based PDFs may not work if they don't contain extractable text
98
+ - Processing time depends on document size and system resources
backend/app/classifier.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Document classification using BERT-tiny model."""
2
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
9
+
10
# Model configuration
MODEL_NAME = "prajjwal1/bert-tiny"
# Models directory: use /app/Model in Docker, or project_root/Model locally.
# Heuristic: presence of /app and /app/backend is treated as "running in the
# container image" — NOTE(review): may false-positive on hosts that happen to
# have an /app/backend directory.
if Path("/app").exists() and Path("/app/backend").exists():
    # Docker environment
    MODELS_DIR = Path("/app/Model")
else:
    # Local development - go up from backend/app/classifier.py to project root
    MODELS_DIR = Path(__file__).resolve().parent.parent.parent / "Model"
# On-disk cache location for the downloaded tokenizer + weights
MODEL_PATH = MODELS_DIR / "bert-tiny"

# Common document types with descriptions for better classification.
# Each description is embedded once by DocumentClassifier and used as a
# similarity anchor, so the wording here directly affects classification.
DOCUMENT_TYPES = {
    "invoice": "A document requesting payment for goods or services provided, containing itemized charges, totals, and payment terms.",
    "receipt": "A document confirming payment has been received, showing transaction details and proof of purchase.",
    "contract": "A legally binding agreement between parties outlining terms, conditions, obligations, and signatures.",
    "resume": "A document summarizing a person's work experience, education, skills, and qualifications for job applications.",
    "letter": "A formal or informal written correspondence addressed to a recipient with greetings and closing.",
    "report": "A structured document presenting analysis, findings, conclusions, and recommendations on a specific topic.",
    "memo": "An internal business communication document with headers like To, From, Subject, and Date.",
    "email": "Electronic mail correspondence with headers showing sender, recipient, subject, and message content.",
    "form": "A structured document with fields to be filled out, often requiring signatures and dates.",
    "certificate": "An official document certifying completion, achievement, or qualification with certification details.",
    "license": "An official document granting permission to perform certain activities, with license numbers and expiration dates.",
    "passport": "An official government document for international travel containing personal identification and nationality information.",
    "medical record": "Healthcare documentation containing patient information, diagnoses, treatments, and medical history.",
    "bank statement": "A financial document from a bank showing account transactions, balances, deposits, and withdrawals.",
    "tax document": "Tax-related paperwork such as W-2 forms, 1099 forms, tax returns, or IRS correspondence.",
    "legal document": "Court documents, legal filings, contracts, or other documents related to legal proceedings or matters.",
    "academic paper": "A scholarly document with abstract, introduction, methodology, results, references, and citations.",
    "presentation": "A document with slides, bullet points, or structured content for presenting information to an audience.",
    "manual": "An instructional document providing step-by-step procedures, guidelines, or how-to information.",
    "other": "A document that does not clearly fit into any of the above categories."
}
45
+
46
+
47
class DocumentClassifier:
    """Classify documents via cosine similarity of BERT-tiny embeddings.

    At construction the model is loaded (downloaded and cached under
    MODEL_PATH when absent) and one embedding per DOCUMENT_TYPES entry is
    precomputed. classify_document() embeds the input text, compares it to
    every type anchor with cosine similarity, and softmax-normalizes the
    scores into pseudo-confidences.
    """

    def __init__(self):
        self.tokenizer = None
        self.model = None
        # Prefer GPU when present; everything below also runs on CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._load_model()
        self._precompute_type_embeddings()

    def _load_model(self):
        """Load the BERT-tiny model, downloading it on first use.

        Raises:
            Exception: re-raised after logging when loading/downloading fails.
        """
        try:
            # Prefer the local cache; otherwise fall back to the hub id.
            if MODEL_PATH.exists():
                print(f"Loading model from local path: {MODEL_PATH}")
                model_path = str(MODEL_PATH)
            else:
                print(f"Downloading model {MODEL_NAME}...")
                model_path = MODEL_NAME
                MODELS_DIR.mkdir(parents=True, exist_ok=True)

            # AutoModel (no task head): only hidden-state embeddings are used.
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModel.from_pretrained(model_path)
            self.model.to(self.device)
            self.model.eval()  # inference only; disables dropout

            # Persist a local copy so subsequent runs can work offline.
            if not MODEL_PATH.exists():
                print(f"Saving model to {MODEL_PATH}...")
                self.tokenizer.save_pretrained(str(MODEL_PATH))
                self.model.save_pretrained(str(MODEL_PATH))
                print("Model saved successfully!")

        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def _get_embedding(self, text: str, max_length: int = 512) -> torch.Tensor:
        """Return a (1, hidden_size) mean-pooled embedding for ``text``."""
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            padding=True,
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            # Mean-pool over the sequence (token) dimension.
            embeddings = outputs.last_hidden_state.mean(dim=1)

        return embeddings

    def _precompute_type_embeddings(self):
        """Precompute one anchor embedding per document-type description."""
        print("Precomputing document type embeddings...")
        self.type_embeddings = {}
        for doc_type, description in DOCUMENT_TYPES.items():
            # Label + description gives a richer anchor than the label alone.
            self.type_embeddings[doc_type] = self._get_embedding(
                f"{doc_type}: {description}"
            )
        print("Document type embeddings computed!")

    def classify_document(self, text: str, max_length: int = 512) -> Dict[str, Any]:
        """
        Classify a document based on its text content.

        Args:
            text: Document text content.
            max_length: Maximum token length for the model.

        Returns:
            Dict with ``document_type``, ``confidence`` (softmax over cosine
            similarities, rounded to 3 places), ``all_scores`` (top 5) and
            ``text_preview`` — or ``document_type="unknown"`` plus ``error``
            on empty input or failure.
        """
        if not text or not text.strip():
            return {
                "document_type": "unknown",
                "confidence": 0.0,
                "error": "No text extracted from document"
            }

        try:
            # Truncate very long inputs (~4 chars/token heuristic), keeping
            # the head and tail, which usually carry the identifying content.
            if len(text) > max_length * 4:
                first_part = text[:max_length * 2]
                last_part = text[-max_length * 2:]
                text = first_part + " " + last_part

            doc_embedding = self._get_embedding(text, max_length)

            # Cosine similarity against every precomputed type anchor.
            scores = {}
            for doc_type, type_embedding in self.type_embeddings.items():
                similarity = F.cosine_similarity(doc_embedding, type_embedding, dim=1)
                scores[doc_type] = similarity.item()

            # Softmax turns raw similarities into a probability-like spread.
            score_values = torch.tensor(list(scores.values()))
            normalized_scores = F.softmax(score_values, dim=0)
            normalized_dict = {
                doc_type: normalized_scores[i].item()
                for i, doc_type in enumerate(scores.keys())
            }

            # Best match plus the five highest-scoring candidates.
            best_type = max(normalized_dict.items(), key=lambda x: x[1])
            top_5 = sorted(normalized_dict.items(), key=lambda x: x[1], reverse=True)[:5]

            return {
                "document_type": best_type[0],
                "confidence": round(best_type[1], 3),
                "all_scores": {k: round(v, 3) for k, v in top_5},
                "text_preview": text[:200] + "..." if len(text) > 200 else text
            }

        except Exception as e:
            print(f"Error classifying document: {e}")
            import traceback
            traceback.print_exc()
            return {
                "document_type": "unknown",
                "confidence": 0.0,
                "error": str(e)
            }
+
185
+
186
# Process-wide singleton; the model is loaded at most once per process.
_classifier_instance = None


def get_classifier() -> DocumentClassifier:
    """Return the shared DocumentClassifier, instantiating it lazily."""
    global _classifier_instance
    if _classifier_instance is None:
        _classifier_instance = DocumentClassifier()
    return _classifier_instance
backend/app/main.py CHANGED
@@ -1,8 +1,10 @@
1
- from fastapi import FastAPI
2
- from fastapi.responses import FileResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from pathlib import Path
 
 
6
 
7
  app = FastAPI()
8
 
@@ -14,6 +16,16 @@ app.add_middleware(
14
  allow_headers=["*"],
15
  )
16
 
 
 
 
 
 
 
 
 
 
 
17
  # ---- API ----
18
  @app.get("/api/health")
19
  def health():
@@ -23,6 +35,50 @@ def health():
23
  def hello():
24
  return {"message": "Hello from FastAPI"}
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # ---- Frontend static serving ----
27
  FRONTEND_DIST = Path(__file__).resolve().parents[2] / "frontend" / "dist"
28
  INDEX_FILE = FRONTEND_DIST / "index.html"
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from fastapi.responses import FileResponse, JSONResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from pathlib import Path
6
+ from app.pdf_processor import extract_text_from_pdf
7
+ from app.classifier import get_classifier
8
 
9
  app = FastAPI()
10
 
 
16
  allow_headers=["*"],
17
  )
18
 
19
# Cached classifier handle; stays None until the first classification request
# so app startup does not pay the model-loading cost.
classifier = None

def get_classifier_instance():
    """Return the shared classifier, creating it on first use."""
    global classifier
    if classifier is None:
        classifier = get_classifier()
    return classifier
28
+
29
  # ---- API ----
30
  @app.get("/api/health")
31
  def health():
 
35
  def hello():
36
  return {"message": "Hello from FastAPI"}
37
 
38
@app.post("/api/classify")
async def classify_document(file: UploadFile = File(...)):
    """
    Classify a PDF document.

    Args:
        file: Uploaded PDF file

    Returns:
        Classification results with document type and confidence

    Raises:
        HTTPException: 400 for non-PDF or unreadable uploads,
            500 for unexpected processing errors.
    """
    # Validate file type. file.filename may be None for some clients, so
    # guard before calling .lower() — otherwise this raised AttributeError
    # and surfaced as a 500 instead of a clean 400.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        # Read the whole upload into memory (PDFs here are expected small).
        contents = await file.read()

        # Extract text from PDF; None means empty/corrupted/image-only.
        text = extract_text_from_pdf(contents)

        if not text:
            raise HTTPException(
                status_code=400,
                detail="Could not extract text from PDF. The file might be empty, corrupted, or image-based."
            )

        # Classify the document (model is loaded lazily on first request).
        classifier_instance = get_classifier_instance()
        result = classifier_instance.classify_document(text)

        return JSONResponse(content={
            "success": True,
            "filename": file.filename,
            "classification": result,
            "text_length": len(text)
        })

    except HTTPException:
        # Re-raise our own 4xx responses untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
81
+
82
  # ---- Frontend static serving ----
83
  FRONTEND_DIST = Path(__file__).resolve().parents[2] / "frontend" / "dist"
84
  INDEX_FILE = FRONTEND_DIST / "index.html"
backend/app/pdf_processor.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF text extraction utilities."""
2
+ import io
3
+ from typing import Optional
4
+ from PyPDF2 import PdfReader
5
+
6
+
7
def extract_text_from_pdf(pdf_bytes: bytes) -> Optional[str]:
    """
    Extract text content from a PDF file.

    Args:
        pdf_bytes: PDF file content as bytes

    Returns:
        Extracted text as string, or None if extraction fails
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))

        # Pages are joined with a blank line; pages yielding no text are
        # skipped entirely.
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = "\n\n".join(t for t in page_texts if t)

        # Whitespace-only output counts as "nothing extracted".
        return full_text if full_text.strip() else None
    except Exception as e:
        # Best-effort: log and signal failure with None rather than raising.
        print(f"Error extracting text from PDF: {e}")
        return None
backend/requirements.txt CHANGED
@@ -1,2 +1,8 @@
1
  fastapi
2
- uvicorn
 
 
 
 
 
 
 
1
  fastapi
2
+ uvicorn
3
+ python-multipart
4
+ transformers
5
+ torch
6
+ PyPDF2
7
+ sentencepiece
8
+ protobuf
download_model.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Download BERT-tiny model upfront to Model folder."""
3
+ from pathlib import Path
4
+ from transformers import AutoTokenizer, AutoModel
5
+
6
MODEL_NAME = "prajjwal1/bert-tiny"
# Cache root lives next to this script: <repo>/Model/bert-tiny
MODELS_DIR = Path(__file__).resolve().parent / "Model"
MODEL_PATH = MODELS_DIR / "bert-tiny"

def download_model():
    """Fetch BERT-tiny from the Hugging Face hub and cache it under Model/."""
    print(f"Downloading model: {MODEL_NAME}")
    print(f"Target directory: {MODEL_PATH}")

    # Create Model directory
    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    # A cached copy already on disk means there is nothing to do.
    if MODEL_PATH.exists():
        print(f"Model already exists at {MODEL_PATH}")
        print("Skipping download.")
        return

    try:
        print("Downloading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print("Downloading model...")
        model = AutoModel.from_pretrained(MODEL_NAME)

        print(f"Saving model to {MODEL_PATH}...")
        tokenizer.save_pretrained(str(MODEL_PATH))
        model.save_pretrained(str(MODEL_PATH))

        print("✅ Model downloaded and saved successfully!")
        print(f"Location: {MODEL_PATH}")

    except Exception as e:
        print(f"❌ Error downloading model: {e}")
        raise

if __name__ == "__main__":
    download_model()
frontend/index.html CHANGED
@@ -3,7 +3,13 @@
3
  <head>
4
  <meta charset="UTF-8" />
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
- <title>HF React + FastAPI by Seth</title>
 
 
 
 
 
 
7
  </head>
8
  <body>
9
  <div id="root"></div>
 
3
  <head>
4
  <meta charset="UTF-8" />
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Document Classifier - BERT-tiny</title>
7
+ <style>
8
+ @keyframes spin {
9
+ 0% { transform: rotate(0deg); }
10
+ 100% { transform: rotate(360deg); }
11
+ }
12
+ </style>
13
  </head>
14
  <body>
15
  <div id="root"></div>
frontend/src/App.jsx CHANGED
@@ -1,23 +1,322 @@
1
- import React, { useEffect, useState } from "react";
2
 
3
  export default function App() {
4
- const [apiMsg, setApiMsg] = useState("");
 
 
 
5
 
6
- useEffect(() => {
7
- fetch("/api/hello")
8
- .then((r) => r.json())
9
- .then((d) => setApiMsg(d.message))
10
- .catch(() => setApiMsg("API not reachable yet"));
11
- }, []);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  return (
14
- <div style={{ fontFamily: "system-ui", padding: 24, lineHeight: 1.5 }}>
15
- <h1>React + FastAPI (Docker, HF Spaces)</h1>
16
- <p>This is a plain starter page. Customize freely.By Seth</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- <div style={{ marginTop: 16, padding: 12, border: "1px solid #ddd", borderRadius: 8 }}>
19
- <strong>API says:</strong> {apiMsg}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  </div>
22
  );
23
  }
 
1
+ import React, { useState } from "react";
2
 
3
export default function App() {
  // UI state: the chosen File, in-flight flag, server response, error text.
  const [file, setFile] = useState(null);
  const [loading, setLoading] = useState(false);
  const [result, setResult] = useState(null);
  const [error, setError] = useState(null);

  // Validate the picked file client-side (MIME type must be PDF) and clear
  // any stale result/error from a previous run.
  const handleFileChange = (e) => {
    const selectedFile = e.target.files[0];
    if (selectedFile) {
      if (selectedFile.type !== "application/pdf") {
        setError("Please select a PDF file");
        setFile(null);
        return;
      }
      setFile(selectedFile);
      setError(null);
      setResult(null);
    }
  };

  // POST the file as multipart form data to the backend and store the
  // parsed JSON result; FastAPI error bodies carry the message in `detail`.
  const handleClassify = async () => {
    if (!file) {
      setError("Please select a PDF file first");
      return;
    }

    setLoading(true);
    setError(null);
    setResult(null);

    const formData = new FormData();
    formData.append("file", file);

    try {
      const response = await fetch("/api/classify", {
        method: "POST",
        body: formData,
      });

      if (!response.ok) {
        const errorData = await response.json();
        throw new Error(errorData.detail || "Classification failed");
      }

      const data = await response.json();
      setResult(data);
    } catch (err) {
      setError(err.message || "An error occurred during classification");
    } finally {
      setLoading(false);
    }
  };

  // Clear all state and the native <input type="file"> value so the same
  // file can be re-selected.
  const handleReset = () => {
    setFile(null);
    setResult(null);
    setError(null);
    // Reset file input
    const fileInput = document.getElementById("pdf-upload");
    if (fileInput) fileInput.value = "";
  };

  return (
    <div
      style={{
        fontFamily: "system-ui, -apple-system, sans-serif",
        maxWidth: "800px",
        margin: "0 auto",
        padding: "24px",
        lineHeight: 1.6,
      }}
    >
      <div style={{ textAlign: "center", marginBottom: "32px" }}>
        <h1 style={{ margin: "0 0 8px 0", color: "#1a1a1a" }}>
          📄 Document Classifier
        </h1>
        <p style={{ color: "#666", margin: "0" }}>
          Upload a PDF file to classify its document type using BERT-tiny
        </p>
      </div>

      <div
        style={{
          border: "2px dashed #ddd",
          borderRadius: "12px",
          padding: "32px",
          textAlign: "center",
          backgroundColor: "#fafafa",
          marginBottom: "24px",
        }}
      >
        <input
          id="pdf-upload"
          type="file"
          accept=".pdf"
          onChange={handleFileChange}
          style={{ display: "none" }}
        />
        <label
          htmlFor="pdf-upload"
          style={{
            display: "inline-block",
            padding: "12px 24px",
            backgroundColor: "#4f46e5",
            color: "white",
            borderRadius: "8px",
            cursor: "pointer",
            fontSize: "16px",
            fontWeight: "500",
            marginBottom: "16px",
            transition: "background-color 0.2s",
          }}
          onMouseOver={(e) => (e.target.style.backgroundColor = "#4338ca")}
          onMouseOut={(e) => (e.target.style.backgroundColor = "#4f46e5")}
        >
          {file ? "Change PDF File" : "Select PDF File"}
        </label>

        {file && (
          <div style={{ marginTop: "16px" }}>
            <p style={{ margin: "8px 0", color: "#333" }}>
              <strong>Selected:</strong> {file.name}
            </p>
            <p style={{ margin: "4px 0", color: "#666", fontSize: "14px" }}>
              Size: {(file.size / 1024).toFixed(2)} KB
            </p>
          </div>
        )}

        <div style={{ marginTop: "24px" }}>
          <button
            onClick={handleClassify}
            disabled={!file || loading}
            style={{
              padding: "12px 32px",
              fontSize: "16px",
              fontWeight: "600",
              backgroundColor: file && !loading ? "#10b981" : "#9ca3af",
              color: "white",
              border: "none",
              borderRadius: "8px",
              cursor: file && !loading ? "pointer" : "not-allowed",
              transition: "background-color 0.2s",
            }}
            onMouseOver={(e) => {
              if (file && !loading) {
                e.target.style.backgroundColor = "#059669";
              }
            }}
            onMouseOut={(e) => {
              if (file && !loading) {
                e.target.style.backgroundColor = "#10b981";
              }
            }}
          >
            {loading ? "Classifying..." : "Classify Document"}
          </button>

          {file && (
            <button
              onClick={handleReset}
              disabled={loading}
              style={{
                padding: "12px 24px",
                fontSize: "16px",
                fontWeight: "500",
                backgroundColor: "transparent",
                color: "#666",
                border: "1px solid #ddd",
                borderRadius: "8px",
                cursor: loading ? "not-allowed" : "pointer",
                marginLeft: "12px",
              }}
            >
              Reset
            </button>
          )}
        </div>
      </div>

      {error && (
        <div
          style={{
            padding: "16px",
            backgroundColor: "#fee2e2",
            border: "1px solid #fecaca",
            borderRadius: "8px",
            color: "#991b1b",
            marginBottom: "24px",
          }}
        >
          <strong>Error:</strong> {error}
        </div>
      )}

      {result && (
        <div
          style={{
            padding: "24px",
            backgroundColor: "#f0fdf4",
            border: "2px solid #86efac",
            borderRadius: "12px",
            marginBottom: "24px",
          }}
        >
          <h2 style={{ margin: "0 0 16px 0", color: "#166534" }}>
            Classification Result
          </h2>

          <div
            style={{
              backgroundColor: "white",
              padding: "20px",
              borderRadius: "8px",
              marginBottom: "16px",
            }}
          >
            <div style={{ marginBottom: "12px" }}>
              <span style={{ color: "#666", fontSize: "14px" }}>
                Document Type:
              </span>
              <div
                style={{
                  fontSize: "24px",
                  fontWeight: "700",
                  color: "#10b981",
                  marginTop: "4px",
                  textTransform: "capitalize",
                }}
              >
                {result.classification.document_type}
              </div>
            </div>

            <div style={{ marginBottom: "12px" }}>
              <span style={{ color: "#666", fontSize: "14px" }}>
                Confidence:
              </span>
              <div
                style={{
                  fontSize: "20px",
                  fontWeight: "600",
                  color: "#059669",
                  marginTop: "4px",
                }}
              >
                {(result.classification.confidence * 100).toFixed(1)}%
              </div>
            </div>

            <div style={{ marginTop: "16px", paddingTop: "16px", borderTop: "1px solid #e5e7eb" }}>
              <span style={{ color: "#666", fontSize: "14px" }}>
                File: {result.filename}
              </span>
              <br />
              <span style={{ color: "#666", fontSize: "14px" }}>
                Text Length: {result.text_length.toLocaleString()} characters
              </span>
            </div>
          </div>

          {result.classification.all_scores && (
            <div>
              <h3 style={{ margin: "0 0 12px 0", fontSize: "16px", color: "#166534" }}>
                Top 5 Classifications:
              </h3>
              <div style={{ backgroundColor: "white", padding: "16px", borderRadius: "8px" }}>
                {Object.entries(result.classification.all_scores).map(
                  ([type, score]) => (
                    <div
                      key={type}
                      style={{
                        display: "flex",
                        justifyContent: "space-between",
                        alignItems: "center",
                        padding: "8px 0",
                        borderBottom: "1px solid #f3f4f6",
                      }}
                    >
                      <span style={{ textTransform: "capitalize", color: "#374151" }}>
                        {type}
                      </span>
                      <span
                        style={{
                          fontWeight: "600",
                          color: type === result.classification.document_type ? "#10b981" : "#6b7280",
                        }}
                      >
                        {(score * 100).toFixed(1)}%
                      </span>
                    </div>
                  )
                )}
              </div>
            </div>
          )}
        </div>
      )}

      {loading && (
        <div style={{ textAlign: "center", padding: "24px" }}>
          <div
            style={{
              display: "inline-block",
              width: "40px",
              height: "40px",
              border: "4px solid #e5e7eb",
              borderTop: "4px solid #4f46e5",
              borderRadius: "50%",
              animation: "spin 1s linear infinite",
            }}
          />
          <p style={{ marginTop: "16px", color: "#666" }}>
            Processing your document...
          </p>
        </div>
      )}
    </div>
  );
}
test_classifier.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Test the document classifier."""
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ # Add backend to path
7
+ sys.path.insert(0, str(Path(__file__).parent / "backend"))
8
+
9
+ from app.classifier import get_classifier
10
+
11
def test_classifier():
    """Smoke-test the classifier against a few labelled text snippets.

    Loads the real model (network/disk on first run), classifies each sample,
    and prints PASS/mismatch per case. Mismatches are reported but do not
    fail the script — this is a demo, not an assertion suite.
    """
    print("Loading classifier...")
    classifier = get_classifier()

    # (sample text, expected document type) pairs
    test_cases = [
        ("Invoice for services rendered. Total amount due: $500.00. Payment terms: Net 30.", "invoice"),
        ("This is to certify that John Doe has completed the course.", "certificate"),
        ("Dear Sir, I am writing to inform you...", "letter"),
        ("Account Statement - Account #12345. Balance: $1,000.00", "bank statement"),
    ]

    print("\n" + "="*60)
    print("Testing Document Classifier")
    print("="*60 + "\n")

    for i, (text, expected_type) in enumerate(test_cases, 1):
        print(f"Test {i}: Expected type: {expected_type}")
        print(f"Text: {text[:50]}...")

        result = classifier.classify_document(text)

        print(f"✅ Classified as: {result['document_type']}")
        print(f"   Confidence: {result['confidence']:.1%}")
        print(f"   Top 3: {list(result['all_scores'].keys())[:3]}")

        if result['document_type'] == expected_type:
            print("   ✅ PASS - Correct classification!")
        else:
            print(f"   ⚠️ Expected '{expected_type}' but got '{result['document_type']}'")

        print()

    print("="*60)
    print("Test completed!")

if __name__ == "__main__":
    try:
        test_classifier()
    except Exception as e:
        # Any failure (model download, classification) exits non-zero.
        print(f"❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
+ sys.exit(1)