Spaces:

Qasim-Dost
/

Doc-Classifier

Runtime error

App Files Files Community

Qasim-Dost committed on Jan 7

Commit

ce9f3ac

verified ·

1 Parent(s): 7209782

Upload 5 files

Browse files

Files changed (5) hide show

Dockerfile +27 -20
app.py +363 -0
pdf_to_image.py +93 -0
requirements.txt +15 -3
smolvlm_classifier.py +227 -0

Dockerfile CHANGED Viewed

@@ -1,20 +1,27 @@
-FROM python:3.13.5-slim
-WORKDIR /app
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt ./
-COPY src/ ./src/
-RUN pip3 install -r requirements.txt
-EXPOSE 8501
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+FROM python:3.11-slim
+# Install system dependencies for PyMuPDF
+RUN apt-get update && apt-get install -y \
+    libmupdf-dev \
+    mupdf-tools \
+    && rm -rf /var/lib/apt/lists/*
+# Hugging Face Docker Spaces run the container as UID 1000, not root.
+# Create that user with a real home directory so the app can write its
+# history file and the Hugging Face model cache (defaults to $HOME/.cache).
+RUN useradd -m -u 1000 user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR /home/user/app
+# Copy requirements and install Python dependencies (system-wide, as root)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application files
+COPY app.py .
+COPY smolvlm_classifier.py .
+COPY pdf_to_image.py .
+# WORKDIR and COPY create root-owned paths; hand them to the runtime user
+# because app.py writes classification_history.json into the working directory.
+RUN chown -R user:user /home/user/app
+USER user
+# Expose Streamlit port
+EXPOSE 7860
+# Disable torch.compile for HF Spaces compatibility
+ENV DISABLE_TORCH_COMPILE=1
+# Run Streamlit
+CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.enableCORS=false", "--server.enableXsrfProtection=false"]

app.py ADDED Viewed

	@@ -0,0 +1,363 @@

+"""
+Streamlit UI for Document Classification
+Upload PDFs and classify them using SmolVLM.
+Optimized with pre-loading and concurrent processing.
+"""
+import streamlit as st
+import pandas as pd
+import json
+from pathlib import Path
+from datetime import datetime
+import tempfile
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+# Import our classifier modules
+from pdf_to_image import pdf_to_images
+from smolvlm_classifier import SmolVLMClassifier
+# Page config: browser tab title/icon and a full-width layout.
+# NOTE(review): Streamlit expects set_page_config to run before other st.* calls - confirm against the pinned version.
+st.set_page_config(
+    page_title="Document Classifier",
+    page_icon="📄",
+    layout="wide"
+)
+# Custom CSS for better styling. The class names defined here (.main-header,
+# .result-box, .doc-type, .file-info) are referenced by the raw HTML that
+# main() emits, so they must stay in sync with that markup.
+st.markdown("""
+<style>
+    .main-header {
+        font-size: 2.5rem;
+        font-weight: bold;
+        color: #1f77b4;
+        margin-bottom: 1rem;
+    }
+    .result-box {
+        background-color: #f0f8ff;
+        padding: 0.8rem 1rem;
+        border-radius: 8px;
+        border-left: 4px solid #1f77b4;
+        margin: 0.5rem 0;
+        display: inline-block;
+    }
+    .doc-type {
+        font-size: 1.2rem;
+        font-weight: bold;
+        color: #2e7d32;
+        margin: 0;
+    }
+    .file-info {
+        font-size: 0.9rem;
+        color: #555;
+        margin: 0.2rem 0;
+    }
+    .model-status {
+        padding: 0.5rem;
+        border-radius: 5px;
+        margin-bottom: 1rem;
+    }
+</style>
+""", unsafe_allow_html=True)
+@st.cache_resource
+def load_classifier():
+    """Construct the SmolVLM classifier; st.cache_resource keeps the instance for reuse."""
+    classifier = SmolVLMClassifier()
+    return classifier
+def load_history():
+    """Load classification history from JSON file.
+
+    Returns:
+        The stored list of history entries, or an empty list when the file
+        is missing, unreadable, not valid JSON, or does not contain a list.
+    """
+    history_file = Path("classification_history.json")
+    try:
+        with open(history_file, "r", encoding="utf-8") as f:
+            history = json.load(f)
+    except FileNotFoundError:
+        # First run: nothing has been classified yet.
+        return []
+    except (OSError, ValueError) as e:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError. The
+        # file is rewritten in place by save_history(), so a half-written or
+        # corrupt file must not take the whole page down on every rerun.
+        print(f"Could not read {history_file}: {e}")
+        return []
+    # Callers insert into and slice the result, so anything but a list is unusable.
+    return history if isinstance(history, list) else []
+def save_history(history):
+    """Write the history list to classification_history.json, replacing its contents."""
+    target = Path("classification_history.json")
+    with target.open("w", encoding="utf-8") as handle:
+        # Keep non-ASCII filenames readable in the file instead of \u escapes.
+        json.dump(history, handle, ensure_ascii=False, indent=2)
+def add_to_history(filename, doc_type, num_pages):
+    """Record one classification at the top of the stored history.
+
+    The newest entry goes first and the list is capped at 100 entries before
+    being written back. Returns the trimmed list.
+    """
+    entries = load_history()
+    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    new_entry = {
+        "filename": filename,
+        "document_type": doc_type,
+        "num_pages": num_pages,
+        "timestamp": stamp,
+    }
+    entries.insert(0, new_entry)
+    # Newest-first ordering means the slice drops the oldest records.
+    trimmed = entries[:100]
+    save_history(trimmed)
+    return trimmed
+def convert_pdf_to_images(uploaded_file):
+    """Render an uploaded PDF to page images via a temporary file on disk.
+
+    Returns a ``(name, images)`` tuple, where ``name`` is ``uploaded_file.name``
+    and ``images`` is whatever ``pdf_to_images`` produced at 100 DPI.
+    """
+    # pdf_to_images takes a filesystem path, so spill the upload to disk first.
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as scratch:
+        scratch.write(uploaded_file.getvalue())
+        scratch_path = scratch.name
+    try:
+        pages = pdf_to_images(scratch_path, dpi=100)
+        outcome = (uploaded_file.name, pages)
+        return outcome
+    finally:
+        # delete=False above means cleanup is our job, success or not.
+        os.unlink(scratch_path)
+def main():
+    """Render the Streamlit page: history sidebar, PDF upload/preview, classification, results.
+
+    Results and timing are kept in ``st.session_state`` ("results", "timing")
+    so they survive Streamlit reruns; rendered page images are cached in
+    ``st.session_state["pdf_previews"]`` keyed by the uploaded file name.
+    """
+    # Function-scope imports: only this function needs them.
+    import html
+    import time
+    # Header
+    st.markdown('<div class="main-header">📄 Document Classifier</div>', unsafe_allow_html=True)
+    st.markdown("Upload PDF documents to classify them using SmolVLM AI.")
+    # PRE-LOAD MODEL AT APP START (not on button click).
+    # load_classifier() is wrapped in st.cache_resource, so this is only slow once.
+    with st.spinner("🔄 Loading AI model (one-time setup)..."):
+        classifier = load_classifier()
+    st.success("✅ Model ready!")
+    # Sidebar for history
+    with st.sidebar:
+        st.header("📋 Classification History")
+        history = load_history()
+        if history:
+            # Show as table
+            df_history = pd.DataFrame(history)
+            st.dataframe(
+                df_history[["filename", "document_type", "timestamp"]],
+                hide_index=True,
+                width="stretch"
+            )
+            # Clear history button
+            if st.button("🗑️ Clear History"):
+                save_history([])
+                st.rerun()
+        else:
+            st.info("No classification history yet. Upload a document to get started!")
+    # Main content - two columns
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        st.subheader("📤 Upload Documents")
+        # File uploader - MULTIPLE FILES
+        uploaded_files = st.file_uploader(
+            "Choose PDF files",
+            type=["pdf"],
+            accept_multiple_files=True,
+            help="Upload one or more PDF documents to classify"
+        )
+        if uploaded_files:
+            st.success(f"✅ Uploaded {len(uploaded_files)} file(s)")
+            # Store images for preview
+            if "pdf_previews" not in st.session_state:
+                st.session_state["pdf_previews"] = {}
+            previews = st.session_state["pdf_previews"]
+            # Show file list with preview option
+            for f in uploaded_files:
+                with st.expander(f"📄 {f.name} ({f.size / 1024:.1f} KB)", expanded=False):
+                    # Generate preview if not cached (reuses the shared helper
+                    # instead of duplicating the temp-file handling here)
+                    if f.name not in previews:
+                        _, previews[f.name] = convert_pdf_to_images(f)
+                    # Show preview
+                    images = previews.get(f.name, [])
+                    if images:
+                        if len(images) > 1:
+                            page_num = st.selectbox(
+                                "Page",
+                                range(1, len(images) + 1),
+                                key=f"page_{f.name}"
+                            )
+                            st.image(images[page_num - 1], caption=f"Page {page_num} of {len(images)}", width="stretch")
+                        else:
+                            st.image(images[0], caption="Page 1", width="stretch")
+                    else:
+                        st.error("Could not load PDF preview")
+            # Classify button
+            if st.button("🔍 Classify All Documents", type="primary", width="stretch"):
+                all_results = []
+                progress_bar = st.progress(0)
+                status_text = st.empty()
+                total_start_time = time.perf_counter()
+                # STEP 1: Make sure every PDF has page images.
+                # Conversion runs sequentially on purpose: PyMuPDF documents
+                # that it does not support use from multiple threads. The
+                # preview loop above has normally filled the cache already,
+                # so this is mostly a dictionary lookup.
+                status_text.text("📄 Converting PDFs to images...")
+                pdf_conversion_start = time.perf_counter()
+                pdf_images = {}
+                for f in uploaded_files:
+                    if f.name not in previews:
+                        _, previews[f.name] = convert_pdf_to_images(f)
+                    pdf_images[f.name] = previews[f.name]
+                pdf_conversion_time = time.perf_counter() - pdf_conversion_start
+                print(f"\n📄 PDF Conversion: {pdf_conversion_time:.2f}s")
+                progress_bar.progress(0.2)
+                status_text.text("🤖 Classifying documents...")
+                # STEP 2: Classify each document with timing
+                classification_start = time.perf_counter()
+                for idx, uploaded_file in enumerate(uploaded_files):
+                    images = pdf_images.get(uploaded_file.name, [])
+                    if not images:
+                        result = {
+                            "filename": uploaded_file.name,
+                            "document_type": "Error: Could not extract pages",
+                            "num_pages": 0,
+                            "classify_time": 0
+                        }
+                    else:
+                        status_text.text(f"🤖 Classifying {idx + 1}/{len(uploaded_files)}: {uploaded_file.name}")
+                        # Classify with timing
+                        classify_start = time.perf_counter()
+                        classification = classifier.classify_document(images)
+                        classify_time = time.perf_counter() - classify_start
+                        result = {
+                            "filename": uploaded_file.name,
+                            "document_type": classification["document_type"],
+                            "num_pages": classification["num_pages"],
+                            "classify_time": round(classify_time, 2)
+                        }
+                        # Terminal output
+                        print(f"  📄 {uploaded_file.name}")
+                        print(f"     Pages: {classification['num_pages']}")
+                        print(f"     Type: {classification['document_type']}")
+                        print(f"     Classification time: {classify_time:.2f}s")
+                        # Add to history (failed conversions are not recorded)
+                        add_to_history(
+                            uploaded_file.name,
+                            classification["document_type"],
+                            classification["num_pages"]
+                        )
+                    all_results.append(result)
+                    # Update progress: the remaining 80% is shared across documents
+                    progress_bar.progress(0.2 + 0.8 * (idx + 1) / len(uploaded_files))
+                total_classification_time = time.perf_counter() - classification_start
+                total_time = time.perf_counter() - total_start_time
+                # uploaded_files is non-empty in this branch, so this never divides by zero
+                avg_per_doc = total_classification_time / len(all_results)
+                # Print summary to terminal
+                print(f"\n{'='*50}")
+                print("TIMING SUMMARY")
+                print(f"{'='*50}")
+                print(f"Documents processed: {len(all_results)}")
+                print(f"PDF conversion: {pdf_conversion_time:.2f}s")
+                print(f"Classification (sequential): {total_classification_time:.2f}s")
+                print(f"Average per document: {avg_per_doc:.2f}s")
+                print(f"Total time: {total_time:.2f}s ({total_time/60:.1f} min)")
+                print(f"{'='*50}\n")
+                # Store timing info
+                st.session_state["timing"] = {
+                    "pdf_conversion": round(pdf_conversion_time, 2),
+                    "classification": round(total_classification_time, 2),
+                    "total": round(total_time, 2),
+                    "total_min": round(total_time / 60, 2),
+                    "avg_per_doc": round(avg_per_doc, 2)
+                }
+                status_text.text(f"✅ Complete! Total: {total_time:.1f}s ({total_time/60:.1f} min)")
+                st.session_state["results"] = all_results
+    with col2:
+        st.subheader("📊 Classification Results")
+        # Show results
+        if "results" in st.session_state and st.session_state["results"]:
+            results = st.session_state["results"]
+            # Show as compact table with timing
+            df_results = pd.DataFrame(results)
+            st.dataframe(
+                df_results,
+                hide_index=True,
+                width="stretch",
+                column_config={
+                    "filename": st.column_config.TextColumn("File", width="medium"),
+                    "document_type": st.column_config.TextColumn("Type", width="medium"),
+                    "num_pages": st.column_config.NumberColumn("Pages", width="small"),
+                    "classify_time": st.column_config.NumberColumn("Time (s)", width="small")
+                }
+            )
+            # Show timing summary if available
+            if "timing" in st.session_state:
+                timing = st.session_state["timing"]
+                st.markdown("---")
+                st.markdown("**⏱️ Timing Summary**")
+                col_t1, col_t2, col_t3 = st.columns(3)
+                with col_t1:
+                    st.metric("PDF Conversion", f"{timing['pdf_conversion']}s")
+                with col_t2:
+                    st.metric("Classification", f"{timing['classification']}s")
+                with col_t3:
+                    st.metric("Avg per Doc", f"{timing['avg_per_doc']}s")
+                st.info(f"**Total Time:** {timing['total']}s ({timing['total_min']} min)")
+            # Summary
+            st.success(f"✅ Classified {len(results)} document(s)")
+            # Show individual result boxes (compact)
+            for result in results:
+                # The filename comes from the uploader and the type from the
+                # model; both go into raw HTML, so escape them first.
+                safe_name = html.escape(str(result['filename']))
+                safe_type = html.escape(str(result['document_type']))
+                st.markdown(f"""
+                <div class="result-box">
+                    <p class="file-info"><strong>{safe_name}</strong> ({result['num_pages']} pages)</p>
+                    <p class="doc-type">📑 {safe_type}</p>
+                </div>
+                """, unsafe_allow_html=True)
+        else:
+            st.info("👆 Upload and classify documents to see results here.")
+if __name__ == "__main__":
+    main()

pdf_to_image.py ADDED Viewed

	@@ -0,0 +1,93 @@

+"""
+PDF to Image Conversion using PyMuPDF (fitz)
+Converts all pages of a PDF to PIL Images.
+"""
+import fitz  # PyMuPDF
+from PIL import Image
+from pathlib import Path
+from typing import List, Tuple
+import io
+def pdf_to_images(pdf_path: str, dpi: int = 150) -> List[Image.Image]:
+    """
+    Convert all pages of a PDF to PIL Images.
+    Args:
+        pdf_path: Path to the PDF file
+        dpi: Resolution for rendering (default 150, balance of quality/speed)
+    Returns:
+        List of RGB PIL Images, one per page. Any failure (unreadable file,
+        rendering error) is printed and yields an empty list, never a
+        partial one.
+    """
+    images = []
+    try:
+        # 72 is the default PDF DPI; the scale is the same for every page,
+        # so build the matrix once.
+        zoom = dpi / 72
+        matrix = fitz.Matrix(zoom, zoom)
+        # The context manager closes the document even if a page fails to render.
+        with fitz.open(pdf_path) as doc:
+            for page in doc:
+                # Ask for opaque RGB so the sample buffer is exactly
+                # width * height * 3 bytes and can be wrapped directly,
+                # with no PNG encode/decode round trip.
+                pix = page.get_pixmap(matrix=matrix, colorspace=fitz.csRGB, alpha=False)
+                images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
+    except Exception as e:
+        # Deliberate best effort: callers treat an empty list as "could not convert".
+        print(f"Error converting {pdf_path}: {e}")
+        return []
+    return images
+def get_pdf_page_count(pdf_path: str) -> int:
+    """Get the number of pages in a PDF.
+
+    Returns 0 when the file cannot be opened or read.
+    """
+    try:
+        # Context manager guarantees the document is closed on every path.
+        with fitz.open(pdf_path) as doc:
+            return len(doc)
+    except Exception:
+        # Not a bare except: KeyboardInterrupt/SystemExit must still propagate.
+        return 0
+def collect_pdfs(folder_path: str, recursive: bool = True) -> List[Path]:
+    """
+    Collect all PDF files from a folder.
+    Args:
+        folder_path: Path to folder containing PDFs
+        recursive: Whether to search subfolders
+    Returns:
+        Sorted list of Path objects, one per PDF file. The extension is
+        matched case-insensitively (".pdf", ".PDF", ...) and directories
+        are skipped. A missing folder yields an empty list.
+    """
+    folder = Path(folder_path)
+    candidates = folder.rglob("*") if recursive else folder.glob("*")
+    # Sorting makes the order reproducible instead of filesystem-dependent.
+    return sorted(
+        path for path in candidates
+        if path.name.lower().endswith(".pdf") and path.is_file()
+    )
+if __name__ == "__main__":
+    # Manual smoke test: convert the PDF named on the command line and report page info.
+    import sys
+    if len(sys.argv) <= 1:
+        print("Usage: python pdf_to_image.py <path_to_pdf>")
+    else:
+        pdf_path = sys.argv[1]
+        print(f"Converting: {pdf_path}")
+        images = pdf_to_images(pdf_path)
+        print(f"Extracted {len(images)} pages")
+        if len(images) > 0:
+            first_page = images[0]
+            print(f"First page size: {first_page.size}")

requirements.txt CHANGED Viewed

@@ -1,3 +1,15 @@
-altair
-pandas
-streamlit

+# Core ML dependencies
+torch
+transformers
+accelerate
+# PDF processing
+PyMuPDF
+Pillow
+# Data handling
+pandas
+tqdm
+# Web framework
+streamlit

smolvlm_classifier.py ADDED Viewed

	@@ -0,0 +1,227 @@

+"""
+SmolVLM-256M-Instruct Document Classifier
+Uses instruction-following VLM for zero-shot document classification.
+"""
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText
+from transformers.image_utils import load_image
+from PIL import Image
+from typing import List, Dict
+class SmolVLMClassifier:
+    """
+    SmolVLM-based document classifier.
+    Uses instruction-following to directly ask about document type, then
+    maps the free-text answer onto a fixed set of 12 category names.
+    """
+    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM-256M-Instruct"):
+        """
+        Initialize the SmolVLM model.
+        Args:
+            model_name: HuggingFace model name
+        """
+        print(f"Loading {model_name}...")
+        # Use CUDA with bfloat16 when a GPU is present; otherwise fall back
+        # to CPU with float32 for compatibility.
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
+        # Load processor and model
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            model_name,
+            dtype=self.torch_dtype,
+            _attn_implementation="eager"  # CPU compatible
+        ).to(self.device)
+        # Compile model for faster inference (optional - can cause issues on some platforms)
+        # Set DISABLE_TORCH_COMPILE=1 to skip compilation
+        import os
+        if os.environ.get("DISABLE_TORCH_COMPILE", "0") != "1":
+            try:
+                print("Compiling model with torch.compile (first run will be slow)...")
+                self.model = torch.compile(self.model, mode="reduce-overhead")
+                print(f"Model loaded and compiled on {self.device}")
+            except Exception as e:
+                # Best effort: keep the eager model if compilation is unavailable.
+                print(f"torch.compile failed ({e}), using uncompiled model")
+                print(f"Model loaded on {self.device}")
+        else:
+            print(f"Model loaded on {self.device} (torch.compile disabled)")
+    def ask_about_image(self, image: Image.Image, question: str) -> str:
+        """
+        Ask a question about an image.
+        Args:
+            image: PIL Image
+            question: Question to ask about the image
+        Returns:
+            Answer string (greedy decoding, at most 30 new tokens)
+        """
+        # Ensure RGB
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        # Create chat message format
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": question}
+                ]
+            }
+        ]
+        # Apply chat template
+        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
+        # Process inputs
+        inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
+        inputs = inputs.to(self.device)
+        prompt_length = inputs["input_ids"].shape[1]
+        # Generate response (limited tokens for speed - only need short answer)
+        generated_ids = self.model.generate(
+            **inputs,
+            max_new_tokens=30,  # Reduced from 150 for faster inference
+            do_sample=False
+        )
+        # generate() returns prompt + continuation; decode only the new
+        # tokens so the answer cannot be confused with echoed prompt text.
+        new_token_ids = generated_ids[:, prompt_length:]
+        generated_text = self.processor.batch_decode(
+            new_token_ids,
+            skip_special_tokens=True
+        )[0]
+        # Defensive: if the role marker still shows up, keep only what follows it.
+        if "Assistant:" in generated_text:
+            response = generated_text.split("Assistant:")[-1].strip()
+        else:
+            response = generated_text.strip()
+        return response
+    def classify_document(self, images: List[Image.Image]) -> Dict:
+        """
+        Classify a document by analyzing the first page only.
+        First page typically contains header/title which identifies document type.
+        Args:
+            images: List of PIL Images (one per page)
+        Returns:
+            Dict with document_type (one of the 12 category names, or
+            "Unknown" when no pages are given) and num_pages
+        """
+        if not images:
+            return {
+                "document_type": "Unknown",
+                "num_pages": 0
+            }
+        print(f"  Classifying document ({len(images)} pages, analyzing first page)...")
+        # Classification question with 12-class system
+        # Tier 1: Main business documents (7 classes)
+        # Tier 2: Grouped categories (5 classes)
+        classification_question = """What type of document is this?
+Choose ONE from these categories:
+- Invoice (factura, bill for payment)
+- PurchaseOrder (order form, purchase request)
+- DeliveryNote (delivery slip, shipping document)
+- CreditNote (credit memo, refund document)
+- DebitNote (debit memo, additional charge)
+- OrderConfirmation (order acknowledgment)
+- QuotationOffer (quote, price proposal)
+- IdentityDocument (ID card, passport, DNI, NIE)
+- PayrollDocument (salary slip, work contract)
+- VehicleDocument (car papers, registration, insurance, ITV)
+- EmployeeDocument (employee records, HR documents)
+- Other (anything else)
+Answer with just the category name, nothing else."""
+        # Get document type from first page only (fastest approach)
+        doc_type = self.ask_about_image(images[0], classification_question)
+        # Clean up and normalize response: keep the first line only
+        doc_type = doc_type.strip().split('\n')[0].strip()
+        doc_type = self._normalize_category(doc_type)
+        print(f"  → Document type: {doc_type}")
+        return {
+            "document_type": doc_type,
+            "num_pages": len(images)
+        }
+    def _normalize_category(self, raw_type: str) -> str:
+        """
+        Normalize VLM output to standard category names.
+        An answer that is exactly one of the canonical names (ignoring case,
+        spaces and punctuation) is returned as that name. Anything else goes
+        through keyword matching that maps variations and translations to
+        canonical names, with 'Other' as the fallback.
+        """
+        import re
+        canonical_names = (
+            'Invoice', 'PurchaseOrder', 'DeliveryNote', 'CreditNote',
+            'DebitNote', 'OrderConfirmation', 'QuotationOffer',
+            'IdentityDocument', 'PayrollDocument', 'VehicleDocument',
+            'EmployeeDocument', 'Other',
+        )
+        raw_lower = raw_type.lower().strip().rstrip('.')
+        # Exact answers first: the prompt asks for names such as "CreditNote",
+        # which contain none of the spaced keywords below ("credit note").
+        compact = re.sub(r'[^a-z]', '', raw_lower)
+        for name in canonical_names:
+            if compact == name.lower():
+                return name
+        def mentions(*keywords):
+            """True if any keyword occurs; keywords of <= 3 letters must be whole words."""
+            for keyword in keywords:
+                if len(keyword) <= 3:
+                    # Avoid 'car' in "card", 'hr' in "three", 'nie' in "convenience".
+                    if re.search(rf'\b{re.escape(keyword)}\b', raw_lower):
+                        return True
+                elif keyword in raw_lower:
+                    return True
+            return False
+        # Main business documents (Tier 1)
+        if mentions('invoice', 'factura', 'bill'):
+            if 'credit' in raw_lower:
+                return 'CreditNote'
+            if 'debit' in raw_lower:
+                return 'DebitNote'
+            return 'Invoice'
+        if mentions('purchase', 'order form', 'compra'):
+            return 'PurchaseOrder'
+        if mentions('delivery', 'shipping', 'albarán', 'entrega'):
+            return 'DeliveryNote'
+        if mentions('credit note', 'creditnote', 'credit memo', 'refund'):
+            return 'CreditNote'
+        if mentions('debit note', 'debitnote', 'debit memo'):
+            return 'DebitNote'
+        if mentions('order confirmation', 'confirmation', 'confirmación'):
+            return 'OrderConfirmation'
+        if mentions('quotation', 'quote', 'offer', 'presupuesto', 'oferta'):
+            return 'QuotationOffer'
+        # Grouped categories (Tier 2)
+        if mentions('identity', 'passport', 'dni', 'nie', 'id card', 'identificación'):
+            return 'IdentityDocument'
+        if mentions('payroll', 'salary', 'wage', 'nómina', 'work contract', 'contrato'):
+            return 'PayrollDocument'
+        if mentions('vehicle', 'car', 'registration', 'insurance', 'itv', 'circulación', 'seguro', 'ficha técnica'):
+            return 'VehicleDocument'
+        if mentions('employee', 'hr', 'personnel', 'empleado'):
+            return 'EmployeeDocument'
+        if mentions('receipt', 'recibo', 'ticket'):
+            return 'Invoice'  # Map receipts to Invoice
+        if mentions('utility', 'electric', 'gas', 'water', 'luz', 'agua'):
+            return 'Invoice'  # Utility bills are invoices
+        # Default
+        return 'Other'
+if __name__ == "__main__":
+    # Manual smoke test: load the model and query it about a blank white image.
+    print("Initializing SmolVLM classifier...")
+    classifier = SmolVLMClassifier()
+    blank_page = Image.new("RGB", (400, 300), color="white")
+    smoke_question = "What do you see in this image?"
+    response = classifier.ask_about_image(blank_page, smoke_question)
+    print(f"Test response: {response}")