Spaces:

HimankJ
/

SmartDocumentClassifier

Build error

App Files Files Community

HimankJ committed on Dec 15, 2024

Commit

90a97f3

verified ·

1 Parent(s): e4f1bf4

Upload 5 files

Browse files

Files changed (5) hide show

Dockerfile +17 -0
app.py +86 -0
pdf_img_convert.py +30 -0
requirements.txt +18 -0
text_extractor.py +32 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,17 @@

+# Python >= 3.10 is required by the versions pinned in requirements.txt
+# (matplotlib 3.10.0, scipy 1.14.1 and scikit-image 0.25.0 do not support 3.9),
+# so the 3.9 base image cannot complete `pip install`.
+FROM python:3.10-slim-bookworm
+WORKDIR /app
+# poppler-utils backs pdf2image. libgl1 and libglib2.0-0 are needed to import
+# cv2 (opencv-python) on a slim image; libgomp1 is commonly required by
+# paddlepaddle's native libraries.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    poppler-utils \
+    libgl1 \
+    libglib2.0-0 \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+# Copy the dependency list first so the pip layer is cached across code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Port the Gradio app binds to in app.py.
+EXPOSE 7860
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import gradio as gr
+import os, pickle
+from text_extractor import OCRProcessor
+import shutil
+from loguru import logger
+class DocumentClassifier:
+    """Classify an uploaded document from its OCR text.
+    The document is run through ``OCRProcessor`` and the extracted text is
+    passed to a pickled model exposing ``predict_proba``; the highest
+    probability index is mapped to a label through ``label_mapper``.
+    """
+    def __init__(self):
+        self.ocr_processor = OCRProcessor()
+        # NOTE(security): pickle.load can execute arbitrary code embedded in the
+        # file; only ship model files produced by a trusted training pipeline.
+        with open('model/lr_classifier_v1.pkl', 'rb') as doc_cat_file:
+            self.model = pickle.load(doc_cat_file)
+        # Create temporary directories
+        self.temp_folder = 'temp_files'
+        self.temp_output = 'temp_output'
+        os.makedirs(self.temp_folder, exist_ok=True)
+        os.makedirs(self.temp_output, exist_ok=True)
+        # Index of the model's output column -> human-readable category.
+        self.label_mapper = {
+            0: 'cable',
+            1: 'fuses',
+            2: 'lighting',
+            3: 'others'
+        }
+    def cleanup(self):
+        """Remove both temporary directories, ignoring any deletion errors."""
+        shutil.rmtree(self.temp_folder, ignore_errors=True)
+        shutil.rmtree(self.temp_output, ignore_errors=True)
+    def process_document(self, file):
+        """Run OCR on ``file`` and classify the extracted text.
+        Args:
+            file: Either a path string or an object whose ``name`` attribute
+                holds the path of the uploaded file.
+        Returns:
+            On success, a dict with the keys ``'Classification'`` and
+            ``'Confidence Score'`` (the score is a string rounded to two
+            decimals). On failure, a plain message string describing why no
+            classification was produced.
+        """
+        if file is None:
+            return "Error processing document: no file was uploaded"
+        try:
+            # Accept both a plain path and an upload object exposing `.name`.
+            file_path = getattr(file, 'name', file)
+            # Perform OCR
+            raw_text = self.ocr_processor.perform_ocr(
+                file_path,
+                self.temp_output
+            )
+            if not raw_text:
+                return "No text could be extracted from the document"
+            predicted_probabilities = self.model.predict_proba([raw_text])[0]
+            # Cast to a builtin int so the dict lookup does not rely on the
+            # array library's scalar hashing.
+            predicted_category_index = int(predicted_probabilities.argmax())
+            predicted_category = self.label_mapper[predicted_category_index]
+            confidence_score = predicted_probabilities[predicted_category_index]
+            return {
+                'Classification': predicted_category,
+                'Confidence Score': str(round(confidence_score, 2))
+            }
+        except Exception as e:
+            logger.error(f"Error processing document: {str(e)}")
+            return f"Error processing document: {str(e)}"
+        finally:
+            # Runs on every exit path, including the "no text" early return,
+            # so intermediate images never accumulate between requests.
+            self.cleanup()
+# Single shared instance: the OCR engine and the model are loaded once at import.
+classifier = DocumentClassifier()
+def classify_document(file):
+    """Gradio callback: classify ``file`` and return (label, confidence).
+    Raises:
+        gr.Error: when the classifier reports a failure, so the message is
+            shown in the UI.
+    """
+    result = classifier.process_document(file)
+    if not isinstance(result, dict):
+        # process_document signals failures with a plain message string;
+        # indexing that string with 'Classification' would raise TypeError.
+        raise gr.Error(str(result))
+    return result['Classification'], result['Confidence Score']
+# Widgets and styling are declared first so the Interface call is pure wiring.
+_upload_input = gr.File(label="Upload PDF or Image")
+_result_outputs = [
+    gr.Label(label="Classification"),
+    gr.Label(label="Confidence Score"),
+]
+_custom_css = """
+        .gradio-container {
+            font-family: 'Quicksand', sans-serif !important;
+        }
+        .gr-button {
+            font-weight: 600;
+        }
+    """
+iface = gr.Interface(
+    fn=classify_document,
+    inputs=_upload_input,
+    outputs=_result_outputs,
+    title="📄 Smart Document Classifier",
+    description="Upload your PDF or image documents and let AI classify them automatically into categories: cable, fuses, lighting, or others.",
+    theme=gr.themes.Citrus(),
+    css=_custom_css,
+)
+# Bind to all interfaces on the port the Dockerfile exposes.
+if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)

pdf_img_convert.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from loguru import logger
+from pdf2image import convert_from_path
+import os, shutil
+class PDFtoImage():
+    """Turn an input document into a list of image file paths."""
+    def __init__(self):
+        logger.info('PDFtoImage class ready!')
+    def pdf_to_img_conversion(self,file_path,outputFolderPath):
+        """Return image paths for ``file_path`` inside ``outputFolderPath``.
+        A ``.pdf`` input (case-insensitive extension) is rendered to JPEG
+        files via ``convert_from_path``; any other input is copied into the
+        output folder unchanged and returned as a single-element list.
+        Returns:
+            A list of image paths, or ``None`` if conversion or copying raised.
+        """
+        # exist_ok avoids the check-then-create race when two requests hit
+        # the same folder at once.
+        os.makedirs(outputFolderPath, exist_ok=True)
+        try:
+            file_name = os.path.basename(file_path)
+            file_ext = os.path.splitext(file_path)[1].lower()
+            if file_ext == '.pdf':
+                # paths_only=True makes pdf2image return file paths rather
+                # than loaded image objects.
+                images = convert_from_path(file_path,output_folder=outputFolderPath,fmt='jpg',thread_count=2,paths_only=True,output_file=file_name)
+                logger.info(f'Total images after conversion: {len(images)}')
+                return images
+            logger.info('Input type is not PDF, no conversion needed')
+            image_path = os.path.join(outputFolderPath, file_name)
+            shutil.copy2(file_path, image_path)
+            return [image_path]
+        except Exception as e:
+            # Best-effort by design: callers treat None as "nothing to OCR".
+            logger.error(f'PDFtoImage pdfToImageConversion ERROR: {e}')
+            return None

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+joblib==1.4.2
+loguru==0.7.3
+matplotlib==3.10.0
+numpy==1.26.4
+opencv-python==4.10.0.84
+openpyxl==3.1.5
+paddleocr==2.9.1
+paddlepaddle==2.6.2
+pandas==2.2.3
+pdf2image==1.17.0
+pillow==11.0.0
+pytz==2024.2
+requests==2.32.3
+scikit-image==0.25.0
+scikit-learn==1.4.0
+scipy==1.14.1
+xgboost==2.1.3
+gradio
+# Imported at the top of text_extractor.py; without these the app fails at import.
+azure-cognitiveservices-vision-computervision
+msrest

text_extractor.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os, time
+from paddleocr import PaddleOCR
+from pdf_img_convert import PDFtoImage
+from azure.cognitiveservices.vision.computervision import ComputerVisionClient
+from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
+from msrest.authentication import CognitiveServicesCredentials
+from loguru import logger
+class OCRProcessor:
+    """Extract text from a PDF or image by running PaddleOCR on its pages."""
+    def __init__(self):
+        self.pdf_img_convert = PDFtoImage()
+        self.ocr = PaddleOCR(use_angle_cls=True, lang='en')
+    def perform_ocr(self, file_path, output_folder):
+        """Return the recognised text of ``file_path`` as one string.
+        The input is first turned into image files inside ``output_folder``.
+        Each image contributes its recognised fragments, each followed by a
+        space, then a newline; the combined text is stripped at both ends.
+        Returns:
+            The extracted text, or ``""`` when no image could be produced or
+            nothing was recognised.
+        """
+        os.makedirs(output_folder, exist_ok=True)
+        images = self.pdf_img_convert.pdf_to_img_conversion(file_path,output_folder)
+        if not images:
+            # The converter returns None on failure; previously this path hit
+            # an unbound `combined_text` and raised UnboundLocalError.
+            return ""
+        page_texts = []
+        for image in images:
+            result = self.ocr.ocr(image, cls=True)
+            fragments = []
+            # Guard against None in place of a result list or of a page entry
+            # (seen when nothing is detected).
+            for res in result or []:
+                if not res:
+                    continue
+                # line[1][0] is the text part of each OCR line entry.
+                fragments.extend(f'{line[1][0]} ' for line in res)
+            page_texts.append(''.join(fragments) + '\n')
+        return ''.join(page_texts).strip()