HimankJ committed on
Commit
90a97f3
·
verified ·
1 Parent(s): e4f1bf4

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +17 -0
  2. app.py +86 -0
  3. pdf_img_convert.py +30 -0
  4. requirements.txt +18 -0
  5. text_extractor.py +32 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Python >= 3.10 is required: the pinned matplotlib 3.10.0, scipy 1.14.1 and
# scikit-image 0.25.0 releases do not publish packages for Python 3.9, so
# `pip install -r requirements.txt` cannot resolve on a 3.9 base image.
FROM python:3.10-slim-bookworm

WORKDIR /app

# poppler-utils: backend used by pdf2image to rasterise PDFs.
# libgl1 / libglib2.0-0 / libgomp1: shared libraries that OpenCV and
# PaddlePaddle typically need at import time and that slim images omit
# (NOTE(review): confirm against the final image with `python -c "import cv2, paddle"`).
RUN apt-get update && apt-get install -y --no-install-recommends \
    poppler-utils \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy the dependency list first so the pip layer is cached across code changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Port the Gradio app in app.py listens on.
EXPOSE 7860

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os, pickle
3
+ from text_extractor import OCRProcessor
4
+ import shutil
5
+ from loguru import logger
6
+
7
class DocumentClassifier:
    """Classify an uploaded document by running OCR and a pickled text model.

    The OCR text is passed to ``self.model.predict_proba`` and the index of
    the highest probability is mapped to a label through ``label_mapper``.
    """

    def __init__(self):
        self.ocr_processor = OCRProcessor()
        # NOTE: pickle.load can execute arbitrary code; only ship model files
        # that come from a trusted source.
        with open('model/lr_classifier_v1.pkl', 'rb') as doc_cat_file:
            self.model = pickle.load(doc_cat_file)

        # Create temporary directories
        self.temp_folder = 'temp_files'
        self.temp_output = 'temp_output'
        os.makedirs(self.temp_folder, exist_ok=True)
        os.makedirs(self.temp_output, exist_ok=True)
        # Maps the model's class index to a human-readable category.
        self.label_mapper = {
            0: 'cable',
            1: 'fuses',
            2: 'lighting',
            3: 'others',
        }

    def cleanup(self):
        """Clean up temporary files"""
        shutil.rmtree(self.temp_folder, ignore_errors=True)
        shutil.rmtree(self.temp_output, ignore_errors=True)

    def process_document(self, file):
        """Run OCR on ``file`` and classify the extracted text.

        Args:
            file: Either an object exposing the path as ``.name`` or a plain
                path string.

        Returns:
            On success, a dict with the keys ``'Classification'`` and
            ``'Confidence Score'`` (the latter a string rounded to two
            decimals). On any failure, a human-readable message string.
        """
        if file is None:
            return "No file was uploaded"

        try:
            # Accept both file-like objects (with .name) and bare path strings.
            file_path = getattr(file, 'name', file)

            # Perform OCR
            raw_text = self.ocr_processor.perform_ocr(
                file_path,
                self.temp_output,
            )
            if not raw_text:
                return "No text could be extracted from the document"

            probabilities = self.model.predict_proba([raw_text])[0]
            best_index = int(probabilities.argmax())
            confidence = float(probabilities[best_index])

            return {
                'Classification': self.label_mapper[best_index],
                'Confidence Score': str(round(confidence, 2)),
            }

        except Exception as e:
            logger.error(f"Error processing document: {str(e)}")
            return f"Error processing document: {str(e)}"

        finally:
            # Runs on every exit path, including the "no text" early return,
            # so temporary files never outlive a request.
            self.cleanup()
58
+
59
# Single shared classifier instance, built once at import time.
classifier = DocumentClassifier()


def classify_document(file):
    """Gradio callback: classify ``file`` and return (label, confidence).

    ``DocumentClassifier.process_document`` returns a dict on success and a
    plain message string on failure. The failure message is surfaced to the
    user as a ``gr.Error`` instead of being indexed like a dict, which would
    raise ``TypeError``.
    """
    result = classifier.process_document(file)
    if not isinstance(result, dict):
        raise gr.Error(str(result))
    return result['Classification'], result['Confidence Score']
64
+
65
# Text shown in the page header of the Gradio app.
APP_TITLE = "📄 Smart Document Classifier"
APP_DESCRIPTION = (
    "Upload your PDF or image documents and let AI classify them "
    "automatically into categories: cable, fuses, lighting, or others."
)

# CSS overrides passed to the interface.
CUSTOM_CSS = """
.gradio-container {
font-family: 'Quicksand', sans-serif !important;
}
.gr-button {
font-weight: 600;
}
"""

# One file input, two label outputs (category and confidence score).
iface = gr.Interface(
    fn=classify_document,
    inputs=gr.File(label="Upload PDF or Image"),
    outputs=[
        gr.Label(label="Classification"),
        gr.Label(label="Confidence Score"),
    ],
    title=APP_TITLE,
    description=APP_DESCRIPTION,
    theme=gr.themes.Citrus(),
    css=CUSTOM_CSS,
)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    iface.launch(server_name="0.0.0.0", server_port=7860)
pdf_img_convert.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from loguru import logger
2
+ from pdf2image import convert_from_path
3
+ import os, shutil
4
+
5
class PDFtoImage():
    """Turn an input document into a list of image file paths."""

    def __init__(self):
        logger.info('PDFtoImage class ready!')

    def pdf_to_img_conversion(self, file_path, outputFolderPath):
        """Return image paths for ``file_path`` inside ``outputFolderPath``.

        A ``.pdf`` input is rendered to JPEG files through
        ``convert_from_path``. Any other input is treated as an image and
        copied into the output folder.

        Args:
            file_path: Path of the source document.
            outputFolderPath: Folder that receives the images; created if
                it does not exist.

        Returns:
            A list of image paths, or ``None`` if conversion or copying
            raised an exception (the error is logged).
        """
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(outputFolderPath, exist_ok=True)
        try:
            file_name = os.path.basename(file_path)
            file_ext = os.path.splitext(file_path)[1].lower()

            if file_ext == '.pdf':
                images = convert_from_path(
                    file_path,
                    output_folder=outputFolderPath,
                    fmt='jpg',
                    thread_count=2,
                    paths_only=True,
                    output_file=file_name,
                )
                total_images = len(images)
                logger.info(f'Total images after conversion: {total_images}')
                return images

            logger.info('Input type is not PDF, no conversion needed')
            image_path = os.path.join(outputFolderPath, file_name)
            # Copying a file onto itself raises shutil.SameFileError, which
            # would turn a perfectly usable input into a None result.
            if os.path.abspath(file_path) != os.path.abspath(image_path):
                shutil.copy2(file_path, image_path)
            return [image_path]

        except Exception as e:
            logger.error(f'PDFtoImage pdfToImageConversion ERROR: {e}')
            return None
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
joblib==1.4.2
loguru==0.7.3
matplotlib==3.10.0
numpy==1.26.4
opencv-python==4.10.0.84
openpyxl==3.1.5
paddleocr==2.9.1
paddlepaddle==2.6.2
pandas==2.2.3
pdf2image==1.17.0
pillow==11.0.0
pytz==2024.2
requests==2.32.3
scikit-image==0.25.0
scikit-learn==1.4.0
scipy==1.14.1
xgboost==2.1.3
# app.py uses gr.themes.Citrus(), which older Gradio releases do not provide.
gradio>=5.0
# Imported at module level by text_extractor.py; without them the app fails at import.
azure-cognitiveservices-vision-computervision
msrest
text_extractor.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, time
2
+ from paddleocr import PaddleOCR
3
+ from pdf_img_convert import PDFtoImage
4
+ from azure.cognitiveservices.vision.computervision import ComputerVisionClient
5
+ from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
6
+ from msrest.authentication import CognitiveServicesCredentials
7
+ from loguru import logger
8
+
9
class OCRProcessor:
    """Extract text from a PDF or image file using PaddleOCR."""

    def __init__(self):
        self.pdf_img_convert = PDFtoImage()
        self.ocr = PaddleOCR(use_angle_cls=True, lang='en')

    def perform_ocr(self, file_path, output_folder):
        """Return the OCR text of ``file_path``.

        The file is first turned into one or more images inside
        ``output_folder``. Each recognised line is followed by a space and
        each image by a newline; the combined text is stripped at both ends.

        Args:
            file_path: Path of the document to read.
            output_folder: Folder for the intermediate images; created if
                it does not exist.

        Returns:
            The combined text, or ``None`` when no images could be produced.
        """
        os.makedirs(output_folder, exist_ok=True)

        images = self.pdf_img_convert.pdf_to_img_conversion(file_path, output_folder)
        if not images:
            return None

        chunks = []
        for image in images:
            result = self.ocr.ocr(image, cls=True)
            for page in result or []:
                # PaddleOCR reports an image without detections as None;
                # iterating it would raise TypeError.
                if not page:
                    continue
                # Each entry is (box, (text, score)); keep only the text.
                chunks.extend(f'{line[1][0]} ' for line in page)
            chunks.append('\n')

        return ''.join(chunks).strip()