File size: 8,706 Bytes
26b6aee
 
ba637e9
28c44fb
 
 
 
 
6a30f12
 
26b6aee
 
 
94441b4
ba637e9
26b6aee
ba637e9
26b6aee
 
 
 
192d1e1
b0c1807
26b6aee
 
 
 
 
2f9b0a0
 
 
26b6aee
2f9b0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26b6aee
2f9b0a0
 
 
 
 
 
 
 
26b6aee
2f9b0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26b6aee
2f9b0a0
 
 
26b6aee
2f9b0a0
 
dd447df
2f9b0a0
 
dd447df
2f9b0a0
 
28c44fb
2f9b0a0
b0c1807
 
 
 
 
 
6a30f12
6bdda47
 
 
 
ba637e9
 
 
6a30f12
 
8ebc49f
6a30f12
6bdda47
 
 
ceadc69
2f9b0a0
 
 
 
 
 
 
 
 
 
 
6a30f12
26b6aee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f9b0a0
 
 
26b6aee
 
ceadc69
26b6aee
 
 
 
 
2f9b0a0
 
 
26b6aee
 
 
 
 
 
6bdda47
 
 
 
26b6aee
6bdda47
 
 
 
2f9b0a0
26b6aee
2f9b0a0
 
26b6aee
 
dcb2ee5
26b6aee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import os
import subprocess
import cv2 as cv  # Ensure OpenCV is installed
import numpy as np
import pytesseract
from pdf2image import convert_from_path
import gradio as gr
import json
from PIL import Image

# Ensure poppler-utils and tesseract-ocr are installed
def install_dependencies():
    """Run ``setup.sh`` through bash and echo what it printed.

    On success the script's captured stdout is printed. If the script exits
    with a non-zero status, its captured stderr is printed and the
    ``subprocess.CalledProcessError`` is re-raised to the caller.
    """
    command = ["bash", "setup.sh"]
    try:
        completed = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as exc:
        print(f"An error occurred while installing dependencies: {exc.stderr}")
        raise
    else:
        print(completed.stdout)

# Executed at import time, before any of the functions below are used.
install_dependencies()

# Function to rescale the frame
def rescale_frame(frame, scale=0.75):
    """Return ``frame`` resized by ``scale`` along both axes.

    The target width and height are the source dimensions multiplied by
    ``scale`` and truncated to integers; resampling uses ``cv.INTER_AREA``.
    """
    target_size = (int(frame.shape[1] * scale), int(frame.shape[0] * scale))
    return cv.resize(frame, target_size, interpolation=cv.INTER_AREA)

# Image Analysis
def analyze_image(image):
    """Collect simple quality statistics for ``image``.

    The image is converted to grayscale with ``cv.COLOR_BGR2GRAY`` and the
    following values are returned in a dict:

    * ``mean_brightness`` - mean of the grayscale pixel values
    * ``contrast`` - standard deviation of the grayscale pixel values
    * ``noise`` - variance of the Laplacian of the grayscale image
    * ``skew_angle`` - result of ``detect_skew`` on the grayscale image
    """
    grayscale = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    laplacian = cv.Laplacian(grayscale, cv.CV_64F)
    return {
        'mean_brightness': np.mean(grayscale),
        'contrast': grayscale.std(),
        'noise': laplacian.var(),
        'skew_angle': detect_skew(grayscale),
    }

def detect_skew(image):
    """Estimate the skew angle, in degrees, of the non-zero pixels in ``image``.

    The angle is taken from the minimum-area rectangle enclosing every pixel
    whose value is greater than zero and folded into the range [-45, 45].

    Args:
        image: 2-D array (the caller in this file passes a grayscale image).

    Returns:
        The folded angle as a float, or ``0.0`` when ``image`` contains no
        non-zero pixels.
    """
    # NOTE(review): ``image > 0`` selects every non-black pixel. On a page with
    # a light background that is nearly the whole frame, so the rectangle may
    # describe the image border rather than the text - consider thresholding
    # and inverting before calling this. Left unchanged here.
    coords = np.column_stack(np.where(image > 0))
    if coords.size == 0:
        # minAreaRect cannot work on an empty point set; nothing to measure.
        return 0.0

    # np.where yields 64-bit indices; hand OpenCV an explicit 32-bit point set.
    angle = cv.minAreaRect(coords.astype(np.int32))[-1]

    # The reported range depends on the OpenCV release: older builds use
    # [-90, 0), newer ones (reportedly 4.5.1+ - confirm against the installed
    # version) use (0, 90]. Both describe the same rectangle 90 degrees apart,
    # so fold either form into [-45, 45].
    if angle < -45:
        angle = -(90 + angle)
    elif angle > 45:
        angle = 90 - angle
    else:
        angle = -angle
    return angle

# Adaptive Preprocessing Pipeline
def preprocess_image_adaptive(image):
    """Clean up ``image`` for OCR according to its measured statistics.

    ``analyze_image`` is run once on the incoming image. Each correction in
    the table below is applied, in order, only when its condition holds. The
    resulting image is converted to grayscale and binarized with Gaussian
    adaptive thresholding.

    Returns:
        The binarized single-channel image.
    """
    stats = analyze_image(image)

    # (condition, correction) pairs; all conditions use the initial analysis.
    corrections = (
        (stats['mean_brightness'] < 50, lambda img: adjust_brightness(img, 1.5)),
        (stats['contrast'] < 50, lambda img: adjust_contrast(img, 1.5)),
        (stats['noise'] > 1000, reduce_noise),
        (abs(stats['skew_angle']) > 5, lambda img: deskew(img, stats['skew_angle'])),
    )
    for needed, apply_fix in corrections:
        if needed:
            image = apply_fix(image)

    grayscale = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    return cv.adaptiveThreshold(
        grayscale, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 11, 2
    )

def adjust_brightness(image, factor):
    """Scale ``image`` by ``factor`` (no offset) using ``cv.convertScaleAbs``."""
    brightened = cv.convertScaleAbs(image, alpha=factor, beta=0)
    return brightened

def adjust_contrast(image, alpha):
    """Scale ``image`` by ``alpha`` (no offset) using ``cv.convertScaleAbs``."""
    stretched = cv.convertScaleAbs(image, alpha=alpha, beta=0)
    return stretched

def reduce_noise(image):
    """Denoise ``image`` with ``cv.fastNlMeansDenoisingColored``."""
    # Positional arguments after dst: h, hColor, templateWindowSize, searchWindowSize.
    filter_strength, color_strength = 30, 30
    template_window, search_window = 7, 21
    return cv.fastNlMeansDenoisingColored(
        image, None, filter_strength, color_strength, template_window, search_window
    )

def deskew(image, angle):
    """Rotate ``image`` by ``angle`` degrees about its centre, keeping its size.

    Uses cubic interpolation and replicates border pixels into the areas the
    rotation uncovers.
    """
    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    rotation = cv.getRotationMatrix2D(pivot, angle, 1.0)
    return cv.warpAffine(
        image, rotation, (width, height),
        flags=cv.INTER_CUBIC, borderMode=cv.BORDER_REPLICATE,
    )

def convert_to_pil(image):
    """Convert an OpenCV-style array into a ``PIL.Image``.

    Args:
        image: A single-channel array (2-D, or 3-D with one channel) or a
            multi-channel array that is treated as BGR(A).

    Returns:
        The PIL image, or ``None`` when ``image`` is ``None`` or empty.
    """
    if image is None or image.size == 0:
        print("Error: Empty image passed to convert_to_pil")
        return None
    print("Converting image to PIL format")
    # Ensure the array is in uint8 format
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)
    if image.ndim == 3 and image.shape[2] == 1:
        # Drop the trailing singleton channel so it is handled as grayscale.
        image = image[:, :, 0]
    if image.ndim == 2:
        # Single-channel input (this is what the binarizing preprocessing step
        # produces): there are no colour channels to reorder, and the
        # BGR->RGB conversion rejects 1-channel images.
        return Image.fromarray(image)
    return Image.fromarray(cv.cvtColor(image, cv.COLOR_BGR2RGB))

def extract_text_from_image(image, langs='tel+osd+eng'):
    """Run Tesseract on ``image`` and return the recognized text.

    Args:
        image: Array accepted by ``convert_to_pil``.
        langs: Value passed as Tesseract's ``lang`` argument.

    Returns:
        The recognized text, or ``""`` when the image cannot be converted or
        Tesseract raises ``pytesseract.TesseractError``.
    """
    page = convert_to_pil(image)
    if page is None:
        print("Error: Failed to convert image to PIL format")
        return ""

    try:
        recognized = pytesseract.image_to_string(page, lang=langs, config=r'--oem 3 --psm 6')
    except pytesseract.TesseractError as err:
        print(f"Tesseract error: {err}")
        recognized = ""
    return recognized

def process_image(img):
    """Preprocess ``img`` and return the text OCR finds in the result.

    Returns ``""`` if preprocessing yields ``None``.
    """
    prepared = preprocess_image_adaptive(img)
    return "" if prepared is None else extract_text_from_image(prepared)

# Directory that receives the exported JSON file.
output_dir = "output"
# exist_ok avoids the check-then-create race of testing for the path first.
os.makedirs(output_dir, exist_ok=True)

# Saved page contents, keyed by "Page number: N"; shared module-wide.
all_texts = {}

def save_and_next(page_num, text, extracted_texts, original_images, total_pages):
    """Store the edited text of the current page and move on to the next page.

    The non-blank lines of ``text`` are recorded in ``all_texts`` under
    ``"Page number: <page_num>"`` and the whole mapping is rewritten to
    ``all_texts.json`` inside ``output_dir``. If another page exists it is
    OCR'd and its text appended to ``extracted_texts``.

    Returns:
        A 4-tuple ``(text, page number, image, json path)``. When there is no
        further page the text is ``"All pages processed"``, the page number is
        unchanged and the image is ``None``.
    """
    current_page = int(page_num)  # coerce to int before doing arithmetic
    page_count = int(total_pages)

    content_lines = [line for line in text.split('\n') if line.strip()]
    all_texts[f"Page number: {current_page}"] = {"Content": content_lines}

    json_path = os.path.join(output_dir, "all_texts.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(all_texts, f, ensure_ascii=False, indent=4)

    upcoming_page = current_page + 1
    if upcoming_page > page_count:
        return "All pages processed", current_page, None, json_path

    upcoming_image = original_images[upcoming_page - 1]
    ocr_text = process_image(upcoming_image)
    extracted_texts.append(ocr_text)
    return (
        gr.update(value=ocr_text),
        upcoming_page,
        gr.update(value=upcoming_image, height=None, width=None),
        json_path,
    )

def skip_page(page_num, extracted_texts, original_images, total_pages):
    """Move on to the next page without saving the current one.

    If another page exists it is OCR'd and its text appended to
    ``extracted_texts``.

    Returns:
        A 3-tuple ``(text, page number, image)``. When there is no further
        page the text is ``"All pages processed"``, ``page_num`` is returned
        as received and the image is ``None``.
    """
    upcoming_page = int(page_num) + 1  # coerce to int before incrementing
    if upcoming_page > int(total_pages):
        return "All pages processed", page_num, None

    upcoming_image = original_images[upcoming_page - 1]
    ocr_text = process_image(upcoming_image)
    extracted_texts.append(ocr_text)
    return (
        gr.update(value=ocr_text),
        upcoming_page,
        gr.update(value=upcoming_image, height=None, width=None),
    )

def upload_pdf(pdf):
    """Convert an uploaded PDF to page images and OCR the first page.

    Args:
        pdf: The uploaded file, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing the path
            through its ``.name`` attribute.

    Returns:
        A 6-tuple in the order the upload handler's outputs are wired:
        ``(page image, text, page number, extracted texts, page images,
        total pages)``. On failure the image slot is ``None``, the text slot
        carries the error message, and the remaining slots are ``0`` / empty
        lists.
    """
    # Accept both a plain path and a file-like wrapper with a ``name``.
    pdf_path = pdf if isinstance(pdf, (str, os.PathLike)) else pdf.name
    pages = convert_from_path(pdf_path)
    if not pages:
        print("Error: No pages found in PDF")
        # Message goes in the text slot (2nd); the image slot (1st) is cleared.
        return None, "Error: No pages found in PDF", 0, [], [], 0
    print(f"PDF converted to {len(pages)} images")

    # Convert every page once and reuse the first array for the OCR below.
    original_images = [np.array(page) for page in pages]
    first_page = original_images[0]
    if first_page.size == 0:
        print("Error: First page is empty")
        return None, "Error: First page is empty", 0, [], [], 0

    text = process_image(first_page)
    extracted_texts = [text]
    return (
        gr.update(value=first_page, height=None, width=None),
        gr.update(value=text),
        1,
        extracted_texts,
        original_images,
        len(pages),
    )

def navigate_to_page(page_num, extracted_texts, original_images):
    """Show the image and text of page ``page_num`` (1-based).

    Args:
        page_num: Requested page number; coerced to ``int``.
        extracted_texts: Texts OCR'd so far, indexed by page; may be shorter
            than ``original_images`` or ``None``.
        original_images: Page images, one per page; may be ``None``.

    Returns:
        A 3-tuple ``(image, text, page number)``. For an out-of-range page the
        image is ``None`` and the text is ``"Invalid Page Number"``.
    """
    page_num = int(page_num)  # Ensure page_num is an integer
    # Treat unset state as "nothing loaded" instead of failing on len(None).
    original_images = original_images or []
    extracted_texts = extracted_texts or []

    index = page_num - 1
    if not 0 <= index < len(original_images):
        # The message belongs in the text slot; the image slot is cleared.
        return None, gr.update(value="Invalid Page Number"), page_num

    if index < len(extracted_texts):
        text = extracted_texts[index]
    else:
        # Texts are filled in lazily as pages are visited, so a page that has
        # not been reached yet is OCR'd on demand rather than indexed blindly.
        text = process_image(original_images[index])
    return (
        gr.update(value=original_images[index], height=None, width=None),
        gr.update(value=text),
        page_num,
    )

def display_pdf_and_text():
    """Build the Gradio Blocks UI and return it without launching.

    The layout holds a PDF upload control, a page image next to an editable
    text box, a page-number field, "Save and Next" / "Skip" buttons and a row
    intended for per-page buttons. Component creation order below determines
    the on-screen order.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## PDF Viewer and Text Editor")
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        with gr.Row():
            image_output = gr.Image(label="Page Image", type="numpy")
            text_editor = gr.Textbox(label="Extracted Text", lines=10, interactive=True)
        page_num = gr.Number(value=1, label="Page Number", visible=True)
        # State holders filled by upload_pdf and passed back into the handlers.
        extracted_texts = gr.State()
        original_images = gr.State()
        total_pages = gr.State()
        save_next_button = gr.Button("Save and Next")
        skip_button = gr.Button("Skip")
        # Output order here must match the order of upload_pdf's return tuple.
        pdf_input.upload(upload_pdf, inputs=pdf_input, outputs=[image_output, text_editor, page_num, extracted_texts, original_images, total_pages])

        # The gr.File created inline in `outputs` receives the JSON path
        # returned as the fourth value of save_and_next.
        save_next_button.click(fn=save_and_next,
                               inputs=[page_num, text_editor, extracted_texts, original_images, total_pages],
                               outputs=[text_editor, page_num, image_output, gr.File(label="Download JSON")])

        skip_button.click(fn=skip_page,
                          inputs=[page_num, extracted_texts, original_images, total_pages],
                          outputs=[text_editor, page_num, image_output])

        page_buttons = gr.Row()

        def update_page_buttons(total_pages, extracted_texts, original_images):
            """Create one button per page, each wired to navigate_to_page."""
            # NOTE(review): these buttons are created inside an event callback
            # and returned as a list for the Row output - confirm the installed
            # Gradio version actually renders them.
            buttons = []
            for i in range(1, total_pages + 1):
                button = gr.Button(str(i), variant="primary", size="small")
                # NOTE(review): `i` is a plain int here, whereas every other
                # handler in this function receives components as inputs -
                # verify that Gradio accepts a raw value in `inputs`.
                button.click(navigate_to_page, inputs=[i, extracted_texts, original_images], outputs=[image_output, text_editor, page_num])
                buttons.append(button)
            return buttons

        # NOTE(review): relies on gr.State exposing a `.change` event - confirm
        # for the Gradio version in use.
        total_pages.change(fn=update_page_buttons, inputs=[total_pages, extracted_texts, original_images], outputs=[page_buttons])

    return demo

# Build the UI and start serving it; both happen whenever this module is run or imported.
iface = display_pdf_and_text()
iface.launch()