Spaces:

Vipplav
/

telugu_ocr_test

Sleeping

File size: 8,706 Bytes

26b6aee
 
ba637e9
28c44fb
 
 
 
 
6a30f12
 
26b6aee
 
 
94441b4
ba637e9
26b6aee
ba637e9
26b6aee
 
 
 
192d1e1
b0c1807
26b6aee
 
 
 
 
2f9b0a0
 
 
26b6aee
2f9b0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26b6aee
2f9b0a0
 
 
 
 
 
 
 
26b6aee
2f9b0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26b6aee
2f9b0a0
 
 
26b6aee
2f9b0a0
 
dd447df
2f9b0a0
 
dd447df
2f9b0a0
 
28c44fb
2f9b0a0
b0c1807
 
 
 
 
 
6a30f12
6bdda47
 
 
 
ba637e9
 
 
6a30f12
 
8ebc49f
6a30f12
6bdda47
 
 
ceadc69
2f9b0a0
 
 
 
 
 
 
 
 
 
 
6a30f12
26b6aee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f9b0a0
 
 
26b6aee
 
ceadc69
26b6aee
 
 
 
 
2f9b0a0
 
 
26b6aee
 
 
 
 
 
6bdda47
 
 
 
26b6aee
6bdda47
 
 
 
2f9b0a0
26b6aee
2f9b0a0
 
26b6aee
 
dcb2ee5
26b6aee

import os
import subprocess
import cv2 as cv  # Ensure OpenCV is installed
import numpy as np
import pytesseract
from pdf2image import convert_from_path
import gradio as gr
import json
from PIL import Image

# Ensure poppler-utils and tesseract-ocr are installed
def install_dependencies():
    """Run the bundled ``setup.sh`` through bash and echo its output.

    On success the script's captured stdout is printed.

    Raises:
        subprocess.CalledProcessError: re-raised, after the captured stderr
            has been printed, when the script exits with a non-zero status.
    """
    setup_command = ["bash", "setup.sh"]
    try:
        completed = subprocess.run(
            setup_command,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as failure:
        print(f"An error occurred while installing dependencies: {failure.stderr}")
        raise
    else:
        print(completed.stdout)

# Executed at import time, before any of the helpers below are used; a failing
# setup script raises here and stops the module from loading.
install_dependencies()

# Function to rescale the frame
def rescale_frame(frame, scale=0.75):
    """Return ``frame`` resized by ``scale`` along both axes.

    The target width and height are truncated to whole pixels and the
    resize is done with ``cv.INTER_AREA`` interpolation.
    """
    # OpenCV expects (width, height), i.e. shape[1] first, then shape[0].
    target_size = tuple(int(frame.shape[axis] * scale) for axis in (1, 0))
    return cv.resize(frame, target_size, interpolation=cv.INTER_AREA)

# Image Analysis
def analyze_image(image):
    """Collect the image statistics that drive the adaptive preprocessing.

    Args:
        image: Colour image; it is converted with ``cv.COLOR_BGR2GRAY``
            before any measurement is taken.

    Returns:
        dict with the keys ``'mean_brightness'`` (mean grey level),
        ``'contrast'`` (standard deviation of the grey levels), ``'noise'``
        (variance of the Laplacian of the grey image) and ``'skew_angle'``
        (the value ``detect_skew`` reports for the grey image).
    """
    grey = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    return {
        'mean_brightness': np.mean(grey),
        'contrast': grey.std(),
        'noise': cv.Laplacian(grey, cv.CV_64F).var(),
        'skew_angle': detect_skew(grey),
    }

def detect_skew(image):
    """Estimate the skew of a page from the bounding rectangle of its text.

    Args:
        image: Single-channel 8-bit greyscale image (this is what
            ``analyze_image`` passes in). Dark text on a lighter background
            is assumed - NOTE(review): confirm for inverted scans.

    Returns:
        float: Correction angle in degrees, within [-45, 45], using the same
        sign convention as before (the negated rectangle angle), so it can be
        handed straight to ``deskew``. ``0.0`` is returned for an empty image
        or when no foreground pixels are found.
    """
    if image is None or image.size == 0:
        return 0.0

    # Separate ink from paper first. Testing the raw grey image for "> 0"
    # selects practically every pixel of a white page, so the rectangle would
    # describe the frame rather than the text.
    _, ink_mask = cv.threshold(
        image, 0, 255, cv.THRESH_BINARY_INV | cv.THRESH_OTSU
    )
    coords = np.column_stack(np.where(ink_mask > 0))
    if coords.shape[0] == 0:
        return 0.0

    # minAreaRect accepts 32-bit point sets; np.where yields platform ints.
    angle = cv.minAreaRect(coords.astype(np.int32))[-1]

    # OpenCV reports the rectangle angle either in [-90, 0) or in (0, 90]
    # depending on the version. Folding it into [-45, 45) gives the same
    # answer for both ranges and matches the previous "< -45" branch logic.
    folded = (angle + 45.0) % 90.0 - 45.0
    return float(-folded)

# Adaptive Preprocessing Pipeline
def preprocess_image_adaptive(image):
    """Apply only the corrections the image needs, then binarise it.

    The statistics are measured once on the incoming image; they are not
    recomputed after a correction has been applied.

    Args:
        image: Colour image in the channel order expected by
            ``cv.COLOR_BGR2GRAY``.

    Returns:
        Single-channel image produced by Gaussian adaptive thresholding
        (block size 11, constant 2).
    """
    metrics = analyze_image(image)

    # (condition, correction) pairs, listed in the order they are applied.
    corrections = (
        (metrics['mean_brightness'] < 50, lambda img: adjust_brightness(img, 1.5)),
        (metrics['contrast'] < 50, lambda img: adjust_contrast(img, 1.5)),
        (metrics['noise'] > 1000, lambda img: reduce_noise(img)),
        (abs(metrics['skew_angle']) > 5, lambda img: deskew(img, metrics['skew_angle'])),
    )
    for is_needed, apply_correction in corrections:
        if is_needed:
            image = apply_correction(image)

    # Binarise: grayscale first, then adaptive thresholding.
    grey = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    return cv.adaptiveThreshold(
        grey,
        255,
        cv.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv.THRESH_BINARY,
        11,
        2,
    )

def adjust_brightness(image, factor):
    """Multiply the pixel values of ``image`` by ``factor`` with no offset.

    The scaling is done by ``cv.convertScaleAbs``.
    """
    gain = factor
    brightened = cv.convertScaleAbs(image, alpha=gain, beta=0)
    return brightened

def adjust_contrast(image, alpha):
    """Scale the pixel values of ``image`` by the gain ``alpha`` with no offset.

    The scaling is done by ``cv.convertScaleAbs``.
    """
    gain = alpha
    stretched = cv.convertScaleAbs(image, alpha=gain, beta=0)
    return stretched

def reduce_noise(image):
    """Denoise a colour image with ``cv.fastNlMeansDenoisingColored``.

    The call uses a strength of 30 for both the luminance and the colour
    components, a template window of 7 and a search window of 21.
    """
    luminance_strength = 30
    colour_strength = 30
    template_window = 7
    search_window = 21
    return cv.fastNlMeansDenoisingColored(
        image,
        None,
        luminance_strength,
        colour_strength,
        template_window,
        search_window,
    )

def deskew(image, angle):
    """Rotate ``image`` by ``angle`` degrees about its centre.

    The output keeps the input's width and height; pixels rotated in from
    outside the frame are filled by replicating the border, and cubic
    interpolation is used for resampling.
    """
    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    rotation = cv.getRotationMatrix2D(pivot, angle, 1.0)
    return cv.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv.INTER_CUBIC,
        borderMode=cv.BORDER_REPLICATE,
    )

def convert_to_pil(image):
    """Convert an OpenCV-style array into a PIL image.

    Args:
        image: NumPy array, either single-channel (2-D, or 3-D with one
            channel) or multi-channel in BGR order. Non-``uint8`` data is
            cast to ``uint8``.

    Returns:
        A ``PIL.Image.Image``, or ``None`` (after printing an error) when
        ``image`` is ``None`` or has no elements.
    """
    if image is None or image.size == 0:
        print("Error: Empty image passed to convert_to_pil")
        return None
    print("Converting image to PIL format")
    # Ensure the array is in uint8 format
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)

    # The preprocessing pipeline hands over a binarised, single-channel image.
    # BGR->RGB conversion rejects such input, so pass it to PIL unchanged.
    if image.ndim == 2:
        return Image.fromarray(image)
    if image.ndim == 3 and image.shape[2] == 1:
        return Image.fromarray(image[:, :, 0])

    return Image.fromarray(cv.cvtColor(image, cv.COLOR_BGR2RGB))

def extract_text_from_image(image, langs='tel+osd+eng'):
    """Run Tesseract on ``image`` and return the recognised text.

    Args:
        image: Array accepted by ``convert_to_pil``.
        langs: Tesseract language string passed as ``lang``.

    Returns:
        The recognised text, or ``""`` when the image could not be converted
        or Tesseract raised ``TesseractError`` (both cases print a message).
    """
    page = convert_to_pil(image)
    if page is None:
        print("Error: Failed to convert image to PIL format")
        return ""

    # Engine mode 3 and page segmentation mode 6 are passed to Tesseract.
    tesseract_args = r'--oem 3 --psm 6'
    try:
        recognised = pytesseract.image_to_string(page, lang=langs, config=tesseract_args)
    except pytesseract.TesseractError as err:
        print(f"Tesseract error: {err}")
        recognised = ""
    return recognised

def process_image(img):
    """Preprocess one page image and return the text recognised on it.

    Args:
        img: Page as a NumPy array in RGB (or RGBA / greyscale) channel
            order, which is what ``np.array`` yields for the PIL pages that
            the callers in this file pass in.

    Returns:
        The OCR text, or ``""`` when ``img`` is ``None``/empty or the
        preprocessing produced nothing.
    """
    if img is None or getattr(img, "size", 0) == 0:
        return ""

    # The preprocessing helpers use BGR conversions, while the pages arrive
    # in PIL's RGB order. Convert once here so the grey weights are applied
    # to the right channels.
    if img.ndim == 2:
        bgr = cv.cvtColor(img, cv.COLOR_GRAY2BGR)
    elif img.shape[2] == 4:
        bgr = cv.cvtColor(img, cv.COLOR_RGBA2BGR)
    else:
        bgr = cv.cvtColor(img, cv.COLOR_RGB2BGR)

    preprocessed = preprocess_image_adaptive(bgr)
    if preprocessed is None:
        return ""
    return extract_text_from_image(preprocessed)

# Directory that receives the exported JSON file.
output_dir = "output"
# exist_ok avoids the check-then-create race of testing for the path first.
os.makedirs(output_dir, exist_ok=True)

# Saved page contents keyed by "Page number: N". This is module-level state,
# so it is shared by every caller within the running process.
all_texts = {}

def save_and_next(page_num, text, extracted_texts, original_images, total_pages):
    """Save the edited text of the current page and advance to the next one.

    The non-blank lines of ``text`` are stored in the module-level
    ``all_texts`` under ``"Page number: <page_num>"`` and the whole mapping is
    rewritten to ``<output_dir>/all_texts.json``.

    Args:
        page_num: 1-based number of the page being saved (coerced to int).
        text: Contents of the text editor for that page.
        extracted_texts: Per-page OCR cache; entry ``i`` belongs to page
            ``i + 1``. Updated in place.
        original_images: Page images, indexed like ``extracted_texts``.
        total_pages: Number of pages in the document (coerced to int).

    Returns:
        ``(text update, page number, image update, json path)``. After the
        last page the text is ``"All pages processed"``, the page number is
        unchanged and the image is ``None``.
    """
    page_num = int(page_num)  # Ensure page_num is an integer
    total_pages = int(total_pages)  # Ensure total_pages is an integer

    content_lines = [line for line in text.split('\n') if line.strip() != '']
    all_texts[f"Page number: {page_num}"] = {"Content": content_lines}

    json_path = os.path.join(output_dir, "all_texts.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(all_texts, f, ensure_ascii=False, indent=4)

    next_page_num = page_num + 1
    if next_page_num > total_pages:
        return "All pages processed", page_num, None, json_path

    slot = next_page_num - 1
    next_page_image = original_images[slot]
    text = process_image(next_page_image)

    # Keep the cache aligned with page numbers. Appending on every call made
    # the list drift out of step once a page was visited more than once
    # (for example after jumping back with a page button).
    if slot < len(extracted_texts):
        extracted_texts[slot] = text
    else:
        extracted_texts.extend([""] * (slot - len(extracted_texts)))
        extracted_texts.append(text)

    return (
        gr.update(value=text),
        next_page_num,
        gr.update(value=next_page_image, height=None, width=None),
        json_path,
    )

def skip_page(page_num, extracted_texts, original_images, total_pages):
    """Advance to the next page without saving the current one.

    Args:
        page_num: 1-based number of the page being skipped.
        extracted_texts: Per-page OCR cache; entry ``i`` belongs to page
            ``i + 1``. Updated in place.
        original_images: Page images, indexed like ``extracted_texts``.
        total_pages: Number of pages in the document (coerced to int).

    Returns:
        ``(text update, page number, image update)``. After the last page the
        text is ``"All pages processed"``, ``page_num`` is returned as it was
        received and the image is ``None``.
    """
    next_page_num = int(page_num) + 1  # Ensure page_num is an integer and increment to next page
    total_pages = int(total_pages)  # Ensure total_pages is an integer
    if next_page_num > total_pages:
        return "All pages processed", page_num, None

    slot = next_page_num - 1
    next_page_image = original_images[slot]
    text = process_image(next_page_image)

    # Keep the cache aligned with page numbers. Appending on every call made
    # the list drift out of step once a page was visited more than once.
    if slot < len(extracted_texts):
        extracted_texts[slot] = text
    else:
        extracted_texts.extend([""] * (slot - len(extracted_texts)))
        extracted_texts.append(text)

    return (
        gr.update(value=text),
        next_page_num,
        gr.update(value=next_page_image, height=None, width=None),
    )

def upload_pdf(pdf):
    """Rasterise an uploaded PDF and OCR its first page.

    Args:
        pdf: The uploaded file, either an object with a ``name`` attribute
            holding the path or the path itself.

    Returns:
        A 6-tuple in the order of the outputs wired to the upload event:
        ``(image, text, page number, extracted_texts, original_images,
        total_pages)``. On failure the image is ``None``, the text carries the
        error message, the lists are empty and both numbers are ``0``.
    """
    # Accept both a file wrapper exposing ``.name`` and a plain path string.
    pdf_path = getattr(pdf, "name", pdf)
    pages = convert_from_path(pdf_path)
    if not pages:
        print("Error: No pages found in PDF")
        # The message belongs in the text slot (second), not the image slot.
        return None, "Error: No pages found in PDF", 0, [], [], 0
    print(f"PDF converted to {len(pages)} images")

    original_images = [np.array(page) for page in pages]
    first_page = original_images[0]
    if first_page.size == 0:
        print("Error: First page is empty")
        return None, "Error: First page is empty", 0, [], [], 0

    text = process_image(first_page)
    extracted_texts = [text]
    return (
        gr.update(value=first_page, height=None, width=None),
        gr.update(value=text),
        1,
        extracted_texts,
        original_images,
        len(pages),
    )

def navigate_to_page(page_num, extracted_texts, original_images):
    """Show the image and cached text of an already loaded page.

    Args:
        page_num: 1-based page number (coerced to int).
        extracted_texts: Per-page OCR cache; may be shorter than
            ``original_images`` or ``None``.
        original_images: Page images, or ``None`` when nothing is loaded.

    Returns:
        ``(image, text, page number)``. For a page outside the loaded range
        the image is ``None`` and the text is ``"Invalid Page Number"``. A
        valid page without cached text is shown with an empty text box.
    """
    page_num = int(page_num)  # Ensure page_num is an integer
    index = page_num - 1

    if not original_images or not 0 <= index < len(original_images):
        # The message goes to the text slot; the image slot cannot show text.
        return None, "Invalid Page Number", page_num

    # A page may not have been OCR'd yet, so do not index the cache blindly.
    has_cached_text = bool(extracted_texts) and index < len(extracted_texts)
    cached_text = extracted_texts[index] if has_cached_text else ""

    return (
        gr.update(value=original_images[index], height=None, width=None),
        gr.update(value=cached_text),
        page_num,
    )

def display_pdf_and_text():
    """Build the Gradio interface for reviewing OCR output page by page.

    The layout holds a PDF upload field, the page image next to an editable
    text box, a page-number field, "Save and Next" / "Skip" buttons, a JSON
    download slot and one navigation button per page.

    Returns:
        The assembled ``gr.Blocks`` object; it is not launched here.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## PDF Viewer and Text Editor")
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        with gr.Row():
            image_output = gr.Image(label="Page Image", type="numpy")
            text_editor = gr.Textbox(label="Extracted Text", lines=10, interactive=True)
        page_num = gr.Number(value=1, label="Page Number", visible=True)

        # Per-session state filled in by upload_pdf.
        extracted_texts = gr.State()
        original_images = gr.State()
        total_pages = gr.State()

        save_next_button = gr.Button("Save and Next")
        skip_button = gr.Button("Skip")
        pdf_input.upload(
            upload_pdf,
            inputs=pdf_input,
            outputs=[image_output, text_editor, page_num, extracted_texts, original_images, total_pages],
        )

        json_download = gr.File(label="Download JSON")
        save_next_button.click(
            fn=save_and_next,
            inputs=[page_num, text_editor, extracted_texts, original_images, total_pages],
            outputs=[text_editor, page_num, image_output, json_download],
        )

        skip_button.click(
            fn=skip_page,
            inputs=[page_num, extracted_texts, original_images, total_pages],
            outputs=[text_editor, page_num, image_output],
        )

        # Components and listeners created inside an ordinary event callback
        # are never rendered, and a bare int is not a valid event input.
        # gr.render is the mechanism for components that depend on state.
        # It is guarded because older Gradio releases lack it; there the page
        # buttons are simply omitted.
        if hasattr(gr, "render"):

            @gr.render(inputs=total_pages)
            def render_page_buttons(page_count):
                if not page_count:
                    return
                with gr.Row():
                    for target in range(1, int(page_count) + 1):
                        button = gr.Button(str(target), variant="primary", size="sm")
                        # Bind the page number as a default so each button
                        # keeps its own value instead of the loop's last one.
                        button.click(
                            lambda texts, images, target=target: navigate_to_page(target, texts, images),
                            inputs=[extracted_texts, original_images],
                            outputs=[image_output, text_editor, page_num],
                        )

    return demo

# Build the interface when the module is executed and start serving it.
iface = display_pdf_and_text()
iface.launch()