import gradio as gr import requests from pypdf import PdfReader import pypdfium2 as pdfium import easyocr ocr_id = { "Afrikaans": "af", "Albanian": "sq", "Arabic": "ar", "Azerbaijani": "az", "Belarusian": "be", "Bulgarian": "bg", "Bengali": "bn", "Bosnian": "bs", "Chinese (simplified)": "ch_sim", "Chinese (traditional)": "ch_tra", "Croatian": "hr", "Czech": "cs", "Danish": "da", "Dutch": "nl", "English": "en", "Estonian": "et", "French": "fr", "German": "de", "Irish": "ga", "Hindi": "hi", "Hungarian": "hu", "Indonesian": "id", "Icelandic": "is", "Italian": "it", "Japanese": "ja", "Kannada": "kn", "Korean": "ko", "Lithuanian": "lt", "Latvian": "lv", "Mongolian": "mn", "Marathi": "mr", "Malay": "ms", "Nepali": "ne", "Norwegian": "no", "Occitan": "oc", "Polish": "pl", "Portuguese": "pt", "Romanian": "ro", "Russian": "ru", "Serbian (cyrillic)": "rs_cyrillic", "Serbian (latin)": "rs_latin", "Slovak": "sk", "Slovenian": "sl", "Spanish": "es", "Swedish": "sv", "Swahili": "sw", "Tamil": "ta", "Thai": "th", "Tagalog": "tl", "Turkish": "tr", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Welsh": "cy", "Zulu": "zu", } def pdf_pil(file_path,page_num): pdf = pdfium.PdfDocument(f"{file_path}") #n_pages = len(pdf) #for page_number in range(n_pages): page = pdf.get_page(page_num) pil_image = page.render_topil( scale=1, rotation=0, crop=(0, 0, 0, 0), colour=(255, 255, 255, 255), annotations=True, greyscale=False, optimise_mode=pdfium.OptimiseMode.NONE, ) #pil_image.save(f"image_{page_num}.png") return pil_image def ocrpdf(file_path,pdf_lang,page_num): img1=pdf_pil(file_path,page_num) lang=[f"{ocr_id[pdf_lang]}"] reader = easyocr.Reader(lang) bounds = reader.readtext(img1) for bound in bounds: print(bound[1]) def scrape(instring): html_src=(f'''