# Source: Hugging Face Spaces page (Space status at capture time: Sleeping).
| import gradio as gr | |
| import requests | |
| from pypdf import PdfReader | |
| import pypdfium2 as pdfium | |
| import easyocr | |
# Maps the language names offered in the "PDF Language" dropdown to the
# language codes handed to ``easyocr.Reader``. Insertion order is the order in
# which the choices are listed in the UI, so keep it stable.
# NOTE(review): confirm every code below is accepted by the installed easyocr
# version; an unsupported code only fails when the reader is created.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
def pdf_pil(file_path, page_num, up_scale):
    """Render one page of a PDF to a PNG file and return the PNG's path.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number (anything ``int()`` accepts).
        up_scale: Render scale factor; 1 corresponds to 72 dpi. Fractional
            values are honoured.

    Returns:
        The name of the written image, ``image_<page_num>.png``, relative to
        the current working directory.

    Raises:
        IndexError: If ``page_num`` does not refer to a page of the document.
    """
    # Open the document the caller asked for instead of a hard-coded name.
    pdf = pdfium.PdfDocument(file_path)

    page_index = int(page_num) - 1
    if not 0 <= page_index < len(pdf):
        raise IndexError(
            f"page {page_num} is out of range (document has {len(pdf)} pages)"
        )

    page = pdf.get_page(page_index)
    bitmap = page.render(
        # float, not int: int() truncated slider values such as 0.5 to a
        # scale of 0 and 1.5 to 1.
        scale=float(up_scale),
        rotation=0,  # no additional rotation
    )
    pil_image = bitmap.to_pil()

    out_path = f"image_{page_num}.png"
    pil_image.save(out_path)
    return out_path
def ocrpdf(file_path, pdf_lang, page_num, sent_wid, contrast_det, up_scale):
    """Run OCR on one PDF page and return the detected text.

    The page is first rendered to an image with ``pdf_pil``; that image is
    then read with an ``easyocr.Reader`` created for the selected language.

    Args:
        file_path: Path of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: Key of ``ocr_id`` naming the OCR language.
        page_num: 1-based page number, forwarded to ``pdf_pil``.
        sent_wid: Passed to ``readtext`` as ``width_ths``.
        contrast_det: Passed to ``readtext`` as ``contrast_ths``.
        up_scale: Render scale, forwarded to ``pdf_pil``.

    Returns:
        The second element of every detection, each preceded by ``" \\n"``;
        an empty string when nothing was detected.
    """
    page_image = pdf_pil(file_path, page_num, up_scale)
    ocr_reader = easyocr.Reader([f"{ocr_id[pdf_lang]}"])
    detections = ocr_reader.readtext(
        page_image,
        width_ths=sent_wid,
        contrast_ths=contrast_det,
    )
    # Index 1 of a detection is used as its text (presumably easyocr's
    # (box, text, confidence) result layout).
    return "".join(f" \n{detection[1]}" for detection in detections)
def scrape(instring):
    """Build the embedded Google Docs viewer for a PDF URL.

    Args:
        instring: URL of the PDF as typed by the user.

    Returns:
        A ``gr.HTML`` update whose value is an ``<iframe>`` pointing the
        Google Docs viewer at the given URL.
    """
    from urllib.parse import quote

    # The URL is user input placed inside both a query string and an HTML
    # attribute. Percent-encoding everything keeps characters such as
    # '&', '?', '#' and '"' from truncating the viewer's ``url`` parameter
    # or breaking out of the ``src`` attribute.
    encoded_url = quote(instring or "", safe="")
    html_src = (f'''
<div style="text-align:center">
<h4>PDF Viewer</h4>
<iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true" frameborder="0" height="1200px" width="100%"></iframe>
</div>''')
    return gr.HTML.update(html_src)
def scrape00(instring, page_num, pdf_lang, sent_wid, contrast_det, up_scale):
    """Download a PDF, extract the text of one page and summarize it.

    This is a generator used as a Gradio event handler: every ``yield`` is a
    ``(page text, summary, status markdown)`` triple for the three output
    components. Native text extraction with ``pypdf`` is tried first; if the
    page has no extractable text or summarizing it fails, the page is OCR'd
    with ``ocrpdf`` and summarized again.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number to process.
        pdf_lang: Key of ``ocr_id`` naming the OCR language.
        sent_wid: OCR ``width_ths`` setting, forwarded to ``ocrpdf``.
        contrast_det: OCR ``contrast_ths`` setting, forwarded to ``ocrpdf``.
        up_scale: Render scale for OCR, forwarded to ``ocrpdf``.

    Yields:
        Progress updates with empty text/summary, then a final triple whose
        status is "Complete" on success or "Error" on failure.
    """
    yield (None, None, gr.Markdown.update("""<h3> Trying Native Text Detection"""))

    # Download. A timeout keeps a dead server from blocking a queue worker.
    try:
        response = requests.get(instring, timeout=60)
    except requests.RequestException as err:
        print(err)
        yield (None, None, gr.Markdown.update("""<h3> Error"""))
        return
    if response.status_code != 200:
        # Stop here: continuing would parse a stale or missing data.pdf.
        print(response.status_code)
        yield (None, None, gr.Markdown.update("""<h3> Error"""))
        return

    # NOTE: the file name is fixed, so requests handled concurrently
    # overwrite each other's download.
    with open("data.pdf", "wb") as f:
        f.write(response.content)

    # Native text extraction.
    try:
        reader = PdfReader("data.pdf")
        number_of_pages = len(reader.pages)
        page_index = int(page_num) - 1
        if not 0 <= page_index < number_of_pages:
            raise IndexError(
                f"page {page_num} is out of range "
                f"(document has {number_of_pages} pages)"
            )
        text = reader.pages[page_index].extract_text() or ""
    except Exception as err:  # UI boundary: report instead of crashing
        print(err)
        yield (None, None, gr.Markdown.update("""<h3> Error"""))
        return
    print(text)

    summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")

    sum_out = None
    summarized = False
    if text.strip():
        try:
            sum_out = summarizer(text)
            summarized = True
        except Exception as err:
            print(err)

    if not summarized:
        # No usable native text (e.g. a scanned page) or the summary failed.
        yield (None, None, gr.Markdown.update("""<h3> Trying OCR Text Detection"""))
        try:
            text = ocrpdf("data.pdf", pdf_lang, page_num, sent_wid, contrast_det, up_scale)
            sum_out = summarizer(text)
        except Exception as err:
            print(err)
            yield (text, "Error", gr.Markdown.update("""<h3> Error"""))
            return

    # The result must be yielded: a generator's return value never reaches
    # the output components.
    yield (text, sum_out, gr.Markdown.update("""<h3> Complete"""))
# User interface: a URL/page-number row shared by two tabs, one that embeds
# the PDF in a viewer and one that extracts and summarizes a page.
# NOTE(review): the source was flattened, so the nesting below is
# reconstructed from the component order; confirm against the deployed layout.
with gr.Blocks() as app:
    gr.Markdown('''<h1>PDF Viewer''')

    # Inputs shared by both tabs.
    with gr.Row():
        inp = gr.Textbox(label="PDF URL", scale=3)
        pg_num = gr.Number(label="Page Number", value=1, precision=0, scale=1)

    with gr.Tab("View PDF"):
        go_btn = gr.Button("Load PDF")
        outp = gr.HTML()

    with gr.Tab("Summarize"):
        # Doubles as the status line updated by scrape00.
        mes = gr.Markdown("""<h3> Summarize Text in PDF""")
        with gr.Row():
            # OCR tuning controls.
            with gr.Box():
                sent_wid = gr.Slider(
                    0.1, 3, step=0.1, value=1, label="Horizontal Word Space"
                )
                contrast_det = gr.Slider(
                    0.1, 1, step=0.1, value=0.1, label="Contrast Threshold"
                )
                up_scale = gr.Slider(
                    0.1, 5, step=0.1, value=1, label="PDF to Image Scale"
                )
            with gr.Column():
                target_lang = gr.Dropdown(
                    label="PDF Language",
                    choices=list(ocr_id.keys()),
                    value="English",
                )
                sum_btn = gr.Button("Summarize")
        with gr.Row():
            text_out = gr.Textbox()
            sum_out = gr.Textbox()

    # Event wiring.
    go_btn.click(fn=scrape, inputs=inp, outputs=outp)
    sum_btn.click(
        fn=scrape00,
        inputs=[inp, pg_num, target_lang, sent_wid, contrast_det, up_scale],
        outputs=[text_out, sum_out, mes],
    )

app.queue(concurrency_count=10).launch()