# Hugging Face Spaces app: Model-V3 document-analysis demo
# (scrape residue "Spaces: Sleeping Sleeping" removed — it was a status banner, not code)
| # import re | |
| # import gradio as gr | |
| # import torch | |
| # from transformers import DonutProcessor, VisionEncoderDecoderModel | |
| # processor = DonutProcessor.from_pretrained("pacman2223/univ-docu-model-v3") | |
| # model = VisionEncoderDecoderModel.from_pretrained("pacman2223/univ-docu-model-v3") | |
| # device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # model.to(device) | |
| # def process_document(image, question): | |
| # # prepare encoder inputs | |
| # pixel_values = processor(image, return_tensors="pt").pixel_values | |
| # # prepare decoder inputs | |
| # task_prompt = "{user_input}" | |
| # prompt = task_prompt.replace("{user_input}", question) | |
| # decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids | |
| # # generate answer | |
| # outputs = model.generate( | |
| # pixel_values.to(device), | |
| # decoder_input_ids=decoder_input_ids.to(device), | |
| # max_length=model.decoder.config.max_position_embeddings, | |
| # early_stopping=True, | |
| # pad_token_id=processor.tokenizer.pad_token_id, | |
| # eos_token_id=processor.tokenizer.eos_token_id, | |
| # use_cache=True, | |
| # num_beams=1, | |
| # bad_words_ids=[[processor.tokenizer.unk_token_id]], | |
| # return_dict_in_generate=True, | |
| # ) | |
| # # postprocess | |
| # sequence = processor.batch_decode(outputs.sequences)[0] | |
| # sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") | |
| # sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token | |
| # return processor.token2json(sequence) | |
| # description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below." | |
| # article = "<p style='text-align: center'>Model-V3</p>" | |
| # demo = gr.Interface( | |
| # fn=process_document, | |
| # inputs=["image", "text"], | |
| # outputs="json", | |
| # title="Demo: Model-V3 for Document Analysis", | |
| # description=description, | |
| # article=article, | |
| # examples=[["example_1.png", "What is the title shown?"], ["example_2.png", "When is mid semester exams?"]], | |
| # cache_examples=False) | |
| # demo.queue(max_size=5) | |
| # demo.launch() | |
| # import re | |
| # import gradio as gr | |
| # import torch | |
| # from transformers import DonutProcessor, VisionEncoderDecoderModel | |
| # import fitz # PyMuPDF | |
| # from PIL import Image | |
| # import io | |
| # processor = DonutProcessor.from_pretrained("pacman2223/univ-docu-model-v3") | |
| # model = VisionEncoderDecoderModel.from_pretrained("pacman2223/univ-docu-model-v3") | |
| # device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # model.to(device) | |
| # def pdf_to_images(pdf_file): | |
| # if pdf_file is None: | |
| # return None | |
| # pdf_path = pdf_file.name # Get the file path | |
| # images = [] | |
| # try: | |
| # doc = fitz.open(pdf_path) | |
| # for page in doc: | |
| # pix = page.get_pixmap() | |
| # img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) | |
| # images.append(img) | |
| # return images | |
| # except Exception as e: | |
| # print(f"Error converting PDF: {e}") | |
| # return None | |
| # def process_document(pdf_file, page_number, question): | |
| # if pdf_file is None: | |
| # return "Please upload a PDF file." | |
| # images = pdf_to_images(pdf_file) | |
| # if images is None: | |
| # return "Failed to process the PDF file." | |
| # if page_number < 1 or page_number > len(images): | |
| # return f"Invalid page number. The PDF has {len(images)} pages." | |
| # image = images[page_number - 1] | |
| # # prepare encoder inputs | |
| # pixel_values = processor(image, return_tensors="pt").pixel_values | |
| # # prepare decoder inputs | |
| # task_prompt = "{user_input}" | |
| # prompt = task_prompt.replace("{user_input}", question) | |
| # decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids | |
| # # generate answer | |
| # outputs = model.generate( | |
| # pixel_values.to(device), | |
| # decoder_input_ids=decoder_input_ids.to(device), | |
| # max_length=model.decoder.config.max_position_embeddings, | |
| # early_stopping=True, | |
| # pad_token_id=processor.tokenizer.pad_token_id, | |
| # eos_token_id=processor.tokenizer.eos_token_id, | |
| # use_cache=True, | |
| # num_beams=1, | |
| # bad_words_ids=[[processor.tokenizer.unk_token_id]], | |
| # return_dict_in_generate=True, | |
| # ) | |
| # # postprocess | |
| # sequence = processor.batch_decode(outputs.sequences)[0] | |
| # sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") | |
| # sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token | |
| # return processor.token2json(sequence) | |
| # def update_page_preview(pdf_file, page_number): | |
| # if pdf_file is None: | |
| # return None | |
| # images = pdf_to_images(pdf_file) | |
| # if images is None or page_number < 1 or page_number > len(images): | |
| # return None | |
| # return images[page_number - 1] | |
| # # def update_page_slider(pdf_file): | |
| # # if pdf_file is None: | |
| # # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number") | |
| # # images = pdf_to_images(pdf_file) | |
| # # if images is None: | |
| # # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number") | |
| # # return gr.Slider(minimum=1, maximum=len(images), value=1, step=1, label="Page Number") | |
| # description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, upload a PDF file, select a page number, type a question, and click 'submit'." | |
| # article = "<p style='text-align: center'>Model-V3</p>" | |
| # with gr.Blocks() as demo: | |
| # gr.Markdown("# Demo: Model-V3 for Document Analysis") | |
| # gr.Markdown(description) | |
| # with gr.Row(): | |
| # with gr.Column(scale=1): | |
| # pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"]) | |
| # page_slider = gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number") | |
| # with gr.Column(scale=2): | |
| # page_preview = gr.Image(label="Page Preview") | |
| # question_input = gr.Textbox(label="Question") | |
| # submit_button = gr.Button("Submit") | |
| # output = gr.JSON(label="Output") | |
| # def update_interface(pdf_file): | |
| # if pdf_file is None: | |
| # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number"), None | |
| # images = pdf_to_images(pdf_file) | |
| # if images is None: | |
| # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number"), None | |
| # return ( | |
| # gr.Slider(minimum=1, maximum=len(images), value=1, step=1, label="Page Number"), | |
| # images[0] # Show the first page by default | |
| # ) | |
| # pdf_input.change(update_interface, inputs=[pdf_input], outputs=[page_slider, page_preview]) | |
| # page_slider.change(update_page_preview, inputs=[pdf_input, page_slider], outputs=[page_preview]) | |
| # submit_button.click(process_document, inputs=[pdf_input, page_slider, question_input], outputs=[output]) | |
| # demo.launch() | |
| import re | |
| import gradio as gr | |
| import torch | |
| from transformers import DonutProcessor, VisionEncoderDecoderModel | |
| import fitz # PyMuPDF | |
| from PIL import Image | |
| import io | |
# Select the compute device up front: prefer CUDA, fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fine-tuned Donut checkpoint used for document visual question answering,
# plus its matching processor (image preprocessing + tokenizer).
processor = DonutProcessor.from_pretrained("pacman2223/univ-docu-model-v3")
model = VisionEncoderDecoderModel.from_pretrained("pacman2223/univ-docu-model-v3")
model.to(device)
def pdf_to_images(pdf_file):
    """Render every page of an uploaded PDF to a PIL image.

    Args:
        pdf_file: A gradio file object (exposing the temp-file path via its
            ``.name`` attribute) or a plain filesystem path string.
            ``None`` is tolerated and yields ``None``.

    Returns:
        list of ``PIL.Image.Image`` rendered pages, or ``None`` when no file
        was given or the PDF could not be opened/rendered.
    """
    if pdf_file is None:
        return None
    # gradio's File component passes an object whose .name is the temp path;
    # also accept a bare path string so the helper can be called directly.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    images = []
    try:
        # Bug fix: the document was never closed, leaking a file handle per
        # upload. The context manager closes it even when rendering fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                images.append(img)
        return images
    except Exception as e:
        # Best-effort conversion: report and signal failure to the caller.
        print(f"Error converting PDF: {e}")
        return None
def process_document(file, page_number, question, input_type):
    """Answer a natural-language question about one page of a document.

    Args:
        file: gradio file object (``.name`` holds the temp path) for the
            uploaded PDF or image; ``None`` when nothing was uploaded.
        page_number: 1-based page selector (meaningful only for PDFs). May
            arrive as a float from the gradio slider.
        question: the question to ask about the selected page.
        input_type: "PDF" or "Image", selecting how ``file`` is interpreted.

    Returns:
        The model's answer parsed via ``processor.token2json``, or a plain
        error string when the input is unusable.
    """
    if file is None:
        return "Please upload a file."
    # Bug fix: gradio sliders can deliver floats; a float index would raise
    # TypeError when selecting the page, so coerce to int first.
    page_number = int(page_number)
    if input_type == "PDF":
        images = pdf_to_images(file)
        if images is None:
            return "Failed to process the PDF file."
        if page_number < 1 or page_number > len(images):
            return f"Invalid page number. The PDF has {len(images)} pages."
        image = images[page_number - 1]
    else:  # Image
        # Normalize to RGB so palette/RGBA uploads don't trip the processor.
        image = Image.open(file.name).convert("RGB")
    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values
    # prepare decoder inputs: the prompt is just the raw question
    prompt = "{user_input}".replace("{user_input}", question)
    decoder_input_ids = processor.tokenizer(
        prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids
    # generate answer (inference only — no gradients needed)
    with torch.no_grad():
        outputs = model.generate(
            pixel_values.to(device),
            decoder_input_ids=decoder_input_ids.to(device),
            max_length=model.decoder.config.max_position_embeddings,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
            num_beams=1,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
            return_dict_in_generate=True,
        )
    # postprocess: drop eos/pad tokens, then the leading task-start token
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    return processor.token2json(sequence)
def update_page_preview(file, page_number, input_type):
    """Return the PIL image to show in the preview pane.

    Args:
        file: gradio file object, or ``None`` when nothing is uploaded.
        page_number: 1-based page index (PDF mode); may arrive as a float
            from the gradio slider.
        input_type: "PDF" or "Image".

    Returns:
        A PIL image for the selected page/file, or ``None`` when it cannot
        be produced.
    """
    if file is None:
        return None
    if input_type == "PDF":
        images = pdf_to_images(file)
        # Bug fix: coerce the slider value — a float index raises TypeError.
        page = int(page_number)
        if images is None or page < 1 or page > len(images):
            return None
        return images[page - 1]
    else:  # Image
        # Bug fix: guard Image.open so a non-image upload cannot crash the
        # event handler; the preview is simply cleared instead.
        try:
            return Image.open(file.name)
        except Exception as e:
            print(f"Error opening image: {e}")
            return None
# User-facing copy for the demo page (kept verbatim).
description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, upload a PDF or image file, select a page number (for PDF), type a question, and click 'submit'."
article = "<p style='text-align: center'>Model-V3</p>"

with gr.Blocks() as demo:
    gr.Markdown("# Demo: Model-V3 for Document Analysis")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(["PDF", "Image"], label="Input Type", value="PDF")
            # Bug fix: restrict the initial picker to PDFs so it matches the
            # default "PDF" input type (previously any file was accepted
            # until the radio was toggled at least once).
            file_input = gr.File(label="Upload File", file_types=[".pdf"])
            page_slider = gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number (PDF only)")
        with gr.Column(scale=2):
            page_preview = gr.Image(label="Page/Image Preview")
    question_input = gr.Textbox(label="Question")
    submit_button = gr.Button("Submit")
    output = gr.JSON(label="Output")

    def update_interface(file, input_type):
        """Re-configure the page slider and preview after an upload."""
        hidden_slider = gr.Slider(visible=False, minimum=1, maximum=1, value=1, step=1, label="Page Number (PDF only)")
        if file is None:
            return hidden_slider, None
        if input_type == "PDF":
            images = pdf_to_images(file)
            if images is None:
                return hidden_slider, None
            return (
                gr.Slider(visible=True, minimum=1, maximum=len(images), value=1, step=1, label="Page Number (PDF only)"),
                images[0],  # Show the first page by default
            )
        # Image mode: no paging. Bug fix: guard Image.open so a bad upload
        # cannot crash the change handler (previously unguarded).
        try:
            preview = Image.open(file.name)
        except Exception as e:
            print(f"Error opening image: {e}")
            preview = None
        return hidden_slider, preview

    # Swap the accepted file types whenever the input type changes.
    input_type.change(
        lambda x: gr.File(label="Upload File", file_types=[".pdf"] if x == "PDF" else ["image/*"]),
        inputs=[input_type],
        outputs=[file_input],
    )
    file_input.change(update_interface, inputs=[file_input, input_type], outputs=[page_slider, page_preview])
    page_slider.change(update_page_preview, inputs=[file_input, page_slider, input_type], outputs=[page_preview])
    submit_button.click(process_document, inputs=[file_input, page_slider, question_input, input_type], outputs=[output])

demo.launch()