Spaces:

laverdes
/

Multimodal

Runtime error

App Files Files Community

laverdes committed on Apr 24, 2023

Commit

7c523f8

1 Parent(s): c8f52ac

feat: new experimental app

Browse files

Files changed (1) hide show

app.py +97 -0

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import torch
+import streamlit as st
+from PIL import Image
+from io import BytesIO
+from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig , DonutProcessor
+def run_prediction(sample):
+    global pretrained_model, processor, task_prompt
+    if isinstance(sample, dict):
+        # prepare inputs
+        pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0)
+    else:  # sample is an image
+        # prepare encoder inputs
+        pixel_values = processor(image, return_tensors="pt").pixel_values
+    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+    # run inference
+    outputs = pretrained_model.generate(
+        pixel_values.to(device),
+        decoder_input_ids=decoder_input_ids.to(device),
+        max_length=pretrained_model.decoder.config.max_position_embeddings,
+        early_stopping=True,
+        pad_token_id=processor.tokenizer.pad_token_id,
+        eos_token_id=processor.tokenizer.eos_token_id,
+        use_cache=True,
+        num_beams=1,
+        bad_words_ids=[[processor.tokenizer.unk_token_id]],
+        return_dict_in_generate=True,
+    )
+    # process output
+    prediction = processor.batch_decode(outputs.sequences)[0]
+    # post-processing
+    if "cord" in task_prompt:
+        prediction = prediction.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+        # prediction = re.sub(r"<.*?>", "", prediction, count=1).strip()  # remove first task start token
+    prediction = processor.token2json(prediction)
+    # load reference target
+    if isinstance(sample, dict):
+        target = processor.token2json(sample["target_sequence"])
+    else:
+        target = "<not_provided>"
+    return prediction, target
+task_prompt = f"<s>"
+# logo = Image.open("./img/rsz_unstructured_logo.png")
+# st.image(logo)
+st.markdown('''
+### Donut Common Crawl
+Experimental OCR-free Document Understanding Vision Transformer nicknamed 🍩, fine-tuned with few samples of the common-crawl with some specific document elements.
+''')
+with st.sidebar:
+    information = st.radio(
+    "What information inside the 🧾s are you interested in extracting?",
+    ('Base Common-Crawl 🍩', 'Hierarchical Common-Crawl 🍩'))
+    image_choice = st.selectbox('Pick one 🧾', ['1', '2', '3'], index=1)
+st.text(f'{information} mode is ON!\nTarget 🧾: {image_choice}')  # \n(opening image @:./img/receipt-{receipt}.png)')
+col1, col2 = st.columns(2)
+image_choice_map = {
+    '1': 'commoncrawl_amandalacombznewspolice-bust-man-sawed-oal_1.png',
+    '2': 'commoncrawl_canyonhillschroniclecomtagwomens-basketbll_0.png',
+    '3': 'commoncrawl_celstuttgartdeideaa-different-stort-of-nfe_0.png'
+}
+image = Image.open(image_choice_map[image_choice])
+with col1:
+    st.image(image, caption='Your target sample')
+if st.button('Parse sample! 🐍'):
+    with st.spinner(f'baking the 🍩s...'):
+        if information == 'Base Common-Crawl 🍩':
+            processor = DonutProcessor.from_pretrained("laverdes/donut-commoncrawl")
+            pretrained_model = VisionEncoderDecoderModel.from_pretrained("laverdes/donut-commoncrawl")
+            task_prompt = f"<s>"
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            pretrained_model.to(device)
+        elif information == 'Hierarchical Common-Crawl 🍩':
+            st.info("Not implemented yet...")
+    with col2:
+        st.info(f'parsing 📑...')
+        parsed_info, _ = run_prediction(image)
+        st.text(f'\n{information}')
+        st.json(parsed_info)