Merge branch 'main' of https://huggingface.co/spaces/clip-italian/clip-italian-demo
Changed files:
- app.py +2 -1
- examples.py +12 -9
- home.py +2 -0
- image2text.py +9 -12
- introduction.md +2 -3
- text2image.py +9 -12
app.py CHANGED
@@ -15,7 +15,8 @@ PAGES = {
 st.sidebar.title("Explore our CLIP-Italian demo")
 
 logo = Image.open("static/img/clip_italian_logo.png")
-st.sidebar.image(logo, caption="CLIP-Italian logo")
+st.sidebar.image(logo)
+#, caption="CLIP-Italian logo"
 
 page = st.sidebar.radio("", list(PAGES.keys()))
 PAGES[page].app()
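For context, the hunk only touches the sidebar; the rest of app.py follows the usual Streamlit multipage pattern, where each page module exposes an `app()` callable and the sidebar radio dispatches to it. A minimal sketch of that pattern follows; the `PAGES` dict literal sits above the visible hunk, so its exact keys are an assumption inferred from the page titles elsewhere in this commit.

```python
# Minimal sketch of app.py's dispatch pattern. The PAGES contents are assumed,
# not shown in the hunk; the page modules themselves are in this commit.
import streamlit as st
from PIL import Image

import home
import text2image
import image2text
import examples

PAGES = {
    "Home": home,
    "Text to Image": text2image,
    "Image to Text": image2text,
    "Examples & Applications": examples,
}

st.sidebar.title("Explore our CLIP-Italian demo")

logo = Image.open("static/img/clip_italian_logo.png")
st.sidebar.image(logo)

# Each module implements app(); the radio selection picks which page renders.
page = st.sidebar.radio("", list(PAGES.keys()))
PAGES[page].app()
```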
examples.py CHANGED
@@ -3,15 +3,17 @@ import streamlit as st
 
 
 def app():
-    st.title("Examples & Applications")
+    #st.title("Examples & Applications")
+    st.markdown("<h1 style='text-align: center; color: #CD212A;'> Examples & Applications </h1>", unsafe_allow_html=True)
+    st.markdown("<h2 style='text-align: center; color: #008C45; font-weight:bold;'> Complex Queries -Image Retrieval </h2>", unsafe_allow_html=True)
+
     st.write(
         """
 
-        ## Image Retrieval
 
-        Even though we trained the Italian CLIP model on way less examples than the original
-        OpenAI's CLIP, our training choices and quality datasets led to impressive results!
-        Here, we …
+        Even though we trained the Italian CLIP model on way less examples(~1.4M) than the original
+        OpenAI's CLIP (~400M), our training choices and quality datasets led to impressive results!
+        Here, we present some of **the most impressive text-image associations** learned by our model.
 
         Remember you can head to the **Text to Image** section of the demo at any time to test your own🤌 Italian queries!
 
@@ -19,7 +21,7 @@ def app():
     )
 
     st.markdown("### 1. Actors in Scenes")
-    st.markdown("These examples …
+    st.markdown("These examples were taken from the CC dataset")
 
     st.subheader("una coppia")
     st.markdown("*a couple*")
@@ -39,7 +41,7 @@ def app():
     st.image("static/img/examples/couple_3.jpeg")
 
     st.markdown("### 2. Dresses")
-    st.markdown("These examples …
+    st.markdown("These examples were taken from the Unsplash dataset")
 
     col1, col2 = st.beta_columns(2)
     col1.subheader("un vestito primavrile")
@@ -50,10 +52,11 @@ def app():
     col2.markdown("*a dress for the autumn*")
     col2.image("static/img/examples/vestito_autunnale.png")
 
-    st.markdown("## Image Classification")
+    #st.markdown("## Image Classification")
+    st.markdown("<h2 style='text-align: center; color: #008C45; font-weight:bold;'> Zero Shot Image Classification </h2>", unsafe_allow_html=True)
     st.markdown("We report this cool example provided by the "
                 "[DALLE-mini team](https://github.com/borisdayma/dalle-mini). "
                 "Is the DALLE-mini logo an *avocado* or an armchair (*poltrona*)?")
 
     st.image("static/img/examples/dalle_mini.png")
-    st.markdown("It seems it's half an armchair and half an avocado! We thank the team for the great idea :)")
+    st.markdown("It seems it's half an armchair and half an avocado! We thank the DALLE-mini team for the great idea :)")
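This commit pastes the same inline-HTML header markup into four files, varying only the text, color, and heading level. A small helper, purely hypothetical and not part of the repo, shows how the tricolore styling could be factored out:

```python
# Hypothetical refactor (not in the repo): one helper for the repeated
# centered-header markup used across app pages.
import streamlit as st

def flag_header(text: str, level: int = 1) -> None:
    """Render a centered header in the Italian-flag palette the demo uses."""
    color = "#CD212A" if level == 1 else "#008C45"  # red for h1, green for h2
    weight = "font-weight:bold;" if level == 2 else ""
    st.markdown(
        f"<h{level} style='text-align: center; color: {color}; {weight}'> {text} </h{level}>",
        unsafe_allow_html=True,
    )

flag_header("Examples & Applications", level=1)
flag_header("Complex Queries - Image Retrieval", level=2)
```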
home.py CHANGED
@@ -7,5 +7,7 @@ def read_markdown_file(markdown_file):
 
 
 def app():
+    st.markdown("<h1 style='text-align: center; color: #CD212A;'> CLIP-Italian </h1>", unsafe_allow_html=True)
+
     intro_markdown = read_markdown_file("introduction.md")
     st.markdown(intro_markdown, unsafe_allow_html=True)
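`read_markdown_file` is visible only in the hunk header. A plausible one-liner consistent with how `app()` calls it, sketched here as an assumption since the actual body is outside the diff:

```python
# Hypothetical body for read_markdown_file, inferred from its call site in
# app(); the repo's actual implementation may differ.
from pathlib import Path

def read_markdown_file(markdown_file: str) -> str:
    return Path(markdown_file).read_text(encoding="utf-8")
```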
image2text.py CHANGED
@@ -10,25 +10,22 @@ import gc
 
 
 def app():
-    st.title("From Image to Text")
+    #st.title("From Image to Text")
+    st.markdown("<h1 style='text-align: center; color: #CD212A;'> Zero Shot Image Classification </h1>", unsafe_allow_html=True)
+    st.markdown("<h2 style='text-align: center; color: #008C45; font-weight:bold;'> Image to Text </h2>", unsafe_allow_html=True)
     st.markdown(
         """
 
-
-
-        Here you can find the captions or the labels that are most related to a given image. It is a zero-shot
-        image classification task!
-
-        🤌 Italian mode on! 🤌
+        👋 Ciao! Here you can find the captions or the labels that are most related to a given image.
 
-        …
+        Try typing "gatto" (cat) in the space for label1 and "cane" (dog) in the space for label2 and click
         "classify"!
 
         """
     )
 
     image_url = st.text_input(
-        "…
+        "YOU CAN INPUT THE URL OF AN IMAGE : ",
         value="https://www.petdetective.it/wp-content/uploads/2016/04/gatto-toilette.jpg",
     )
@@ -38,14 +35,14 @@ def app():
 
     with col2:
         captions_count = st.selectbox(
-            "…
+            "NUMBER OF LABELS", options=range(1, MAX_CAP + 1), index=1
         )
-    compute = st.button("…
+    compute = st.button("CLASSIFY")
 
     with col1:
         captions = list()
         for idx in range(min(MAX_CAP, captions_count)):
-            captions.append(st.text_input(f"…
+            captions.append(st.text_input(f"INSERT LABEL {idx+1}"))
 
     if compute:
         captions = [c for c in captions if c != ""]
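What happens after CLASSIFY is pressed lies outside the hunk, but the page implements the standard CLIP zero-shot recipe: embed the image and each typed label, L2-normalize, and take a softmax over the scaled cosine similarities. The sketch below uses generic `encode_image`/`encode_text` stand-ins for the app's CLIP-Italian encoders rather than the repo's actual functions:

```python
# Sketch of CLIP-style zero-shot classification (not the repo's exact code).
# encode_image / encode_text are hypothetical stand-ins for the demo's
# CLIP-Italian encoders, each returning a 1-D feature vector.
import numpy as np

def zero_shot_classify(image, labels, encode_image, encode_text, scale=100.0):
    img = encode_image(image)                             # shape (d,)
    txt = np.stack([encode_text(lab) for lab in labels])  # shape (n, d)
    img = img / np.linalg.norm(img)
    txt = txt / np.linalg.norm(txt, axis=1, keepdims=True)
    logits = scale * (txt @ img)       # scaled cosine similarity per label
    probs = np.exp(logits - logits.max())
    probs = probs / probs.sum()        # softmax over the candidate labels
    return dict(zip(labels, probs))
```

With the suggested inputs ("gatto" vs. "cane" on the default cat photo), "gatto" should dominate the resulting distribution.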
introduction.md CHANGED
@@ -1,11 +1,10 @@
-# CLIP-Italian
 
-CLIP-Italian is a multimodal model trained on …
+CLIP-Italian is a **multimodal** model trained on **~1.4 Million** Italian text-image pairs using **Italian Bert** model as text encoder and Vision Transformer **ViT** as image encoder using the **JAX/Flax** neural network library. The training was carried out during the **Hugging Face** Community event on **Google's TPU** machines, sponsored by **Google Cloud**.
 
 Clip-Italian (Contrastive Language-Image Pre-training in Italian language) is based on OpenAI's CLIP ([Radford et al., 2021](https://arxiv.org/abs/2103.00020))which is an amazing model that can learn to represent images and text jointly in the same space.
 
 In this project, we aim to propose the first CLIP model trained on Italian data, that in this context can be considered a
-low resource language. Using a few techniques, we have been able to fine-tune a SOTA Italian CLIP model with **only 1.…
+low resource language. Using a few techniques, we have been able to fine-tune a SOTA Italian CLIP model with **only 1.4M** training samples. Our Italian CLIP model
 is built upon the pre-trained [Italian BERT](https://huggingface.co/dbmdz/bert-base-italian-xxl-cased) model provided by [dbmdz](https://huggingface.co/dbmdz) and the OpenAI
 [vision transformer](https://huggingface.co/openai/clip-vit-base-patch32).
 
text2image.py CHANGED
@@ -108,23 +108,20 @@ headers = {
 def app():
 
     #st.title("From Text to Image")
-    st.markdown("<h1 style='text-align: center; color: #CD212A;'>Image Retrieval</h1>", unsafe_allow_html=True)
-    st.markdown("<h2 style='text-align: center; color: #008C45;font-weight:bold;'>Text to Image</h2>", unsafe_allow_html=True)
+    st.markdown("<h1 style='text-align: center; color: #CD212A;'> Image Retrieval </h1>", unsafe_allow_html=True)
+    st.markdown("<h2 style='text-align: center; color: #008C45; font-weight:bold;'> Text to Image </h2>", unsafe_allow_html=True)
     st.markdown(
         """
 
-
-
-        Here you can search for ~150.000 images in the Conceptual Captions dataset (CC) or in the Unsplash 25k Photos dataset.
-        Even though we did not train on any of these images you will see most queries make sense. When you see errors, there might be two possibilities:
-        the model is answering in a wrong way or the image you are looking for are not in the dataset and the model is giving you the best answer it can get.
-
+
+        👋 Ciao! Here you can type Italian query and search from ~150k images in the Conceptual Captions (CC) dataset or 25k Photos in the Unsplash dataset.
 
+        Though these images were not used for training the model, you will see most queries make sense.
 
-
-
+        Rare errors might be due to 2 possibilities:
+        a)The model is answering in a wrong way or b) the image you are looking for are not in the dataset & the model is giving you the best answer it can get.
 
-        You can choose one of …
+        You can choose from one of the following examples :
         """
     )
@@ -160,7 +157,7 @@
 
     col1, col2 = st.beta_columns([3, 1])
     with col1:
-        query = st.text_input("…
+        query = st.text_input("OR INSERT AN ITALIAN QUERY TEXT : ")
     with col2:
         dataset_name = st.selectbox("IR dataset", ["CC", "Unsplash"])
 
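Behind the query box, retrieval over the CC or Unsplash collection reduces to ranking precomputed image embeddings against the encoded Italian query. A sketch under the assumption that the image features live in one matrix; `encode_text` and `image_vectors` are stand-ins, not the repo's actual names:

```python
# Sketch of text-to-image retrieval over a precomputed embedding matrix.
# encode_text and image_vectors are hypothetical stand-ins for the demo's
# CLIP-Italian text encoder and its ~150k-row CC / 25k-row Unsplash index.
import numpy as np

def top_k_images(query: str, encode_text, image_vectors: np.ndarray, k: int = 5):
    q = encode_text(query)
    q = q / np.linalg.norm(q)
    v = image_vectors / np.linalg.norm(image_vectors, axis=1, keepdims=True)
    scores = v @ q                  # cosine similarity of every image to the query
    return np.argsort(-scores)[:k]  # indices of the k best-matching images
```

Because the index is precomputed, each query costs one text-encoder forward pass plus a single matrix-vector product, which is what keeps the demo interactive.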