Create app.py
app.py
ADDED
@@ -0,0 +1,691 @@
# -*- coding: utf-8 -*-
"""Solo_descripcion_ripios

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1RYsNm31Nta3rhqrgDbBsBCFcT3l-RZpC
"""

"""# **Description and measurement of drill cuttings using AI**

This work is an adaptation of the code of [A_K_Nain, 2021](https://keras.io/examples/vision/image_captioning/) and of [Sitar, M. & Leary, R., 2023](https://gchron.copernicus.org/articles/5/109/2023/)<br>
**Authors:** Jhoel Ortiz, Christian Mejía & Paola Vargas<br>
**Date created:** 2024/01/06<br>
**Last modified:** 2024/02/15<br>
**Description:** This work implements CNN and Transformer models for describing and measuring images of drill cuttings.

This Google Colab notebook is organized as follows:

**Textual and spoken description of drill-cuttings images**
- Loading and installing libraries
- Processing the image files and captions
- Vectorizing the text data
- Data pipeline for training
- Building the model
- Training the model
- Checking the predictions
- Evaluation with BLEU
- Prediction on external images

**Measurement of drill-cuttings images**
- Loading and installing libraries
- Inspecting the image
- Downloading and initializing the model
- Test evaluation
- Automated processing
- Plotting the automatic results
- Semi-automatic processing
- Plotting the semi-automatic results

# **Textual and spoken description of drill-cuttings images**
This section contains all the steps for developing an AI model that automatically describes drill-cuttings images, in text and speech, by applying an RNN and a Transformer.

## **Loading and installing libraries**
This subsection loads and installs the libraries required for the textual and spoken description of drill-cuttings images.
"""

# Load libraries
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import re
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import keras
from keras import layers
from keras.applications import MobileNetV2
from keras.layers import TextVectorization

keras.utils.set_random_seed(111)

from gtts import gTTS
import gradio as gr  # required by the Gradio interface at the end of the script

"""## **Processing the drill-cuttings images and captions**
The following subsection:
* Loads the drill-cuttings image and text files
* Defines the base characteristics and parameters of the input files
* Splits the dataset into training and validation subsets
"""

IMAGES_PATHS = ["/app3/Data", "/app3/Data1", "/app3/Data2"]
IMAGES_PATH = IMAGES_PATHS[0]  # Use the first element of the list

# Image dimensions
IMAGE_SIZE = (359, 359)

# Vocabulary size
VOCAB_SIZE = 700

# Fixed length for every sequence
SEQ_LENGTH = 400

# Dimensions of the image and token embeddings
EMBED_DIM = 512

# Units per layer in the feed-forward network
FF_DIM = 512

# Other training parameters
BATCH_SIZE = 64
EPOCHS = 1
AUTOTUNE = tf.data.AUTOTUNE

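# Illustrative layout of the captions file parsed below (a hypothetical line;
# the real /app3/ROCAS.token.txt is not included here). Each line holds an
# image name with a `#<n>` suffix, a tab, and the caption, e.g.:
#
#   muestra_01.jpg#0 <tab> ripios de arenisca gris de grano fino
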
def load_captions_data(filename):
    """Loads the captions (text) and maps them to their corresponding images.

    Arguments:
        filename: Path to the text file containing the captions.

    Returns:
        caption_mapping: Dictionary mapping each image name to its corresponding captions.
        text_data: List containing all the available captions.
    """

    with open(filename) as caption_file:
        caption_data = caption_file.readlines()
        caption_mapping = {}
        text_data = []
        images_to_skip = set()

        for line in caption_data:
            line = line.rstrip("\n")
            # The image name is separated from its caption by a tab
            img_name, caption = line.split("\t")
            print(img_name)
            print(caption)

            # Each image name carries a suffix `#img_name.jpg#0`
            img_name = img_name.split("#")[0]
            img_name = os.path.join(IMAGES_PATH, img_name.strip())

            # Tokenize the caption (the too-long/too-short filter of the
            # original Keras example is not applied here, so `images_to_skip`
            # stays empty)
            tokens = caption.strip().split()

            if img_name.endswith("jpg") and img_name not in images_to_skip:
                # A <start> and an <end> token are added to each caption
                caption = "<start> " + caption.strip() + " <end>"
                text_data.append(caption)

                if img_name in caption_mapping:
                    caption_mapping[img_name].append(caption)
                else:
                    caption_mapping[img_name] = [caption]

        for img_name in images_to_skip:
            if img_name in caption_mapping:
                del caption_mapping[img_name]

        return caption_mapping, text_data

def train_val_split(caption_data, train_size=0.8, shuffle=True):
    """Splits the dataset into training and validation subsets.

    Args:
        caption_data (dict): Dictionary containing the mapped captions.
        train_size (float): Fraction of the dataset used as the training subset.
        shuffle (bool): Whether to shuffle the dataset before splitting it.

    Returns:
        The training and validation datasets as two separate dicts.
    """

    # 1. List all the images
    all_images = list(caption_data.keys())

    # 2. Shuffle so the split is random and unbiased
    if shuffle:
        np.random.shuffle(all_images)

    # 3. Split into training and validation sets
    train_size = int(len(caption_data) * train_size)

    training_data = {
        img_name: caption_data[img_name] for img_name in all_images[:train_size]
    }
    validation_data = {
        img_name: caption_data[img_name] for img_name in all_images[train_size:]
    }

    # 4. Return the splits
    return training_data, validation_data

# Load the .txt file of captions
captions_mapping, text_data = load_captions_data("/app3/ROCAS.token.txt")

# Split into training and validation sets
train_data, valid_data = train_val_split(captions_mapping)
print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(valid_data))

"""## **Vectorizing the text data**
This section turns the captions from the text file into vectors,
standardizes the character strings, and augments the images with a fixed set of transformations.
"""

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

strip_chars = "!\"$&'*+-/:<=>?@[\]^_`{|}~"
strip_chars = strip_chars.replace("<", "")
strip_chars = strip_chars.replace(">", "")

# Vectorization of the text data
vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)
vectorization.adapt(text_data)

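# Illustrative check (not in the original script): after `adapt`, the layer maps
# a caption to a fixed-length integer tensor, zero-padded to SEQ_LENGTH. For a
# hypothetical caption,
#   vectorization(["<start> ripios de lutita gris <end>"])
# returns an int64 tensor of shape (1, SEQ_LENGTH) whose trailing entries are 0.
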
# Image augmentation
image_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.2),
        layers.RandomContrast(0.3),
    ]
)

"""## **Data pipeline for training**

Pairs of images and their captions are generated with `tf.data.Dataset`.

The process consists of two stages:

- Reading the image from disk
- Tokenizing each image's captions
"""

def decode_and_resize(img_path):
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMAGE_SIZE)
    img = tf.image.convert_image_dtype(img, tf.float32)
    return img


def process_input(img_path, captions):
    return decode_and_resize(img_path), vectorization(captions)


def make_dataset(images, captions):
    dataset = tf.data.Dataset.from_tensor_slices((images, captions))
    dataset = dataset.shuffle(BATCH_SIZE * 8)
    dataset = dataset.map(process_input, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

    return dataset


# Lists of images and captions
train_dataset = make_dataset(list(train_data.keys()), list(train_data.values()))

valid_dataset = make_dataset(list(valid_data.keys()), list(valid_data.values()))

"""## **Building the model**

The image-captioning pipeline consists of three models:

- A CNN: extracts the features of the images.
- A TransformerEncoder: starting from a model pre-trained to work with rock images, it is in charge of identifying and refining the features of the photos in the database.
- A TransformerDecoder: takes the encoder features and the captions (sequences) as inputs and learns the process of generating image captions.
"""

def get_cnn_model():
    base_model = MobileNetV2(  # alternative backbone, e.g. keras.applications.ResNet50V2
        input_shape=(*IMAGE_SIZE, 3),
        include_top=False,
        weights="imagenet",
    )
    # base_model = tf.keras.models.load_model('/content/gdrive/MyDrive/best_model.h5')
    # base_model.summary()
    # Freeze the feature extractor
    base_model.trainable = False
    base_model_out = base_model.output
    base_model_out = layers.Reshape((-1, base_model_out.shape[-1]))(base_model_out)
    cnn_model = keras.models.Model(base_model.input, base_model_out)
    return cnn_model


class TransformerEncoderBlock(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.0
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.dense_1 = layers.Dense(embed_dim, activation="relu")

    def call(self, inputs, training, mask=None):
        inputs = self.layernorm_1(inputs)
        inputs = self.dense_1(inputs)

        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=None,
            training=training,
        )
        out_1 = self.layernorm_2(inputs + attention_output_1)
        return out_1

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embed_scale = tf.math.sqrt(tf.cast(embed_dim, tf.float32))

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_tokens = embedded_tokens * self.embed_scale
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

class TransformerDecoderBlock(layers.Layer):
    def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
        self.ffn_layer_2 = layers.Dense(embed_dim)

        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

        self.embedding = PositionalEmbedding(
            embed_dim=EMBED_DIM,
            sequence_length=SEQ_LENGTH,
            vocab_size=VOCAB_SIZE,
        )
        self.out = layers.Dense(VOCAB_SIZE, activation="softmax")

        self.dropout_1 = layers.Dropout(0.3)
        self.dropout_2 = layers.Dropout(0.5)
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, training, mask=None):
        inputs = self.embedding(inputs)
        causal_mask = self.get_causal_attention_mask(inputs)

        if mask is not None:
            padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
            combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            combined_mask = tf.minimum(combined_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=combined_mask,
            training=training,
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
            training=training,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        ffn_out = self.ffn_layer_1(out_2)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn_layer_2(ffn_out)

        ffn_out = self.layernorm_3(ffn_out + out_2, training=training)
        ffn_out = self.dropout_2(ffn_out, training=training)
        preds = self.out(ffn_out)
        return preds

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [
                tf.expand_dims(batch_size, -1),
                tf.constant([1, 1], dtype=tf.int32),
            ],
            axis=0,
        )
        return tf.tile(mask, mult)

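# Illustration (not in the original code): for a sequence of length 3, the
# causal mask built by `get_causal_attention_mask` above is lower-triangular,
# so position i may only attend to positions j <= i:
#   [[1, 0, 0],
#    [1, 1, 0],
#    [1, 1, 1]]
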
class ImageCaptioningModel(keras.Model):
    def __init__(
        self,
        cnn_model,
        encoder,
        decoder,
        num_captions_per_image=1,
        image_aug=None,
    ):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.loss_tracker = keras.metrics.Mean(name="loss")
        self.acc_tracker = keras.metrics.Mean(name="accuracy")
        self.num_captions_per_image = num_captions_per_image
        self.image_aug = image_aug

    def calculate_loss(self, y_true, y_pred, mask):
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    def calculate_accuracy(self, y_true, y_pred, mask):
        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    def _compute_caption_loss_and_acc(self, img_embed, batch_seq, training=True):
        encoder_out = self.encoder(img_embed, training=training)
        batch_seq_inp = batch_seq[:, :-1]
        batch_seq_true = batch_seq[:, 1:]
        mask = tf.math.not_equal(batch_seq_true, 0)
        batch_seq_pred = self.decoder(
            batch_seq_inp, encoder_out, training=training, mask=mask
        )
        loss = self.calculate_loss(batch_seq_true, batch_seq_pred, mask)
        acc = self.calculate_accuracy(batch_seq_true, batch_seq_pred, mask)
        return loss, acc

    def train_step(self, batch_data):
        batch_img, batch_seq = batch_data
        batch_loss = 0
        batch_acc = 0

        if self.image_aug:
            batch_img = self.image_aug(batch_img)

        # 1. Get the image embeddings
        img_embed = self.cnn_model(batch_img)

        # 2. Pass each caption through the decoder
        # together with the encoder outputs and compute
        # the loss and the accuracy for every caption
        for i in range(self.num_captions_per_image):
            with tf.GradientTape() as tape:
                loss, acc = self._compute_caption_loss_and_acc(
                    img_embed, batch_seq[:, i, :], training=True
                )

                # 3. Accumulate the loss and the accuracy
                batch_loss += loss
                batch_acc += acc

            # 4. Get the list of trainable weights
            train_vars = (
                self.encoder.trainable_variables + self.decoder.trainable_variables
            )

            # 5. Get the gradients
            grads = tape.gradient(loss, train_vars)

            # 6. Update the trainable weights
            self.optimizer.apply_gradients(zip(grads, train_vars))

        # 7. Update the trackers
        batch_acc /= float(self.num_captions_per_image)
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_acc)

        # 8. Return the loss and accuracy values
        return {
            "loss": self.loss_tracker.result(),
            "acc": self.acc_tracker.result(),
        }

    def test_step(self, batch_data):
        batch_img, batch_seq = batch_data
        batch_loss = 0
        batch_acc = 0

        # 1. Get the image embeddings
        img_embed = self.cnn_model(batch_img)

        # 2. Pass each caption through the decoder
        # together with the encoder outputs and compute
        # the loss and the accuracy for every caption
        for i in range(self.num_captions_per_image):
            loss, acc = self._compute_caption_loss_and_acc(
                img_embed, batch_seq[:, i, :], training=False
            )

            # 3. Accumulate the loss and the accuracy
            batch_loss += loss
            batch_acc += acc

        batch_acc /= float(self.num_captions_per_image)

        # 4. Update the trackers
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_acc)

        # 5. Return the loss and accuracy values
        return {
            "loss": self.loss_tracker.result(),
            "acc": self.acc_tracker.result(),
        }

    @property
    def metrics(self):
        # The metrics must be listed here so that
        # `reset_states()` can be called automatically.
        return [self.loss_tracker, self.acc_tracker]

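# Shape note (illustrative, not in the original script): each dataset element
# yields `batch_seq` of shape (batch, num_captions_per_image, SEQ_LENGTH); with
# num_captions_per_image=1 only the first caption per image feeds the loops above.
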
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)

"""## **Training the model**"""

# Define the loss function
cross_entropy = keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    reduction='none',
)

# Early-stopping criteria
early_stopping = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

# Learning-rate schedule for the optimizer
from tensorflow.keras.optimizers.schedules import LearningRateSchedule

class LRSchedule(LearningRateSchedule):
    def __init__(self, post_warmup_learning_rate, warmup_steps):
        super().__init__()
        self.post_warmup_learning_rate = post_warmup_learning_rate
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Linear warm-up towards the target rate, then a constant rate
        global_step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        warmup_progress = global_step / warmup_steps
        warmup_learning_rate = self.post_warmup_learning_rate * warmup_progress
        return tf.cond(
            global_step < warmup_steps,
            lambda: warmup_learning_rate,
            lambda: self.post_warmup_learning_rate,
        )

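# Worked illustration (not in the original script): with
# post_warmup_learning_rate=1e-4 and warmup_steps=100, the schedule ramps up
# linearly (step 50 -> 5e-5) and holds 1e-4 from step 100 onward.
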
# Create the learning-rate schedule
num_train_steps = len(train_dataset) * EPOCHS
num_warmup_steps = num_train_steps // 15
lr_schedule = LRSchedule(post_warmup_learning_rate=1e-4, warmup_steps=num_warmup_steps)

# Compile the model
caption_model.compile(optimizer=keras.optimizers.Adam(lr_schedule), loss=cross_entropy)

# Train the model
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)

"""### **Option to save the trained model**"""

# This option returns the model weights as a list
pesos = caption_model.get_weights()

# Save those weights in .npy format. They are saved here after a one-epoch
# training run: skipping the fit/training step entirely raises an error, so the
# model always has to be trained for one epoch before its weights can be
# replaced by those of a ten-epoch run.
np.save('/app3/pesos1.npy', np.array(pesos, dtype=object), allow_pickle=True)

# Replace the one-epoch weights with the ten-epoch weights that were trained
# and downloaded earlier
archivo_pesos = os.path.join("/app3", "pesos10.npy")
pesos_nuevos = np.load(archivo_pesos, allow_pickle=True)

caption_model.set_weights(pesos_nuevos)

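# A lighter alternative to the one-epoch fit above (a sketch, not part of the
# original workflow): evaluating a single batch also builds the model's
# variables, after which set_weights() succeeds.
#
#   sample_batch = next(iter(train_dataset))
#   caption_model.test_step(sample_batch)
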
"""## **Checking the predictions**"""

vocab = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(vocab)), vocab))
max_decoded_sentence_length = SEQ_LENGTH - 1
valid_images = list(valid_data.keys())


def generate_caption():
    # Select a random image from the validation set
    sample_img = np.random.choice(valid_images)
    print(sample_img)

    # Read the image from disk
    sample_img = decode_and_resize(sample_img)
    img = sample_img.numpy().clip(0, 255).astype(np.uint8)
    plt.imshow(img)
    plt.show()

    # Pass the image through the CNN
    img = tf.expand_dims(sample_img, 0)
    img = caption_model.cnn_model(img)

    # Pass the image features through the Transformer encoder
    encoded_img = caption_model.encoder(img, training=False)

    # Generate the caption with the Transformer decoder
    decoded_caption = "<start> "
    for i in range(max_decoded_sentence_length):
        tokenized_caption = vectorization([decoded_caption])[:, :-1]
        mask = tf.math.not_equal(tokenized_caption, 0)
        predictions = caption_model.decoder(
            tokenized_caption, encoded_img, training=False, mask=mask
        )
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = index_lookup[sampled_token_index]
        if sampled_token == "<end>":
            break
        decoded_caption += " " + sampled_token

    decoded_caption = decoded_caption.replace("<start> ", "")
    decoded_caption = decoded_caption.replace(" <end>", "").strip()
    print("Predicted Caption: ", decoded_caption)
    return decoded_caption

# Check the predictions for one image from the dataset
Ex_1 = generate_caption()

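# The Gradio interface below needs a function that takes an uploaded image and
# returns a text caption plus an audio file, whereas `generate_caption()` works
# on random validation images. The wrapper below is a minimal sketch (the
# `describe_image` name and the temporary mp3 path are assumptions, not part of
# the original code): it reuses the same greedy decoding loop on the uploaded
# image and voices the caption with gTTS, imported at the top of the script.
def describe_image(input_img):
    # Resize the uploaded image (a numpy array) to the size the CNN expects
    img = tf.image.resize(tf.cast(input_img, tf.float32), IMAGE_SIZE)
    img = caption_model.cnn_model(tf.expand_dims(img, 0))
    encoded_img = caption_model.encoder(img, training=False)

    # Greedy decoding, token by token, until <end> is produced
    decoded_caption = "<start> "
    for i in range(max_decoded_sentence_length):
        tokenized_caption = vectorization([decoded_caption])[:, :-1]
        mask = tf.math.not_equal(tokenized_caption, 0)
        predictions = caption_model.decoder(
            tokenized_caption, encoded_img, training=False, mask=mask
        )
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = index_lookup[sampled_token_index]
        if sampled_token == "<end>":
            break
        decoded_caption += " " + sampled_token
    caption = decoded_caption.replace("<start> ", "").replace(" <end>", "").strip()

    # Voice the caption (Spanish, matching the training captions) and save it
    # where Gradio can read it back
    audio_path = "/tmp/caption.mp3"
    gTTS(text=caption, lang="es").save(audio_path)
    return caption, audio_path
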
demo = gr.Interface(fn=describe_image,
                    inputs=gr.Image(label="Image"),
                    outputs=[gr.Textbox(label="Text description"), gr.Audio(label="Audio")],
                    title='DESCRIPTION OF DRILL-CUTTINGS IMAGES',
                    description="This interface automatically describes images of drill cuttings. Upload the image to be processed in the box on the left; the textual and spoken description of the image appears in the boxes on the right. Uploading images without any measurements or overlaid symbols is recommended, since these could affect the model's prediction.",
                    article='Note: if the submitted images are unrelated to drill-cuttings samples, the authors of this application are not responsible for the results; the drill-cuttings description model is trained to always return an output.')

# Launch the interface
demo.launch()