|
|
|
|
|
import json |
|
|
import requests |
|
|
import matplotlib.pyplot as plt |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
def cargar_modalidades_tareas():
    """Load the modality/task catalog from ``modalidades_tareas.json``.

    Returns:
        dict: the parsed catalog, or an empty dict when the file does not
        exist yet (first run).
    """
    try:
        # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
        # Windows) can mangle accented Spanish text stored in the catalog.
        with open("modalidades_tareas.json", "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError:
        return {}
|
|
|
|
|
def actualizar_modalidades_tareas_desde_huggingface():
    """Refresh the local modality/task catalog from the Hugging Face Hub.

    For each task tag in a fixed list, fetches up to 5 datasets, records each
    dataset's feature columns and license, persists the merged catalog to
    ``modalidades_tareas.json``, and returns the known task keys.

    Returns:
        list[str]: task keys present in the catalog after the update.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    MODALIDADES = ["text-classification", "image-classification", "speech-recognition"]
    MODALIDAD_TAREAS = cargar_modalidades_tareas()

    # NOTE(review): this writer stores {"nombre", "columnas", "datasets"} per
    # task, but generar_grafica_barras/generar_encabezado read
    # datos["tareas"] and datos["columnas_generales"] — the two schemas look
    # inconsistent; confirm which layout is canonical.
    for task in MODALIDADES:
        # Explicit timeout: without one a stalled connection hangs forever.
        response = requests.get(
            f"https://huggingface.co/api/datasets?task={task}&full=true&limit=5",
            timeout=30,
        ).json()

        for dataset in response:
            dataset_id = dataset["id"]
            dataset_info = requests.get(
                f"https://huggingface.co/api/datasets/{dataset_id}",
                timeout=30,
            ).json()

            if task not in MODALIDAD_TAREAS:
                MODALIDAD_TAREAS[task] = {
                    "nombre": task.replace("-", " ").capitalize(),
                    "columnas": list(dataset_info.get("features", {}).keys()),
                    "datasets": {}
                }

            MODALIDAD_TAREAS[task]["datasets"][dataset_id] = {
                "columnas": list(dataset_info.get("features", {}).keys()),
                # "license" may be absent from the search payload — TODO
                # confirm whether the Hub puts it under cardData instead.
                "licencia": dataset.get("license", "unknown")
            }

    # Explicit UTF-8 so accented task/dataset names round-trip on any OS.
    with open("modalidades_tareas.json", "w", encoding="utf-8") as file:
        json.dump(MODALIDAD_TAREAS, file, indent=4)

    return list(MODALIDAD_TAREAS.keys())
|
|
|
|
|
|
|
|
def generar_grafica_barras(tareas_seleccionadas, MODALIDAD_TAREAS):
    """Render a horizontal bar chart of selected tasks per modality.

    Args:
        tareas_seleccionadas: task names chosen by the user.
        MODALIDAD_TAREAS: mapping modality -> data dict with a "tareas" key.

    Returns:
        matplotlib.figure.Figure: the rendered chart.

    Raises:
        ValueError: wrapping any error raised while counting or plotting.
    """
    try:
        # For every modality, count how many of the chosen tasks it contains.
        conteo = {
            modalidad: sum(1 for tarea in tareas_seleccionadas if tarea in datos["tareas"])
            for modalidad, datos in MODALIDAD_TAREAS.items()
        }

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(list(conteo.keys()), list(conteo.values()), color='skyblue')
        ax.set_xlabel('Cantidad de Tareas Seleccionadas')
        ax.set_ylabel('Modalidades')
        ax.set_title('Distribución de Tareas por Modalidad')
        ax.invert_yaxis()
        return fig

    except Exception as e:
        raise ValueError(f"Error al generar gráfica: {str(e)}")
|
|
|
|
|
|
|
|
def generar_encabezado(tareas_seleccionadas):
    """Build the comma-separated CSV header for the selected tasks.

    Collects, per selected task, the general columns of every modality that
    contains it plus the task-specific columns, deduplicated in first-seen
    order, then stably sorts them so names containing "input", "output" and
    "label" move toward the end.

    Args:
        tareas_seleccionadas: task names chosen by the user.

    Returns:
        str: header starting with "id" followed by the ordered columns.

    Raises:
        ValueError: when no task was selected.
    """
    MODALIDAD_TAREAS = cargar_modalidades_tareas()

    if not tareas_seleccionadas:
        raise ValueError("Selecciona al menos una tarea.")

    vistas = set()
    recolectadas = []

    def _acumular(cols):
        # Append each column once, preserving first-appearance order.
        for col in cols:
            if col not in vistas:
                vistas.add(col)
                recolectadas.append(col)

    for tarea in tareas_seleccionadas:
        for datos in MODALIDAD_TAREAS.values():
            if tarea in datos["tareas"]:
                _acumular(datos["columnas_generales"])
                _acumular(datos["tareas"][tarea])

    # sorted() is stable, so ties keep their first-seen order. False sorts
    # before True, pushing input/output/label columns toward the end.
    columnas_ordenadas = ["id"] + sorted(
        recolectadas,
        key=lambda col: ("input" in col, "output" in col, "label" in col),
    )

    return ",".join(columnas_ordenadas)
|
|
|
|
|
|
|
|
def buscar_datasets(tareas_seleccionadas, filtro_tamaño, filtro_licencia):
    """Search the Hugging Face Hub for datasets matching the selected tasks.

    Args:
        tareas_seleccionadas: iterable of task tags to search for.
        filtro_tamaño: size-category filter, lowercase (falsy disables it).
        filtro_licencia: license filter, lowercase (falsy disables it).

    Returns:
        list[tuple[str, str]]: (dataset_id, display label) pairs for datasets
        that pass the filters and expose a "features" description.

    Raises:
        ValueError: wrapping any unexpected failure, including network errors
        on the initial search request.
    """
    try:
        query = "+".join([f"task:{tarea}" for tarea in tareas_seleccionadas])
        url = f"https://huggingface.co/api/datasets?search={query}&sort=downloads"
        # Explicit timeout so a stalled connection cannot hang the search.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        datasets = response.json()

        datasets_utiles = []
        for dataset in datasets:
            try:
                if filtro_licencia and dataset.get("license", "").lower() != filtro_licencia:
                    continue
                # NOTE(review): "size_categories" on the Hub is often a list,
                # in which case .lower() would raise — confirm payload shape.
                if filtro_tamaño and dataset.get("size_categories", "").lower() != filtro_tamaño:
                    continue

                dataset_info = requests.get(
                    f"https://huggingface.co/api/datasets/{dataset['id']}",
                    timeout=30,
                ).json()
                if "features" in dataset_info:
                    # .get() here: a missing "tags"/"description" previously
                    # raised KeyError and aborted the whole search via the
                    # outer handler; now the entry degrades gracefully.
                    datasets_utiles.append(
                        (dataset['id'],
                         f"{dataset['id']} ({dataset.get('tags', [])}) - {dataset.get('description', '')}")
                    )

            except requests.exceptions.RequestException:
                # Skip datasets whose detail request fails; keep the rest.
                continue

        return datasets_utiles

    except Exception as e:
        raise ValueError(f"Error al buscar datasets: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
def generar_dataset(encabezado, datasets_seleccionados, pagina_actual=1, filas_por_pagina=5):
    """Assemble a CSV-like string from the selected Hugging Face datasets.

    Args:
        encabezado: comma-separated column header (e.g. from generar_encabezado()).
        datasets_seleccionados: iterable of Hugging Face dataset ids to load.
        pagina_actual: 1-based page number used to slice rows.
        filas_por_pagina: number of rows taken per dataset page.

    Returns:
        str: the header line followed by one comma-joined line per extracted
        row, or an inline "Error en <id>: ..." marker for a failing dataset.

    Raises:
        ValueError: wrapping any error raised outside the per-dataset handler.
    """
    try:
        columnas = encabezado.split(",")
        filas = []

        for dataset_id in datasets_seleccionados:
            try:
                # Network/disk access: downloads the dataset's train split.
                dataset = load_dataset(dataset_id, split="train")
                features = dataset.features

                # Build one accessor per output column. The default-argument
                # trick (c=col) freezes the loop variable so each lambda
                # keeps its own column name (avoids late binding).
                mapeo = {}
                for col in columnas:
                    if col == "id":
                        # Synthetic id derived from the row index in the page.
                        mapeo[col] = lambda idx: f"id_{idx}"
                    elif col in features:
                        mapeo[col] = lambda fila, c=col: str(fila[c])
                    else:
                        # Fall back to the first feature whose name contains
                        # the column's prefix (text before the first "_");
                        # "valor_default" is the sentinel when nothing matches.
                        columna_alternativa = next(
                            (k for k in features if col.split("_")[0] in k),
                            "valor_default"
                        )
                        mapeo[col] = lambda fila, c=columna_alternativa: str(fila.get(c, "N/A"))

                inicio = (pagina_actual - 1) * filas_por_pagina
                fin = pagina_actual * filas_por_pagina

                # NOTE(review): slicing a datasets.Dataset usually returns a
                # dict of columns rather than a sequence of row dicts, so
                # iterating dataset[inicio:fin] may yield column NAMES here —
                # verify against the installed `datasets` version.
                for i, fila in enumerate(dataset[inicio:fin]):
                    valores = []
                    for col in columnas:
                        if col == "id":
                            valores.append(mapeo[col](i))
                        else:
                            valores.append(mapeo[col](fila))
                    filas.append(",".join(valores))

            except Exception as e:
                # Best-effort: a failing dataset contributes an error line
                # instead of aborting the whole export.
                filas.append(f"Error en {dataset_id}: {str(e)}")

        return "\n".join([encabezado] + filas)

    except Exception as e:
        raise ValueError(f"Error al generar el dataset: {str(e)}")