Spaces:
Sleeping
Sleeping
| import boto3 | |
| import json | |
| import os | |
| from collections import defaultdict | |
| from datetime import datetime | |
| import pandas as pd | |
| import plotly.express as px | |
| from dotenv import load_dotenv | |
# Load settings from a local ".env" file before reading the environment.
load_dotenv(".env")

# Bucket name and credentials come from environment variables; each value is
# None when the corresponding variable is not set.
S3_BUCKET = os.environ.get("S3_BUCKET")
ANNOTATIONS_PREFIX = "annotations"
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL_S3")

# Module-level S3 client used by the loading function below.
s3 = boto3.client(
    "s3",
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
def load_all_annotations():
    """Load every JSON annotation stored under the annotations prefix in S3.

    Lists the bucket page by page, downloads each object whose key ends with
    ".json", decodes it as UTF-8 and parses it as JSON.

    Returns:
        A list with one parsed JSON document per object that was read
        successfully. Objects that fail to download or parse are reported
        on stdout and skipped.
    """
    loaded = []
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket=S3_BUCKET, Prefix=ANNOTATIONS_PREFIX
    )
    for page in pages:
        # A page without "Contents" simply contributes no objects.
        for entry in page.get("Contents", []):
            key = entry["Key"]
            if not key.endswith(".json"):
                continue
            try:
                body = s3.get_object(Bucket=S3_BUCKET, Key=key)["Body"]
                loaded.append(json.loads(body.read().decode('utf-8')))
            except Exception as e:
                # Best effort: one unreadable file must not abort the load.
                print(f"Erreur lors de la lecture de {key}: {e}")
    return loaded
def calculate_total_duration(annotations):
    """Return the total duration of the annotated audio, in minutes.

    Args:
        annotations: Iterable of annotation mappings. The "duration" entry
            of each one is converted with ``float`` and treated as seconds.

    Returns:
        The summed duration divided by 60. An empty input gives 0.0.

    Raises:
        ValueError: If a duration is a non-empty string that is not numeric.
    """
    # `or 0` makes an explicit null/empty duration count as zero, exactly
    # like a missing key, instead of crashing on float(None).
    total_seconds = sum(float(ann.get("duration") or 0) for ann in annotations)
    return total_seconds / 60.0
def calculate_contributor_ranking(annotations):
    """Rank contributors by the total duration of their annotations.

    Args:
        annotations: Iterable of annotation mappings with a "user" entry and
            a "duration" entry (converted with ``float``).

    Returns:
        A list of ``(user, total_duration)`` tuples sorted by decreasing
        total. Durations are summed as given (not converted to minutes).
        Annotations without a truthy "user" are ignored.

    Raises:
        ValueError: If a kept annotation has a non-numeric string duration.
    """
    contributor_durations = defaultdict(float)
    for ann in annotations:
        user = ann.get("user")
        if not user:
            # Anonymous records are skipped before their duration is read,
            # so a malformed duration on them cannot abort the ranking.
            continue
        # A null/empty duration counts as zero, like a missing key.
        contributor_durations[user] += float(ann.get("duration") or 0)
    return sorted(
        contributor_durations.items(),
        key=lambda item: item[1],
        reverse=True,
    )
def create_contributions_histogram(contributor_ranking):
    """Build a bar chart of the total contribution time per user.

    Args:
        contributor_ranking: Sequence of ``(user, duration)`` pairs. The
            duration is divided by 60 before plotting.

    Returns:
        The figure produced by ``px.bar``, or None when the ranking is empty.
    """
    if contributor_ranking:
        names = [entry[0] for entry in contributor_ranking]
        minutes = [entry[1] / 60.0 for entry in contributor_ranking]
        axis_labels = {'x': 'Contributeur', 'y': 'Durée totale (minutes)'}
        return px.bar(
            x=names,
            y=minutes,
            labels=axis_labels,
            title='Durée totale des contributions par utilisateur',
        )
    return None
def create_contributions_pie_chart(annotations):
    """Build a donut chart of contribution time for the top 10 users.

    Args:
        annotations: Iterable of annotation mappings with a "user" entry and
            a "duration" entry (converted with ``float``).

    Returns:
        The figure produced by ``px.pie`` for the ten users with the largest
        summed duration (values divided by 60), or None when no annotation
        has a truthy "user".

    Raises:
        ValueError: If a kept annotation has a non-numeric string duration.
    """
    contributor_durations = defaultdict(float)
    for ann in annotations:
        user = ann.get("user")
        if not user:
            # Skip anonymous records before touching their duration so a
            # malformed value on them cannot break the chart.
            continue
        # A null/empty duration counts as zero, like a missing key.
        contributor_durations[user] += float(ann.get("duration") or 0)

    if not contributor_durations:
        return None

    sorted_contributors = sorted(
        contributor_durations.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    top_n = sorted_contributors[:10]  # Only the 10 largest contributors are shown.
    labels = [item[0] for item in top_n]
    values = [item[1] / 60.0 for item in top_n]

    fig = px.pie(
        names=labels,
        values=values,
        title='Répartition des contributions (Top 10 des contributeurs)',
        hole=0.3,
    )
    fig.update_traces(textinfo='percent+label')
    return fig
def extract_annotation_date(annotation_key):
    """Extract an approximate annotation date from an S3 key.

    The key is split on "/" and the first segment that parses as
    ``YYYY-MM-DD`` is returned as a date.

    Args:
        annotation_key: The S3 object key as a string.

    Returns:
        A ``datetime.date``, or None when the key has fewer than three
        segments or no segment is a date.
    """
    segments = annotation_key.split('/')
    if len(segments) < 3:
        return None
    for segment in segments:
        try:
            parsed = datetime.strptime(segment, '%Y-%m-%d')
        except ValueError:
            # Not a date segment; keep looking.
            continue
        return parsed.date()
    return None
def _parse_created_at(value):
    """Parse an ISO 8601 timestamp, also accepting a trailing "Z"."""
    try:
        return datetime.fromisoformat(value)
    except ValueError:
        # Python < 3.11 rejects the "Z" UTC designator; retry with the
        # equivalent explicit offset before giving up.
        if value.endswith("Z"):
            return datetime.fromisoformat(value[:-1] + "+00:00")
        raise


def calculate_contributions_over_time(annotations):
    """Count contributions per day from each annotation's "created_at" field.

    Args:
        annotations: Iterable of annotation mappings. "created_at" is
            expected to be an ISO 8601 string; annotations without it are
            ignored.

    Returns:
        A DataFrame with columns 'Date' and 'Nombre de contributions',
        sorted by date, or None when no annotation has a usable timestamp.
        Timestamps that cannot be parsed are reported on stdout and skipped.
    """
    daily_contributions_count = defaultdict(int)
    for ann in annotations:
        created_at_str = ann.get("created_at")
        if not created_at_str:
            continue
        try:
            created_at = _parse_created_at(created_at_str)
        except (TypeError, ValueError):
            # TypeError covers non-string values (e.g. numeric timestamps),
            # which would otherwise abort the whole computation.
            print(f"Erreur lors de la conversion de la date: {created_at_str}")
            continue
        daily_contributions_count[created_at.date()] += 1

    if not daily_contributions_count:
        return None

    df = pd.DataFrame(
        list(daily_contributions_count.items()),
        columns=['Date', 'Nombre de contributions'],
    )
    return df.sort_values(by='Date')
def create_contributions_time_series(df_contributions):
    """Build a line chart of the number of contributions per day.

    Args:
        df_contributions: Table with the columns 'Date' and
            'Nombre de contributions', or None when there is no data.

    Returns:
        The figure produced by ``px.line``, or None when ``df_contributions``
        is None.
    """
    # calculate_contributions_over_time returns None when there is nothing
    # to plot; return None too, as the other chart builders do.
    if df_contributions is None:
        return None
    fig = px.line(
        df_contributions,
        x='Date',
        y='Nombre de contributions',
        title='Nombre de contributions par jour',
    )
    return fig
def calculate_average_annotation_length(annotations):
    """Return the average annotation duration, in minutes.

    Args:
        annotations: Sized collection of annotation mappings. The "duration"
            entry of each one is converted with ``float`` and treated as
            seconds.

    Returns:
        The summed duration divided by the number of annotations and by 60,
        or 0.0 when there are no annotations.

    Raises:
        ValueError: If a duration is a non-empty string that is not numeric.
    """
    num_annotations = len(annotations)
    if num_annotations == 0:
        return 0.0
    # A null/empty duration counts as zero, like a missing key, but the
    # annotation still counts in the denominator.
    total_duration = sum(float(ann.get("duration") or 0) for ann in annotations)
    return total_duration / num_annotations / 60.0  # in minutes