clementBE committed on
Commit
67208fe
·
verified ·
1 Parent(s): 9cffd38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +256 -164
app.py CHANGED
@@ -1,172 +1,264 @@
1
  import os
 
2
  import tempfile
3
- import datetime
4
- import time
 
 
5
  import torch
 
 
 
 
 
 
 
6
  import gradio as gr
7
- import spaces
8
- from transformers import pipeline
9
- from docx import Document
10
- from pydub import AudioSegment
11
-
12
# --- Model definitions ---
# Maps the human-readable label shown in the UI dropdown to the
# Hugging Face model id loaded by the ASR pipeline.
MODEL_SIZES = {
    "Tiny (Fastest)": "openai/whisper-tiny",
    "Base (Faster)": "openai/whisper-base",
    "Small (Balanced)": "openai/whisper-small",
    "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
    "Distil-Large-v3-FR (French-Specific)": "eustlb/distil-large-v3-fr"
}

# --- Caches ---
# model_cache: UI label -> loaded ASR pipeline (filled lazily by get_model_pipeline).
# summary_cache: holds the single summarization pipeline under key "summarizer".
model_cache = {}
summary_cache = {}
25
# --- Whisper pipeline loader ---
def get_model_pipeline(model_name, progress):
    """Return a cached ASR pipeline for *model_name*, loading it on first use.

    *model_name* must be a key of MODEL_SIZES; *progress* is a Gradio
    progress callback used to surface loading status in the UI.
    """
    cached = model_cache.get(model_name)
    if cached is not None:
        return cached

    progress(0, desc="🚀 Initializing ZeroGPU instance...")
    hf_model_id = MODEL_SIZES[model_name]
    # transformers accepts a CUDA device index or the string "cpu".
    target_device = 0 if torch.cuda.is_available() else "cpu"
    progress(0.1, desc=f"⏳ Loading {model_name} model...")
    asr = pipeline(
        "automatic-speech-recognition",
        model=hf_model_id,
        device=target_device
    )
    model_cache[model_name] = asr
    progress(0.5, desc="✅ Model loaded successfully!")
    return asr
39
-
40
# --- French summarization pipeline ---
def get_summary_pipeline():
    """Return the cached multilingual summarization pipeline, creating it once."""
    try:
        return summary_cache["summarizer"]
    except KeyError:
        summarizer = pipeline(
            "summarization",
            model="csebuetnlp/mT5_multilingual_XLSum"
        )
        summary_cache["summarizer"] = summarizer
        return summarizer
48
-
49
- # --- Export functions ---
50
- def create_vtt(segments, file_path):
51
- with open(file_path, "w", encoding="utf-8") as f:
52
- f.write("WEBVTT\n\n")
53
- for i, segment in enumerate(segments):
54
- start_seconds = segment.get('start', 0)
55
- end_seconds = segment.get('end', 0)
56
- start = str(datetime.timedelta(seconds=int(start_seconds)))
57
- end = str(datetime.timedelta(seconds=int(end_seconds)))
58
- f.write(f"{i+1}\n{start} --> {end}\n{segment.get('text', '').strip()}\n\n")
59
-
60
def create_docx(segments, file_path, with_timestamps):
    """Export *segments* to a .docx transcription at *file_path*.

    When *with_timestamps* is true each segment becomes its own
    '[start - end] text' paragraph; otherwise all segment texts are
    joined into a single paragraph.
    """
    document = Document()
    document.add_heading("Transcription", 0)
    if with_timestamps:
        for segment in segments:
            body = segment.get('text', '').strip()
            begin = str(datetime.timedelta(seconds=int(segment.get('start', 0))))
            finish = str(datetime.timedelta(seconds=int(segment.get('end', 0))))
            document.add_paragraph(f"[{begin} - {finish}] {body}")
    else:
        joined = " ".join(segment.get('text', '').strip() for segment in segments)
        document.add_paragraph(joined)
    document.save(file_path)
75
-
76
# --- Extract audio from video/audio ---
def extract_audio_from_video(file_path):
    """Return a path to an audio file for *file_path*.

    Files already in a supported audio container are returned unchanged;
    anything else (e.g. video) is decoded with pydub and exported to a
    temporary .wav file whose path is returned (caller owns cleanup).
    """
    extension = os.path.splitext(file_path)[1].lower()
    if extension in (".wav", ".mp3", ".m4a", ".flac"):
        # Already audio — no re-encoding needed.
        return file_path
    wav_tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    wav_tmp.close()
    AudioSegment.from_file(file_path).export(wav_tmp.name, format="wav")
    return wav_tmp.name
86
-
87
# --- Main transcription function ---
@spaces.GPU
def transcribe_and_export(file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, generate_summary, progress=gr.Progress()):
    """Transcribe an uploaded audio/video file and export the requested formats.

    Returns a 5-tuple consumed by the Gradio click handler:
    (transcribed text, gr.Files of generated exports, audio file path,
    summary text or None, status message). Output files are written to
    the current working directory with fixed names.
    """
    if file is None:
        # Nothing uploaded: keep all outputs empty and report via status.
        return None, None, None, None, "Please upload an audio or video file."

    start_time = time.time()
    # Videos are converted to a temporary .wav; audio files pass through.
    audio_file_path = extract_audio_from_video(file)

    # Transcription
    pipe = get_model_pipeline(model_size, progress)
    progress(0.75, desc="🎤 Transcribing audio...")
    if model_size == "Distil-Large-v3-FR (French-Specific)":
        # French-only checkpoint: pin the decoding language explicitly.
        raw_output = pipe(audio_file_path, return_timestamps=True, generate_kwargs={"language": "fr"})
    else:
        raw_output = pipe(audio_file_path, return_timestamps=True)

    # "chunks" holds per-segment dicts with timestamps; may be absent.
    segments = raw_output.get("chunks", [])
    outputs = {}
    progress(0.85, desc="📝 Generating output files...")

    if vtt_output:
        vtt_path = "transcription.vtt"
        create_vtt(segments, vtt_path)
        outputs["VTT"] = vtt_path
    if docx_timestamp_output:
        docx_ts_path = "transcription_with_timestamps.docx"
        create_docx(segments, docx_ts_path, with_timestamps=True)
        outputs["DOCX (with timestamps)"] = docx_ts_path
    if docx_no_timestamp_output:
        docx_no_ts_path = "transcription_without_timestamps.docx"
        create_docx(segments, docx_no_ts_path, with_timestamps=False)
        outputs["DOCX (without timestamps)"] = docx_no_ts_path

    transcribed_text = raw_output['text']

    # Generate summary if requested
    summary_text = None
    if generate_summary:
        progress(0.95, desc="📝 Generating summary...")
        summarizer = get_summary_pipeline()
        summary_output = summarizer(transcribed_text, max_length=150, min_length=30, do_sample=False)
        summary_text = summary_output[0]['summary_text']

    end_time = time.time()
    total_time = end_time - start_time
    downloadable_files = [path for path in outputs.values()]
    status_message = f"✅ Transcription complete! Total time: {total_time:.2f} seconds."

    return transcribed_text, gr.Files(value=downloadable_files, label="Download Transcripts"), audio_file_path, summary_text, status_message
137
-
138
# --- Gradio UI ---
# NOTE(review): original indentation was lost in the diff rendering; the
# widget nesting below (dropdown/format options inside the right-hand
# column) is reconstructed from the source order — confirm against the
# rendered layout.
with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
    gr.Markdown("# 🎙️ Whisper ZeroGPU Transcription")
    gr.Markdown("Transcribe audio or video files with timestamps, and optionally generate a French summary.")

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio/Video File")
        with gr.Column(scale=2):
            model_selector = gr.Dropdown(
                label="Choose Whisper Model Size",
                choices=list(MODEL_SIZES.keys()),
                value="Distil-Large-v3-FR (French-Specific)"
            )
            gr.Markdown("### Choose Output Formats")
            with gr.Row():
                vtt_checkbox = gr.Checkbox(label="VTT", value=True)
                docx_ts_checkbox = gr.Checkbox(label="DOCX (with timestamps)", value=False)
                docx_no_ts_checkbox = gr.Checkbox(label="DOCX (without timestamps)", value=True)
                summary_checkbox = gr.Checkbox(label="Generate Summary", value=False)

    transcribe_btn = gr.Button("Transcribe", variant="primary")
    status_text = gr.Textbox(label="Status", interactive=False)

    transcription_output = gr.Textbox(label="Full Transcription", lines=10)
    downloadable_files_output = gr.Files(label="Download Transcripts")
    summary_output = gr.Textbox(label="Summary", lines=5)

    # Third output slot re-targets audio_input so the (possibly extracted)
    # audio is played back in the same widget the user uploaded to.
    transcribe_btn.click(
        fn=transcribe_and_export,
        inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox, summary_checkbox],
        outputs=[transcription_output, downloadable_files_output, audio_input, summary_output, status_text]
    )

if __name__ == "__main__":
    demo.launch()
 
1
  import os
2
+ import zipfile
3
  import tempfile
4
+ import requests
5
+ import numpy as np
6
+ import pandas as pd
7
+ from PIL import Image
8
  import torch
9
+ import torch.nn.functional as F
10
+ from torchvision import transforms
11
+ from torchvision.models import resnet50, ResNet50_Weights
12
+ from sklearn.cluster import MiniBatchKMeans
13
+ import matplotlib.pyplot as plt
14
+ import io
15
+
16
  import gradio as gr
17
+
18
+ # Face analysis
19
+ from deepface import DeepFace
20
+ import cv2
21
+
22
# ---------------------------
# Force CPU if no CUDA
# ---------------------------
# Hiding all CUDA devices keeps downstream libraries (torch, DeepFace)
# from probing for a GPU that is not there.
if not torch.cuda.is_available():
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------
# Load ResNet50
# ---------------------------
# Pretrained ImageNet classifier, loaded once at import time and kept in
# eval mode (inference only — no dropout/batch-norm updates).
weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights).to(device)
model.eval()

# ---------------------------
# Transformations
# ---------------------------
# Standard ImageNet preprocessing matching the ResNet50 training recipe.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# ---------------------------
# ImageNet labels
# ---------------------------
# NOTE(review): fetched over the network at import time with no timeout or
# error handling — app startup fails if this URL is unreachable; consider
# bundling the label file.
LABELS_URL = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
imagenet_classes = [line.strip() for line in requests.get(LABELS_URL).text.splitlines()]
53
+
54
+ # ---------------------------
55
+ # Color utilities
56
+ # ---------------------------
57
# Reference palette: coarse color names mapped to their pure RGB values.
BASIC_COLORS = {
    "Red": (255, 0, 0),
    "Green": (0, 255, 0),
    "Blue": (0, 0, 255),
    "Yellow": (255, 255, 0),
    "Cyan": (0, 255, 255),
    "Magenta": (255, 0, 255),
    "Black": (0, 0, 0),
    "White": (255, 255, 255),
    "Gray": (128, 128, 128),
}

def closest_basic_color(rgb):
    """Return the BASIC_COLORS name nearest to *rgb* by squared Euclidean distance.

    Ties resolve to the first palette entry in insertion order, matching
    a linear minimum scan.
    """
    r, g, b = rgb

    def squared_distance(entry):
        cr, cg, cb = entry[1]
        return (r - cr) ** 2 + (g - cg) ** 2 + (b - cb) ** 2

    return min(BASIC_COLORS.items(), key=squared_distance)[0]
79
+
80
def get_dominant_color(image, num_colors=5):
    """Return the dominant color of a PIL image as ((r, g, b), "#rrggbb").

    Downscales to a 100x100 thumbnail, clusters its pixels into
    *num_colors* groups with MiniBatchKMeans (fixed random_state for
    determinism), and reports the center of the largest cluster.
    """
    thumbnail = image.resize((100, 100))
    flat_pixels = np.array(thumbnail).reshape(-1, 3)
    clusterer = MiniBatchKMeans(n_clusters=num_colors, random_state=0, n_init=5)
    clusterer.fit(flat_pixels)
    biggest = np.argmax(np.bincount(clusterer.labels_))
    dominant = tuple(clusterer.cluster_centers_[biggest].astype(int))
    hex_code = f"#{dominant[0]:02x}{dominant[1]:02x}{dominant[2]:02x}"
    return dominant, hex_code
89
+
90
# ---------------------------
# Core function
# ---------------------------

def _plot_to_image(fig):
    """Render a Matplotlib figure to a PIL image and release the figure."""
    buf = io.BytesIO()
    # Save via the figure object (not pyplot's implicit current figure) so
    # interleaved figure creation cannot save the wrong plot.
    fig.savefig(buf, format="png", bbox_inches="tight")
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)


def _top3_predictions(image):
    """Classify a PIL image with ResNet50; return ([label]*3, ["xx.xx%"]*3)."""
    input_tensor = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        probs = F.softmax(model(input_tensor), dim=1)[0]
    top3_prob, top3_idx = torch.topk(probs, 3)
    labels = [imagenet_classes[idx] for idx in top3_idx]
    confs = [f"{prob.item() * 100:.2f}%" for prob in top3_prob]
    return labels, confs


def _analyze_faces(image):
    """Detect/characterize faces with DeepFace.

    Returns (info_string, ages, gender_scores):
      - info_string: "Age: .., Gender: .., Gender Confidence: .., Emotion: .."
        joined by "; " per face, or "No face detected".
      - ages: estimated age per detected face.
      - gender_scores: {"Man"/"Woman": summed confidence in [0, 1]} counting
        only faces whose gender confidence is <= 80% (feeds the
        "Gender Distribution (Confidence <= 80%)" plot).

    Fix vs. original: current DeepFace returns a list of dicts whose
    'gender' key is a per-class confidence dict and has no
    'gender_confidence' key, so the old code always raised KeyError and
    reported "No face detected". Both old and new result schemas are now
    handled.
    """
    ages = []
    gender_scores = {"Man": 0, "Woman": 0}
    try:
        img_cv2 = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        faces = DeepFace.analyze(img_cv2, actions=["age", "gender", "emotion"], enforce_detection=False)
        if not isinstance(faces, list):  # legacy DeepFace: single dict for one face
            faces = [faces]
        parts = []
        for f in faces:
            raw_gender = f.get("gender")
            if isinstance(raw_gender, dict):
                # New schema: {"Man": pct, "Woman": pct} plus "dominant_gender".
                gender = f.get("dominant_gender") or max(raw_gender, key=raw_gender.get)
                conf = raw_gender.get(gender, 100.0) / 100.0
            else:
                # Old schema: plain label; confidence key may be absent.
                gender = raw_gender
                conf = float(f.get("gender_confidence", 1.0))
            ages.append(int(f["age"]))
            if conf <= 0.8:
                gender_scores[gender] = gender_scores.get(gender, 0) + conf
            parts.append(
                f"Age: {f['age']}, Gender: {gender}, "
                f"Gender Confidence: {conf * 100:.2f}, Emotion: {f['dominant_emotion']}"
            )
        return "; ".join(parts), ages, gender_scores
    except Exception:
        # Face analysis is best-effort; never fail the whole batch over it.
        return "No face detected", [], {"Man": 0, "Woman": 0}


def classify_zip_and_analyze_color(zip_file):
    """Process every .png/.jpg/.jpeg in an uploaded ZIP.

    For each image: ResNet50 top-3 classification, dominant-color
    analysis, and DeepFace age/gender/emotion detection. Returns
    (DataFrame, XLSX path, color-frequency plot, prediction-distribution
    plot, gender plot, age histogram) — same interface as before.

    Improvement: ages and gender confidences are accumulated as
    structured data during the per-image loop instead of being re-parsed
    out of the display strings afterwards (the old round-trip was fragile
    and silently dropped data on any format drift).
    """
    results = []
    all_ages = []
    gender_confidence = {"Man": 0, "Woman": 0}

    with tempfile.TemporaryDirectory() as tmpdir:
        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
            zip_ref.extractall(tmpdir)

        for fname in sorted(os.listdir(tmpdir)):
            if not fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            try:
                image = Image.open(os.path.join(tmpdir, fname)).convert("RGB")
            except Exception:
                continue  # skip unreadable/corrupt files

            labels, confs = _top3_predictions(image)
            rgb, hex_color = get_dominant_color(image)
            face_info, ages, gender_scores = _analyze_faces(image)

            all_ages.extend(ages)
            for g, c in gender_scores.items():
                gender_confidence[g] = gender_confidence.get(g, 0) + c

            results.append((
                fname,
                ", ".join(labels),
                ", ".join(confs),
                hex_color,
                closest_basic_color(rgb),
                face_info,
            ))

    # Build dataframe
    df = pd.DataFrame(results, columns=["Filename", "Top 3 Predictions", "Confidence", "Dominant Color", "Basic Color", "Face Info"])

    # Save XLSX
    out_xlsx = os.path.join(tempfile.gettempdir(), "results.xlsx")
    df.to_excel(out_xlsx, index=False)

    # Plot 1: basic color frequency
    fig1, ax1 = plt.subplots()
    color_counts = df["Basic Color"].value_counts()
    ax1.bar(color_counts.index, color_counts.values, color="skyblue")
    ax1.set_title("Basic Color Frequency")
    ax1.set_ylabel("Count")
    plot1_img = _plot_to_image(fig1)

    # Plot 2: 20 most frequent top-3 prediction labels
    fig2, ax2 = plt.subplots()
    preds_flat = []
    for p in df["Top 3 Predictions"]:
        preds_flat.extend(p.split(", "))
    pred_counts = pd.Series(preds_flat).value_counts().head(20)
    ax2.barh(pred_counts.index[::-1], pred_counts.values[::-1], color="salmon")
    ax2.set_title("Top Prediction Distribution")
    ax2.set_xlabel("Count")
    plot2_img = _plot_to_image(fig2)

    # Plot 3: gender distribution (faces with <= 80% confidence only)
    fig3, ax3 = plt.subplots()
    ax3.bar(gender_confidence.keys(), gender_confidence.values(), color=["lightblue", "pink"])
    ax3.set_title("Gender Distribution (Confidence ≤ 80%)")
    ax3.set_ylabel("Sum of Confidence")
    plot3_img = _plot_to_image(fig3)

    # Plot 4: age histogram in 5-year buckets (0-100)
    fig4, ax4 = plt.subplots()
    ax4.hist(all_ages, bins=range(0, 101, 5), color="lightgreen", edgecolor="black")
    ax4.set_title("Age Distribution")
    ax4.set_xlabel("Age")
    ax4.set_ylabel("Count")
    plot4_img = _plot_to_image(fig4)

    return df, out_xlsx, plot1_img, plot2_img, plot3_img, plot4_img
244
+
245
# ---------------------------
# Gradio Interface
# ---------------------------
# Single-function app: the six outputs positionally match the 6-tuple
# returned by classify_zip_and_analyze_color.
demo = gr.Interface(
    fn=classify_zip_and_analyze_color,
    inputs=gr.File(file_types=[".zip"], label="Upload ZIP of images"),
    outputs=[
        gr.Dataframe(headers=["Filename", "Top 3 Predictions", "Confidence", "Dominant Color", "Basic Color", "Face Info"]),
        gr.File(label="Download XLSX"),
        gr.Image(type="pil", label="Basic Color Frequency"),
        gr.Image(type="pil", label="Top Prediction Distribution"),
        gr.Image(type="pil", label="Gender Distribution (≤80% Confidence)"),
        gr.Image(type="pil", label="Age Distribution"),
    ],
    title="Image Classifier with Color & Face Analysis",
    description="Upload a ZIP of images. Classifies images, analyzes dominant color, and detects/characterizes faces (age, gender, emotion).",
)

if __name__ == "__main__":
    # Bind to all interfaces on the conventional Spaces port so the app is
    # reachable from outside the container.
    demo.launch(server_name="0.0.0.0", server_port=7860)